shithub: openh264

Download patch

ref: b6a765ad71c5ea861694e16adc01f33b1c77b869
parent: b47606a4ff30456e581cba830ed5484fd1ff1713
parent: ad9e2dab4f1e829fecdf412c036dcf2e9c45635c
author: Licai Guo <licaguo@cisco.com>
date: Wed Apr 23 09:58:26 EDT 2014

Merge pull request #734 from dongzha/MC_ARM64

Add Motion Compensation ARM64 NEON Code

--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -27,6 +27,7 @@
 		F0B204F918FD23BF005DA23F /* copy_mb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F0B204F818FD23BF005DA23F /* copy_mb.cpp */; };
 		F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8221906673900E156A8 /* arm_arch64_common_macro.S */; };
 		F556A8251906673900E156A8 /* expand_picture_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */; };
+		F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5B8D82C190757290037849A /* mc_aarch64_neon.S */; };
 		FAABAA1818E9354A00D4186F /* sad_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FAABAA1718E9354A00D4186F /* sad_common.cpp */; };
 /* End PBXBuildFile section */
 
@@ -87,6 +88,7 @@
 		F0B204F818FD23BF005DA23F /* copy_mb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = copy_mb.cpp; sourceTree = "<group>"; };
 		F556A8221906673900E156A8 /* arm_arch64_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = arm_arch64_common_macro.S; path = arm64/arm_arch64_common_macro.S; sourceTree = "<group>"; };
 		F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = expand_picture_aarch64_neon.S; path = arm64/expand_picture_aarch64_neon.S; sourceTree = "<group>"; };
+		F5B8D82C190757290037849A /* mc_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = mc_aarch64_neon.S; path = arm64/mc_aarch64_neon.S; sourceTree = "<group>"; };
 		FAABAA1618E9353F00D4186F /* sad_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sad_common.h; sourceTree = "<group>"; };
 		FAABAA1718E9354A00D4186F /* sad_common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sad_common.cpp; sourceTree = "<group>"; };
 /* End PBXFileReference section */
@@ -223,6 +225,7 @@
 		F556A81D1906669F00E156A8 /* arm64 */ = {
 			isa = PBXGroup;
 			children = (
+				F5B8D82C190757290037849A /* mc_aarch64_neon.S */,
 				F556A8221906673900E156A8 /* arm_arch64_common_macro.S */,
 				F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */,
 			);
@@ -310,6 +313,7 @@
 			isa = PBXSourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */,
 				4C3406C918D96EA600DFA14A /* arm_arch_common_macro.S in Sources */,
 				F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */,
 				4C3406CE18D96EA600DFA14A /* crt_util_safe_x.cpp in Sources */,
--- /dev/null
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -1,0 +1,2274 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+.align 4	// operand is a power of two on this target: 2^4 = 16 bytes, the size of the table below (the former 16 requested 64 KiB alignment)
+filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0	// eight 16-bit lanes; presumably the "para" weights (1, -5, 20) of the *_SINGLE_TAG macros - confirm at the load site
+
+#ifdef __APPLE__
+
+.macro	FILTER_6TAG_8BITS1
+//	{	// 6-tap filter, low 8 lanes. $0..$5 = src[-2], src[-1], src[0], src[1], src[2], src[3]; $6 = dst_d; $7/$8 = multipliers (callers in this file pass 20 and 5); working: v18, v19
+    uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
+    uaddl v19.8h, $2.8b, $3.8b	//src[0]+src[1]
+    mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl v19.8h, $1.8b, $4.8b  //src[-1]+src[2]
+    mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun $6.8b, v18.8h, #5 //dst = (v18 + 16) >> 5, saturated to unsigned 8 bits
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS2
+//	{	// 6-tap filter, high 8 lanes. $0..$5 = src[-2], src[-1], src[0], src[1], src[2], src[3]; $6 = dst_q (upper half written); $7/$8 = multipliers (callers in this file pass 20 and 5); working: v18, v19
+    uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
+    uaddl2 v19.8h, $2.16b, $3.16b	//src[0]+src[1]
+    mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v19.8h, $1.16b, $4.16b  //src[-1]+src[2]
+    mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun2 $6.16b, v18.8h, #5 //upper 8 bytes of dst = (v18 + 16) >> 5, saturated; lower 8 bytes are kept
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS1_AVERAGE_WITH_0
+//	{	// 6-tap filter on the low 8 lanes, then rounding average with src[0]. $0..$5 = src[-2]..src[3]; $6 = dst_d; $7/$8 = multipliers (callers in this file pass 20 and 5); working: v18, v19
+    uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
+    uaddl v19.8h, $2.8b, $3.8b	//src[0]+src[1]
+    mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl v19.8h, $1.8b, $4.8b  //src[-1]+src[2]
+    mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun $6.8b, v18.8h, #5 //filtered = (v18 + 16) >> 5, saturated to unsigned 8 bits
+    uaddl  v19.8h, $2.8b, $6.8b //src[0] + filtered
+    rshrn $6.8b, v19.8h, #1 //dst = (src[0] + filtered + 1) >> 1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS2_AVERAGE_WITH_0
+//	{	// 6-tap filter on the high 8 lanes, then rounding average with src[0]. $0..$5 = src[-2]..src[3]; $6 = dst_q (upper half written); $7/$8 = multipliers (callers in this file pass 20 and 5); working: v18, v19
+    uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
+    uaddl2 v19.8h, $2.16b, $3.16b	//src[0]+src[1]
+    mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v19.8h, $1.16b, $4.16b  //src[-1]+src[2]
+    mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun2 $6.16b, v18.8h, #5 //filtered (upper 8 bytes) = (v18 + 16) >> 5, saturated
+    uaddl2  v19.8h, $2.16b, $6.16b //src[0] + filtered, upper lanes
+    rshrn2 $6.16b, v19.8h, #1 //upper 8 bytes of dst = (src[0] + filtered + 1) >> 1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS1_AVERAGE_WITH_1
+//	{	// 6-tap filter on the low 8 lanes, then rounding average with src[1]. $0..$5 = src[-2]..src[3]; $6 = dst_d; $7/$8 = multipliers (callers in this file pass 20 and 5); working: v18, v19
+    uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
+    uaddl v19.8h, $2.8b, $3.8b	//src[0]+src[1]
+    mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl v19.8h, $1.8b, $4.8b  //src[-1]+src[2]
+    mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun $6.8b, v18.8h, #5 //filtered = (v18 + 16) >> 5, saturated to unsigned 8 bits
+    uaddl  v19.8h, $3.8b, $6.8b //src[1] + filtered
+    rshrn $6.8b, v19.8h, #1 //dst = (src[1] + filtered + 1) >> 1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS2_AVERAGE_WITH_1
+//	{	// 6-tap filter on the high 8 lanes, then rounding average with src[1]. $0..$5 = src[-2]..src[3]; $6 = dst_q (upper half written); $7/$8 = multipliers (callers in this file pass 20 and 5); working: v18, v19
+    uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
+    uaddl2 v19.8h, $2.16b, $3.16b	//src[0]+src[1]
+    mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v19.8h, $1.16b, $4.16b  //src[-1]+src[2]
+    mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun2 $6.16b, v18.8h, #5 //filtered (upper 8 bytes) = (v18 + 16) >> 5, saturated
+    uaddl2  v19.8h, $3.16b, $6.16b //src[1] + filtered, upper lanes
+    rshrn2 $6.16b, v19.8h, #1 //upper 8 bytes of dst = (src[1] + filtered + 1) >> 1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS_TO_16BITS1
+//	{	// 6-tap sum of the low 8 lanes kept at 16 bits (no shift, no narrowing). $0..$5 = d_src[-2]..d_src[3]; $6 = dst_q; $7/$8 = multipliers for the (src[0]+src[1]) and (src[-1]+src[2]) terms; working: v31
+    uaddl	$6.8h, $0.8b, $5.8b		//dst_q=src[-2]+src[3]
+    uaddl	v31.8h, $2.8b, $3.8b	//src[0]+src[1]
+    mla	$6.8h, v31.8h, $7.8h	//dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl	v31.8h, $1.8b, $4.8b	//src[-1]+src[2]
+    mls	$6.8h, v31.8h, $8.8h	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS_TO_16BITS2
+//	{	// 6-tap sum of the high 8 lanes kept at 16 bits (no shift, no narrowing). $0..$5 = src[-2]..src[3] as 16-byte vectors; $6 = dst_q; $7/$8 = multipliers for the (src[0]+src[1]) and (src[-1]+src[2]) terms; working: v31
+    uaddl2	$6.8h, $0.16b, $5.16b		//dst_q=src[-2]+src[3]
+    uaddl2	v31.8h, $2.16b, $3.16b	//src[0]+src[1]
+    mla	$6.8h, v31.8h, $7.8h	//dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl2	v31.8h, $1.16b, $4.16b	//src[-1]+src[2]
+    mls	$6.8h, v31.8h, $8.8h	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//	}
+.endm
+
+.macro	FILTER_3_IN_16BITS_TO_8BITS1
+//	{	// input: $0 = a, $1 = b, $2 = c (16-bit lanes), $3 = dst_d (low 8 bytes written). $0 is used as the accumulator and is overwritten
+    sub	$0.8h, $0.8h, $1.8h			//a-b
+    sshr	$0.8h, $0.8h, #2			//(a-b)/4
+    sub	$0.8h, $0.8h, $1.8h			//(a-b)/4-b
+    add	$0.8h, $0.8h, $2.8h			//(a-b)/4-b+c
+    sshr	$0.8h, $0.8h, #2			//((a-b)/4-b+c)/4
+    add	$0.8h, $0.8h, $2.8h			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun	$3.8b, $0.8h, #6		//(+32)>>6
+//	}	// the two intermediate shifts keep every partial result inside 16 bits
+.endm
+
+.macro	FILTER_3_IN_16BITS_TO_8BITS2
+//	{	// input: $0 = a, $1 = b, $2 = c (16-bit lanes), $3 = dst_q (upper 8 bytes written, lower 8 kept). $0 is used as the accumulator and is overwritten
+    sub	$0.8h, $0.8h, $1.8h			//a-b
+    sshr	$0.8h, $0.8h, #2			//(a-b)/4
+    sub	$0.8h, $0.8h, $1.8h			//(a-b)/4-b
+    add	$0.8h, $0.8h, $2.8h			//(a-b)/4-b+c
+    sshr	$0.8h, $0.8h, #2			//((a-b)/4-b+c)/4
+    add	$0.8h, $0.8h, $2.8h			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun2	$3.16b, $0.8h, #6		//(+32)>>6
+//	}	// the two intermediate shifts keep every partial result inside 16 bits
+.endm
+
+.macro	UNPACK_2_16BITS_TO_ABC
+//	{	// input: $0 = q_src[-2:5], $1 = q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), both 16-bit lanes; output: $2 = dst_a, $3 = dst_b, $4 = dst_c. $2 and $3 also serve as temporaries
+    ext	$4.16b, $0.16b, $1.16b, #4		//src[0]
+    ext	$3.16b, $0.16b, $1.16b, #6		//src[1]
+    add	$4.8h, $4.8h, $3.8h					//c=src[0]+src[1]
+
+    ext	$3.16b, $0.16b, $1.16b, #2		//src[-1]
+    ext	$2.16b, $0.16b, $1.16b, #8		//src[2]
+    add	$3.8h, $3.8h, $2.8h					//b=src[-1]+src[2]
+
+    ext	$2.16b, $0.16b, $1.16b, #10		//src[3]
+    add	$2.8h, $2.8h, $0.8h					//a=src[-2]+src[3]
+//	}	// ext byte offsets are 2 bytes per 16-bit sample
+.endm
+
+.macro	AVERAGE_TWO_8BITS1
+//	{	// rounding average of the low 8 bytes: $0 = dst_d, $1 and $2 = src_d A and B; working: v30
+    uaddl	v30.8h, $2.8b, $1.8b	//widen to 16 bits so A+B cannot overflow
+    rshrn	$0.8b, v30.8h, #1	//dst = (A + B + 1) >> 1
+//	}
+.endm
+
+.macro	AVERAGE_TWO_8BITS2
+//	{	// rounding average of the high 8 bytes: $0 = dst_q (upper half written), $1 and $2 = src A and B; working: v30
+    uaddl2	v30.8h, $2.16b, $1.16b	//widen to 16 bits so A+B cannot overflow
+    rshrn2	$0.16b, v30.8h, #1	//upper 8 bytes of dst = (A + B + 1) >> 1
+//	}
+.endm
+
+.macro	FILTER_SINGLE_TAG_8BITS		// when width=17/9, used
+//	{	// input: $0 = src_d{Y[0][1][2][3][4][5]X} (also receives the result), $1 = per-lane multipliers, $2 = working vector, $3 = scalar destination of addv
+    rev64	$2.8b, $0.8b				// X[5][4][3][2][1][0]O
+    uaddl	$2.8h, $0.8b, $2.8b			// each 16bits, *[50][41][32][23][14][05]*
+    mul	$2.4h, $2.4h, $1.4h			// 0+1*[50]-5*[41]+20[32]
+    addv $3, $2.4h	// horizontal sum of the four weighted pair sums
+    sqrshrun $0.8b, $0.8h, #5	// NOTE(review): reads $0, so $3 is presumably the scalar view of $0 - confirm at the call sites
+//	}
+.endm
+
+.macro	UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
+//	{	// each 16bits; input: $0 = d_dst, $1 = d_src[0:5], $2 = para, $3 and $4 = working, $5 = d(low part of d_dst)
+    ext.16b $3, $1, $1, #14       // X[0][1][2][3][4][5]O
+    ext.16b $4, $3, $3, #8      // [3][4][5]OX[0][1][2]
+    rev64  $4.8h, $4.8h			// X[5][4][3][2][1][0]O
+    add   $3.8h, $3.8h, $4.8h    // each 16bits, *[50][41][32][23][14][05]*
+    smull $3.4s, $3.4h, $2.4h			// 0+1*[50]-5*[41]+20[32]
+    saddlv $5, $3.4s	// 64-bit sum of the four weighted 32-bit products
+    //sshr $0.2d, $0.2d, #4
+    sqrshrun $0.2s, $0.2d, #10	// (sum + 512) >> 10, saturated; reads $0, which $5 presumably aliases - confirm at the call sites
+    uqxtn $0.4h, $0.4s	// saturating narrow 32 -> 16 bits
+    uqxtn $0.8b, $0.8h	// saturating narrow 16 -> 8 bits
+   //	}
+.endm
+
+#else
+.macro	FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//	{	// 6-tap filter, low 8 lanes. arg0..arg5 = src[-2], src[-1], src[0], src[1], src[2], src[3]; arg6 = dst_d; arg7/arg8 = multipliers (callers in this file pass 20 and 5); working: v18, v19
+    uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
+    uaddl v19.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]
+    mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl v19.8h, \arg1\().8b, \arg4\().8b  //src[-1]+src[2]
+    mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun \arg6\().8b, v18.8h, #5 //dst = (v18 + 16) >> 5, saturated to unsigned 8 bits
+//	}	// "\()" ends each argument name so the ".8b"/".8h" suffix is not parsed as part of it
+.endm
+
+.macro	FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//	{	// 6-tap filter, high 8 lanes. arg0..arg5 = src[-2], src[-1], src[0], src[1], src[2], src[3]; arg6 = dst_q (upper half written); arg7/arg8 = multipliers (callers in this file pass 20 and 5); working: v18, v19
+    uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
+    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]
+    mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v19.8h, \arg1\().16b, \arg4\().16b  //src[-1]+src[2]
+    mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun2 \arg6\().16b, v18.8h, #5 //upper 8 bytes of dst = (v18 + 16) >> 5, saturated; lower 8 bytes are kept
+//	}	// "\()" ends each argument name so the ".16b"/".8h" suffix is not parsed as part of it
+.endm
+
+.macro	FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//	{	// 6-tap filter on the low 8 lanes, then rounding average with src[0]. arg0..arg5 = src[-2]..src[3]; arg6 = dst_d; arg7/arg8 = multipliers (callers in this file pass 20 and 5); working: v18, v19
+    uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
+    uaddl v19.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]
+    mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl v19.8h, \arg1\().8b, \arg4\().8b  //src[-1]+src[2]
+    mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun \arg6\().8b, v18.8h, #5 //filtered = (v18 + 16) >> 5, saturated to unsigned 8 bits
+    uaddl  v19.8h, \arg2\().8b, \arg6\().8b //src[0] + filtered
+    rshrn \arg6\().8b, v19.8h, #1 //dst = (src[0] + filtered + 1) >> 1
+//	}	// "\()" ends each argument name so the ".8b"/".8h" suffix is not parsed as part of it
+.endm
+
+.macro	FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//	{	// 6-tap filter on the high 8 lanes, then rounding average with src[0]. arg0..arg5 = src[-2]..src[3]; arg6 = dst_q (upper half written); arg7/arg8 = multipliers (callers in this file pass 20 and 5); working: v18, v19
+    uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
+    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]
+    mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v19.8h, \arg1\().16b, \arg4\().16b  //src[-1]+src[2]
+    mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun2 \arg6\().16b, v18.8h, #5 //filtered (upper 8 bytes) = (v18 + 16) >> 5, saturated
+    uaddl2  v19.8h, \arg2\().16b, \arg6\().16b //src[0] + filtered, upper lanes
+    rshrn2 \arg6\().16b, v19.8h, #1 //upper 8 bytes of dst = (src[0] + filtered + 1) >> 1
+//	}	// "\()" ends each argument name so the ".16b"/".8h" suffix is not parsed as part of it
+.endm
+
+.macro	FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//	{	// 6-tap filter on the low 8 lanes, then rounding average with src[1]. arg0..arg5 = src[-2]..src[3]; arg6 = dst_d; arg7/arg8 = multipliers (callers in this file pass 20 and 5); working: v18, v19
+    uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
+    uaddl v19.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]
+    mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl v19.8h, \arg1\().8b, \arg4\().8b  //src[-1]+src[2]
+    mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun \arg6\().8b, v18.8h, #5 //filtered = (v18 + 16) >> 5, saturated to unsigned 8 bits
+    uaddl  v19.8h, \arg3\().8b, \arg6\().8b //src[1] + filtered
+    rshrn \arg6\().8b, v19.8h, #1 //dst = (src[1] + filtered + 1) >> 1
+//	}	// "\()" ends each argument name so the ".8b"/".8h" suffix is not parsed as part of it
+.endm
+
+.macro	FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//	{	// 6-tap filter on the high 8 lanes, then rounding average with src[1]. arg0..arg5 = src[-2]..src[3]; arg6 = dst_q (upper half written); arg7/arg8 = multipliers (callers in this file pass 20 and 5); working: v18, v19
+    uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
+    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]
+    mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v19.8h, \arg1\().16b, \arg4\().16b  //src[-1]+src[2]
+    mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun2 \arg6\().16b, v18.8h, #5 //filtered (upper 8 bytes) = (v18 + 16) >> 5, saturated
+    uaddl2  v19.8h, \arg3\().16b, \arg6\().16b //src[1] + filtered, upper lanes
+    rshrn2 \arg6\().16b, v19.8h, #1 //upper 8 bytes of dst = (src[1] + filtered + 1) >> 1
+//	}	// "\()" ends each argument name so the ".16b"/".8h" suffix is not parsed as part of it
+.endm
+
+.macro	FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//	{	// 6-tap sum of the low 8 lanes kept at 16 bits (no shift, no narrowing). arg0..arg5 = d_src[-2]..d_src[3]; arg6 = dst_q; arg7/arg8 = multipliers for the (src[0]+src[1]) and (src[-1]+src[2]) terms; working: v31
+    uaddl	\arg6\().8h, \arg0\().8b, \arg5\().8b		//dst_q=src[-2]+src[3]
+    uaddl	v31.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]
+    mla	\arg6\().8h, v31.8h, \arg7\().8h	//dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl	v31.8h, \arg1\().8b, \arg4\().8b	//src[-1]+src[2]
+    mls	\arg6\().8h, v31.8h, \arg8\().8h	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//	}	// "\()" ends each argument name so the ".8b"/".8h" suffix is not parsed as part of it
+.endm
+
+.macro	FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//	{	// 6-tap sum of the high 8 lanes kept at 16 bits (no shift, no narrowing). arg0..arg5 = src[-2]..src[3] as 16-byte vectors; arg6 = dst_q; arg7/arg8 = multipliers for the (src[0]+src[1]) and (src[-1]+src[2]) terms; working: v31
+    uaddl2	\arg6\().8h, \arg0\().16b, \arg5\().16b		//dst_q=src[-2]+src[3]
+    uaddl2	v31.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]
+    mla	\arg6\().8h, v31.8h, \arg7\().8h	//dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl2	v31.8h, \arg1\().16b, \arg4\().16b	//src[-1]+src[2]
+    mls	\arg6\().8h, v31.8h, \arg8\().8h	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//	}	// "\()" ends each argument name so the ".16b"/".8h" suffix is not parsed as part of it
+.endm
+
+.macro	FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
+//	{	// input: arg0 = a, arg1 = b, arg2 = c (16-bit lanes), arg3 = dst_d (low 8 bytes written). arg0 is used as the accumulator and is overwritten
+    sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//a-b
+    sshr	\arg0\().8h, \arg0\().8h, #2			//(a-b)/4
+    sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//(a-b)/4-b
+    add	\arg0\().8h, \arg0\().8h, \arg2\().8h			//(a-b)/4-b+c
+    sshr	\arg0\().8h, \arg0\().8h, #2			//((a-b)/4-b+c)/4
+    add	\arg0\().8h, \arg0\().8h, \arg2\().8h			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun	\arg3\().8b, \arg0\().8h, #6		//(+32)>>6
+//	}	// "\()" ends each argument name so the ".8h"/".8b" suffix is not parsed as part of it
+.endm
+
+.macro	FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
+//	{	// input: arg0 = a, arg1 = b, arg2 = c (16-bit lanes), arg3 = dst_q (upper 8 bytes written, lower 8 kept). arg0 is used as the accumulator and is overwritten
+    sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//a-b
+    sshr	\arg0\().8h, \arg0\().8h, #2			//(a-b)/4
+    sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//(a-b)/4-b
+    add	\arg0\().8h, \arg0\().8h, \arg2\().8h			//(a-b)/4-b+c
+    sshr	\arg0\().8h, \arg0\().8h, #2			//((a-b)/4-b+c)/4
+    add	\arg0\().8h, \arg0\().8h, \arg2\().8h			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun2	\arg3\().16b, \arg0\().8h, #6		//(+32)>>6
+//	}	// "\()" ends each argument name so the ".8h"/".16b" suffix is not parsed as part of it
+.endm
+
+.macro	UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
+//	{	// input: arg0 = q_src[-2:5], arg1 = q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), both 16-bit lanes; output: arg2 = dst_a, arg3 = dst_b, arg4 = dst_c. arg2 and arg3 also serve as temporaries
+    ext	\arg4\().16b, \arg0\().16b, \arg1\().16b, #4		//src[0]
+    ext	\arg3\().16b, \arg0\().16b, \arg1\().16b, #6		//src[1]
+    add	\arg4\().8h, \arg4\().8h, \arg3\().8h					//c=src[0]+src[1]
+
+    ext	\arg3\().16b, \arg0\().16b, \arg1\().16b, #2		//src[-1]
+    ext	\arg2\().16b, \arg0\().16b, \arg1\().16b, #8		//src[2]
+    add	\arg3\().8h, \arg3\().8h, \arg2\().8h					//b=src[-1]+src[2]
+
+    ext	\arg2\().16b, \arg0\().16b, \arg1\().16b, #10		//src[3]
+    add	\arg2\().8h, \arg2\().8h, \arg0\().8h					//a=src[-2]+src[3]
+//	}	// ext byte offsets are 2 bytes per 16-bit sample; "\()" ends each argument name before its ".16b"/".8h" suffix
+.endm
+
+.macro	AVERAGE_TWO_8BITS1 arg0, arg1, arg2
+//	{	// rounding average of the low 8 bytes: arg0 = dst_d, arg1 and arg2 = src_d A and B; working: v30
+    uaddl	v30.8h, \arg2\().8b, \arg1\().8b	//widen to 16 bits so A+B cannot overflow
+    rshrn	\arg0\().8b, v30.8h, #1	//dst = (A + B + 1) >> 1
+//	}	// "\()" ends each argument name so the ".8b" suffix is not parsed as part of it
+.endm
+
+.macro	AVERAGE_TWO_8BITS2 arg0, arg1, arg2
+//	{	// rounding average of the high 8 bytes: arg0 = dst_q (upper half written), arg1 and arg2 = src A and B; working: v30
+    uaddl2	v30.8h, \arg2\().16b, \arg1\().16b	//widen to 16 bits so A+B cannot overflow
+    rshrn2	\arg0\().16b, v30.8h, #1	//upper 8 bytes of dst = (A + B + 1) >> 1
+//	}	// "\()" ends each argument name so the ".16b" suffix is not parsed as part of it
+.endm
+
+.macro	FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
+// when width=17/9, used; "\()" ends each argument name so the following ".8b"/".4h" suffix is not parsed as part of it
+//	{	// input: arg0 = src_d{Y[0][1][2][3][4][5]X} (also receives the result), arg1 = per-lane multipliers, arg2 = working vector, arg3 = scalar destination of addv
+    rev64	\arg2\().8b, \arg0\().8b				// X[5][4][3][2][1][0]O
+    uaddl	\arg2\().8h, \arg0\().8b, \arg2\().8b			// each 16bits, *[50][41][32][23][14][05]*
+    mul	\arg2\().4h, \arg2\().4h, \arg1\().4h			// 0+1*[50]-5*[41]+20[32]
+    addv \arg3, \arg2\().4h	// horizontal sum of the four weighted pair sums
+    sqrshrun \arg0\().8b, \arg0\().8h, #5	// NOTE(review): reads arg0, so arg3 is presumably the scalar view of arg0 - confirm at the call sites
+//	}
+.endm
+
+.macro	UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
+//	{	// each 16bits; input: arg0 = d_dst, arg1 = d_src[0:5], arg2 = para, arg3 and arg4 = working, arg5 = d(low part of d_dst); original example operands: v0, v1, v22, v23
+    ext \arg3\().16b, \arg1\().16b, \arg1\().16b, #14       // X[0][1][2][3][4][5]O
+    ext \arg4\().16b, \arg3\().16b, \arg3\().16b, #8      // [3][4][5]OX[0][1][2]
+    rev64  \arg4\().8h, \arg4\().8h			// X[5][4][3][2][1][0]O
+    add   \arg3\().8h, \arg3\().8h, \arg4\().8h    // each 16bits, *[50][41][32][23][14][05]*
+    smull \arg3\().4s, \arg3\().4h, \arg2\().4h			// 0+1*[50]-5*[41]+20[32]
+    saddlv \arg5, \arg3\().4s	// 64-bit sum of the four weighted 32-bit products
+    //sshr \arg0\().2d, \arg0\().2d, #4
+    sqrshrun \arg0\().2s, \arg0\().2d, #10	// (sum + 512) >> 10, saturated; reads arg0, which arg5 presumably aliases - confirm at the call sites
+    uqxtn \arg0\().4h, \arg0\().4s	// saturating narrow 32 -> 16 bits
+    uqxtn \arg0\().8b, \arg0\().8h	// saturating narrow 16 -> 8 bits
+   //	}	// parameters are now declared on the .macro line; the standard "ext Vd.16b, ..." spelling replaces the Apple-only "ext.16b"
+.endm
+#endif
+
+//(const uint8_t* pSrc {x0}, int32_t iSrcStride{w1}, uint8_t* pDst{x2}, int32_t iDstStride{w3}, int32_t iHeight{w4}) - horizontal 6-tap filter, 16 pixels per row; iHeight must be non-zero
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq16_AArch64_neon
+    sxtw x1, w1 //int32_t arguments only define the low 32 bits; widen before use as a 64-bit post-index
+    sxtw x3, w3
+    sub x0, x0, #2 //each row is read starting at src[-2]
+    movi v0.8h, #20, lsl #0 //multiplier for src[0]+src[1]
+    movi v1.8h, #5, lsl #0 //multiplier for src[-1]+src[2]
+w16_h_mc_luma_loop:
+    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 21(16+5); v2=src[-2]; (prfm pldl1strm, [x0] left disabled)
+    trn1 v2.2d, v2.2d, v3.2d //v2 = 16 contiguous bytes: low halves of v2 and v3
+    ext v5.16b, v2.16b, v4.16b, #1    //v5=src[-1]
+    ext v6.16b, v2.16b, v4.16b, #2    //v6=src[0]
+    ext v7.16b, v2.16b, v4.16b, #3    //v7=src[1]
+    ext v16.16b, v2.16b, v4.16b, #4   //v16=src[2]
+    ext v17.16b, v2.16b, v4.16b, #5   //v17=src[3]
+
+    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    sub w4, w4, #1 //count rows in the 32-bit view so the undefined upper half of x4 is never tested
+    st1 {v20.16b}, [x2], x3 //write 16Byte
+    cbnz w4, w16_h_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// Horizontal 6-tap filter, 8 pixels per row. x0 = src, w1 = src stride, x2 = dst, w3 = dst stride, w4 = row count (non-zero); int32_t widths assumed from the argument list documented for McHorVer20WidthEq16
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq8_AArch64_neon
+    sxtw x1, w1 //int32_t arguments only define the low 32 bits; widen before use as a 64-bit post-index
+    sxtw x3, w3
+    sub x0, x0, #2 //each row is read starting at src[-2]
+    movi v0.8h, #20, lsl #0 //multiplier for src[0]+src[1]
+    movi v1.8h, #5, lsl #0 //multiplier for src[-1]+src[2]
+w8_h_mc_luma_loop:
+    ld1 {v2.8b, v3.8b}, [x0], x1 //only use 13(8+5); v2=src[-2]; (prfm pldl1strm, [x0] left disabled)
+    trn1 v2.2d, v2.2d, v3.2d //v2 = 16 contiguous bytes; v4 below is never loaded, but only the low 8 bytes of each ext result (all from v2) are consumed
+    ext v5.16b, v2.16b, v4.16b, #1    //v5=src[-1]
+    ext v6.16b, v2.16b, v4.16b, #2    //v6=src[0]
+    ext v7.16b, v2.16b, v4.16b, #3    //v7=src[1]
+    ext v16.16b, v2.16b, v4.16b, #4   //v16=src[2]
+    ext v17.16b, v2.16b, v4.16b, #5   //v17=src[3]
+    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    sub w4, w4, #1 //count rows in the 32-bit view so the undefined upper half of x4 is never tested
+    st1 {v20.8b}, [x2], x3 //write 8Byte
+    cbnz w4, w8_h_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// Horizontal 6-tap filter, 4 pixels per row, two rows per iteration. x0 = src, w1 = src stride, x2 = dst, w3 = dst stride, w4 = row count (needs iHeight/2 >= 1); int32_t widths assumed from the argument list documented for McHorVer20WidthEq16
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq4_AArch64_neon
+    sxtw x1, w1 //int32_t arguments only define the low 32 bits; widen before use as a 64-bit post-index
+    sxtw x3, w3
+    sub x0, x0, #2 //each row is read starting at src[-2]
+    movi v0.8h, #20, lsl #0 //multiplier for src[0]+src[1]
+    movi v1.8h, #5, lsl #0 //multiplier for src[-1]+src[2]
+    asr w4, w4, #1 //iterations = iHeight / 2, computed on the defined 32-bit view
+w4_h_mc_luma_loop:
+    ld1 {v2.16b}, [x0], x1 //only use 9(4+5); 1st row src[-2:6]; (prfm pldl1strm, [x0] left disabled)
+    ld1 {v3.16b}, [x0], x1 //only use 9(4+5); 2nd row src[-2:6]; (prfm pldl1strm, [x0] left disabled)
+
+    zip1 v4.4s, v2.4s, v3.4s  // v4=src[-2] 1st:2nd
+    ext v17.16b, v4.16b, v4.16b, #8 // v17=src[2:5] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1    //1st row src[-1:6]
+    ext v3.16b, v3.16b, v4.16b, #1    //2nd row src[-1:6]
+    zip1 v5.4s, v2.4s, v3.4s  // v5=src[-1:2] 1st:2nd
+    ext v7.16b, v5.16b, v4.16b, #8    //v7=src[3:6] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1    //1st row src[0:6]
+    ext v3.16b, v3.16b, v4.16b, #1    //2nd row src[0:6]
+    zip1 v6.4s, v2.4s, v3.4s  // v6=src[0:3] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1    //1st row src[1:6]
+    ext v3.16b, v3.16b, v4.16b, #1    //2nd row src[1:6]
+    zip1 v16.4s, v2.4s, v3.4s  // v16=src[1:4] 1st:2nd
+
+    FILTER_6TAG_8BITS1 v4, v5, v6, v16, v17, v7, v20, v0, v1
+    st1 {v20.s}[0], [x2], x3 //write 4Byte (1st row)
+    st1 {v20.s}[1], [x2], x3 //write 4Byte (2nd row)
+    sub w4, w4, #1
+    cbnz w4, w4_h_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// Horizontal 6-tap filter averaged with src[0], 16 pixels per row. x0 = src, w1 = src stride, x2 = dst, w3 = dst stride, w4 = row count (non-zero); int32_t widths assumed from the argument list documented for McHorVer20WidthEq16
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq16_AArch64_neon
+    sxtw x1, w1 //int32_t arguments only define the low 32 bits; widen before use as a 64-bit post-index
+    sxtw x3, w3
+    sub x0, x0, #2 //each row is read starting at src[-2]
+    movi v0.8h, #20, lsl #0 //multiplier for src[0]+src[1]
+    movi v1.8h, #5, lsl #0 //multiplier for src[-1]+src[2]
+w16_xy_10_mc_luma_loop:
+    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 21(16+5); v2=src[-2]; (prfm pldl1strm, [x0] left disabled)
+    trn1 v2.2d, v2.2d, v3.2d //v2 = 16 contiguous bytes: low halves of v2 and v3
+    ext v5.16b, v2.16b, v4.16b, #1    //v5=src[-1]
+    ext v6.16b, v2.16b, v4.16b, #2    //v6=src[0]
+    ext v7.16b, v2.16b, v4.16b, #3    //v7=src[1]
+    ext v16.16b, v2.16b, v4.16b, #4   //v16=src[2]
+    ext v17.16b, v2.16b, v4.16b, #5   //v17=src[3]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    sub w4, w4, #1 //count rows in the 32-bit view so the undefined upper half of x4 is never tested
+    st1 {v20.16b}, [x2], x3 //write 16Byte
+    cbnz w4, w16_xy_10_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// Horizontal 6-tap filter averaged with src[0], 8 pixels per row. x0 = src, w1 = src stride, x2 = dst, w3 = dst stride, w4 = row count (non-zero); int32_t widths assumed from the argument list documented for McHorVer20WidthEq16
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq8_AArch64_neon
+    sxtw x1, w1 //int32_t arguments only define the low 32 bits; widen before use as a 64-bit post-index
+    sxtw x3, w3
+    sub x0, x0, #2 //each row is read starting at src[-2]
+    movi v0.8h, #20, lsl #0 //multiplier for src[0]+src[1]
+    movi v1.8h, #5, lsl #0 //multiplier for src[-1]+src[2]
+w8_xy_10_mc_luma_loop:
+    ld1 {v2.8b, v3.8b}, [x0], x1 //only use 13(8+5); v2=src[-2]; (prfm pldl1strm, [x0] left disabled)
+    trn1 v2.2d, v2.2d, v3.2d //v2 = 16 contiguous bytes; v4 below is never loaded, but only the low 8 bytes of each ext result (all from v2) are consumed
+    ext v5.16b, v2.16b, v4.16b, #1    //v5=src[-1]
+    ext v6.16b, v2.16b, v4.16b, #2    //v6=src[0]
+    ext v7.16b, v2.16b, v4.16b, #3    //v7=src[1]
+    ext v16.16b, v2.16b, v4.16b, #4   //v16=src[2]
+    ext v17.16b, v2.16b, v4.16b, #5   //v17=src[3]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    sub w4, w4, #1 //count rows in the 32-bit view so the undefined upper half of x4 is never tested
+    st1 {v20.8b}, [x2], x3 //write 8Byte
+    cbnz w4, w8_xy_10_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// Horizontal 6-tap filter averaged with src[0], 4 pixels per row, two rows per iteration. x0 = src, w1 = src stride, x2 = dst, w3 = dst stride, w4 = row count (needs iHeight/2 >= 1); int32_t widths assumed from the argument list documented for McHorVer20WidthEq16
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq4_AArch64_neon
+    sxtw x1, w1 //int32_t arguments only define the low 32 bits; widen before use as a 64-bit post-index
+    sxtw x3, w3
+    sub x0, x0, #2 //each row is read starting at src[-2]
+    movi v0.8h, #20, lsl #0 //multiplier for src[0]+src[1]
+    movi v1.8h, #5, lsl #0 //multiplier for src[-1]+src[2]
+    asr w4, w4, #1 //iterations = iHeight / 2, computed on the defined 32-bit view
+w4_xy_10_mc_luma_loop:
+    ld1 {v2.16b}, [x0], x1 //only use 9(4+5); 1st row src[-2:6]; (prfm pldl1strm, [x0] left disabled)
+    ld1 {v3.16b}, [x0], x1 //only use 9(4+5); 2nd row src[-2:6]; (prfm pldl1strm, [x0] left disabled)
+
+    zip1 v4.4s, v2.4s, v3.4s  // v4=src[-2] 1st:2nd
+    ext v17.16b, v4.16b, v4.16b, #8 // v17=src[2:5] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1    //1st row src[-1:6]
+    ext v3.16b, v3.16b, v4.16b, #1    //2nd row src[-1:6]
+    zip1 v5.4s, v2.4s, v3.4s  // v5=src[-1:2] 1st:2nd
+    ext v7.16b, v5.16b, v4.16b, #8    //v7=src[3:6] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1    //1st row src[0:6]
+    ext v3.16b, v3.16b, v4.16b, #1    //2nd row src[0:6]
+    zip1 v6.4s, v2.4s, v3.4s  // v6=src[0:3] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1    //1st row src[1:6]
+    ext v3.16b, v3.16b, v4.16b, #1    //2nd row src[1:6]
+    zip1 v16.4s, v2.4s, v3.4s  // v16=src[1:4] 1st:2nd
+
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v16, v17, v7, v20, v0, v1
+    st1 {v20.s}[0], [x2], x3 //write 4Byte (1st row)
+    st1 {v20.s}[1], [x2], x3 //write 4Byte (2nd row)
+    sub w4, w4, #1
+    cbnz w4, w4_xy_10_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// Horizontal 6-tap filter averaged with src[1], 16 pixels per row. x0 = src, w1 = src stride, x2 = dst, w3 = dst stride, w4 = row count (non-zero); int32_t widths assumed from the argument list documented for McHorVer20WidthEq16
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq16_AArch64_neon
+    sxtw x1, w1 //int32_t arguments only define the low 32 bits; widen before use as a 64-bit post-index
+    sxtw x3, w3
+    sub x0, x0, #2 //each row is read starting at src[-2]
+    movi v0.8h, #20, lsl #0 //multiplier for src[0]+src[1]
+    movi v1.8h, #5, lsl #0 //multiplier for src[-1]+src[2]
+w16_xy_30_mc_luma_loop:
+    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 21(16+5); v2=src[-2]; (prfm pldl1strm, [x0] left disabled)
+    trn1 v2.2d, v2.2d, v3.2d //v2 = 16 contiguous bytes: low halves of v2 and v3
+    ext v5.16b, v2.16b, v4.16b, #1    //v5=src[-1]
+    ext v6.16b, v2.16b, v4.16b, #2    //v6=src[0]
+    ext v7.16b, v2.16b, v4.16b, #3    //v7=src[1]
+    ext v16.16b, v2.16b, v4.16b, #4   //v16=src[2]
+    ext v17.16b, v2.16b, v4.16b, #5   //v17=src[3]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    sub w4, w4, #1 //count rows in the 32-bit view so the undefined upper half of x4 is never tested
+    st1 {v20.16b}, [x2], x3 //write 16Byte
+    cbnz w4, w16_xy_30_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// Horizontal 6-tap filter averaged with src[1], 8 pixels per row. x0 = src, w1 = src stride, x2 = dst, w3 = dst stride, w4 = row count (non-zero); int32_t widths assumed from the argument list documented for McHorVer20WidthEq16
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq8_AArch64_neon
+    sxtw x1, w1 //int32_t arguments only define the low 32 bits; widen before use as a 64-bit post-index
+    sxtw x3, w3
+    sub x0, x0, #2 //each row is read starting at src[-2]
+    movi v0.8h, #20, lsl #0 //multiplier for src[0]+src[1]
+    movi v1.8h, #5, lsl #0 //multiplier for src[-1]+src[2]
+w8_xy_30_mc_luma_loop:
+    ld1 {v2.8b, v3.8b}, [x0], x1 //only use 13(8+5); v2=src[-2]; (prfm pldl1strm, [x0] left disabled)
+    trn1 v2.2d, v2.2d, v3.2d //v2 = 16 contiguous bytes; v4 below is never loaded, but only the low 8 bytes of each ext result (all from v2) are consumed
+    ext v5.16b, v2.16b, v4.16b, #1    //v5=src[-1]
+    ext v6.16b, v2.16b, v4.16b, #2    //v6=src[0]
+    ext v7.16b, v2.16b, v4.16b, #3    //v7=src[1]
+    ext v16.16b, v2.16b, v4.16b, #4   //v16=src[2]
+    ext v17.16b, v2.16b, v4.16b, #5   //v17=src[3]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    sub w4, w4, #1 //count rows in the 32-bit view so the undefined upper half of x4 is never tested
+    st1 {v20.8b}, [x2], x3 //write 8Byte
+    cbnz w4, w8_xy_30_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// Horizontal 6-tap filter averaged with src[1], 4 pixels per row, two rows per iteration. x0 = src, w1 = src stride, x2 = dst, w3 = dst stride, w4 = row count (needs iHeight/2 >= 1); int32_t widths assumed from the argument list documented for McHorVer20WidthEq16
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq4_AArch64_neon
+    sxtw x1, w1 //int32_t arguments only define the low 32 bits; widen before use as a 64-bit post-index
+    sxtw x3, w3
+    sub x0, x0, #2 //each row is read starting at src[-2]
+    movi v0.8h, #20, lsl #0 //multiplier for src[0]+src[1]
+    movi v1.8h, #5, lsl #0 //multiplier for src[-1]+src[2]
+    asr w4, w4, #1 //iterations = iHeight / 2, computed on the defined 32-bit view
+w4_xy_30_mc_luma_loop:
+    ld1 {v2.16b}, [x0], x1 //only use 9(4+5); 1st row src[-2:6]; (prfm pldl1strm, [x0] left disabled)
+    ld1 {v3.16b}, [x0], x1 //only use 9(4+5); 2nd row src[-2:6]; (prfm pldl1strm, [x0] left disabled)
+
+    zip1 v4.4s, v2.4s, v3.4s  // v4=src[-2] 1st:2nd
+    ext v17.16b, v4.16b, v4.16b, #8 // v17=src[2:5] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1    //1st row src[-1:6]
+    ext v3.16b, v3.16b, v4.16b, #1    //2nd row src[-1:6]
+    zip1 v5.4s, v2.4s, v3.4s  // v5=src[-1:2] 1st:2nd
+    ext v7.16b, v5.16b, v4.16b, #8    //v7=src[3:6] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1    //1st row src[0:6]
+    ext v3.16b, v3.16b, v4.16b, #1    //2nd row src[0:6]
+    zip1 v6.4s, v2.4s, v3.4s  // v6=src[0:3] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1    //1st row src[1:6]
+    ext v3.16b, v3.16b, v4.16b, #1    //2nd row src[1:6]
+    zip1 v16.4s, v2.4s, v3.4s  // v16=src[1:4] 1st:2nd
+
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v16, v17, v7, v20, v0, v1
+    st1 {v20.s}[0], [x2], x3 //write 4Byte (1st row)
+    st1 {v20.s}[1], [x2], x3 //write 4Byte (2nd row)
+    sub w4, w4, #1
+    cbnz w4, w4_xy_30_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// Vertical 6-tap filter averaged with the src[0] row, 16 pixels wide, 8 rows per iteration (the loop exits only when the counter hits exactly 0, so iHeight must be a non-zero multiple of 8). x0 = src, w1 = src stride, x2 = dst, w3 = dst stride, w4 = row count; int32_t widths assumed from the argument list documented for McHorVer20WidthEq16
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer01WidthEq16_AArch64_neon
+    sxtw x1, w1 //int32_t arguments only define the low 32 bits; widen before use in 64-bit address arithmetic
+    sxtw x3, w3
+    sub x0, x0, x1, lsl #1 //start two rows above the first output row
+    movi v0.8h, #20, lsl #0 //multiplier for src[0]+src[1]
+    movi v1.8h, #5, lsl #0 //multiplier for src[-1]+src[2]
+
+    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
+
+w16_xy_01_mc_luma_loop:
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] relative to output line 1 (oldest register is reused)
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.16b}, [x0], x1 // v3=src[3*stride] relative to output line 2
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.16b}, [x0], x1 // v4=src[3*stride] relative to output line 3
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.16b}, [x0], x1 // v5=src[3*stride] relative to output line 4
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v6, v7, v2, v3, v4, v5, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v6, v7, v2, v3, v4, v5, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 4 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.16b}, [x0], x1 // v6=src[3*stride] relative to output line 5
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v7, v2, v3, v4, v5, v6, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v7, v2, v3, v4, v5, v6, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] relative to output line 6
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] relative to output line 7
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
+
+    mov v3.16b, v5.16b //rotate the five newest rows (held in v4,v5,v6,v7,v2) back into v2..v6 for the next pass
+    mov v5.16b, v7.16b
+    mov v7.16b, v2.16b //v7 is the temporary holding the newest row
+    mov v2.16b, v4.16b
+    mov v4.16b, v6.16b
+    mov v6.16b, v7.16b
+    sub w4, w4, #8 //count rows in the 32-bit view so the undefined upper half of x4 is never tested
+    cbnz w4, w16_xy_01_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// Vertical 6-tap filter averaged with the src[0] row, 8 pixels wide, 4 rows per iteration (the loop exits only when the counter hits exactly 0, so iHeight must be a non-zero multiple of 4). x0 = src, w1 = src stride, x2 = dst, w3 = dst stride, w4 = row count; int32_t widths assumed from the argument list documented for McHorVer20WidthEq16
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer01WidthEq8_AArch64_neon
+    sxtw x1, w1 //int32_t arguments only define the low 32 bits; widen before use in 64-bit address arithmetic
+    sxtw x3, w3
+    sub x0, x0, x1, lsl #1 //start two rows above the first output row
+    movi v0.8h, #20, lsl #0 //multiplier for src[0]+src[1]
+    movi v1.8h, #5, lsl #0 //multiplier for src[-1]+src[2]
+
+    ld1 {v2.8b}, [x0], x1 // v2=src[-2*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.8b}, [x0], x1 // v3=src[-1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.8b}, [x0], x1 // v4=src[0*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.8b}, [x0], x1 // v5=src[1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.8b}, [x0], x1 // v6=src[2*stride]
+
+w8_xy_01_mc_luma_loop:
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.8b}, [x0], x1 // v2=src[3*stride] relative to output line 1 (oldest register is reused)
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 1 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.8b}, [x0], x1 // v3=src[3*stride] relative to output line 2
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 2 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.8b}, [x0], x1 // v4=src[3*stride] relative to output line 3
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line
+
+    mov v5.16b, v3.16b //rotate the five newest rows (held in v6,v7,v2,v3,v4) back into v2..v6 for the next pass
+    mov v3.16b, v7.16b
+    mov v7.16b, v2.16b //v7 is the temporary holding the middle row
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v7.16b
+    sub w4, w4, #4 //count rows in the 32-bit view so the undefined upper half of x4 is never tested
+    cbnz w4, w8_xy_01_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// McHorVer01WidthEq4_AArch64_neon
+// 4-pixel-wide vertical luma filter. Two 4-byte rows are packed per 64-bit
+// register (lane s[0] = row k, lane s[1] = row k+1) so that one invocation of
+// FILTER_6TAG_8BITS1_AVERAGE_WITH_0 (macro defined outside this chunk) yields
+// two output rows.
+//   x0 = src, x1 = src stride, x2 = dst, x3 = dst stride, x4 = row count
+//   x4 must be a non-zero multiple of 4: the loop exits only on exactly 0.
+//   Clobbers v0-v7, v20, v21 here, plus whatever the filter macro uses.
+// NOTE(review): x1/x3/x4 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer01WidthEq4_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x4, w4
+    sub x0, x0, x1, lsl #1 // start two rows above the first output row
+    movi v0.8h, #20, lsl #0 // halfword constant 20 handed to the filter macro
+    movi v1.8h, #5, lsl #0 // halfword constant 5 handed to the filter macro
+
+    // prime: v2=[-2,-1] v3=[-1,0] v4=[0,1] v5=[1,2] v6=[2,--]
+    ld1 {v2.s}[0], [x0], x1 // v2.s[0]=src[-2*stride]
+    ld1 {v3.s}[0], [x0], x1 // v3.s[0]=src[-1*stride]
+    mov v2.s[1], v3.s[0]
+    ld1 {v4.s}[0], [x0], x1 // v4.s[0]=src[0*stride]
+    mov v3.s[1], v4.s[0]
+    ld1 {v5.s}[0], [x0], x1 // v5.s[0]=src[1*stride]
+    mov v4.s[1], v5.s[0]
+    ld1 {v6.s}[0], [x0], x1 // v6.s[0]=src[2*stride]
+    mov v5.s[1], v6.s[0]
+
+w4_xy_01_mc_luma_loop:
+    // row indices below are relative to the first output row of this pass
+    ld1 {v7.s}[0], [x0], x1 // v7.s[0]=src[3*stride]
+    mov v6.s[1], v7.s[0] // v6=[2,3]
+    ld1 {v7.s}[1], [x0], x1 // v7.s[1]=src[4*stride], v7=[3,4]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line
+    st1 {v20.s}[1], [x2], x3 //write 4Byte : 1 line
+    mov v2.s[0], v7.s[1] // v2.s[0]=row 4
+
+    ld1 {v2.s}[1], [x0], x1 // v2.s[1]=src[5*stride], v2=[4,5]
+    ld1 {v3.s}[1], [x0], x1 // v3.s[1]=src[6*stride]
+    mov v3.s[0], v2.s[1] // v3=[5,6]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line
+    st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line
+    mov v4.s[0], v3.s[1] // v4.s[0]=row 6 (becomes next v6.s[0])
+
+    // rotate to v2=[2,3] v3=[3,4] v4=[4,5] v5=[5,6] v6=[6,--]; v21 is scratch
+    mov v21.8b, v6.8b
+    mov v6.8b, v4.8b
+    mov v4.8b, v2.8b
+    mov v2.8b, v21.8b
+    mov v21.8b, v3.8b
+    mov v3.8b, v7.8b
+    mov v7.8b, v5.8b
+    mov v5.8b, v21.8b
+
+    sub x4, x4, #4
+    cbnz x4, w4_xy_01_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// McHorVer03WidthEq16_AArch64_neon
+// 16-pixel-wide vertical luma filter. Each output row is produced by the
+// FILTER_6TAG_8BITS1_AVERAGE_WITH_1 / FILTER_6TAG_8BITS2_AVERAGE_WITH_1 macro
+// pair (defined outside this chunk; presumably low and high 8 bytes -- confirm
+// in the macro definitions) from a window of six consecutive rows in v2-v7.
+//   x0 = src, x1 = src stride, x2 = dst, x3 = dst stride, x4 = row count
+//   x4 must be a non-zero multiple of 8: the loop exits only on exactly 0.
+//   Clobbers v0-v7, v20 here, plus whatever the filter macros use.
+// NOTE(review): x1/x3/x4 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer03WidthEq16_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x4, w4
+    sub x0, x0, x1, lsl #1 // start two rows above the first output row
+    movi v0.8h, #20, lsl #0 // halfword constant 20 handed to the filter macros
+    movi v1.8h, #5, lsl #0 // halfword constant 5 handed to the filter macros
+
+    // prime the window with rows -2..+2
+    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
+    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
+    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
+    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
+    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
+
+
+w16_xy_03_mc_luma_loop:
+    // row indices below are relative to the first output row of this pass
+    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line
+
+    ld1 {v2.16b}, [x0], x1 // v2=src[4*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line
+
+    ld1 {v3.16b}, [x0], x1 // v3=src[5*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line
+
+    ld1 {v4.16b}, [x0], x1 // v4=src[6*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line
+
+    ld1 {v5.16b}, [x0], x1 // v5=src[7*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v6, v7, v2, v3, v4, v5, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v6, v7, v2, v3, v4, v5, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 4 line
+
+    ld1 {v6.16b}, [x0], x1 // v6=src[8*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v7, v2, v3, v4, v5, v6, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v7, v2, v3, v4, v5, v6, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line
+
+    ld1 {v7.16b}, [x0], x1 // v7=src[9*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line
+
+    ld1 {v2.16b}, [x0], x1 // v2=src[10*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
+
+    // rotate so v2..v6 again hold five consecutive rows (src[6..10]); v7 is scratch
+    mov v3.16b, v5.16b
+    mov v5.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v4.16b
+    mov v4.16b, v6.16b
+    mov v6.16b, v7.16b
+    sub x4, x4, #8
+    cbnz x4, w16_xy_03_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// McHorVer03WidthEq8_AArch64_neon
+// 8-pixel-wide vertical luma filter. Each output row is produced by
+// FILTER_6TAG_8BITS1_AVERAGE_WITH_1 (macro defined outside this chunk) from a
+// sliding window of six consecutive source rows held in v2-v7.
+//   x0 = src, x1 = src stride, x2 = dst, x3 = dst stride, x4 = row count
+//   x4 must be a non-zero multiple of 4: the loop exits only on exactly 0.
+//   Clobbers v0-v7, v20 here, plus whatever the filter macro uses.
+// NOTE(review): x1/x3/x4 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer03WidthEq8_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x4, w4
+    sub x0, x0, x1, lsl #1 // start two rows above the first output row
+    movi v0.8h, #20, lsl #0 // halfword constant 20 handed to the filter macro
+    movi v1.8h, #5, lsl #0 // halfword constant 5 handed to the filter macro
+
+    // prime the window with rows -2..+2
+    ld1 {v2.8b}, [x0], x1 // v2=src[-2*stride]
+    ld1 {v3.8b}, [x0], x1 // v3=src[-1*stride]
+    ld1 {v4.8b}, [x0], x1 // v4=src[0*stride]
+    ld1 {v5.8b}, [x0], x1 // v5=src[1*stride]
+    ld1 {v6.8b}, [x0], x1 // v6=src[2*stride]
+
+
+w8_xy_03_mc_luma_loop:
+    // row indices below are relative to the first output row of this pass
+    ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line
+
+    ld1 {v2.8b}, [x0], x1 // v2=src[4*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 1 line
+
+    ld1 {v3.8b}, [x0], x1 // v3=src[5*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 2 line
+
+    ld1 {v4.8b}, [x0], x1 // v4=src[6*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line
+
+    // rotate so v2..v6 again hold five consecutive rows (src[2..6]); v7 is scratch
+    mov v5.16b, v3.16b
+    mov v3.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v7.16b
+    sub x4, x4, #4
+    cbnz x4, w8_xy_03_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// McHorVer03WidthEq4_AArch64_neon
+// 4-pixel-wide vertical luma filter. Two 4-byte rows are packed per 64-bit
+// register (lane s[0] = row k, lane s[1] = row k+1) so that one invocation of
+// FILTER_6TAG_8BITS1_AVERAGE_WITH_1 (macro defined outside this chunk) yields
+// two output rows.
+//   x0 = src, x1 = src stride, x2 = dst, x3 = dst stride, x4 = row count
+//   x4 must be a non-zero multiple of 4: the loop exits only on exactly 0.
+//   Clobbers v0-v7, v20, v21 here, plus whatever the filter macro uses.
+// NOTE(review): x1/x3/x4 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer03WidthEq4_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x4, w4
+    sub x0, x0, x1, lsl #1 // start two rows above the first output row
+    movi v0.8h, #20, lsl #0 // halfword constant 20 handed to the filter macro
+    movi v1.8h, #5, lsl #0 // halfword constant 5 handed to the filter macro
+
+    // prime: v2=[-2,-1] v3=[-1,0] v4=[0,1] v5=[1,2] v6=[2,--]
+    ld1 {v2.s}[0], [x0], x1 // v2.s[0]=src[-2*stride]
+    ld1 {v3.s}[0], [x0], x1 // v3.s[0]=src[-1*stride]
+    mov v2.s[1], v3.s[0]
+    ld1 {v4.s}[0], [x0], x1 // v4.s[0]=src[0*stride]
+    mov v3.s[1], v4.s[0]
+    ld1 {v5.s}[0], [x0], x1 // v5.s[0]=src[1*stride]
+    mov v4.s[1], v5.s[0]
+    ld1 {v6.s}[0], [x0], x1 // v6.s[0]=src[2*stride]
+    mov v5.s[1], v6.s[0]
+
+w4_xy_03_mc_luma_loop:
+    // row indices below are relative to the first output row of this pass
+    ld1 {v7.s}[0], [x0], x1 // v7.s[0]=src[3*stride]
+    mov v6.s[1], v7.s[0] // v6=[2,3]
+    ld1 {v7.s}[1], [x0], x1 // v7.s[1]=src[4*stride], v7=[3,4]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line
+    st1 {v20.s}[1], [x2], x3 //write 4Byte : 1 line
+    mov v2.s[0], v7.s[1] // v2.s[0]=row 4
+
+    ld1 {v2.s}[1], [x0], x1 // v2.s[1]=src[5*stride], v2=[4,5]
+    ld1 {v3.s}[1], [x0], x1 // v3.s[1]=src[6*stride]
+    mov v3.s[0], v2.s[1] // v3=[5,6]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line
+    st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line
+    mov v4.s[0], v3.s[1] // v4.s[0]=row 6 (becomes next v6.s[0])
+
+    // rotate to v2=[2,3] v3=[3,4] v4=[4,5] v5=[5,6] v6=[6,--]; v21 is scratch
+    mov v21.8b, v6.8b
+    mov v6.8b, v4.8b
+    mov v4.8b, v2.8b
+    mov v2.8b, v21.8b
+    mov v21.8b, v3.8b
+    mov v3.8b, v7.8b
+    mov v7.8b, v5.8b
+    mov v5.8b, v21.8b
+
+    sub x4, x4, #4
+    cbnz x4, w4_xy_03_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// McHorVer02WidthEq16_AArch64_neon
+// 16-pixel-wide vertical luma filter. Each output row is produced by the
+// FILTER_6TAG_8BITS1 / FILTER_6TAG_8BITS2 macro pair (defined outside this
+// chunk; presumably low and high 8 bytes -- confirm in the macro definitions)
+// from a window of six consecutive source rows held in v2-v7.
+//   x0 = src, x1 = src stride, x2 = dst, x3 = dst stride, x4 = row count
+//   x4 must be a non-zero multiple of 8: the loop exits only on exactly 0.
+//   Clobbers v0-v7, v20 here, plus whatever the filter macros use.
+// NOTE(review): x1/x3/x4 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02WidthEq16_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x4, w4
+    sub x0, x0, x1, lsl #1 // start two rows above the first output row
+    movi v0.8h, #20, lsl #0 // halfword constant 20 handed to the filter macros
+    movi v1.8h, #5, lsl #0 // halfword constant 5 handed to the filter macros
+
+    // prime the window with rows -2..+2
+    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
+    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
+    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
+    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
+    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
+
+
+w16_xy_02_mc_luma_loop:
+    // row indices below are relative to the first output row of this pass
+    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line
+
+    ld1 {v2.16b}, [x0], x1 // v2=src[4*stride]
+    FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line
+
+    ld1 {v3.16b}, [x0], x1 // v3=src[5*stride]
+    FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    FILTER_6TAG_8BITS2 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line
+
+    ld1 {v4.16b}, [x0], x1 // v4=src[6*stride]
+    FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    FILTER_6TAG_8BITS2 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line
+
+    ld1 {v5.16b}, [x0], x1 // v5=src[7*stride]
+    FILTER_6TAG_8BITS1 v6, v7, v2, v3, v4, v5, v20, v0, v1
+    FILTER_6TAG_8BITS2 v6, v7, v2, v3, v4, v5, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 4 line
+
+    ld1 {v6.16b}, [x0], x1 // v6=src[8*stride]
+    FILTER_6TAG_8BITS1 v7, v2, v3, v4, v5, v6, v20, v0, v1
+    FILTER_6TAG_8BITS2 v7, v2, v3, v4, v5, v6, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line
+
+    ld1 {v7.16b}, [x0], x1 // v7=src[9*stride]
+    FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line
+
+    ld1 {v2.16b}, [x0], x1 // v2=src[10*stride]
+    FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
+
+    // rotate so v2..v6 again hold five consecutive rows (src[6..10]); v7 is scratch
+    mov v3.16b, v5.16b
+    mov v5.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v4.16b
+    mov v4.16b, v6.16b
+    mov v6.16b, v7.16b
+    sub x4, x4, #8
+    cbnz x4, w16_xy_02_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// McHorVer02WidthEq8_AArch64_neon
+// 8-pixel-wide vertical luma filter. Each output row is produced by
+// FILTER_6TAG_8BITS1 (macro defined outside this chunk) from a sliding window
+// of six consecutive source rows held in v2-v7.
+//   x0 = src, x1 = src stride, x2 = dst, x3 = dst stride, x4 = row count
+//   x4 must be a non-zero multiple of 4: the loop exits only on exactly 0.
+//   Clobbers v0-v7, v20 here, plus whatever the filter macro uses.
+// NOTE(review): x1/x3/x4 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02WidthEq8_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x4, w4
+    sub x0, x0, x1, lsl #1 // start two rows above the first output row
+    movi v0.8h, #20, lsl #0 // halfword constant 20 handed to the filter macro
+    movi v1.8h, #5, lsl #0 // halfword constant 5 handed to the filter macro
+
+    // prime the window with rows -2..+2
+    ld1 {v2.8b}, [x0], x1 // v2=src[-2*stride]
+    ld1 {v3.8b}, [x0], x1 // v3=src[-1*stride]
+    ld1 {v4.8b}, [x0], x1 // v4=src[0*stride]
+    ld1 {v5.8b}, [x0], x1 // v5=src[1*stride]
+    ld1 {v6.8b}, [x0], x1 // v6=src[2*stride]
+
+
+w8_xy_02_mc_luma_loop:
+    // row indices below are relative to the first output row of this pass
+    ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line
+
+    ld1 {v2.8b}, [x0], x1 // v2=src[4*stride]
+    FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 1 line
+
+    ld1 {v3.8b}, [x0], x1 // v3=src[5*stride]
+    FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 2 line
+
+    ld1 {v4.8b}, [x0], x1 // v4=src[6*stride]
+    FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line
+
+    // rotate so v2..v6 again hold five consecutive rows (src[2..6]); v7 is scratch
+    mov v5.16b, v3.16b
+    mov v3.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v7.16b
+    sub x4, x4, #4
+    cbnz x4, w8_xy_02_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// McHorVer02WidthEq4_AArch64_neon
+// 4-pixel-wide vertical luma filter. Two 4-byte rows are packed per 64-bit
+// register (lane s[0] = row k, lane s[1] = row k+1) so that one invocation of
+// FILTER_6TAG_8BITS1 (macro defined outside this chunk) yields two output rows.
+//   x0 = src, x1 = src stride, x2 = dst, x3 = dst stride, x4 = row count
+//   x4 must be a non-zero multiple of 4: the loop exits only on exactly 0.
+//   Clobbers v0-v7, v20, v21 here, plus whatever the filter macro uses.
+// NOTE(review): x1/x3/x4 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02WidthEq4_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x4, w4
+    sub x0, x0, x1, lsl #1 // start two rows above the first output row
+    movi v0.8h, #20, lsl #0 // halfword constant 20 handed to the filter macro
+    movi v1.8h, #5, lsl #0 // halfword constant 5 handed to the filter macro
+
+    // prime: v2=[-2,-1] v3=[-1,0] v4=[0,1] v5=[1,2] v6=[2,--]
+    ld1 {v2.s}[0], [x0], x1 // v2.s[0]=src[-2*stride]
+    ld1 {v3.s}[0], [x0], x1 // v3.s[0]=src[-1*stride]
+    mov v2.s[1], v3.s[0]
+    ld1 {v4.s}[0], [x0], x1 // v4.s[0]=src[0*stride]
+    mov v3.s[1], v4.s[0]
+    ld1 {v5.s}[0], [x0], x1 // v5.s[0]=src[1*stride]
+    mov v4.s[1], v5.s[0]
+    ld1 {v6.s}[0], [x0], x1 // v6.s[0]=src[2*stride]
+    mov v5.s[1], v6.s[0]
+
+w4_xy_02_mc_luma_loop:
+    // row indices below are relative to the first output row of this pass
+    ld1 {v7.s}[0], [x0], x1 // v7.s[0]=src[3*stride]
+    mov v6.s[1], v7.s[0] // v6=[2,3]
+    ld1 {v7.s}[1], [x0], x1 // v7.s[1]=src[4*stride], v7=[3,4]
+    FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line
+    st1 {v20.s}[1], [x2], x3 //write 4Byte : 1 line
+    mov v2.s[0], v7.s[1] // v2.s[0]=row 4
+
+    ld1 {v2.s}[1], [x0], x1 // v2.s[1]=src[5*stride], v2=[4,5]
+    ld1 {v3.s}[1], [x0], x1 // v3.s[1]=src[6*stride]
+    mov v3.s[0], v2.s[1] // v3=[5,6]
+    FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line
+    st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line
+    mov v4.s[0], v3.s[1] // v4.s[0]=row 6 (becomes next v6.s[0])
+
+    // rotate to v2=[2,3] v3=[3,4] v4=[4,5] v5=[5,6] v6=[6,--]; v21 is scratch
+    mov v21.8b, v6.8b
+    mov v6.8b, v4.8b
+    mov v4.8b, v2.8b
+    mov v2.8b, v21.8b
+    mov v21.8b, v3.8b
+    mov v3.8b, v7.8b
+    mov v7.8b, v5.8b
+    mov v5.8b, v21.8b
+
+    sub x4, x4, #4
+    cbnz x4, w4_xy_02_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// McHorVer22WidthEq16_AArch64_neon
+// 16-pixel-wide luma filter applied vertically and then horizontally.
+// Each source row is read as three 8-byte registers (24 bytes starting at
+// src-2). FILTER_6TAG_8BITS_TO_16BITS1 filters six rows vertically into 16-bit
+// lanes; UNPACK_2_16BITS_TO_ABC + FILTER_3_IN_16BITS_TO_8BITS1/2 then produce
+// the low/high 8 output bytes in v26. All macros are defined outside this chunk.
+//   x0 = src, x1 = src stride, x2 = dst, x3 = dst stride, x4 = row count
+//   x4 must be a non-zero multiple of 8: the loop exits only on exactly 0.
+//   Row window: (v2,v3,v4) (v5,v6,v7) (v8,v9,v10) (v11,v12,v13) (v14,v15,v16)
+//   (v17,v18,v19); v20-v26 and v30 are scratch.
+//   v8-v15 are overwritten, so d8-d15 (callee-saved low halves under AAPCS64)
+//   are spilled on entry and restored before return.
+// NOTE(review): x1/x3/x4 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22WidthEq16_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x4, w4
+    stp d8, d9, [sp,#-16]!
+    stp d10, d11, [sp,#-16]!
+    stp d12, d13, [sp,#-16]!
+    stp d14, d15, [sp,#-16]!
+    sub x0, x0, #2 // two pixels to the left
+    sub x0, x0, x1, lsl #1 // and two rows up
+    movi v0.8h, #20, lsl #0 // halfword constant 20 handed to the filter macros
+    movi v1.8h, #5, lsl #0 // halfword constant 5 handed to the filter macros
+
+    // prime the window with rows -2..+2
+    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2..v4=src[-2*stride]
+    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v5..v7=src[-1*stride]
+    ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v8..v10=src[0*stride]
+    ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v11..v13=src[1*stride]
+    ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v14..v16=src[2*stride]
+
+w16_hv_mc_luma_loop:
+    // row indices below are relative to the first output row of this pass
+    ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17..v19=src[3*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
+    // horizontal filter
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x3 //write 16Byte : 0 line
+
+    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2..v4=src[4*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
+    // horizontal filter
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x3 //write 16Byte : 1 line
+
+    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v5..v7=src[5*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1
+    // horizontal filter
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x3 //write 16Byte : 2 line
+
+    ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v8..v10=src[6*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1
+    // horizontal filter
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x3 //write 16Byte : 3 line
+
+    ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v11..v13=src[7*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1
+    // horizontal filter
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x3 //write 16Byte : 4 line
+
+    ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v14..v16=src[8*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1
+    // horizontal filter
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x3 //write 16Byte : 5 line
+
+    ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17..v19=src[9*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
+    // horizontal filter
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x3 //write 16Byte : 6 line
+
+    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2..v4=src[10*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
+    // horizontal filter
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x3 //write 16Byte : 7 line
+
+    // rotate each of the three register columns so the first five window
+    // slots again hold consecutive rows (src[6..10]); v30 is scratch
+    mov v5.16b, v11.16b
+    mov v11.16b, v17.16b
+    mov v30.16b, v2.16b
+    mov v2.16b, v8.16b
+    mov v8.16b, v14.16b
+    mov v14.16b, v30.16b
+
+    mov v6.16b, v12.16b
+    mov v12.16b, v18.16b
+    mov v30.16b, v3.16b
+    mov v3.16b, v9.16b
+    mov v9.16b, v15.16b
+    mov v15.16b, v30.16b
+
+    mov v7.16b, v13.16b
+    mov v13.16b, v19.16b
+    mov v30.16b, v4.16b
+    mov v4.16b, v10.16b
+    mov v10.16b, v16.16b
+    mov v16.16b, v30.16b
+
+    sub x4, x4, #8
+    cbnz x4, w16_hv_mc_luma_loop
+
+    // restore callee-saved d8-d15 in reverse push order
+    ldp d14, d15, [sp], #16
+    ldp d12, d13, [sp], #16
+    ldp d10, d11, [sp], #16
+    ldp d8, d9, [sp], #16
+WELS_ASM_ARCH64_FUNC_END
+
+// McHorVer22WidthEq8_AArch64_neon
+// 8-pixel-wide luma filter applied vertically and then horizontally.
+// Each source row is read as one 16-byte register starting at src-2.
+// FILTER_6TAG_8BITS_TO_16BITS1/2 filter six rows vertically into 16-bit lanes
+// (v20/v21); UNPACK_2_16BITS_TO_ABC + FILTER_3_IN_16BITS_TO_8BITS1 then
+// produce 8 output bytes in v26. All macros are defined outside this chunk.
+//   x0 = src, x1 = src stride, x2 = dst, x3 = dst stride, x4 = row count
+//   x4 must be a non-zero multiple of 4: the loop exits only on exactly 0.
+//   v2-v7 = six-row window; v20-v26 and v30 are scratch.
+// NOTE(review): x1/x3/x4 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22WidthEq8_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x4, w4
+    sub x0, x0, #2 // two pixels to the left
+    sub x0, x0, x1, lsl #1 // and two rows up
+    movi v0.8h, #20, lsl #0 // halfword constant 20 handed to the filter macros
+    movi v1.8h, #5, lsl #0 // halfword constant 5 handed to the filter macros
+
+    // prime the window with rows -2..+2
+    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
+    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
+    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
+    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
+    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
+
+w8_hv_mc_luma_loop:
+    // row indices below are relative to the first output row of this pass
+    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
+    // horizontal filter
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    st1 {v26.8b}, [x2], x3 //write 8Byte : 0 line
+
+    ld1 {v2.16b}, [x0], x1 // v2=src[4*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
+    // horizontal filter
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    st1 {v26.8b}, [x2], x3 //write 8Byte : 1 line
+
+    ld1 {v3.16b}, [x0], x1 // v3=src[5*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
+    // horizontal filter
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    st1 {v26.8b}, [x2], x3 //write 8Byte : 2 line
+
+    ld1 {v4.16b}, [x0], x1 // v4=src[6*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
+    // horizontal filter
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    st1 {v26.8b}, [x2], x3 //write 8Byte : 3 line
+
+
+    // rotate so v2..v6 again hold five consecutive rows (src[2..6]); v30 is scratch
+    mov v5.16b, v3.16b
+    mov v3.16b, v7.16b
+    mov v30.16b, v2.16b
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v30.16b
+
+    sub x4, x4, #4
+    cbnz x4, w8_hv_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// McHorVer22WidthEq4_AArch64_neon
+// 4-pixel-wide luma filter applied vertically and then horizontally.
+// Each source row is read as one 16-byte register starting at src-2. Two rows
+// are vertically filtered (v20/v21 and v22/v23), their unpacked operands are
+// merged with zip1 so that a single FILTER_3_IN_16BITS_TO_8BITS1 call yields
+// both 4-byte output rows in v27. All macros are defined outside this chunk.
+//   x0 = src, x1 = src stride, x2 = dst, x3 = dst stride, x4 = row count
+//   x4 must be a non-zero multiple of 4: the loop exits only on exactly 0.
+//   v2-v7 = six-row window; v20-v30 are scratch.
+// NOTE(review): x1/x3/x4 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22WidthEq4_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x4, w4
+    sub x0, x0, #2 // two pixels to the left
+    sub x0, x0, x1, lsl #1 // and two rows up
+    movi v0.8h, #20, lsl #0 // halfword constant 20 handed to the filter macros
+    movi v1.8h, #5, lsl #0 // halfword constant 5 handed to the filter macros
+
+    // prime the window with rows -2..+2
+    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
+    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
+    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
+    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
+    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
+
+w4_hv_mc_luma_loop:
+    // row indices below are relative to the first output row of this pass
+    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
+    // vertical filtered into v20/v21 1st line
+    FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
+    ld1 {v2.16b}, [x0], x1 // v2=src[4*stride]
+    // vertical filtered into v22/v23 2nd line
+    FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v22, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v23, v0, v1
+    // horizontal filter: low 64 bits = 1st line, high 64 bits = 2nd line
+    UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26
+    UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30
+    zip1 v24.2d, v24.2d, v28.2d
+    zip1 v25.2d, v25.2d, v29.2d
+    zip1 v26.2d, v26.2d, v30.2d
+    FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
+    st1 {v27.s}[0], [x2], x3 //write 4Byte : 0 line
+    st1 {v27.s}[1], [x2], x3 //write 4Byte : 1 line
+
+
+    ld1 {v3.16b}, [x0], x1 // v3=src[5*stride]
+    // vertical filtered into v20/v21 3rd line
+    FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
+    ld1 {v4.16b}, [x0], x1 // v4=src[6*stride]
+    // vertical filtered into v22/v23 4th line
+    FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v22, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v23, v0, v1
+    // horizontal filter: low 64 bits = 3rd line, high 64 bits = 4th line
+    UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26
+    UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30
+    zip1 v24.2d, v24.2d, v28.2d
+    zip1 v25.2d, v25.2d, v29.2d
+    zip1 v26.2d, v26.2d, v30.2d
+    FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
+    st1 {v27.s}[0], [x2], x3 //write 4Byte : 2 line
+    st1 {v27.s}[1], [x2], x3 //write 4Byte : 3 line
+
+    // rotate so v2..v6 again hold five consecutive rows (src[2..6]); v30 is scratch
+    mov v5.16b, v3.16b
+    mov v3.16b, v7.16b
+    mov v30.16b, v2.16b
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v30.16b
+
+    sub x4, x4, #4
+    cbnz x4, w4_hv_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// McCopyWidthEq16_AArch64_neon
+// Copies a 16-byte-wide block, two rows per loop iteration.
+//   x0 = src, x1 = src stride, x2 = dst, x3 = dst stride, x4 = row count
+//   x4 must be a non-zero multiple of 2: the loop exits only on exactly 0.
+//   Clobbers v0, v1.
+// NOTE(review): x1/x3/x4 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq16_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x4, w4
+w16_copy_loop:
+    ld1 {v0.16b}, [x0], x1 //read 16Byte : 0 line
+    st1 {v0.16b}, [x2], x3 //write 16Byte : 0 line
+    ld1 {v1.16b}, [x0], x1 //read 16Byte : 1 line
+    st1 {v1.16b}, [x2], x3 //write 16Byte : 1 line
+
+    sub x4, x4, #2
+    cbnz x4, w16_copy_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// McCopyWidthEq8_AArch64_neon
+// Copies an 8-byte-wide block, two rows per loop iteration.
+//   x0 = src, x1 = src stride, x2 = dst, x3 = dst stride, x4 = row count
+//   x4 must be a non-zero multiple of 2: the loop exits only on exactly 0.
+//   Clobbers v0, v1.
+// NOTE(review): x1/x3/x4 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq8_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x4, w4
+w8_copy_loop:
+    ld1 {v0.8b}, [x0], x1 //read 8Byte : 0 line
+    st1 {v0.8b}, [x2], x3 //write 8Byte : 0 line
+    ld1 {v1.8b}, [x0], x1 //read 8Byte : 1 line
+    st1 {v1.8b}, [x2], x3 //write 8Byte : 1 line
+
+    sub x4, x4, #2
+    cbnz x4, w8_copy_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// McCopyWidthEq4_AArch64_neon
+// Copies a 4-byte-wide block, two rows per loop iteration, via 32-bit lane
+// loads/stores.
+//   x0 = src, x1 = src stride, x2 = dst, x3 = dst stride, x4 = row count
+//   x4 must be a non-zero multiple of 2: the loop exits only on exactly 0.
+//   Clobbers lane s[0] of v0 and v1.
+// NOTE(review): x1/x3/x4 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq4_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x4, w4
+w4_copy_loop:
+    ld1 {v0.s}[0], [x0], x1 //read 4Byte : 0 line
+    st1 {v0.s}[0], [x2], x3 //write 4Byte : 0 line
+    ld1 {v1.s}[0], [x0], x1 //read 4Byte : 1 line
+    st1 {v1.s}[0], [x2], x3 //write 4Byte : 1 line
+
+    sub x4, x4, #2
+    cbnz x4, w4_copy_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// PixStrideAvgWidthEq16_AArch64_neon
+// Combines two 16-byte-wide sources row by row with the AVERAGE_TWO_8BITS1 /
+// AVERAGE_TWO_8BITS2 macro pair (defined outside this chunk; presumably low
+// and high 8 bytes -- confirm in the macro definitions), four rows per pass.
+//   x0 = dst, x1 = dst stride
+//   x2 = src0, x3 = src0 stride
+//   x4 = src1, x5 = src1 stride
+//   x6 = row count; must be a non-zero multiple of 4 (loop exits only on 0)
+//   Clobbers v0-v7, v16 here, plus whatever the average macros use.
+// NOTE(review): x1/x3/x5/x6 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq16_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x5, w5
+    sxtw x6, w6
+
+enc_w16_pix_avg_loop:
+    ld1 {v0.16b}, [x2], x3 //read 16Byte : src0: 0 line
+    ld1 {v1.16b}, [x4], x5 //read 16Byte : src1: 0 line
+    ld1 {v2.16b}, [x2], x3 //read 16Byte : src0: 1 line
+    ld1 {v3.16b}, [x4], x5 //read 16Byte : src1: 1 line
+    ld1 {v4.16b}, [x2], x3 //read 16Byte : src0: 2 line
+    ld1 {v5.16b}, [x4], x5 //read 16Byte : src1: 2 line
+    ld1 {v6.16b}, [x2], x3 //read 16Byte : src0: 3 line
+    ld1 {v7.16b}, [x4], x5 //read 16Byte : src1: 3 line
+
+    AVERAGE_TWO_8BITS1 v16, v0, v1
+    AVERAGE_TWO_8BITS2 v16, v0, v1
+    st1 {v16.16b}, [x0], x1 //write 16Byte : 0 line
+
+    AVERAGE_TWO_8BITS1 v16, v2, v3
+    AVERAGE_TWO_8BITS2 v16, v2, v3
+    st1 {v16.16b}, [x0], x1 //write 16Byte : 1 line
+
+    AVERAGE_TWO_8BITS1 v16, v4, v5
+    AVERAGE_TWO_8BITS2 v16, v4, v5
+    st1 {v16.16b}, [x0], x1 //write 16Byte : 2 line
+
+    AVERAGE_TWO_8BITS1 v16, v6, v7
+    AVERAGE_TWO_8BITS2 v16, v6, v7
+    st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line
+
+    sub x6, x6, #4
+    cbnz x6, enc_w16_pix_avg_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// PixStrideAvgWidthEq8_AArch64_neon
+// Combines two 8-byte-wide sources row by row with the AVERAGE_TWO_8BITS1
+// macro (defined outside this chunk), four rows per pass.
+//   x0 = dst, x1 = dst stride
+//   x2 = src0, x3 = src0 stride
+//   x4 = src1, x5 = src1 stride
+//   x6 = row count; must be a non-zero multiple of 4 (loop exits only on 0)
+//   Clobbers v0-v7, v16 here, plus whatever the average macro uses.
+// NOTE(review): x1/x3/x5/x6 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq8_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x5, w5
+    sxtw x6, w6
+enc_w8_pix_avg_loop:
+    ld1 {v0.8b}, [x2], x3 //read 8Byte : src0: 0 line
+    ld1 {v1.8b}, [x4], x5 //read 8Byte : src1: 0 line
+    ld1 {v2.8b}, [x2], x3 //read 8Byte : src0: 1 line
+    ld1 {v3.8b}, [x4], x5 //read 8Byte : src1: 1 line
+    ld1 {v4.8b}, [x2], x3 //read 8Byte : src0: 2 line
+    ld1 {v5.8b}, [x4], x5 //read 8Byte : src1: 2 line
+    ld1 {v6.8b}, [x2], x3 //read 8Byte : src0: 3 line
+    ld1 {v7.8b}, [x4], x5 //read 8Byte : src1: 3 line
+
+    AVERAGE_TWO_8BITS1 v16, v0, v1
+    st1 {v16.8b}, [x0], x1 //write 8Byte : 0 line
+
+    AVERAGE_TWO_8BITS1 v16, v2, v3
+    st1 {v16.8b}, [x0], x1 //write 8Byte : 1 line
+
+    AVERAGE_TWO_8BITS1 v16, v4, v5
+    st1 {v16.8b}, [x0], x1 //write 8Byte : 2 line
+
+    AVERAGE_TWO_8BITS1 v16, v6, v7
+    st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line
+
+    sub x6, x6, #4
+    cbnz x6, enc_w8_pix_avg_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// PixelAvgWidthEq16_AArch64_neon
+// Combines two 16-byte-wide sources row by row with the AVERAGE_TWO_8BITS1 /
+// AVERAGE_TWO_8BITS2 macro pair (defined outside this chunk; presumably low
+// and high 8 bytes -- confirm in the macro definitions), four rows per pass.
+//   x0 = dst, x1 = dst stride
+//   x2 = src0, x3 = src0 stride
+//   x4 = src1, x5 = src1 stride
+//   x6 = row count; must be a non-zero multiple of 4 (loop exits only on 0)
+//   Clobbers v0-v7, v16 here, plus whatever the average macros use.
+// NOTE(review): x1/x3/x5/x6 are assumed to be 32-bit int arguments -- confirm
+// against the C prototype. AAPCS64 leaves the upper 32 bits of such an
+// argument unspecified, so they are sign-extended before any 64-bit use.
+WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq16_AArch64_neon
+    sxtw x1, w1
+    sxtw x3, w3
+    sxtw x5, w5
+    sxtw x6, w6
+w16_pix_avg_loop:
+    ld1 {v0.16b}, [x2], x3 //read 16Byte : src0: 0 line
+    ld1 {v1.16b}, [x4], x5 //read 16Byte : src1: 0 line
+    ld1 {v2.16b}, [x2], x3 //read 16Byte : src0: 1 line
+    ld1 {v3.16b}, [x4], x5 //read 16Byte : src1: 1 line
+    ld1 {v4.16b}, [x2], x3 //read 16Byte : src0: 2 line
+    ld1 {v5.16b}, [x4], x5 //read 16Byte : src1: 2 line
+    ld1 {v6.16b}, [x2], x3 //read 16Byte : src0: 3 line
+    ld1 {v7.16b}, [x4], x5 //read 16Byte : src1: 3 line
+
+    AVERAGE_TWO_8BITS1 v16, v0, v1
+    AVERAGE_TWO_8BITS2 v16, v0, v1
+    st1 {v16.16b}, [x0], x1 //write 16Byte : 0 line
+
+    AVERAGE_TWO_8BITS1 v16, v2, v3
+    AVERAGE_TWO_8BITS2 v16, v2, v3
+    st1 {v16.16b}, [x0], x1 //write 16Byte : 1 line
+
+    AVERAGE_TWO_8BITS1 v16, v4, v5
+    AVERAGE_TWO_8BITS2 v16, v4, v5
+    st1 {v16.16b}, [x0], x1 //write 16Byte : 2 line
+
+    AVERAGE_TWO_8BITS1 v16, v6, v7
+    AVERAGE_TWO_8BITS2 v16, v6, v7
+    st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line
+
+    sub x6, x6, #4
+    cbnz x6, w16_pix_avg_loop
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq8_AArch64_neon
+    // Row-wise combination of two 8-byte-wide sources through AVERAGE_TWO_8BITS1
+    // (macro defined outside this routine; presumably a rounded average).
+    //   x0 / x1 : destination pointer / post-increment per stored row
+    //   x2 / x3 : source 0 pointer / post-increment per loaded row
+    //   x4 / x5 : source 1 pointer / post-increment per loaded row
+    //   x6      : rows left; the loop ends only at exactly 0, 4 rows per pass
+w8_pix_avg_loop:
+    // source 0, rows 0..3 -> v0, v2, v4, v6 (prfm prefetch hints stay disabled)
+    ld1     {v0.8b}, [x2], x3
+    ld1     {v2.8b}, [x2], x3
+    ld1     {v4.8b}, [x2], x3
+    ld1     {v6.8b}, [x2], x3
+    // source 1, rows 0..3 -> v1, v3, v5, v7
+    ld1     {v1.8b}, [x4], x5
+    ld1     {v3.8b}, [x4], x5
+    ld1     {v5.8b}, [x4], x5
+    ld1     {v7.8b}, [x4], x5
+
+    // row 0
+    AVERAGE_TWO_8BITS1  v16, v0, v1
+    st1     {v16.8b}, [x0], x1
+    // row 1
+    AVERAGE_TWO_8BITS1  v16, v2, v3
+    st1     {v16.8b}, [x0], x1
+    // row 2
+    AVERAGE_TWO_8BITS1  v16, v4, v5
+    st1     {v16.8b}, [x0], x1
+    // row 3
+    AVERAGE_TWO_8BITS1  v16, v6, v7
+    st1     {v16.8b}, [x0], x1
+
+    // four rows consumed; repeat until the counter reaches zero
+    sub     x6, x6, #4
+    cbnz    x6, w8_pix_avg_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq4_AArch64_neon
+    // Two 4-byte rows per pass are packed into lanes s[0]/s[1] of one vector per
+    // source and combined with a single AVERAGE_TWO_8BITS1 (macro defined elsewhere).
+    //   x0/x1 dst pointer/step, x2/x3 src0 pointer/step, x4/x5 src1 pointer/step
+    //   x6 rows left; the loop ends only at exactly 0, 2 rows per pass
+w4_pix_avg_loop:
+    // source 0, rows 0 and 1
+    ld1     {v0.s}[0], [x2], x3
+    ld1     {v0.s}[1], [x2], x3
+    // source 1, rows 0 and 1
+    ld1     {v1.s}[0], [x4], x5
+    ld1     {v1.s}[1], [x4], x5
+
+    AVERAGE_TWO_8BITS1  v2, v0, v1
+    st1     {v2.s}[0], [x0], x1     // row 0
+    st1     {v2.s}[1], [x0], x1     // row 1
+    sub     x6, x6, #2
+    cbnz    x6, w4_pix_avg_loop
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon
+    ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] // four weight bytes at [x4], each broadcast: v4=A v5=B v6=C v7=D
+    ld1 {v0.16b}, [x0], x1  // src[x]; 16 bytes are read per row, x0 steps by x1
+    ext v1.16b, v0.16b, v0.16b, #1  // src[x+1]
+w8_mc_chroma_loop: // two output rows of 8 pixels per pass; dst = x2, stepping by x3
+    ld1 {v2.16b}, [x0], x1  // src[x+stride]
+    ext v3.16b, v2.16b, v2.16b, #1  // src[x+stride+1]
+    ld1 {v18.16b}, [x0], x1  // src[x+2*stride]
+    ext v19.16b, v18.16b, v18.16b, #1  // src[x+2*stride+1]
+    // row 0: (A*src[x] + B*src[x+1] + C*src[x+stride] + D*src[x+stride+1] + 32) >> 6
+    umull v16.8h, v0.8b, v4.8b
+    umlal v16.8h, v1.8b, v5.8b
+    umlal v16.8h, v2.8b, v6.8b
+    umlal v16.8h, v3.8b, v7.8b
+    rshrn v17.8b, v16.8h, #6
+    st1 {v17.8b}, [x2], x3
+    // row 1: same weights, one source row further down (v2/v3 upper, v18/v19 lower)
+    // NOTE(review): x1/x3/x5 are used as full 64-bit values; confirm callers extend them if declared 32-bit
+    umull v16.8h, v2.8b, v4.8b
+    umlal v16.8h, v3.8b, v5.8b
+    umlal v16.8h, v18.8b, v6.8b
+    umlal v16.8h, v19.8b, v7.8b
+    rshrn v17.8b, v16.8h, #6
+    st1 {v17.8b}, [x2], x3
+    // keep the last loaded row for the next pass; x5 = rows left and must reach exactly 0
+    mov v0.16b, v18.16b
+    mov v1.16b, v19.16b
+    sub x5, x5, #2
+    cbnz x5, w8_mc_chroma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon
+    ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] // four weight bytes at [x4], each broadcast: v4=A v5=B v6=C v7=D
+    ld1 {v0.8b}, [x0], x1  // src[x]; 8 bytes are read per row, x0 steps by x1
+    ext v1.8b, v0.8b, v0.8b, #1  // src[x+1]
+w4_mc_chroma_loop: // two output rows of 4 pixels per pass; dst = x2, stepping by x3
+    ld1 {v2.8b}, [x0], x1  // src[x+stride]
+    ext v3.8b, v2.8b, v2.8b, #1  // src[x+stride+1]
+    ld1 {v18.8b}, [x0], x1  // src[x+2*stride]
+    ext v19.8b, v18.8b, v18.8b, #1  // src[x+2*stride+1]
+    // pack two rows per vector: lane s[0] = upper row, lane s[1] = the row below it
+    zip1 v0.4s, v0.4s, v2.4s
+    zip1 v1.4s, v1.4s, v3.4s
+    zip1 v2.4s, v2.4s, v18.4s
+    zip1 v3.4s, v3.4s, v19.4s
+    // both rows at once: (A*src[x] + B*src[x+1] + C*src[x+stride] + D*src[x+stride+1] + 32) >> 6
+    umull v16.8h, v0.8b, v4.8b
+    umlal v16.8h, v1.8b, v5.8b
+    umlal v16.8h, v2.8b, v6.8b
+    umlal v16.8h, v3.8b, v7.8b
+    rshrn v17.8b, v16.8h, #6
+    st1 {v17.s}[0], [x2], x3
+    st1 {v17.s}[1], [x2], x3
+    // keep the last loaded row for the next pass; x5 = rows left and must reach exactly 0
+    mov v0.8b, v18.8b
+    mov v1.8b, v19.8b
+    sub x5, x5, #2
+    cbnz x5, w4_mc_chroma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20Width17_AArch64_neon
+    sub x0, x0, #2              // source window starts two bytes to the left
+    sub x3, x3, #16             // each row stores 16 bytes (step x5) then 1 byte (step x3)
+    mov x5, #16
+    movi v0.8h, #20, lsl #0     // constants handed to the FILTER_6TAG macros
+    movi v1.8h, #5, lsl #0
+    ldr q22, filter_para        // operand for FILTER_SINGLE_TAG_8BITS (defined outside this routine)
+w17_h_mc_luma_loop:
+    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 22(17+5); v2=src[-2]
+    trn1 v2.2d, v2.2d, v3.2d    // v2 = first 16 bytes of the row
+    //prfm pldl1strm, [x0]
+    ext v5.16b, v2.16b, v4.16b, #1    //v5=src[-1]
+    ext v6.16b, v2.16b, v4.16b, #2    //v6=src[0]
+    ext v7.16b, v2.16b, v4.16b, #3    //v7=src[1]
+    ext v16.16b, v2.16b, v4.16b, #4   //v16=src[2]
+    ext v17.16b, v2.16b, v4.16b, #5   //v17=src[3]
+    // output bytes 0..15 of the row
+    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    st1 {v20.16b}, [x2], x5 //write 16Byte
+    // output byte 16: rotate the six taps it needs into lanes 1..6
+    ext v21.8b, v4.8b, v4.8b, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
+    FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
+    st1 {v21.b}[0], [x2], x3 //write byte 16 (the 17th pixel)
+
+    sub x4, x4, #1              // x4 = rows left, one row per pass
+    cbnz x4, w17_h_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon
+    sub x0, x0, #2              // source window starts two bytes to the left
+    sub x3, x3, #8              // each row stores 8 bytes (step x5) then 1 byte (step x3)
+    mov x5, #8
+    movi v0.8h, #20, lsl #0     // constants handed to the FILTER_6TAG macro
+    movi v1.8h, #5, lsl #0
+    ldr q22, filter_para        // operand for FILTER_SINGLE_TAG_8BITS (defined outside this routine)
+w9_h_mc_luma_loop:
+    ld1 {v2.8b, v3.8b}, [x0], x1 //only use 14(9+5); v2=src[-2]
+    trn1 v2.2d, v2.2d, v3.2d    // v2 = all 16 loaded bytes of the row
+    // Second ext operand is v2 itself: v4 is never written here, and only the low 8 result lanes are stored.
+    ext v5.16b, v2.16b, v2.16b, #1    //v5=src[-1]
+    ext v6.16b, v2.16b, v2.16b, #2    //v6=src[0]
+    ext v7.16b, v2.16b, v2.16b, #3    //v7=src[1]
+    ext v16.16b, v2.16b, v2.16b, #4   //v16=src[2]
+    ext v17.16b, v2.16b, v2.16b, #5   //v17=src[3]
+    // output bytes 0..7 of the row (assumes FILTER_6TAG_8BITS1 is lane-wise -- TODO confirm)
+    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    st1 {v20.8b}, [x2], x5 //write 8Byte
+    // output byte 8: rotate the six taps it needs into lanes 1..6
+    ext v21.8b, v3.8b, v3.8b, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
+    FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
+    st1 {v21.b}[0], [x2], x3 //write byte 8 (the 9th pixel)
+
+    sub x4, x4, #1              // x4 = rows left, one row per pass
+    cbnz x4, w9_h_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22Width17_AArch64_neon
+    stp d8, d9, [sp,#-16]!     // v8-v15 are used below, so d8-d15 are saved first
+    stp d10, d11, [sp,#-16]!
+    stp d12, d13, [sp,#-16]!
+    stp d14, d15, [sp,#-16]!
+    sub x0, x0, #2             // source window starts two bytes to the left ...
+    sub x0, x0, x1, lsl #1     // ... and two rows up
+    movi v0.8h, #20, lsl #0    // constants handed to the FILTER_6TAG macros
+    movi v1.8h, #5, lsl #0
+    sub x3, x3, #16            // each row stores 16 bytes (step x5) then 1 byte (step x3)
+    mov x5, #16
+    ldr q29, filter_para       // operand for UNPACK_FILTER_SINGLE_TAG_16BITS
+    // x4 = rows to produce; must be 8*k+1: the loop emits 8 rows per pass, the tail one more
+    sub x4, x4, #1
+
+    //prfm pldl1strm, [x0]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[-2*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v5=src[-1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v8=src[0*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v11=src[1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v14=src[2*stride]
+
+w17_hv_mc_luma_loop:
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[3*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 16 : 0 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[4*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 1 line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 16 : 1 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v5=src[5*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 2 line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 16 : 2 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v8=src[6*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 3 line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 16 : 3 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v11=src[7*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 4 line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 16 : 4 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v14=src[8*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 5 line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 16 : 5 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[9*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 6 line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 16 : 6 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[10*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 7 line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 16 : 7 line
+
+    mov v5.16b, v11.16b    // re-pack so v2/v5/v8/v11/v14 hold the five newest rows in order
+    mov v11.16b, v17.16b
+    mov v30.16b, v2.16b    // v30 is the swap temporary
+    mov v2.16b, v8.16b
+    mov v8.16b, v14.16b
+    mov v14.16b, v30.16b
+
+    mov v6.16b, v12.16b    // same rotation for the middle 8 bytes of each row
+    mov v12.16b, v18.16b
+    mov v30.16b, v3.16b
+    mov v3.16b, v9.16b
+    mov v9.16b, v15.16b
+    mov v15.16b, v30.16b
+
+    mov v7.16b, v13.16b    // and for the last 8 bytes of each row
+    mov v13.16b, v19.16b
+    mov v30.16b, v4.16b
+    mov v4.16b, v10.16b
+    mov v10.16b, v16.16b
+    mov v16.16b, v30.16b
+
+    sub x4, x4, #8
+    cbnz x4, w17_hv_mc_luma_loop
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[3*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    // vertical filtered into v21/v22
+    FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
+    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
+    st1 {v26.16b}, [x2], x5 //write 0:15 Byte : last line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 16 : last line
+
+    ldp d14, d15, [sp], #16    // restore in reverse order of the saves
+    ldp d12, d13, [sp], #16
+    ldp d10, d11, [sp], #16
+    ldp d8, d9, [sp], #16
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22Width9_AArch64_neon
+    sub x0, x0, #2             // source window starts two bytes to the left ...
+    sub x0, x0, x1, lsl #1     // ... and two rows up
+    movi v0.8h, #20, lsl #0    // constants handed to the FILTER_6TAG macros
+    movi v1.8h, #5, lsl #0
+    sub x3, x3, #8             // each row stores 8 bytes (step x5) then 1 byte (step x3)
+    mov x5, #8
+    ldr q29, filter_para       // operand for UNPACK_FILTER_SINGLE_TAG_16BITS
+    sub x4, x4, #1             // x4 must be 4*k+1: 4 rows per loop pass plus one tail row
+
+    //prfm pldl1strm, [x0]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
+
+w9_hv_mc_luma_loop:
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 8 : 0 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.16b}, [x0], x1 // v2=src[4*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    st1 {v26.8b}, [x2], x5 //write 0:7Byte : 1 line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 8 : 1 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.16b}, [x0], x1 // v3=src[5*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    st1 {v26.8b}, [x2], x5 //write 0:7Byte : 2 line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 8 : 2 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.16b}, [x0], x1 // v4=src[6*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    st1 {v26.8b}, [x2], x5 //write 0:7Byte : 3 line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 8 : 3 line
+
+    // re-pack so v2..v6 hold the five newest rows in order (v30 is the swap temporary)
+    mov v5.16b, v3.16b
+    mov v3.16b, v7.16b
+    mov v30.16b, v2.16b
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v30.16b
+
+    sub x4, x4, #4
+    cbnz x4, w9_hv_mc_luma_loop
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
+    // vertical filtered into v20/v21
+    FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+    st1 {v26.8b}, [x2], x5 //write 0:7Byte : last line
+    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
+    st1 {v26.b}[0], [x2], x3 //write byte 8 : last line
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02Height17_AArch64_neon
+    sub x0, x0, x1, lsl #1     // source window starts two rows up; x1 = source row step
+    movi v0.8h, #20, lsl #0    // constants handed to the FILTER_6TAG macros
+    movi v1.8h, #5, lsl #0
+    sub x4, x4, #1             // x4 must be 8*k+1: 8 rows per loop pass plus one tail row
+
+    //prfm pldl1strm, [x0]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
+
+    // each step loads one new row over the oldest register and filters the six live rows
+w17_v_mc_luma_loop:
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.16b}, [x0], x1 // v2=src[4*stride]
+    FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.16b}, [x0], x1 // v3=src[5*stride]
+    FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    FILTER_6TAG_8BITS2 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.16b}, [x0], x1 // v4=src[6*stride]
+    FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    FILTER_6TAG_8BITS2 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.16b}, [x0], x1 // v5=src[7*stride]
+    FILTER_6TAG_8BITS1 v6, v7, v2, v3, v4, v5, v20, v0, v1
+    FILTER_6TAG_8BITS2 v6, v7, v2, v3, v4, v5, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 4 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.16b}, [x0], x1 // v6=src[8*stride]
+    FILTER_6TAG_8BITS1 v7, v2, v3, v4, v5, v6, v20, v0, v1
+    FILTER_6TAG_8BITS2 v7, v2, v3, v4, v5, v6, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.16b}, [x0], x1 // v7=src[9*stride]
+    FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.16b}, [x0], x1 // v2=src[10*stride]
+    FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
+
+    mov v3.16b, v5.16b     // re-pack so v2..v6 hold the five newest rows in order
+    mov v5.16b, v7.16b
+    mov v7.16b, v2.16b     // v7 doubles as the swap temporary; it is reloaded next
+    mov v2.16b, v4.16b
+    mov v4.16b, v6.16b
+    mov v6.16b, v7.16b
+    sub x4, x4, #8
+    cbnz x4, w17_v_mc_luma_loop
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : last line
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02Height9_AArch64_neon
+    sub x0, x0, x1, lsl #1     // source window starts two rows up; x1 = source row step
+    movi v0.8h, #20, lsl #0    // constants handed to the FILTER_6TAG macro
+    movi v1.8h, #5, lsl #0
+    sub x4, x4, #1             // x4 must be 4*k+1: 4 rows per loop pass plus one tail row
+
+    //prfm pldl1strm, [x0]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.8b}, [x0], x1 // v2=src[-2*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.8b}, [x0], x1 // v3=src[-1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.8b}, [x0], x1 // v4=src[0*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.8b}, [x0], x1 // v5=src[1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.8b}, [x0], x1 // v6=src[2*stride]
+
+w9_v_mc_luma_loop:
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.8b}, [x0], x1 // v2=src[4*stride]
+    FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 1 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.8b}, [x0], x1 // v3=src[5*stride]
+    FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 2 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.8b}, [x0], x1 // v4=src[6*stride]
+    FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line
+
+    mov v5.16b, v3.16b     // re-pack so v2..v6 hold the five newest rows in order
+    mov v3.16b, v7.16b
+    mov v7.16b, v2.16b     // v7 doubles as the swap temporary; it is reloaded next
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v7.16b
+    sub x4, x4, #4
+    cbnz x4, w9_v_mc_luma_loop
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : last line
+WELS_ASM_ARCH64_FUNC_END
+
+#endif
+
--- a/codec/common/inc/mc_common.h
+++ b/codec/common/inc/mc_common.h
@@ -40,62 +40,173 @@
 #endif//__cplusplus
 
 #if defined(HAVE_NEON)
-void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McCopyWidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 
-void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McCopyWidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 
-void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McCopyWidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 
-void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
+void McChromaWidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                            int32_t* pWeights, int32_t iHeight);
 
-void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
+void McChromaWidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                            int32_t* pWeights, int32_t iHeight);
 
-void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+void PixelAvgWidthEq16_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+void PixelAvgWidthEq8_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+void PixelAvgWidthEq4_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
 
-void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer01WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer01WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer01WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer03WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer03WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer03WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
 
-void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer10WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer10WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer10WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer30WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer30WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer30WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
 
-    //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
-void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+// Horizontal filter that produces the half-sample value, i.e. position (2, 0) in quarter-sample units
+void McHorVer20WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer20WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer20WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
 
-    //vertical filter to gain half sample, that is (0, 2) location in quarter sample
-void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+// Vertical filter that produces the half-sample value, i.e. position (0, 2) in quarter-sample units
+void McHorVer02WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer02WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer02WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
 
-    //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+// Horizontal and vertical filter that produces the half-sample value, i.e. position (2, 2) in quarter-sample units
+void McHorVer22WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer22WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer22WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
 
-void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
-void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+void PixStrideAvgWidthEq16_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
+                                 const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+void PixStrideAvgWidthEq8_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
+                                const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
 
-void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
-void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
+void McHorVer20Width17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             int32_t iHeight);// width+1
+void McHorVer20Width9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                            int32_t iHeight);// width+1
 
-void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
-void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
+void McHorVer02Height17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);// height+1
+void McHorVer02Height9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             int32_t iHeight);// height+1
 
-void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
-void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
+void McHorVer22Width17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             int32_t iHeight);//width+1&&height+1
+void McHorVer22Width9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                            int32_t iHeight);//width+1&&height+1
 #endif
 
+#if defined(HAVE_NEON_AARCH64) // prototypes of the *_AArch64_neon motion-compensation routines
+void McCopyWidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                  int32_t iHeight);
+void McCopyWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                  int32_t iHeight);
+void McCopyWidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                   int32_t iHeight);
+void McChromaWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t* pWeights, int32_t iHeight);
+void McChromaWidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t* pWeights, int32_t iHeight);
+void PixelAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                                     const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+void PixelAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+void PixelAvgWidthEq4_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+void McHorVer01WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer01WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer01WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer03WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer03WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer03WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer10WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer10WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer10WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer30WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer30WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer30WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+// Horizontal filter that produces the half-sample value, i.e. position (2, 0) in quarter-sample units
+void McHorVer20WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer20WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer20WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+// Vertical filter that produces the half-sample value, i.e. position (0, 2) in quarter-sample units
+void McHorVer02WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer02WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer02WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+// Horizontal and vertical filter that produces the half-sample value, i.e. position (2, 2) in quarter-sample units
+void McHorVer22WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer22WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer22WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void PixStrideAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
+    const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+void PixStrideAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
+                                        const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+void McHorVer20Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     int32_t iHeight);// width + 1
+void McHorVer20Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iHeight);// width + 1
+void McHorVer02Height17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);// height + 1
+void McHorVer02Height9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     int32_t iHeight);// height + 1
+void McHorVer22Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     int32_t iHeight);// width + 1 and height + 1
+void McHorVer22Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iHeight);// width + 1 and height + 1
+#endif // HAVE_NEON_AARCH64
+
+
 #if defined(X86_ASM)
 //***************************************************************************//
 //                       MMXEXT definition                                   //
@@ -131,18 +242,21 @@
 void McHorVer22Width8VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
                                         int32_t iWidth, int32_t iHeight);
 void McHorVer22Width8VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
-                                         int32_t iWidth, int32_t iHeight);
+    int32_t iWidth, int32_t iHeight);
 
 void PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
                              const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
 
-void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                int32_t iWidth,
                                 int32_t iHeight);
 
-void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
                                  int32_t iHeight);
 
-void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth,
+void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride,
+                              int32_t iWidth,
                               int32_t iHeight);
 
 //***************************************************************************//
--- /dev/null
+++ b/codec/common/inc/mc_common.h.orig
@@ -1,0 +1,204 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef MC_COMMON_H
+#define MC_COMMON_H
+
+#include "typedefs.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(HAVE_NEON)
+void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
+
+void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
+
+void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+
+void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+    //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
+void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+    //vertical filter to gain half sample, that is (0, 2) location in quarter sample
+void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+    //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
+void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+
+void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
+void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
+
+void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
+void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
+
+void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
+void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
+#endif
+
+#if defined(HAVE_NEON_AARCH64)
+void McCopyWidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McCopyWidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McCopyWidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McChromaWidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
+void McChromaWidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
+void PixelAvgWidthEq16_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+void PixelAvgWidthEq8_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+void PixelAvgWidthEq4_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+void McHorVer01WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer01WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer01WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer03WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer03WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer03WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer10WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer10WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer10WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer30WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer30WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer30WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+    //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
+void McHorVer20WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer20WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer20WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+    //vertical filter to gain half sample, that is (0, 2) location in quarter sample
+void McHorVer02WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+    //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
+void McHorVer22WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer22WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer22WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void PixStrideAvgWidthEq16_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+void PixStrideAvgWidthEq8_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+void McHorVer20Width17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
+void McHorVer20Width9_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
+void McHorVer02Height17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
+void McHorVer02Height9_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
+void McHorVer22Width17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
+void McHorVer22Width9_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
+#endif
+    
+#if defined(X86_ASM)
+//***************************************************************************//
+//                       MMXEXT definition                                   //
+//***************************************************************************//
+void McHorVer20WidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             int32_t iHeight);
+void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                           const uint8_t* kpABCD, int32_t iHeight);
+void McCopyWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                         int32_t iHeight);
+void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                         int32_t iHeight);
+void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                           const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                           const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+
+//***************************************************************************//
+//                       SSE2 definition                                     //
+//***************************************************************************//
+void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                            const uint8_t* kpABCD, int32_t iHeight);
+void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                           int32_t iHeight);
+void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer20WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer02WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer22Width8HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iHeight);
+void McHorVer22Width8VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iWidth, int32_t iHeight);
+void McHorVer22Width8VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+                                         int32_t iWidth, int32_t iHeight);
+
+void PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                             const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+
+void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                int32_t iHeight);
+
+void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                 int32_t iHeight);
+
+void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth,
+                              int32_t iHeight);
+
+//***************************************************************************//
+//                       SSSE3 definition                                    //
+//***************************************************************************//
+
+void McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             const uint8_t* kpABCD, int32_t iHeight);
+
+#endif //X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+#endif//MC_COMMON_H
--- a/codec/decoder/core/src/mc.cpp
+++ b/codec/decoder/core/src/mc.cpp
@@ -85,13 +85,13 @@
 };
 
 typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iWidth, int32_t iHeight);
+                                        int32_t iWidth, int32_t iHeight);
 
 //***************************************************************************//
 //                          C code implementation                            //
 //***************************************************************************//
 static inline void McCopyWidthEq2_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       int32_t iHeight) {
+                                     int32_t iHeight) {
   int32_t i;
   for (i = 0; i < iHeight; i++) { // iWidth == 2 only for chroma
     ST16A2 (pDst, LD16 (pSrc));
@@ -101,7 +101,7 @@
 }
 
 static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       int32_t iHeight) {
+                                     int32_t iHeight) {
   int32_t i;
   for (i = 0; i < iHeight; i++) {
     ST32A4 (pDst, LD32 (pSrc));
@@ -111,7 +111,7 @@
 }
 
 static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       int32_t iHeight) {
+                                     int32_t iHeight) {
   int32_t i;
   for (i = 0; i < iHeight; i++) {
     ST64A8 (pDst, LD64 (pSrc));
@@ -121,7 +121,7 @@
 }
 
 static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight) {
+                                      int32_t iHeight) {
   int32_t i;
   for (i = 0; i < iHeight; i++) {
     ST64A8 (pDst  , LD64 (pSrc));
@@ -138,7 +138,7 @@
   int32_t iPix14 = pSrc[-1] + pSrc[2];
   int32_t iPix23 = pSrc[ 0] + pSrc[1];
 
-  return (iPix05 - (iPix14 * 5)+ (iPix23 * 20));
+  return (iPix05 - (iPix14 * 5) + (iPix23 * 20));
 }
 // h: iOffset=1 / v: iOffset=iSrcStride
 static inline int32_t FilterInput8bitWithStride_c (const uint8_t* pSrc, const int32_t kiOffset) {
@@ -153,7 +153,7 @@
 }
 
 static inline void PixelAvg_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                                 const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
+                               const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
   int32_t i, j;
   for (i = 0; i < iHeight; i++) {
     for (j = 0; j < iWidth; j++) {
@@ -165,7 +165,7 @@
   }
 }
 static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                               int32_t iHeight) {
+                             int32_t iHeight) {
   if (iWidth == 16)
     McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else if (iWidth == 8)
@@ -176,8 +176,9 @@
     McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
 
-static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   int32_t i, j;
   for (i = 0; i < iHeight; i++) {
     for (j = 0; j < iWidth; j++) {
@@ -188,8 +189,9 @@
   }
 }
 
-static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   int32_t i, j;
   for (i = 0; i < iHeight; i++) {
     for (j = 0; j < iWidth; j++) {
@@ -200,8 +202,9 @@
   }
 }
 
-static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   int16_t iTmp[16 + 5]; //16
   int32_t i, j, k;
 
@@ -218,26 +221,30 @@
 }
 
 /////////////////////luma MC//////////////////////////
-static inline void McHorVer01_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer01_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   uint8_t uiTmp[256];
   McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
   PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
 }
-static inline void McHorVer03_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer03_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   uint8_t uiTmp[256];
   McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
   PixelAvg_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, uiTmp, 16, iWidth, iHeight);
 }
-static inline void McHorVer10_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer10_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   uint8_t uiTmp[256];
   McHorVer20_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
   PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
 }
-static inline void McHorVer11_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer11_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   uint8_t uiHorTmp[256];
   uint8_t uiVerTmp[256];
   McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
@@ -244,8 +251,9 @@
   McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
   PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
 }
-static inline void McHorVer12_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer12_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   uint8_t uiVerTmp[256];
   uint8_t uiCtrTmp[256];
   McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
@@ -252,8 +260,9 @@
   McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
   PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
 }
-static inline void McHorVer13_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer13_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   uint8_t uiHorTmp[256];
   uint8_t uiVerTmp[256];
   McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
@@ -260,8 +269,9 @@
   McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
   PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
 }
-static inline void McHorVer21_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer21_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   uint8_t uiHorTmp[256];
   uint8_t uiCtrTmp[256];
   McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
@@ -268,8 +278,9 @@
   McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
   PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
 }
-static inline void McHorVer23_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer23_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   uint8_t uiHorTmp[256];
   uint8_t uiCtrTmp[256];
   McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
@@ -276,14 +287,16 @@
   McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
   PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
 }
-static inline void McHorVer30_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer30_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   uint8_t uiHorTmp[256];
   McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
   PixelAvg_c (pDst, iDstStride, pSrc + 1, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
 }
-static inline void McHorVer31_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer31_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   uint8_t uiHorTmp[256];
   uint8_t uiVerTmp[256];
   McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
@@ -290,8 +303,9 @@
   McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
   PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
 }
-static inline void McHorVer32_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer32_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   uint8_t uiVerTmp[256];
   uint8_t uiCtrTmp[256];
   McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
@@ -298,8 +312,9 @@
   McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
   PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
 }
-static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                   int32_t iHeight) {
+static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
   uint8_t uiHorTmp[256];
   uint8_t uiVerTmp[256];
   McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
@@ -308,7 +323,7 @@
 }
 
 void McLuma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
+               int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
 //pSrc has been added the offset of mv
 {
   static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
@@ -326,7 +341,7 @@
   int32_t i, j;
   int32_t iA, iB, iC, iD;
   const uint8_t* pSrcNext = pSrc + iSrcStride;
-  const uint8_t *pABCD = g_kuiABCD[iMvY & 0x07][iMvX & 0x07];
+  const uint8_t* pABCD = g_kuiABCD[iMvY & 0x07][iMvX & 0x07];
   iA = pABCD[0];
   iB = pABCD[1];
   iC = pABCD[2];
@@ -342,7 +357,7 @@
 }
 
 void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
+                 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
 //pSrc has been added the offset of mv
 {
   const int32_t kiD8x = iMvX & 0x07;
@@ -376,8 +391,9 @@
   McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
 }
 
-static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                  int32_t iHeight) {
+static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                int32_t iWidth,
+                                int32_t iHeight) {
   if (iWidth == 16)
     McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else if (iWidth == 8)
@@ -389,7 +405,7 @@
 }
 
 static inline void McHorVer20_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   if (iWidth == 16)
     McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else if (iWidth == 8)
@@ -399,7 +415,7 @@
 }
 
 static inline void McHorVer02_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   if (iWidth == 16)
     McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else if (iWidth == 8)
@@ -409,7 +425,7 @@
 }
 
 static inline void McHorVer22_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   if (iWidth == 16)
     McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else if (iWidth == 8)
@@ -419,7 +435,7 @@
 }
 
 static inline void McHorVer01_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
   if (iWidth == 16) {
     McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
@@ -433,7 +449,7 @@
   }
 }
 static inline void McHorVer03_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
   if (iWidth == 16) {
     McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
@@ -447,7 +463,7 @@
   }
 }
 static inline void McHorVer10_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
   if (iWidth == 16) {
     McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
@@ -461,7 +477,7 @@
   }
 }
 static inline void McHorVer11_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
   if (iWidth == 16) {
@@ -479,7 +495,7 @@
   }
 }
 static inline void McHorVer12_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
   if (iWidth == 16) {
@@ -497,7 +513,7 @@
   }
 }
 static inline void McHorVer13_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
   if (iWidth == 16) {
@@ -515,7 +531,7 @@
   }
 }
 static inline void McHorVer21_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
   if (iWidth == 16) {
@@ -533,7 +549,7 @@
   }
 }
 static inline void McHorVer23_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
   if (iWidth == 16) {
@@ -551,7 +567,7 @@
   }
 }
 static inline void McHorVer30_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
   if (iWidth == 16) {
     McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
@@ -565,7 +581,7 @@
   }
 }
 static inline void McHorVer31_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
   if (iWidth == 16) {
@@ -583,7 +599,7 @@
   }
 }
 static inline void McHorVer32_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
   ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
   if (iWidth == 16) {
@@ -601,7 +617,7 @@
   }
 }
 static inline void McHorVer33_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iWidth, int32_t iHeight) {
+                                    int32_t iWidth, int32_t iHeight) {
   ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
   ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
   if (iWidth == 16) {
@@ -620,7 +636,7 @@
 }
 
 void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
+                  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
 //pSrc has been added the offset of mv
 {
   static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
@@ -634,9 +650,9 @@
 }
 
 void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
   static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
-				McChromaWidthEq4_mmx,
+    McChromaWidthEq4_mmx,
     McChromaWidthEq8_sse2
   };
   const int32_t kiD8x = iMvX & 0x07;
@@ -656,331 +672,528 @@
 //                       NEON implementation                      //
 //***************************************************************************//
 #if defined(HAVE_NEON)
-void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																						int32_t iWidth, int32_t iHeight)
-{
+void McCopy_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                  int32_t iWidth, int32_t iHeight) {
   if (16 == iWidth)
-				McCopyWidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if(8 == iWidth)
-				McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else
-				McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		}
-void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																				int32_t iWidth, int32_t iHeight)
-{
+    McCopyWidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (8 == iWidth)
+    McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else
+    McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+void McHorVer20_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
   if (iWidth == 16)
-	   McHorVer20WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 8)
-				McHorVer20WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 4)
-				McHorVer20WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
-void McHorVer02_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																				int32_t iWidth, int32_t iHeight)
-{
-		if (iWidth == 16)
-				McHorVer02WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 8)
-				McHorVer02WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 4)
-				McHorVer02WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+void McHorVer02_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  if (iWidth == 16)
+    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
-void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																				int32_t iWidth, int32_t iHeight)
-{
-		if (iWidth == 16)
-    McHorVer22WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 8)
-				McHorVer22WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 4)
-				McHorVer22WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+void McHorVer22_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  if (iWidth == 16)
+    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
 
-void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
+void McHorVer01_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
   if (iWidth == 16)
-				McHorVer01WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer01WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else if (iWidth == 8)
-				McHorVer01WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer01WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else if (iWidth == 4)
-				McHorVer01WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer01WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
-void McHorVer03_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
+void McHorVer03_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
   if (iWidth == 16)
-				McHorVer03WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 8)
-				McHorVer03WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 4)
-				McHorVer03WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer03WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer03WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McHorVer03WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
-void McHorVer10_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
+void McHorVer10_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
   if (iWidth == 16)
-				McHorVer10WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 8)
-				McHorVer10WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 4)
-				McHorVer10WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer10WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer10WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McHorVer10WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
-void McHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-  if (iWidth == 16)
-		{
-				McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 8)
-		{
-				McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+void McHorVer11_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+  } else if (iWidth == 4) {
+    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
   }
-		else if (iWidth == 4)
-		{
-				McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
 }
-void McHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
+void McHorVer12_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+  } else if (iWidth == 4) {
+    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+  }
+}
+void McHorVer13_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+  } else if (iWidth == 4) {
+    McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+  }
+}
+void McHorVer21_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+  } else if (iWidth == 4) {
+    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+  }
+}
+void McHorVer23_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+  } else if (iWidth == 4) {
+    McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+  }
+}
+void McHorVer30_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
   if (iWidth == 16)
-		{
-				McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+    McHorVer30WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer30WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McHorVer30WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+void McHorVer31_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+  } else if (iWidth == 4) {
+    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
   }
-		else if (iWidth == 8)
-		{
-				McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+}
+void McHorVer32_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+  } else if (iWidth == 4) {
+    McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
   }
-		else if (iWidth == 4)
-		{
-				McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+}
+void McHorVer33_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+  } else if (iWidth == 4) {
+    McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
   }
 }
-void McHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
+
+void McLuma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  static PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
+    {McCopy_neon,  McHorVer01_neon, McHorVer02_neon,    McHorVer03_neon},
+    {McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon},
+    {McHorVer20_neon,    McHorVer21_neon, McHorVer22_neon,    McHorVer23_neon},
+    {McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon},
+  };
+  //	pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
+  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+void McChroma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  if (0 == iMvX && 0 == iMvY) {
+    if (8 == iWidth)
+      McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    else if (iWidth == 4)
+      McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    else //here iWidth == 2
+      McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  } else {
+    const int32_t kiD8x = iMvX & 0x07;
+    const int32_t kiD8y = iMvY & 0x07;
+    if (8 == iWidth)
+      McChromaWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
+    else if (4 == iWidth)
+      McChromaWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
+    else //here iWidth == 2
+      McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+  }
+}
+#endif
+#if defined(HAVE_NEON_AARCH64)
+void McCopy_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                          int32_t iWidth, int32_t iHeight) {
+  if (16 == iWidth)
+    McCopyWidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (8 == iWidth)
+    McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else
+    McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+void McHorVer20_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
   if (iWidth == 16)
-  {
-				McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 8)
-		{
-				McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 4)
-		{
-				McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
+    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
-void McHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
+void McHorVer02_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
   if (iWidth == 16)
-		{
-				McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-		}
-		else if (iWidth == 8)
-		{
-				McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-		}
-		else if (iWidth == 4)
-		{
-				McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-		}
+    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
-void McHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
+void McHorVer22_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
   if (iWidth == 16)
-		{
-				McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-		}
+    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else if (iWidth == 8)
-		{
-				McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-		}
-		else if (iWidth == 4)
-		{
-				McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-		}
+    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
-void McHorVer30_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
+
+void McHorVer01_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
   if (iWidth == 16)
-				McHorVer30WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 8)
-				McHorVer30WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-		else if (iWidth == 4)
-				McHorVer30WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer01WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer01WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McHorVer01WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
-void McHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-  if (iWidth == 16) {
-				McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 8){
-				McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 4)
-		{
-				McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
+void McHorVer03_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  if (iWidth == 16)
+    McHorVer03WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer03WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McHorVer03WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
-void McHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
+void McHorVer10_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
   if (iWidth == 16)
-		{
-				McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-		}
-		else if (iWidth == 8)
-		{
-				McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-		}
-		else if (iWidth == 4)
-		{
-				McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-		}
+    McHorVer10WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer10WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McHorVer10WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
-void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-																							int32_t iWidth, int32_t iHeight)
-{
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
-  ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
+void McHorVer11_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else if (iWidth == 4) {
+    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  }
+}
+void McHorVer12_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  } else if (iWidth == 4) {
+    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  }
+}
+void McHorVer13_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else if (iWidth == 4) {
+    McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  }
+}
+void McHorVer21_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) { // entry [2][1] of the [x][y] table in McLuma_AArch64_neon
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); // scratch for the McHorVer20 output; macro defined elsewhere, presumably an aligned 256-byte buffer
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); // scratch for the McHorVer22 output
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); // both kernels read from pSrc itself; scratch stride is 16
+    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); // combines both scratch blocks into pDst
+  } else if (iWidth == 8) { // same three steps with the 8-wide kernels
+    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  } else if (iWidth == 4) { // same three steps with the 4-wide kernels
+    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  } // any other iWidth: pDst is left untouched
+}
+void McHorVer23_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) { // entry [2][3] of the [x][y] table in McLuma_AArch64_neon
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); // scratch for the McHorVer20 output; macro defined elsewhere, presumably an aligned 256-byte buffer
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); // scratch for the McHorVer22 output
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); // source starts one row below pSrc; scratch stride is 16
+    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); // combines both scratch blocks into pDst
+  } else if (iWidth == 8) { // same three steps with the 8-wide kernels
+    McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  } else if (iWidth == 4) { // same three steps with the 4-wide kernels
+    McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  } // any other iWidth: pDst is left untouched
+}
+void McHorVer30_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) { // entry [3][0] of the [x][y] table; forwards to a width-specific McHorVer30 kernel
   if (iWidth == 16)
-		{
-				McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 8)
-		{
-				McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
-		else if (iWidth == 4)
-		{
-				McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-				McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
-				PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-		}
+    McHorVer30WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); // writes pDst directly, no scratch buffer
+  else if (iWidth == 8)
+    McHorVer30WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McHorVer30WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); // any other iWidth falls through and writes nothing
 }
+void McHorVer31_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) { // entry [3][1] of the [x][y] table in McLuma_AArch64_neon
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); // scratch for the McHorVer20 output; macro defined elsewhere, presumably an aligned 256-byte buffer
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); // scratch for the McHorVer02 output
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); // source starts one column right of pSrc
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); // combines both scratch blocks into pDst
+  } else if (iWidth == 8) { // same three steps with the 8-wide kernels
+    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else if (iWidth == 4) { // same three steps with the 4-wide kernels
+    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } // any other iWidth: pDst is left untouched
+}
+void McHorVer32_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) { // entry [3][2] of the [x][y] table in McLuma_AArch64_neon
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); // scratch for the McHorVer02 output; macro defined elsewhere, presumably an aligned 256-byte buffer
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); // scratch for the McHorVer22 output
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); // source starts one column right of pSrc
+    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); // combines both scratch blocks into pDst
+  } else if (iWidth == 8) { // same three steps with the 8-wide kernels
+    McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  } else if (iWidth == 4) { // same three steps with the 4-wide kernels
+    McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  } // any other iWidth: pDst is left untouched
+}
+void McHorVer33_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) { // entry [3][3] of the [x][y] table in McLuma_AArch64_neon
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); // scratch for the McHorVer20 output; macro defined elsewhere, presumably an aligned 256-byte buffer
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); // scratch for the McHorVer02 output
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); // source starts one row below pSrc
+    McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); // source starts one column right of pSrc
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); // combines both scratch blocks into pDst
+  } else if (iWidth == 8) { // same three steps with the 8-wide kernels
+    McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else if (iWidth == 4) { // same three steps with the 4-wide kernels
+    McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } // any other iWidth: pDst is left untouched
+}
 
-void McLuma_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-											int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-{
-  static PWelsMcWidthHeightFunc pWelsMcFunc[4][4] =  //[x][y]
-  {
-				{McCopy_neon,  McHorVer01_neon, McHorVer02_neon,    McHorVer03_neon},
-				{McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon},
-				{McHorVer20_neon,    McHorVer21_neon, McHorVer22_neon,    McHorVer23_neon},
-				{McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon},
-		};
+void McLuma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                          int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { // luma MC entry point: picks a kernel from the MV fraction
+  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]; read-only dispatch table, const like the one in McLuma_c
+    {McCopy_AArch64_neon,  McHorVer01_AArch64_neon, McHorVer02_AArch64_neon,    McHorVer03_AArch64_neon},
+    {McHorVer10_AArch64_neon, McHorVer11_AArch64_neon, McHorVer12_AArch64_neon, McHorVer13_AArch64_neon},
+    {McHorVer20_AArch64_neon,    McHorVer21_AArch64_neon, McHorVer22_AArch64_neon,    McHorVer23_AArch64_neon},
+    {McHorVer30_AArch64_neon, McHorVer31_AArch64_neon, McHorVer32_AArch64_neon, McHorVer33_AArch64_neon},
+  };
   //	pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
-  pWelsMcFunc[iMvX&0x03][iMvY&0x03](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); // only the low two bits of each MV component are used
 }
-void McChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
-												int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-{
-  if (0 == iMvX && 0 == iMvY)
-		{
-				if(8 == iWidth)
-				  McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-				else if(iWidth == 4)
-				  McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
-				else //here iWidth == 2
-				  McCopyWidthEq2_c(pSrc,iSrcStride,pDst,iDstStride,iHeight);
-		}
-		else
-		{
-				const int32_t kiD8x = iMvX & 0x07;
-				const int32_t kiD8y = iMvY & 0x07;
-				if(8 == iWidth)
-				  McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
-				else if(4 == iWidth)
-				  McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
-				else //here iWidth == 2
-				  McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
-		}
+void McChroma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                            int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { // chroma MC entry point: copy or weighted interpolation
+  const int32_t kiD8x = iMvX & 0x07; // low three bits of the MV in x select the weight column
+  const int32_t kiD8y = iMvY & 0x07; // low three bits of the MV in y select the weight row
+  if (0 == kiD8x && 0 == kiD8y) { // no fractional part (same test as McChroma_c), so any integer MV takes the copy path, not just a zero MV
+    if (8 == iWidth)
+      McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    else if (iWidth == 4)
+      McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    else //here iWidth == 2
+      McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  } else { // fractional MV: interpolate with the g_kuiABCD[dy][dx] weights
+    if (8 == iWidth)
+      McChromaWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
+    else if (4 == iWidth)
+      McChromaWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
+    else //here iWidth == 2, no NEON kernel is used for this width
+      McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+  }
 }
 #endif
+
 void InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu) {
   pMcFunc->pMcLumaFunc   = McLuma_c;
   pMcFunc->pMcChromaFunc = McChroma_c;
 
 #ifdef	HAVE_NEON
-  if ( iCpu & WELS_CPU_NEON ) {
-	   pMcFunc->pMcLumaFunc	  = McLuma_neon;
-	   pMcFunc->pMcChromaFunc  = McChroma_neon;
-		}
+  if (iCpu & WELS_CPU_NEON) { // replaces the C defaults assigned above when iCpu carries the NEON flag
+    pMcFunc->pMcLumaFunc	  = McLuma_neon;
+    pMcFunc->pMcChromaFunc  = McChroma_neon;
+  }
 #endif
-
+#ifdef	HAVE_NEON_AARCH64
+  if (iCpu & WELS_CPU_NEON) { // same CPU flag as the HAVE_NEON branch, but installs the *_AArch64_neon entry points
+    pMcFunc->pMcLumaFunc	  = McLuma_AArch64_neon;
+    pMcFunc->pMcChromaFunc  = McChroma_AArch64_neon;
+  }
+#endif
 #if defined (X86_ASM)
   if (iCpu & WELS_CPU_SSE2) {
-  pMcFunc->pMcLumaFunc   = McLuma_sse2;
-  pMcFunc->pMcChromaFunc = McChroma_sse2;
+    pMcFunc->pMcLumaFunc   = McLuma_sse2; // later assignments win: each enabled branch overrides the previous choice
+    pMcFunc->pMcChromaFunc = McChroma_sse2;
   }
 #endif //(X86_ASM)
 }
--- /dev/null
+++ b/codec/decoder/core/src/mc.cpp.orig
@@ -1,0 +1,1305 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	mc.cpp
+ *
+ * \brief	Interfaces implementation for motion compensation
+ *
+ * \date	03/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "mc.h"
+
+#include "cpu_core.h"
+
+namespace WelsDec {
+
+/*------------------weight for chroma fraction pixel interpolation------------------*/
+//iA = (8 - dx) * (8 - dy);
+//iB = dx * (8 - dy);
+//iC = (8 - dx) * dy;
+//iD = dx * dy
+static const uint8_t g_kuiABCD[8][8][4] = {	//indexed [dy][dx]; each entry is {iA, iB, iC, iD} and the four weights sum to 64
+  { // dy = 0
+    {64, 0, 0, 0}, {56, 8, 0, 0}, {48, 16, 0, 0}, {40, 24, 0, 0},
+    {32, 32, 0, 0}, {24, 40, 0, 0}, {16, 48, 0, 0}, {8, 56, 0, 0}
+  },
+  { // dy = 1
+    {56, 0, 8, 0}, {49, 7, 7, 1}, {42, 14, 6, 2}, {35, 21, 5, 3},
+    {28, 28, 4, 4}, {21, 35, 3, 5}, {14, 42, 2, 6}, {7, 49, 1, 7}
+  },
+  { // dy = 2
+    {48, 0, 16, 0}, {42, 6, 14, 2}, {36, 12, 12, 4}, {30, 18, 10, 6},
+    {24, 24, 8, 8}, {18, 30, 6, 10}, {12, 36, 4, 12}, {6, 42, 2, 14}
+  },
+  { // dy = 3
+    {40, 0, 24, 0}, {35, 5, 21, 3}, {30, 10, 18, 6}, {25, 15, 15, 9},
+    {20, 20, 12, 12}, {15, 25, 9, 15}, {10, 30, 6, 18}, {5, 35, 3, 21}
+  },
+  { // dy = 4
+    {32, 0, 32, 0}, {28, 4, 28, 4}, {24, 8, 24, 8}, {20, 12, 20, 12},
+    {16, 16, 16, 16}, {12, 20, 12, 20}, {8, 24, 8, 24}, {4, 28, 4, 28}
+  },
+  { // dy = 5
+    {24, 0, 40, 0}, {21, 3, 35, 5}, {18, 6, 30, 10}, {15, 9, 25, 15},
+    {12, 12, 20, 20}, {9, 15, 15, 25}, {6, 18, 10, 30}, {3, 21, 5, 35}
+  },
+  { // dy = 6
+    {16, 0, 48, 0}, {14, 2, 42, 6}, {12, 4, 36, 12}, {10, 6, 30, 18},
+    {8, 8, 24, 24}, {6, 10, 18, 30}, {4, 12, 12, 36}, {2, 14, 6, 42}
+  },
+  { // dy = 7
+    {8, 0, 56, 0}, {7, 1, 49, 7}, {6, 2, 42, 14}, {5, 3, 35, 21},
+    {4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49}
+  }
+};
+
+typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iWidth, int32_t iHeight); // signature shared by the luma kernels stored in the pWelsMcFunc[4][4] dispatch tables
+
+//***************************************************************************//
+//                          C code implementation                            //
+//***************************************************************************//
+static inline void McCopyWidthEq2_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight) { // copies iHeight rows of a 2-pixel-wide block from pSrc to pDst
+  int32_t i;
+  for (i = 0; i < iHeight; i++) { // iWidth == 2 only for chroma
+    ST16A2 (pDst, LD16 (pSrc)); // one row per iteration; LD16/ST16A2 are macros defined elsewhere, presumably a 16-bit load and store
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
+}
+
+static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight) { // copies iHeight rows of a 4-pixel-wide block from pSrc to pDst
+  int32_t i;
+  for (i = 0; i < iHeight; i++) {
+    ST32A4 (pDst, LD32 (pSrc)); // one row per iteration; LD32/ST32A4 are macros defined elsewhere, presumably a 32-bit load and store
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
+}
+
+static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight) { // copies iHeight rows of an 8-pixel-wide block from pSrc to pDst
+  int32_t i;
+  for (i = 0; i < iHeight; i++) {
+    ST64A8 (pDst, LD64 (pSrc)); // one row per iteration; LD64/ST64A8 are macros defined elsewhere, presumably a 64-bit load and store
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
+}
+
+static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) { // copies iHeight rows of a 16-pixel-wide block from pSrc to pDst
+  int32_t i;
+  for (i = 0; i < iHeight; i++) {
+    ST64A8 (pDst  , LD64 (pSrc)); // left 8 bytes of the row
+    ST64A8 (pDst + 8, LD64 (pSrc + 8)); // right 8 bytes of the row
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
+}
+
+//--------------------Luma sample MC------------------//
+
+static inline int32_t HorFilterInput16bit_c (int16_t* pSrc) { // 6-tap sum (1, -5, 20, 20, -5, 1) over pSrc[-2..3]; no rounding or shift here
+  int32_t iPix05 = pSrc[-2] + pSrc[3]; // outer pair, weight 1
+  int32_t iPix14 = pSrc[-1] + pSrc[2]; // middle pair, weight -5
+  int32_t iPix23 = pSrc[ 0] + pSrc[1]; // inner pair, weight 20
+
+  return (iPix05 - (iPix14 * 5)+ (iPix23 * 20));
+}
+// h: kiOffset=1 (taps along a row) / v: kiOffset=iSrcStride (taps along a column); same 6-tap sum (1, -5, 20, 20, -5, 1), unscaled
+static inline int32_t FilterInput8bitWithStride_c (const uint8_t* pSrc, const int32_t kiOffset) {
+  const int32_t kiOffset1 = kiOffset;
+  const int32_t kiOffset2 = (kiOffset << 1);
+  const int32_t kiOffset3 = kiOffset + kiOffset2; // 3 * kiOffset
+  const uint32_t kuiPix05   = * (pSrc - kiOffset2) + * (pSrc + kiOffset3); // samples at -2 and +3, weight 1
+  const uint32_t kuiPix14   = * (pSrc - kiOffset1) + * (pSrc + kiOffset2); // samples at -1 and +2, weight -5
+  const uint32_t kuiPix23   = * (pSrc) + * (pSrc + kiOffset1); // samples at 0 and +1, weight 20
+
+  return (kuiPix05 - ((kuiPix14 << 2) + kuiPix14) + (kuiPix23 << 4) + (kuiPix23 << 2)); // x*5 and x*20 via shifts; NOTE(review): computed in uint32_t, so a negative sum relies on wraparound plus the conversion to int32_t
+}
+
+static inline void PixelAvg_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                                 const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { // pDst = per-pixel rounded average of blocks A and B
+  int32_t i, j;
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth; j++) {
+      pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1; // +1 rounds halves up
+    }
+    pDst  += iDstStride; // each of the three buffers advances by its own stride
+    pSrcA += iSrcAStride;
+    pSrcB += iSrcBStride;
+  }
+}
+static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                               int32_t iHeight) { // block copy: dispatches on iWidth to the fixed-width copy helpers
+  if (iWidth == 16)
+    McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McCopyWidthEq8_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else //here iWidth == 2 is assumed; any width other than 16/8/4 lands here
+    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+
+static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // horizontal 6-tap filter of every pixel in the iWidth x iHeight block
+  int32_t i, j;
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth; j++) {
+      pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, 1) + 16) >> 5); // offset 1 = taps along the row; +16 then >>5 rounds the sum before the clip
+    }
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
+}
+
+static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // vertical 6-tap filter of every pixel in the iWidth x iHeight block
+  int32_t i, j;
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth; j++) {
+      pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, iSrcStride) + 16) >> 5); // offset iSrcStride = taps along the column; reads rows -2..+3
+    }
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
+}
+
+static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // two-pass filter: vertical 6-tap into iTmp, then horizontal 6-tap over iTmp
+  int16_t iTmp[16 + 5]; //16 output columns plus 5 extra taps, so iWidth must not exceed 16
+  int32_t i, j, k;
+
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth + 5; j++) {
+      iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride); // unscaled vertical sums for columns -2 .. iWidth+2
+    }
+    for (k = 0; k < iWidth; k++) {
+      pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[2 + k]) + 512) >> 10); // iTmp[2 + k] is column k; +512 then >>10 rounds the doubly filtered sum
+    }
+    pSrc += iSrcStride;
+    pDst += iDstStride;
+  }
+}
+
+/////////////////////luma MC//////////////////////////
+static inline void McHorVer01_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // entry [0][1] of the McLuma_c table: average of pSrc and the vertical filter result
+  uint8_t uiTmp[256]; // filter output, written with stride 16
+  McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
+}
+static inline void McHorVer03_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // entry [0][3] of the McLuma_c table: like McHorVer01_c but averaged with the row below
+  uint8_t uiTmp[256]; // filter output, written with stride 16
+  McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, uiTmp, 16, iWidth, iHeight); // pSrc + iSrcStride = source shifted down one row
+}
+static inline void McHorVer10_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // entry [1][0] of the McLuma_c table: average of pSrc and the horizontal filter result
+  uint8_t uiTmp[256]; // filter output, written with stride 16
+  McHorVer20_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
+}
+static inline void McHorVer11_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // entry [1][1] of the McLuma_c table: average of the horizontal and vertical filter results
+  uint8_t uiHorTmp[256]; // both scratch blocks use stride 16
+  uint8_t uiVerTmp[256];
+  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
+  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
+}
+static inline void McHorVer12_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // entry [1][2] of the McLuma_c table: average of the vertical and two-pass (McHorVer22_c) results
+  uint8_t uiVerTmp[256]; // both scratch blocks use stride 16
+  uint8_t uiCtrTmp[256];
+  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
+  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
+}
+static inline void McHorVer13_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // entry [1][3] of the McLuma_c table: like McHorVer11_c but the horizontal pass is one row lower
+  uint8_t uiHorTmp[256]; // both scratch blocks use stride 16
+  uint8_t uiVerTmp[256];
+  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight); // source shifted down one row
+  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
+}
+static inline void McHorVer21_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // entry [2][1] of the McLuma_c table: average of the horizontal and two-pass (McHorVer22_c) results
+  uint8_t uiHorTmp[256]; // both scratch blocks use stride 16
+  uint8_t uiCtrTmp[256];
+  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
+  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
+}
+static inline void McHorVer23_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // entry [2][3] of the McLuma_c table: like McHorVer21_c but the horizontal pass is one row lower
+  uint8_t uiHorTmp[256]; // both scratch blocks use stride 16
+  uint8_t uiCtrTmp[256];
+  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight); // source shifted down one row
+  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
+}
+static inline void McHorVer30_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // entry [3][0] of the McLuma_c table: like McHorVer10_c but averaged with the pixel to the right
+  uint8_t uiHorTmp[256]; // filter output, written with stride 16
+  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, pSrc + 1, iSrcStride, uiHorTmp, 16, iWidth, iHeight); // pSrc + 1 = source shifted right one column
+}
+static inline void McHorVer31_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // entry [3][1] of the McLuma_c table: like McHorVer11_c but the vertical pass is one column to the right
+  uint8_t uiHorTmp[256]; // both scratch blocks use stride 16
+  uint8_t uiVerTmp[256];
+  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
+  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight); // source shifted right one column
+  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
+}
+static inline void McHorVer32_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // entry [3][2] of the McLuma_c table: like McHorVer12_c but the vertical pass is one column to the right
+  uint8_t uiVerTmp[256]; // both scratch blocks use stride 16
+  uint8_t uiCtrTmp[256];
+  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight); // source shifted right one column
+  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
+}
+static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                   int32_t iHeight) { // entry [3][3] of the McLuma_c table: horizontal pass one row lower, vertical pass one column right
+  uint8_t uiHorTmp[256]; // both scratch blocks use stride 16
+  uint8_t uiVerTmp[256];
+  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight); // source shifted down one row
+  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight); // source shifted right one column
+  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
+}
+
+void McLuma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
+//The MV offset has already been added to pSrc by the caller; only the low two bits of iMvX/iMvY are used here.
+{
+  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y], indexed by the low two bits of each MV component
+    {McCopy_c,      McHorVer01_c, McHorVer02_c, McHorVer03_c},
+    {McHorVer10_c,  McHorVer11_c, McHorVer12_c, McHorVer13_c},
+    {McHorVer20_c,  McHorVer21_c, McHorVer22_c, McHorVer23_c},
+    {McHorVer30_c,  McHorVer31_c, McHorVer32_c, McHorVer33_c},
+  };
+
+  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); // [0][0] is the plain copy
+}
+
+static inline void McChromaWithFragMv_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { // weighted 2x2 (bilinear) chroma interpolation using the g_kuiABCD weights
+  int32_t i, j;
+  int32_t iA, iB, iC, iD;
+  const uint8_t* pSrcNext = pSrc + iSrcStride; // row below the current source row
+  const uint8_t *pABCD = g_kuiABCD[iMvY & 0x07][iMvX & 0x07]; // table is indexed [dy][dx] by the low three bits of the MV
+  iA = pABCD[0]; // weight of pSrc[j]
+  iB = pABCD[1]; // weight of pSrc[j + 1]
+  iC = pABCD[2]; // weight of pSrcNext[j]
+  iD = pABCD[3]; // weight of pSrcNext[j + 1]
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth; j++) {
+      pDst[j] = (iA * pSrc[j] + iB * pSrc[j + 1] + iC * pSrcNext[j] + iD * pSrcNext[j + 1] + 32) >> 6; // weights sum to 64, so +32 then >>6 rounds; reads one extra column and row
+    }
+    pDst     += iDstStride;
+    pSrc      = pSrcNext; // the next row becomes the current row
+    pSrcNext += iSrcStride;
+  }
+}
+
+void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
+//The MV offset has already been added to pSrc by the caller; only the low three bits of iMvX/iMvY are used here.
+{
+  const int32_t kiD8x = iMvX & 0x07; // low three bits of the MV in x
+  const int32_t kiD8y = iMvY & 0x07; // low three bits of the MV in y
+  if (0 == kiD8x && 0 == kiD8y) // both fractions zero: plain block copy
+    McCopy_c (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  else // otherwise interpolate with the weights for this fraction
+    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+}
+
+#if defined(X86_ASM)
+//***************************************************************************//
+//                            SSE2 implementation                            //
+//***************************************************************************//
+static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { // 8-wide McHorVer22 done as two assembly passes through the iTap scratch array
+  ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16) // macro defined elsewhere; presumably a 16-byte-aligned int16_t[21][8] scratch array
+  McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5); // first pass: iHeight + 5 rows into iTap, 16-byte row stride
+  McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight); // second pass: iTap to pDst, width 8
+}
+
+static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { // 16-wide McHorVer02 built from two calls to the 8-wide kernel
+  McHorVer02WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight); // left 8 columns
+  McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); // right 8 columns
+}
+
+static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { // 16-wide McHorVer22 built from two calls to the 8-wide wrapper
+  McHorVer22WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight); // left 8 columns
+  McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); // right 8 columns
+}
+
+static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                  int32_t iHeight) { // block copy: dispatches on iWidth to the sse2/mmx copy kernels
+  if (iWidth == 16)
+    McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McCopyWidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else // any other width uses the 2-wide C copy
+    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+
+static inline void McHorVer20_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // McHorVer20: dispatches on iWidth to the assembly kernels
+  if (iWidth == 16)
+    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else // any width other than 16/8 is handled as width 4
+    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+
+static inline void McHorVer02_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // McHorVer02: assembly for widths 16/8, C fallback otherwise
+  if (iWidth == 16)
+    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else // any width other than 16/8 runs the C filter with width hard-coded to 4
+    McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
+}
+
+static inline void McHorVer22_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // McHorVer22: assembly for widths 16/8, C fallback otherwise
+  if (iWidth == 16)
+    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else // any width other than 16/8 runs the C filter with width hard-coded to 4
+    McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
+}
+
+static inline void McHorVer01_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // same composition as McHorVer01_c: McHorVer02 result averaged with pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); // filter output, stride 16; macro defined elsewhere, presumably an aligned 256-byte buffer
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  } else { // any other width is treated as 4; the filter step uses the C routine
+    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  }
+}
+static inline void McHorVer03_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // same composition as McHorVer03_c: McHorVer02 result averaged with the row below pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); // filter output, stride 16; macro defined elsewhere, presumably an aligned 256-byte buffer
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // pSrc + iSrcStride = source shifted down one row
+  } else if (iWidth == 8) {
+    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  } else { // any other width is treated as 4; the filter step uses the C routine
+    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  }
+}
+static inline void McHorVer10_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // same composition as McHorVer10_c: McHorVer20 result averaged with pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); // filter output, stride 16; macro defined elsewhere, presumably an aligned 256-byte buffer
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  } else { // any other width is treated as 4
+    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  }
+}
+static inline void McHorVer11_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) { // same composition as McHorVer11_c: McHorVer20 and McHorVer02 results averaged
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); // McHorVer20 output, stride 16; macro defined elsewhere, presumably an aligned 256-byte buffer
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); // McHorVer02 output, stride 16
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else { // any other width is treated as 4; the McHorVer02 step uses the C routine
+    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  }
+}
+// Handler stored in slot [x=1][y=2] of the McLuma_sse2 dispatch table.
+// Produces the McHorVer02 and McHorVer22 outputs of pSrc in two scratch
+// buffers and writes their average to pDst.
+static inline void McHorVer12_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiBufStride = 16; // row pitch used for both scratch buffers
+
+  switch (iWidth) {
+  case 16:
+    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+    break;
+  case 8:
+    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+    break;
+  default: // every other width takes the 4-pixel path (C filters + MMX average)
+    McHorVer02_c (pSrc, iSrcStride, pVerTmp, kiBufStride, 4, iHeight);
+    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, kiBufStride, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+    break;
+  }
+}
+// Handler stored in slot [x=1][y=3] of the McLuma_sse2 dispatch table.
+// Averages the McHorVer20 output taken one stride past pSrc with the
+// McHorVer02 output taken at pSrc, and writes the result to pDst.
+static inline void McHorVer13_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiBufStride = 16;                 // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextRow = pSrc + iSrcStride; // input of the McHorVer20 pass
+
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_sse2 (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_sse2 (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+    break;
+  default: // every other width takes the 4-pixel path
+    McHorVer20WidthEq4_mmx (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02_c (pSrc, iSrcStride, pVerTmp, kiBufStride, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+    break;
+  }
+}
+// Handler stored in slot [x=2][y=1] of the McLuma_sse2 dispatch table.
+// Produces the McHorVer20 and McHorVer22 outputs of pSrc in two scratch
+// buffers and writes their average to pDst.
+static inline void McHorVer21_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiBufStride = 16; // row pitch used for both scratch buffers
+
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+    break;
+  default: // every other width takes the 4-pixel path
+    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, kiBufStride, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+    break;
+  }
+}
+// Handler stored in slot [x=2][y=3] of the McLuma_sse2 dispatch table.
+// Averages the McHorVer20 output taken one stride past pSrc with the
+// McHorVer22 output taken at pSrc, and writes the result to pDst.
+static inline void McHorVer23_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiBufStride = 16;                 // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextRow = pSrc + iSrcStride; // input of the McHorVer20 pass
+
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_sse2 (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_sse2 (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+    break;
+  default: // every other width takes the 4-pixel path
+    McHorVer20WidthEq4_mmx (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, kiBufStride, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+    break;
+  }
+}
+// Handler stored in slot [x=3][y=0] of the McLuma_sse2 dispatch table.
+// Runs the McHorVer20 filter into a scratch buffer, then averages that
+// buffer with the source advanced by one byte and writes the result to pDst.
+static inline void McHorVer30_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  const int32_t kiBufStride = 16;       // row pitch used for the scratch buffer
+  const uint8_t* pSrcNextCol = pSrc + 1; // second averaging input
+
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrcNextCol, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrcNextCol, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    break;
+  default: // every other width takes the 4-pixel MMX path
+    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrcNextCol, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    break;
+  }
+}
+// Handler stored in slot [x=3][y=1] of the McLuma_sse2 dispatch table.
+// Averages the McHorVer20 output taken at pSrc with the McHorVer02 output
+// taken one byte past pSrc, and writes the result to pDst.
+static inline void McHorVer31_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiBufStride = 16;        // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextCol = pSrc + 1; // input of the McHorVer02 pass
+
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq16_sse2 (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq8_sse2 (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+    break;
+  default: // every other width takes the 4-pixel path
+    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02_c (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+    break;
+  }
+}
+// Handler stored in slot [x=3][y=2] of the McLuma_sse2 dispatch table.
+// Averages the McHorVer02 output taken one byte past pSrc with the
+// McHorVer22 output taken at pSrc, and writes the result to pDst.
+static inline void McHorVer32_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiBufStride = 16;        // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextCol = pSrc + 1; // input of the McHorVer02 pass
+
+  switch (iWidth) {
+  case 16:
+    McHorVer02WidthEq16_sse2 (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+    break;
+  case 8:
+    McHorVer02WidthEq8_sse2 (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+    break;
+  default: // every other width takes the 4-pixel path (C filters + MMX average)
+    McHorVer02_c (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, 4, iHeight);
+    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, kiBufStride, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+    break;
+  }
+}
+// Handler stored in slot [x=3][y=3] of the McLuma_sse2 dispatch table.
+// Averages the McHorVer20 output taken one stride past pSrc with the
+// McHorVer02 output taken one byte past pSrc, and writes the result to pDst.
+static inline void McHorVer33_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiBufStride = 16;                 // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextRow = pSrc + iSrcStride; // input of the McHorVer20 pass
+  const uint8_t* pSrcNextCol = pSrc + 1;          // input of the McHorVer02 pass
+
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_sse2 (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq16_sse2 (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_sse2 (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq8_sse2 (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+    break;
+  default: // every other width takes the 4-pixel path
+    McHorVer20WidthEq4_mmx (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02_c (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, 4, iHeight);
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+    break;
+  }
+}
+
+// Luma motion-compensation entry point for the SSE2 path. Picks a handler
+// from the two low bits of each motion-vector component and forwards the
+// block to it. pSrc is expected to already include the motion-vector offset.
+void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  // First index is iMvX & 3, second index is iMvY & 3.
+  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = {
+    {McCopy_sse2,     McHorVer01_sse2, McHorVer02_sse2, McHorVer03_sse2},
+    {McHorVer10_sse2, McHorVer11_sse2, McHorVer12_sse2, McHorVer13_sse2},
+    {McHorVer20_sse2, McHorVer21_sse2, McHorVer22_sse2, McHorVer23_sse2},
+    {McHorVer30_sse2, McHorVer31_sse2, McHorVer32_sse2, McHorVer33_sse2},
+  };
+  const int32_t kiIdxX = iMvX & 0x03;
+  const int32_t kiIdxY = iMvY & 0x03;
+  const PWelsMcWidthHeightFunc pfnHandler = pWelsMcFunc[kiIdxX][kiIdxY];
+
+  pfnHandler (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+
+// Chroma motion-compensation entry point for the SSE2 path.
+// - Low three bits of both MV components zero: plain block copy.
+// - Width 2: C fallback McChromaWithFragMv_c.
+// - Otherwise: width-specific kernel, fed the g_kuiABCD entry selected by the
+//   low three bits of iMvY and iMvX.
+void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
+    McChromaWidthEq4_mmx,
+    McChromaWidthEq8_sse2
+  };
+  const int32_t kiD8x = iMvX & 0x07;
+  const int32_t kiD8y = iMvY & 0x07;
+
+  if ((kiD8x | kiD8y) == 0) {
+    // Both masked values are non-negative, so the OR is zero only when both are.
+    McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  } else if (iWidth == 2) {
+    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+  } else {
+    // iWidth >> 3 selects slot 0 for width 4 and slot 1 for width 8.
+    kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
+  }
+}
+
+#endif //X86_ASM
+//***************************************************************************//
+//                       NEON implementation                      //
+//***************************************************************************//
+#if defined(HAVE_NEON)
+// Block copy for the NEON path: forwards to the width-specific copy routine.
+// Widths other than 16 and 8 all go to the 4-pixel routine.
+void McCopy_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                  int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16:
+    McCopyWidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 8:
+    McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  default:
+    McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  }
+}
+// Slot [x=2][y=0] of the McLuma_neon dispatch table: forwards to the
+// McHorVer20 NEON routine for width 16, 8 or 4. Other widths do nothing.
+void McHorVer20_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+// Slot [x=0][y=2] of the McLuma_neon dispatch table: forwards to the
+// McHorVer02 NEON routine for width 16, 8 or 4. Other widths do nothing.
+void McHorVer02_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16:
+    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 8:
+    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 4:
+    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+// Slot [x=2][y=2] of the McLuma_neon dispatch table: forwards to the
+// McHorVer22 NEON routine for width 16, 8 or 4. Other widths do nothing.
+void McHorVer22_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16:
+    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 8:
+    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 4:
+    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+
+// Slot [x=0][y=1] of the McLuma_neon dispatch table: forwards to the
+// McHorVer01 NEON routine for width 16, 8 or 4. Other widths do nothing.
+void McHorVer01_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16:
+    McHorVer01WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 8:
+    McHorVer01WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 4:
+    McHorVer01WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+// Slot [x=0][y=3] of the McLuma_neon dispatch table: forwards to the
+// McHorVer03 NEON routine for width 16, 8 or 4. Other widths do nothing.
+void McHorVer03_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16:
+    McHorVer03WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 8:
+    McHorVer03WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 4:
+    McHorVer03WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+// Slot [x=1][y=0] of the McLuma_neon dispatch table: forwards to the
+// McHorVer10 NEON routine for width 16, 8 or 4. Other widths do nothing.
+void McHorVer10_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16:
+    McHorVer10WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 8:
+    McHorVer10WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 4:
+    McHorVer10WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+// Slot [x=1][y=1] of the McLuma_neon dispatch table. Produces the McHorVer20
+// and McHorVer02 outputs of pSrc in two scratch buffers and writes their
+// average to pDst. Widths other than 16, 8 and 4 do nothing.
+void McHorVer11_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiBufStride = 16; // row pitch used for both scratch buffers
+
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+// Slot [x=1][y=2] of the McLuma_neon dispatch table. Produces the McHorVer02
+// and McHorVer22 outputs of pSrc in two scratch buffers and writes their
+// average to pDst. Widths other than 16, 8 and 4 do nothing.
+void McHorVer12_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiBufStride = 16; // row pitch used for both scratch buffers
+
+  switch (iWidth) {
+  case 16:
+    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+    break;
+  case 8:
+    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+    break;
+  case 4:
+    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+// Slot [x=1][y=3] of the McLuma_neon dispatch table. Averages the McHorVer20
+// output taken one stride past pSrc with the McHorVer02 output taken at pSrc
+// and writes the result to pDst. Widths other than 16, 8 and 4 do nothing.
+void McHorVer13_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiBufStride = 16;                 // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextRow = pSrc + iSrcStride; // input of the McHorVer20 pass
+
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_neon (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_neon (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_neon (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+// Slot [x=2][y=1] of the McLuma_neon dispatch table. Produces the McHorVer20
+// and McHorVer22 outputs of pSrc in two scratch buffers and writes their
+// average to pDst. Widths other than 16, 8 and 4 do nothing.
+void McHorVer21_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiBufStride = 16; // row pitch used for both scratch buffers
+
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+// Slot [x=2][y=3] of the McLuma_neon dispatch table. Averages the McHorVer20
+// output taken one stride past pSrc with the McHorVer22 output taken at pSrc
+// and writes the result to pDst. Widths other than 16, 8 and 4 do nothing.
+void McHorVer23_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiBufStride = 16;                 // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextRow = pSrc + iSrcStride; // input of the McHorVer20 pass
+
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_neon (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_neon (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_neon (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+// Slot [x=3][y=0] of the McLuma_neon dispatch table: forwards to the
+// McHorVer30 NEON routine for width 16, 8 or 4. Other widths do nothing.
+void McHorVer30_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16:
+    McHorVer30WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 8:
+    McHorVer30WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  case 4:
+    McHorVer30WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+// Slot [x=3][y=1] of the McLuma_neon dispatch table. Averages the McHorVer20
+// output taken at pSrc with the McHorVer02 output taken one byte past pSrc
+// and writes the result to pDst. Widths other than 16, 8 and 4 do nothing.
+void McHorVer31_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiBufStride = 16;        // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextCol = pSrc + 1; // input of the McHorVer02 pass
+
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq16_neon (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq8_neon (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq4_neon (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+// Slot [x=3][y=2] of the McLuma_neon dispatch table. Averages the McHorVer02
+// output taken one byte past pSrc with the McHorVer22 output taken at pSrc
+// and writes the result to pDst. Widths other than 16, 8 and 4 do nothing.
+void McHorVer32_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiBufStride = 16;        // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextCol = pSrc + 1; // input of the McHorVer02 pass
+
+  switch (iWidth) {
+  case 16:
+    McHorVer02WidthEq16_neon (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+    break;
+  case 8:
+    McHorVer02WidthEq8_neon (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+    break;
+  case 4:
+    McHorVer02WidthEq4_neon (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+// Slot [x=3][y=3] of the McLuma_neon dispatch table. Averages the McHorVer20
+// output taken one stride past pSrc with the McHorVer02 output taken one byte
+// past pSrc and writes the result to pDst. Widths other than 16, 8 and 4 do
+// nothing.
+void McHorVer33_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiBufStride = 16;                 // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextRow = pSrc + iSrcStride; // input of the McHorVer20 pass
+  const uint8_t* pSrcNextCol = pSrc + 1;          // input of the McHorVer02 pass
+
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_neon (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq16_neon (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_neon (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq8_neon (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_neon (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+    McHorVer02WidthEq4_neon (pSrcNextCol, iSrcStride, pVerTmp, kiBufStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  default:
+    break;
+  }
+}
+
+// Luma motion-compensation entry point for the NEON path. Picks a handler
+// from the two low bits of each motion-vector component and forwards the
+// block to it.
+//
+// pSrc is used as passed in: this function does not apply any offset derived
+// from iMvX >> 2 or iMvY >> 2.
+// NOTE(review): presumably the caller has already applied that offset, as the
+// comment on McLuma_sse2 states for the SSE2 path -- confirm against callers.
+void McLuma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  // The table is never modified, so it is declared const like the SSE2 one.
+  // First index is iMvX & 3, second index is iMvY & 3.
+  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
+    {McCopy_neon,     McHorVer01_neon, McHorVer02_neon, McHorVer03_neon},
+    {McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon},
+    {McHorVer20_neon, McHorVer21_neon, McHorVer22_neon, McHorVer23_neon},
+    {McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon},
+  };
+
+  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+// Chroma motion-compensation entry point for the NEON path.
+// - Both MV components exactly zero: block copy (NEON for width 8 and 4,
+//   C routine for every other width).
+// - Otherwise: NEON kernel for width 8 and 4, fed the g_kuiABCD entry selected
+//   by the low three bits of iMvY and iMvX; McChromaWithFragMv_c for every
+//   other width.
+void McChroma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  const bool kbZeroMv = (0 == iMvX) && (0 == iMvY);
+
+  if (kbZeroMv) {
+    switch (iWidth) {
+    case 8:
+      McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+      break;
+    case 4:
+      McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+      break;
+    default: // the original code expects iWidth == 2 here
+      McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+      break;
+    }
+    return;
+  }
+
+  const int32_t kiD8x = iMvX & 0x07;
+  const int32_t kiD8y = iMvY & 0x07;
+  switch (iWidth) {
+  case 8:
+    McChromaWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
+    break;
+  case 4:
+    McChromaWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
+    break;
+  default: // the original code expects iWidth == 2 here
+    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+    break;
+  }
+}
+#endif
+#if defined(HAVE_NEON_AARCH64)
+// Block copy for the AArch64 NEON path: forwards to the width-specific copy
+// routine. Widths other than 16 and 8 all go to the 4-pixel routine.
+void McCopy_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                          int32_t iWidth, int32_t iHeight) {
+    switch (iWidth) {
+    case 16:
+        McCopyWidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    case 8:
+        McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    default:
+        McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    }
+}
+// Width dispatcher for the AArch64 McHorVer20 kernels: forwards to the
+// routine for width 16, 8 or 4. Other widths do nothing.
+void McHorVer20_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+    switch (iWidth) {
+    case 16:
+        McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    case 8:
+        McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    case 4:
+        McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    default:
+        break;
+    }
+}
+// Width dispatcher for the AArch64 McHorVer02 kernels: forwards to the
+// routine for width 16, 8 or 4. Other widths do nothing.
+void McHorVer02_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+    switch (iWidth) {
+    case 16:
+        McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    case 8:
+        McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    case 4:
+        McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    default:
+        break;
+    }
+}
+// Width dispatcher for the AArch64 McHorVer22 kernels: forwards to the
+// routine for width 16, 8 or 4. Other widths do nothing.
+void McHorVer22_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+    switch (iWidth) {
+    case 16:
+        McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    case 8:
+        McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    case 4:
+        McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    default:
+        break;
+    }
+}
+
+// Width dispatcher for the AArch64 McHorVer01 kernels: forwards to the
+// routine for width 16, 8 or 4. Other widths do nothing.
+void McHorVer01_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+    switch (iWidth) {
+    case 16:
+        McHorVer01WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    case 8:
+        McHorVer01WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    case 4:
+        McHorVer01WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    default:
+        break;
+    }
+}
+// Width dispatcher for the AArch64 McHorVer03 kernels: forwards to the
+// routine for width 16, 8 or 4. Other widths do nothing.
+void McHorVer03_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+    switch (iWidth) {
+    case 16:
+        McHorVer03WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    case 8:
+        McHorVer03WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    case 4:
+        McHorVer03WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    default:
+        break;
+    }
+}
+// Width dispatcher for the AArch64 McHorVer10 kernels: forwards to the
+// routine for width 16, 8 or 4. Other widths do nothing.
+void McHorVer10_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+    switch (iWidth) {
+    case 16:
+        McHorVer10WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    case 8:
+        McHorVer10WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    case 4:
+        McHorVer10WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        break;
+    default:
+        break;
+    }
+}
+// Produces the McHorVer20 and McHorVer02 outputs of pSrc in two scratch
+// buffers (AArch64 kernels) and writes their average to pDst.
+// Widths other than 16, 8 and 4 do nothing.
+void McHorVer11_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+    ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+    ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+    const int32_t kiBufStride = 16; // row pitch used for both scratch buffers
+
+    switch (iWidth) {
+    case 16:
+        McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+        McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+        PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+        break;
+    case 8:
+        McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+        McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+        PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+        break;
+    case 4:
+        McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+        McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+        PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+        break;
+    default:
+        break;
+    }
+}
+// Produces the McHorVer02 and McHorVer22 outputs of pSrc in two scratch
+// buffers (AArch64 kernels) and writes their average to pDst.
+// Widths other than 16, 8 and 4 do nothing.
+void McHorVer12_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+    ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+    ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+    const int32_t kiBufStride = 16; // row pitch used for both scratch buffers
+
+    switch (iWidth) {
+    case 16:
+        McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+        McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+        PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+        break;
+    case 8:
+        McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+        McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+        PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+        break;
+    case 4:
+        McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+        McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+        PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+        break;
+    default:
+        break;
+    }
+}
+// Averages the McHorVer20 output taken one stride past pSrc with the
+// McHorVer02 output taken at pSrc (AArch64 kernels) and writes the result to
+// pDst. Widths other than 16, 8 and 4 do nothing.
+void McHorVer13_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+    ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+    ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+    const int32_t kiBufStride = 16;                 // row pitch used for both scratch buffers
+    const uint8_t* pSrcNextRow = pSrc + iSrcStride; // input of the McHorVer20 pass
+
+    switch (iWidth) {
+    case 16:
+        McHorVer20WidthEq16_AArch64_neon (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+        McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+        PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+        break;
+    case 8:
+        McHorVer20WidthEq8_AArch64_neon (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+        McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+        PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+        break;
+    case 4:
+        McHorVer20WidthEq4_AArch64_neon (pSrcNextRow, iSrcStride, pHorTmp, kiBufStride, iHeight);
+        McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiBufStride, iHeight);
+        PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, kiBufStride, pVerTmp, kiBufStride, iHeight);
+        break;
+    default:
+        break;
+    }
+}
+// Produces the McHorVer20 and McHorVer22 outputs of pSrc in two scratch
+// buffers (AArch64 kernels) and writes their average to pDst.
+// Widths other than 16, 8 and 4 do nothing.
+void McHorVer21_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+    ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+    ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+    const int32_t kiBufStride = 16; // row pitch used for both scratch buffers
+
+    switch (iWidth) {
+    case 16:
+        McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+        McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+        PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+        break;
+    case 8:
+        McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+        McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+        PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+        break;
+    case 4:
+        McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiBufStride, iHeight);
+        McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiBufStride, iHeight);
+        PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, kiBufStride, pCtrTmp, kiBufStride, iHeight);
+        break;
+    default:
+        break;
+    }
+}
+void McHorVer23_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                     int32_t iWidth, int32_t iHeight)
+{ // Width-dispatched wrapper (16/8/4): averages a HorVer20 pass taken one row down with a HorVer22 pass; other widths do nothing.
+    ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); // presumably a 16-byte-aligned 256-byte stack buffer -- confirm against the macro definition
+    ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); // 256 bytes at stride 16 hold 16 rows; assumes iHeight <= 16 -- TODO confirm
+    if (iWidth == 16)
+    {
+        McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); // HorVer20 kernel starts at the row below pSrc
+        McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); // HorVer22 kernel starts at pSrc itself
+        PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); // combine both temporaries into pDst
+    }
+    else if (iWidth == 8)
+    {
+        McHorVer20WidthEq8_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); // temporaries keep stride 16 for every width
+        McHorVer22WidthEq8_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+        PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+    }
+    else if (iWidth == 4)
+    {
+        McHorVer20WidthEq4_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+        McHorVer22WidthEq4_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+        PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+    }
+}
+void McHorVer30_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                     int32_t iWidth, int32_t iHeight)
+{ // Forwards to the fixed-width McHorVer30 kernel for iWidth 16, 8 or 4; no temporaries are needed.
+    if (iWidth == 16)
+        McHorVer30WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    else if (iWidth == 8)
+        McHorVer30WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    else if (iWidth == 4)
+        McHorVer30WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); // any other width falls through and pDst is left unwritten
+}
+void McHorVer31_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                     int32_t iWidth, int32_t iHeight)
+{ // Width-dispatched wrapper (16/8/4): averages a HorVer20 pass with a HorVer02 pass taken one column right; other widths do nothing.
+    ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); // presumably a 16-byte-aligned 256-byte stack buffer -- confirm against the macro definition
+    ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); // 256 bytes at stride 16 hold 16 rows; assumes iHeight <= 16 -- TODO confirm
+    if (iWidth == 16) {
+        McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); // HorVer20 kernel starts at pSrc itself
+        McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); // HorVer02 kernel starts one byte to the right of pSrc
+        PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); // combine both temporaries into pDst
+    }
+    else if (iWidth == 8){
+        McHorVer20WidthEq8_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); // temporaries keep stride 16 for every width
+        McHorVer02WidthEq8_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
+        PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+    }
+    else if (iWidth == 4)
+    {
+        McHorVer20WidthEq4_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
+        McHorVer02WidthEq4_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
+        PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+    }
+}
+void McHorVer32_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                     int32_t iWidth, int32_t iHeight)
+{ // Width-dispatched wrapper (16/8/4): averages a HorVer02 pass taken one column right with a HorVer22 pass; other widths do nothing.
+    ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); // presumably a 16-byte-aligned 256-byte stack buffer -- confirm against the macro definition
+    ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); // 256 bytes at stride 16 hold 16 rows; assumes iHeight <= 16 -- TODO confirm
+    if (iWidth == 16)
+    {
+        McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); // HorVer02 kernel starts one byte to the right of pSrc
+        McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); // HorVer22 kernel starts at pSrc itself
+        PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); // combine both temporaries into pDst
+    }
+    else if (iWidth == 8)
+    {
+        McHorVer02WidthEq8_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); // temporaries keep stride 16 for every width
+        McHorVer22WidthEq8_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+        PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+    }
+    else if (iWidth == 4)
+    {
+        McHorVer02WidthEq4_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
+        McHorVer22WidthEq4_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+        PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+    }
+}
+void McHorVer33_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                     int32_t iWidth, int32_t iHeight)
+{ // Width-dispatched wrapper (16/8/4): averages a HorVer20 pass one row down with a HorVer02 pass one column right; other widths do nothing.
+    ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); // presumably a 16-byte-aligned 256-byte stack buffer -- confirm against the macro definition
+    ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); // 256 bytes at stride 16 hold 16 rows; assumes iHeight <= 16 -- TODO confirm
+    if (iWidth == 16)
+    {
+        McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); // HorVer20 kernel starts at the row below pSrc
+        McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); // HorVer02 kernel starts one byte to the right of pSrc
+        PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); // combine both temporaries into pDst
+    }
+    else if (iWidth == 8)
+    {
+        McHorVer20WidthEq8_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); // temporaries keep stride 16 for every width
+        McHorVer02WidthEq8_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
+        PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+    }
+    else if (iWidth == 4)
+    {
+        McHorVer20WidthEq4_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+        McHorVer02WidthEq4_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
+        PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+    }
+}
+
+void McLuma_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
+{ // Luma entry point: selects one of 16 wrappers from the low two bits of each MV component and forwards all other arguments unchanged.
+    static PWelsMcWidthHeightFunc pWelsMcFunc[4][4] =  //[x][y]
+    {
+        {McCopy_AArch64_neon,  McHorVer01_AArch64_neon, McHorVer02_AArch64_neon,    McHorVer03_AArch64_neon}, // row for (iMvX & 3) == 0
+        {McHorVer10_AArch64_neon, McHorVer11_AArch64_neon, McHorVer12_AArch64_neon, McHorVer13_AArch64_neon}, // row for (iMvX & 3) == 1
+        {McHorVer20_AArch64_neon,    McHorVer21_AArch64_neon, McHorVer22_AArch64_neon,    McHorVer23_AArch64_neon}, // row for (iMvX & 3) == 2
+        {McHorVer30_AArch64_neon, McHorVer31_AArch64_neon, McHorVer32_AArch64_neon, McHorVer33_AArch64_neon}, // row for (iMvX & 3) == 3
+    };
+    //	pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
+    pWelsMcFunc[iMvX&0x03][iMvY&0x03](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); // the offset above is disabled, so pSrc is presumably pre-offset by the caller -- confirm
+}
+void McChroma_AArch64_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
+                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
+{ // Chroma entry point: plain copy when both MV components are zero, otherwise weighted interpolation; widths 8/4 use assembly, the rest use C.
+    if (0 == iMvX && 0 == iMvY) // NOTE(review): tests the whole MV, not the fractional bits (& 0x07); non-zero integer-pel MVs take the interpolation branch -- confirm intended
+    {
+        if(8 == iWidth)
+            McCopyWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        else if(iWidth == 4)
+            McCopyWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+        else //here iWidth == 2
+            McCopyWidthEq2_c(pSrc,iSrcStride,pDst,iDstStride,iHeight); // C fallback; no width-2 assembly routine is called here
+    }
+    else
+    {
+        const int32_t kiD8x = iMvX & 0x07; // low three bits of the horizontal MV, used as the column index into g_kuiABCD
+        const int32_t kiD8y = iMvY & 0x07; // low three bits of the vertical MV, used as the row index into g_kuiABCD
+        if(8 == iWidth)
+            McChromaWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight); // table entry is handed over as int32_t*; presumably four packed weights -- confirm against g_kuiABCD
+        else if(4 == iWidth)
+            McChromaWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
+        else //here iWidth == 2
+            McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight); // C fallback receives the raw MV rather than a table entry
+    }
+}
+#endif
+
+void InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu) { // Fills the luma/chroma MC function pointers: C defaults first, then SIMD overrides chosen by build flags and iCpu bits.
+  pMcFunc->pMcLumaFunc   = McLuma_c; // portable defaults, kept when no SIMD branch below applies
+  pMcFunc->pMcChromaFunc = McChroma_c;
+
+#ifdef	HAVE_NEON
+  if ( iCpu & WELS_CPU_NEON ) {
+	   pMcFunc->pMcLumaFunc	  = McLuma_neon;
+	   pMcFunc->pMcChromaFunc  = McChroma_neon;
+		}
+#endif
+#ifdef	HAVE_NEON_AARCH64
+    if ( iCpu & WELS_CPU_NEON ) { // same CPU flag as the HAVE_NEON branch; only the build macro differs
+        pMcFunc->pMcLumaFunc	  = McLuma_AArch64_neon;
+        pMcFunc->pMcChromaFunc  = McChroma_AArch64_neon;
+    }
+#endif
+#if defined (X86_ASM)
+  if (iCpu & WELS_CPU_SSE2) {
+  pMcFunc->pMcLumaFunc   = McLuma_sse2; // checked last, so it overrides any earlier assignment when X86_ASM is defined and SSE2 is reported
+  pMcFunc->pMcChromaFunc = McChroma_sse2;
+  }
+#endif //(X86_ASM)
+}
+
+} // namespace WelsDec
--- a/codec/encoder/core/src/mc.cpp
+++ b/codec/encoder/core/src/mc.cpp
@@ -89,8 +89,10 @@
 HorFilterFunc fpHorFilter			= NULL;
 HorFilterFuncInput16Bits fpHorFilterInput16Bits = NULL;
 
-typedef void (*WelsMcFunc0) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, const uint8_t* psrcA, int32_t iSrcAStride,  const uint8_t* pSrcB,
+typedef void (*WelsMcFunc0) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             int32_t iHeight);
+typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, const uint8_t* psrcA, int32_t iSrcAStride,
+                             const uint8_t* pSrcB,
                              int32_t iSrcBStride, int32_t iHeight);
 WelsMcFunc0 McCopyWidthEq16 = NULL;
 WelsMcFunc0 McCopyWidthEq8 = NULL;
@@ -323,7 +325,8 @@
   pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
 }
 
-static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
                                  int32_t iHeight) {
   int32_t i, j;
   for (i = 0; i < iHeight; i++) {
@@ -335,7 +338,8 @@
   }
 }
 //vertical filter to gain half sample, that is (0, 2) location in quarter sample
-static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
                                  int32_t iHeight) {
   int32_t i, j;
   for (i = 0; i < iHeight; i++) {
@@ -347,7 +351,8 @@
   }
 }
 //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
                                  int32_t iHeight) {
   int16_t pTmp[17 + 5] = {0}; //w+1
   int32_t i, j, k;
@@ -481,94 +486,190 @@
 
 #endif //X86_ASM
 
-    //***************************************************************************//
-    //                       NEON implementation                      //
-    //***************************************************************************//
+//***************************************************************************//
+//                       NEON implementation                      //
+//***************************************************************************//
 #if defined(HAVE_NEON)
-void McHorVer20Width9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                   int32_t iWidth, int32_t iHeight) {
+void McHorVer20Width9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                int32_t iWidth, int32_t iHeight) { // selects the Width17 kernel when iWidth == 17, otherwise the Width9 kernel
   if (iWidth == 17)
-    McHorVer20Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer20Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else //if (iWidth == 9)
-    McHorVer20Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer20Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); // reached for every width other than 17, not only 9
 }
-void McHorVer02Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight){
+void McHorVer02Height9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) { // selects the Height17 kernel when iWidth == 16, otherwise the Height9 kernel
   if (iWidth == 16)
-    McHorVer02Height17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer02Height17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); // iHeight is forwarded as given; presumably 17 on this path -- confirm with callers
   else //if (iWidth == 8)
-    McHorVer02Height9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer02Height9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); // reached for every width other than 16, not only 8
 }
-void McHorVer22Width9Or17Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                              int32_t iWidth, int32_t iHeight){
+void McHorVer22Width9Or17Height9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iWidth, int32_t iHeight) { // selects the Width17 kernel when iWidth == 17, otherwise the Width9 kernel
   if (iWidth == 17)
-    McHorVer22Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer22Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else //if (iWidth == 9)
-    McHorVer22Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer22Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); // reached for every width other than 17, not only 9
 }
-void EncMcHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+void EncMcHorVer11_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { // 16-wide: averages the HorVer20 and HorVer02 results taken from pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer20WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight); // first half: HorVer20 output, stride 16
+  McHorVer02WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer02 output, stride 16
+  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); // this averaging routine takes no source strides
 }
-void EncMcHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer02WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+void EncMcHorVer12_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { // 16-wide: averages the HorVer02 and HorVer22 results taken from pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer02WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight); // first half: HorVer02 output, stride 16
+  McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer22 output, stride 16
+  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); // this averaging routine takes no source strides
 }
-void EncMcHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+void EncMcHorVer13_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { // 16-wide: averages HorVer20 taken one row down with HorVer02 taken at pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // first half: HorVer20 starting at the row below pSrc
+  McHorVer02WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer02 output, stride 16
+  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); // this averaging routine takes no source strides
 }
-void EncMcHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+void EncMcHorVer21_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { // 16-wide: averages the HorVer20 and HorVer22 results taken from pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer20WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight); // first half: HorVer20 output, stride 16
+  McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer22 output, stride 16
+  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); // this averaging routine takes no source strides
 }
-void EncMcHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+void EncMcHorVer23_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { // 16-wide: averages HorVer20 taken one row down with HorVer22 taken at pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // first half: HorVer20 starting at the row below pSrc
+  McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer22 output, stride 16
+  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); // this averaging routine takes no source strides
 }
-void EncMcHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+void EncMcHorVer31_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { // 16-wide: averages HorVer20 taken at pSrc with HorVer02 taken one column right
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer20WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight); // first half: HorVer20 output, stride 16
+  McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer02 starting one byte right of pSrc
+  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); // this averaging routine takes no source strides
 }
-void EncMcHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+void EncMcHorVer32_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { // 16-wide: averages HorVer02 taken one column right with HorVer22 taken at pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pTmp, 16, iHeight); // first half: HorVer02 starting one byte right of pSrc
+  McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer22 output, stride 16
+  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); // this averaging routine takes no source strides
 }
-void EncMcHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+void EncMcHorVer33_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { // 16-wide: averages HorVer20 taken one row down with HorVer02 taken one column right
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // first half: HorVer20 starting at the row below pSrc
+  McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer02 starting one byte right of pSrc
+  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); // this averaging routine takes no source strides
 }
-void EncMcChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
-                          SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
-  const int32_t kiD8x = sMv.iMvX&0x07;
-  const int32_t kiD8y = sMv.iMvY&0x07;
+void EncMcChroma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                       SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) { // chroma MC: copy when both fractional MV parts are zero, else weighted interpolation
+  const int32_t kiD8x = sMv.iMvX & 0x07; // low three bits of the horizontal MV
+  const int32_t kiD8y = sMv.iMvY & 0x07; // low three bits of the vertical MV
   if (0 == kiD8x && 0 == kiD8y) {
-    if(8 == iWidth)
-      McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    if (8 == iWidth)
+      McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
     else // iWidth == 4
-      McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+      McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); // taken for every width other than 8
+  } else {
+    if (8 == iWidth)
+      McChromaWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); // table entry indexed [y][x] is passed as int32_t*
+    else //if(4 == iWidth)
+      McChromaWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); // taken for every width other than 8
   }
-  else {
-    if(8 == iWidth)
-      McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
+}
+#endif
+
+#if defined(HAVE_NEON_AARCH64)
+void McHorVer20Width9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iWidth, int32_t iHeight) { // selects the Width17 kernel when iWidth == 17, otherwise the Width9 kernel
+  if (iWidth == 17)
+    McHorVer20Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else //if (iWidth == 9)
+    McHorVer20Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); // reached for every width other than 17, not only 9
+}
+void McHorVer02Height9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iWidth, int32_t iHeight) { // selects the Height17 kernel when iWidth == 16, otherwise the Height9 kernel
+  if (iWidth == 16) // NOTE(review): sibling selectors test 17; this one tests 16, matching McHorVer02Height9Or17_neon
+    McHorVer02Height17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); // iHeight is forwarded as given; presumably 17 on this path -- confirm with callers
+  else //if (iWidth == 8)
+    McHorVer02Height9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); // reached for every width other than 16, not only 8
+}
+void McHorVer22Width9Or17Height9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+    int32_t iDstStride,
+    int32_t iWidth, int32_t iHeight) { // selects the Width17 kernel when iWidth == 17, otherwise the Width9 kernel
+  if (iWidth == 17)
+    McHorVer22Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else //if (iWidth == 9)
+    McHorVer22Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); // reached for every width other than 17, not only 9
+}
+void EncMcHorVer11_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iHeight) { // 16-wide: averages the HorVer20 and HorVer02 results taken from pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight); // first half: HorVer20 output, stride 16
+  McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer02 output, stride 16
+  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // both source strides are passed explicitly as 16
+}
+void EncMcHorVer12_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iHeight) { // 16-wide: averages the HorVer02 and HorVer22 results taken from pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight); // first half: HorVer02 output, stride 16
+  McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer22 output, stride 16
+  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // both source strides are passed explicitly as 16
+}
+void EncMcHorVer13_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iHeight) { // 16-wide: averages HorVer20 taken one row down with HorVer02 taken at pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // first half: HorVer20 starting at the row below pSrc
+  McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer02 output, stride 16
+  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // both source strides are passed explicitly as 16
+}
+void EncMcHorVer21_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iHeight) { // 16-wide: averages the HorVer20 and HorVer22 results taken from pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight); // first half: HorVer20 output, stride 16
+  McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer22 output, stride 16
+  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // both source strides are passed explicitly as 16
+}
+void EncMcHorVer23_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iHeight) { // 16-wide: averages HorVer20 taken one row down with HorVer22 taken at pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // first half: HorVer20 starting at the row below pSrc
+  McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer22 output, stride 16
+  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // both source strides are passed explicitly as 16
+}
+void EncMcHorVer31_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iHeight) { // 16-wide: averages HorVer20 taken at pSrc with HorVer02 taken one column right
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight); // first half: HorVer20 output, stride 16
+  McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer02 starting one byte right of pSrc
+  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // both source strides are passed explicitly as 16
+}
+void EncMcHorVer32_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iHeight) { // 16-wide: averages HorVer02 taken one column right with HorVer22 taken at pSrc
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pTmp, 16, iHeight); // first half: HorVer02 starting one byte right of pSrc
+  McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer22 output, stride 16
+  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // both source strides are passed explicitly as 16
+}
+void EncMcHorVer33_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iHeight) { // 16-wide: averages HorVer20 taken one row down with HorVer02 taken one column right
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) // one 512-byte scratch buffer used as two 256-byte halves; assumes iHeight <= 16 -- TODO confirm
+  McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // first half: HorVer20 starting at the row below pSrc
+  McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight); // second half: HorVer02 starting one byte right of pSrc
+  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); // both source strides are passed explicitly as 16
+}
+void EncMcChroma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) { // chroma MC: copy when both fractional MV parts are zero, else weighted interpolation
+  const int32_t kiD8x = sMv.iMvX & 0x07; // low three bits of the horizontal MV
+  const int32_t kiD8y = sMv.iMvY & 0x07; // low three bits of the vertical MV
+  if (0 == kiD8x && 0 == kiD8y) {
+    if (8 == iWidth)
+      McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    else // iWidth == 4
+      McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); // taken for every width other than 8
+  } else {
+    if (8 == iWidth)
+      McChromaWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); // table entry indexed [y][x] is passed as int32_t*
     else //if(4 == iWidth)
-      McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
+      McChromaWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); // taken for every width other than 8
   }
 }
 #endif
@@ -599,7 +700,14 @@
     McHorVer03WidthEq16_neon,    EncMcHorVer13_neon,         EncMcHorVer23_neon,          EncMcHorVer33_neon
   };
 #endif
-
+#if defined(HAVE_NEON_AARCH64)
+  static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_AArch64_neon[16] = { //[x][y]
+    McCopyWidthEq16_AArch64_neon,        McHorVer10WidthEq16_AArch64_neon,   McHorVer20WidthEq16_AArch64_neon,    McHorVer30WidthEq16_AArch64_neon,
+    McHorVer01WidthEq16_AArch64_neon,    EncMcHorVer11_AArch64_neon,         EncMcHorVer21_AArch64_neon,          EncMcHorVer31_AArch64_neon,
+    McHorVer02WidthEq16_AArch64_neon,    EncMcHorVer12_AArch64_neon,         McHorVer22WidthEq16_AArch64_neon,    EncMcHorVer32_AArch64_neon,
+    McHorVer03WidthEq16_AArch64_neon,    EncMcHorVer13_AArch64_neon,         EncMcHorVer23_AArch64_neon,          EncMcHorVer33_AArch64_neon
+  };
+#endif
   pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_c;
   pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_c;
   pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_c;
@@ -649,6 +757,17 @@
     pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_neon;//iWidth+1:8/16
     pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_neon;//heigh+1:8/16
     pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_neon;//iWidth+1/heigh+1
+  }
+#endif
+#if defined(HAVE_NEON_AARCH64)
+  if (uiCpuFlag & WELS_CPU_NEON) {
+    pFuncList->sMcFuncs.pfLumaQuarpelMc	= pWelsMcFuncWidthEq16_AArch64_neon;
+    pFuncList->sMcFuncs.pfChromaMc	= EncMcChroma_AArch64_neon;
+    pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_AArch64_neon;
+    pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_AArch64_neon;
+    pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_AArch64_neon;//iWidth+1:8/16
+    pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_AArch64_neon;//heigh+1:8/16
+    pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_AArch64_neon;//iWidth+1/heigh+1
   }
 #endif
 }
--- /dev/null
+++ b/codec/encoder/core/src/mc.cpp.orig
@@ -1,0 +1,762 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	mc.cpp
+ *
+ * \brief	Interfaces implementation for motion compensation
+ *
+ * \date	03/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "mc.h"
+#include "cpu_core.h"
+
+namespace WelsSVCEnc {
+/*------------------weight for chroma fraction pixel interpolation------------------*/
+// Bilinear weights for one chroma sample; dx and dy are the fractional offsets, each in 0..7:
+//kuiA = (8 - dx) * (8 - dy);
+//kuiB = dx * (8 - dy);
+//kuiC = (8 - dx) * dy;
+//kuiD = dx * dy
+// The four weights of every entry add up to 64.
+static const uint8_t g_kuiABCD[8][8][4] = { // indexed [dy][dx]; each entry is {kuiA, kuiB, kuiC, kuiD}
+  {
+    {64, 0, 0, 0}, {56, 8, 0, 0}, {48, 16, 0, 0}, {40, 24, 0, 0},
+    {32, 32, 0, 0}, {24, 40, 0, 0}, {16, 48, 0, 0}, {8, 56, 0, 0}
+  },
+  {
+    {56, 0, 8, 0}, {49, 7, 7, 1}, {42, 14, 6, 2}, {35, 21, 5, 3},
+    {28, 28, 4, 4}, {21, 35, 3, 5}, {14, 42, 2, 6}, {7, 49, 1, 7}
+  },
+  {
+    {48, 0, 16, 0}, {42, 6, 14, 2}, {36, 12, 12, 4}, {30, 18, 10, 6},
+    {24, 24, 8, 8}, {18, 30, 6, 10}, {12, 36, 4, 12}, {6, 42, 2, 14}
+  },
+  {
+    {40, 0, 24, 0}, {35, 5, 21, 3}, {30, 10, 18, 6}, {25, 15, 15, 9},
+    {20, 20, 12, 12}, {15, 25, 9, 15}, {10, 30, 6, 18}, {5, 35, 3, 21}
+  },
+  {
+    {32, 0, 32, 0}, {28, 4, 28, 4}, {24, 8, 24, 8}, {20, 12, 20, 12},
+    {16, 16, 16, 16}, {12, 20, 12, 20}, {8, 24, 8, 24}, {4, 28, 4, 28}
+  },
+  {
+    {24, 0, 40, 0}, {21, 3, 35, 5}, {18, 6, 30, 10}, {15, 9, 25, 15},
+    {12, 12, 20, 20}, {9, 15, 15, 25}, {6, 18, 10, 30}, {3, 21, 5, 35}
+  },
+  {
+    {16, 0, 48, 0}, {14, 2, 42, 6}, {12, 4, 36, 12}, {10, 6, 30, 18},
+    {8, 8, 24, 24}, {6, 10, 18, 30}, {4, 12, 12, 36}, {2, 14, 6, 42}
+  },
+  {
+    {8, 0, 56, 0}, {7, 1, 49, 7}, {6, 2, 42, 14}, {5, 3, 35, 21},
+    {4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49}
+  }
+};
+// Signatures of the scalar filter callbacks used by the C reference paths in this file.
+typedef int32_t (*VerFilterFunc) (const uint8_t* pSrc, const int32_t kiSrcStride);
+typedef int32_t (*HorFilterFunc) (const uint8_t* pSrc);
+typedef int32_t (*HorFilterFuncInput16Bits) (int16_t* pSrc);
+
+// All three callbacks start out NULL; the code that assigns them is not part of this section.
+VerFilterFunc fpVerFilter			= NULL;
+HorFilterFunc fpHorFilter			= NULL;
+HorFilterFuncInput16Bits fpHorFilterInput16Bits = NULL;
+
+// WelsMcFunc0: one-source block routine (pSrc -> pDst) processing iHeight rows.
+// WelsMcFunc1: two-source block routine (pSrcA, pSrcB -> pDst) processing iHeight rows.
+typedef void (*WelsMcFunc0) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, const uint8_t* psrcA, int32_t iSrcAStride,  const uint8_t* pSrcB,
+                             int32_t iSrcBStride, int32_t iHeight);
+// Width-specific copy hooks consulted by McCopy(); while a hook is NULL, McCopy() uses its memcpy loop.
+WelsMcFunc0 McCopyWidthEq16 = NULL;
+WelsMcFunc0 McCopyWidthEq8 = NULL;
+WelsMcFunc0 McCopyWidthEq4 = NULL;
+// Hooks called by the McHorVerXYWidthEq16 quarter-sample helpers below.
+WelsMcFunc0 pfMcHorVer02WidthEq16 = NULL;
+WelsMcFunc1 pfPixelAvgWidthEq16  = NULL;
+WelsMcFunc0 pfMcHorVer20WidthEq16 = NULL;
+WelsMcFunc0 pfMcHorVer22WidthEq16 = NULL;
+
+//***************************************************************************//
+//                          C code implementation                            //
+//***************************************************************************//
+// Copies a 4-byte-wide block of iHeight rows from pSrc to pDst; each pointer advances by its own
+// stride after every row. Nothing is copied when iHeight <= 0.
+static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     int32_t iHeight) {
+  for (int32_t iRowsLeft = iHeight; iRowsLeft > 0; --iRowsLeft) {
+    memcpy (pDst, pSrc, 4);	// confirmed_safe_unsafe_usage
+    pSrc += iSrcStride;
+    pDst += iDstStride;
+  }
+}
+
+// Copies an 8-byte-wide block of iHeight rows from pSrc to pDst; each pointer advances by its own
+// stride after every row. Nothing is copied when iHeight <= 0.
+static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     int32_t iHeight) {
+  for (int32_t iRowsLeft = iHeight; iRowsLeft > 0; --iRowsLeft) {
+    memcpy (pDst, pSrc, 8);	// confirmed_safe_unsafe_usage
+    pSrc += iSrcStride;
+    pDst += iDstStride;
+  }
+}
+// Copies a 16-byte-wide block of iHeight rows from pSrc to pDst; each pointer advances by its own
+// stride after every row. Nothing is copied when iHeight <= 0.
+static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight) {
+  for (int32_t iRowsLeft = iHeight; iRowsLeft > 0; --iRowsLeft) {
+    memcpy (pDst, pSrc, 16);	// confirmed_safe_unsafe_usage
+    pSrc += iSrcStride;
+    pDst += iDstStride;
+  }
+}
+
+//--------------------Luma sample MC------------------//
+// 6-tap (1, -5, 20, 20, -5, 1) filter over pSrc[-2] .. pSrc[3].
+// Returns the raw weighted sum; rounding and scaling are left to the caller.
+static inline int32_t HorFilter_c (const uint8_t* pSrc) {
+  const int32_t kiOuter  = pSrc[-2] + pSrc[3];	// pair weighted  1
+  const int32_t kiMiddle = pSrc[-1] + pSrc[2];	// pair weighted -5
+  const int32_t kiInner  = pSrc[ 0] + pSrc[1];	// pair weighted 20
+
+  return kiOuter - 5 * kiMiddle + 20 * kiInner;
+}
+
+// 6-tap (1, -5, 20, 20, -5, 1) filter over the signed 16-bit samples pSrc[-2] .. pSrc[3].
+// Returns the raw weighted sum; rounding and scaling are left to the caller.
+// The weights are applied with multiplications rather than "<<": the inputs are signed and may be
+// negative, and left-shifting a negative value is undefined behaviour before C++20.
+static inline int32_t HorFilterInput16bit1_c (int16_t* pSrc) {
+  const int32_t kiPix05 = pSrc[-2] + pSrc[3];	// pair weighted  1
+  const int32_t kiPix14 = pSrc[-1] + pSrc[2];	// pair weighted -5
+  const int32_t kiPix23 = pSrc[ 0] + pSrc[1];	// pair weighted 20
+
+  return kiPix05 - 5 * kiPix14 + 20 * kiPix23;
+}
+// Vertical 6-tap (1, -5, 20, 20, -5, 1) filter over the column through pSrc: the six samples lie
+// on rows -2 .. +3 relative to pSrc, kiSrcStride bytes apart.
+// Returns the raw weighted sum, which is negative whenever the -5 taps dominate.
+// The sums are kept in signed int32_t (as in HorFilter_c) instead of uint32_t, so a negative result
+// no longer relies on unsigned wrap-around followed by an unsigned-to-signed conversion, and the
+// stride is doubled by multiplication so a negative stride is never left-shifted.
+static inline int32_t VerFilter_c (const uint8_t* pSrc, const int32_t kiSrcStride) {
+  const int32_t kiLine1 = kiSrcStride;
+  const int32_t kiLine2 = kiSrcStride * 2;
+  const int32_t kiLine3 = kiLine1 + kiLine2;
+  const int32_t kiPix05 = * (pSrc - kiLine2) + * (pSrc + kiLine3);	// pair weighted  1
+  const int32_t kiPix14 = * (pSrc - kiLine1) + * (pSrc + kiLine2);	// pair weighted -5
+  const int32_t kiPix23 = * (pSrc) + * (pSrc + kiLine1);		// pair weighted 20
+
+  return kiPix05 - 5 * kiPix14 + 20 * kiPix23;
+}
+
+// For each of iHeight rows, writes (pSrcA[x] + pSrcB[x] + 1) >> 1 into pDst[x] for x in 0..7.
+// pDst, pSrcA and pSrcB each advance by their own stride after every row.
+static inline void PixelAvgWidthEq8_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                                       const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight) {
+  for (int32_t iRowsLeft = iHeight; iRowsLeft > 0; --iRowsLeft) {
+    for (int32_t iCol = 0; iCol < 8; ++iCol) {
+      const int32_t kiSum = pSrcA[iCol] + pSrcB[iCol];
+      pDst[iCol] = (kiSum + 1) >> 1;
+    }
+    pSrcA += iSrcAStride;
+    pSrcB += iSrcBStride;
+    pDst  += iDstStride;
+  }
+}
+// For each of iHeight rows, writes (pSrcA[x] + pSrcB[x] + 1) >> 1 into pDst[x] for x in 0..15.
+// pDst, pSrcA and pSrcB each advance by their own stride after every row.
+static inline void PixelAvgWidthEq16_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                                        const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight) {
+  for (int32_t iRowsLeft = iHeight; iRowsLeft > 0; --iRowsLeft) {
+    for (int32_t iCol = 0; iCol < 16; ++iCol) {
+      const int32_t kiSum = pSrcA[iCol] + pSrcB[iCol];
+      pDst[iCol] = (kiSum + 1) >> 1;
+    }
+    pSrcA += iSrcAStride;
+    pSrcB += iSrcBStride;
+    pDst  += iDstStride;
+  }
+}
+
+// Horizontal half-sample filter, i.e. location (2, 0) in quarter-sample units, for a block
+// 16 pixels wide. Each output is the fpHorFilter sum at that pixel, plus 16, shifted right by 5
+// and passed through WelsClip1.
+static inline void McHorVer20WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) {
+  for (int32_t iRow = 0; iRow < iHeight; ++iRow) {
+    for (int32_t iCol = 0; iCol < 16; ++iCol) {
+      const int32_t kiSum = fpHorFilter (pSrc + iCol);
+      pDst[iCol] = WelsClip1 ((kiSum + 16) >> 5);
+    }
+    pSrc += iSrcStride;
+    pDst += iDstStride;
+  }
+}
+// Vertical half-sample filter, i.e. location (0, 2) in quarter-sample units, for a block
+// 16 pixels wide. Each output is the fpVerFilter sum for that column position, plus 16, shifted
+// right by 5 and passed through WelsClip1.
+static inline void McHorVer02WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) {
+  for (int32_t iRow = 0; iRow < iHeight; ++iRow) {
+    for (int32_t iCol = 0; iCol < 16; ++iCol) {
+      const int32_t kiSum = fpVerFilter (pSrc + iCol, iSrcStride);
+      pDst[iCol] = WelsClip1 ((kiSum + 16) >> 5);
+    }
+    pSrc += iSrcStride;
+    pDst += iDstStride;
+  }
+}
+// Combined horizontal and vertical half-sample filter, i.e. location (2, 2) in quarter-sample
+// units, for a block 16 pixels wide. Per row: fpVerFilter is evaluated for 16 + 5 columns starting
+// two pixels left of the block, the sums are stored as int16_t, and fpHorFilterInput16Bits is then
+// run across them; each result gets +512, is shifted right by 10 and passed through WelsClip1.
+static inline void McHorVer22WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) {
+  int16_t iVerSums[16 + 5] = {0};	// one vertically filtered value per column of the window
+
+  for (int32_t iRow = 0; iRow < iHeight; ++iRow) {
+    const uint8_t* pWindow = pSrc - 2;
+    for (int32_t iCol = 0; iCol < 16 + 5; ++iCol) {
+      iVerSums[iCol] = fpVerFilter (pWindow + iCol, iSrcStride);
+    }
+    for (int32_t iCol = 0; iCol < 16; ++iCol) {
+      // iVerSums[iCol + 2] lines up with output pixel iCol
+      pDst[iCol] = WelsClip1 ((fpHorFilterInput16Bits (iVerSums + iCol + 2) + 512) >> 10);
+    }
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
+}
+
+/////////////////////luma MC//////////////////////////
+
+// Quarter-sample location (0, 1): pfMcHorVer02WidthEq16 writes its result for pSrc into a
+// 16-byte-stride scratch buffer, which pfPixelAvgWidthEq16 then combines with the rows at pSrc.
+static inline void McHorVer01WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
+  uint8_t* const pVerHalf = pTmp;
+
+  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc, iSrcStride, pVerHalf, 16, iHeight);
+}
+// Quarter-sample location (0, 3): pfMcHorVer02WidthEq16 writes its result for pSrc into a
+// 16-byte-stride scratch buffer, which pfPixelAvgWidthEq16 then combines with the rows starting
+// one line below pSrc (pSrc + iSrcStride).
+static inline void McHorVer03WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
+  uint8_t* const pVerHalf = pTmp;
+  const uint8_t* const pNextLine = pSrc + iSrcStride;
+
+  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pNextLine, iSrcStride, pVerHalf, 16, iHeight);
+}
+// Quarter-sample location (1, 0): pfMcHorVer20WidthEq16 writes its result for pSrc into a
+// 16-byte-stride scratch buffer, which pfPixelAvgWidthEq16 then combines with the rows at pSrc.
+static inline void McHorVer10WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
+  uint8_t* const pHorHalf = pTmp;
+
+  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc, iSrcStride, pHorHalf, 16, iHeight);
+}
+// Quarter-sample location (1, 1): the pfMcHorVer20WidthEq16 and pfMcHorVer02WidthEq16 results
+// for pSrc are written to the two 256-byte halves of a scratch buffer (16-byte stride) and then
+// combined by pfPixelAvgWidthEq16.
+static inline void McHorVer11WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pVerHalf = pTmp + 256;
+
+  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+}
+// Quarter-sample location (1, 2): the pfMcHorVer02WidthEq16 and pfMcHorVer22WidthEq16 results
+// for pSrc are written to the two 256-byte halves of a scratch buffer (16-byte stride) and then
+// combined by pfPixelAvgWidthEq16.
+static inline void McHorVer12WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+  uint8_t* const pVerHalf = pTmp;
+  uint8_t* const pCenHalf = pTmp + 256;
+
+  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, pCenHalf, 16, iHeight);
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pVerHalf, 16, pCenHalf, 16, iHeight);
+}
+// Quarter-sample location (1, 3): pfMcHorVer20WidthEq16 runs one line below pSrc
+// (pSrc + iSrcStride) and pfMcHorVer02WidthEq16 runs at pSrc; both results go to the two 256-byte
+// halves of a scratch buffer (16-byte stride) and are combined by pfPixelAvgWidthEq16.
+static inline void McHorVer13WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pVerHalf = pTmp + 256;
+
+  pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pHorHalf, 16, iHeight);
+  pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+}
+// Quarter-sample location (2, 1): the pfMcHorVer20WidthEq16 and pfMcHorVer22WidthEq16 results
+// for pSrc are written to the two 256-byte halves of a scratch buffer (16-byte stride) and then
+// combined by pfPixelAvgWidthEq16.
+static inline void McHorVer21WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pCenHalf = pTmp + 256;
+
+  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, pCenHalf, 16, iHeight);
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pHorHalf, 16, pCenHalf, 16, iHeight);
+}
+// Quarter-sample location (2, 3): pfMcHorVer20WidthEq16 runs one line below pSrc
+// (pSrc + iSrcStride) and pfMcHorVer22WidthEq16 runs at pSrc; both results go to the two 256-byte
+// halves of a scratch buffer (16-byte stride) and are combined by pfPixelAvgWidthEq16.
+static inline void McHorVer23WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pCenHalf = pTmp + 256;
+
+  pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pHorHalf, 16, iHeight);
+  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, pCenHalf, 16, iHeight);
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pHorHalf, 16, pCenHalf, 16, iHeight);
+}
+// Quarter-sample location (3, 0): pfMcHorVer20WidthEq16 writes its result for pSrc into a
+// 16-byte-stride scratch buffer, which pfPixelAvgWidthEq16 then combines with the rows starting
+// one pixel to the right of pSrc (pSrc + 1).
+static inline void McHorVer30WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
+  uint8_t* const pHorHalf = pTmp;
+  const uint8_t* const pNextPixel = pSrc + 1;
+
+  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pNextPixel, iSrcStride, pHorHalf, 16, iHeight);
+}
+// Quarter-sample location (3, 1): pfMcHorVer20WidthEq16 runs at pSrc and pfMcHorVer02WidthEq16
+// runs one pixel to the right (pSrc + 1); both results go to the two 256-byte halves of a scratch
+// buffer (16-byte stride) and are combined by pfPixelAvgWidthEq16.
+static inline void McHorVer31WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pVerHalf = pTmp + 256;
+
+  pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, pVerHalf, 16, iHeight);
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+}
+// Quarter-sample location (3, 2): pfMcHorVer02WidthEq16 runs one pixel to the right of pSrc
+// (pSrc + 1) and pfMcHorVer22WidthEq16 runs at pSrc; both results go to the two 256-byte halves of
+// a scratch buffer (16-byte stride) and are combined by pfPixelAvgWidthEq16.
+static inline void McHorVer32WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+  uint8_t* const pVerHalf = pTmp;
+  uint8_t* const pCenHalf = pTmp + 256;
+
+  pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, pVerHalf, 16, iHeight);
+  pfMcHorVer22WidthEq16 (pSrc, iSrcStride, pCenHalf, 16, iHeight);
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pVerHalf, 16, pCenHalf, 16, iHeight);
+}
+// Quarter-sample location (3, 3): pfMcHorVer20WidthEq16 runs one line below pSrc
+// (pSrc + iSrcStride) and pfMcHorVer02WidthEq16 runs one pixel to the right (pSrc + 1); both
+// results go to the two 256-byte halves of a scratch buffer (16-byte stride) and are combined by
+// pfPixelAvgWidthEq16.
+static inline void McHorVer33WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pVerHalf = pTmp + 256;
+
+  pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pHorHalf, 16, iHeight);
+  pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, pVerHalf, 16, iHeight);
+  pfPixelAvgWidthEq16 (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+}
+
+// Horizontal half-sample filter for an iWidth x iHeight block. Each output is the fpHorFilter sum
+// at that pixel, plus 16, shifted right by 5 and passed through WelsClip1.
+static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                 int32_t iHeight) {
+  for (int32_t iRow = 0; iRow < iHeight; ++iRow) {
+    for (int32_t iCol = 0; iCol < iWidth; ++iCol) {
+      const int32_t kiSum = fpHorFilter (pSrc + iCol);
+      pDst[iCol] = WelsClip1 ((kiSum + 16) >> 5);
+    }
+    pSrc += iSrcStride;
+    pDst += iDstStride;
+  }
+}
+// Vertical half-sample filter, i.e. location (0, 2) in quarter-sample units, for an
+// iWidth x iHeight block. Each output is the fpVerFilter sum for that column position, plus 16,
+// shifted right by 5 and passed through WelsClip1.
+static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                 int32_t iHeight) {
+  for (int32_t iRow = 0; iRow < iHeight; ++iRow) {
+    for (int32_t iCol = 0; iCol < iWidth; ++iCol) {
+      const int32_t kiSum = fpVerFilter (pSrc + iCol, iSrcStride);
+      pDst[iCol] = WelsClip1 ((kiSum + 16) >> 5);
+    }
+    pSrc += iSrcStride;
+    pDst += iDstStride;
+  }
+}
+// Combined horizontal and vertical half-sample filter, i.e. location (2, 2) in quarter-sample
+// units, for an iWidth x iHeight block. Per row: fpVerFilter is evaluated for iWidth + 5 columns
+// starting two pixels left of the block, the sums are stored as int16_t, and
+// fpHorFilterInput16Bits is then run across them; each result gets +512, is shifted right by 10
+// and passed through WelsClip1.
+// The scratch row holds 17 + 5 entries, so iWidth must not exceed 17.
+static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                 int32_t iHeight) {
+  int16_t iVerSums[17 + 5] = {0};	// w+1
+  const int32_t kiWindowWidth = iWidth + 5;
+
+  for (int32_t iRow = 0; iRow < iHeight; ++iRow) {
+    const uint8_t* pWindow = pSrc - 2;
+    for (int32_t iCol = 0; iCol < kiWindowWidth; ++iCol) {
+      iVerSums[iCol] = fpVerFilter (pWindow + iCol, iSrcStride);
+    }
+    for (int32_t iCol = 0; iCol < iWidth; ++iCol) {
+      // iVerSums[iCol + 2] lines up with output pixel iCol
+      pDst[iCol] = WelsClip1 ((fpHorFilterInput16Bits (iVerSums + iCol + 2) + 512) >> 10);
+    }
+    pDst += iDstStride;
+    pSrc += iSrcStride;
+  }
+}
+// Copies an iWidth x iHeight block from pSrc to pDst. For widths 16, 8 and 4 the matching
+// McCopyWidthEqN hook is used when it is non-NULL; in every other case the rows are copied one by
+// one with memcpy.
+static inline void McCopy (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                           int32_t iHeight) {
+  WelsMcFunc0 pfFixedWidthCopy = NULL;
+  switch (iWidth) {
+  case 16:
+    pfFixedWidthCopy = McCopyWidthEq16;
+    break;
+  case 8:
+    pfFixedWidthCopy = McCopyWidthEq8;
+    break;
+  case 4:
+    pfFixedWidthCopy = McCopyWidthEq4;
+    break;
+  default:
+    break;
+  }
+
+  if (pfFixedWidthCopy != NULL) {
+    pfFixedWidthCopy (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    return;
+  }
+
+  for (int32_t iRow = 0; iRow < iHeight; ++iRow) {
+    memcpy (pDst, pSrc, iWidth);	// confirmed_safe_unsafe_usage
+    pSrc += iSrcStride;
+    pDst += iDstStride;
+  }
+}
+
+// Chroma motion compensation for an iWidth x iHeight block.
+// Only the low three bits of mv.iMvX / mv.iMvY are used here; pSrc is expected to have been
+// offset by the motion vector already. When both fractions are zero the block is copied with
+// McCopy. Otherwise every output pixel is the g_kuiABCD[dy][dx]-weighted sum of the 2x2
+// neighbourhood (current row and next row, columns j and j + 1), plus 32, shifted right by 6.
+void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                 SMVUnitXY mv, int32_t iWidth, int32_t iHeight) {
+  const int32_t kiFracX = mv.iMvX & 0x07;
+  const int32_t kiFracY = mv.iMvY & 0x07;
+
+  if (0 == kiFracX && 0 == kiFracY) {
+    McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+    return;
+  }
+
+  const uint8_t* const kpWeights = g_kuiABCD[kiFracY][kiFracX];
+  const int32_t kiWeightA = kpWeights[0];	// current row, column j
+  const int32_t kiWeightB = kpWeights[1];	// current row, column j + 1
+  const int32_t kiWeightC = kpWeights[2];	// next row, column j
+  const int32_t kiWeightD = kpWeights[3];	// next row, column j + 1
+
+  const uint8_t* pCurRow = pSrc;
+  const uint8_t* pNextRow = pSrc + iSrcStride;
+
+  for (int32_t iRow = 0; iRow < iHeight; ++iRow) {
+    for (int32_t iCol = 0; iCol < iWidth; ++iCol) {
+      const int32_t kiBlend = kiWeightA * pCurRow[iCol] + kiWeightB * pCurRow[iCol + 1]
+                              + kiWeightC * pNextRow[iCol] + kiWeightD * pNextRow[iCol + 1];
+      pDst[iCol] = (kiBlend + 32) >> 6;
+    }
+    pDst += iDstStride;
+    pCurRow = pNextRow;
+    pNextRow += iSrcStride;
+  }
+}
+//***************************************************************************//
+//                       MMXEXT and SSE2 implementation                      //
+//***************************************************************************//
+#if defined(X86_ASM)
+
+// Location (2, 2) for an 8-wide block, in two steps over a stack scratch area declared by
+// ENFORCE_STACK_ALIGN_2D: McHorVer22Width8HorFirst_sse2 processes iHeight + 5 rows starting two
+// pixels left of pSrc, then McHorVer22Width8VerLastAlign_sse2 produces the 8 x iHeight output.
+// Both helpers address the scratch area as bytes with a row pitch of 16.
+static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 21, 8, 16)
+  uint8_t* const pTapBytes = (uint8_t*)pTap;
+
+  McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, pTapBytes, 16, iHeight + 5);
+  McHorVer22Width8VerLastAlign_sse2 (pTapBytes, 16, pDst, iDstStride, 8, iHeight);
+}
+
+//2010.2.5
+
+// 16-wide variant built from two calls to McHorVer02WidthEq8_sse2: one for columns 0-7 and one
+// for columns 8-15 of both source and destination.
+static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* PDst, int32_t iDstStride,
+    int32_t iHeight) {
+  const uint8_t* const pSrcRight = pSrc + 8;
+  uint8_t* const pDstRight = PDst + 8;
+
+  McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, PDst, iDstStride, iHeight);
+  McHorVer02WidthEq8_sse2 (pSrcRight, iSrcStride, pDstRight, iDstStride, iHeight);
+}
+// 16-wide variant built from two calls to McHorVer22WidthEq8_sse2: one for columns 0-7 and one
+// for columns 8-15 of both source and destination.
+static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) {
+  const uint8_t* const pSrcRight = pSrc + 8;
+  uint8_t* const pDstRight = pDst + 8;
+
+  McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  McHorVer22WidthEq8_sse2 (pSrcRight, iSrcStride, pDstRight, iDstStride, iHeight);
+}
+// Location (2, 2) for the wider blocks named in the function (9 or 17). Three helper calls share a
+// stack scratch area addressed as bytes with a row pitch of 48:
+//   1. McHorVer22HorFirst_sse2 over iHeight + 5 rows, starting two pixels left of pSrc;
+//   2. McHorVer22Width8VerLastAlign_sse2 for iWidth - 1 columns from the start of the scratch area;
+//   3. McHorVer22Width8VerLastUnAlign_sse2 for the last 8 columns, i.e. scratch byte offset
+//      2 * (iWidth - 8) and destination offset iWidth - 8.
+void McHorVer22Width9Or17Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iWidth,
+    int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
+  uint8_t* const pTapBytes = (uint8_t*)pTap;
+  const int32_t kiLastColumns = iWidth - 8;	// first of the final 8 columns
+
+  McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, pTapBytes, 48, iWidth, iHeight + 5);
+  McHorVer22Width8VerLastAlign_sse2 (pTapBytes, 48, pDst, iDstStride, iWidth - 1, iHeight);
+  McHorVer22Width8VerLastUnAlign_sse2 (pTapBytes + 2 * kiLastColumns, 48, pDst + kiLastColumns, iDstStride, 8, iHeight);
+}
+
+// Signature shared by the width-specific chroma routines; pABCD receives one g_kuiABCD entry.
+typedef void (*McChromaWidthEqx) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                  const uint8_t* pABCD, int32_t iHeigh);
+// Chroma motion compensation, SSE2 flavour. When both low-3-bit fractions of sMv are zero the
+// block is copied with McCopy; otherwise the routine is chosen by iWidth >> 3
+// (0 -> McChromaWidthEq4_mmx, 1 -> McChromaWidthEq8_sse2).
+// NOTE(review): the table has two entries, so the lookup is only in range for 0 <= iWidth < 16.
+void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                    SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
+  static const McChromaWidthEqx kpfFuncs[2] = {
+    McChromaWidthEq4_mmx,
+    McChromaWidthEq8_sse2
+  };
+  const int32_t kiFracX = sMv.iMvX & 0x07;
+  const int32_t kiFracY = sMv.iMvY & 0x07;
+
+  if (0 == kiFracX && 0 == kiFracY) {
+    McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+    return;
+  }
+
+  const McChromaWidthEqx kpfChroma = kpfFuncs[iWidth >> 3];
+  kpfChroma (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiFracY][kiFracX], iHeight);
+}
+
+// Chroma motion compensation, SSSE3 flavour. When both low-3-bit fractions of sMv are zero the
+// block is copied with McCopy; otherwise the routine is chosen by iWidth >> 3
+// (0 -> McChromaWidthEq4_mmx, 1 -> McChromaWidthEq8_ssse3).
+// NOTE(review): the table has two entries, so the lookup is only in range for 0 <= iWidth < 16.
+void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                     SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
+  static const McChromaWidthEqx kpfFuncs[2] = {
+    McChromaWidthEq4_mmx,
+    McChromaWidthEq8_ssse3
+  };
+  const int32_t kiFracX = sMv.iMvX & 0x07;
+  const int32_t kiFracY = sMv.iMvY & 0x07;
+
+  if (0 == kiFracX && 0 == kiFracY) {
+    McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+    return;
+  }
+
+  const McChromaWidthEqx kpfChroma = kpfFuncs[iWidth >> 3];
+  kpfChroma (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiFracY][kiFracX], iHeight);
+}
+
+#endif //X86_ASM
+
+    //***************************************************************************//
+    //                       NEON implementation                      //
+    //***************************************************************************//
+#if defined(HAVE_NEON)
+// Width dispatcher: iWidth == 17 selects McHorVer20Width17_neon; every other value is handled by
+// McHorVer20Width9_neon. iWidth itself is not forwarded.
+void McHorVer20Width9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                   int32_t iWidth, int32_t iHeight) {
+  if (17 != iWidth) {
+    McHorVer20Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    return;
+  }
+  McHorVer20Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+// Dispatcher keyed on the width: iWidth == 16 selects McHorVer02Height17_neon; every other value
+// is handled by McHorVer02Height9_neon. iHeight is forwarded unchanged, iWidth is not forwarded.
+void McHorVer02Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight){
+  if (16 != iWidth) {
+    McHorVer02Height9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    return;
+  }
+  McHorVer02Height17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+// Width dispatcher: iWidth == 17 selects McHorVer22Width17_neon; every other value is handled by
+// McHorVer22Width9_neon. iWidth itself is not forwarded.
+void McHorVer22Width9Or17Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                              int32_t iWidth, int32_t iHeight){
+  if (17 != iWidth) {
+    McHorVer22Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    return;
+  }
+  McHorVer22Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+// Quarter-sample location (1, 1): McHorVer20WidthEq16_neon and McHorVer02WidthEq16_neon both run
+// at pSrc into the two 256-byte halves of a scratch buffer (16-byte stride); the halves are then
+// combined by PixelAvgWidthEq16_neon.
+void EncMcHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pVerHalf = pTmp + 256;
+
+  McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pHorHalf, pVerHalf, iHeight);
+}
+// Quarter-sample location (1, 2): McHorVer02WidthEq16_neon and McHorVer22WidthEq16_neon both run
+// at pSrc into the two 256-byte halves of a scratch buffer (16-byte stride); the halves are then
+// combined by PixelAvgWidthEq16_neon.
+void EncMcHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pVerHalf = pTmp;
+  uint8_t* const pCenHalf = pTmp + 256;
+
+  McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCenHalf, 16, iHeight);
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pVerHalf, pCenHalf, iHeight);
+}
+// Quarter-sample location (1, 3): McHorVer20WidthEq16_neon runs one line below pSrc
+// (pSrc + iSrcStride) and McHorVer02WidthEq16_neon runs at pSrc; the results fill the two 256-byte
+// halves of a scratch buffer (16-byte stride) and are combined by PixelAvgWidthEq16_neon.
+void EncMcHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pVerHalf = pTmp + 256;
+
+  McHorVer20WidthEq16_neon(pSrc + iSrcStride, iSrcStride, pHorHalf, 16, iHeight);
+  McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pHorHalf, pVerHalf, iHeight);
+}
+// Quarter-sample location (2, 1): McHorVer20WidthEq16_neon and McHorVer22WidthEq16_neon both run
+// at pSrc into the two 256-byte halves of a scratch buffer (16-byte stride); the halves are then
+// combined by PixelAvgWidthEq16_neon.
+void EncMcHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pCenHalf = pTmp + 256;
+
+  McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCenHalf, 16, iHeight);
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pHorHalf, pCenHalf, iHeight);
+}
+// Quarter-sample location (2, 3): McHorVer20WidthEq16_neon runs one line below pSrc
+// (pSrc + iSrcStride) and McHorVer22WidthEq16_neon runs at pSrc; the results fill the two 256-byte
+// halves of a scratch buffer (16-byte stride) and are combined by PixelAvgWidthEq16_neon.
+void EncMcHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pCenHalf = pTmp + 256;
+
+  McHorVer20WidthEq16_neon(pSrc + iSrcStride, iSrcStride, pHorHalf, 16, iHeight);
+  McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCenHalf, 16, iHeight);
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pHorHalf, pCenHalf, iHeight);
+}
+// Quarter-sample location (3, 1): McHorVer20WidthEq16_neon runs at pSrc and
+// McHorVer02WidthEq16_neon runs one pixel to the right (pSrc + 1); the results fill the two
+// 256-byte halves of a scratch buffer (16-byte stride) and are combined by PixelAvgWidthEq16_neon.
+void EncMcHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pVerHalf = pTmp + 256;
+
+  McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  McHorVer02WidthEq16_neon(pSrc + 1, iSrcStride, pVerHalf, 16, iHeight);
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pHorHalf, pVerHalf, iHeight);
+}
+// Quarter-sample location (3, 2): McHorVer02WidthEq16_neon runs one pixel to the right of pSrc
+// (pSrc + 1) and McHorVer22WidthEq16_neon runs at pSrc; the results fill the two 256-byte halves
+// of a scratch buffer (16-byte stride) and are combined by PixelAvgWidthEq16_neon.
+void EncMcHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pVerHalf = pTmp;
+  uint8_t* const pCenHalf = pTmp + 256;
+
+  McHorVer02WidthEq16_neon(pSrc + 1, iSrcStride, pVerHalf, 16, iHeight);
+  McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCenHalf, 16, iHeight);
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pVerHalf, pCenHalf, iHeight);
+}
+// Quarter-sample location (3, 3): McHorVer20WidthEq16_neon runs one line below pSrc
+// (pSrc + iSrcStride) and McHorVer02WidthEq16_neon runs one pixel to the right (pSrc + 1); the
+// results fill the two 256-byte halves of a scratch buffer (16-byte stride) and are combined by
+// PixelAvgWidthEq16_neon.
+void EncMcHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pVerHalf = pTmp + 256;
+
+  McHorVer20WidthEq16_neon(pSrc + iSrcStride, iSrcStride, pHorHalf, 16, iHeight);
+  McHorVer02WidthEq16_neon(pSrc + 1, iSrcStride, pVerHalf, 16, iHeight);
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pHorHalf, pVerHalf, iHeight);
+}
+// Chroma motion compensation, NEON flavour. iWidth == 8 takes the 8-wide routines and every other
+// width takes the 4-wide ones. When both low-3-bit fractions of sMv are zero the block is copied;
+// otherwise the McChromaWidthEqN_neon routine receives the g_kuiABCD[dy][dx] weight entry.
+void EncMcChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
+                          SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
+  const int32_t kiFracX = sMv.iMvX & 0x07;
+  const int32_t kiFracY = sMv.iMvY & 0x07;
+  const bool kbCopyOnly = (0 == kiFracX) && (0 == kiFracY);
+  const bool kbWidth8 = (8 == iWidth);
+
+  if (kbCopyOnly && kbWidth8) {
+    McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  } else if (kbCopyOnly) {	// iWidth == 4
+    McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  } else if (kbWidth8) {
+    McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiFracY][kiFracX]), iHeight);
+  } else {	// iWidth == 4
+    McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiFracY][kiFracX]), iHeight);
+  }
+}
+#endif
+
+#if defined(HAVE_NEON_AARCH64)
+// Width dispatcher: iWidth == 17 selects McHorVer20Width17_AArch64_neon; every other value is
+// handled by McHorVer20Width9_AArch64_neon. iWidth itself is not forwarded.
+void McHorVer20Width9Or17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iWidth, int32_t iHeight) {
+  if (17 != iWidth) {
+    McHorVer20Width9_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    return;
+  }
+  McHorVer20Width17_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+// Dispatcher keyed on the width: iWidth == 16 selects McHorVer02Height17_AArch64_neon; every other
+// value is handled by McHorVer02Height9_AArch64_neon. iHeight is forwarded unchanged, iWidth is
+// not forwarded.
+void McHorVer02Height9Or17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                int32_t iWidth, int32_t iHeight){
+  if (16 != iWidth) {
+    McHorVer02Height9_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    return;
+  }
+  McHorVer02Height17_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+// Width dispatcher: iWidth == 17 selects McHorVer22Width17_AArch64_neon; every other value is
+// handled by McHorVer22Width9_AArch64_neon. iWidth itself is not forwarded.
+void McHorVer22Width9Or17Height9Or17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                          int32_t iWidth, int32_t iHeight){
+  if (17 != iWidth) {
+    McHorVer22Width9_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    return;
+  }
+  McHorVer22Width17_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+// Quarter-sample location (1, 1): McHorVer20WidthEq16_AArch64_neon and
+// McHorVer02WidthEq16_AArch64_neon both run at pSrc into the two 256-byte halves of a scratch
+// buffer (16-byte stride); the halves are then combined by PixelAvgWidthEq16_AArch64_neon.
+void EncMcHorVer11_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pVerHalf = pTmp + 256;
+
+  McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+}
+// Quarter-sample location (1, 2): McHorVer02WidthEq16_AArch64_neon and
+// McHorVer22WidthEq16_AArch64_neon both run at pSrc into the two 256-byte halves of a scratch
+// buffer (16-byte stride); the halves are then combined by PixelAvgWidthEq16_AArch64_neon.
+void EncMcHorVer12_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pVerHalf = pTmp;
+  uint8_t* const pCenHalf = pTmp + 256;
+
+  McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCenHalf, 16, iHeight);
+  PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pVerHalf, 16, pCenHalf, 16, iHeight);
+}
+// Quarter-sample location (1, 3): McHorVer20WidthEq16_AArch64_neon runs one line below pSrc
+// (pSrc + iSrcStride) and McHorVer02WidthEq16_AArch64_neon runs at pSrc; the results fill the two
+// 256-byte halves of a scratch buffer (16-byte stride) and are combined by
+// PixelAvgWidthEq16_AArch64_neon.
+void EncMcHorVer13_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pVerHalf = pTmp + 256;
+
+  McHorVer20WidthEq16_AArch64_neon(pSrc + iSrcStride, iSrcStride, pHorHalf, 16, iHeight);
+  McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+}
+// Quarter-sample location (2, 1): McHorVer20WidthEq16_AArch64_neon and
+// McHorVer22WidthEq16_AArch64_neon both run at pSrc into the two 256-byte halves of a scratch
+// buffer (16-byte stride); the halves are then combined by PixelAvgWidthEq16_AArch64_neon.
+void EncMcHorVer21_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pCenHalf = pTmp + 256;
+
+  McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCenHalf, 16, iHeight);
+  PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorHalf, 16, pCenHalf, 16, iHeight);
+}
+// Quarter-sample location (2, 3): McHorVer20WidthEq16_AArch64_neon runs one line below pSrc
+// (pSrc + iSrcStride) and McHorVer22WidthEq16_AArch64_neon runs at pSrc; the results fill the two
+// 256-byte halves of a scratch buffer (16-byte stride) and are combined by
+// PixelAvgWidthEq16_AArch64_neon.
+void EncMcHorVer23_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pCenHalf = pTmp + 256;
+
+  McHorVer20WidthEq16_AArch64_neon(pSrc + iSrcStride, iSrcStride, pHorHalf, 16, iHeight);
+  McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCenHalf, 16, iHeight);
+  PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorHalf, 16, pCenHalf, 16, iHeight);
+}
+// Quarter-sample location (3, 1): McHorVer20WidthEq16_AArch64_neon runs at pSrc and
+// McHorVer02WidthEq16_AArch64_neon runs one pixel to the right (pSrc + 1); the results fill the
+// two 256-byte halves of a scratch buffer (16-byte stride) and are combined by
+// PixelAvgWidthEq16_AArch64_neon.
+void EncMcHorVer31_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pVerHalf = pTmp + 256;
+
+  McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  McHorVer02WidthEq16_AArch64_neon(pSrc + 1, iSrcStride, pVerHalf, 16, iHeight);
+  PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+}
+// Quarter-sample location (3, 2): McHorVer02WidthEq16_AArch64_neon runs one pixel to the right of
+// pSrc (pSrc + 1) and McHorVer22WidthEq16_AArch64_neon runs at pSrc; the results fill the two
+// 256-byte halves of a scratch buffer (16-byte stride) and are combined by
+// PixelAvgWidthEq16_AArch64_neon.
+void EncMcHorVer32_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pVerHalf = pTmp;
+  uint8_t* const pCenHalf = pTmp + 256;
+
+  McHorVer02WidthEq16_AArch64_neon(pSrc + 1, iSrcStride, pVerHalf, 16, iHeight);
+  McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCenHalf, 16, iHeight);
+  PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pVerHalf, 16, pCenHalf, 16, iHeight);
+}
+// Quarter-sample location (3, 3): McHorVer20WidthEq16_AArch64_neon runs one line below pSrc
+// (pSrc + iSrcStride) and McHorVer02WidthEq16_AArch64_neon runs one pixel to the right
+// (pSrc + 1); the results fill the two 256-byte halves of a scratch buffer (16-byte stride) and
+// are combined by PixelAvgWidthEq16_AArch64_neon.
+void EncMcHorVer33_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+  uint8_t* const pHorHalf = pTmp;
+  uint8_t* const pVerHalf = pTmp + 256;
+
+  McHorVer20WidthEq16_AArch64_neon(pSrc + iSrcStride, iSrcStride, pHorHalf, 16, iHeight);
+  McHorVer02WidthEq16_AArch64_neon(pSrc + 1, iSrcStride, pVerHalf, 16, iHeight);
+  PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+}
+// Chroma motion compensation, AArch64 NEON flavour. iWidth == 8 takes the 8-wide routines and
+// every other width takes the 4-wide ones. When both low-3-bit fractions of sMv are zero the
+// block is copied; otherwise the McChromaWidthEqN_AArch64_neon routine receives the
+// g_kuiABCD[dy][dx] weight entry.
+void EncMcChroma_AArch64_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
+                      SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
+  const int32_t kiFracX = sMv.iMvX & 0x07;
+  const int32_t kiFracY = sMv.iMvY & 0x07;
+  const bool kbCopyOnly = (0 == kiFracX) && (0 == kiFracY);
+  const bool kbWidth8 = (8 == iWidth);
+
+  if (kbCopyOnly && kbWidth8) {
+    McCopyWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  } else if (kbCopyOnly) {	// iWidth == 4
+    McCopyWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  } else if (kbWidth8) {
+    McChromaWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiFracY][kiFracX]), iHeight);
+  } else {	// iWidth == 4
+    McChromaWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiFracY][kiFracX]), iHeight);
+  }
+}
+#endif
+
+typedef void (*PixelAvgFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*, int32_t, int32_t);
+// Fills the motion-compensation entries of pFuncList->sMcFuncs and the dispatch
+// pointers declared outside this function (fpVerFilter, McCopyWidthEq4, ...).
+//
+// Everything is first set to the plain C implementation; the blocks guarded by
+// X86_ASM / HAVE_NEON / HAVE_NEON_AARCH64 then override entries when the
+// matching bit is set in uiCpuFlag.
+//
+// pFuncList  function table to fill in; its sMcFuncs member is written.
+// uiCpuFlag  bit mask tested against WELS_CPU_SSE2, WELS_CPU_SSSE3 and WELS_CPU_NEON.
+void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
+  // pfSampleAveraging is pointed at this static array further down, so the SIMD
+  // branches that assign pfSampleAveraging[i] overwrite these slots in place.
+  static PixelAvgFunc pfPixAvgFunc[2] = {PixelAvgWidthEq8_c, PixelAvgWidthEq16_c};
+
+  static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16[16] = { //[y*4+x]
+    McCopyWidthEq16_c,  McHorVer10WidthEq16, McHorVer20WidthEq16_c,     McHorVer30WidthEq16,
+    McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16,
+    McHorVer02WidthEq16_c,     McHorVer12WidthEq16, McHorVer22WidthEq16_c,    McHorVer32WidthEq16,
+    McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
+  };
+#if defined (X86_ASM)
+  static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_sse2[16] = { //[y*4+x]
+    McCopyWidthEq16_sse2,  McHorVer10WidthEq16, McHorVer20WidthEq16_sse2,     McHorVer30WidthEq16,
+    McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16,
+    McHorVer02WidthEq16_sse2,     McHorVer12WidthEq16, McHorVer22WidthEq16_sse2,    McHorVer32WidthEq16,
+    McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
+  };
+#endif
+#if defined(HAVE_NEON)
+  // Same layout as the C table above: entry [y*4+x].
+  static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_neon[16] = { //[y*4+x]
+    McCopyWidthEq16_neon,        McHorVer10WidthEq16_neon,   McHorVer20WidthEq16_neon,    McHorVer30WidthEq16_neon,
+    McHorVer01WidthEq16_neon,    EncMcHorVer11_neon,         EncMcHorVer21_neon,          EncMcHorVer31_neon,
+    McHorVer02WidthEq16_neon,    EncMcHorVer12_neon,         McHorVer22WidthEq16_neon,    EncMcHorVer32_neon,
+    McHorVer03WidthEq16_neon,    EncMcHorVer13_neon,         EncMcHorVer23_neon,          EncMcHorVer33_neon
+  };
+#endif
+#if defined(HAVE_NEON_AARCH64)
+  // Same layout as the C table above: entry [y*4+x].
+  static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_AArch64_neon[16] = { //[y*4+x]
+    McCopyWidthEq16_AArch64_neon,        McHorVer10WidthEq16_AArch64_neon,   McHorVer20WidthEq16_AArch64_neon,    McHorVer30WidthEq16_AArch64_neon,
+    McHorVer01WidthEq16_AArch64_neon,    EncMcHorVer11_AArch64_neon,         EncMcHorVer21_AArch64_neon,          EncMcHorVer31_AArch64_neon,
+    McHorVer02WidthEq16_AArch64_neon,    EncMcHorVer12_AArch64_neon,         McHorVer22WidthEq16_AArch64_neon,    EncMcHorVer32_AArch64_neon,
+    McHorVer03WidthEq16_AArch64_neon,    EncMcHorVer13_AArch64_neon,         EncMcHorVer23_AArch64_neon,          EncMcHorVer33_AArch64_neon
+  };
+#endif
+
+  // Re-seed the shared averaging table with the C routines on every call. The
+  // static initializer runs only once, so without this a call whose uiCpuFlag
+  // lacks a SIMD bit would keep the SIMD entries written by an earlier call,
+  // unlike every other pointer below, which is reset to its C version each time.
+  pfPixAvgFunc[0] = PixelAvgWidthEq8_c;
+  pfPixAvgFunc[1] = PixelAvgWidthEq16_c;
+
+  // Baseline: plain C implementations.
+  pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_c;
+  pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_c;
+  pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_c;
+  pFuncList->sMcFuncs.pfSampleAveraging = pfPixAvgFunc;
+  pFuncList->sMcFuncs.pfChromaMc = McChroma_c;
+  fpVerFilter = VerFilter_c;
+  fpHorFilter = HorFilter_c;
+  fpHorFilterInput16Bits = HorFilterInput16bit1_c;
+  McCopyWidthEq4 = McCopyWidthEq4_c;
+  McCopyWidthEq8 = McCopyWidthEq8_c;
+  McCopyWidthEq16 = McCopyWidthEq16_c;
+  pfPixelAvgWidthEq16 = PixelAvgWidthEq16_c;
+  pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_c;
+  pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_c;
+  pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_c;
+  pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16;
+#if defined (X86_ASM)
+  if (uiCpuFlag & WELS_CPU_SSE2) {
+    pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_sse2;
+    pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_sse2;
+    pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_sse2;
+    pFuncList->sMcFuncs.pfSampleAveraging[0] = PixelAvgWidthEq8_mmx;
+    pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_sse2;
+    pFuncList->sMcFuncs.pfChromaMc = McChroma_sse2;
+    McCopyWidthEq4 = McCopyWidthEq4_mmx;
+    McCopyWidthEq8 = McCopyWidthEq8_mmx;
+    McCopyWidthEq16 = McCopyWidthEq16_sse2;
+    pfPixelAvgWidthEq16 = PixelAvgWidthEq16_sse2;
+    pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_sse2;
+    pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_sse2;
+    pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_sse2;
+    pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_sse2;
+  }
+
+  // Tested after the SSE2 block so the SSSE3 chroma routine takes precedence.
+  if (uiCpuFlag & WELS_CPU_SSSE3) {
+    pFuncList->sMcFuncs.pfChromaMc = McChroma_ssse3;
+  }
+
+#endif //(X86_ASM)
+
+#if defined(HAVE_NEON)
+  if (uiCpuFlag & WELS_CPU_NEON) {
+    pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_neon;
+    pFuncList->sMcFuncs.pfChromaMc = EncMcChroma_neon;
+    pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_neon;
+    pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_neon;
+    pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_neon;//iWidth+1:8/16
+    pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_neon;//height+1:8/16
+    pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_neon;//iWidth+1/height+1
+  }
+#endif
+#if defined(HAVE_NEON_AARCH64)
+  if (uiCpuFlag & WELS_CPU_NEON) {
+    pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_AArch64_neon;
+    pFuncList->sMcFuncs.pfChromaMc = EncMcChroma_AArch64_neon;
+    pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_AArch64_neon;
+    pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_AArch64_neon;
+    pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_AArch64_neon;//iWidth+1:8/16
+    pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_AArch64_neon;//height+1:8/16
+    pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_AArch64_neon;//iWidth+1/height+1
+  }
+#endif
+}
+}