shithub: openh264

Download patch

ref: c9433ee73bcc4a0e42fec17a34692833e75a26cf
parent: 50daa8f737fe174d723a10f97fa7ec24f54a6178
parent: 1ecb9582dfc4893348806ec31582ea9ab3114fae
author: HaiboZhu <haibozhu@cisco.com>
date: Mon Apr 18 05:21:24 EDT 2016

Merge pull request #2442 from ruil2/deblocking_fix

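Fix handling of 32-bit parameters in arm64 assembly functions: apply SIGN_EXTENSION to int32_t arguments (strides, sizes, heights) before they are used as 64-bit x registers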

--- a/codec/common/arm64/copy_mb_aarch64_neon.S
+++ b/codec/common/arm64/copy_mb_aarch64_neon.S
@@ -105,9 +105,10 @@
 //  }
 .endm
 
-
+//void WelsCopy8x8_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy8x8_AArch64_neon
-
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
     LOAD_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
 
     STORE_UNALIGNED_DATA_WITH_STRIDE    v0, v1, v2, v3, x0, x1
@@ -120,7 +121,8 @@
 
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy16x16_AArch64_neon
-
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
     LOAD16_ALIGNED_DATA_WITH_STRIDE   v0, v1, v2, v3, x2, x3
 
     STORE16_ALIGNED_DATA_WITH_STRIDE  v0, v1, v2, v3, x0, x1
@@ -141,7 +143,8 @@
 
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy16x16NotAligned_AArch64_neon
-
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
     LOAD16_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
 
     STORE16_UNALIGNED_DATA_WITH_STRIDE    v0, v1, v2, v3, x0, x1
@@ -162,7 +165,8 @@
 
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy16x8NotAligned_AArch64_neon
-
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
     LOAD16_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
 
     STORE16_UNALIGNED_DATA_WITH_STRIDE    v0, v1, v2, v3, x0, x1
@@ -175,7 +179,8 @@
 
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy8x16_AArch64_neon
-
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
     LOAD_UNALIGNED_DATA_WITH_STRIDE v0, v1, v2, v3, x2, x3
 
     STORE_UNALIGNED_DATA_WITH_STRIDE    v0, v1, v2, v3, x0, x1
--- a/codec/common/arm64/expand_picture_aarch64_neon.S
+++ b/codec/common/arm64/expand_picture_aarch64_neon.S
@@ -32,8 +32,11 @@
 
 #ifdef HAVE_NEON_AARCH64
 #include "arm_arch64_common_macro.S"
-
+//void ExpandPictureLuma_AArch64_neon (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
 WELS_ASM_AARCH64_FUNC_BEGIN ExpandPictureLuma_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x2,w2
+    SIGN_EXTENSION x3,w3
     mov x7, x0
     mov x8, x3
     add x4, x7, x2
@@ -73,8 +76,13 @@
     cbnz x2, _expand_picture_luma_loop0
 WELS_ASM_AARCH64_FUNC_END
 
+//void ExpandPictureChroma_AArch64_neon (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
+//                                       const int32_t kiPicH);
 WELS_ASM_AARCH64_FUNC_BEGIN ExpandPictureChroma_AArch64_neon
     //Save the dst
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x2,w2
+    SIGN_EXTENSION x3,w3
     mov x7, x0
     mov x8, x3
     mov x10, #16
--- a/codec/common/arm64/intra_pred_common_aarch64_neon.S
+++ b/codec/common/arm64/intra_pred_common_aarch64_neon.S
@@ -34,7 +34,9 @@
 #include "arm_arch64_common_macro.S"
 
 //for Luma 16x16
+//void WelsI16x16LumaPredV_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredV_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     ld1     {v0.16b}, [x3]
 .rept 16
@@ -42,7 +44,9 @@
 .endr
 WELS_ASM_AARCH64_FUNC_END
 
+//void WelsI16x16LumaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredH_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, #1
 .rept 16
     ld1r    {v0.16b}, [x3], x2
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -294,6 +294,9 @@
     sub x0, x0, #2
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
 w16_h_mc_luma_loop:
     ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 21(16+5); v2=src[-2]
     trn1 v2.2d, v2.2d, v3.2d
@@ -312,11 +315,15 @@
     cbnz x4, w16_h_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer20WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq8_AArch64_neon
     sub x0, x0, #2
     stp d8,d9, [sp,#-16]!
     movi v8.8h, #20, lsl #0
     movi v9.8h, #5, lsl #0
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
 w8_h_mc_luma_loop:
     VEC4_LD1_8BITS_16ELEMENT x0, x1, v16, v20, v24, v28    //load src[-2] in v16,v20,v24,v28 for 4 row; only use 13(8+5);
     sub x4, x4, #4
@@ -366,10 +373,15 @@
     ldp d8,d9,[sp],#16
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer20WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq4_AArch64_neon
     sub x0, x0, #2
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     asr x4, x4, #1
 w4_h_mc_luma_loop:
     ld1 {v2.16b}, [x0], x1 //only use 9(4+5); 1st row src[-2:6]
@@ -401,10 +413,15 @@
     cbnz x4, w4_h_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer10WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                       int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq16_AArch64_neon
     sub x0, x0, #2
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
 w16_xy_10_mc_luma_loop:
     ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 21(16+5); v2=src[-2]
     trn1 v2.2d, v2.2d, v3.2d
@@ -423,11 +440,16 @@
     cbnz x4, w16_xy_10_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer10WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq8_AArch64_neon
     sub x0, x0, #2
     stp d8,d9, [sp,#-16]!
     movi v8.8h, #20, lsl #0
     movi v9.8h, #5, lsl #0
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
 w8_xy_10_mc_luma_loop:
     VEC4_LD1_8BITS_16ELEMENT x0, x1, v16, v20, v24, v28    //load src[-2] in v16,v20,v24,v28 for 4 row; only use 13(8+5);
     sub x4, x4, #4
@@ -479,10 +501,15 @@
     ldp d8,d9,[sp],#16
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer10WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq4_AArch64_neon
     sub x0, x0, #2
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     asr x4, x4, #1
 w4_xy_10_mc_luma_loop:
     ld1 {v2.16b}, [x0], x1 //only use 9(4+5); 1st row src[-2:6]
@@ -514,11 +541,15 @@
     cbnz x4, w4_xy_10_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void McHorVer30WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                       int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq16_AArch64_neon
     sub x0, x0, #2
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
 w16_xy_30_mc_luma_loop:
     ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 21(16+5); v2=src[-2]
     trn1 v2.2d, v2.2d, v3.2d
@@ -537,11 +568,16 @@
     cbnz x4, w16_xy_30_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer30WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq8_AArch64_neon
     sub x0, x0, #2
     stp d8,d9, [sp,#-16]!
     movi v8.8h, #20, lsl #0
     movi v9.8h, #5, lsl #0
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
 w8_xy_30_mc_luma_loop:
     VEC4_LD1_8BITS_16ELEMENT x0, x1, v16, v20, v24, v28    //load src[-2] in v16,v20,v24,v28 for 4 row; only use 13(8+5);
     sub x4, x4, #4
@@ -593,10 +629,15 @@
     ldp d8,d9,[sp],#16
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer30WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq4_AArch64_neon
     sub x0, x0, #2
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     asr x4, x4, #1
 w4_xy_30_mc_luma_loop:
     ld1 {v2.16b}, [x0], x1 //only use 9(4+5); 1st row src[-2:6]
@@ -628,8 +669,12 @@
     cbnz x4, w4_xy_30_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void McHorVer01WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                       int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer01WidthEq16_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
@@ -711,7 +756,12 @@
     cbnz x4, w16_xy_01_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer01WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer01WidthEq8_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, x1, lsl #1
     movi v30.8h, #20, lsl #0
     movi v31.8h, #5, lsl #0
@@ -750,7 +800,12 @@
     cbnz x4, w8_xy_01_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer01WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer01WidthEq4_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
@@ -805,8 +860,12 @@
     cbnz x4, w4_xy_01_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void McHorVer03WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                       int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer03WidthEq16_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
@@ -888,7 +947,12 @@
     cbnz x4, w16_xy_03_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer03WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer03WidthEq8_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, x1, lsl #1
     movi v30.8h, #20, lsl #0
     movi v31.8h, #5, lsl #0
@@ -927,7 +991,12 @@
     cbnz x4, w8_xy_03_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer03WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer03WidthEq4_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
@@ -982,8 +1051,12 @@
     cbnz x4, w4_xy_03_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void McHorVer02WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                       int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02WidthEq16_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
@@ -1065,7 +1138,12 @@
     cbnz x4, w16_xy_02_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer02WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02WidthEq8_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, x1, lsl #1
     movi v30.8h, #20, lsl #0
     movi v31.8h, #5, lsl #0
@@ -1100,7 +1178,12 @@
     cbnz x4, w8_xy_02_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer02WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02WidthEq4_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
@@ -1155,8 +1238,12 @@
     cbnz x4, w4_xy_02_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void McHorVer22WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                       int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22WidthEq16_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     stp d8, d9, [sp,#-16]!
     stp d10, d11, [sp,#-16]!
     stp d12, d13, [sp,#-16]!
@@ -1321,7 +1408,12 @@
     ldp d8, d9, [sp], #16
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer22WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22WidthEq8_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, #2
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
@@ -1391,9 +1483,13 @@
     sub x4, x4, #4
     cbnz x4, w8_hv_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
+//void McHorVer22WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);
 
-
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22WidthEq4_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, #2
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
@@ -1462,9 +1558,13 @@
     sub x4, x4, #4
     cbnz x4, w4_hv_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
-
+//void McCopyWidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                   int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McCopyWidthEq16_AArch64_neon
     //prfm pldl1strm, [x0]
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
 w16_copy_loop:
     //prfm pldl1strm, [x0, x1]
     ld1 {v0.16b}, [x0], x1  //read 16Byte : 0 line
@@ -1476,9 +1576,13 @@
     sub x4, x4, #2
     cbnz x4, w16_copy_loop
 WELS_ASM_AARCH64_FUNC_END
-
+//void McCopyWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                  int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McCopyWidthEq8_AArch64_neon
     //prfm pldl1strm, [x0]
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
 w8_copy_loop:
     //prfm pldl1strm, [x0, x1]
     ld1 {v0.8b}, [x0], x1  //read 16Byte : 0 line
@@ -1493,6 +1597,9 @@
 
 WELS_ASM_AARCH64_FUNC_BEGIN McCopyWidthEq4_AArch64_neon
     //prfm pldl1strm, [x0]
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
 w4_copy_loop:
     //prfm pldl1strm, [x0, x1]
     ld1 {v0.s}[0], [x0], x1  //read 16Byte : 0 line
@@ -1505,8 +1612,14 @@
     cbnz x4, w4_copy_loop
 WELS_ASM_AARCH64_FUNC_END
 
-WELS_ASM_AARCH64_FUNC_BEGIN PixStrideAvgWidthEq16_AArch64_neon
+//void PixStrideAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
+//const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
 
+WELS_ASM_AARCH64_FUNC_BEGIN PixStrideAvgWidthEq16_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x5,w5
+    SIGN_EXTENSION x6,w6
 enc_w16_pix_avg_loop:
     ld1 {v0.16b}, [x2], x3  //read 16Byte : src0: 0 line
     ld1 {v1.16b}, [x4], x5  //read 16Byte : src1: 0 line
@@ -1538,9 +1651,15 @@
     cbnz x6, enc_w16_pix_avg_loop
 WELS_ASM_AARCH64_FUNC_END
 
+//void PixStrideAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
+//                                        const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN PixStrideAvgWidthEq8_AArch64_neon
     //prfm pldl1strm, [x2]
     //prfm pldl1strm, [x4]
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x5,w5
+    SIGN_EXTENSION x6,w6
 enc_w8_pix_avg_loop:
     //prfm pldl1strm, [x2, x3]
     //prfm pldl1strm, [x4, x5]
@@ -1574,10 +1693,15 @@
     sub x6, x6, #4
     cbnz x6, enc_w8_pix_avg_loop
 WELS_ASM_AARCH64_FUNC_END
-
+//void PixelAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+//                                     const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN PixelAvgWidthEq16_AArch64_neon
     //prfm pldl1strm, [x2]
     //prfm pldl1strm, [x4]
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x5,w5
+    SIGN_EXTENSION x6,w6
 w16_pix_avg_loop:
     //prfm pldl1strm, [x2, x3]
     //prfm pldl1strm, [x4, x5]
@@ -1616,10 +1740,15 @@
     sub x6, x6, #4
     cbnz x6, w16_pix_avg_loop
 WELS_ASM_AARCH64_FUNC_END
-
+//void PixelAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+//                                   const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN PixelAvgWidthEq8_AArch64_neon
     //prfm pldl1strm, [x2]
     //prfm pldl1strm, [x4]
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x5,w5
+    SIGN_EXTENSION x6,w6
 w8_pix_avg_loop:
     //prfm pldl1strm, [x2, x3]
     //prfm pldl1strm, [x4, x5]
@@ -1654,10 +1783,15 @@
     cbnz x6, w8_pix_avg_loop
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void PixelAvgWidthEq4_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+//                                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN PixelAvgWidthEq4_AArch64_neon
     //prfm pldl1strm, [x2]
     //prfm pldl1strm, [x4]
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x5,w5
+    SIGN_EXTENSION x6,w6
 w4_pix_avg_loop:
     //prfm pldl1strm, [x2, x3]
     //prfm pldl1strm, [x4, x5]
@@ -1674,8 +1808,12 @@
     sub x6, x6, #2
     cbnz x6, w4_pix_avg_loop
 WELS_ASM_AARCH64_FUNC_END
-
+//void McChromaWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                    int32_t* pWeights, int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x5,w5
     ld4r {v28.8b, v29.8b, v30.8b, v31.8b}, [x4] //load A/B/C/D
     ld1 {v16.16b}, [x0], x1  // src[x]
     ext v17.16b, v16.16b, v16.16b, #1  // src[x+1]
@@ -1729,8 +1867,12 @@
     sub x5, x5, #4
     cbnz x5, w8_mc_chroma_loop
 WELS_ASM_AARCH64_FUNC_END
-
+//void McChromaWidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                    int32_t* pWeights, int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x5,w5
     ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D
     ld1 {v0.8b}, [x0], x1  // src[x]
     ext v1.8b, v0.8b, v0.8b, #1  // src[x+1]
@@ -1759,8 +1901,12 @@
     cbnz x5, w4_mc_chroma_loop
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void McHorVer20Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                    int32_t iHeight);// width+1
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width17_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, #2
     sub x3, x3, #16
     mov x5, #16
@@ -1789,7 +1935,12 @@
     cbnz x4, w17_h_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
+//void McHorVer20Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                    int32_t iHeight);// width+1
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, #2
     sub x3, x3, #8
     mov x5, #8
@@ -1817,8 +1968,12 @@
     cbnz x4, w9_h_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void McHorVer20Width5_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                    int32_t iHeight);// width+1
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width5_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, #2
     sub x3, x3, #4
     mov x5, #4
@@ -1841,12 +1996,16 @@
     cbnz x4, w5_h_mc_luma_loop
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void McHorVer22Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                     int32_t iHeight);
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width17_AArch64_neon
     stp d8, d9, [sp,#-16]!
     stp d10, d11, [sp,#-16]!
     stp d12, d13, [sp,#-16]!
     stp d14, d15, [sp,#-16]!
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, #2
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
@@ -2044,8 +2203,12 @@
     ldp d8, d9, [sp], #16
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void McHorVer22Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                    int32_t iHeight);//width+1&&height+1
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width9_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, #2
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
@@ -2140,8 +2303,12 @@
     st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void McHorVer22Width5_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                    int32_t iHeight);//width+1&&height+1
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width5_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, #2
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
@@ -2231,8 +2398,12 @@
     st1 {v26.b}[4], [x2], x3 //write 5th Byte : 0 line
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void McHorVer02Height17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                      int32_t iHeight);// height+1
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02Height17_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
@@ -2320,8 +2491,12 @@
     FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1
     st1 {v20.16b}, [x2], x3 //write 16Byte : last line
 WELS_ASM_AARCH64_FUNC_END
-
+//void McHorVer02Height9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                     int32_t iHeight);// height+1
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02Height9_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
@@ -2375,8 +2550,12 @@
     st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void McHorVer02Height5_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+//                                     int32_t iHeight);// height+1
 WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02Height5_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x4,w4
     sub x0, x0, x1, lsl #1
     movi v0.8h, #20, lsl #0
     movi v1.8h, #5, lsl #0
--- a/codec/decoder/core/arm64/block_add_aarch64_neon.S
+++ b/codec/decoder/core/arm64/block_add_aarch64_neon.S
@@ -68,7 +68,7 @@
 
 //  uint8_t *pred, const int32_t stride, int16_t *rs
 WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon
-
+    SIGN_EXTENSION x1,w1
     ld4        {v0.4h, v1.4h, v2.4h, v3.4h}, [x2]      // cost 3 cycles!
     ROW_TRANSFORM_1_STEP        v0, v1, v2, v3, v16, v17, v18, v19, v4, v5
     TRANSFORM_4BYTES        v0, v1, v2, v3, v16, v17, v18, v19
@@ -113,6 +113,7 @@
 WELS_ASM_AARCH64_FUNC_BEGIN WelsBlockZero16x16_AArch64_neon
     eor v0.16b, v0.16b, v0.16b
     eor v1.16b, v1.16b, v1.16b
+    SIGN_EXTENSION x1,w1
     lsl x1, x1, 1
 .rept 16
     st1 {v0.16b, v1.16b}, [x0], x1
--- a/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
+++ b/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
@@ -35,6 +35,7 @@
 
 // for Luma 4x4
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredH_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, #1
 .rept 4
     ld1r    {v0.8b}, [x3], x2
@@ -43,6 +44,7 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredDc_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     sub     x4, x1, #1
     ldr     s0, [x3]
@@ -59,6 +61,7 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredDcTop_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     sub     v0.8b, v0.8b, v0.8b
     ldr     s0, [x3]
@@ -71,6 +74,7 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredDDL_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     ld1     {v0.8b}, [x3]
     dup     v1.8b, v0.b[7]
@@ -90,6 +94,7 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredDDLTop_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     ld1     {v0.8b}, [x3]
     dup     v1.8b, v0.b[3]
@@ -110,6 +115,7 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredVL_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     ld1     {v0.8b}, [x3]
     ext     v1.8b, v0.8b, v0.8b, #1
@@ -127,6 +133,7 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredVLTop_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     ld1     {v0.8b}, [x3]
     dup     v1.8b, v0.b[3]
@@ -146,6 +153,7 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredVR_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     ld1     {v0.s}[1], [x3]
     sub     x3, x3, #1
@@ -177,6 +185,7 @@
 
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredHU_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, #1
     mov     x4, #3
     mul     x4, x4, x2
@@ -203,6 +212,7 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredHD_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, #1
     sub     x3, x3, x2 // x2 points to top left
     ld1     {v0.s}[1], [x3], x2
@@ -228,6 +238,7 @@
 
 // for Chroma 8x8
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredV_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     ld1     {v0.8b}, [x3]
 .rept   8
@@ -236,6 +247,7 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredH_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, #1
 .rept 8
     ld1r    {v0.8b}, [x3], x2
@@ -245,6 +257,7 @@
 
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredDc_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     sub     x4, x1, #1
     ld1     {v0.8b}, [x3]
@@ -280,6 +293,7 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredDcTop_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     ld1     {v0.8b}, [x3]
     uaddlp  v0.4h, v0.8b
@@ -298,6 +312,7 @@
 intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     sub     x3, x3, #1
     mov     x4, x3
@@ -349,6 +364,7 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredDc_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     sub     x4, x1, #1
     ld1     {v0.16b}, [x3]
@@ -380,6 +396,7 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredDcTop_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     ld1     {v0.16b}, [x3]
     // reduce instruction
@@ -392,6 +409,7 @@
 WELS_ASM_AARCH64_FUNC_END
 
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredDcLeft_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, #1
     ld1     {v1.b}[0], [x3], x2
     ld1     {v1.b}[1], [x3], x2
@@ -422,8 +440,9 @@
 .align 4
 intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40
 intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8
-
+//void WelsI16x16LumaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon
+    SIGN_EXTENSION x2,w2
     sub     x3, x1, x2
     sub     x3, x3, #1
     mov     x4, x3
--- a/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S
+++ b/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S
@@ -179,9 +179,12 @@
     add     \arg7, \arg7, v4.4s
 .endm
 
+//int32_t WelsIntra8x8Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*,uint8_t*);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
     ldr     x11, [sp, #0]
-
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x5,w5
     LOAD_CHROMA_DATA x0, v0.8b, v0.b
 
     uaddlp  v1.8h, v0.16b
@@ -279,8 +282,11 @@
     str     w7, [x4]
 WELS_ASM_AARCH64_FUNC_END
 
+//int32_t WelsIntra16x16Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra16x16Combined3Sad_AArch64_neon
-
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x5,w5
     LOAD_LUMA_DATA
 
     uaddlv    h2, v0.16b
@@ -331,7 +337,13 @@
     str     w7, [x4]
 WELS_ASM_AARCH64_FUNC_END
 
+//int32_t WelsIntra4x4Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t,int32_t);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra4x4Combined3Satd_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x6,w6
+    SIGN_EXTENSION x7,w7
+
     sub     x9, x0, x1
     ld1     {v16.s}[0], [x9]      //top
     sub     x9, x0, #1
@@ -421,9 +433,13 @@
 
 WELS_ASM_AARCH64_FUNC_END
 
+//int32_t WelsIntra8x8Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*,uint8_t*);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Satd_AArch64_neon
     ldr     x11, [sp, #0]
 
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x5,w5
     LOAD_CHROMA_DATA x0, v0.8b, v0.b
 
     LOAD_CHROMA_DATA x7, v1.8b, v1.b
@@ -511,8 +527,11 @@
     str     w7, [x4]
 WELS_ASM_AARCH64_FUNC_END
 
-
+//int32_t WelsIntra16x16Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra16x16Combined3Satd_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
+    SIGN_EXTENSION x5,w5
     LOAD_LUMA_DATA
 
     uaddlv  h2, v0.16b
--- a/codec/encoder/core/arm64/memory_aarch64_neon.S
+++ b/codec/encoder/core/arm64/memory_aarch64_neon.S
@@ -33,9 +33,10 @@
 #ifdef HAVE_NEON_AARCH64
 #include "arm_arch64_common_macro.S"
 
-
+//void WelsSetMemZero_AArch64_neon (void* pDst, int32_t iSize);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsSetMemZero_AArch64_neon
     eor v0.16b, v0.16b, v0.16b
+    SIGN_EXTENSION x1,w1
     cmp x1, #32
     b.eq mem_zero_32_neon_start
     b.lt mem_zero_24_neon_start
--- a/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
+++ b/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
@@ -490,8 +490,10 @@
     st4       {v0.d, v1.d, v2.d, v3.d}[0], [x0]
 WELS_ASM_AARCH64_FUNC_END
 
-
+//void WelsDctFourT4_AArch64_neon (int16_t* pDct,  uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsDctFourT4_AArch64_neon
+    SIGN_EXTENSION x2,w2
+    SIGN_EXTENSION x4,w4
 .rept 2
     LOAD_8x4_DATA_FOR_DCT   v0, v1, v2, v3, v4, v5, v6, v7, x1, x3
     usubl    v0.8h, v0.8b, v4.8b
@@ -518,8 +520,10 @@
     st1     {v6.16b, v7.16b}, [x0], #32
 .endr
 WELS_ASM_AARCH64_FUNC_END
-
+//void WelsIDctT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctT4Rec_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
     ld1     {v16.s}[0], [x2], x3
     ld1     {v16.s}[1], [x2], x3
     ld1     {v16.s}[2], [x2], x3
@@ -552,8 +556,10 @@
     st1     {v1.s}[0],[x0],x1
     st1     {v1.s}[1],[x0],x1
 WELS_ASM_AARCH64_FUNC_END
-
+//void WelsIDctFourT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctFourT4Rec_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
 .rept 2
     ld1     {v16.d}[0], [x2], x3
     ld1     {v16.d}[1], [x2], x3
@@ -644,7 +650,11 @@
     st1       {v4.16b, v5.16b}, [x0]  //store
 WELS_ASM_AARCH64_FUNC_END
 
+//void WelsIDctRecI16x16Dc_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride,
+//                int16_t* pDctDc);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctRecI16x16Dc_AArch64_neon
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x3,w3
     ld1       {v16.16b,v17.16b}, [x4]
     srshr     v16.8h, v16.8h, #6
     srshr     v17.8h, v17.8h, #6
--- a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
@@ -32,8 +32,9 @@
 
 #ifdef HAVE_NEON_AARCH64
 #include "arm_arch64_common_macro.S"
-
+//int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
 WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8SingleBlock_AArch64_neon
+    SIGN_EXTENSION x1,w1
     ld1 {v0.d}[0], [x0], x1
     ld1 {v0.d}[1], [x0], x1
     ld1 {v1.d}[0], [x0], x1
@@ -50,7 +51,9 @@
     mov    x0, v0.d[0]
 WELS_ASM_AARCH64_FUNC_END
 
+//int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
 WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16SingleBlock_AArch64_neon
+    SIGN_EXTENSION x1,w1
     ld1 {v0.16b}, [x0], x1
     uaddlp v0.8h, v0.16b
 .rept 15
@@ -61,11 +64,17 @@
     mov    x0, v0.d[0]
 WELS_ASM_AARCH64_FUNC_END
 
+//void SumOf8x8BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
+//                               const int32_t kiRefStride,
+//                                uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
 WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8BlockOfFrame_AArch64_neon
 //(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
     //x5: pTimesOfFeatureValue
     //x4: pFeatureOfBlock
 
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x2,w2
+    SIGN_EXTENSION x3,w3
     mov x8, x0
     mov x6, x1
     add x8, x8, x6
@@ -147,6 +156,9 @@
     //x5: pTimesOfFeatureValue
     //x4: pFeatureOfBlock
 
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x2,w2
+    SIGN_EXTENSION x3,w3
     mov x8, x0
     mov x6, x1
     add x8, x8, x6
@@ -219,6 +231,7 @@
 
 WELS_ASM_AARCH64_FUNC_BEGIN InitializeHashforFeature_AArch64_neon
 // (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize, uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+    SIGN_EXTENSION x2,w2
     mov x9, #3
     bic x5, x2, x9
     mov x8, #0
@@ -280,7 +293,8 @@
     ldr q7, mv_x_inc_x4
     ldr q6, mv_y_inc_x4
     ldr q5, mx_x_offset_x4
-
+    SIGN_EXTENSION x1,w1
+    SIGN_EXTENSION x2,w2
     eor v4.16b, v4.16b, v4.16b
     eor v3.16b, v3.16b, v3.16b
     dup v16.2d, x3 // v8->v16
--- a/codec/processing/src/arm64/adaptive_quantization_aarch64_neon.S
+++ b/codec/processing/src/arm64/adaptive_quantization_aarch64_neon.S
@@ -33,6 +33,8 @@
 #ifdef HAVE_NEON_AARCH64
 #include "arm_arch64_common_macro.S"
 WELS_ASM_AARCH64_FUNC_BEGIN SampleVariance16x16_AArch64_neon
+    SIGN_EXTENSION x1, w1
+    SIGN_EXTENSION x3, w3
     ld1   {v1.16b}, [x0], x1 //save the ref data (16bytes)
     ld1   {v0.16b}, [x2], x3 //save the src data (16bytes)
     uabd  v2.16b, v0.16b, v1.16b
--- a/codec/processing/src/arm64/down_sample_aarch64_neon.S
+++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S
@@ -174,6 +174,9 @@
     //restore   the tailer for the unasigned size
     st1     {v16.16b}, [x0]
 WELS_ASM_AARCH64_FUNC_END
+//void DyadicBilinearQuarterDownsampler_AArch64_neon(uint8_t* pDst, const int32_t kiDstStride,
+//uint8_t* pSrc, const int32_t kiSrcStride,
+//const int32_t kiSrcWidth, const int32_t kiHeight);
 
 WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearQuarterDownsampler_AArch64_neon
     //Initialize the register
@@ -229,6 +232,9 @@
     st1     {v16.16b}, [x0]
 WELS_ASM_AARCH64_FUNC_END
 
+//void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride,
+//    const int32_t kiDstWidth, const int32_t kiDstHeight,
+//   uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
 WELS_ASM_AARCH64_FUNC_BEGIN GeneralBilinearAccurateDownsampler_AArch64_neon
     mov     w10, #32767
     and     w8, w6, w10
@@ -259,6 +265,13 @@
 
     eor     v26.16b, v26.16b, v26.16b
     eor     v27.16b, v27.16b, v27.16b
+    SIGN_EXTENSION x1, w1
+    SIGN_EXTENSION x2, w2
+    SIGN_EXTENSION x3, w3
+    SIGN_EXTENSION x5, w5
+    SIGN_EXTENSION x6, w6
+    SIGN_EXTENSION x7, w7
+
     sub     x1, x1, x2
     sub     x3, x3, #1
 
--- a/codec/processing/src/arm64/pixel_sad_aarch64_neon.S
+++ b/codec/processing/src/arm64/pixel_sad_aarch64_neon.S
@@ -32,8 +32,10 @@
 
 #ifdef HAVE_NEON_AARCH64
 #include "arm_arch64_common_macro.S"
-
+//int32_t WelsProcessingSampleSad8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
 WELS_ASM_AARCH64_FUNC_BEGIN WelsProcessingSampleSad8x8_AArch64_neon
+    SIGN_EXTENSION x1, w1
+    SIGN_EXTENSION x3, w3
     ld1     {v0.8b}, [x0], x1
     ld1     {v1.8b}, [x2], x3
     uabdl   v2.8h, v0.8b, v1.8b
--- a/codec/processing/src/arm64/vaa_calc_aarch64_neon.S
+++ b/codec/processing/src/arm64/vaa_calc_aarch64_neon.S
@@ -61,6 +61,8 @@
  */
 WELS_ASM_AARCH64_FUNC_BEGIN VAACalcSad_AArch64_neon
     eor     v31.16b, v31.16b, v31.16b
+
+    SIGN_EXTENSION x4, w4
     lsl     x9, x4, #4
     sub     x10, x9, #16    //x10 keep the 16*pic_stride-16
     sub     x9, x9, x2      //x9 keep the 16*pic_stride-pic_width
@@ -119,6 +121,7 @@
     ldr     x15, [sp, #0]
     eor     v28.16b, v28.16b, v28.16b
 
+    SIGN_EXTENSION x4, w4
     lsl     x9, x4, #4
     sub     x10, x9, #16    //x10 keep the 16*pic_stride-16
     sub     x9, x9, x2      //x9 keep the 16*pic_stride-pic_width
@@ -260,6 +263,7 @@
     ldr     x15, [sp, #24]  //p_mad8x8
     eor     v17.16b, v17.16b, v17.16b
 
+    SIGN_EXTENSION x4, w4
     lsl     x9, x4, #4
     sub     x10, x9, #16    //x10 keep the 16*pic_stride-16
     sub     x9, x9, x2      //x9 keep the 16*pic_stride-pic_width
@@ -400,6 +404,7 @@
     ldr     x13, [sp, #8]   //psqdiff16x16
     eor     v17.16b, v17.16b, v17.16b
 
+    SIGN_EXTENSION x4, w4
     lsl     x9, x4, #4
     sub     x10, x9, #16    //x10 keep the 16*pic_stride-16
     sub     x9, x9, x2      //x9 keep the 16*pic_stride-pic_width
@@ -500,6 +505,7 @@
     ldr     x12, [sp, #0]   //psqsum16x16
     eor     v17.16b, v17.16b, v17.16b
 
+    SIGN_EXTENSION x4, w4
     lsl     x9, x4, #4
     sub     x10, x9, #16    //x10 keep the 16*pic_stride-16
     sub     x9, x9, x2      //x9 keep the 16*pic_stride-pic_width