ref: 2531e26da13cd5ff9801cf32884b611d0f59a63d
parent: d070f9e7f2f6933d3fd5f139715bd5b9f9ec9d1f
parent: beaf7c1893a03712781225efa5eb9e310384ba52
author: ruil2 <ruil2@cisco.com>
date: Thu Jul 10 05:29:47 EDT 2014
Merge pull request #1133 from dongzha/SpeedupArm64Neon: speed up memory loading in arm64 MC
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -413,8 +413,8 @@
movi v0.8h, #20, lsl #0
movi v1.8h, #5, lsl #0
w8_h_mc_luma_loop:
- ld1 {v2.8b, v3.8b}, [x0], x1 //only use 13(8+5); v2=src[-2]
- trn1 v2.2d, v2.2d, v3.2d
+ ld1 {v2.16b}, [x0], x1 //only use 13(8+5); v2=src[-2]
+
//prfm pldl1strm, [x0]
ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
@@ -492,8 +492,8 @@
movi v0.8h, #20, lsl #0
movi v1.8h, #5, lsl #0
w8_xy_10_mc_luma_loop:
- ld1 {v2.8b, v3.8b}, [x0], x1 //only use 13(8+5); v2=src[-2]
- trn1 v2.2d, v2.2d, v3.2d
+ ld1 {v2.16b}, [x0], x1 //only use 13(8+5); v2=src[-2]
+
//prfm pldl1strm, [x0]
ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
@@ -572,8 +572,8 @@
movi v0.8h, #20, lsl #0
movi v1.8h, #5, lsl #0
w8_xy_30_mc_luma_loop:
- ld1 {v2.8b, v3.8b}, [x0], x1 //only use 13(8+5); v2=src[-2]
- trn1 v2.2d, v2.2d, v3.2d
+ ld1 {v2.16b}, [x0], x1 //only use 13(8+5); v2=src[-2]
+
//prfm pldl1strm, [x0]
ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
@@ -1779,20 +1779,20 @@
movi v1.8h, #5, lsl #0
ldr q22, filter_para
w17_h_mc_luma_loop:
- ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 22(17+5); v2=src[-2]
- trn1 v2.2d, v2.2d, v3.2d
+ ld1 {v2.16b, v3.16b}, [x0], x1 //only use 22(17+5); v2=src[-2]
+
//prfm pldl1strm, [x0]
- ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
- ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
- ext v7.16b, v2.16b, v4.16b, #3 //v7=src[1]
- ext v16.16b, v2.16b, v4.16b, #4 //v16=src[2]
- ext v17.16b, v2.16b, v4.16b, #5 //v17=src[3]
+ ext v5.16b, v2.16b, v3.16b, #1 //v5=src[-1]
+ ext v6.16b, v2.16b, v3.16b, #2 //v6=src[0]
+ ext v7.16b, v2.16b, v3.16b, #3 //v7=src[1]
+ ext v16.16b, v2.16b, v3.16b, #4 //v16=src[2]
+ ext v17.16b, v2.16b, v3.16b, #5 //v17=src[3]
FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1
st1 {v20.16b}, [x2], x5 //write 16Byte
- ext.8b v21, v4, v4, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
+ ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
st1 {v21.b}[0], [x2], x3 //write 16th Byte
@@ -1808,8 +1808,8 @@
movi v1.8h, #5, lsl #0
ldr q22, filter_para
w9_h_mc_luma_loop:
- ld1 {v2.8b, v3.8b}, [x0], x1 //only use 14(9+5); v2=src[-2]
- trn1 v2.2d, v2.2d, v3.2d
+ ld1 {v2.16b}, [x0], x1 //only use 14(9+5); v2=src[-2]
+ mov v3.d[0], v2.d[1]
//prfm pldl1strm, [x0]
ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]