ref: c0feee817dd20c82de47dc704d02c4b48c39786f
parent: 24d8b8a82c55c736cde326a8b2684883637684f0
parent: 1f8ef8f0a3649671639f500b68b8dba75939925d
author: huili2 <huili2@cisco.com>
date: Thu Jul 16 09:33:04 EDT 2015
Merge pull request #2032 from GuangweiWang/sub8x8asm add new assembly functions to support sub8x8 mode
--- a/codec/common/arm/mc_neon.S
+++ b/codec/common/arm/mc_neon.S
@@ -1635,6 +1635,36 @@
WELS_ASM_FUNC_END
+// McHorVer20Width5_neon: horizontal half-pel luma interpolation, 5 output bytes per row.
+// Register use: r0 = source pointer (advanced by r1 per row), r2 = destination pointer
+// (advanced by the original r3 per row), r4 = row counter loaded from the 5th (stack) argument.
+// q14/q15 hold the constants 20 and 5 handed to FILTER_6TAG_8BITS (macro body not visible
+// here; presumably the H.264 6-tap (1,-5,20,20,-5,1) kernel with rounding -- TODO confirm).
+WELS_ASM_FUNC_BEGIN McHorVer20Width5_neon
+ push {r4}
+ sub r3, #4 // the 4-byte store below already advances r2 by 4; r3 = remaining step to the next row
+ sub r0, #2 // r0 -> src[-2], the left-most filter tap
+ ldr r4, [sp, #4] // r4 = 5th argument (the +4 skips the r4 just pushed)
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w5_h_mc_luma_loop:
+ vld1.u8 {d0,d1}, [r0], r1 //only use 10(5+5); q0=src[-2]
+ pld [r0]
+
+ // Build the five shifted views of the row; together with d0 they form the six taps.
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
+
+ FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d16, q14, q15
+
+ sub r4, #1 // one row done
+ vst1.u32 {d16[0]}, [r2]! //write [0:3] Byte
+ vst1.u8 {d16[4]}, [r2], r3 //write 5th Byte
+
+ cmp r4, #0
+ bne w5_h_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
WELS_ASM_FUNC_BEGIN McHorVer02Height17_neon
push {r4}
ldr r4, [sp, #4]
@@ -1780,6 +1810,63 @@
WELS_ASM_FUNC_END
+// McHorVer02Height5_neon: vertical half-pel luma interpolation, 4 output bytes per row.
+// Register use: r0 = source pointer (stride r1), r2 = destination pointer (stride r3),
+// r4 = row counter loaded from the 5th (stack) argument.
+// The loop emits 4 rows per pass and exits when exactly one row is left, which is then
+// produced by the tail code; the counter therefore has to be of the form 4*k + 1.
+// d0..d5 always hold the six consecutive source rows feeding the next output row.
+WELS_ASM_FUNC_BEGIN McHorVer02Height5_neon
+ push {r4}
+ ldr r4, [sp, #4] // r4 = 5th argument (the +4 skips the r4 just pushed)
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
+
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
+
+w5_v_mc_luma_loop:
+
+ pld [r0]
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vld1.u8 {d0}, [r0], r1 //next source row replaces the oldest one (d0)
+ vst1.u32 {d16[0]}, [r2], r3 //write 1st 4Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15
+ vld1.u8 {d1}, [r0], r1 //next source row replaces the oldest one (d1)
+ vst1.u32 {d16[0]}, [r2], r3 //write 2nd 4Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vld1.u8 {d2}, [r0], r1 //next source row replaces the oldest one (d2)
+ vst1.u32 {d16[0]}, [r2], r3 //write 3rd 4Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15
+ vld1.u8 {d3}, [r0], r1 //next source row replaces the oldest one (d3)
+ vst1.u32 {d16[0]}, [r2], r3 //write 4th 4Byte
+
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
+
+ sub r4, #4
+ cmp r4, #1 // stop when only the final row remains
+ bne w5_v_mc_luma_loop
+
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vst1.u32 {d16[0]}, [r2], r3 //write last 4Byte
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
WELS_ASM_FUNC_BEGIN McHorVer22Width17_neon
push {r4}
vpush {q4-q7}
@@ -2014,6 +2101,105 @@
vst1.u8 d18, [r2]! //write 8Byte
UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+ vpop {q4}
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+// McHorVer22Width5_neon: centre (horizontal + vertical) half-pel luma interpolation,
+// 5 output bytes per row.
+// Register use: r0 = source pointer (stride r1), r2 = destination pointer, r3 = destination
+// stride (reduced by 4 because the 4-byte store post-increments r2), r4 = row counter
+// loaded from the 5th (stack) argument.
+// Each output row is built by filtering six source rows vertically into 16-bit values
+// (q9/q10) and then filtering those horizontally. The loop emits 4 rows per pass and
+// exits when exactly one row is left, so the counter has to be of the form 4*k + 1.
+// q4 (d8/d9) is saved and restored around the body.
+WELS_ASM_FUNC_BEGIN McHorVer22Width5_neon
+ push {r4}
+ vpush {q4}
+ ldr r4, [sp, #20] // 5th argument: skip saved r4 (4 bytes) + saved q4 (16 bytes)
+
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 10(5+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 10(5+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {q2}, [r0], r1 //use 10(5+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 10(5+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 10(5+5), =src[2]
+ sub r3, #4 // r3 = step from dst+4 to the start of the next row
+
+w5_hv_mc_luma_loop:
+
+ vld1.u8 {q8}, [r0], r1 //use 10(5+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18
+ vst1.u32 {d18[0]}, [r2]! //write 4Byte
+ vst1.u8 {d18[4]}, [r2], r3 //write 5th Byte
+
+ vld1.u8 {q0}, [r0], r1 //next source row replaces the oldest one (q0)
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18
+ vst1.u32 {d18[0]}, [r2]! //write 4Byte
+ vst1.u8 {d18[4]}, [r2], r3 //write 5th Byte
+
+ vld1.u8 {q1}, [r0], r1 //next source row replaces the oldest one (q1)
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18
+ vst1.u32 {d18[0]}, [r2]! //write 4Byte
+ vst1.u8 {d18[4]}, [r2], r3 //write 5th Byte
+
+ vld1.u8 {q2}, [r0], r1 //next source row replaces the oldest one (q2)
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18
+ vst1.u32 {d18[0]}, [r2]! //write 4Byte
+ vst1.u8 {d18[4]}, [r2], r3 //write 5th Byte
+
+ // Re-pack the five newest rows, currently in q4, q8, q0, q1, q2 (oldest first),
+ // back into q0..q4 for the next pass.
+ vswp q0, q4
+ vswp q2, q4
+ vmov q3, q1
+ vmov q1, q8
+
+ sub r4, #4
+ cmp r4, #1 // stop when only the final row remains
+ bne w5_hv_mc_luma_loop
+ //the last row
+ vld1.u8 {q8}, [r0], r1 //use 10(5+5), =src[3]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18
+ vst1.u32 {d18[0]}, [r2]! //write 4Byte
+ vst1.u8 {d18[4]}, [r2], r3 //write 5th Byte
vpop {q4}
pop {r4}
WELS_ASM_FUNC_END
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -1818,6 +1818,30 @@
WELS_ASM_AARCH64_FUNC_END
+// McHorVer20Width5_AArch64_neon: horizontal half-pel luma interpolation, 5 output bytes per row.
+// Register use: x0 = source pointer (stride x1), x2 = destination pointer (stride x3),
+// x4 = row counter. v0/v1 hold the constants 20 and 5 handed to FILTER_6TAG_8BITS1.
+WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width5_AArch64_neon
+ // The strides and the row count are declared int32_t. AAPCS64 leaves bits 63:32 of
+ // such arguments unspecified, so widen them before they are used as 64-bit
+ // post-index offsets and as the 64-bit loop counter.
+ sxtw x1, w1
+ sxtw x3, w3
+ sxtw x4, w4
+ sub x0, x0, #2 // x0 -> src[-2], the left-most filter tap
+ sub x3, x3, #4 // the 4-byte store below advances x2 by 4; x3 = remaining step to the next row
+ mov x5, #4 // post-index step for the 4-byte store
+ movi v0.8h, #20, lsl #0
+ movi v1.8h, #5, lsl #0
+w5_h_mc_luma_loop:
+ ld1 {v2.16b}, [x0], x1 //only use 10(5+5); v2=src[-2]
+
+ // NOTE(review): v4 is not initialised in this function; it only supplies the upper
+ // bytes of each ext result, and only result bytes 0..4 are stored below.
+ ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
+ ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
+ ext v7.16b, v2.16b, v4.16b, #3 //v7=src[1]
+ ext v16.16b, v2.16b, v4.16b, #4 //v16=src[2]
+ ext v17.16b, v2.16b, v4.16b, #5 //v17=src[3]
+
+ FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
+ st1 {v20.s}[0], [x2], x5 //write 4Byte
+ st1 {v20.b}[4], [x2], x3 //write 5th Byte
+
+ sub x4, x4, #1
+ cbnz x4, w5_h_mc_luma_loop
+WELS_ASM_AARCH64_FUNC_END
+
+
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width17_AArch64_neon
stp d8, d9, [sp,#-16]!
stp d10, d11, [sp,#-16]!
@@ -2116,6 +2140,98 @@
st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line
WELS_ASM_AARCH64_FUNC_END
+
+// McHorVer22Width5_AArch64_neon: centre (horizontal + vertical) half-pel luma
+// interpolation, 5 output bytes per row.
+// Register use: x0 = source pointer (stride x1), x2 = destination pointer (stride x3),
+// x4 = row counter. Each output row is built by filtering six source rows vertically
+// into 16-bit values (v20/v21) and then filtering those horizontally.
+// The loop emits 4 rows per pass and the tail emits one more, so the row count has to
+// be of the form 4*k + 1.
+WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width5_AArch64_neon
+ // The strides and the row count are declared int32_t. AAPCS64 leaves bits 63:32 of
+ // such arguments unspecified, so widen them before they are used as 64-bit
+ // offsets and as the 64-bit loop counter.
+ sxtw x1, w1
+ sxtw x3, w3
+ sxtw x4, w4
+ sub x0, x0, #2
+ sub x0, x0, x1, lsl #1 // x0 -> src[-2*stride - 2]
+ movi v0.8h, #20, lsl #0
+ movi v1.8h, #5, lsl #0
+ sub x3, x3, #4 // the 4-byte store advances x2 by 4; x3 = remaining step to the next row
+ mov x5, #4 // post-index step for the 4-byte store
+ // NOTE(review): v29 is not passed to any macro in this function; this load looks
+ // unused unless one of the macros reads v29 implicitly -- confirm before removing.
+ ldr q29, filter_para
+ sub x4, x4, #1 // the tail after the loop produces the final row
+
+ //prfm pldl1strm, [x0]
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
+
+w5_hv_mc_luma_loop:
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
+ // vertical filtered into v20/v21
+ FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+ FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ st1 {v26.s}[0], [x2], x5 //write 0:3Byte : 0 line
+ st1 {v26.b}[4], [x2], x3 //write 5th Byte : 0 line
+
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v2.16b}, [x0], x1 // v2=src[4*stride]
+ // vertical filtered into v20/v21
+ FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+ FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ st1 {v26.s}[0], [x2], x5 //write 0:3Byte : 1 line
+ st1 {v26.b}[4], [x2], x3 //write 5th Byte : 1 line
+
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v3.16b}, [x0], x1 // v3=src[5*stride]
+ // vertical filtered into v20/v21
+ FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+ FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ st1 {v26.s}[0], [x2], x5 //write 0:3Byte : 2 line
+ st1 {v26.b}[4], [x2], x3 //write 5th Byte : 2 line
+
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v4.16b}, [x0], x1 // v4=src[6*stride]
+ // vertical filtered into v20/v21
+ FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
+ FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ st1 {v26.s}[0], [x2], x5 //write 0:3Byte : 3 line
+ st1 {v26.b}[4], [x2], x3 //write 5th Byte : 3 line
+
+
+ // Re-pack the five newest rows, currently in v6, v7, v2, v3, v4 (oldest first),
+ // back into v2..v6 for the next pass; v30 is the scratch register.
+ mov v5.16b, v3.16b
+ mov v3.16b, v7.16b
+ mov v30.16b, v2.16b
+ mov v2.16b, v6.16b
+ mov v6.16b, v4.16b
+ mov v4.16b, v30.16b
+
+ sub x4, x4, #4
+ cbnz x4, w5_hv_mc_luma_loop
+
+ // Final row.
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
+ // vertical filtered into v20/v21
+ FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+ FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
+ FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
+ st1 {v26.s}[0], [x2], x5 //write 0:3Byte : last line
+ st1 {v26.b}[4], [x2], x3 //write 5th Byte : last line
+WELS_ASM_AARCH64_FUNC_END
+
+
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02Height17_AArch64_neon
sub x0, x0, x1, lsl #1
movi v0.8h, #20, lsl #0
@@ -2257,6 +2373,62 @@
ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line
+WELS_ASM_AARCH64_FUNC_END
+
+
+// McHorVer02Height5_AArch64_neon: vertical half-pel luma interpolation, 4 output bytes per row.
+// Register use: x0 = source pointer (stride x1), x2 = destination pointer (stride x3),
+// x4 = row counter. v2..v7 hold the six consecutive source rows feeding an output row.
+// The loop emits 4 rows per pass and the tail emits one more, so the row count has to
+// be of the form 4*k + 1.
+WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02Height5_AArch64_neon
+ // The strides and the row count are declared int32_t. AAPCS64 leaves bits 63:32 of
+ // such arguments unspecified, so widen them before they are used as 64-bit
+ // offsets and as the 64-bit loop counter.
+ sxtw x1, w1
+ sxtw x3, w3
+ sxtw x4, w4
+ sub x0, x0, x1, lsl #1 // x0 -> src[-2*stride]
+ movi v0.8h, #20, lsl #0
+ movi v1.8h, #5, lsl #0
+ sub x4, x4, #1 // the tail after the loop produces the final row
+
+ //prfm pldl1strm, [x0]
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v2.8b}, [x0], x1 // v2=src[-2*stride]
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v3.8b}, [x0], x1 // v3=src[-1*stride]
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v4.8b}, [x0], x1 // v4=src[0*stride]
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v5.8b}, [x0], x1 // v5=src[1*stride]
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v6.8b}, [x0], x1 // v6=src[2*stride]
+
+w5_v_mc_luma_loop:
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
+ FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+ st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line
+
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v2.8b}, [x0], x1 // v2=src[4*stride]
+ FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+ st1 {v20.s}[0], [x2], x3 //write 4Byte : 1 line
+
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v3.8b}, [x0], x1 // v3=src[5*stride]
+ FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+ st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line
+
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v4.8b}, [x0], x1 // v4=src[6*stride]
+ FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
+ st1 {v20.s}[0], [x2], x3 //write 4Byte : 3 line
+
+ // Re-pack the five newest rows, currently in v6, v7, v2, v3, v4 (oldest first),
+ // back into v2..v6. v7 doubles as scratch; it is reloaded at the top of the loop
+ // and in the tail.
+ mov v5.16b, v3.16b
+ mov v3.16b, v7.16b
+ mov v7.16b, v2.16b
+ mov v2.16b, v6.16b
+ mov v6.16b, v4.16b
+ mov v4.16b, v7.16b
+ sub x4, x4, #4
+ cbnz x4, w5_v_mc_luma_loop
+
+ // Final row.
+ //prfm pldl1strm, [x0, x1]
+ ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
+ FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+ st1 {v20.s}[0], [x2], x3 //write 4Byte : last line
+
WELS_ASM_AARCH64_FUNC_END
#endif
--- a/codec/common/inc/mc.h
+++ b/codec/common/inc/mc.h
@@ -140,16 +140,22 @@
int32_t iHeight);// width+1
void McHorVer20Width9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iHeight);// width+1
+// Sub-8x8 variant of the horizontal half-pel filter: the NEON routine stores 5 bytes per row.
+void McHorVer20Width5_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);// width+1
void McHorVer02Height17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iHeight);// height+1
void McHorVer02Height9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iHeight);// height+1
+// Sub-8x8 variant of the vertical half-pel filter: the NEON routine stores 4 bytes per row and
+// handles row counts of the form 4*k + 1.
+void McHorVer02Height5_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);// height+1
void McHorVer22Width17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iHeight);//width+1&&height+1
void McHorVer22Width9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iHeight);//width+1&&height+1
+// Sub-8x8 variant of the centre half-pel filter: the NEON routine stores 5 bytes per row and
+// handles row counts of the form 4*k + 1.
+void McHorVer22Width5_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);//width+1&&height+1
#endif
#if defined(HAVE_NEON_AARCH64)
@@ -222,14 +228,20 @@
int32_t iHeight);// width+1
void McHorVer20Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iHeight);// width+1
+// Sub-8x8 variant of the horizontal half-pel filter: the AArch64 routine stores 5 bytes per row.
+void McHorVer20Width5_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);// width+1
void McHorVer02Height17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iHeight);// height+1
void McHorVer02Height9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iHeight);// height+1
+// Sub-8x8 variant of the vertical half-pel filter: the AArch64 routine stores 4 bytes per row
+// and handles row counts of the form 4*k + 1.
+void McHorVer02Height5_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);// height+1
void McHorVer22Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iHeight);//width+1&&height+1
void McHorVer22Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iHeight);//width+1&&height+1
+// Sub-8x8 variant of the centre half-pel filter: the AArch64 routine stores 5 bytes per row
+// and handles row counts of the form 4*k + 1.
+void McHorVer22Width5_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);//width+1&&height+1
#endif
#if defined(X86_ASM)
@@ -275,14 +287,24 @@
void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth,
int32_t iHeight);
+// Horizontal half-pel filter for sub-8x8 blocks: stores 5 bytes per row; iWidth is not read by the routine.
+void McHorVer20Width5_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight);
void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth,
int32_t iHeight);
+// Vertical half-pel filter for sub-8x8 blocks: works on 4-pixel-wide columns (iWidth / 4 of them).
+void McHorVer02Height5_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight);
void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride,
int32_t iWidth,
int32_t iHeight);
+// First (horizontal) pass of the centre half-pel filter for width 5: writes unrounded 16-bit
+// filter sums to pTap; the routine itself steps pSrc back by two rows before filtering.
+void McHorVer22Width5HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride,
+ int32_t iWidth, int32_t iHeight);
+// Second (vertical) pass over the 16-bit tap buffer, 4 pixels wide. Judging by the names, the
+// two variants differ in the alignment they expect from pTap -- TODO confirm against the asm.
+void McHorVer22Width4VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight);
+void McHorVer22Width4VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight);
//***************************************************************************//
// SSSE3 definition //
--- a/codec/common/src/mc.cpp
+++ b/codec/common/src/mc.cpp
@@ -399,15 +399,41 @@
McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
}
-void McHorVer22Width9Or17Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- int32_t iWidth, int32_t iHeight) {
- ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
- int32_t tmp1 = 2 * (iWidth - 8);
- McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
- McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap, 48, pDst, iDstStride, iWidth - 1, iHeight);
- McHorVer22Width8VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1, 48, pDst + iWidth - 8, iDstStride, 8, iHeight);
+
+// Horizontal half-pel interpolation dispatcher (SSE2): widths 9 and 17 share one kernel,
+// every other width is handled by the width-5 kernel.
+void McHorVer20Width5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 9:
+  case 17:
+    McHorVer20Width9Or17_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+    break;
+  default: // iWidth == 5
+    McHorVer20Width5_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+    break;
+  }
}
+// Vertical half-pel interpolation dispatcher (SSE2): widths 8 and 16 share one kernel,
+// every other width is handled by the 4-wide kernel.
+void McHorVer02Height5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 8:
+  case 16:
+    McHorVer02Height9Or17_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+    break;
+  default: // iWidth == 4
+    McHorVer02Height5_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+    break;
+  }
+}
+
+// Centre half-pel interpolation dispatcher (SSE2). Both paths run a horizontal pass into
+// the 16-bit tap buffer (iHeight + 5 rows, 48-byte row pitch) and then two vertical passes:
+// one from the start of the buffer and one for the right-most columns.
+void McHorVer22Width5Or9Or17Height5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
+  uint8_t* const pTapBuf = (uint8_t*)pTap;
+  const int32_t kiTapRows = iHeight + 5;
+  if (iWidth != 17 && iWidth != 9) { // iWidth == 5
+    // byte offset of the right-most 4 columns (2 bytes per tap)
+    const int32_t kiTailOffset = 2 * (iWidth - 4);
+    McHorVer22Width5HorFirst_sse2 (pSrc - 2, iSrcStride, pTapBuf, 48, iWidth, kiTapRows);
+    McHorVer22Width4VerLastAlign_sse2 (pTapBuf, 48, pDst, iDstStride, iWidth - 1, iHeight);
+    McHorVer22Width4VerLastUnAlign_sse2 (pTapBuf + kiTailOffset, 48, pDst + iWidth - 4, iDstStride, 4, iHeight);
+  } else {
+    // byte offset of the right-most 8 columns (2 bytes per tap)
+    const int32_t kiTailOffset = 2 * (iWidth - 8);
+    McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, pTapBuf, 48, iWidth, kiTapRows);
+    McHorVer22Width8VerLastAlign_sse2 (pTapBuf, 48, pDst, iDstStride, iWidth - 1, iHeight);
+    McHorVer22Width8VerLastUnAlign_sse2 (pTapBuf + kiTailOffset, 48, pDst + iWidth - 8, iDstStride, 8, iHeight);
+  }
+}
+
static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth,
int32_t iHeight) {
@@ -716,26 +742,32 @@
// NEON implementation //
//***************************************************************************//
#if defined(HAVE_NEON)
-void McHorVer20Width9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+// Horizontal half-pel dispatcher (NEON): iWidth 17 and 9 select their dedicated kernels;
+// any other value falls through to the width-5 kernel. iWidth itself is not forwarded.
+void McHorVer20Width5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight) {
if (iWidth == 17)
McHorVer20Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-else //if (iWidth == 9)
+else if (iWidth == 9)
McHorVer20Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+else //if (iWidth == 5)
+ McHorVer20Width5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
-void McHorVer02Height9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+// Vertical half-pel dispatcher (NEON): iWidth 16 and 8 select their dedicated kernels;
+// any other value falls through to the 4-wide kernel. iWidth itself is not forwarded.
+void McHorVer02Height5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight) {
if (iWidth == 16)
McHorVer02Height17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-else //if (iWidth == 8)
+else if (iWidth == 8)
McHorVer02Height9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+else //if (iWidth == 4)
+ McHorVer02Height5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
-void McHorVer22Width9Or17Height9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+// Centre half-pel dispatcher (NEON): iWidth 17 and 9 select their dedicated kernels;
+// any other value falls through to the width-5 kernel. iWidth itself is not forwarded.
+void McHorVer22Width5Or9Or17Height5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight) {
if (iWidth == 17)
McHorVer22Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-else //if (iWidth == 9)
+else if (iWidth == 9)
McHorVer22Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+else //if (iWidth == 5)
+ McHorVer22Width5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McCopy_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight) {
@@ -998,27 +1030,33 @@
}
#endif
#if defined(HAVE_NEON_AARCH64)
-void McHorVer20Width9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+// Horizontal half-pel dispatcher (AArch64 NEON): iWidth 17 and 9 select their dedicated
+// kernels; any other value falls through to the width-5 kernel. iWidth is not forwarded.
+void McHorVer20Width5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight) {
if (iWidth == 17)
McHorVer20Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-else //if (iWidth == 9)
+else if (iWidth == 9)
McHorVer20Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+else //if (iWidth == 5)
+ McHorVer20Width5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
-void McHorVer02Height9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+// Vertical half-pel dispatcher (AArch64 NEON): iWidth 16 and 8 select their dedicated
+// kernels; any other value falls through to the 4-wide kernel. iWidth is not forwarded.
+void McHorVer02Height5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight) {
if (iWidth == 16)
McHorVer02Height17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-else //if (iWidth == 8)
+else if (iWidth == 8)
McHorVer02Height9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+else //if (iWidth == 4)
+ McHorVer02Height5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
-void McHorVer22Width9Or17Height9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+// Centre half-pel dispatcher (AArch64 NEON): iWidth 17 and 9 select their dedicated
+// kernels; any other value falls through to the width-5 kernel. iWidth is not forwarded.
+void McHorVer22Width5Or9Or17Height5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
int32_t iDstStride,
int32_t iWidth, int32_t iHeight) {
if (iWidth == 17)
McHorVer22Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-else //if (iWidth == 9)
+else if (iWidth == 9)
McHorVer22Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+else //if (iWidth == 5)
+ McHorVer22Width5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McCopy_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight) {
@@ -1291,11 +1329,9 @@
#if defined (X86_ASM)
if (uiCpuFlag & WELS_CPU_SSE2) {
- pMcFuncs->pfLumaHalfpelHor = McHorVer20Width9Or17_sse2;
-#if 1 //could not work well for sub8x8: should disable it for now, or bugfix for it!
- pMcFuncs->pfLumaHalfpelVer = McHorVer02Height9Or17_sse2;
- pMcFuncs->pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_sse2;
-#endif
+ pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_sse2;
+ pMcFuncs->pfLumaHalfpelVer = McHorVer02Height5Or9Or17_sse2;
+ pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17Height5Or9Or17_sse2;
pMcFuncs->pfSampleAveraging = PixelAvg_sse2;
pMcFuncs->pMcChromaFunc = McChroma_sse2;
pMcFuncs->pMcLumaFunc = McLuma_sse2;
@@ -1311,9 +1347,9 @@
pMcFuncs->pMcLumaFunc = McLuma_neon;
pMcFuncs->pMcChromaFunc = McChroma_neon;
pMcFuncs->pfSampleAveraging = PixelAvg_neon;
- pMcFuncs->pfLumaHalfpelHor = McHorVer20Width9Or17_neon;//iWidth+1:8/16
- pMcFuncs->pfLumaHalfpelVer = McHorVer02Height9Or17_neon;//heigh+1:8/16
- pMcFuncs->pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_neon;//iWidth+1/heigh+1
+ pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_neon;//iWidth+1:4/8/16
+ pMcFuncs->pfLumaHalfpelVer = McHorVer02Height5Or9Or17_neon;//heigh+1:4/8/16
+ pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17Height5Or9Or17_neon;//iWidth+1/heigh+1
}
#endif
#if defined(HAVE_NEON_AARCH64)
@@ -1321,9 +1357,9 @@
pMcFuncs->pMcLumaFunc = McLuma_AArch64_neon;
pMcFuncs->pMcChromaFunc = McChroma_AArch64_neon;
pMcFuncs->pfSampleAveraging = PixelAvg_AArch64_neon;
- pMcFuncs->pfLumaHalfpelHor = McHorVer20Width9Or17_AArch64_neon;//iWidth+1:8/16
- pMcFuncs->pfLumaHalfpelVer = McHorVer02Height9Or17_AArch64_neon;//heigh+1:8/16
- pMcFuncs->pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_AArch64_neon;//iWidth+1/heigh+1
+ pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_AArch64_neon;//iWidth+1:4/8/16
+ pMcFuncs->pfLumaHalfpelVer = McHorVer02Height5Or9Or17_AArch64_neon;//heigh+1:4/8/16
+ pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17Height5Or9Or17_AArch64_neon;//iWidth+1/heigh+1
}
#endif
}
--- a/codec/common/x86/mc_luma.asm
+++ b/codec/common/x86/mc_luma.asm
@@ -150,6 +150,26 @@
movq %9, %1
%endmacro
+
+; FILTER_HV_W4 a, b, c, d, e, f, tmp0, tmp1, dst
+;   %1..%6 : six registers of 16-bit words (taps a..f)
+;   %7, %8 : scratch registers
+;   %9     : 4-byte memory destination
+; Computes ((a + f) + 20*(c + d) - 5*(b + e) + [h264_w0x10_1]) >> 5 per word, packs the
+; words to unsigned bytes with saturation and stores the low 4 bytes to %9.
+; Clobbers %1 and %7; %8 is zero on exit. %2..%6 are preserved.
+; (h264_w0x10_1 is defined elsewhere; presumably the rounding constant 16 in every word.)
+%macro FILTER_HV_W4 9
+paddw %1, %6 ; a + f
+movdqa %8, %3
+movdqa %7, %2
+paddw %1, [h264_w0x10_1]
+paddw %8, %4 ; c + d
+paddw %7, %5 ; b + e
+psllw %8, 2
+psubw %8, %7 ; 4*(c + d) - (b + e)
+paddw %1, %8
+psllw %8, 2 ; 16*(c + d) - 4*(b + e)
+paddw %1, %8 ; a + f + 20*(c + d) - 5*(b + e) + rounding
+psraw %1, 5
+WELS_Zero %8
+packuswb %1, %8
+movd %9, %1
+%endmacro
+
+
;*******************************************************************************
; Code
;*******************************************************************************
@@ -574,6 +594,140 @@
;***********************************************************************
+; void McHorVer02Height5_sse2( const uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight )
+;***********************************************************************
+; Register roles after LOAD_6_PARA: r0 = pSrc, r1 = iSrcStride, r2 = pDst, r3 = iDstStride,
+; r4 = iWidth (turned into a count of 4-pixel columns), r5 = iHeight (rows left in the column).
+; Each column is filtered top to bottom with FILTER_HV_W4, keeping a rotating window of six
+; source rows in xmm0..xmm7. Every FILTER_HV_W4 call leaves its 8th operand zeroed, and that
+; register is then passed as the second operand of the next SSE_LOAD_8P (macro not visible
+; here; presumably the zero register used to widen bytes to words).
+; NOTE(review): the restart code at .x_loop_dec always reloads the ORIGINAL pSrc/pDst and
+; adds 4, so a third column would repeat the second one; only iWidth 4 or 8 is handled
+; correctly -- confirm that callers never pass anything larger.
+WELS_EXTERN McHorVer02Height5_sse2
+%assign push_num 0
+LOAD_6_PARA
+PUSH_XMM 8
+SIGN_EXTENSION r1, r1d
+SIGN_EXTENSION r3, r3d
+SIGN_EXTENSION r4, r4d
+SIGN_EXTENSION r5, r5d
+
+; 64-bit builds keep the original pSrc / pDst / iHeight in callee-saved registers so they
+; can be restored per column; 32-bit builds re-read them from the stack arguments instead.
+%ifndef X86_32
+push r12
+push r13
+push r14
+mov r12, r0
+mov r13, r2
+mov r14, r5
+%endif
+
+shr r4, 2 ; number of 4-pixel-wide columns
+sub r0, r1
+sub r0, r1 ; r0 -> row -2
+
+.xloop:
+WELS_Zero xmm7
+; load rows -2..3 into xmm0..xmm5
+SSE_LOAD_8P xmm0, xmm7, [r0]
+SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+lea r0, [r0+2*r1]
+SSE_LOAD_8P xmm2, xmm7, [r0]
+SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+lea r0, [r0+2*r1]
+SSE_LOAD_8P xmm4, xmm7, [r0]
+SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+
+; output row 0
+FILTER_HV_W4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+dec r5
+lea r0, [r0+2*r1]
+SSE_LOAD_8P xmm6, xmm7, [r0] ; row 4
+; slide the window down by one row so that xmm0..xmm5 = rows -1..4
+movdqa xmm0,xmm1
+movdqa xmm1,xmm2
+movdqa xmm2,xmm3
+movdqa xmm3,xmm4
+movdqa xmm4,xmm5
+movdqa xmm5,xmm6
+add r2, r3
+sub r0, r1 ; r0 -> row 3, so the unrolled steps below can address rows as [r0+2*r1] / [r0+r1]
+
+; Unrolled by eight: each step filters the current six-row window, then loads the next
+; source row into the register that just left the window. After eight steps the window
+; is back in xmm0..xmm5 and control returns to .start.
+.start:
+FILTER_HV_W4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+dec r5
+jz near .x_loop_dec
+
+lea r0, [r0+2*r1]
+SSE_LOAD_8P xmm6, xmm7, [r0]
+FILTER_HV_W4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+dec r5
+jz near .x_loop_dec
+
+lea r2, [r2+2*r3]
+SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+FILTER_HV_W4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+dec r5
+jz near .x_loop_dec
+
+lea r0, [r0+2*r1]
+SSE_LOAD_8P xmm0, xmm1, [r0]
+FILTER_HV_W4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+dec r5
+jz near .x_loop_dec
+
+lea r2, [r2+2*r3]
+SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+FILTER_HV_W4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+dec r5
+jz near .x_loop_dec
+
+lea r0, [r0+2*r1]
+SSE_LOAD_8P xmm2, xmm3, [r0]
+FILTER_HV_W4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+dec r5
+jz near .x_loop_dec
+
+lea r2, [r2+2*r3]
+SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+FILTER_HV_W4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+dec r5
+jz near .x_loop_dec
+
+lea r0, [r0+2*r1]
+SSE_LOAD_8P xmm4, xmm5, [r0]
+FILTER_HV_W4 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+dec r5
+jz near .x_loop_dec
+
+lea r2, [r2+2*r3]
+SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+jmp near .start
+
+; Column finished: leave if it was the last one, otherwise restart 4 pixels to the right.
+.x_loop_dec:
+dec r4
+jz near .xx_exit
+%ifdef X86_32
+mov r0, arg1
+mov r2, arg3
+mov r5, arg6
+%else
+mov r0, r12
+mov r2, r13
+mov r5, r14
+%endif
+sub r0, r1
+sub r0, r1
+add r0, 4
+add r2, 4
+jmp near .xloop
+
+.xx_exit:
+%ifndef X86_32
+pop r14
+pop r13
+pop r12
+%endif
+POP_XMM
+LOAD_6_PARA_POP
+ret
+
+
+;***********************************************************************
; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
@@ -733,7 +887,81 @@
ret
+;***********************************************************************
+; void McHorVer20Width5_sse2( const uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight
+; );
+;***********************************************************************
+; Register roles after LOAD_6_PARA: r0 = pSrc, r1 = iSrcStride, r2 = pDst, r3 = iDstStride,
+; r4 = iWidth (not read), r5 = iHeight (row counter).
+; Per row, one filter pass over eight 16-bit lanes yields output columns 0..7:
+;   out[x] = clip8 ((a + f + 20*(c + d) - 5*(b + e) + [h264_w0x10_1]) >> 5)
+; with a..f = src[x-2] .. src[x+3]. Only columns 0..4 are stored, as two overlapping
+; 4-byte writes (columns 0..3 at pDst, columns 1..4 at pDst+1). Column 4 is taken from
+; the same pass, so no second filter pass and no load from [r0+6] is needed.
+; (h264_w0x10_1 is defined elsewhere; presumably the rounding constant 16 in every word.)
+WELS_EXTERN McHorVer20Width5_sse2
+%assign push_num 0
+LOAD_6_PARA
+PUSH_XMM 8
+SIGN_EXTENSION r1, r1d
+SIGN_EXTENSION r3, r3d
+SIGN_EXTENSION r4, r4d
+SIGN_EXTENSION r5, r5d
+sub r0, 2 ; r0 -> src[-2], tap a of output column 0
+.yloop_width_5:
+pxor xmm7, xmm7 ; zero for byte->word widening (xmm7 is reused as scratch below)
+movq xmm0, [r0]
+punpcklbw xmm0, xmm7 ; a
+movq xmm1, [r0+5]
+punpcklbw xmm1, xmm7 ; f
+movq xmm2, [r0+1]
+punpcklbw xmm2, xmm7 ; b
+movq xmm3, [r0+4]
+punpcklbw xmm3, xmm7 ; e
+movq xmm4, [r0+2]
+punpcklbw xmm4, xmm7 ; c
+movq xmm5, [r0+3]
+punpcklbw xmm5, xmm7 ; d
+
+movdqa xmm7, xmm2
+paddw xmm7, xmm3 ; b + e
+movdqa xmm6, xmm4
+paddw xmm6, xmm5 ; c + d
+psllw xmm6, 2
+psubw xmm6, xmm7 ; 4*(c + d) - (b + e)
+paddw xmm0, xmm1 ; a + f
+paddw xmm0, xmm6
+psllw xmm6, 2 ; 16*(c + d) - 4*(b + e)
+paddw xmm0, xmm6 ; a + f + 20*(c + d) - 5*(b + e)
+paddw xmm0, [h264_w0x10_1]
+psraw xmm0, 5
+packuswb xmm0, xmm0 ; bytes 0..7 = output columns 0..7
+movd [r2], xmm0 ; columns 0..3
+psrldq xmm0, 1 ; drop column 0 so that the low dword holds columns 1..4
+movd [r2+1], xmm0 ; columns 1..4 (adds the 5th byte)
+
+add r0, r1
+add r2, r3
+dec r5
+jnz .yloop_width_5
+POP_XMM
+LOAD_6_PARA_POP
+ret
+
+
;***********************************************************************
;void McHorVer22HorFirst_sse2
; (const uint8_t *pSrc,
@@ -1162,3 +1390,359 @@
POP_XMM
LOAD_6_PARA_POP
ret
+
+
+;***********************************************************************
+;void McHorVer22Width5HorFirst_sse2  -- horizontal 6-tap (1,-5,20,20,-5,1) pass that stores unrounded 16-bit sums
+; (const uint8_t *pSrc,  (r0) reading starts two rows above this address; no column adjustment is made here
+; int32_t iSrcStride,  (r1) byte step between source rows
+; uint8_t * pTap,  (r2) output buffer; 9 words (18 bytes) are stored per row
+; int32_t iTapStride,  (r3) byte step between rows of pTap
+; int32_t iWidth,int32_t iHeight);  (r4) iWidth is sign-extended but unused; (r5) iHeight = rows to process, must be >= 1
+;***********************************************************************
+WELS_EXTERN McHorVer22Width5HorFirst_sse2
+%assign push_num 0
+LOAD_6_PARA
+PUSH_XMM 8
+SIGN_EXTENSION r1, r1d
+SIGN_EXTENSION r3, r3d
+SIGN_EXTENSION r4, r4d
+SIGN_EXTENSION r5, r5d
+pxor xmm7, xmm7  ; xmm7 = 0, used to zero-extend bytes to words
+sub r0, r1  ; back up two rows in total; original note: "need more 5 lines" -- presumably the caller passes iHeight + 5, confirm
+sub r0, r1
+; NOTE(review): unlike McHorVer20Width5_sse2 there is no "sub r0, 2"; presumably the caller passes pSrc already offset by -2 -- confirm
+.yloop_width_5:  ; one iteration per row
+movq xmm0, [r0]  ; a: 8 bytes starting at offset 0
+punpcklbw xmm0, xmm7  ; widen to 8 unsigned words
+movq xmm1, [r0+5]  ; f: offset 5
+punpcklbw xmm1, xmm7
+movq xmm2, [r0+1]  ; b: offset 1
+punpcklbw xmm2, xmm7
+movq xmm3, [r0+4]  ; e: offset 4
+punpcklbw xmm3, xmm7
+movq xmm4, [r0+2]  ; c: offset 2
+punpcklbw xmm4, xmm7
+movq xmm5, [r0+3]  ; d: offset 3
+punpcklbw xmm5, xmm7
+; first pass: word lane j holds the filter window that starts at offset j
+movdqa xmm7, xmm2  ; xmm7 is reused as scratch here and re-zeroed below
+paddw xmm7, xmm3  ; xmm7 = b + e
+movdqa xmm6, xmm4
+paddw xmm6, xmm5  ; xmm6 = c + d
+psllw xmm6, 2  ; 4*(c+d)
+psubw xmm6, xmm7  ; t = 4*(c+d) - (b+e)
+paddw xmm0, xmm1  ; a + f
+paddw xmm0, xmm6  ; a + f + t
+psllw xmm6, 2  ; 4*t
+paddw xmm0, xmm6  ; a + f + 20*(c+d) - 5*(b+e); no rounding, shift or pack in this pass
+movd [r2], xmm0  ; store the low 4 bytes: word sums 0 and 1 (sum 1 is overwritten by the movq below)
+; second pass: same filter with every tap shifted right by one byte
+pxor xmm7, xmm7  ; restore the zero register
+movq xmm0, [r0+6]  ; new last tap f': offset 6
+punpcklbw xmm0, xmm7
+; tap roles are now a'=xmm2, b'=xmm4, c'=xmm5, d'=xmm3, e'=xmm1, f'=xmm0
+paddw xmm4, xmm1  ; b' + e'
+paddw xmm5, xmm3  ; c' + d'
+psllw xmm5, 2  ; 4*(c'+d')
+psubw xmm5, xmm4  ; t' = 4*(c'+d') - (b'+e')
+paddw xmm2, xmm0  ; a' + f'
+paddw xmm2, xmm5  ; a' + f' + t'
+psllw xmm5, 2  ; 4*t'
+paddw xmm2, xmm5  ; a' + f' + 5*t'
+movq [r2+2], xmm2  ; low 4 words -> word positions 1..4 of this pTap row
+movhps [r2+2+8], xmm2  ; high 4 words -> word positions 5..8, so bytes 0..17 of the row are written in total
+; advance to the next row
+add r0, r1  ; next source row
+add r2, r3  ; next pTap row (r3 is a byte step)
+dec r5  ; one row done
+jnz .yloop_width_5
+POP_XMM
+LOAD_6_PARA_POP
+ret
+
+
+%macro FILTER_VER_4 9  ; %1..%6 = six rows of 16-bit sums (a..f), %7/%8 = scratch xmm, %9 = 4-byte destination
+paddw %1, %6  ; %1 = a + f
+movdqa %7, %2  ; %7 = b
+movdqa %8, %3  ; %8 = c
+; Clobbers %1, %7 and %8; %2..%6 are only read, which the callers' register rotation relies on.
+; The vertical filter is evaluated with staged shifts instead of one multiply-accumulate:
+paddw %7, %5  ; %7 = b + e
+paddw %8, %4  ; %8 = c + d
+; out = ((c+d) + (((((a+f) - (b+e)) >> 2) + (c+d) - (b+e)) >> 2) + K) >> 6, K = [h264_mc_hc_32]
+psubw %1, %7  ; (a+f) - (b+e)
+psraw %1, 2  ; first arithmetic shift by 2
+paddw %1, %8  ; + (c+d)
+psubw %1, %7  ; - (b+e)
+psraw %1, 2  ; second arithmetic shift by 2
+paddw %8, %1  ; (c+d) + shifted term
+paddw %8, [h264_mc_hc_32]  ; constant defined outside this chunk; presumably words of 32 (rounding for >>6) -- confirm
+psraw %8, 6
+packuswb %8, %8  ; saturate words to unsigned bytes
+movd %9, %8  ; store only the low 4 bytes (4 output pixels)
+%endmacro
+
+
+;***********************************************************************
+;void McHorVer22Width4VerLastAlign_sse2(  -- vertical FILTER_VER_4 pass over 16-bit tap rows, 4 output bytes per row
+; const uint8_t *pTap,  (r0) rows are read with movdqa, so pTap and iTapStride must keep 16-byte alignment
+; int32_t iTapStride,  (r1) byte step between tap rows
+; uint8_t * pDst,  (r2) destination; 4 bytes are stored per row and column group
+; int32_t iDstStride,  (r3) byte step between destination rows
+; int32_t iWidth,  (r4) iWidth >> 2 column groups are processed; must be >= 4 or the group counter wraps
+; int32_t iHeight);  (r5) output rows; the first decrement is not tested, so it must be >= 2
+;***********************************************************************
+
+WELS_EXTERN McHorVer22Width4VerLastAlign_sse2
+%assign push_num 0
+LOAD_6_PARA
+PUSH_XMM 8
+SIGN_EXTENSION r1, r1d
+SIGN_EXTENSION r3, r3d
+SIGN_EXTENSION r4, r4d
+SIGN_EXTENSION r5, r5d
+%ifndef X86_32
+push r12  ; r12-r14 are saved here and restored at .exit
+push r13
+push r14
+mov r12, r0  ; keep the original pTap for restarting at the next column group
+mov r13, r2  ; keep the original pDst
+mov r14, r5  ; keep the original iHeight
+%endif
+; on X86_32 the same three values are re-read from arg1/arg3/arg6 instead
+shr r4, 2  ; r4 = number of 4-pixel column groups
+
+.width_loop:  ; one pass per column group; prime xmm0..xmm5 with tap rows 0..5
+movdqa xmm0, [r0]  ; row 0 (8 words are loaded, FILTER_VER_4 stores only 4 results)
+movdqa xmm1, [r0+r1]  ; row 1
+lea r0, [r0+2*r1]
+movdqa xmm2, [r0]  ; row 2
+movdqa xmm3, [r0+r1]  ; row 3
+lea r0, [r0+2*r1]
+movdqa xmm4, [r0]  ; row 4
+movdqa xmm5, [r0+r1]  ; row 5
+; output row 0 from rows 0..5
+FILTER_VER_4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+dec r5  ; no branch follows: flags are overwritten by the add/sub below, hence iHeight >= 2
+lea r0, [r0+2*r1]
+movdqa xmm6, [r0]  ; row 6
+; slide the six-row window down by one row
+movdqa xmm0, xmm1
+movdqa xmm1, xmm2
+movdqa xmm2, xmm3
+movdqa xmm3, xmm4
+movdqa xmm4, xmm5
+movdqa xmm5, xmm6
+; from here on r0 points at the tap row held in the fifth window register
+add r2, r3  ; next destination row
+sub r0, r1  ; step back one row so the lea/[r0+r1] pattern below fetches the next unseen row
+; unrolled x8: each step filters the current window, then loads one new row into the register FILTER_VER_4 just clobbered
+.start:
+FILTER_VER_4 xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+dec r5
+jz near .x_loop_dec  ; all rows of this column group written
+
+lea r0, [r0+2*r1]
+movdqa xmm6, [r0]  ; next tap row
+FILTER_VER_4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+dec r5
+jz near .x_loop_dec
+
+lea r2, [r2+2*r3]  ; two destination rows were written since r2 last moved
+movdqa xmm7, [r0+r1]  ; next tap row
+FILTER_VER_4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+dec r5
+jz near .x_loop_dec
+
+lea r0, [r0+2*r1]
+movdqa xmm0, [r0]  ; next tap row
+FILTER_VER_4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+dec r5
+jz near .x_loop_dec
+
+lea r2, [r2+2*r3]
+movdqa xmm1, [r0+r1]  ; next tap row
+FILTER_VER_4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+dec r5
+jz near .x_loop_dec
+
+lea r0, [r0+2*r1]
+movdqa xmm2, [r0]  ; next tap row
+FILTER_VER_4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+dec r5
+jz near .x_loop_dec
+
+lea r2, [r2+2*r3]
+movdqa xmm3, [r0+r1]  ; next tap row
+FILTER_VER_4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+dec r5
+jz near .x_loop_dec
+
+lea r0, [r0+2*r1]
+movdqa xmm4, [r0]  ; next tap row
+FILTER_VER_4 xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+dec r5
+jz near .x_loop_dec
+
+lea r2, [r2+2*r3]
+movdqa xmm5, [r0+r1]  ; window is back in xmm0..xmm5 order with r0 at the row held in xmm4
+jmp near .start
+
+.x_loop_dec:
+dec r4  ; one column group finished
+jz near .exit
+%ifdef X86_32
+mov r0, arg1  ; reload the original pTap
+mov r2, arg3  ; reload the original pDst
+mov r5, arg6  ; reload the original iHeight
+%else
+mov r0, r12  ; original pTap
+mov r2, r13  ; original pDst
+mov r5, r14  ; original iHeight
+%endif
+add r0, 8  ; 4 words right. NOTE(review): pTap+8 is not 16-byte aligned when pTap is, yet .width_loop uses movdqa -- confirm callers never need a second group
+add r2, 4  ; 4 pixels right. NOTE(review): the bases are never advanced, so a third group would repeat the second
+jmp .width_loop
+
+.exit:
+%ifndef X86_32
+pop r14  ; restore in reverse push order
+pop r13
+pop r12
+%endif
+POP_XMM
+LOAD_6_PARA_POP
+ret
+
+
+;***********************************************************************
+;void McHorVer22Width4VerLastUnAlign_sse2(  -- vertical FILTER_VER_4 pass over 16-bit tap rows, 4 output bytes per row
+; const uint8_t *pTap,  (r0) rows are read with movdqu, so no alignment is required
+; int32_t iTapStride,  (r1) byte step between tap rows
+; uint8_t * pDst,  (r2) destination; 4 bytes are stored per row and column group
+; int32_t iDstStride,  (r3) byte step between destination rows
+; int32_t iWidth,  (r4) iWidth >> 2 column groups are processed; must be >= 4 or the group counter wraps
+; int32_t iHeight);  (r5) output rows; the first decrement is not tested, so it must be >= 2
+;***********************************************************************
+
+WELS_EXTERN McHorVer22Width4VerLastUnAlign_sse2
+%assign push_num 0
+LOAD_6_PARA
+PUSH_XMM 8
+SIGN_EXTENSION r1, r1d
+SIGN_EXTENSION r3, r3d
+SIGN_EXTENSION r4, r4d
+SIGN_EXTENSION r5, r5d
+%ifndef X86_32
+push r12  ; r12-r14 are saved here and restored at .exit
+push r13
+push r14
+mov r12, r0  ; keep the original pTap for restarting at the next column group
+mov r13, r2  ; keep the original pDst
+mov r14, r5  ; keep the original iHeight (on X86_32 these are re-read from arg1/arg3/arg6)
+%endif
+shr r4, 2  ; r4 = number of 4-pixel column groups
+
+.width_loop:  ; one pass per column group; prime xmm0..xmm5 with tap rows 0..5
+movdqu xmm0, [r0]  ; row 0 (8 words are loaded, FILTER_VER_4 stores only 4 results)
+movdqu xmm1, [r0+r1]  ; row 1
+lea r0, [r0+2*r1]
+movdqu xmm2, [r0]  ; row 2
+movdqu xmm3, [r0+r1]  ; row 3
+lea r0, [r0+2*r1]
+movdqu xmm4, [r0]  ; row 4
+movdqu xmm5, [r0+r1]  ; row 5
+; output row 0 from rows 0..5
+FILTER_VER_4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+dec r5  ; no branch follows: flags are overwritten by the add/sub below, hence iHeight >= 2
+lea r0, [r0+2*r1]
+movdqu xmm6, [r0]  ; row 6
+; slide the six-row window down by one row
+movdqa xmm0, xmm1
+movdqa xmm1, xmm2
+movdqa xmm2, xmm3
+movdqa xmm3, xmm4
+movdqa xmm4, xmm5
+movdqa xmm5, xmm6
+; from here on r0 points at the tap row held in the fifth window register
+add r2, r3  ; next destination row
+sub r0, r1  ; step back one row so the lea/[r0+r1] pattern below fetches the next unseen row
+; unrolled x8: each step filters the current window, then loads one new row into the register FILTER_VER_4 just clobbered
+.start:
+FILTER_VER_4 xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+dec r5
+jz near .x_loop_dec  ; all rows of this column group written
+
+lea r0, [r0+2*r1]
+movdqu xmm6, [r0]  ; next tap row
+FILTER_VER_4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+dec r5
+jz near .x_loop_dec
+
+lea r2, [r2+2*r3]  ; two destination rows were written since r2 last moved
+movdqu xmm7, [r0+r1]  ; next tap row
+FILTER_VER_4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+dec r5
+jz near .x_loop_dec
+
+lea r0, [r0+2*r1]
+movdqu xmm0, [r0]  ; next tap row
+FILTER_VER_4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+dec r5
+jz near .x_loop_dec
+
+lea r2, [r2+2*r3]
+movdqu xmm1, [r0+r1]  ; next tap row
+FILTER_VER_4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+dec r5
+jz near .x_loop_dec
+
+lea r0, [r0+2*r1]
+movdqu xmm2, [r0]  ; next tap row
+FILTER_VER_4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+dec r5
+jz near .x_loop_dec
+
+lea r2, [r2+2*r3]
+movdqu xmm3, [r0+r1]  ; next tap row
+FILTER_VER_4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+dec r5
+jz near .x_loop_dec
+
+lea r0, [r0+2*r1]
+movdqu xmm4, [r0]  ; next tap row
+FILTER_VER_4 xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+dec r5
+jz near .x_loop_dec
+
+lea r2, [r2+2*r3]
+movdqu xmm5, [r0+r1]  ; window is back in xmm0..xmm5 order with r0 at the row held in xmm4
+jmp near .start
+
+.x_loop_dec:
+dec r4  ; one column group finished
+jz near .exit
+%ifdef X86_32
+mov r0, arg1  ; reload the original pTap
+mov r2, arg3  ; reload the original pDst
+mov r5, arg6  ; reload the original iHeight
+%else
+mov r0, r12  ; original pTap
+mov r2, r13  ; original pDst
+mov r5, r14  ; original iHeight
+%endif
+add r0, 8  ; 4 words right of the original pTap
+add r2, 4  ; 4 pixels right. NOTE(review): the bases are never advanced, so a third group would repeat the second
+jmp .width_loop
+
+.exit:
+%ifndef X86_32
+pop r14  ; restore in reverse push order
+pop r13
+pop r12
+%endif
+POP_XMM
+LOAD_6_PARA_POP
+ret
+