ref: c2c861f19927a87d12bb6243c1ae0b7cc8018a07
parent: e1262a9ac97b614d9c7fc66e42baa6f97c065409
parent: 92bc88eacb6a7acc28963e60349860db0125221e
author: HaiboZhu <haibozhu@cisco.com>
date: Thu Nov 27 09:21:02 EST 2014
Merge pull request #1562 from zhilwang/asm-cabac Add asm code for decoder cabac
--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -156,4 +156,22 @@
vst1.32 {d22[0]},[r2],r1
vst1.32 {d22[1]},[r2]
WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsBlockZero16x16_neon // zero 16 rows of 16 int16_t each; r0 = block, r1 = stride in int16_t units
+ lsl r1, r1, #1 // int16_t stride -> row pitch in bytes
+ vmov.i16 q0, #0 // q0:q1 together hold 32 zero bytes = one 16-coefficient row
+ vmov.i16 q1, #0
+.rept 16
+ vst1.64 {q0, q1}, [r0], r1 // clear one row, then post-increment r0 by the byte pitch
+.endr
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN WelsBlockZero8x8_neon // zero 8 rows of 8 int16_t each; r0 = block, r1 = stride in int16_t units
+ lsl r1, r1, #1 // int16_t stride -> row pitch in bytes
+ vmov.i16 q0, #0 // q0 holds 16 zero bytes = one 8-coefficient row
+.rept 8
+ vst1.64 {q0}, [r0], r1 // clear one row, then post-increment r0 by the byte pitch
+.endr
+WELS_ASM_FUNC_END
#endif
--- a/codec/decoder/core/arm64/block_add_aarch64_neon.S
+++ b/codec/decoder/core/arm64/block_add_aarch64_neon.S
@@ -158,4 +158,21 @@
st1 {v1.s}[0],[x2],x1
st1 {v1.s}[1],[x2]
WELS_ASM_AARCH64_FUNC_END
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsBlockZero16x16_AArch64_neon // zero 16 rows of 16 int16_t each; x0 = block, w1 = int32_t stride in int16_t units
+ eor v0.16b, v0.16b, v0.16b // v0:v1 together hold 32 zero bytes = one 16-coefficient row
+ eor v1.16b, v1.16b, v1.16b
+ sbfiz x1, x1, #1, #32 // x1 = sign_extend(w1) * 2 (byte pitch); upper 32 bits of x1 are unspecified on entry for an int32_t argument
+.rept 16
+ st1 {v0.16b, v1.16b}, [x0], x1 // clear one row, then post-increment x0 by the byte pitch
+.endr
+WELS_ASM_AARCH64_FUNC_END
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsBlockZero8x8_AArch64_neon // zero 8 rows of 8 int16_t each; x0 = block, w1 = int32_t stride in int16_t units
+ eor v0.16b, v0.16b, v0.16b // v0 holds 16 zero bytes = one 8-coefficient row
+ sbfiz x1, x1, #1, #32 // x1 = sign_extend(w1) * 2 (byte pitch); upper 32 bits of x1 are unspecified on entry for an int32_t argument
+.rept 8
+ st1 {v0.16b}, [x0], x1 // clear one row, then post-increment x0 by the byte pitch
+.endr
+WELS_ASM_AARCH64_FUNC_END
#endif
--- a/codec/decoder/core/inc/decode_slice.h
+++ b/codec/decoder/core/inc/decode_slice.h
@@ -67,12 +67,21 @@
extern "C" {
#endif//__cplusplus
+#if defined(X86_ASM)
+void WelsBlockZero16x16_sse2(int16_t * block, int32_t stride); // zeroes 16 rows of 16 int16_t; stride is in int16_t units (the asm doubles it to bytes)
+void WelsBlockZero8x8_sse2(int16_t * block, int32_t stride); // zeroes 8 rows of 8 int16_t; stride is in int16_t units (the asm doubles it to bytes)
+#endif
+
#if defined(HAVE_NEON)
void SetNonZeroCount_neon (int8_t* pNonZeroCount);
+void WelsBlockZero16x16_neon(int16_t * block, int32_t stride); // zeroes 16 rows of 16 int16_t; stride is in int16_t units (the asm doubles it to bytes)
+void WelsBlockZero8x8_neon(int16_t * block, int32_t stride); // zeroes 8 rows of 8 int16_t; stride is in int16_t units (the asm doubles it to bytes)
#endif
#if defined(HAVE_NEON_AARCH64)
void SetNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
+void WelsBlockZero16x16_AArch64_neon(int16_t * block, int32_t stride); // zeroes 16 rows of 16 int16_t; stride is in int16_t units (the asm doubles it to bytes)
+void WelsBlockZero8x8_AArch64_neon(int16_t * block, int32_t stride); // zeroes 8 rows of 8 int16_t; stride is in int16_t units (the asm doubles it to bytes)
#endif
#ifdef __cplusplus
}
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -1644,15 +1644,25 @@
//TO DO add neon and X86
#ifdef HAVE_NEON
if (iCpu & WELS_CPU_NEON) {
-
+ pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_neon;
+ pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_neon;
}
#endif
#ifdef HAVE_NEON_AARCH64
if (iCpu & WELS_CPU_NEON) {
+ pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_AArch64_neon;
+ pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_AArch64_neon;
+ }
+#endif
+#if defined(X86_ASM)
+ if (iCpu & WELS_CPU_SSE2) {
+ pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_sse2;
+ pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_sse2;
}
#endif
+
}
void SetNonZeroCount_c (int8_t* pNonZeroCount) {
--- a/codec/decoder/core/x86/dct.asm
+++ b/codec/decoder/core/x86/dct.asm
@@ -113,3 +113,30 @@
emms
ret
+
+;void WelsBlockZero16x16_sse2(int16_t * block, int32_t stride); clears 16 rows of 16 int16_t each, stride counted in int16_t units
+WELS_EXTERN WelsBlockZero16x16_sse2
+ %assign push_num 0
+ LOAD_2_PARA
+ pxor xmm0, xmm0 ; xmm0 = 16 zero bytes (half of one 16-coefficient row)
+ SIGN_EXTENSION r1, r1d
+ add r1, r1 ; int16_t stride -> row pitch in bytes
+%rep 16
+ movdqa [r0], xmm0 ; first 8 coefficients of the row; movdqa needs a 16-byte aligned address
+ movdqa [r0+16], xmm0 ; last 8 coefficients of the row
+ add r0, r1 ; step to the next row
+%endrep
+ ret
+
+;void WelsBlockZero8x8_sse2(int16_t * block, int32_t stride); clears 8 rows of 8 int16_t each, stride counted in int16_t units
+WELS_EXTERN WelsBlockZero8x8_sse2
+ %assign push_num 0
+ LOAD_2_PARA
+ pxor xmm0, xmm0 ; xmm0 = 16 zero bytes (one full 8-coefficient row)
+ SIGN_EXTENSION r1, r1d
+ add r1, r1 ; int16_t stride -> row pitch in bytes
+%rep 8
+ movdqa [r0], xmm0 ; clear one row; movdqa needs a 16-byte aligned address
+ add r0, r1 ; step to the next row
+%endrep
+ ret