ref: ac08cc4b2fcd087c9d56bd89a8092d582a9433fa
parent: 9a89ee72b0bfb0f0202d40466c15a596caace543
parent: 01b74ea7c1b8deba1fe2f8ecbc3a4d4e9afeab48
author: huili2 <huili2@cisco.com>
date: Tue Jan 6 08:37:56 EST 2015
Merge pull request #1694 from zhilwang/asm-SetNoneZero Add asm code for NoneZeroCount and refine related code
--- a/codec/common/arm/deblocking_neon.S
+++ b/codec/common/arm/deblocking_neon.S
@@ -834,17 +834,12 @@
WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
-
- vld1.64 {d0-d2}, [r0]
-
- vceq.s8 q0, q0, #0
- vceq.s8 d2, d2, #0
- vmvn q0, q0
- vmvn d2, d2
- vabs.s8 q0, q0
- vabs.s8 d2, d2
-
- vst1.64 {d0-d2}, [r0]
+// r0: int8_t* pNonZeroCount -- 24 bytes, each rewritten in place to 0 or 1
+ mov r1, #1
+ vdup.8 q2, r1 // q2 (d4:d5) = every byte lane set to 1
+ vld1.64 {d0,d1,d2}, [r0] // load all 24 bytes
+// Unsigned minimum: a signed min would leave bytes >= 0x80 negative instead of 1,
+// which disagrees with the C reference (!!x), the AArch64 umin and the SSE2 pminub versions.
+ vmin.u8 q0, q0, q2
+ vmin.u8 d2, d2, d4
+ vst1.64 {d0,d1,d2}, [r0] // store the 0/1 flags back over the input
WELS_ASM_FUNC_END
#ifdef __APPLE__
--- a/codec/common/arm64/deblocking_aarch64_neon.S
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -553,16 +553,12 @@
#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
+// x0: int8_t* pNonZeroCount -- 24 bytes rewritten in place as unsigned min(byte, 1)
+ movi v3.8b, #1 // eight byte lanes of 1, built without a general-purpose scratch register
ld1 {v0.8b, v1.8b, v2.8b}, [x0]
- ins v0.d[1], v1.d[0]
- uzp1 v0.2d, v0.2d, v1.2d
- cmeq v0.16b, v0.16b, #0
- cmeq v2.8b, v2.8b, #0
- mvn v0.16b, v0.16b
- mvn v2.8b, v2.8b
- abs v0.16b, v0.16b
- abs v2.8b, v2.8b
- ins v1.d[0], v0.d[1]
+ umin v0.8b, v0.8b, v3.8b // bytes 0..7
+ umin v1.8b, v1.8b, v3.8b // bytes 8..15
+ umin v2.8b, v2.8b, v3.8b // bytes 16..23
st1 {v0.8b, v1.8b, v2.8b}, [x0]
WELS_ASM_AARCH64_FUNC_END
--- a/codec/common/inc/deblocking_common.h
+++ b/codec/common/inc/deblocking_common.h
@@ -15,6 +15,8 @@
int8_t* pTc);
void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+// Rewrites each of the 24 int8_t entries at pNonZeroCount in place: 0 stays 0, any other value becomes 1.
+void WelsNonZeroCount_c (int8_t* pNonZeroCount);
+
#if defined(__cplusplus)
extern "C" {
#endif//__cplusplus
@@ -32,6 +34,7 @@
void DeblockChromaEq4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
int8_t* pTC);
+void WelsNonZeroCount_sse2 (int8_t* pNonZeroCount);
#endif
#if defined(HAVE_NEON)
@@ -48,6 +51,7 @@
void DeblockChromaLt4H_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
int8_t* pTC);
void DeblockChromaEq4H_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void WelsNonZeroCount_neon (int8_t* pNonZeroCount);
#endif
#if defined(HAVE_NEON_AARCH64)
@@ -61,6 +65,7 @@
void DeblockChromaLt4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
int8_t* pTC);
void DeblockChromaEq4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
#endif
#if defined(__cplusplus)
}
--- a/codec/common/src/deblocking_common.cpp
+++ b/codec/common/src/deblocking_common.cpp
@@ -180,6 +180,13 @@
DeblockChromaEq4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta);
}
+// Normalizes the 24 entries of pNonZeroCount in place: zero stays 0, anything else becomes 1.
+void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
+  for (int32_t iIdx = 0; iIdx < 24; ++iIdx) {
+    pNonZeroCount[iIdx] = (pNonZeroCount[iIdx] != 0) ? 1 : 0;
+  }
+}
+
#ifdef X86_ASM
extern "C" {
void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -5276,3 +5276,14 @@
pop r3
ret
+; void WelsNonZeroCount_sse2 (int8_t* pNonZeroCount)
+; Rewrites the 24 bytes addressed by r0 in place as the unsigned per-byte minimum of each byte and xmm2.
+WELS_EXTERN WelsNonZeroCount_sse2
+ %assign push_num 0
+ LOAD_1_PARA ; presumably makes the first argument available in r0 -- macro defined elsewhere, confirm
+ movdqu xmm0, [r0] ; bytes 0..15, unaligned load
+ movq xmm1, [r0+16] ; bytes 16..23
+ WELS_DB1 xmm2 ; presumably fills xmm2 with bytes of value 1 -- verify against the macro definition
+ pminub xmm0, xmm2 ; unsigned minimum per byte
+ pminub xmm1, xmm2
+ movdqu [r0], xmm0 ; write the results back over the input
+ movq [r0+16], xmm1
+ ret
--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -99,18 +99,6 @@
// }
.endm
#endif
-// r0 int8_t* non_zero_count,
-WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
- vld1.64 {d0-d2}, [r0]
- vceq.s8 q0, q0, #0
- vceq.s8 d2, d2, #0
- vmvn q0, q0
- vmvn d2, d2
- vabs.s8 q0, q0
- vabs.s8 d2, d2
- vst1.64 {d0-d2}, [r0]
-WELS_ASM_FUNC_END
-
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
--- a/codec/decoder/core/arm64/block_add_aarch64_neon.S
+++ b/codec/decoder/core/arm64/block_add_aarch64_neon.S
@@ -100,20 +100,6 @@
// }
.endm
#endif
-// x0 int8_t* non_zero_count,
-WELS_ASM_AARCH64_FUNC_BEGIN SetNonZeroCount_AArch64_neon
- mov x1, x0
- ld1 {v0.16b}, [x1], #16
- ld1 {v1.8b}, [x1]
- cmeq v0.16b, v0.16b, #0
- cmeq v1.8b, v1.8b, #0
- mvn v0.16b, v0.16b
- mvn v1.8b, v1.8b
- abs v0.16b, v0.16b
- abs v1.8b, v1.8b
- st1 {v0.16b}, [x0], #16
- st1 {v1.8b}, [x0]
-WELS_ASM_AARCH64_FUNC_END
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon
--- a/codec/decoder/core/inc/decode_slice.h
+++ b/codec/decoder/core/inc/decode_slice.h
@@ -73,13 +73,11 @@
#endif
#if defined(HAVE_NEON)
-void SetNonZeroCount_neon (int8_t* pNonZeroCount);
void WelsBlockZero16x16_neon(int16_t * block, int32_t stride);
void WelsBlockZero8x8_neon(int16_t * block, int32_t stride);
#endif
#if defined(HAVE_NEON_AARCH64)
-void SetNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
void WelsBlockZero16x16_AArch64_neon(int16_t * block, int32_t stride);
void WelsBlockZero8x8_AArch64_neon(int16_t * block, int32_t stride);
#endif
@@ -86,8 +84,6 @@
#ifdef __cplusplus
}
#endif//__cplusplus
-
-void SetNonZeroCount_c (int8_t* pNonZeroCount);
void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu);
void WelsBlockZero16x16_c(int16_t * block, int32_t stride);
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -1695,25 +1695,13 @@
}
void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
- pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_c;
+ pFunc->pWelsSetNonZeroCountFunc = WelsNonZeroCount_c;
+ pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_c;
+ pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_c;
#ifdef HAVE_NEON
if (iCpu & WELS_CPU_NEON) {
- pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon;
- }
-#endif
-
-#ifdef HAVE_NEON_AARCH64
- if (iCpu & WELS_CPU_NEON) {
- pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_AArch64_neon;
- }
-#endif
-
- pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_c;
- pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_c;
- //TO DO add neon and X86
-#ifdef HAVE_NEON
- if (iCpu & WELS_CPU_NEON) {
+ pFunc->pWelsSetNonZeroCountFunc = WelsNonZeroCount_neon;
pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_neon;
pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_neon;
}
@@ -1721,6 +1709,7 @@
#ifdef HAVE_NEON_AARCH64
if (iCpu & WELS_CPU_NEON) {
+ pFunc->pWelsSetNonZeroCountFunc = WelsNonZeroCount_AArch64_neon;
pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_AArch64_neon;
pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_AArch64_neon;
}
@@ -1728,19 +1717,12 @@
#if defined(X86_ASM)
if (iCpu & WELS_CPU_SSE2) {
+ pFunc->pWelsSetNonZeroCountFunc = WelsNonZeroCount_sse2;
pFunc->pWelsBlockZero16x16Func = WelsBlockZero16x16_sse2;
pFunc->pWelsBlockZero8x8Func = WelsBlockZero8x8_sse2;
}
#endif
-}
-
-void SetNonZeroCount_c (int8_t* pNonZeroCount) {
- int32_t i;
-
- for (i = 0; i < 24; i++) {
- pNonZeroCount[i] = !!pNonZeroCount[i];
- }
}
void WelsBlockInit (int16_t* pBlock, int iW, int iH, int iStride, uint8_t uiVal) {
--- a/codec/encoder/core/inc/deblocking.h
+++ b/codec/encoder/core/inc/deblocking.h
@@ -65,12 +65,10 @@
extern "C" {
#endif//__cplusplus
#if defined(HAVE_NEON)
-void WelsNonZeroCount_neon (int8_t* pNonZeroCount);
void DeblockingBSCalcEnc_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBoundryFlag, int32_t iMbStride,
uint8_t (*pBS)[4][4]);
#endif
#if defined(HAVE_NEON_AARCH64)
-void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
void DeblockingBSCalcEnc_AArch64_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBoundryFlag, int32_t iMbStride,
uint8_t (*pBS)[4][4]);
#endif
@@ -79,7 +77,6 @@
#endif//__cplusplus
void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu);
-void WelsNonZeroCount_c (int8_t* pNonZeroCount);
void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero, int32_t iCpu);
void PerformDeblockingFilter (sWelsEncCtx* pEnc);
--- a/codec/encoder/core/src/deblocking.cpp
+++ b/codec/encoder/core/src/deblocking.cpp
@@ -774,13 +774,6 @@
}
}
-void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
- int32_t i;
-
- for (i = 0; i < 24; i++) {
- pNonZeroCount[i] = !!pNonZeroCount[i];
- }
-}
void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero, int32_t iCpu) {
*pfSetNZCZero = WelsNonZeroCount_c;
#ifdef HAVE_NEON
@@ -791,6 +784,11 @@
#ifdef HAVE_NEON_AARCH64
if (iCpu & WELS_CPU_NEON) {
*pfSetNZCZero = WelsNonZeroCount_AArch64_neon;
+ }
+#endif
+#if defined(X86_ASM)
+ if (iCpu & WELS_CPU_SSE2) {
+ *pfSetNZCZero = WelsNonZeroCount_sse2;
}
#endif
}
--- a/test/decoder/DecUT_IdctResAddPred.cpp
+++ b/test/decoder/DecUT_IdctResAddPred.cpp
@@ -1,7 +1,7 @@
#include <gtest/gtest.h>
#include "macros.h"
#include "decode_mb_aux.h"
-#include "../../codec/decoder/core/src/decode_slice.cpp"
+#include "deblocking.h"
using namespace WelsDec;
void IdctResAddPred_ref (uint8_t* pPred, const int32_t kiStride, int16_t* pRs) {
int16_t iSrc[16];
@@ -98,7 +98,7 @@
{\
int8_t iNonZeroCount[2][24];\
for(int32_t i = 0; i < 24; i++) {\
- iNonZeroCount[0][i] = iNonZeroCount[1][i] = (rand() % 256)-128;\
+ iNonZeroCount[0][i] = iNonZeroCount[1][i] = (rand() % 25);\
}\
method(iNonZeroCount[0]);\
SetNonZeroCount_ref(iNonZeroCount[1]);\
@@ -106,7 +106,7 @@
ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\
}\
for(int32_t i =0; i<24; i++) {\
- iNonZeroCount[0][i] = iNonZeroCount[1][i] = -128;\
+ iNonZeroCount[0][i] = iNonZeroCount[1][i] = 0;\
}\
method(iNonZeroCount[0]);\
SetNonZeroCount_ref(iNonZeroCount[1]);\
@@ -114,7 +114,7 @@
ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\
}\
for(int32_t i =0; i<24; i++) {\
- iNonZeroCount[0][i] = iNonZeroCount[1][i] = 127;\
+ iNonZeroCount[0][i] = iNonZeroCount[1][i] = 16;\
}\
method(iNonZeroCount[0]);\
SetNonZeroCount_ref(iNonZeroCount[1]);\
@@ -123,12 +123,16 @@
}\
}
-GENERATE_SETNONZEROCOUNT (SetNonZeroCount_c)
+GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_c)
+#if defined(X86_ASM)
+GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_sse2)
+#endif
+
#if defined(HAVE_NEON)
-GENERATE_SETNONZEROCOUNT (SetNonZeroCount_neon)
+GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_neon)
#endif
#if defined(HAVE_NEON_AARCH64)
-GENERATE_SETNONZEROCOUNT (SetNonZeroCount_AArch64_neon)
+GENERATE_SETNONZEROCOUNT (WelsNonZeroCount_AArch64_neon)
#endif