ref: c30cc41261f814035fa4c06e098b53152cb3555d
parent: e9dc97803dfb11e3364f8b2cc6c67dd4ffa8f512
parent: 4645bd26aa506fe5dd54dc230f3d36e446261360
author: HaiboZhu <haibozhu@cisco.com>
date: Wed May 4 05:49:47 EDT 2016
Merge pull request #2448 from saamas/encoder-getnonzerocount-sse42 [Encoder] Add an SSE4.2 implementation of WelsGetNoneZeroCount
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -76,6 +76,7 @@
#ifdef X86_ASM
int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel);
+int32_t WelsGetNoneZeroCount_sse42 (int16_t* pLevel);
/****************************************************************************
* Scan and Score functions
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -523,6 +523,9 @@
if (uiCpuFlag & WELS_CPU_SSSE3) {
pFuncList->pfScan4x4 = WelsScan4x4DcAc_ssse3;
}
+ if (uiCpuFlag & WELS_CPU_SSE42) {
+ pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_sse42;
+ }
if (uiCpuFlag & WELS_CPU_AVX2) {
pFuncList->pfDctT4 = WelsDctT4_avx2;
pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -337,3 +337,17 @@
;add al, [nozero_count_table+r1]
ret
+;***********************************************************************
+; int32_t WelsGetNoneZeroCount_sse42(int16_t* level); returns how many of the 16 int16_t values at level are nonzero
+;***********************************************************************
+WELS_EXTERN WelsGetNoneZeroCount_sse42
+ %assign push_num 0
+ LOAD_1_PARA
+ movdqa xmm0, [r0] ; aligned load of words 0..7 (r0 presumably holds level after LOAD_1_PARA; address must be 16-byte aligned)
+ packsswb xmm0, [r0 + 16] ; signed-saturating pack with words 8..15 into 16 bytes; a nonzero word never packs to a zero byte
+ pxor xmm1, xmm1 ; xmm1 = all zeros
+ pcmpeqb xmm0, xmm1 ; byte i = 0FFh where value i was zero
+ pmovmskb retrd, xmm0 ; bit i of retrd set <=> value i is zero
+ xor retrd, 0FFFFh ; invert the low 16 bits: bit i set <=> value i is nonzero
+ popcnt retrd, retrd ; number of set bits = nonzero count; NOTE(review): popcnt has its own CPUID bit, confirm WELS_CPU_SSE42 implies it
+ ret
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -269,26 +269,43 @@
GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16NotAligned_sse2);
GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16_sse2);
#endif
-TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_c) {
+
+namespace {
+
+void TestGetNoneZeroCount (PGetNoneZeroCountFunc func) { // checks func against a reference count on generated 16-value blocks
ENFORCE_STACK_ALIGN_1D (int16_t, pLevel, 16, 16);
- int32_t result = 0;
- for (int i = 0; i < 16; i++) {
- pLevel[i] = (rand() & 0x07) - 4;
- if (pLevel[i]) result ++;
+ const int num_test_runs = 1000;
+ for (int run = 0; run < num_test_runs; run++) {
+ const bool all_zero = run == 0; // first run: every value is zero
+ const bool all_nonzero = run == 1; // second run: every value is nonzero
+ int result = 0; // reference nonzero count for this run
+ for (int i = 0; i < 16; i++) {
+ const int r = rand();
+ if (all_zero)
+ pLevel[i] = 0;
+ else if (all_nonzero)
+ pLevel[i] = r % 0xFFFF - 0x8000 ? r % 0xFFFF - 0x8000 : 0x7FFF; // a zero draw is replaced by 0x7FFF
+ else
+ pLevel[i] = (r >> 8 & 1) * (((r & 0xFF) << 8 | (rand() & 0xFF)) - 0x8000); // uses only the low 15 bits of rand(): RAND_MAX may be as small as 0x7FFF
+ result += pLevel[i] != 0;
+ }
+ const int32_t nnz = func (pLevel);
+ EXPECT_EQ (nnz, result);
}
- int32_t nnz = WelsGetNoneZeroCount_c (pLevel);
- EXPECT_EQ (nnz, result);
}
+
+} // anonymous namespace
+
+TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_c) {
+ TestGetNoneZeroCount (WelsGetNoneZeroCount_c); // run the shared checker on the C implementation
+}
#ifdef X86_ASM
TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
- ENFORCE_STACK_ALIGN_1D (int16_t, pLevel, 16, 16);
- int32_t result = 0;
- for (int i = 0; i < 16; i++) {
- pLevel[i] = (rand() & 0x07) - 4;
- if (pLevel[i]) result ++;
- }
- int32_t nnz = WelsGetNoneZeroCount_sse2 (pLevel);
- EXPECT_EQ (nnz, result);
+ TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2); // run the shared checker on the SSE2 implementation
+}
+TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse42) {
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42) // run only when the detected flags include SSE4.2; otherwise the body is empty and the test passes
+ TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);
}
#endif
#define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign)