shithub: openh264

Download patch

ref: c30cc41261f814035fa4c06e098b53152cb3555d
parent: e9dc97803dfb11e3364f8b2cc6c67dd4ffa8f512
parent: 4645bd26aa506fe5dd54dc230f3d36e446261360
author: HaiboZhu <haibozhu@cisco.com>
date: Wed May 4 05:49:47 EDT 2016

Merge pull request #2448 from saamas/encoder-getnonzerocount-sse42

[Encoder] Add an SSE4.2 implementation of WelsGetNonZeroCount

--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -76,6 +76,7 @@
 #ifdef X86_ASM
 
 int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel);
+int32_t WelsGetNoneZeroCount_sse42 (int16_t* pLevel); // implemented in x86/score.asm; reads 16 int16_t via movdqa/packsswb, so pLevel must be 16-byte aligned
 
 /****************************************************************************
  * Scan and Score functions
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -523,6 +523,9 @@
   if (uiCpuFlag & WELS_CPU_SSSE3) {
     pFuncList->pfScan4x4                = WelsScan4x4DcAc_ssse3;
   }
+  if (uiCpuFlag & WELS_CPU_SSE42) {
+    pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_sse42;
+  }
   if (uiCpuFlag & WELS_CPU_AVX2) {
     pFuncList->pfDctT4                  = WelsDctT4_avx2;
     pFuncList->pfDctFourT4              = WelsDctFourT4_avx2;
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -337,3 +337,17 @@
     ;add       al,  [nozero_count_table+r1]
     ret
 
+;***********************************************************************
+; int32_t WelsGetNoneZeroCount_sse42(int16_t* level); returns how many of the 16 int16_t values at level are nonzero
+;***********************************************************************
+WELS_EXTERN WelsGetNoneZeroCount_sse42
+    %assign push_num 0
+    LOAD_1_PARA
+    movdqa          xmm0, [r0] ; first 8 words; aligned load, [r0] must be 16-byte aligned (r0 presumably holds level after LOAD_1_PARA - confirm)
+    packsswb        xmm0, [r0 + 16] ; pack with the next 8 words into 16 signed-saturated bytes; saturation keeps a nonzero word nonzero
+    pxor            xmm1, xmm1 ; xmm1 = all zero
+    pcmpeqb         xmm0, xmm1 ; each byte becomes 0FFh where the packed value is zero, 00h otherwise
+    pmovmskb        retrd, xmm0 ; 16-bit mask: bit i set when value i is zero
+    xor             retrd, 0FFFFh ; invert the low 16 bits: bit i set when value i is nonzero
+    popcnt          retrd, retrd ; count of set bits = number of nonzero values
+    ret
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -269,26 +269,43 @@
 GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16NotAligned_sse2);
 GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16_sse2);
 #endif
-TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_c) {
+
+namespace {
+
+void TestGetNoneZeroCount (PGetNoneZeroCountFunc func) { // shared driver: checks func against a scalar count on 16-entry inputs
   ENFORCE_STACK_ALIGN_1D (int16_t, pLevel, 16, 16);
-  int32_t result = 0;
-  for (int i = 0; i < 16; i++) {
-    pLevel[i] = (rand() & 0x07) - 4;
-    if (pLevel[i]) result ++;
+  const int num_test_runs = 1000;
+  for (int run = 0; run < num_test_runs; run++) {
+    const bool all_zero = run == 0; // run 0: every entry is zero
+    const bool all_nonzero = run == 1; // run 1: every entry is nonzero
+    int result = 0; // expected count, recomputed for each run
+    for (int i = 0; i < 16; i++) {
+      const int r = rand();
+      if (all_zero)
+        pLevel[i] = 0;
+      else if (all_nonzero)
+        pLevel[i] = r % 0xFFFF - 0x8000 ? r % 0xFFFF - 0x8000 : 0x7FFF; // value in [-0x8000, 0x7FFE]; a zero draw is replaced by 0x7FFF
+      else
+        pLevel[i] = (r >> 16 & 1) * ((r & 0xFFFF) - 0x8000); // bit 16 of r picks zero vs. a 16-bit value; NOTE(review): if RAND_MAX is 0x7FFF, r >> 16 is always 0 and every entry is zero
+      result += pLevel[i] != 0;
+    }
+    const int32_t nnz = func (pLevel); // count reported by the implementation under test
+    EXPECT_EQ (nnz, result);
   }
-  int32_t nnz = WelsGetNoneZeroCount_c (pLevel);
-  EXPECT_EQ (nnz, result);
 }
+
+} // anonymous namespace
+
+TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_c) { // C implementation, run through the shared driver
+  TestGetNoneZeroCount (WelsGetNoneZeroCount_c);
+}
 #ifdef X86_ASM
 TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
-  ENFORCE_STACK_ALIGN_1D (int16_t, pLevel, 16, 16);
-  int32_t result = 0;
-  for (int i = 0; i < 16; i++) {
-    pLevel[i] = (rand() & 0x07) - 4;
-    if (pLevel[i]) result ++;
-  }
-  int32_t nnz = WelsGetNoneZeroCount_sse2 (pLevel);
-  EXPECT_EQ (nnz, result);
+  TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2); // same driver as the C test
+}
+TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse42) { // SSE4.2 variant, exercised only when the CPU flag is reported
+  if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42) // NOTE(review): when the flag is clear nothing runs and the test passes without checking anything
+    TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);
 }
 #endif
 #define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign)