shithub: openh264

Download patch

ref: 4645bd26aa506fe5dd54dc230f3d36e446261360
parent: d906dda2240b2c4b39687f7474a4d1607319681a
author: Sindre Aamås <saamas@cisco.com>
date: Tue Apr 19 15:42:17 EDT 2016

[Encoder] Add an SSE4.2 implementation of WelsGetNonZeroCount

Avoid touching some cache lines by using popcnt instead of table
lookups.

Also gives a speedup of ~1.4x on Haswell as compared with SSE2.

--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -76,6 +76,7 @@
 #ifdef X86_ASM
 
 int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel);
+int32_t WelsGetNoneZeroCount_sse42 (int16_t* pLevel);
 
 /****************************************************************************
  * Scan and Score functions
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -523,6 +523,9 @@
   if (uiCpuFlag & WELS_CPU_SSSE3) {
     pFuncList->pfScan4x4                = WelsScan4x4DcAc_ssse3;
   }
+  if (uiCpuFlag & WELS_CPU_SSE42) {
+    pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_sse42;
+  }
   if (uiCpuFlag & WELS_CPU_AVX2) {
     pFuncList->pfDctT4                  = WelsDctT4_avx2;
     pFuncList->pfDctFourT4              = WelsDctFourT4_avx2;
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -337,3 +337,17 @@
     ;add       al,  [nozero_count_table+r1]
     ret
 
+;***********************************************************************
+; int32_t WelsGetNoneZeroCount_sse42(int16_t* level);
+;***********************************************************************
+WELS_EXTERN WelsGetNoneZeroCount_sse42      ; returns how many of the 16 int16 values at level are nonzero
+    %assign push_num 0
+    LOAD_1_PARA                             ; presumably makes the level pointer available in r0 -- confirm in the macro include
+    movdqa          xmm0, [r0]              ; aligned load of level[0..7]; faults unless level is 16-byte aligned
+    packsswb        xmm0, [r0 + 16]         ; saturating pack with level[8..15]: one byte per value, nonzero stays nonzero
+    pxor            xmm1, xmm1              ; xmm1 = all zero, the comparison reference
+    pcmpeqb         xmm0, xmm1              ; byte = 0FFh where the value is zero, 00h otherwise
+    pmovmskb        retrd, xmm0             ; bit i set <=> value i is zero (only bits 0..15 can be set)
+    xor             retrd, 0FFFFh           ; flip the low 16 bits: bit i set <=> value i is nonzero
+    popcnt          retrd, retrd            ; number of set bits = nonzero count, left in the return register
+    ret
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -301,6 +301,10 @@
 TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
   TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2);
 }
+TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse42) {  // runs the shared nonzero-count checker on the SSE4.2 routine
+  if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)       // without the SSE42 flag the body is skipped and the test passes vacuously
+    TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);
+}
 #endif
 #define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign)
 #define NEW_QUANT(pDct, ff, mf) (((ff)+ WELS_ABS_LC(pDct))*(mf)) >>16