shithub: openh264

--- a/codec/encoder/core/inc/set_mb_syn_cavlc.h

+++ b/codec/encoder/core/inc/set_mb_syn_cavlc.h

@@ -75,9 +75,13 @@

 extern "C" {

 #endif//__cplusplus

+int32_t CavlcParamCal_c (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,

+                         int32_t iEndIdx);

 #ifdef  X86_ASM

 int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,

                             int32_t iEndIdx);

+int32_t CavlcParamCal_sse42 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,

+                             int32_t iEndIdx);

 #endif

 #if defined(__cplusplus)

--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp

+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp

@@ -279,6 +279,11 @@

     pFuncList->pfCavlcParamCal = CavlcParamCal_sse2;

 #endif

+#ifdef X86_ASM

+  if (uiCpuFlag & WELS_CPU_SSE42) {

+    pFuncList->pfCavlcParamCal = CavlcParamCal_sse42;

+  }

+#endif

   if (iEntropyCodingModeFlag) {

     pFuncList->pfStashMBStatus = StashMBStatusCabac;

     pFuncList->pfStashPopMBStatus = StashPopMBStatusCabac;

--- a/codec/encoder/core/x86/coeff.asm

+++ b/codec/encoder/core/x86/coeff.asm

@@ -42,10 +42,57 @@

 %include "asm_inc.asm"

+SECTION .rodata align=16

+align 16

+; pshufb control vector that reverses the order of the 16 bytes of a register.

+wels_shufb_rev:

+    db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

+; 4-bit table giving number of preceding zeros for each set bit as well as the

+; eventual next bit. For the case where all 4 bits are set, this requires 5

+; zeros. The 5th zero can either be read from beyond the final table entry or

+; implied via zero-initializing the location being read into.

+; Each entry is 4 bytes and is selected by the 4-bit mask value; the trailing

+; comment on every row shows that mask value in binary.

+wels_cavlc_param_cal_run_lut:

+    db 4, 0, 0, 0                                           ; 0000b

+    db 0, 3, 0, 0                                           ; 0001b

+    db 1, 2, 0, 0                                           ; 0010b

+    db 0, 0, 2, 0                                           ; 0011b

+    db 2, 1, 0, 0                                           ; 0100b

+    db 0, 1, 1, 0                                           ; 0101b

+    db 1, 0, 1, 0                                           ; 0110b

+    db 0, 0, 0, 1                                           ; 0111b

+    db 3, 0, 0, 0                                           ; 1000b

+    db 0, 2, 0, 0                                           ; 1001b

+    db 1, 1, 0, 0                                           ; 1010b

+    db 0, 0, 1, 0                                           ; 1011b

+    db 2, 0, 0, 0                                           ; 1100b

+    db 0, 1, 0, 0                                           ; 1101b

+    db 1, 0, 0, 0                                           ; 1110b

+    db 0, 0, 0, 0                                           ; 1111b

+;   db 0                                                    ; The optional 5th zero for entry 1111b (left commented out).

+; 4-bit table giving pshufb vectors for compacting 4-word vectors by removing

+; the words that match zero bits and concatenating in reverse order.

+; Each entry is 8 bytes and is selected by the 4-bit mask value (shown in binary

+; on every row); mask bit 0 corresponds to the last word (bytes 6-7) and mask

+; bit 3 to the first word (bytes 0-1). Unused trailing slots select byte 0.

+wels_cavlc_param_cal_shufb_lut:

+    db 0, 0, 0, 0, 0, 0, 0, 0                               ; 0000b

+    db 6, 7, 0, 0, 0, 0, 0, 0                               ; 0001b

+    db 4, 5, 0, 0, 0, 0, 0, 0                               ; 0010b

+    db 6, 7, 4, 5, 0, 0, 0, 0                               ; 0011b

+    db 2, 3, 0, 0, 0, 0, 0, 0                               ; 0100b

+    db 6, 7, 2, 3, 0, 0, 0, 0                               ; 0101b

+    db 4, 5, 2, 3, 0, 0, 0, 0                               ; 0110b

+    db 6, 7, 4, 5, 2, 3, 0, 0                               ; 0111b

+    db 0, 1, 0, 0, 0, 0, 0, 0                               ; 1000b

+    db 6, 7, 0, 1, 0, 0, 0, 0                               ; 1001b

+    db 4, 5, 0, 1, 0, 0, 0, 0                               ; 1010b

+    db 6, 7, 4, 5, 0, 1, 0, 0                               ; 1011b

+    db 2, 3, 0, 1, 0, 0, 0, 0                               ; 1100b

+    db 6, 7, 2, 3, 0, 1, 0, 0                               ; 1101b

+    db 4, 5, 2, 3, 0, 1, 0, 0                               ; 1110b

+    db 6, 7, 4, 5, 2, 3, 0, 1                               ; 1111b

 %ifdef X86_32

-SECTION .rodata align=16

 align 16

 sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8

@@ -312,6 +359,8 @@

     db 7,6,5,4,3,2,1,7, ;254

     db 7,6,5,4,3,2,1,8, ;255

+%endif ; X86_32

 ;***********************************************************************

 ; Code

 ;***********************************************************************

@@ -318,6 +367,7 @@

 SECTION .text

+%ifdef X86_32

 ;***********************************************************************

 ;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);

@@ -457,3 +507,166 @@

     pop ebx

ret

 %endif

+;***********************************************************************

+;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);

+;

+; Walks the coefficients from the highest index down to index 0 and

+;   - stores the number of non-zero coefficients to *total_coeffs,

+;   - stores the non-zero coefficients to Level in that (descending index) order,

+;   - stores the zero-run lengths belonging to those coefficients to run,

+;   - returns the number of zero coefficients located below the highest-index

+;     non-zero coefficient, or 0 when no loaded coefficient is non-zero.

+; coffLevel: read with two 16-byte loads (the first one aligned) when endIdx > 3,

+;            otherwise with a single 8-byte load. endIdx is used for nothing else.

+; run:       its first 16 bytes are cleared with an aligned 16-byte store; later

+;            stores are 4 bytes wide regardless of how many runs are valid.

+; Level:     written with 8-byte stores regardless of how many levels are valid.

+;***********************************************************************

+WELS_EXTERN CavlcParamCal_sse42

+%define i_endidxd      dword arg5d

+%ifdef X86_32

+    push            r3                                      ; r3-r6 are saved here and restored before ret.

+    push            r4

+    push            r5

+    push            r6

+    %assign push_num 4

+    %define p_total_coeffs r0

+    %define r_tmp r1

+    %define r_tmpd r1d

+    %define r_tmpb r1b

+    %define p_level r2

+    %define p_coeff_level r3

+    %define r_mask  r5

+    %define r_maskd r5d

+    %define p_run r6

+    %define p_shufb_lut wels_cavlc_param_cal_shufb_lut

+    %define p_run_lut   wels_cavlc_param_cal_run_lut

+    mov             p_coeff_level, arg1

+    mov             p_run, arg2

+    mov             p_level, arg3

+    mov             p_total_coeffs, arg4

+%elifdef WIN64

+    push            rbx                                     ; rbx holds the mask below; restored before ret.

+    %assign push_num 1

+    %define p_coeff_level r0

+    %define p_run r1

+    %define p_level r2

+    %define p_total_coeffs r3

+    %define r_mask  rbx

+    %define r_maskd ebx

+    %define p_shufb_lut r5

+    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))

+    lea             p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]

+    ; Free up rcx/ecx because only cl is accepted as shift amount operand.

+    mov             r6, r0

+    %undef p_coeff_level

+    %define p_coeff_level r6

+    %define r_tmp r0

+    %define r_tmpd r0d

+    %define r_tmpb r0b

+%else

+    %assign push_num 0

+    %define p_coeff_level r0

+    %define p_run r1

+    %define p_level r2

+    %define p_total_coeffs r3

+    %define r_mask  rax

+    %define r_maskd eax

+    %define p_shufb_lut r5

+    %define i_total_zeros r6

+    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))

+    lea             p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]

+%endif

+    ; Acquire a bitmask indicating which words are non-zero.

+    ; Assume p_coeff_level is 16-byte-aligned and at least 32 bytes if endIdx > 3.

+    ; Otherwise, assume 8 bytes available. Assume that input beyond endIdx is zero.

+    ; Assumptions are taken from previous implementations.

+    pxor            xmm1, xmm1                              ; xmm1 = 0.

+    cmp             i_endidxd, 3

+    jg              .load16                                 ; endIdx > 3: take the 16-coefficient path.

+    movq            xmm0, [p_coeff_level]                   ; Load 4 coefficients; the upper 8 bytes become zero.

+    packsswb        xmm0, xmm1                              ; Narrow words to bytes (signed saturation).

+    jmp             .load_done

+.load16:

+    movdqa          xmm0, [p_coeff_level]                   ; Load coefficients 0-7.

+    packsswb        xmm0, [p_coeff_level + 16]              ; Narrow all 16 words to bytes (signed saturation).

+.load_done:

+    movdqa          [p_run], xmm1                           ; Zero-initialize because we may read back implied zeros.

+    pcmpeqb         xmm0, xmm1                              ; 0FFh in every byte whose coefficient is zero.

+    pshufb          xmm0, [wels_shufb_rev]                  ; Reverse byte order: the highest index ends up in byte 0.

+    pmovmskb        r_maskd, xmm0                           ; Bit k is set when coefficient 15 - k is zero.

+    xor             r_maskd, 0FFFFh                         ; Invert: a set bit now marks a non-zero coefficient.

+%undef i_endidxd

+%define r_tmp2  r4

+%define r_tmp2d r4d

+    popcnt          r_tmp2d, r_maskd                        ; Number of non-zero coefficients; sets ZF when there are none.

+    mov             [p_total_coeffs], r_tmp2d

+    ; Recycle p_total_coeffs.

+%ifidni p_total_coeffs, rcx

+    %define r_tmp rcx

+    %define r_tmpd ecx

+    %define r_tmpb cl

+%else

+    %xdefine i_total_zeros p_total_coeffs

+%endif

+%undef p_total_coeffs

+    mov             i_total_zeros, r_tmp2                   ; mov leaves the flags produced by popcnt untouched.

+    jz              .done                                   ; No non-zero coefficient: the result is 0.

+    mov             i_total_zeros, 16

+    sub             i_total_zeros, r_tmp2                   ; 16 - number of non-zero coefficients.

+    bsf             r_tmpd, r_maskd                         ; Find first set bit.

+    sub             i_total_zeros, r_tmp                    ; Exclude the zeros that precede the first set bit.

+    ; Skip trailing zeros.

+    ; Restrict to multiples of 4 to retain alignment and avoid out-of-bound stores.

+    and             r_tmpd, -4                              ; Round the skip count down to a multiple of 4.

+    shr             r_maskd, r_tmpb                         ; Drop the skipped all-zero quadruples from the mask.

+    add             r_tmpd, r_tmpd                          ; Coefficients skipped -> bytes skipped (2 bytes each).

+    sub             p_coeff_level, r_tmp                    ; Rebase so that [p_coeff_level + 24] is the first quadruple to handle.

+    ; Handle first quadruple containing a non-zero value.

+    mov             r_tmp, r_mask

+    and             r_tmpd, 0Fh                             ; 4-bit mask of the current quadruple = LUT index.

+    movq            xmm0, [p_coeff_level + 24]              ; Load the quadruple's 4 coefficients.

+    movq            xmm1, [p_shufb_lut + 8 * r_tmp]         ; Shuffle vector that compacts the non-zero words.

+    pshufb          xmm0, xmm1

+    mov             r_tmp2d, [p_run_lut + 4 * r_tmp]        ; Four run bytes for this mask.

+    shr             r_tmp2d, 8                              ; Skip initial zero run.

+    movlps          [p_level], xmm0                         ; Store levels for the first quadruple.

+    mov             [p_run], r_tmp2d                        ; Store accompanying zero runs thus far.

+    shr             r_maskd, 4                              ; Move on to the next quadruple's mask bits.

+    jz              .done                                   ; No non-zero coefficients remain.

+.loop:

+    ; Increment pointers.

+    popcnt          r_tmpd, r_tmpd                          ; Number of non-zero values handled.

+    lea             p_level, [p_level + 2 * r_tmp]          ; Advance by the levels just stored (2 bytes each).

+    add             p_run, r_tmp                            ; Advance by the same number of run entries.

+    ; Handle next quadruple.

+    mov             r_tmp, r_mask

+    and             r_tmpd, 0Fh                             ; 4-bit mask of the current quadruple = LUT index.

+    movq            xmm0, [p_coeff_level + 16]              ; Load the quadruple's 4 coefficients.

+    sub             p_coeff_level, 8                        ; Step down one quadruple for the next iteration.

+    movq            xmm1, [p_shufb_lut + 8 * r_tmp]         ; Shuffle vector that compacts the non-zero words.

+    pshufb          xmm0, xmm1

+    movzx           r_tmp2d, byte [p_run - 1]               ; Run accumulated so far for the previous coefficient.

+    add             r_tmp2d, [p_run_lut + 4 * r_tmp]        ; Add to previous run and get eventual new runs.

+    movlps          [p_level], xmm0                         ; Store levels (potentially none).

+    mov             [p_run - 1], r_tmp2d                    ; Update previous run and store eventual new runs.

+    shr             r_maskd, 4                              ; Move on to the next quadruple's mask bits.

+    jnz             .loop                                   ; Continue while set bits remain.

+.done:

+%ifnidni retrq, i_total_zeros

+    mov             retrq, i_total_zeros                    ; Return value.

+%endif

+%ifdef X86_32

+    pop             r6

+    pop             r5

+    pop             r4

+    pop             r3

+%elifdef WIN64

+    pop             rbx

+%endif

+    ret

+%undef p_coeff_level

+%undef p_run

+%undef p_level

+%undef i_total_zeros

+%undef r_mask

+%undef r_maskd

+%undef r_tmp

+%undef r_tmpd

+%undef r_tmpb

+%undef r_tmp2

+%undef r_tmp2d

+%undef p_shufb_lut

+%undef p_run_lut

--- a/test/build/win32/codec_ut/codec_unittest.vcproj

+++ b/test/build/win32/codec_ut/codec_unittest.vcproj

@@ -391,6 +391,10 @@

 			Name="encoder"

 			<File

+				RelativePath="..\..\..\encoder\EncUT_Cavlc.cpp"

+				>

+			</File>

+			<File

 				RelativePath="..\..\..\encoder\EncUT_DecodeMbAux.cpp"

 			</File>

--- /dev/null

+++ b/test/encoder/EncUT_Cavlc.cpp

@@ -1,0 +1,90 @@

+#include "cpu.h"

+#include "macros.h"

+#include "set_mb_syn_cavlc.h"

+#include <gtest/gtest.h>

+#include <cmath>

+#include <cstddef>

+#include <cstdlib>

+using namespace WelsEnc;

+namespace {

+// Scalar reference model used to validate the optimized CavlcParamCal variants.

+// Walks pCoffLevel from iLastIndex down to 0. Every non-zero coefficient is

+// appended to pLevel, and the length of the zero run directly below it is

+// appended to pRun. The number of non-zero coefficients is written to

+// *pTotalCoeff; the sum of all stored runs is returned.

+int32_t CavlcParamCal_ref (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeff,

+                           int32_t iLastIndex) {

+  int32_t iPos = iLastIndex;

+  // Zeros above the highest-index non-zero coefficient are not counted.

+  for (; iPos >= 0 && pCoffLevel[iPos] == 0; --iPos) {

+  }

+  int32_t iNumCoeffs = 0;

+  int32_t iZeroSum = 0;

+  while (iPos >= 0) {

+    pLevel[iNumCoeffs] = pCoffLevel[iPos];

+    // Find the next non-zero coefficient below the current one (or run off the start).

+    int32_t iNext = iPos - 1;

+    while (iNext >= 0 && pCoffLevel[iNext] == 0)

+      --iNext;

+    const int32_t iRunLen = iPos - 1 - iNext;

+    pRun[iNumCoeffs] = iRunLen;

+    ++iNumCoeffs;

+    iZeroSum += iRunLen;

+    iPos = iNext;

+  }

+  *pTotalCoeff = iNumCoeffs;

+  return iZeroSum;

+}

+// Generates one block of 16 coefficients, runs both `func` and CavlcParamCal_ref

+// on it with the given endIdx and compares the results.

+//   allZero    - every coefficient is 0.

+//   allNonZero - every generated coefficient is non-zero.

+//   otherwise  - each coefficient is zeroed when a random bit is clear.

+// Coefficients above endIdx are forced to 0 only when endIdx > 7.

+void TestCavlcParamCalWithEndIdx (PCavlcParamCalFunc func, int endIdx, bool allZero, bool allNonZero) {

+  ENFORCE_STACK_ALIGN_1D(int16_t, coeffLevel, 16, 16);

+  ENFORCE_STACK_ALIGN_1D(int16_t, level, 16, 16);

+  ENFORCE_STACK_ALIGN_1D(uint8_t, run, 16, 16);

+  uint8_t run_ref[16];

+  int16_t level_ref[16];

+  int32_t totalCoeffs = 0;

+  int32_t totalCoeffs_ref = 0;

+  for (int i = 0; i < 16; i++) {

+    // std::rand() is only guaranteed to deliver 15 random bits (RAND_MAX may be

+    // 32767), while bits 0..16 are consumed below. Combine two draws into 30 bits;

+    // otherwise the random case degenerates to all-zero input on such platforms.

+    const int rHigh = std::rand() & 0x7FFF;

+    const int rLow = std::rand() & 0x7FFF;

+    const int r = rHigh << 15 | rLow;

+    if (allZero || (i > endIdx && endIdx > 7))

+      coeffLevel[i] = 0;

+    else if (allNonZero)

+      coeffLevel[i] = r % 0xFFFF - 0x8000 ? r % 0xFFFF - 0x8000 : 0x7FFF;

+    else

+      coeffLevel[i] = (r >> 16 & 1) * ((r & 0xFFFF) - 0x8000);

+  }

+  const int32_t totalZeros_ref = CavlcParamCal_ref (coeffLevel, run_ref, level_ref, &totalCoeffs_ref, endIdx);

+  const int32_t totalZeros = func (coeffLevel, run, level, &totalCoeffs, endIdx);

+  ASSERT_EQ (totalCoeffs, totalCoeffs_ref);

+  // The zero count is only compared when at least one coefficient is non-zero.

+  if (totalCoeffs > 0)

+    ASSERT_EQ (totalZeros, totalZeros_ref);

+  for (int i = 0; i < totalCoeffs_ref; i++)

+    ASSERT_EQ (level[i], level_ref[i]);

+  // NOTE(review): the run of the final coefficient is deliberately left unchecked here,

+  // presumably because implementations need not produce it - confirm.

+  for (int i = 0; i < totalCoeffs_ref - 1; i++)

+    ASSERT_EQ (run[i], run_ref[i]);

+}

+// Runs the comparison against the reference model 10000 times for each of the

+// end indices 3, 14 and 15. For every end index the first repetition uses

+// all-zero input and the second one all-non-zero input.

+void TestCavlcParamCal (PCavlcParamCalFunc func) {

+  static const int kEndIndices[] = { 3, 14, 15 };

+  static const std::size_t kNumEndIndices = sizeof (kEndIndices) / sizeof (kEndIndices[0]);

+  static const int kRepetitions = 10000;

+  for (std::size_t idx = 0; idx < kNumEndIndices; ++idx) {

+    const int endIdx = kEndIndices[idx];

+    for (int rep = 0; rep < kRepetitions; ++rep) {

+      const bool allZero = (rep == 0);

+      const bool allNonZero = (rep == 1);

+      TestCavlcParamCalWithEndIdx (func, endIdx, allZero, allNonZero);

+    }

+  }

+}

+} // anon ns.

+// Checks CavlcParamCal_c against the reference model.

+TEST (CavlcTest, CavlcParamCal_c) {

+  TestCavlcParamCal (CavlcParamCal_c);

+}

+// The SSE2 routine is only tested when X86_32_ASM is defined.

+#ifdef X86_32_ASM

+TEST (CavlcTest, CavlcParamCal_sse2) {

+  TestCavlcParamCal (CavlcParamCal_sse2);

+}

+#endif

+// The SSE4.2 routine is only exercised when the detected CPU flags include

+// WELS_CPU_SSE42; otherwise the test body does nothing and the test passes.

+#ifdef X86_ASM

+TEST (CavlcTest, CavlcParamCal_sse42) {

+  if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)

+    TestCavlcParamCal (CavlcParamCal_sse42);

+}

+#endif

--- a/test/encoder/targets.mk

+++ b/test/encoder/targets.mk

@@ -1,5 +1,6 @@

 ENCODER_UNITTEST_SRCDIR=test/encoder

 ENCODER_UNITTEST_CPP_SRCS=\

+	$(ENCODER_UNITTEST_SRCDIR)/EncUT_Cavlc.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_DecodeMbAux.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderExt.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMb.cpp\

--

⑨