ref: e9dc97803dfb11e3364f8b2cc6c67dd4ffa8f512
parent: 7d65687284a0ae8033a76a380c22b479bdf96d5a
parent: fb0b2b3f414fd68307043e50b747e23cb4d92498
author: ruil2 <ruil2@cisco.com>
date: Thu Apr 28 05:08:44 EDT 2016
Merge pull request #2447 from saamas/encoder-cavlcparamcal-sse42

[Encoder] Add an SSE4.2 implementation of CavlcParamCal
--- a/codec/encoder/core/inc/set_mb_syn_cavlc.h
+++ b/codec/encoder/core/inc/set_mb_syn_cavlc.h
@@ -75,9 +75,13 @@
extern "C" {
#endif//__cplusplus
+int32_t CavlcParamCal_c (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
+ int32_t iEndIdx);
#ifdef X86_ASM
int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
int32_t iEndIdx);
+int32_t CavlcParamCal_sse42 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
+ int32_t iEndIdx);
#endif
#if defined(__cplusplus)
--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
@@ -279,6 +279,11 @@
pFuncList->pfCavlcParamCal = CavlcParamCal_sse2;
}
#endif
+#ifdef X86_ASM
+ if (uiCpuFlag & WELS_CPU_SSE42) {
+ pFuncList->pfCavlcParamCal = CavlcParamCal_sse42;
+ }
+#endif
if (iEntropyCodingModeFlag) {
pFuncList->pfStashMBStatus = StashMBStatusCabac;
pFuncList->pfStashPopMBStatus = StashPopMBStatusCabac;
--- a/codec/encoder/core/x86/coeff.asm
+++ b/codec/encoder/core/x86/coeff.asm
@@ -42,10 +42,57 @@
%include "asm_inc.asm"
+SECTION .rodata align=16
+align 16
+wels_shufb_rev:
+ db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+; 4-bit table giving number of preceding zeros for each set bit as well as the
+; eventual next bit. For the case where all 4 bits are set, this requires 5
+; zeros. The 5th zero can either be read from beyond the final table entry or
+; implied via zero-initializing the location being read into.
+wels_cavlc_param_cal_run_lut:
+ db 4, 0, 0, 0
+ db 0, 3, 0, 0
+ db 1, 2, 0, 0
+ db 0, 0, 2, 0
+ db 2, 1, 0, 0
+ db 0, 1, 1, 0
+ db 1, 0, 1, 0
+ db 0, 0, 0, 1
+ db 3, 0, 0, 0
+ db 0, 2, 0, 0
+ db 1, 1, 0, 0
+ db 0, 0, 1, 0
+ db 2, 0, 0, 0
+ db 0, 1, 0, 0
+ db 1, 0, 0, 0
+ db 0, 0, 0, 0
+; db 0
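
The construction rule described in the comment above can be made concrete with a small throwaway generator. The sketch below is illustrative only and not part of the patch: for a 4-bit mask, byte j of the entry holds the count of zeros immediately preceding the j-th set bit (scanning from bit 0 upward), and the count of zeros after the last set bit spills into the next byte, the "eventual next" value; for mask 15 that is the implied fifth zero.

#include <cstdint>
#include <cstdio>

int main() {
  for (int mask = 0; mask < 16; ++mask) {
    uint8_t entry[5] = { 0, 0, 0, 0, 0 };   // room for the implied fifth byte
    int out = 0, zeros = 0;
    for (int bit = 0; bit < 4; ++bit) {
      if (mask >> bit & 1) {
        entry[out++] = (uint8_t) zeros;     // zeros preceding this set bit
        zeros = 0;
      } else {
        ++zeros;
      }
    }
    entry[out] = (uint8_t) zeros;           // zeros after the last set bit
    std::printf ("db %d, %d, %d, %d\n", entry[0], entry[1], entry[2], entry[3]);
  }
  return 0;
}

Each printed line matches the corresponding row of wels_cavlc_param_cal_run_lut.
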
+; 4-bit table giving pshufb vectors for compacting 4-word vectors by removing
+; the words that match zero bits and concatenating in reverse order.
+wels_cavlc_param_cal_shufb_lut:
+ db 0, 0, 0, 0, 0, 0, 0, 0
+ db 6, 7, 0, 0, 0, 0, 0, 0
+ db 4, 5, 0, 0, 0, 0, 0, 0
+ db 6, 7, 4, 5, 0, 0, 0, 0
+ db 2, 3, 0, 0, 0, 0, 0, 0
+ db 6, 7, 2, 3, 0, 0, 0, 0
+ db 4, 5, 2, 3, 0, 0, 0, 0
+ db 6, 7, 4, 5, 2, 3, 0, 0
+ db 0, 1, 0, 0, 0, 0, 0, 0
+ db 6, 7, 0, 1, 0, 0, 0, 0
+ db 4, 5, 0, 1, 0, 0, 0, 0
+ db 6, 7, 4, 5, 0, 1, 0, 0
+ db 2, 3, 0, 1, 0, 0, 0, 0
+ db 6, 7, 2, 3, 0, 1, 0, 0
+ db 4, 5, 2, 3, 0, 1, 0, 0
+ db 6, 7, 4, 5, 2, 3, 0, 1
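
The pshufb control vectors above follow an equally mechanical rule: for every set bit b of the 4-bit mask, taken from the least significant bit upward, append the byte pair selecting word 3 - b of the source quadruple (the mask is bit-reversed, so bit 0 names the highest coefficient of the group), then pad with zeros. A sketch of that rule, again purely illustrative and not part of the patch:

#include <cstdio>

int main() {
  for (int mask = 0; mask < 16; ++mask) {
    int entry[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
    int out = 0;
    for (int bit = 0; bit < 4; ++bit) {
      if (mask >> bit & 1) {
        entry[out++] = 2 * (3 - bit);       // low byte of word 3 - bit
        entry[out++] = 2 * (3 - bit) + 1;   // high byte of word 3 - bit
      }
    }
    std::printf ("db %d, %d, %d, %d, %d, %d, %d, %d\n",
                 entry[0], entry[1], entry[2], entry[3],
                 entry[4], entry[5], entry[6], entry[7]);
  }
  return 0;
}
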
+
+
%ifdef X86_32
-SECTION .rodata align=16
align 16
sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8
@@ -312,6 +359,8 @@
db 7,6,5,4,3,2,1,7, ;254
db 7,6,5,4,3,2,1,8, ;255
+%endif ; X86_32
+
;***********************************************************************
; Code
;***********************************************************************
@@ -318,6 +367,7 @@
SECTION .text
+%ifdef X86_32
;***********************************************************************
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
@@ -457,3 +507,166 @@
pop ebx
ret
%endif
+
+;***********************************************************************
+;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
+;***********************************************************************
+
+WELS_EXTERN CavlcParamCal_sse42
+%define i_endidxd dword arg5d
+
+%ifdef X86_32
+ push r3
+ push r4
+ push r5
+ push r6
+ %assign push_num 4
+ %define p_total_coeffs r0
+ %define r_tmp r1
+ %define r_tmpd r1d
+ %define r_tmpb r1b
+ %define p_level r2
+ %define p_coeff_level r3
+ %define r_mask r5
+ %define r_maskd r5d
+ %define p_run r6
+ %define p_shufb_lut wels_cavlc_param_cal_shufb_lut
+ %define p_run_lut wels_cavlc_param_cal_run_lut
+ mov p_coeff_level, arg1
+ mov p_run, arg2
+ mov p_level, arg3
+ mov p_total_coeffs, arg4
+%elifdef WIN64
+ push rbx
+ %assign push_num 1
+ %define p_coeff_level r0
+ %define p_run r1
+ %define p_level r2
+ %define p_total_coeffs r3
+ %define r_mask rbx
+ %define r_maskd ebx
+ %define p_shufb_lut r5
+ %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
+ lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
+ ; Free up rcx/ecx because only cl is accepted as shift amount operand.
+ mov r6, r0
+ %undef p_coeff_level
+ %define p_coeff_level r6
+ %define r_tmp r0
+ %define r_tmpd r0d
+ %define r_tmpb r0b
+%else
+ %assign push_num 0
+ %define p_coeff_level r0
+ %define p_run r1
+ %define p_level r2
+ %define p_total_coeffs r3
+ %define r_mask rax
+ %define r_maskd eax
+ %define p_shufb_lut r5
+ %define i_total_zeros r6
+ %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
+ lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
+%endif
+
+ ; Acquire a bitmask indicating which words are non-zero.
+ ; Assume p_coeff_level is 16-byte-aligned and at least 32 bytes if endIdx > 3.
+ ; Otherwise, assume 8 bytes available. Assume that input beyond endIdx is zero.
+ ; Assumptions are taken from previous implementations.
+ pxor xmm1, xmm1
+ cmp i_endidxd, 3
+ jg .load16
+ movq xmm0, [p_coeff_level]
+ packsswb xmm0, xmm1
+ jmp .load_done
+.load16:
+ movdqa xmm0, [p_coeff_level]
+ packsswb xmm0, [p_coeff_level + 16]
+.load_done:
+ movdqa [p_run], xmm1 ; Zero-initialize because we may read back implied zeros.
+ pcmpeqb xmm0, xmm1
+ pshufb xmm0, [wels_shufb_rev]
+ pmovmskb r_maskd, xmm0
+ xor r_maskd, 0FFFFh
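
For readers who prefer intrinsics, the mask acquisition just performed corresponds roughly to the sketch below. The helper name is invented and the code is not part of the patch; it carries the same assumptions stated in the comment above (16-byte alignment, 32 readable bytes when endIdx > 3, 8 otherwise, zeros beyond endIdx).

#include <emmintrin.h>   // SSE2
#include <tmmintrin.h>   // SSSE3, for _mm_shuffle_epi8
#include <cstdint>

static uint32_t NonZeroMaskReversed (const int16_t* coeffLevel, int endIdx) {
  const __m128i zero = _mm_setzero_si128 ();
  __m128i bytes;
  if (endIdx > 3) {
    const __m128i lo = _mm_load_si128 ((const __m128i*) coeffLevel);
    const __m128i hi = _mm_load_si128 ((const __m128i*) (coeffLevel + 8));
    bytes = _mm_packs_epi16 (lo, hi);            // 16 words -> 16 saturated bytes
  } else {
    bytes = _mm_packs_epi16 (_mm_loadl_epi64 ((const __m128i*) coeffLevel), zero);
  }
  const __m128i rev = _mm_setr_epi8 (15, 14, 13, 12, 11, 10, 9, 8,
                                     7, 6, 5, 4, 3, 2, 1, 0);
  __m128i eq0 = _mm_cmpeq_epi8 (bytes, zero);    // 0xFF where the word was zero
  eq0 = _mm_shuffle_epi8 (eq0, rev);             // bit 0 ends up at coefficient 15
  return _mm_movemask_epi8 (eq0) ^ 0xFFFF;       // set bits mark non-zero words
}
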
+%undef i_endidxd
+%define r_tmp2 r4
+%define r_tmp2d r4d
+ popcnt r_tmp2d, r_maskd
+ mov [p_total_coeffs], r_tmp2d
+ ; Recycle p_total_coeffs.
+%ifidni p_total_coeffs, rcx
+ %define r_tmp rcx
+ %define r_tmpd ecx
+ %define r_tmpb cl
+%else
+ %xdefine i_total_zeros p_total_coeffs
+%endif
+%undef p_total_coeffs
+ mov i_total_zeros, r_tmp2
+ jz .done
+ mov i_total_zeros, 16
+ sub i_total_zeros, r_tmp2
+ bsf r_tmpd, r_maskd ; Find first set bit.
+ sub i_total_zeros, r_tmp
+ ; Skip trailing zeros.
+ ; Restrict to multiples of 4 to retain alignment and avoid out-of-bound stores.
+ and r_tmpd, -4
+ shr r_maskd, r_tmpb
+ add r_tmpd, r_tmpd
+ sub p_coeff_level, r_tmp
+ ; Handle first quadruple containing a non-zero value.
+ mov r_tmp, r_mask
+ and r_tmpd, 0Fh
+ movq xmm0, [p_coeff_level + 24]
+ movq xmm1, [p_shufb_lut + 8 * r_tmp]
+ pshufb xmm0, xmm1
+ mov r_tmp2d, [p_run_lut + 4 * r_tmp]
+ shr r_tmp2d, 8 ; Skip initial zero run.
+ movlps [p_level], xmm0 ; Store levels for the first quadruple.
+ mov [p_run], r_tmp2d ; Store accompanying zero runs thus far.
+ shr r_maskd, 4
+ jz .done
+.loop:
+ ; Increment pointers.
+ popcnt r_tmpd, r_tmpd ; Number of non-zero values handled.
+ lea p_level, [p_level + 2 * r_tmp]
+ add p_run, r_tmp
+ ; Handle next quadruple.
+ mov r_tmp, r_mask
+ and r_tmpd, 0Fh
+ movq xmm0, [p_coeff_level + 16]
+ sub p_coeff_level, 8
+ movq xmm1, [p_shufb_lut + 8 * r_tmp]
+ pshufb xmm0, xmm1
+ movzx r_tmp2d, byte [p_run - 1]
+ add r_tmp2d, [p_run_lut + 4 * r_tmp] ; Add to previous run and get eventual new runs.
+ movlps [p_level], xmm0 ; Store levels (potentially none).
+ mov [p_run - 1], r_tmp2d ; Update previous run and store eventual new runs.
+ shr r_maskd, 4
+ jnz .loop
+.done:
+%ifnidni retrq, i_total_zeros
+ mov retrq, i_total_zeros
+%endif
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+ pop r3
+%elifdef WIN64
+ pop rbx
+%endif
+ ret
+%undef p_coeff_level
+%undef p_run
+%undef p_level
+%undef i_total_zeros
+%undef r_mask
+%undef r_maskd
+%undef r_tmp
+%undef r_tmpd
+%undef r_tmpb
+%undef r_tmp2
+%undef r_tmp2d
+%undef p_shufb_lut
+%undef p_run_lut
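
Taken as a whole, the routine above amounts to the scalar outline below. It is a sketch under the stated assumptions rather than a drop-in replacement: the helper name is invented, __builtin_popcount and __builtin_ctz are GCC/Clang builtins used for brevity, and the final run entry is left only partially accumulated because, exactly as in the assembly, the caller never consumes it (the unit test added below compares only the first totalCoeffs - 1 runs).

#include <cstdint>

static int32_t CavlcParamCalOutline (const int16_t* coeff, uint8_t* run,
                                     int16_t* level, int32_t* totalCoeffs) {
  // Bit 0 of the mask corresponds to coefficient 15, so scanning from the
  // least significant bit walks the coefficients from high index to low,
  // just like the SIMD code.
  uint32_t mask = 0;
  for (int i = 0; i < 16; ++i)
    mask |= (uint32_t) (coeff[i] != 0) << (15 - i);
  *totalCoeffs = __builtin_popcount (mask);
  if (*totalCoeffs == 0)
    return 0;
  const int zerosAbove = __builtin_ctz (mask);       // zero coefficients above the top level
  const int32_t totalZeros = 16 - *totalCoeffs - zerosAbove;
  int idx = 15 - (zerosAbove & ~3);                  // top index of the first kept quadruple
  mask >>= zerosAbove & ~3;                          // skip whole all-zero quadruples only
  int out = 0, pendingRun = 0;
  bool haveLevel = false;
  while (mask) {
    for (int b = 0; b < 4; ++b, --idx) {             // one group of four coefficients
      if (mask >> b & 1) {
        if (haveLevel)
          run[out - 1] = (uint8_t) pendingRun;       // close the run below the previous level
        level[out++] = coeff[idx];
        pendingRun = 0;
        haveLevel = true;
      } else if (haveLevel) {
        ++pendingRun;                                // another zero below the last level
      }
    }
    mask >>= 4;
  }
  run[out - 1] = (uint8_t) pendingRun;               // last entry; treated as a don't-care
  return totalZeros;
}
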
--- a/test/build/win32/codec_ut/codec_unittest.vcproj
+++ b/test/build/win32/codec_ut/codec_unittest.vcproj
@@ -391,6 +391,10 @@
Name="encoder"
>
<File
+ RelativePath="..\..\..\encoder\EncUT_Cavlc.cpp"
+ >
+ </File>
+ <File
RelativePath="..\..\..\encoder\EncUT_DecodeMbAux.cpp"
>
</File>
--- /dev/null
+++ b/test/encoder/EncUT_Cavlc.cpp
@@ -0,0 +1,90 @@
+#include "cpu.h"
+#include "macros.h"
+#include "set_mb_syn_cavlc.h"
+#include <gtest/gtest.h>
+#include <cmath>
+#include <cstddef>
+
+using namespace WelsEnc;
+
+namespace {
+
+int32_t CavlcParamCal_ref (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeff,
+ int32_t iLastIndex) {
+ int32_t iTotalZeros = 0;
+ int32_t iTotalCoeffs = 0;
+
+ while (iLastIndex >= 0 && pCoffLevel[iLastIndex] == 0) {
+ -- iLastIndex;
+ }
+
+ while (iLastIndex >= 0) {
+ int32_t iCountZero = 0;
+ pLevel[iTotalCoeffs] = pCoffLevel[iLastIndex--];
+
+ while (iLastIndex >= 0 && pCoffLevel[iLastIndex] == 0) {
+ ++ iCountZero;
+ -- iLastIndex;
+ }
+ iTotalZeros += iCountZero;
+ pRun[iTotalCoeffs++] = iCountZero;
+ }
+ *pTotalCoeff = iTotalCoeffs;
+ return iTotalZeros;
+}
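
As a concrete spot check of the reference semantics, here is a hand-worked case; this test is illustrative only and not part of the patch. With coefficients {0, 3, 0, -2, 5, 0, ..., 0} and iLastIndex = 15, the scan from the top finds levels 5, -2, 3 at indices 4, 3 and 1, runs of 0, 1 and 1, three coefficients in total, and 2 zeros below the highest non-zero coefficient.

TEST (CavlcTest, CavlcParamCal_ref_worked_example) {
  int16_t coeff[16] = { 0, 3, 0, -2, 5 };   // indices 5..15 value-initialize to 0
  uint8_t run[16];
  int16_t level[16];
  int32_t totalCoeffs = 0;
  const int32_t totalZeros = CavlcParamCal_ref (coeff, run, level, &totalCoeffs, 15);
  ASSERT_EQ (totalCoeffs, 3);
  ASSERT_EQ (totalZeros, 2);     // zeros below the highest non-zero coefficient
  ASSERT_EQ (level[0], 5);       // levels come out in high-to-low index order
  ASSERT_EQ (level[1], -2);
  ASSERT_EQ (level[2], 3);
  ASSERT_EQ (run[0], 0);         // zeros between each level and the next one down
  ASSERT_EQ (run[1], 1);
  ASSERT_EQ (run[2], 1);
}
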
+
+void TestCavlcParamCalWithEndIdx (PCavlcParamCalFunc func, int endIdx, bool allZero, bool allNonZero) {
+ ENFORCE_STACK_ALIGN_1D (int16_t, coeffLevel, 16, 16);
+ ENFORCE_STACK_ALIGN_1D (int16_t, level, 16, 16);
+ ENFORCE_STACK_ALIGN_1D (uint8_t, run, 16, 16);
+ uint8_t run_ref[16];
+ int16_t level_ref[16];
+ int32_t totalCoeffs = 0;
+ int32_t totalCoeffs_ref = 0;
+ for (int i = 0; i < 16; i++) {
+ const int r = std::rand();
+ if (allZero || (i > endIdx && endIdx > 7))
+ coeffLevel[i] = 0;
+ else if (allNonZero)
+ coeffLevel[i] = r % 0xFFFF - 0x8000 ? r % 0xFFFF - 0x8000 : 0x7FFF;
+ else
+ coeffLevel[i] = (r >> 16 & 1) * ((r & 0xFFFF) - 0x8000);
+ }
+ const int32_t totalZeros_ref = CavlcParamCal_ref (coeffLevel, run_ref, level_ref, &totalCoeffs_ref, endIdx);
+ const int32_t totalZeros = func (coeffLevel, run, level, &totalCoeffs, endIdx);
+ ASSERT_EQ (totalCoeffs, totalCoeffs_ref);
+ if (totalCoeffs > 0)
+ ASSERT_EQ (totalZeros, totalZeros_ref);
+ for (int i = 0; i < totalCoeffs_ref; i++)
+ ASSERT_EQ (level[i], level_ref[i]);
+ for (int i = 0; i < totalCoeffs_ref - 1; i++)
+ ASSERT_EQ (run[i], run_ref[i]);
+}
+
+void TestCavlcParamCal (PCavlcParamCalFunc func) {
+ const int endIdxes[] = { 3, 14, 15 };
+ const int num_test_repetitions = 10000;
+ for (std::size_t i = 0; i < sizeof endIdxes / sizeof *endIdxes; i++) {
+ for (int count = 0; count < num_test_repetitions; count++)
+ TestCavlcParamCalWithEndIdx (func, endIdxes[i], count == 0, count == 1);
+ }
+}
+
+} // anon ns.
+
+TEST (CavlcTest, CavlcParamCal_c) {
+ TestCavlcParamCal (CavlcParamCal_c);
+}
+
+#ifdef X86_32_ASM
+TEST (CavlcTest, CavlcParamCal_sse2) {
+ TestCavlcParamCal (CavlcParamCal_sse2);
+}
+#endif
+
+#ifdef X86_ASM
+TEST (CavlcTest, CavlcParamCal_sse42) {
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
+ TestCavlcParamCal (CavlcParamCal_sse42);
+}
+#endif
--- a/test/encoder/targets.mk
+++ b/test/encoder/targets.mk
@@ -1,5 +1,6 @@
ENCODER_UNITTEST_SRCDIR=test/encoder
ENCODER_UNITTEST_CPP_SRCS=\
+ $(ENCODER_UNITTEST_SRCDIR)/EncUT_Cavlc.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_DecodeMbAux.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderExt.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMb.cpp\