ref: bb49e23719b29955c314359b81d0faf48df2e026
parent: 1e83bec860486cbaef6787ffa0c91a3475a6e9b6
author: Sindre Aamås <saamas@cisco.com>
date: Wed Apr 13 04:42:40 EDT 2016
[Encoder] Add AVX2 4x4 quantization routines: WelsQuantFour4x4Max_avx2 (~2.06x speedup over SSE2), WelsQuantFour4x4_avx2 (~2.32x speedup over SSE2), WelsQuant4x4Dc_avx2 (~1.49x speedup over SSE2), WelsQuant4x4_avx2 (~1.42x speedup over SSE2)
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -657,3 +657,8 @@
vpsrlw %1, %1, 15
vpsllw %1, %1, 5
%endmacro
+
+; Fill every 16-bit lane of vector register %1 with 7FFFh (32767).
+; %1 is write-only as far as the caller is concerned: its previous contents
+; do not influence the result, and no other register is touched.
+%macro WELS_DW32767_VEX 1
+ vpcmpeqw %1, %1, %1 ; a register always compares equal to itself -> all bits set
+ vpsrlw %1, %1, 1 ; logical right shift clears bit 15 of each word -> 7FFFh
+%endmacro
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -106,6 +106,11 @@
void WelsQuantFour4x4_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
void WelsQuantFour4x4Max_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
+// AVX2 counterparts of the 4x4 quantization routines (implemented in assembly).
+// They are installed in the function table only when WELS_CPU_AVX2 is reported.
+void WelsQuant4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
+void WelsQuant4x4Dc_avx2 (int16_t* pDct, int16_t iFF, int16_t iMF);
+void WelsQuantFour4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
+void WelsQuantFour4x4Max_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
+
#endif
#ifdef HAVE_NEON
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -526,6 +526,11 @@
if (uiCpuFlag & WELS_CPU_AVX2) {
pFuncList->pfDctT4 = WelsDctT4_avx2;
pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;
+
+ pFuncList->pfQuantization4x4 = WelsQuant4x4_avx2;
+ pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_avx2;
+ pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_avx2;
+ pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_avx2;
}
//#endif//MACOS
--- a/codec/encoder/core/x86/quant.asm
+++ b/codec/encoder/core/x86/quant.asm
@@ -368,3 +368,137 @@
punpcklqdq xmm2, xmm3
MOVDQ [r0+16], xmm2
ret
+
+
+; Quantize packed signed 16-bit coefficients. Per 16-bit lane:
+;   %2 = (unsigned_saturate(|%1| + %3) * %4) >> 16   (unsigned arithmetic)
+;   %1 = %2 carrying the sign of the original %1
+; A zero input is treated as positive: OR-ing with 7FFFh keeps the sign bit
+; but makes the lane non-zero, so vpsignw cannot force the result to zero.
+; data=%1 (in/out) abs_out=%2 (out) ff=%3 mf=%4 7FFFh=%5 (%3, %4, %5 are only read)
+%macro AVX2_Quant 5
+ vpabsw %2, %1 ; take the magnitude before %1 is modified below
+ vpor %1, %1, %5 ; ensure non-zero before vpsignw
+ vpaddusw %2, %2, %3 ; add ff with unsigned saturation
+ vpmulhuw %2, %2, %4 ; keep the high 16 bits of the unsigned 16x16 product
+ vpsignw %1, %2, %1 ; negate the lanes whose input was negative
+%endmacro
+
+
+;***********************************************************************
+; void WelsQuant4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
+;
+; Quantizes the 16 coefficients (32 bytes) at pDct in place with AVX2_Quant.
+; Eight 16-bit words are read from each of pFF and pMF and replicated into
+; both 128-bit lanes, so the same eight values serve coefficients 0-7 and
+; coefficients 8-15.
+; After LOAD_3_PARA (defined outside this file section) r0/r1/r2 are used
+; as pDct/pFF/pMF. Vector registers used: ymm0-ymm4.
+;***********************************************************************
+
+WELS_EXTERN WelsQuant4x4_avx2
+    %assign push_num 0
+    LOAD_3_PARA
+    PUSH_XMM 5
+    vmovdqu         ymm0, [r0]                      ; coefficients
+    WELS_DW32767_VEX ymm1                           ; 7FFFh in every word
+    vbroadcasti128  ymm2, [r1]                      ; ff
+    vbroadcasti128  ymm3, [r2]                      ; mf
+    AVX2_Quant      ymm0, ymm4, ymm2, ymm3, ymm1
+    vmovdqu         [r0], ymm0                      ; write the result back in place
+    vzeroupper
+    POP_XMM
+    ret
+
+
+;***********************************************************************
+; void WelsQuant4x4Dc_avx2 (int16_t* pDct, int16_t iFF, int16_t iMF);
+;
+; Quantizes the 16 coefficients (32 bytes) at pDct in place with AVX2_Quant,
+; using one ff value and one mf value broadcast to every 16-bit lane.
+; Only pDct is loaded by LOAD_1_PARA (into r0); the two scalars are taken
+; straight from arg2/arg3. The AVX2 form of vpbroadcastw needs an xmm or
+; memory source, so when argN is a general-purpose register it is first
+; copied into an xmm register with vmovd.
+; NOTE(review): in the %else branches arg2/arg3 are presumably memory
+; operands supplied by the project's include file - confirm there.
+; Vector registers used: ymm0-ymm4.
+;***********************************************************************
+
+WELS_EXTERN WelsQuant4x4Dc_avx2
+    %assign push_num 0
+    LOAD_1_PARA
+    PUSH_XMM 5
+    vmovdqu         ymm0, [r0]                      ; coefficients
+    WELS_DW32767_VEX ymm1                           ; 7FFFh in every word
+%ifidni r1, arg2
+    vmovd           xmm2, arg2d
+    vpbroadcastw    ymm2, xmm2                      ; ff in every word
+%else
+    vpbroadcastw    ymm2, arg2                      ; ff in every word
+%endif
+%ifidni r2, arg3
+    vmovd           xmm3, arg3d
+    vpbroadcastw    ymm3, xmm3                      ; mf in every word
+%else
+    vpbroadcastw    ymm3, arg3                      ; mf in every word
+%endif
+    AVX2_Quant      ymm0, ymm4, ymm2, ymm3, ymm1
+    vmovdqu         [r0], ymm0                      ; write the result back in place
+    vzeroupper
+    POP_XMM
+    ret
+
+
+;***********************************************************************
+; void WelsQuantFour4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
+;
+; Quantizes four consecutive 16-coefficient blocks (4 x 32 bytes) at pDct in
+; place with AVX2_Quant. Eight 16-bit words are read from each of pFF and
+; pMF and replicated into both 128-bit lanes; the same values are applied
+; to all four blocks.
+; The blocks are handled two at a time; the %rep below expands to the two
+; identical load / quantize / store groups at byte offsets 0x00 and 0x40.
+; Vector registers used: ymm0-ymm5.
+;***********************************************************************
+
+WELS_EXTERN WelsQuantFour4x4_avx2
+    %assign push_num 0
+    LOAD_3_PARA
+    PUSH_XMM 6
+    vbroadcasti128  ymm0, [r1]                      ; ff
+    vbroadcasti128  ymm1, [r2]                      ; mf
+    WELS_DW32767_VEX ymm4                           ; 7FFFh in every word
+%assign quant_pair_off 0
+%rep 2
+    vmovdqu         ymm3, [r0 + quant_pair_off + 0x00]
+    vmovdqu         ymm5, [r0 + quant_pair_off + 0x20]
+    AVX2_Quant      ymm3, ymm2, ymm0, ymm1, ymm4
+    vmovdqu         [r0 + quant_pair_off + 0x00], ymm3
+    AVX2_Quant      ymm5, ymm2, ymm0, ymm1, ymm4
+    vmovdqu         [r0 + quant_pair_off + 0x20], ymm5
+%assign quant_pair_off quant_pair_off + 0x40
+%endrep
+%undef quant_pair_off
+    vzeroupper
+    POP_XMM
+    ret
+
+
+;***********************************************************************
+; void WelsQuantFour4x4Max_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
+;
+; Quantizes four consecutive 16-coefficient blocks (4 x 32 bytes) at pDct in
+; place with AVX2_Quant and stores four 16-bit words at pMax: for each block,
+; in order, the largest quantized magnitude (the abs_out value of AVX2_Quant,
+; i.e. before the sign is re-applied).
+; NOTE(review): the maximum is found with signed compares and a 7FFFh bit
+; flip, which assumes quantized magnitudes never exceed 7FFFh - confirm
+; against the valid ff/mf ranges.
+;***********************************************************************
+
+WELS_EXTERN WelsQuantFour4x4Max_avx2
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 7
+ vbroadcasti128 ymm0, [r1] ; ff: 8 words replicated into both 128-bit lanes
+ vbroadcasti128 ymm1, [r2] ; mf: 8 words replicated into both 128-bit lanes
+ WELS_DW32767_VEX ymm6 ; 7FFFh per word; also used below as the bit-flip mask
+ ; Blocks 0 and 1: quantize in place, magnitudes are left in ymm2 / ymm3.
+ vmovdqu ymm4, [r0 + 0x00]
+ vmovdqu ymm5, [r0 + 0x20]
+ AVX2_Quant ymm4, ymm2, ymm0, ymm1, ymm6
+ vmovdqu [r0 + 0x00], ymm4
+ AVX2_Quant ymm5, ymm3, ymm0, ymm1, ymm6
+ vmovdqu [r0 + 0x20], ymm5
+ ; Pair the low halves and the high halves of both blocks, then take the
+ ; element-wise max: low lane = 8 candidates of block 0, high lane = block 1.
+ vperm2i128 ymm4, ymm2, ymm3, 00100000b ; {ymm2.low, ymm3.low}
+ vperm2i128 ymm3, ymm2, ymm3, 00110001b ; {ymm2.high, ymm3.high}
+ vpmaxsw ymm2, ymm4, ymm3
+ ; Blocks 2 and 3: same steps, magnitudes are left in ymm3 / ymm4.
+ vmovdqu ymm4, [r0 + 0x40]
+ vmovdqu ymm5, [r0 + 0x60]
+ AVX2_Quant ymm4, ymm3, ymm0, ymm1, ymm6
+ vmovdqu [r0 + 0x40], ymm4
+ AVX2_Quant ymm5, ymm4, ymm0, ymm1, ymm6 ; ymm4 was stored above, so it can now receive the magnitudes
+ vmovdqu [r0 + 0x60], ymm5
+ vperm2i128 ymm5, ymm3, ymm4, 00100000b ; {ymm3.low, ymm4.low}
+ vperm2i128 ymm4, ymm3, ymm4, 00110001b ; {ymm3.high, ymm4.high}
+ vpmaxsw ymm3, ymm5, ymm4 ; low lane = block 2 candidates, high lane = block 3
+ vpxor ymm2, ymm2, ymm6 ; flip bits so as to enable use of vphminposuw to find max value.
+ vpxor ymm3, ymm3, ymm6 ; flip bits so as to enable use of vphminposuw to find max value.
+ vextracti128 xmm4, ymm2, 1 ; block 1 candidates
+ vextracti128 xmm5, ymm3, 1 ; block 3 candidates
+ ; vphminposuw leaves the minimum in word 0 and its index in word 1.
+ vphminposuw xmm2, xmm2 ; block 0
+ vphminposuw xmm3, xmm3 ; block 2
+ vphminposuw xmm4, xmm4 ; block 1
+ vphminposuw xmm5, xmm5 ; block 3
+ ; Interleave so that words 0-3 of xmm2 hold the results of blocks 0,1,2,3.
+ vpunpcklwd xmm2, xmm2, xmm4
+ vpunpcklwd xmm3, xmm3, xmm5
+ vpunpckldq xmm2, xmm2, xmm3
+ vpxor xmm2, xmm2, xmm6 ; restore non-flipped values.
+ vmovq [r3], xmm2 ; store max values (only the low four words are written).
+ vzeroupper
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -435,6 +435,22 @@
TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) {
TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_sse2);
}
+TEST (EncodeMbAuxTest, WelsQuant4x4_avx2) {
+  // Nothing is exercised (and the test passes) when the CPU does not report AVX2.
+  const bool bAvx2Available = (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) != 0;
+  if (!bAvx2Available)
+    return;
+  TestWelsQuant4x4 (WelsQuant4x4_avx2);
+}
+TEST (EncodeMbAuxTest, WelsQuant4x4Dc_avx2) {
+  // Nothing is exercised (and the test passes) when the CPU does not report AVX2.
+  const bool bAvx2Available = (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) != 0;
+  if (!bAvx2Available)
+    return;
+  TestWelsQuant4x4Dc (WelsQuant4x4Dc_avx2);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4_avx2) {
+  // Nothing is exercised (and the test passes) when the CPU does not report AVX2.
+  const bool bAvx2Available = (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) != 0;
+  if (!bAvx2Available)
+    return;
+  TestWelsQuantFour4x4 (WelsQuantFour4x4_avx2);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_avx2) {
+  // Nothing is exercised (and the test passes) when the CPU does not report AVX2.
+  const bool bAvx2Available = (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) != 0;
+  if (!bAvx2Available)
+    return;
+  TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_avx2);
+}
#endif
int32_t WelsHadamardQuant2x2SkipAnchor (int16_t* rs, int16_t ff, int16_t mf) {
int16_t pDct[4], s[4];