ref: 7d65687284a0ae8033a76a380c22b479bdf96d5a
parent: 56618249d7ba939399679b6c5fd0363b520528e6
parent: bb49e23719b29955c314359b81d0faf48df2e026
author: ruil2 <ruil2@cisco.com>
date: Thu Apr 28 05:08:31 EDT 2016
Merge pull request #2441 from saamas/encoder-add-avx2-4x4-quantization-routines [Encoder] Add AVX2 4x4 quantization routines
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -657,3 +657,8 @@
vpsrlw %1, %1, 15
vpsllw %1, %1, 5
%endmacro
+; WELS_DW32767_VEX reg: fills every 16-bit word of reg with 7FFFh (32767) without touching memory.
+%macro WELS_DW32767_VEX 1
+ vpcmpeqw %1, %1, %1 ; a register always equals itself, so every word becomes FFFFh
+ vpsrlw %1, %1, 1 ; logical right shift by one clears the top bit of each word, giving 7FFFh
+%endmacro
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -106,6 +106,11 @@
void WelsQuantFour4x4_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
void WelsQuantFour4x4Max_sse2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
+void WelsQuant4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
+void WelsQuant4x4Dc_avx2 (int16_t* pDct, int16_t iFF, int16_t iMF);
+void WelsQuantFour4x4_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
+void WelsQuantFour4x4Max_avx2 (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
+
#endif
#ifdef HAVE_NEON
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -526,6 +526,11 @@
if (uiCpuFlag & WELS_CPU_AVX2) {
pFuncList->pfDctT4 = WelsDctT4_avx2;
pFuncList->pfDctFourT4 = WelsDctFourT4_avx2;
+
+ pFuncList->pfQuantization4x4 = WelsQuant4x4_avx2;
+ pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_avx2;
+ pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_avx2;
+ pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_avx2;
}
//#endif//MACOS
--- a/codec/encoder/core/x86/quant.asm
+++ b/codec/encoder/core/x86/quant.asm
@@ -368,3 +368,137 @@
punpcklqdq xmm2, xmm3
MOVDQ [r0+16], xmm2
ret
+
+; AVX2_Quant: per 16-bit word, data = (((abs(data) + ff) * mf) >> 16) carrying the sign of the original data; the add saturates at FFFFh.
+; data=%1 (in: coefficients, out: signed result) abs_out=%2 (out: unsigned quantized magnitudes) ff=%3 mf=%4 7FFFh=%5
+%macro AVX2_Quant 5
+ vpabsw %2, %1 ; abs_out = absolute value of each coefficient
+ vpor %1, %1, %5 ; ensure non-zero before vpsignw: OR with 7FFFh keeps the sign bit, so zero inputs are not forced to a zero output
+ vpaddusw %2, %2, %3 ; abs_out += ff with unsigned saturation
+ vpmulhuw %2, %2, %4 ; abs_out = high 16 bits of the unsigned product with mf, i.e. the >> 16
+ vpsignw %1, %2, %1 ; data = abs_out, negated in the words where the input was negative
+%endmacro
+
+; Quantizes the 16 int16 coefficients at pDct in place; ff and mf each supply 8 words that are reused for both halves of the block.
+;***********************************************************************
+; void WelsQuant4x4_avx2(int16_t *pDct, const int16_t *ff, const int16_t *mf);
+;***********************************************************************
+
+WELS_EXTERN WelsQuant4x4_avx2
+ %assign push_num 0
+ LOAD_3_PARA ; macro defined outside this view; the code below reads the three arguments as r0, r1, r2
+ PUSH_XMM 5 ; macro defined outside this view; presumably ABI bookkeeping for the 5 vector registers used (ymm0-ymm4) - confirm
+ vbroadcasti128 ymm0, [r1] ; ymm0 = the 8 ff words copied into both 128-bit halves
+ vbroadcasti128 ymm1, [r2] ; ymm1 = the 8 mf words copied into both 128-bit halves
+ WELS_DW32767_VEX ymm2 ; ymm2 = 7FFFh in every word, the constant AVX2_Quant takes as its fifth operand
+ vmovdqu ymm3, [r0] ; load all 16 coefficients (32 bytes) with an unaligned load
+ AVX2_Quant ymm3, ymm4, ymm0, ymm1, ymm2 ; ymm3 = signed quantized result, ymm4 = magnitudes (unused here)
+ vmovdqu [r0], ymm3 ; overwrite the input with the result
+ vzeroupper ; clear the upper ymm halves before returning to code that may use legacy SSE
+ POP_XMM ; counterpart of PUSH_XMM
+ ret
+
+; Quantizes the 16 int16 coefficients at pDct in place, applying one scalar ff and one scalar mf to every coefficient.
+;***********************************************************************
+;void WelsQuant4x4Dc_avx2(int16_t *pDct, int16_t ff, int16_t mf);
+;***********************************************************************
+
+WELS_EXTERN WelsQuant4x4Dc_avx2
+ %assign push_num 0
+ LOAD_1_PARA ; only pDct is loaded (read as r0 below); ff and mf are taken directly from arg2 and arg3
+ PUSH_XMM 5 ; macro defined outside this view; presumably ABI bookkeeping for the 5 vector registers used (ymm0-ymm4) - confirm
+%ifidni r1, arg2
+ vmovd xmm0, arg2d ; taken when arg2 is the same token as r1: move the value into xmm0 first
+ vpbroadcastw ymm0, xmm0 ; ymm0 = low 16 bits (ff) repeated in every word
+%else
+ vpbroadcastw ymm0, arg2 ; arg2 is used directly as the broadcast source (presumably a stack memory operand - confirm)
+%endif
+%ifidni r2, arg3
+ vmovd xmm1, arg3d ; same scheme for mf: taken when arg3 is the same token as r2
+ vpbroadcastw ymm1, xmm1 ; ymm1 = low 16 bits (mf) repeated in every word
+%else
+ vpbroadcastw ymm1, arg3 ; arg3 is used directly as the broadcast source (presumably a stack memory operand - confirm)
+%endif
+ WELS_DW32767_VEX ymm2 ; ymm2 = 7FFFh in every word, the constant AVX2_Quant takes as its fifth operand
+ vmovdqu ymm3, [r0] ; load all 16 coefficients (32 bytes) with an unaligned load
+ AVX2_Quant ymm3, ymm4, ymm0, ymm1, ymm2 ; ymm3 = signed quantized result, ymm4 = magnitudes (unused here)
+ vmovdqu [r0], ymm3 ; overwrite the input with the result
+ vzeroupper ; clear the upper ymm halves before returning to code that may use legacy SSE
+ POP_XMM ; counterpart of PUSH_XMM
+ ret
+
+; Quantizes four consecutive 4x4 blocks (64 int16 coefficients, 128 bytes) at pDct in place with one shared pair of 8-word ff/mf tables.
+;***********************************************************************
+; void WelsQuantFour4x4_avx2(int16_t *pDct, const int16_t *ff, const int16_t *mf);
+;***********************************************************************
+
+WELS_EXTERN WelsQuantFour4x4_avx2
+ %assign push_num 0
+ LOAD_3_PARA ; macro defined outside this view; the code below reads the three arguments as r0, r1, r2
+ PUSH_XMM 6 ; macro defined outside this view; presumably ABI bookkeeping for the 6 vector registers used (ymm0-ymm5) - confirm
+ vbroadcasti128 ymm0, [r1] ; ymm0 = the 8 ff words copied into both 128-bit halves
+ vbroadcasti128 ymm1, [r2] ; ymm1 = the 8 mf words copied into both 128-bit halves
+ WELS_DW32767_VEX ymm4 ; ymm4 = 7FFFh in every word, the constant AVX2_Quant takes as its fifth operand
+ vmovdqu ymm3, [r0 + 0x00] ; coefficients 0-15
+ vmovdqu ymm5, [r0 + 0x20] ; coefficients 16-31
+ AVX2_Quant ymm3, ymm2, ymm0, ymm1, ymm4 ; ymm2 is scratch for the magnitudes in every call below
+ vmovdqu [r0 + 0x00], ymm3 ; store the result over the input
+ AVX2_Quant ymm5, ymm2, ymm0, ymm1, ymm4
+ vmovdqu [r0 + 0x20], ymm5
+ vmovdqu ymm3, [r0 + 0x40] ; coefficients 32-47
+ vmovdqu ymm5, [r0 + 0x60] ; coefficients 48-63
+ AVX2_Quant ymm3, ymm2, ymm0, ymm1, ymm4
+ vmovdqu [r0 + 0x40], ymm3
+ AVX2_Quant ymm5, ymm2, ymm0, ymm1, ymm4
+ vmovdqu [r0 + 0x60], ymm5
+ vzeroupper ; clear the upper ymm halves before returning to code that may use legacy SSE
+ POP_XMM ; counterpart of PUSH_XMM
+ ret
+
+; Quantizes four consecutive 4x4 blocks at pDct in place and writes the largest quantized magnitude of each block to max[0..3].
+;***********************************************************************
+; void WelsQuantFour4x4Max_avx2(int16_t *pDct, const int16_t *ff, const int16_t *mf, int16_t *max);
+;***********************************************************************
+
+WELS_EXTERN WelsQuantFour4x4Max_avx2
+ %assign push_num 0
+ LOAD_4_PARA ; macro defined outside this view; the code below reads the four arguments as r0, r1, r2, r3
+ PUSH_XMM 7 ; macro defined outside this view; presumably ABI bookkeeping for the 7 vector registers used (ymm0-ymm6) - confirm
+ vbroadcasti128 ymm0, [r1] ; ymm0 = the 8 ff words copied into both 128-bit halves
+ vbroadcasti128 ymm1, [r2] ; ymm1 = the 8 mf words copied into both 128-bit halves
+ WELS_DW32767_VEX ymm6 ; ymm6 = 7FFFh in every word; also reused below as the bit-flip mask
+ vmovdqu ymm4, [r0 + 0x00] ; block 0 coefficients
+ vmovdqu ymm5, [r0 + 0x20] ; block 1 coefficients
+ AVX2_Quant ymm4, ymm2, ymm0, ymm1, ymm6 ; ymm4 = signed result, ymm2 = magnitudes of block 0
+ vmovdqu [r0 + 0x00], ymm4
+ AVX2_Quant ymm5, ymm3, ymm0, ymm1, ymm6 ; ymm5 = signed result, ymm3 = magnitudes of block 1
+ vmovdqu [r0 + 0x20], ymm5
+ vperm2i128 ymm4, ymm2, ymm3, 00100000b ; ymm4 = low half of ymm2 : low half of ymm3
+ vperm2i128 ymm3, ymm2, ymm3, 00110001b ; ymm3 = high half of ymm2 : high half of ymm3
+ vpmaxsw ymm2, ymm4, ymm3 ; low 128 bits = 8 max candidates for block 0, high 128 bits = same for block 1
+ vmovdqu ymm4, [r0 + 0x40] ; block 2 coefficients
+ vmovdqu ymm5, [r0 + 0x60] ; block 3 coefficients
+ AVX2_Quant ymm4, ymm3, ymm0, ymm1, ymm6 ; ymm4 = signed result, ymm3 = magnitudes of block 2
+ vmovdqu [r0 + 0x40], ymm4
+ AVX2_Quant ymm5, ymm4, ymm0, ymm1, ymm6 ; ymm5 = signed result, ymm4 (already stored) = magnitudes of block 3
+ vmovdqu [r0 + 0x60], ymm5
+ vperm2i128 ymm5, ymm3, ymm4, 00100000b ; ymm5 = low half of ymm3 : low half of ymm4
+ vperm2i128 ymm4, ymm3, ymm4, 00110001b ; ymm4 = high half of ymm3 : high half of ymm4
+ vpmaxsw ymm3, ymm5, ymm4 ; low 128 bits = 8 max candidates for block 2, high 128 bits = same for block 3
+ vpxor ymm2, ymm2, ymm6 ; flip bits so as to enable use of vphminposuw to find max value.
+ vpxor ymm3, ymm3, ymm6 ; flip bits so as to enable use of vphminposuw to find max value.
+ vextracti128 xmm4, ymm2, 1 ; xmm4 = flipped candidates for block 1
+ vextracti128 xmm5, ymm3, 1 ; xmm5 = flipped candidates for block 3
+ vphminposuw xmm2, xmm2 ; word 0 = minimum of the flipped values (block 0), word 1 = its index
+ vphminposuw xmm3, xmm3 ; same for block 2
+ vphminposuw xmm4, xmm4 ; same for block 1
+ vphminposuw xmm5, xmm5 ; same for block 3
+ vpunpcklwd xmm2, xmm2, xmm4 ; words 0,1 = block 0, block 1
+ vpunpcklwd xmm3, xmm3, xmm5 ; words 0,1 = block 2, block 3
+ vpunpckldq xmm2, xmm2, xmm3 ; words 0-3 = blocks 0,1,2,3; the index words land above the low 64 bits
+ vpxor xmm2, xmm2, xmm6 ; restore non-flipped values.
+ vmovq [r3], xmm2 ; store max values (low 64 bits = four int16 results).
+ vzeroupper ; clear the upper ymm halves before returning to code that may use legacy SSE
+ POP_XMM ; counterpart of PUSH_XMM
+ LOAD_4_PARA_POP ; counterpart of LOAD_4_PARA (macro defined outside this view)
+ ret
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -3,6 +3,8 @@
#include "ls_defines.h"
#include "encode_mb_aux.h"
#include "wels_common_basis.h"
+#include <algorithm>
+#include <cstddef>
using namespace WelsEnc;
@@ -292,41 +294,95 @@
#define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign)
#define NEW_QUANT(pDct, ff, mf) (((ff)+ WELS_ABS_LC(pDct))*(mf)) >>16
#define WELS_NEW_QUANT(pDct,ff,mf) WELS_ABS_LC(NEW_QUANT(pDct, ff, mf))
+namespace {
+int16_t WelsQuant4x4MaxAnchor (int16_t* pDct, int16_t* ff, int16_t* mf) { // reference quantizer: rewrites 16 coefficients in place and returns the largest value seen before the sign is restored
+ int16_t max_abs = 0;
+ for (int i = 0; i < 16; i++) {
+ const int j = i & 0x07; // ff and mf are indexed modulo 8
+ const int32_t sign = WELS_SIGN (pDct[i]); // WELS_SIGN is defined outside this view; `sign` is read implicitly by WELS_ABS_LC
+ pDct[i] = NEW_QUANT (pDct[i], ff[j], mf[j]); // expands to ((ff + WELS_ABS_LC(coef)) * mf) >> 16
+ max_abs = std::max(max_abs, pDct[i]); // compared before WELS_ABS_LC is applied again below
+ pDct[i] = WELS_ABS_LC (pDct[i]); // (sign ^ x) - sign; presumably puts the original sign back - depends on WELS_SIGN
+ }
+ return max_abs;
+}
+void WelsQuant4x4DcAnchor (int16_t* pDct, int16_t iFF, int16_t iMF) { // reference DC quantizer: one iFF/iMF pair for all 16 coefficients, in place
+ for (int i = 0; i < 16; i++) {
+ const int32_t sign = WELS_SIGN (pDct[i]); // read implicitly by WELS_ABS_LC inside WELS_NEW_QUANT
+ pDct[i] = WELS_NEW_QUANT (pDct[i], iFF, iMF); // WELS_ABS_LC applied to ((iFF + WELS_ABS_LC(coef)) * iMF) >> 16
+ }
+}
+void WelsQuantFour4x4Anchor (int16_t* pDct, int16_t* ff, int16_t* mf) { // reference for four consecutive 16-coefficient blocks sharing ff/mf
+ for (int i = 0; i < 4; i++)
+ WelsQuant4x4MaxAnchor (pDct + 16 * i, ff, mf); // the returned maximum is discarded here
+}
void WelsQuantFour4x4MaxAnchor (int16_t* pDct, int16_t* ff, int16_t* mf, int16_t* max) { // reference for four blocks; max[i] receives the value returned for block i
- int32_t i, j, k, sign;
- int16_t max_abs;
- for (k = 0; k < 4; k++) {
- max_abs = 0;
- for (i = 0; i < 16; i++) {
- j = i & 0x07;
- sign = WELS_SIGN (pDct[i]);
- pDct[i] = NEW_QUANT (pDct[i], ff[j], mf[j]);
- if (max_abs < pDct[i]) max_abs = pDct[i];
- pDct[i] = WELS_ABS_LC (pDct[i]);
- }
- pDct += 16;
- max[k] = max_abs;
+ for (int i = 0; i < 4; i++)
+ max[i] = WelsQuant4x4MaxAnchor (pDct + 16 * i, ff, mf); // block i starts 16 coefficients after block i-1
+}
+void TestWelsQuant4x4 (PQuantizationFunc func) { // checks func against WelsQuant4x4MaxAnchor on one random 16-coefficient block
+ const std::size_t f_size = 8; // entries in each of ff and mf
+ const std::size_t dct_size = 16; // coefficients in one 4x4 block
+ CMemoryAlign cMemoryAlign (0); // presumably used by the ALLOC_MEMORY/FREE_MEMORY macros (defined outside this view)
+ ALLOC_MEMORY (int16_t, ff, f_size);
+ ALLOC_MEMORY (int16_t, mf, f_size);
+ ALLOC_MEMORY (int16_t, iDctC, dct_size); // buffer processed by the reference
+ ALLOC_MEMORY (int16_t, iDctS, dct_size); // buffer processed by func
+ for (std::size_t i = 0; i < f_size; i++) {
+ ff[i] = rand() & 32767; // masked to 0..32767
+ mf[i] = rand() & 32767; // masked to 0..32767
}
+ for (std::size_t i = 0; i < dct_size; i++)
+ iDctC[i] = iDctS[i] = (rand() & 65535) - 32768; // identical input in both buffers, within -32768..32767
+ WelsQuant4x4MaxAnchor (iDctC, ff, mf); // reference result; the returned maximum is not checked here
+ func (iDctS, ff, mf);
+ for (std::size_t i = 0; i < dct_size; i++)
+ EXPECT_EQ (iDctC[i], iDctS[i]); // every coefficient must match the reference
+ FREE_MEMORY (ff);
+ FREE_MEMORY (mf);
+ FREE_MEMORY (iDctC);
+ FREE_MEMORY (iDctS);
}
-TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_c) {
- int16_t ff[8], mf[8];
- int16_t iDctA[64], iMaxA[16];
- int16_t iDctC[64], iMaxC[16];
- for (int i = 0; i < 8; i++) {
+void TestWelsQuant4x4Dc (PQuantizationDcFunc func) { // checks func against WelsQuant4x4DcAnchor on one random 16-coefficient block
+ const std::size_t dct_size = 16; // coefficients in one 4x4 block
+ const int16_t ff = rand() & 32767; // scalar parameter, masked to 0..32767
+ const int16_t mf = rand() & 32767; // scalar parameter, masked to 0..32767
+ CMemoryAlign cMemoryAlign (0); // presumably used by the ALLOC_MEMORY/FREE_MEMORY macros (defined outside this view)
+ ALLOC_MEMORY (int16_t, iDctC, dct_size); // buffer processed by the reference
+ ALLOC_MEMORY (int16_t, iDctS, dct_size); // buffer processed by func
+ for (std::size_t i = 0; i < dct_size; i++)
+ iDctC[i] = iDctS[i] = (rand() & 65535) - 32768; // identical input in both buffers, within -32768..32767
+ WelsQuant4x4DcAnchor (iDctC, ff, mf);
+ func (iDctS, ff, mf);
+ for (std::size_t i = 0; i < dct_size; i++)
+ EXPECT_EQ (iDctC[i], iDctS[i]); // every coefficient must match the reference
+ FREE_MEMORY (iDctC);
+ FREE_MEMORY (iDctS);
+}
+void TestWelsQuantFour4x4 (PQuantizationFunc func) { // checks func against WelsQuantFour4x4Anchor on four random 16-coefficient blocks
+ const std::size_t f_size = 8; // entries in each of ff and mf
+ const std::size_t dct_size = 4 * 16; // four 4x4 blocks
+ CMemoryAlign cMemoryAlign (0); // presumably used by the ALLOC_MEMORY/FREE_MEMORY macros (defined outside this view)
+ ALLOC_MEMORY (int16_t, ff, f_size);
+ ALLOC_MEMORY (int16_t, mf, f_size);
+ ALLOC_MEMORY (int16_t, iDctC, dct_size); // buffer processed by the reference
+ ALLOC_MEMORY (int16_t, iDctS, dct_size); // buffer processed by func
+ for (std::size_t i = 0; i < f_size; i++) {
ff[i] = rand() & 32767;
mf[i] = rand() & 32767;
}
- for (int i = 0; i < 64; i++)
- iDctA[i] = iDctC[i] = (rand() & 65535) - 32767;
- WelsQuantFour4x4MaxAnchor (iDctA, ff, mf, iMaxA);
- WelsQuantFour4x4Max_c (iDctC, ff, mf, iMaxC);
- for (int i = 0; i < 64; i++)
- EXPECT_EQ (iDctA[i], iDctC[i]);
- for (int i = 0; i < 4; i++)
- EXPECT_EQ (iMaxA[i], iMaxC[i]);
+ for (std::size_t i = 0; i < dct_size; i++)
+ iDctC[i] = iDctS[i] = (rand() & 65535) - 32768; // identical input in both buffers, within -32768..32767
+ WelsQuantFour4x4Anchor (iDctC, ff, mf);
+ func (iDctS, ff, mf);
+ for (std::size_t i = 0; i < dct_size; i++)
+ EXPECT_EQ (iDctC[i], iDctS[i]); // every coefficient must match the reference
+ FREE_MEMORY (ff);
+ FREE_MEMORY (mf);
+ FREE_MEMORY (iDctC);
+ FREE_MEMORY (iDctS);
}
-#ifdef X86_ASM
-TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) {
+void TestWelsQuantFour4x4Max (PQuantizationMaxFunc func) {
CMemoryAlign cMemoryAlign (0);
ALLOC_MEMORY (int16_t, ff, 8);
ALLOC_MEMORY (int16_t, mf, 8);
@@ -340,8 +396,8 @@
}
for (int i = 0; i < 64; i++)
iDctC[i] = iDctS[i] = (rand() & 65535) - 32767;
- WelsQuantFour4x4Max_c (iDctC, ff, mf, iMaxC);
- WelsQuantFour4x4Max_sse2 (iDctS, ff, mf, iMaxS);
+ WelsQuantFour4x4MaxAnchor (iDctC, ff, mf, iMaxC);
+ func (iDctS, ff, mf, iMaxS);
for (int i = 0; i < 64; i++)
EXPECT_EQ (iDctC[i], iDctS[i]);
for (int i = 0; i < 4; i++)
@@ -352,6 +408,48 @@
FREE_MEMORY (iDctS);
FREE_MEMORY (iMaxC);
FREE_MEMORY (iMaxS);
+}
+} // anon ns
+TEST (EncodeMbAuxTest, WelsQuant4x4_c) { // C implementations: each test passes one function to the shared checker of the same name
+ TestWelsQuant4x4 (WelsQuant4x4_c);
+}
+TEST (EncodeMbAuxTest, WelsQuant4x4Dc_c) {
+ TestWelsQuant4x4Dc (WelsQuant4x4Dc_c);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4_c) {
+ TestWelsQuantFour4x4 (WelsQuantFour4x4_c);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_c) {
+ TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_c);
+}
+#ifdef X86_ASM
+TEST (EncodeMbAuxTest, WelsQuant4x4_sse2) { // SSE2 implementations: called unconditionally, with no runtime CPU check
+ TestWelsQuant4x4 (WelsQuant4x4_sse2);
+}
+TEST (EncodeMbAuxTest, WelsQuant4x4Dc_sse2) {
+ TestWelsQuant4x4Dc (WelsQuant4x4Dc_sse2);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4_sse2) {
+ TestWelsQuantFour4x4 (WelsQuantFour4x4_sse2);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_sse2) {
+ TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_sse2);
+}
+TEST (EncodeMbAuxTest, WelsQuant4x4_avx2) { // AVX2 implementations: each test runs only when the CPU reports AVX2
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) // without AVX2 the body is skipped and the test passes without checking anything
+ TestWelsQuant4x4 (WelsQuant4x4_avx2);
+}
+TEST (EncodeMbAuxTest, WelsQuant4x4Dc_avx2) {
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) // same runtime guard as above
+ TestWelsQuant4x4Dc (WelsQuant4x4Dc_avx2);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4_avx2) {
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) // same runtime guard as above
+ TestWelsQuantFour4x4 (WelsQuantFour4x4_avx2);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_avx2) {
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2) // same runtime guard as above
+ TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_avx2);
}
#endif
int32_t WelsHadamardQuant2x2SkipAnchor (int16_t* rs, int16_t ff, int16_t mf) {