shithub: openh264

--- a/build/arch.mk

+++ b/build/arch.mk

@@ -1,6 +1,18 @@

+# for x86: HAVE_AVX2 selects the AVX2 code paths (-DHAVE_AVX2 is only passed when USE_ASM is Yes)

+HAVE_AVX2 := true

 ifneq ($(filter %86 x86_64, $(ARCH)),)

 include $(SRC_PATH)build/x86-common.mk

+ifeq ($(USE_ASM), Yes)

+ifeq ($(HAVE_AVX2), true)

+CFLAGS += -DHAVE_AVX2

+CXXFLAGS += -DHAVE_AVX2

+ASMFLAGS += -DHAVE_AVX2

 endif

+endif

+endif

+#for arm

 ifneq ($(filter-out arm64, $(filter arm%, $(ARCH))),)

 ifeq ($(USE_ASM), Yes)

 ASM_ARCH = arm

@@ -8,6 +20,8 @@

 CFLAGS += -DHAVE_NEON

 endif

 endif

+#for arm64

 ifneq ($(filter arm64 aarch64, $(ARCH)),)

 ifeq ($(USE_ASM), Yes)

 ASM_ARCH = arm64

--- a/codec/common/inc/cpu_core.h

+++ b/codec/common/inc/cpu_core.h

@@ -56,7 +56,6 @@

 #define WELS_CPU_SSE42      0x00000400    /* sse 4.2 */

 /* CPU features application extensive */

-#define WELS_CPU_AVX        0x00000800  /* Advanced Vector eXtentions */

 #define WELS_CPU_FPU        0x00001000  /* x87-FPU on chip */

 #define WELS_CPU_HTT        0x00002000  /* Hyper-Threading Technology (HTT), Multi-threading enabled feature:

                                            physical processor package is capable of supporting more than one logic processor

@@ -67,7 +66,13 @@

 #define WELS_CPU_MOVBE      0x00008000  /* MOVBE instruction */

 #define WELS_CPU_AES        0x00010000  /* AES instruction extensions */

 #define WELS_CPU_FMA        0x00020000  /* AVX VEX FMA instruction sets */

+#define WELS_CPU_AVX        0x00000800  /* Advanced Vector eXtensions */

+#ifdef HAVE_AVX2

 #define WELS_CPU_AVX2       0x00040000  /* AVX2 */

+#else

+#define WELS_CPU_AVX2       0x00000000  /* AVX2 compiled out: zero mask, so (flags & WELS_CPU_AVX2) is never true */

+#endif

 #define WELS_CPU_CACHELINE_16    0x10000000    /* CacheLine Size 16 */

 #define WELS_CPU_CACHELINE_32    0x20000000    /* CacheLine Size 32 */

--- a/codec/common/x86/dct.asm

+++ b/codec/common/x86/dct.asm

@@ -678,6 +678,7 @@

 ; AVX2 functions

 ;***********************************************************************

+%ifdef HAVE_AVX2

 ; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8

 %macro AVX2_LoadDiff16P 8

     vmovq         x%1, [%2         ]

@@ -1011,3 +1012,5 @@

     POP_XMM

     LOAD_5_PARA_POP

ret

+%endif

--- a/codec/common/x86/satd_sad.asm

+++ b/codec/common/x86/satd_sad.asm

@@ -1504,6 +1504,7 @@

 ;***********************************************************************

+%ifdef HAVE_AVX2

 ; out=%1 pSrcA=%2 pSrcB=%3 HSumSubDB1_256=%4 ymm_clobber=%5

 %macro AVX2_LoadDiffSatd16x1 5

     vbroadcasti128   %1, [%2]

@@ -1722,6 +1723,8 @@

     pop r4

 %endif

ret

+%endif

 ;***********************************************************************

--- a/codec/decoder/core/inc/decode_mb_aux.h

+++ b/codec/decoder/core/inc/decode_mb_aux.h

@@ -48,8 +48,10 @@

 #if defined(X86_ASM)

 void IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);

 void IdctResAddPred_sse2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);

+#if defined(HAVE_AVX2)

 void IdctResAddPred_avx2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);

 void IdctFourResAddPred_avx2 (uint8_t* pPred, int32_t iStride, int16_t* pRs, const int8_t* pNzc);

+#endif

 #endif//X86_ASM

 #if defined(HAVE_NEON)

--- a/codec/decoder/core/src/decoder.cpp

+++ b/codec/decoder/core/src/decoder.cpp

@@ -1005,10 +1005,13 @@

     pCtx->pGetIChromaPredFunc[C_PRED_DC_T]    = WelsDecoderIChromaPredDcTop_sse2;

     pCtx->pGetI4x4LumaPredFunc[I4_PRED_H]     = WelsDecoderI4x4LumaPredH_sse2;

+#if defined(HAVE_AVX2)

   if (uiCpuFlag & WELS_CPU_AVX2) {

     pCtx->pIdctResAddPredFunc     = IdctResAddPred_avx2;

     pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_avx2;

+#endif

 #endif

--- a/codec/encoder/core/x86/quant.asm

+++ b/codec/encoder/core/x86/quant.asm

@@ -370,6 +370,7 @@

ret

+%ifdef HAVE_AVX2

 ; data=%1 abs_out=%2 ff=%3 mf=%4 7FFFh=%5

 %macro AVX2_Quant 5

     vpabsw          %2, %1

@@ -502,3 +503,5 @@

     POP_XMM

     LOAD_4_PARA_POP

ret

+%endif

--- a/codec/processing/src/downsample/downsample.cpp

+++ b/codec/processing/src/downsample/downsample.cpp

@@ -107,10 +107,12 @@

     sDownsampleFunc.pfQuarterDownsampler  = DyadicBilinearQuarterDownsampler_sse4;

     sDownsampleFunc.pfGeneralRatioChroma  = GeneralBilinearAccurateDownsamplerWrap_sse41;

+#ifdef HAVE_AVX2

   if (iCpuFlag & WELS_CPU_AVX2) {

     sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_avx2;

     sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_avx2;

+#endif

 #endif//X86_ASM

 #if defined(HAVE_NEON)

--- a/codec/processing/src/downsample/downsample.h

+++ b/codec/processing/src/downsample/downsample.h

@@ -99,8 +99,10 @@

 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;

 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_ssse3;

 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse41;

+#ifdef HAVE_AVX2

 GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_avx2;

 GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_avx2;

+#endif

 SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_ssse3;

 SpecificDownsampleFunc  DyadicBilinearOneThirdDownsampler_sse4;

@@ -120,6 +122,7 @@

 void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,

     int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,

     uint32_t uiScaleY);

+#ifdef HAVE_AVX2

 void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,

     int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,

     uint32_t uiScaleY);

@@ -126,6 +129,7 @@

 void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,

     int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,

     uint32_t uiScaleY);

+#endif

 WELSVP_EXTERN_C_END

 #endif

--- a/codec/processing/src/downsample/downsamplefuncs.cpp

+++ b/codec/processing/src/downsample/downsamplefuncs.cpp

@@ -284,8 +284,10 @@

 DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)

 DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)

 DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)

+#ifdef HAVE_AVX2

 DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (avx2)

 DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (avx2)

+#endif

 #endif //X86_ASM

 #ifdef HAVE_NEON

--- a/codec/processing/src/x86/downsample_bilinear.asm

+++ b/codec/processing/src/x86/downsample_bilinear.asm

@@ -3254,6 +3254,7 @@

 %undef xmm_xfrac1_begin

 %undef xmm_xfrac_inc

+%ifdef HAVE_AVX2

 ; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5

 %macro AVX2_BilinearIncXposuw 5

     vpaddusw        %5, %2, %4

@@ -4552,3 +4553,5 @@

 %undef ymm_xfrac0_begin

 %undef ymm_xfrac1_begin

 %undef ymm_xfrac_inc

+%endif

--- a/codec/processing/src/x86/vaa.asm

+++ b/codec/processing/src/x86/vaa.asm

@@ -2088,6 +2088,7 @@

     %assign push_num push_num - stack_alloc_num

 %endmacro

+%ifdef HAVE_AVX2

 ; Max unsigned byte per quadword

 ; out=%1 in=%2 tmp=%3

 %macro AVX2_Maxubq 3

@@ -3557,3 +3558,6 @@

 %undef           p_sd8x8

 %undef           p_mad8x8

ret

+%endif

--- a/test/decoder/DecUT_IdctResAddPred.cpp

+++ b/test/decoder/DecUT_IdctResAddPred.cpp

@@ -53,6 +53,7 @@

 #if defined(X86_ASM)

+#if defined(HAVE_AVX2)

 void IdctFourResAddPred_ref (uint8_t* pPred, int32_t iStride, int16_t* pRs) {

   IdctResAddPred_ref (pPred + 0 * iStride + 0, iStride, pRs + 0 * 16);

   IdctResAddPred_ref (pPred + 0 * iStride + 4, iStride, pRs + 1 * 16);

@@ -60,6 +61,7 @@

   IdctResAddPred_ref (pPred + 4 * iStride + 4, iStride, pRs + 3 * 16);

 #endif

+#endif

 } // anon ns

@@ -138,8 +140,10 @@

 #if defined(X86_ASM)

 GENERATE_IDCTRESADDPRED (IdctResAddPred_mmx, WELS_CPU_MMXEXT)

 GENERATE_IDCTRESADDPRED (IdctResAddPred_sse2, WELS_CPU_SSE2)

+#if defined(HAVE_AVX2)

 GENERATE_IDCTRESADDPRED (IdctResAddPred_avx2, WELS_CPU_AVX2)

 GENERATE_IDCTFOURRESADDPRED (IdctFourResAddPred_avx2, WELS_CPU_AVX2)

+#endif

 #endif

 #if defined(HAVE_NEON)

--- a/test/processing/ProcessUT_DownSample.cpp

+++ b/test/processing/ProcessUT_DownSample.cpp

@@ -372,10 +372,13 @@

                                         WELS_CPU_SSSE3)

 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse41,

                                         GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE41)

+#ifdef HAVE_AVX2

 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_avx2, GeneralBilinearFastDownsampler_ref, 1,

                                         WELS_CPU_AVX2)

 GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_avx2,

                                         GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_AVX2)

+#endif

 #endif

 #if defined(HAVE_NEON)

--

⑨