ref: 1f770c488cfd5400e6e0092a3e49aef841f6c5de
parent: 5a8f5e8cf12d7cf46d3c28a31299b646e7bf3282
parent: 7d00e8bc42247ebbb3ae5bbd8e4274bf6f489290
author: HaiboZhu <haibozhu@cisco.com>
date: Wed Jul 20 09:49:31 EDT 2016
Merge pull request #2531 from GuangweiWang/enable-disable-AVX2: add option to enable/disable AVX2
--- a/build/arch.mk
+++ b/build/arch.mk
@@ -1,6 +1,18 @@
+# for x86: AVX2 code is built by default; pass HAVE_AVX2=false on the make command line to omit -DHAVE_AVX2 (the define is only added when USE_ASM is Yes)
+HAVE_AVX2 := true
+
ifneq ($(filter %86 x86_64, $(ARCH)),)
include $(SRC_PATH)build/x86-common.mk
+ifeq ($(USE_ASM), Yes)
+ifeq ($(HAVE_AVX2), true)
+CFLAGS += -DHAVE_AVX2
+CXXFLAGS += -DHAVE_AVX2
+ASMFLAGS += -DHAVE_AVX2
endif
+endif
+endif
+
+#for arm
ifneq ($(filter-out arm64, $(filter arm%, $(ARCH))),)
ifeq ($(USE_ASM), Yes)
ASM_ARCH = arm
@@ -8,6 +20,8 @@
CFLAGS += -DHAVE_NEON
endif
endif
+
+#for arm64
ifneq ($(filter arm64 aarch64, $(ARCH)),)
ifeq ($(USE_ASM), Yes)
ASM_ARCH = arm64
--- a/codec/common/inc/cpu_core.h
+++ b/codec/common/inc/cpu_core.h
@@ -56,7 +56,6 @@
#define WELS_CPU_SSE42 0x00000400 /* sse 4.2 */
/* CPU features application extensive */
-#define WELS_CPU_AVX 0x00000800 /* Advanced Vector eXtentions */
#define WELS_CPU_FPU 0x00001000 /* x87-FPU on chip */
#define WELS_CPU_HTT 0x00002000 /* Hyper-Threading Technology (HTT), Multi-threading enabled feature:
physical processor package is capable of supporting more than one logic processor
@@ -67,7 +66,13 @@
#define WELS_CPU_MOVBE 0x00008000 /* MOVBE instruction */
#define WELS_CPU_AES 0x00010000 /* AES instruction extensions */
#define WELS_CPU_FMA 0x00020000 /* AVX VEX FMA instruction sets */
+#define WELS_CPU_AVX 0x00000800 /* Advanced Vector eXtensions */
+
+#ifdef HAVE_AVX2
+#define WELS_CPU_AVX2 0x00040000 /* AVX2 */
+#else
+#define WELS_CPU_AVX2 0x00000000 /* AVX2 disabled at build time: empty mask, so (flags & WELS_CPU_AVX2) is always 0 */
+#endif
#define WELS_CPU_CACHELINE_16 0x10000000 /* CacheLine Size 16 */
#define WELS_CPU_CACHELINE_32 0x20000000 /* CacheLine Size 32 */
--- a/codec/common/x86/dct.asm
+++ b/codec/common/x86/dct.asm
@@ -678,6 +678,7 @@
; AVX2 functions
;***********************************************************************
+%ifdef HAVE_AVX2
; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8
%macro AVX2_LoadDiff16P 8
vmovq x%1, [%2 ]
@@ -1011,3 +1012,5 @@
POP_XMM
LOAD_5_PARA_POP
ret
+%endif
+
--- a/codec/common/x86/satd_sad.asm
+++ b/codec/common/x86/satd_sad.asm
@@ -1504,6 +1504,7 @@
;
;***********************************************************************
+%ifdef HAVE_AVX2
; out=%1 pSrcA=%2 pSrcB=%3 HSumSubDB1_256=%4 ymm_clobber=%5
%macro AVX2_LoadDiffSatd16x1 5
vbroadcasti128 %1, [%2]
@@ -1722,6 +1723,8 @@
pop r4
%endif
ret
+
+%endif
;***********************************************************************
;
--- a/codec/decoder/core/inc/decode_mb_aux.h
+++ b/codec/decoder/core/inc/decode_mb_aux.h
@@ -48,8 +48,10 @@
#if defined(X86_ASM)
void IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
void IdctResAddPred_sse2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
+#if defined(HAVE_AVX2)
void IdctResAddPred_avx2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
void IdctFourResAddPred_avx2 (uint8_t* pPred, int32_t iStride, int16_t* pRs, const int8_t* pNzc);
+#endif
#endif//X86_ASM
#if defined(HAVE_NEON)
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -1005,10 +1005,13 @@
pCtx->pGetIChromaPredFunc[C_PRED_DC_T] = WelsDecoderIChromaPredDcTop_sse2;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_H] = WelsDecoderI4x4LumaPredH_sse2;
}
+#if defined(HAVE_AVX2)
if (uiCpuFlag & WELS_CPU_AVX2) {
pCtx->pIdctResAddPredFunc = IdctResAddPred_avx2;
pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_avx2;
}
+#endif
+
#endif
}
--- a/codec/encoder/core/x86/quant.asm
+++ b/codec/encoder/core/x86/quant.asm
@@ -370,6 +370,7 @@
ret
+%ifdef HAVE_AVX2
; data=%1 abs_out=%2 ff=%3 mf=%4 7FFFh=%5
%macro AVX2_Quant 5
vpabsw %2, %1
@@ -502,3 +503,5 @@
POP_XMM
LOAD_4_PARA_POP
ret
+%endif
+
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -107,10 +107,12 @@
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse41;
}
+#ifdef HAVE_AVX2
if (iCpuFlag & WELS_CPU_AVX2) {
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_avx2;
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_avx2;
}
+#endif
#endif//X86_ASM
#if defined(HAVE_NEON)
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -99,8 +99,10 @@
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_ssse3;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse41;
+#ifdef HAVE_AVX2
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_avx2;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_avx2;
+#endif
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_ssse3;
SpecificDownsampleFunc DyadicBilinearOneThirdDownsampler_sse4;
@@ -120,6 +122,7 @@
void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
uint32_t uiScaleY);
+#ifdef HAVE_AVX2
void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
uint32_t uiScaleY);
@@ -126,6 +129,7 @@
void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
uint32_t uiScaleY);
+#endif
WELSVP_EXTERN_C_END
#endif
--- a/codec/processing/src/downsample/downsamplefuncs.cpp
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -284,8 +284,10 @@
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)
+#ifdef HAVE_AVX2
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (avx2)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (avx2)
+#endif
#endif //X86_ASM
#ifdef HAVE_NEON
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -3254,6 +3254,7 @@
%undef xmm_xfrac1_begin
%undef xmm_xfrac_inc
+%ifdef HAVE_AVX2
; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5
%macro AVX2_BilinearIncXposuw 5
vpaddusw %5, %2, %4
@@ -4552,3 +4553,5 @@
%undef ymm_xfrac0_begin
%undef ymm_xfrac1_begin
%undef ymm_xfrac_inc
+%endif
+
--- a/codec/processing/src/x86/vaa.asm
+++ b/codec/processing/src/x86/vaa.asm
@@ -2088,6 +2088,7 @@
%assign push_num push_num - stack_alloc_num
%endmacro
+%ifdef HAVE_AVX2
; Max unsigned byte per quadword
; out=%1 in=%2 tmp=%3
%macro AVX2_Maxubq 3
@@ -3557,3 +3558,6 @@
%undef p_sd8x8
%undef p_mad8x8
ret
+
+%endif
+
--- a/test/decoder/DecUT_IdctResAddPred.cpp
+++ b/test/decoder/DecUT_IdctResAddPred.cpp
@@ -53,6 +53,7 @@
}
#if defined(X86_ASM)
+#if defined(HAVE_AVX2)
void IdctFourResAddPred_ref (uint8_t* pPred, int32_t iStride, int16_t* pRs) {
IdctResAddPred_ref (pPred + 0 * iStride + 0, iStride, pRs + 0 * 16);
IdctResAddPred_ref (pPred + 0 * iStride + 4, iStride, pRs + 1 * 16);
@@ -60,6 +61,7 @@
IdctResAddPred_ref (pPred + 4 * iStride + 4, iStride, pRs + 3 * 16);
}
#endif
+#endif
} // anon ns
@@ -138,8 +140,10 @@
#if defined(X86_ASM)
GENERATE_IDCTRESADDPRED (IdctResAddPred_mmx, WELS_CPU_MMXEXT)
GENERATE_IDCTRESADDPRED (IdctResAddPred_sse2, WELS_CPU_SSE2)
+#if defined(HAVE_AVX2)
GENERATE_IDCTRESADDPRED (IdctResAddPred_avx2, WELS_CPU_AVX2)
GENERATE_IDCTFOURRESADDPRED (IdctFourResAddPred_avx2, WELS_CPU_AVX2)
+#endif
#endif
#if defined(HAVE_NEON)
--- a/test/processing/ProcessUT_DownSample.cpp
+++ b/test/processing/ProcessUT_DownSample.cpp
@@ -372,10 +372,13 @@
WELS_CPU_SSSE3)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_sse41,
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_SSE41)
+#ifdef HAVE_AVX2
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearFastDownsamplerWrap_avx2, GeneralBilinearFastDownsampler_ref, 1,
WELS_CPU_AVX2)
GENERATE_GeneralBilinearDownsampler_UT (GeneralBilinearAccurateDownsamplerWrap_avx2,
GeneralBilinearAccurateDownsampler_ref, 1, WELS_CPU_AVX2)
+#endif
+
#endif
#if defined(HAVE_NEON)