shithub: openh264

Download patch

ref: b6c4a5447c6ec67daf2a394b7927ea15bedbc5f7
parent: 98042f1600790bf80937ca9bbc3458539d3e6ed8
author: Sindre Aamås <saamas@cisco.com>
date: Wed Mar 16 15:33:33 EDT 2016

[Decoder/x86] IDCT one block at a time with SSE2

At lower bitrates, it is overall faster to conditionally do one block
at a time with SSE2 on Haswell and likely other common architectures.
At higher bitrates, it is faster to use the wider routine that IDCTs
four blocks at a time. To avoid potential performance regressions
as compared to MMX, stick with single-block IDCTs with SSE2. There
is still a performance advantage as compared to MMX because the
single-block SSE2 routine is faster than the corresponding MMX
routine.

Stick with four blocks at a time with AVX2, where that approach appears
to be consistently faster on Haswell.

--- a/codec/common/x86/dct.asm
+++ b/codec/common/x86/dct.asm
@@ -510,20 +510,11 @@
     ret
 
 ;***********************************************************************
-; void IdctFourResAddPred_sse2(uint8_t* pPred, int32_t iStride, const int16_t* pDct, const int8_t* pNzc);
-;***********************************************************************
-WELS_EXTERN IdctFourResAddPred_sse2
-    %assign push_num 0
-    LOAD_3_PARA_TO_5_PARA_IDCT
-    jmp prefixed(WelsIDctFourT4Rec_sse2.begin)
-
-;***********************************************************************
 ; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
 ;***********************************************************************
 WELS_EXTERN WelsIDctFourT4Rec_sse2
     %assign push_num 0
     LOAD_5_PARA
-.begin:
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
--- a/codec/decoder/core/inc/decode_mb_aux.h
+++ b/codec/decoder/core/inc/decode_mb_aux.h
@@ -49,7 +49,6 @@
 void IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
 void IdctResAddPred_sse2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
 void IdctResAddPred_avx2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
-void IdctFourResAddPred_sse2 (uint8_t* pPred, int32_t iStride, int16_t* pRs, const int8_t* pNzc);
 void IdctFourResAddPred_avx2 (uint8_t* pPred, int32_t iStride, int16_t* pRs, const int8_t* pNzc);
 #endif//X86_ASM
 
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -992,7 +992,7 @@
   }
   if (uiCpuFlag & WELS_CPU_SSE2) {
     pCtx->pIdctResAddPredFunc     = IdctResAddPred_sse2;
-    pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_sse2;
+    pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_<IdctResAddPred_sse2>;
 
     pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_sse2;
     pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]  = WelsDecoderI16x16LumaPredPlane_sse2;
--- a/test/decoder/DecUT_IdctResAddPred.cpp
+++ b/test/decoder/DecUT_IdctResAddPred.cpp
@@ -139,7 +139,6 @@
 GENERATE_IDCTRESADDPRED (IdctResAddPred_mmx, WELS_CPU_MMXEXT)
 GENERATE_IDCTRESADDPRED (IdctResAddPred_sse2, WELS_CPU_SSE2)
 GENERATE_IDCTRESADDPRED (IdctResAddPred_avx2, WELS_CPU_AVX2)
-GENERATE_IDCTFOURRESADDPRED (IdctFourResAddPred_sse2, WELS_CPU_SSE2)
 GENERATE_IDCTFOURRESADDPRED (IdctFourResAddPred_avx2, WELS_CPU_AVX2)
 #endif