ref: c06f9ec41e26c9e86314c3207d0dcbd3d5594ceb
parent: b397b397502d940a22a0fda35f1d787336dbdaf2
author: Sindre Aamås <saamas@cisco.com>
date: Fri Jul 22 12:22:31 EDT 2016
[Common/x86] Convert McCopyWidthEq8_sse2 back to MMX. Avoid potential performance regressions on CPUs that have lower throughput for 64-bit loads to xmm registers than to mm registers. The emms instruction required with MMX is somewhat costly on recent Intel cores and therefore causes a slight performance reduction on those cores (which was the motivation for the earlier conversion to SSE).
--- a/codec/common/inc/mc.h
+++ b/codec/common/inc/mc.h
@@ -252,6 +252,8 @@
int32_t iHeight);
void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
const uint8_t* kpABCD, int32_t iHeight);
+void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);
void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
@@ -262,8 +264,6 @@
//***************************************************************************//
void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
const uint8_t* kpABCD, int32_t iHeight);
-void McCopyWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- int32_t iHeight);
void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iHeight);
void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
--- a/codec/common/src/mc.cpp
+++ b/codec/common/src/mc.cpp
@@ -440,7 +440,7 @@
if (iWidth == 16)
McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
- McCopyWidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
else
@@ -729,7 +729,7 @@
int32_t iWidth, int32_t iHeight) {
switch (iWidth) {
case 16: return McCopyWidthEq16_sse3 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
- case 8: return McCopyWidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ case 8: return McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
case 4: return McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
return McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
--- a/codec/common/x86/mb_copy.asm
+++ b/codec/common/x86/mb_copy.asm
@@ -527,10 +527,10 @@
%endmacro
;*******************************************************************************
-; void McCopyWidthEq8_sse2( uint8_t *pSrc, int iSrcStride,
-; uint8_t *pDst, int iDstStride, int iHeight )
+; void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
+; uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
-WELS_EXTERN McCopyWidthEq8_sse2
+WELS_EXTERN McCopyWidthEq8_mmx
%assign push_num 0
%ifdef X86_32
push r5
@@ -543,8 +543,9 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
- CopyStrided4N movsd, movsd, r2, r3, r0, r1, r4, r5, r6, xmm0, xmm1
+ CopyStrided4N movq, movq, r2, r3, r0, r1, r4, r5, r6, mm0, mm1
+ WELSEMMS
LOAD_5_PARA_POP
%ifdef X86_32
pop r6