shithub: openh264

Download patch

ref: c06f9ec41e26c9e86314c3207d0dcbd3d5594ceb
parent: b397b397502d940a22a0fda35f1d787336dbdaf2
author: Sindre Aamås <saamas@cisco.com>
date: Fri Jul 22 12:22:31 EDT 2016

[Common/x86] Convert McCopyWidthEq8_sse2 back to MMX

Avoid potential performance regressions on CPUs that have lower
throughput for 64-bit loads to xmm registers than to mm registers.

The emms instruction that MMX code requires is somewhat costly on recent
Intel cores, so this change causes a slight performance reduction on
those cores (which was the motivation for the earlier conversion to SSE).

--- a/codec/common/inc/mc.h
+++ b/codec/common/inc/mc.h
@@ -252,6 +252,8 @@
                              int32_t iHeight);
 void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                            const uint8_t* kpABCD, int32_t iHeight);
+void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                         int32_t iHeight);
 void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
                            const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
 void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
@@ -262,8 +264,6 @@
 //***************************************************************************//
 void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                             const uint8_t* kpABCD, int32_t iHeight);
-void McCopyWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                          int32_t iHeight);
 void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                            int32_t iHeight);
 void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
--- a/codec/common/src/mc.cpp
+++ b/codec/common/src/mc.cpp
@@ -440,7 +440,7 @@
   if (iWidth == 16)
     McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else if (iWidth == 8)
-    McCopyWidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else if (iWidth == 4)
     McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   else
@@ -729,7 +729,7 @@
                   int32_t iWidth, int32_t iHeight) {
   switch (iWidth) {
   case 16: return McCopyWidthEq16_sse3 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  case 8:  return McCopyWidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  case 8:  return McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   case 4:  return McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
   }
   return McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
--- a/codec/common/x86/mb_copy.asm
+++ b/codec/common/x86/mb_copy.asm
@@ -527,10 +527,10 @@
 %endmacro
 
 ;*******************************************************************************
-;   void McCopyWidthEq8_sse2( uint8_t *pSrc, int iSrcStride,
-;                             uint8_t *pDst, int iDstStride, int iHeight )
+;   void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
+;                            uint8_t *pDst, int iDstStride, int iHeight )
 ;*******************************************************************************
-WELS_EXTERN McCopyWidthEq8_sse2
+WELS_EXTERN McCopyWidthEq8_mmx
     %assign  push_num 0
 %ifdef X86_32
     push            r5
@@ -543,8 +543,9 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
 
-    CopyStrided4N   movsd, movsd, r2, r3, r0, r1, r4, r5, r6, xmm0, xmm1
+    CopyStrided4N   movq, movq, r2, r3, r0, r1, r4, r5, r6, mm0, mm1
 
+    WELSEMMS
     LOAD_5_PARA_POP
 %ifdef X86_32
     pop             r6