ref: 80721234bef52905fe23d7e4acc87bae29f0639c
parent: a5b53a690fc9d10a3a454ef6f5ea6b9826874ec6
author: Sindre Aamås <saamas@cisco.com>
date: Wed Jul 20 05:28:30 EDT 2016
[Common/x86] Tweak McCopyWidthEq8_mmx: ~2x speedup on Haswell.
--- a/codec/common/inc/mc.h
+++ b/codec/common/inc/mc.h
@@ -252,8 +252,6 @@
int32_t iHeight);
void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
const uint8_t* kpABCD, int32_t iHeight);
-void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- int32_t iHeight);
void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
@@ -264,6 +262,8 @@
//***************************************************************************//
void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
const uint8_t* kpABCD, int32_t iHeight);
+void McCopyWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);
void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iHeight);
void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
--- a/codec/common/src/mc.cpp
+++ b/codec/common/src/mc.cpp
@@ -440,7 +440,7 @@
if (iWidth == 16)
McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
- McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ McCopyWidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
else
--- a/codec/common/x86/mb_copy.asm
+++ b/codec/common/x86/mb_copy.asm
@@ -44,6 +44,10 @@
;*********************************************************************************************/
%include "asm_inc.asm"
+%ifdef __NASM_VER__
+ %use smartalign
+%endif
+
;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************
@@ -502,12 +506,37 @@
LOAD_7_PARA_POP
ret
+; Strided row copy, 4 rows per loop pass: load_instr=%1 store_instr=%2 p_dst=%3 i_dststride=%4 p_src=%5 i_srcstride=%6 cnt=%7 r_tmp=%8,%9 mm_tmp=%10,%11 (modifies %3, %5, %7, %8, %9, %10, %11)
+%macro CopyStrided4N 11
+ lea %8, [3 * %6] ; %8 = 3 * i_srcstride, offset of row 3 from p_src
+ lea %9, [3 * %4] ; %9 = 3 * i_dststride, offset of row 3 from p_dst
+ALIGN 32
+%%loop: ; each pass copies 4 rows, two at a time through %10 and %11
+ %1 %10, [%5] ; load source row 0
+ %1 %11, [%5 + %6] ; load source row 1
+ %2 [%3], %10 ; store destination row 0
+ %2 [%3 + %4], %11 ; store destination row 1
+ %1 %10, [%5 + 2 * %6] ; load source row 2
+ %1 %11, [%5 + %8] ; load source row 3
+ %2 [%3 + 2 * %4], %10 ; store destination row 2
+ %2 [%3 + %9], %11 ; store destination row 3
+ lea %5, [%5 + 4 * %6] ; advance p_src by 4 rows
+ lea %3, [%3 + 4 * %4] ; advance p_dst by 4 rows
+ sub %7, 4 ; cnt -= 4, sets the flags tested by the jg below
+ jg %%loop ; repeat while cnt > 0 (signed); the body always runs at least once, so cnt is effectively rounded up to a multiple of 4
+%endmacro
+
;*******************************************************************************
-; void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
-; uint8_t *pDst, int iDstStride, int iHeight )
+; void McCopyWidthEq8_sse2( const uint8_t *pSrc, int32_t iSrcStride,
+; uint8_t *pDst, int32_t iDstStride, int32_t iHeight ) -- rows are copied in groups of 4, see CopyStrided4N
;*******************************************************************************
-WELS_EXTERN McCopyWidthEq8_mmx
+WELS_EXTERN McCopyWidthEq8_sse2
%assign push_num 0
+%ifdef X86_32
+ push r5 ; r5 and r6 are handed to CopyStrided4N as scratch registers below; NOTE(review): presumably they need preserving only on x86-32 - confirm against asm_inc.asm
+ push r6
+ %assign push_num 2 ; NOTE(review): presumably lets LOAD_5_PARA account for the two pushes above - confirm in asm_inc.asm
+%endif
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -514,17 +543,13 @@
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
-ALIGN 4
-.height_loop:
- movq mm0, [r0]
- movq [r2], mm0
- add r0, r1
- add r2, r3
- dec r4
- jnz .height_loop
+ CopyStrided4N movsd, movsd, r2, r3, r0, r1, r4, r5, r6, xmm0, xmm1 ; p_dst=r2, i_dststride=r3, p_src=r0, i_srcstride=r1, cnt=r4; movsd moves 64 bits, i.e. one 8-byte row per load/store
- WELSEMMS
LOAD_5_PARA_POP
+%ifdef X86_32
+ pop r6 ; restore in reverse order of the pushes at function entry
+ pop r5
+%endif
ret