ref: 2ecbc51c2b030f84c6832696b1d465bfded932b7
parent: 80721234bef52905fe23d7e4acc87bae29f0639c
author: Sindre Aamås <saamas@cisco.com>
date: Wed Jul 20 06:42:05 EDT 2016
[Common] Add SSSE3 motion compensation routines The resulting per-block speedup is measured to be between ~1.67x and ~2.37x for horizontal filtering, between ~1.24x and ~1.53x for vertical filtering, and between ~1.33x and ~2.37x for 2-dimensional filtering on Haswell, depending on block size, as compared with the existing SSE2 routines. Outliers are 4-wide vertical and 2-dimensional filtering, with speedups of ~5.69x and ~6.84x respectively on Haswell (with clang-703.0.31), due to the C routines used previously.
--- a/codec/common/inc/mc.h
+++ b/codec/common/inc/mc.h
@@ -305,11 +305,32 @@
int32_t iWidth, int32_t iHeight);
//***************************************************************************//
-// SSSE3 definition //
+// SSE3 definition //
//***************************************************************************//
+void McCopyWidthEq16_sse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);
+//***************************************************************************//
+// SSSE3 definition //
+//***************************************************************************//
void McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
const uint8_t* kpABCD, int32_t iHeight);
+void McHorVer02_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight);
+void McHorVer02Width4S16ToU8_ssse3 (const int16_t* pSrc, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02Width5S16ToU8_ssse3 (const int16_t* pSrc, int32_t iSrcStride,
+ uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02WidthGe8S16ToU8_ssse3 (const int16_t* pSrc, int32_t iSrcStride,
+ uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
+void McHorVer20_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight);
+void McHorVer20Width4U8ToS16_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, int16_t* pDst, int32_t iHeight);
+void McHorVer20Width5Or9Or17_ssse3 (const uint8_t* pSrc, int32_t iSrcStride,
+ uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
+void McHorVer20Width8U8ToS16_ssse3 (const uint8_t* pSrc, int32_t iSrcStride,
+ int16_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer20Width9Or17U8ToS16_ssse3 (const uint8_t* pSrc, int32_t iSrcStride,
+ int16_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
#endif //X86_ASM
--- a/codec/common/src/mc.cpp
+++ b/codec/common/src/mc.cpp
@@ -44,6 +44,8 @@
#include "ls_defines.h"
#include "macros.h"
+namespace {
+
typedef void (*PMcChromaWidthExtFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
const uint8_t* kpABCD, int32_t iHeight);
typedef void (*PWelsSampleWidthAveragingFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*,
@@ -51,8 +53,6 @@
typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
-namespace WelsCommon {
-
/*------------------weight for chroma fraction pixel interpolation------------------*/
//iA = (8 - dx) * (8 - dy);
//iB = dx * (8 - dy);
@@ -710,6 +710,183 @@
McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
}
+//***************************************************************************//
+// SSSE3 implementation //
+//***************************************************************************//
+
+void PixelAvgWidth4Or8Or16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+ const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { // Width dispatcher: forwards all arguments except iWidth to one of three PixelAvgWidthEq* routines.
+ if (iWidth < 8) { // every width below 8 takes the 4-wide MMX routine
+ PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
+ } else if (iWidth == 8) {
+ PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
+ } else { // every width above 8 takes the 16-wide SSE2 routine
+ PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
+ }
+}
+
+void McCopy_sse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // Width dispatcher over the McCopyWidthEq* routines; iWidth itself is not forwarded.
+ switch (iWidth) {
+ case 16: return McCopyWidthEq16_sse3 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ case 8: return McCopyWidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ case 4: return McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ }
+ return McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); // any width other than 16, 8 or 4 ends up here
+}
+
+void McHorVer22_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // Two-pass filter: a U8ToS16 horizontal pass into pTmp, then an S16ToU8 vertical pass into pDst.
+ ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 16 + 5, 8, 16); // NOTE(review): presumably a 16-byte-aligned int16_t scratch of (16 + 5) rows x 8 columns - confirm against the macro definition
+ if (iWidth < 8) { // 4-wide variants take no intermediate stride argument
+ McHorVer20Width4U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], iHeight + 5); // the horizontal pass is asked for iHeight + 5 rows
+ McHorVer02Width4S16ToU8_ssse3 (&pTmp[0][0], pDst, iDstStride, iHeight);
+ } else if (iWidth == 8) {
+ McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5); // sizeof *pTmp is passed as the intermediate stride
+ McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
+ } else { // wider blocks: two independent 8-wide halves that reuse the same scratch buffer
+ McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
+ McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, 8, iHeight);
+ McHorVer20Width8U8ToS16_ssse3 (pSrc + 8, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5); // second half starts 8 pixels to the right
+ McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst + 8, iDstStride, 8, iHeight);
+ }
+}
+
+void McHorVer01_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // pDst = PixelAvg of the block at pSrc and its McHorVer02 result.
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
+ McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // McHorVer02 output lands in pTmp; sizeof *pTmp is passed as its stride
+ PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
+ &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
+}
+
+void McHorVer03_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // pDst = PixelAvg of the block one row below pSrc and the McHorVer02 result for pSrc.
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
+ McHorVer02_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // McHorVer02 output lands in pTmp; sizeof *pTmp is passed as its stride
+ PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, // pSrc + iSrcStride: source shifted down by one row
+ &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
+}
+
+void McHorVer10_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // pDst = PixelAvg of the block at pSrc and its McHorVer20 result.
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
+ McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // McHorVer20 output lands in pTmp; sizeof *pTmp is passed as its stride
+ PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride,
+ &pTmp[0][0], sizeof *pTmp, iWidth, iHeight);
+}
+
+void McHorVer11_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // pDst = PixelAvg of the McHorVer20 and McHorVer02 results, both taken at pSrc.
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
+ McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // McHorVer20 output into pHorTmp
+ McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // McHorVer02 output into pVerTmp
+ PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+ &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+}
+
+void McHorVer12_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // pDst = PixelAvg of the McHorVer02 and McHorVer22 results, both taken at pSrc.
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
+ McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // McHorVer02 output into pVerTmp
+ McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight); // McHorVer22 output into pCtrTmp
+ PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
+ &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+}
+
+void McHorVer13_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // pDst = PixelAvg of McHorVer20 taken one row below pSrc and McHorVer02 taken at pSrc.
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
+ McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // pSrc + iSrcStride: source shifted down by one row
+ McHorVer02_ssse3 (pSrc, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+ PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+ &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+}
+
+void McHorVer21_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // pDst = PixelAvg of the McHorVer20 and McHorVer22 results, both taken at pSrc.
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
+ McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // McHorVer20 output into pHorTmp
+ McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight); // McHorVer22 output into pCtrTmp
+ PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+ &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+}
+
+void McHorVer23_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // pDst = PixelAvg of McHorVer20 taken one row below pSrc and McHorVer22 taken at pSrc.
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
+ McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // pSrc + iSrcStride: source shifted down by one row
+ McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+ PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+ &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+}
+
+void McHorVer30_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // pDst = PixelAvg of the block one column right of pSrc and the McHorVer20 result for pSrc.
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pTmp, 16, 16, 16);
+ McHorVer20_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // McHorVer20 output lands in pTmp; sizeof *pTmp is passed as its stride
+ PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight); // pSrc + 1: source shifted right by one pixel
+}
+
+void McHorVer31_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // pDst = PixelAvg of McHorVer20 taken at pSrc and McHorVer02 taken one column right of pSrc.
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
+ McHorVer20_ssse3 (pSrc, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight);
+ McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // pSrc + 1: source shifted right by one pixel
+ PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+ &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+}
+
+void McHorVer32_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // pDst = PixelAvg of McHorVer02 taken one column right of pSrc and McHorVer22 taken at pSrc.
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pCtrTmp, 16, 16, 16);
+ McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // pSrc + 1: source shifted right by one pixel
+ McHorVer22_ssse3 (pSrc, iSrcStride, &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+ PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pVerTmp[0][0], sizeof *pVerTmp,
+ &pCtrTmp[0][0], sizeof *pCtrTmp, iWidth, iHeight);
+}
+
+void McHorVer33_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // pDst = PixelAvg of McHorVer20 taken one row below pSrc and McHorVer02 taken one column right of pSrc.
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pHorTmp, 16, 16, 16);
+ ENFORCE_STACK_ALIGN_2D (uint8_t, pVerTmp, 16, 16, 16);
+ McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, &pHorTmp[0][0], sizeof *pHorTmp, iWidth, iHeight); // pSrc + iSrcStride: source shifted down by one row
+ McHorVer02_ssse3 (pSrc + 1, iSrcStride, &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight); // pSrc + 1: source shifted right by one pixel
+ PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, &pHorTmp[0][0], sizeof *pHorTmp,
+ &pVerTmp[0][0], sizeof *pVerTmp, iWidth, iHeight);
+}
+
+void McHorVer22Width5Or9Or17_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) { // Two-pass filter for the widths named in the function: U8ToS16 horizontal pass into pTmp, then S16ToU8 vertical pass into pDst.
+ ENFORCE_STACK_ALIGN_2D (int16_t, pTmp, 17 + 5, WELS_ALIGN(17, 16 / sizeof (int16_t)), 16); // explicit ';' so the statement does not depend on how the macro expansion ends
+ if (iWidth > 5) { // widths above 5 (9 or 17 per the function name) use the 9-or-17 horizontal pass
+ McHorVer20Width9Or17U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iWidth, iHeight + 5); // the horizontal pass is asked for iHeight + 5 rows
+ McHorVer02WidthGe8S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iWidth, iHeight);
+ } else { // width 5: 8-wide horizontal pass followed by the 5-wide vertical pass
+ McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, &pTmp[0][0], sizeof *pTmp, iHeight + 5);
+ McHorVer02Width5S16ToU8_ssse3 (&pTmp[0][0], sizeof *pTmp, pDst, iDstStride, iHeight);
+ }
+}
+
+void McLuma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { // Selects one of 16 routines from the low two bits of iMvX and iMvY and forwards the remaining arguments.
+ static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { // entry [x][y] is McHorVer<x><y>_ssse3, except [0][0], which is McCopy_sse3
+ {McCopy_sse3, McHorVer01_ssse3, McHorVer02_ssse3, McHorVer03_ssse3},
+ {McHorVer10_ssse3, McHorVer11_ssse3, McHorVer12_ssse3, McHorVer13_ssse3},
+ {McHorVer20_ssse3, McHorVer21_ssse3, McHorVer22_ssse3, McHorVer23_ssse3},
+ {McHorVer30_ssse3, McHorVer31_ssse3, McHorVer32_ssse3, McHorVer33_ssse3},
+ };
+
+ pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); // first index from iMvX & 3, second from iMvY & 3
+}
+
void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
@@ -1319,7 +1496,9 @@
}
#endif
-void InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {
+} // anon ns.
+
+void WelsCommon::InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {
pMcFuncs->pfLumaHalfpelHor = McHorVer20_c;
pMcFuncs->pfLumaHalfpelVer = McHorVer02_c;
pMcFuncs->pfLumaHalfpelCen = McHorVer22_c;
@@ -1338,7 +1517,11 @@
}
if (uiCpuFlag & WELS_CPU_SSSE3) {
+ pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_ssse3;
+ pMcFuncs->pfLumaHalfpelVer = McHorVer02_ssse3;
+ pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17_ssse3;
pMcFuncs->pMcChromaFunc = McChroma_ssse3;
+ pMcFuncs->pMcLumaFunc = McLuma_ssse3;
}
#endif //(X86_ASM)
@@ -1363,4 +1546,3 @@
}
#endif
}
-} // namespace WelsCommon
--- a/codec/common/x86/mb_copy.asm
+++ b/codec/common/x86/mb_copy.asm
@@ -587,3 +587,28 @@
LOAD_5_PARA_POP
ret
+
+
+;*******************************************************************************
+; void McCopyWidthEq16_sse3( const uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+WELS_EXTERN McCopyWidthEq16_sse3
+ %assign push_num 0
+%ifdef X86_32
+ push r5 ; r5 and r6 are saved only on 32-bit builds
+ push r6
+ %assign push_num 2
+%endif
+ LOAD_5_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+
+ CopyStrided4N lddqu, MOVDQ, r2, r3, r0, r1, r4, r5, r6, xmm0, xmm1 ; NOTE(review): whole copy is delegated to this macro (defined outside this hunk); presumably lddqu loads, MOVDQ stores - confirm
+
+ LOAD_5_PARA_POP
+%ifdef X86_32
+ pop r6 ; restored in reverse order of the pushes above
+ pop r5
+%endif
+ ret
--- a/codec/common/x86/mc_luma.asm
+++ b/codec/common/x86/mc_luma.asm
@@ -51,9 +51,28 @@
;*******************************************************************************
ALIGN 16
-h264_w0x10:
- dw 16, 16, 16, 16
-ALIGN 16
+shufb_32435465768798A9:
+ db 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9
+shufb_011267784556ABBC:
+ db 0, 1, 1, 2, 6, 7, 7, 8, 4, 5, 5, 6, 0Ah, 0Bh, 0Bh, 0Ch
+maddubsw_p1m5_p1m5_m5p1_m5p1_128:
+ times 2 db 1, -5, 1, -5, -5, 1, -5, 1
+maddubsw_m2p10_m40m40_p10m2_p0p0_128:
+ times 2 db -2, 10, -40, -40, 10, -2, 0, 0
+dwm1024_128:
+ times 8 dw -1024
+dd32768_128:
+ times 4 dd 32768
+maddubsw_p1m5_128:
+ times 8 db 1, -5
+maddubsw_m5p1_128:
+ times 8 db -5, 1
+db20_128:
+ times 16 db 20
+maddubsw_m5p20_128:
+ times 8 db -5, 20
+maddubsw_p20m5_128:
+ times 8 db 20, -5
h264_w0x10_1:
dw 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 16
@@ -85,7 +104,7 @@
sub r0, 2
WELS_Zero mm7
- movq mm6, [h264_w0x10]
+ movq mm6, [h264_w0x10_1]
.height_loop:
movd mm0, [r0]
punpcklbw mm0, mm7
@@ -1746,3 +1765,1112 @@
LOAD_6_PARA_POP
ret
+
+; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7
+%macro SSSE3_FilterVertical_8px 7
+ pmaddubsw %1, %4 ; %1 = px_ab byte pairs weighted by maddubsw_ab and summed into words
+ movdqa %7, %2 ; work on a copy so px_cd is left intact
+ pmaddubsw %7, %5
+ paddw %1, %7 ; accumulate the cd contribution
+ movdqa %7, %3 ; work on a copy so px_ef is left intact
+ pmaddubsw %7, %6
+ paddw %1, %7 ; accumulate the ef contribution
+ paddw %1, [h264_w0x10_1] ; add 16 to every word before the shift
+ psraw %1, 5 ; arithmetic shift right by 5; result is left in %1 as eight words
+%endmacro
+
+; px_a=%1 px_f=%2 px_bc=%3 px_de=%4 maddubsw_bc=%5 maddubsw_de=%6 tmp=%7,%8
+%macro SSSE3_FilterVertical2_8px 8
+ movdqa %8, %2 ; copy px_f so %2 is left intact
+ pxor %7, %7 ; zero register used for byte-to-word widening
+ punpcklbw %1, %7 ; low 8 bytes of px_a zero-extended to words
+ punpcklbw %8, %7 ; low 8 bytes of px_f zero-extended to words
+ paddw %1, %8 ; %1 = a + f
+ movdqa %7, %3 ; work on a copy so px_bc is left intact
+ pmaddubsw %7, %5
+ paddw %1, %7 ; accumulate the bc contribution
+ movdqa %7, %4 ; work on a copy so px_de is left intact
+ pmaddubsw %7, %6
+ paddw %1, %7 ; accumulate the de contribution
+ paddw %1, [h264_w0x10_1] ; add 16 to every word before the shift
+ psraw %1, 5 ; arithmetic shift right by 5; result is left in %1 as eight words
+%endmacro
+
+; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
+%macro SSSE3_FilterHorizontalbw_8px 6
+ movdqa %5, %1 ; keep a copy of the source bytes for the second shuffle
+ pshufb %1, %2 ; %1 = source bytes rearranged by the first shuffle mask
+ pshufb %5, %3 ; %5 = source bytes rearranged by the second shuffle mask
+ pshufd %6, %1, 10110001b ; %6 = %1 with the two dwords of each qword swapped
+ pmaddubsw %1, [db20_128] ; byte pairs weighted by 20 and summed into words
+ pmaddubsw %5, %4
+ pmaddubsw %6, %4
+ paddw %1, %5
+ paddw %1, %6 ; word sums are left in %1; this macro applies no rounding or shift
+%endmacro
+
+; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
+%macro SSSE3_FilterHorizontal_8px 6
+ SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6 ; unrounded word sums into %1
+ paddw %1, [h264_w0x10_1] ; add 16 to every word before the shift
+ psraw %1, 5 ; arithmetic shift right by 5
+%endmacro
+
+; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
+%macro SSSE3_FilterHorizontalbw_2x4px 7
+ movdqa %6, %1 ; copies of both inputs for the second shuffle
+ movdqa %7, %2
+ pshufb %1, %3
+ pshufb %2, %3 ; note: px1 (%2) is modified in place
+ punpcklqdq %1, %2 ; low qword from px0, high qword from px1
+ pshufb %6, %4
+ pshufb %7, %4
+ punpcklqdq %6, %7 ; same low/high pairing for the second shuffle
+ pshufd %7, %1, 10110001b ; %7 = %1 with the two dwords of each qword swapped
+ pmaddubsw %1, [db20_128] ; byte pairs weighted by 20 and summed into words
+ pmaddubsw %6, %5
+ pmaddubsw %7, %5
+ paddw %1, %6
+ paddw %1, %7 ; word sums are left in %1; this macro applies no rounding or shift
+%endmacro
+
+; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
+%macro SSSE3_FilterHorizontal_2x4px 7
+ SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7 ; unrounded word sums into %1
+ paddw %1, [h264_w0x10_1] ; add 16 to every word before the shift
+ psraw %1, 5 ; arithmetic shift right by 5
+%endmacro
+
+; pixels=%1 -32768>>scale=%2 tmp=%3
+%macro SSSE3_FilterHorizontalbw_2px 3
+ pmaddubsw %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_128] ; byte pairs weighted by (-2,10), (-40,-40), (10,-2), (0,0) and summed into words
+ pmaddwd %1, %2 ; words multiplied by %2 and pairwise summed into dwords
+ pshufd %3, %1, 10110001b ; %3 = %1 with the two dwords of each qword swapped
+ paddd %1, %3 ; each dword now holds the sum of its qword's two dwords
+%endmacro
+
+; pixels=%1 tmp=%2
+%macro SSSE3_FilterHorizontal_2px 2
+ SSSE3_FilterHorizontalbw_2px %1, [dwm1024_128], %2 ; scale operand is eight words of -1024
+ paddd %1, [dd32768_128] ; add 32768 to every dword
+%endmacro
+
+; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
+%macro SSE2_FilterVerticalw_8px 7
+ paddw %1, %6 ; %1 = px0 + px5
+ movdqa %7, %2
+ paddw %7, %5 ; %7 = px1 + px4
+ psubw %1, %7 ; %1 = (px0 + px5) - (px1 + px4)
+ psraw %1, 2 ; first arithmetic shift right by 2
+ psubw %1, %7 ; subtract (px1 + px4) once more
+ movdqa %7, %3
+ paddw %7, %4 ; %7 = px2 + px3
+ paddw %1, %7 ; add (px2 + px3)
+ psraw %1, 2 ; second arithmetic shift right by 2
+ paddw %7, [h264_mc_hc_32] ; NOTE(review): constant is defined outside this hunk; presumably words of 32 used as rounding - confirm
+ paddw %1, %7 ; add (px2 + px3) plus the constant
+ psraw %1, 6 ; final arithmetic shift right by 6; result is left in %1, only %7 is clobbered
+%endmacro
+
+;***********************************************************************
+; void McHorVer02_ssse3(const uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight)
+;***********************************************************************
+
+WELS_EXTERN McHorVer02_ssse3
+%define p_src r0
+%define i_srcstride r1
+%define p_dst r2
+%define i_dststride r3
+%define i_width r4
+%define i_height r5
+%define i_srcstride3 r6
+ %assign push_num 0
+%ifdef X86_32
+ push r6 ; r6 is saved only on 32-bit builds
+ %assign push_num 1
+%endif
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+ sub p_src, i_srcstride ; move the read pointer two rows above pSrc
+ sub p_src, i_srcstride
+ lea i_srcstride3, [3 * i_srcstride] ; cached 3 * stride for addressing the fourth row
+ cmp i_width, 4
+ jg .width8or16 ; widths above 4 use the 8-column loop below
+ movd xmm0, [p_src] ; width-4 path: 4 bytes per row, consecutive rows interleaved bytewise
+ movd xmm4, [p_src + i_srcstride]
+ punpcklbw xmm0, xmm4
+ movd xmm1, [p_src + 2 * i_srcstride]
+ punpcklbw xmm4, xmm1
+ punpcklqdq xmm0, xmm4 ; low qword: rows 0/1 interleaved, high qword: rows 1/2 interleaved
+ movd xmm4, [p_src + i_srcstride3]
+ lea p_src, [p_src + 4 * i_srcstride]
+ punpcklbw xmm1, xmm4
+ movd xmm2, [p_src]
+ punpcklbw xmm4, xmm2
+ punpcklqdq xmm1, xmm4
+ movd xmm4, [p_src + i_srcstride]
+ lea p_src, [p_src + 2 * i_srcstride]
+ punpcklbw xmm2, xmm4
+ movd xmm3, [p_src]
+ punpcklbw xmm4, xmm3
+ punpcklqdq xmm2, xmm4
+ movdqa xmm5, [db20_128] ; xmm5 holds the middle-pair coefficients for every width-4 filter call
+ SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+ packuswb xmm0, xmm0 ; saturate words to unsigned bytes
+ movd [p_dst], xmm0 ; first 4 result bytes go to one output row
+ psrlq xmm0, 32
+ movd [p_dst + i_dststride], xmm0 ; next 4 result bytes go to the following row
+ lea p_dst, [p_dst + 2 * i_dststride]
+ movd xmm4, [p_src + i_srcstride]
+ punpcklbw xmm3, xmm4
+ movd xmm0, [p_src + 2 * i_srcstride]
+ punpcklbw xmm4, xmm0
+ punpcklqdq xmm3, xmm4
+ SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+ packuswb xmm1, xmm1
+ movd [p_dst], xmm1
+ psrlq xmm1, 32
+ movd [p_dst + i_dststride], xmm1
+ cmp i_height, 5
+ jl .width4_height_le5_done ; height below 5: the four rows written so far are all
+ lea p_dst, [p_dst + 2 * i_dststride]
+ movd xmm4, [p_src + i_srcstride3]
+ punpcklbw xmm0, xmm4
+ jg .width4_height_ge8 ; still tests the flags of "cmp i_height, 5"; lea/movd/punpcklbw leave flags untouched
+ SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+ packuswb xmm2, xmm2
+ movd [p_dst], xmm2 ; height == 5: write the single fifth row
+.width4_height_le5_done:
+ POP_XMM
+ LOAD_6_PARA_POP
+%ifdef X86_32
+ pop r6
+%endif
+ ret
+.width4_height_ge8:
+ lea p_src, [p_src + 4 * i_srcstride]
+ movd xmm1, [p_src]
+ punpcklbw xmm4, xmm1
+ punpcklqdq xmm0, xmm4
+ SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+ packuswb xmm2, xmm2
+ movd [p_dst], xmm2
+ psrlq xmm2, 32
+ movd [p_dst + i_dststride], xmm2
+ lea p_dst, [p_dst + 2 * i_dststride]
+ movd xmm4, [p_src + i_srcstride]
+ punpcklbw xmm1, xmm4
+ movd xmm2, [p_src + 2 * i_srcstride]
+ punpcklbw xmm4, xmm2
+ punpcklqdq xmm1, xmm4
+ SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+ packuswb xmm3, xmm3
+ movd [p_dst], xmm3
+ psrlq xmm3, 32
+ movd [p_dst + i_dststride], xmm3
+ cmp i_height, 9
+ jl .width4_height_ge8_done ; height below 9: eight rows written, done
+ lea p_dst, [p_dst + 2 * i_dststride]
+ movd xmm4, [p_src + i_srcstride3]
+ punpcklbw xmm2, xmm4
+ SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+ packuswb xmm0, xmm0
+ movd [p_dst], xmm0 ; height of 9 or more: write one extra (ninth) row
+.width4_height_ge8_done:
+ POP_XMM
+ LOAD_6_PARA_POP
+%ifdef X86_32
+ pop r6
+%endif
+ ret
+
+.width8or16:
+ sub i_height, 1
+ push i_height ; keep (height - 1) on the stack for reloading per column
+%xdefine i_ycnt i_height
+%define i_height [r7]
+.xloop:
+ push p_src ; column start pointers are saved and restored around the row loop
+ push p_dst
+ test i_ycnt, 1
+ jnz .yloop_begin_even ; (height - 1) odd, i.e. even height: skip the single-row prologue
+ movq xmm0, [p_src] ; odd height: produce one output row before entering the paired-row loop
+ movq xmm1, [p_src + i_srcstride]
+ punpcklbw xmm0, xmm1
+ movq xmm2, [p_src + 2 * i_srcstride]
+ movq xmm3, [p_src + i_srcstride3]
+ lea p_src, [p_src + 4 * i_srcstride]
+ punpcklbw xmm2, xmm3
+ movq xmm4, [p_src]
+ movq xmm5, [p_src + i_srcstride]
+ lea p_src, [p_src + 2 * i_srcstride]
+ punpcklbw xmm4, xmm5
+ SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm7
+ packuswb xmm0, xmm0
+ movlps [p_dst], xmm0
+ add p_dst, i_dststride
+ jmp .yloop
+.yloop_begin_even:
+ movq xmm1, [p_src] ; even height: preload five rows without emitting output
+ movq xmm2, [p_src + i_srcstride]
+ movq xmm3, [p_src + 2 * i_srcstride]
+ add p_src, i_srcstride3
+ punpcklbw xmm2, xmm3
+ movq xmm4, [p_src]
+ movq xmm5, [p_src + i_srcstride]
+ lea p_src, [p_src + 2 * i_srcstride]
+ punpcklbw xmm4, xmm5
+.yloop:
+ movq xmm6, [p_src] ; each pass through the loop body emits up to eight rows, two per store pair
+ SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm0, xmm7
+ movq xmm7, [p_src + i_srcstride]
+ punpcklbw xmm6, xmm7
+ SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm0
+ packuswb xmm1, xmm2 ; two result rows packed into one register
+ movlps [p_dst], xmm1 ; low 8 bytes to one row
+ movhps [p_dst + i_dststride], xmm1 ; high 8 bytes to the next row
+ lea p_dst, [p_dst + 2 * i_dststride]
+ movq xmm0, [p_src + 2 * i_srcstride]
+ SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm2, xmm1
+ movq xmm1, [p_src + i_srcstride3]
+ lea p_src, [p_src + 4 * i_srcstride]
+ punpcklbw xmm0, xmm1
+ SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm2
+ packuswb xmm3, xmm4
+ movlps [p_dst], xmm3
+ movhps [p_dst + i_dststride], xmm3
+ cmp i_ycnt, 4
+ jle .yloop_exit ; counter at 4 or below: stop after four rows in this pass
+ lea p_dst, [p_dst + 2 * i_dststride]
+ movq xmm2, [p_src]
+ SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm4, xmm3
+ movq xmm3, [p_src + i_srcstride]
+ punpcklbw xmm2, xmm3
+ SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm4
+ packuswb xmm5, xmm6
+ movlps [p_dst], xmm5
+ movhps [p_dst + i_dststride], xmm5
+ lea p_dst, [p_dst + 2 * i_dststride]
+ movq xmm4, [p_src + 2 * i_srcstride]
+ SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm6, xmm5
+ movq xmm5, [p_src + i_srcstride3]
+ lea p_src, [p_src + 4 * i_srcstride]
+ punpcklbw xmm4, xmm5
+ SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm6
+ packuswb xmm7, xmm0
+ movlps [p_dst], xmm7
+ movhps [p_dst + i_dststride], xmm7
+ lea p_dst, [p_dst + 2 * i_dststride]
+ sub i_ycnt, 8 ; eight rows consumed per full pass
+ jg .yloop
+.yloop_exit:
+ pop p_dst ; back to the top of the current 8-pixel column
+ pop p_src
+ sub i_width, 8
+ jle .width8or16_done
+ add p_src, 8 ; advance to the next 8-pixel column
+ add p_dst, 8
+ mov i_ycnt, i_height ; reload (height - 1) from the stack slot saved above
+ jmp .xloop
+.width8or16_done:
+ pop i_ycnt ; drops the saved (height - 1) from the stack
+ POP_XMM
+ LOAD_6_PARA_POP
+%ifdef X86_32
+ pop r6
+%endif
+ ret
+%undef p_src
+%undef i_srcstride
+%undef i_srcstride3
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+%undef i_ycnt
+
+
+;*******************************************************************************
+; void McHorVer20_ssse3(const uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
+; int iWidth,
+; int iHeight);
+;*******************************************************************************
+
+WELS_EXTERN McHorVer20_ssse3
+%define p_src r0
+%define i_srcstride r1
+%define p_dst r2
+%define i_dststride r3
+%define i_width r4
+%define i_height r5
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+ movdqa xmm4, [shufb_32435465768798A9] ; shuffle masks and coefficients stay in xmm4-xmm6 for all loops
+ movdqa xmm5, [shufb_011267784556ABBC]
+ movdqa xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+ cmp i_width, 8
+ je .width8_yloop
+ jg .width16_yloop ; widths below 8 fall through to the 4-wide loop
+.width4_yloop:
+ movdqu xmm0, [p_src - 2] ; loads start two bytes left of the output position
+ movdqu xmm1, [p_src + i_srcstride - 2]
+ lea p_src, [p_src + 2 * i_srcstride]
+ SSSE3_FilterHorizontal_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
+ packuswb xmm0, xmm0 ; saturate words to unsigned bytes
+ movd [p_dst], xmm0 ; 4 bytes for one row
+ psrlq xmm0, 32
+ movd [p_dst + i_dststride], xmm0 ; 4 bytes for the next row
+ lea p_dst, [p_dst + 2 * i_dststride]
+ sub i_height, 2 ; two rows per iteration
+ jg .width4_yloop
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+.width8_yloop:
+ movdqu xmm0, [p_src - 2]
+ movdqu xmm1, [p_src + i_srcstride - 2]
+ lea p_src, [p_src + 2 * i_srcstride]
+ SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
+ SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
+ packuswb xmm0, xmm1 ; two 8-pixel rows packed into one register
+ movlps [p_dst], xmm0
+ movhps [p_dst + i_dststride], xmm0
+ lea p_dst, [p_dst + 2 * i_dststride]
+ sub i_height, 2 ; two rows per iteration
+ jg .width8_yloop
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+.width16_yloop:
+ movdqu xmm0, [p_src - 2] ; left 8 output pixels
+ movdqu xmm1, [p_src + 6] ; right 8 output pixels (8 bytes further along the same row)
+ add p_src, i_srcstride
+ SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
+ SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
+ packuswb xmm0, xmm1
+ MOVDQ [p_dst], xmm0
+ add p_dst, i_dststride
+ sub i_height, 1 ; one row per iteration
+ jg .width16_yloop
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+
+
+;***********************************************************************
+; void McHorVer20Width5Or9Or17_ssse3(const uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight);
+;***********************************************************************
+
+WELS_EXTERN McHorVer20Width5Or9Or17_ssse3
+%define p_src r0
+%define i_srcstride r1
+%define p_dst r2
+%define i_dststride r3
+%define i_width r4
+%define i_height r5
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+ movdqa xmm5, [shufb_32435465768798A9] ; shuffle masks and coefficients stay in xmm5-xmm7 for all loops
+ movdqa xmm6, [shufb_011267784556ABBC]
+ movdqa xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+ cmp i_width, 9
+ je .width9_yloop
+ jg .width17_yloop ; widths below 9 fall through to the 5-wide loop
+.width5_yloop:
+ movdqu xmm0, [p_src - 2] ; loads start two bytes left of the output position
+ add p_src, i_srcstride
+ SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
+ packuswb xmm0, xmm0
+ movdqa xmm1, xmm0
+ psrlq xmm1, 8 ; copy of the result shifted down by one byte
+ movd [p_dst], xmm0 ; bytes 0..3
+ movd [p_dst + 1], xmm1 ; bytes 1..4: the two overlapping 4-byte stores cover 5 bytes
+ add p_dst, i_dststride
+ sub i_height, 1 ; one row per iteration
+ jg .width5_yloop
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+.width9_yloop:
+ movdqu xmm0, [p_src - 2]
+ movdqu xmm4, [p_src + i_srcstride - 2]
+ lea p_src, [p_src + 2 * i_srcstride]
+ movdqa xmm3, xmm0
+ punpckhqdq xmm3, xmm4 ; high qwords of both rows gathered for the 2px filter
+ SSSE3_FilterHorizontal_2px xmm3, xmm2
+ SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
+ packuswb xmm3, xmm0
+ movd [p_dst + 5], xmm3 ; writes bytes 5..8 first ...
+ movhps [p_dst], xmm3 ; ... then bytes 0..7 overwrite 5..7, so only byte 8 of the first store survives
+ add p_dst, i_dststride
+ SSSE3_FilterHorizontal_8px xmm4, xmm5, xmm6, xmm7, xmm1, xmm2
+ packuswb xmm4, xmm4
+ psrldq xmm3, 4 ; bring the second row's 2px-filter bytes to the bottom of the register
+ movd [p_dst + 5], xmm3
+ movlps [p_dst], xmm4 ; same overlapping-store order for the second row
+ add p_dst, i_dststride
+ sub i_height, 2 ; two rows per iteration
+ jg .width9_yloop
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+.width17_yloop:
+ movdqu xmm0, [p_src - 2]
+ movdqu xmm3, [p_src + 6]
+ add p_src, i_srcstride
+ movdqa xmm4, xmm3 ; keep the unfiltered right-hand load of the first row
+ SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
+ SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2
+ packuswb xmm0, xmm3
+ movdqu xmm1, [p_src - 2] ; second row of the pair
+ movdqu xmm3, [p_src + 6]
+ add p_src, i_srcstride
+ punpckhqdq xmm4, xmm3 ; high qwords of both rows' right-hand loads gathered for the 2px filter
+ SSSE3_FilterHorizontal_2px xmm4, xmm2
+ packuswb xmm4, xmm4
+ movd [p_dst + 13], xmm4 ; writes bytes 13..16 first ...
+ MOVDQ [p_dst], xmm0 ; ... then the store at p_dst follows (NOTE(review): presumably a 16-byte store leaving only byte 16 - confirm MOVDQ)
+ add p_dst, i_dststride
+ psrldq xmm4, 4
+ movd [p_dst + 13], xmm4 ; second row: tail bytes stored before the main store below
+ SSSE3_FilterHorizontal_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm2
+ SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm0, xmm2
+ packuswb xmm1, xmm3
+ MOVDQ [p_dst], xmm1
+ add p_dst, i_dststride
+ sub i_height, 2 ; two rows per iteration
+ jg .width17_yloop
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+
+
+;*******************************************************************************
+; void McHorVer20Width4U8ToS16_ssse3(const uint8_t *pSrc,
+; int iSrcStride,
+; int16_t *pDst,
+; int iHeight);
+;*******************************************************************************
+
+WELS_EXTERN McHorVer20Width4U8ToS16_ssse3
+%define p_src r0
+%define i_srcstride r1
+%define p_dst r2
+%define i_height r3
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ sub p_src, i_srcstride ; move the read pointer two rows above pSrc
+ sub p_src, i_srcstride
+ movdqa xmm4, [shufb_32435465768798A9]
+ movdqa xmm5, [shufb_011267784556ABBC]
+ movdqa xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+ sub i_height, 1 ; the loop handles row pairs; the last row is done after the loop
+.yloop:
+ movdqu xmm0, [p_src - 2]
+ movdqu xmm1, [p_src + i_srcstride - 2]
+ lea p_src, [p_src + 2 * i_srcstride]
+ SSSE3_FilterHorizontalbw_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
+ movdqa [p_dst], xmm0 ; aligned 16-byte store of the unrounded word sums for two rows
+ add p_dst, 16 ; output is packed contiguously, 16 bytes per row pair
+ sub i_height, 2
+ jg .yloop
+ ; Height % 2 remainder.
+ movdqu xmm0, [p_src - 2]
+ SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
+ movlps [p_dst], xmm0 ; only the low 8 bytes are stored for the final row
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_height
+
+
+;***********************************************************************
+; void McHorVer02Width4S16ToU8_ssse3(const int16_t *pSrc,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; int32_t iHeight);
+;***********************************************************************
+
+WELS_EXTERN McHorVer02Width4S16ToU8_ssse3
+%define p_src r0
+%define p_dst r1
+%define i_dststride r2
+%define i_height r3
+%define i_srcstride 8
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm0, [p_src + 0 * i_srcstride] ; source rows are 8 bytes apart, so each 16-byte load spans two consecutive rows
+ movdqu xmm1, [p_src + 1 * i_srcstride] ; odd multiples of 8 use unaligned loads, even multiples use aligned loads
+ movdqa xmm2, [p_src + 2 * i_srcstride]
+ movdqu xmm3, [p_src + 3 * i_srcstride]
+ movdqa xmm4, [p_src + 4 * i_srcstride]
+ movdqu xmm5, [p_src + 5 * i_srcstride]
+ movdqa xmm6, [p_src + 6 * i_srcstride]
+ SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
+ packuswb xmm0, xmm0 ; saturate words to unsigned bytes
+ movd [p_dst], xmm0 ; 4 bytes for one output row
+ psrlq xmm0, 32
+ movd [p_dst + i_dststride], xmm0 ; 4 bytes for the next output row
+ lea p_dst, [p_dst + 2 * i_dststride]
+ movdqu xmm7, [p_src + 7 * i_srcstride]
+ movdqa xmm0, [p_src + 8 * i_srcstride]
+ SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm1
+ packuswb xmm2, xmm2
+ movd [p_dst], xmm2
+ psrlq xmm2, 32
+ movd [p_dst + i_dststride], xmm2
+ cmp i_height, 4
+ jle .done ; height of 4 or less: the four rows written so far are all
+ lea p_dst, [p_dst + 2 * i_dststride]
+ movdqu xmm1, [p_src + 9 * i_srcstride]
+ movdqa xmm2, [p_src + 10 * i_srcstride]
+ SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm3
+ packuswb xmm4, xmm4
+ movd [p_dst], xmm4
+ psrlq xmm4, 32
+ movd [p_dst + i_dststride], xmm4
+ lea p_dst, [p_dst + 2 * i_dststride]
+ movdqu xmm3, [p_src + 11 * i_srcstride]
+ SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm5
+ packuswb xmm6, xmm6
+ movd [p_dst], xmm6
+ psrlq xmm6, 32
+ movd [p_dst + i_dststride], xmm6 ; eighth and last output row on this path
+.done:
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
+%undef p_src
+%undef p_dst
+%undef i_dststride
+%undef i_height
+%undef i_srcstride
+
+
+;***********************************************************************
+; void McHorVer20Width8U8ToS16_ssse3(const uint8_t *pSrc,
+; int16_t iSrcStride,
+; int16_t *pDst,
+; int32_t iDstStride,
+; int32_t iHeight);
+;***********************************************************************
+
+WELS_EXTERN McHorVer20Width8U8ToS16_ssse3
+%define p_src r0
+%define i_srcstride r1
+%define p_dst r2
+%define i_dststride r3
+%define i_height r4
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ sub p_src, i_srcstride
+ sub p_src, i_srcstride
+ movdqa xmm4, [shufb_32435465768798A9]
+ movdqa xmm5, [shufb_011267784556ABBC]
+ movdqa xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+ sub i_height, 1
+.yloop:
+ movdqu xmm0, [p_src - 2]
+ movdqu xmm1, [p_src + i_srcstride - 2]
+ lea p_src, [p_src + 2 * i_srcstride]
+ SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
+ MOVDQ [p_dst], xmm0
+ add p_dst, i_dststride
+ SSSE3_FilterHorizontalbw_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
+ MOVDQ [p_dst], xmm1
+ add p_dst, i_dststride
+ sub i_height, 2
+ jg .yloop
+ jl .done
+ movdqu xmm0, [p_src - 2]
+ SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
+ MOVDQ [p_dst], xmm0
+.done:
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_height
+
+
+;***********************************************************************
+; void McHorVer02Width5S16ToU8_ssse3(const int16_t *pSrc, -- every row is read with movdqa: pointer and stride must keep 16-byte alignment
+; int32_t iSrcStride, -- source row pitch in bytes
+; uint8_t *pDst, -- 5 bytes are stored per row
+; int32_t iDstStride, -- output row pitch in bytes
+; int32_t iHeight); -- <= 5 yields 5 rows, anything larger yields 9
+;***********************************************************************
+; Fully unrolled vertical pass: SSE2_FilterVerticalw_8px (defined outside this excerpt) over a sliding window of six source rows held in xmm0-xmm7.
+WELS_EXTERN McHorVer02Width5S16ToU8_ssse3
+%define p_src r0
+%define i_srcstride r1
+%define p_dst r2
+%define i_dststride r3
+%define i_height r4
+%define i_srcstride3 r5
+ %assign push_num 0
+%ifdef X86_32
+ push r5 ; r5 carries i_srcstride3, beyond the five argument registers; saved by hand on 32-bit builds
+ %assign push_num 1
+%endif
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ lea i_srcstride3, [3 * i_srcstride]
+ movdqa xmm0, [p_src] ; source rows 0-5 into xmm0-xmm5
+ movdqa xmm1, [p_src + i_srcstride]
+ movdqa xmm2, [p_src + 2 * i_srcstride]
+ movdqa xmm3, [p_src + i_srcstride3]
+ lea p_src, [p_src + 4 * i_srcstride]
+ movdqa xmm4, [p_src]
+ movdqa xmm5, [p_src + i_srcstride]
+ SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 ; output row 0, read from xmm0; the last operand looks like scratch - TODO confirm against the macro
+ movdqa xmm6, [p_src + 2 * i_srcstride] ; the next source row reuses the register just passed as last operand
+ packuswb xmm0, xmm0 ; saturate words to unsigned bytes
+ movdqa xmm7, xmm0
+ psrlq xmm7, 8 ; bytes 1-4 moved to the bottom
+ movd [p_dst + 1], xmm7 ; store bytes 1-4 ...
+ movd [p_dst], xmm0 ; ... and bytes 0-3: two overlapping 4-byte stores cover the 5 output bytes
+ add p_dst, i_dststride
+ SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ; output row 1
+ movdqa xmm7, [p_src + i_srcstride3]
+ lea p_src, [p_src + 4 * i_srcstride]
+ packuswb xmm1, xmm1
+ movdqa xmm0, xmm1
+ psrlq xmm0, 8
+ movd [p_dst + 1], xmm0
+ movd [p_dst], xmm1
+ add p_dst, i_dststride
+ SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0 ; output row 2
+ movdqa xmm0, [p_src]
+ packuswb xmm2, xmm2
+ movdqa xmm1, xmm2
+ psrlq xmm1, 8
+ movd [p_dst + 1], xmm1
+ movd [p_dst], xmm2
+ add p_dst, i_dststride
+ SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1 ; output row 3
+ packuswb xmm3, xmm3
+ movdqa xmm2, xmm3
+ psrlq xmm2, 8
+ movd [p_dst + 1], xmm2
+ movd [p_dst], xmm3
+ add p_dst, i_dststride
+ movdqa xmm1, [p_src + i_srcstride]
+ SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2 ; output row 4
+ packuswb xmm4, xmm4
+ movdqa xmm3, xmm4
+ psrlq xmm3, 8
+ movd [p_dst + 1], xmm3
+ movd [p_dst], xmm4
+ cmp i_height, 5 ; five rows are written; only taller blocks continue with four more
+ jle .done
+ add p_dst, i_dststride
+ movdqa xmm2, [p_src + 2 * i_srcstride]
+ SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3 ; output row 5
+ movdqa xmm3, [p_src + i_srcstride3]
+ lea p_src, [p_src + 4 * i_srcstride]
+ packuswb xmm5, xmm5
+ movdqa xmm4, xmm5
+ psrlq xmm4, 8
+ movd [p_dst + 1], xmm4
+ movd [p_dst], xmm5
+ add p_dst, i_dststride
+ SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4 ; output row 6
+ movdqa xmm4, [p_src]
+ packuswb xmm6, xmm6
+ movdqa xmm5, xmm6
+ psrlq xmm5, 8
+ movd [p_dst + 1], xmm5
+ movd [p_dst], xmm6
+ add p_dst, i_dststride
+ SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 ; output row 7
+ packuswb xmm7, xmm7
+ movdqa xmm6, xmm7
+ psrlq xmm6, 8
+ movd [p_dst + 1], xmm6
+ movd [p_dst], xmm7
+ add p_dst, i_dststride
+ movdqa xmm5, [p_src + i_srcstride]
+ SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 ; output row 8
+ packuswb xmm0, xmm0
+ movdqa xmm7, xmm0
+ psrlq xmm7, 8
+ movd [p_dst + 1], xmm7
+ movd [p_dst], xmm0
+.done:
+ POP_XMM
+ LOAD_5_PARA_POP
+%ifdef X86_32
+ pop r5
+%endif
+ ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_height
+%undef i_srcstride3
+
+;***********************************************************************
+; void McHorVer20Width9Or17U8ToS16_ssse3(const uint8_t *pSrc, -- filtering starts two rows above this pointer
+; int32_t iSrcStride, -- source row pitch in bytes
+; int16_t *pDst, -- 18 bytes (width 9) or 34 bytes (width 17) are written per row
+; int32_t iDstStride, -- output row pitch in bytes (added to the raw pDst address)
+; int32_t iWidth, -- 9 selects the narrow loop; any other value runs the 17-wide loop
+; int32_t iHeight); -- two rows per pass and no tail: an odd value is rounded up to the next even row count
+;***********************************************************************
+; Horizontal pass for odd widths: 8-wide groups go through SSSE3_FilterHorizontalbw_8px, the extra last column of two rows at a time through SSSE3_FilterHorizontalbw_2px (both defined outside this excerpt).
+WELS_EXTERN McHorVer20Width9Or17U8ToS16_ssse3
+%define p_src r0
+%define i_srcstride r1
+%define p_dst r2
+%define i_dststride r3
+%define i_width r4
+%define i_height r5
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+ sub p_src, i_srcstride ; start two rows above pSrc
+ sub p_src, i_srcstride
+ pcmpeqw xmm4, xmm4 ; all ones ...
+ psllw xmm4, 15 ; dw -32768 in every word; handed to the 2px filter macro below
+ movdqa xmm5, [shufb_32435465768798A9] ; xmm5-xmm7: constant tables handed to the 8px filter macro
+ movdqa xmm6, [shufb_011267784556ABBC]
+ movdqa xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+ cmp i_width, 9
+ jne .width17_yloop
+
+.width9_yloop: ; two rows per pass
+ movdqu xmm0, [p_src - 2]
+ movdqa xmm3, xmm0 ; keep the raw bytes of row 0; their upper half feeds the 9th sample
+ SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2 ; row 0, samples 0-7 (result read from xmm0)
+ movdqu xmm2, [p_src + i_srcstride - 2] ; raw bytes of row 1
+ lea p_src, [p_src + 2 * i_srcstride]
+ punpckhqdq xmm3, xmm2 ; low qword = upper 8 source bytes of row 0, high qword = those of row 1
+ SSSE3_FilterHorizontalbw_2px xmm3, xmm4, xmm1 ; last column of both rows in one call (result read from xmm3)
+ movlps [p_dst + 10], xmm3 ; row 0: covers words 5-8; only word 8 survives the full store on the next line
+ MOVDQ [p_dst], xmm0 ; row 0, words 0-7 (MOVDQ is defined outside this excerpt)
+ add p_dst, i_dststride
+ movhps [p_dst + 10], xmm3 ; row 1: same overlap trick with the high qword
+ SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm1, xmm0
+ MOVDQ [p_dst], xmm2 ; row 1, words 0-7
+ add p_dst, i_dststride
+ sub i_height, 2
+ jg .width9_yloop
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+.width17_yloop: ; two rows per pass, two 8-wide groups plus one extra column each
+ movdqu xmm0, [p_src - 2] ; row 0, window for samples 0-7
+ movdqu xmm3, [p_src + 6] ; row 0, window for samples 8-15
+ add p_src, i_srcstride
+ SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
+ MOVDQ [p_dst], xmm0
+ movdqa xmm0, xmm3 ; keep the raw second window of row 0 for the 17th sample
+ SSSE3_FilterHorizontalbw_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2
+ movdqu xmm2, [p_src + 6] ; row 1, window for samples 8-15
+ punpckhqdq xmm0, xmm2 ; upper source bytes of row 0 (low qword) and row 1 (high qword)
+ SSSE3_FilterHorizontalbw_2px xmm0, xmm4, xmm1 ; last column of both rows
+ movdqu xmm1, [p_src - 2] ; row 1, window for samples 0-7
+ add p_src, i_srcstride
+ movlps [p_dst + 26], xmm0 ; row 0: covers words 13-16; only word 16 survives the store on the next line
+ MOVDQ [p_dst + 16], xmm3 ; row 0, words 8-15
+ add p_dst, i_dststride
+ movhps [p_dst + 26], xmm0 ; row 1: word 16, written before the overlapping store below
+ SSSE3_FilterHorizontalbw_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm3
+ MOVDQ [p_dst], xmm1
+ SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm0, xmm3
+ MOVDQ [p_dst + 16], xmm2
+ add p_dst, i_dststride
+ sub i_height, 2
+ jg .width17_yloop
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+
+;***********************************************************************
+; void McHorVer02WidthGe8S16ToU8_ssse3(const int16_t *pSrc, -- read with movdqa in .width_loop: pointer and stride must keep 16-byte alignment
+; int32_t iSrcStride, -- source row pitch in bytes
+; uint8_t *pDst, -- output bytes
+; int32_t iDstStride, -- output row pitch in bytes
+; int32_t iWidth, -- processed in 8-column strips; an odd width gets its last column from the *_unalign path first
+; int32_t iHeight); -- the odd-column path handles 5 (.filter5_unalign) or 8k + 1 rows
+;***********************************************************************
+; Vertical pass over int16 intermediates using SSE2_FilterVerticalw_8px (defined outside this excerpt); results are stored as saturated bytes.
+WELS_EXTERN McHorVer02WidthGe8S16ToU8_ssse3
+%define p_src r0
+%define i_srcstride r1
+%define p_dst r2
+%define i_dststride r3
+%define i_width r4
+%define i_height r5
+%define i_srcstride3 r6
+ %assign push_num 0
+%ifdef X86_32
+ push r6 ; r6 carries i_srcstride3, beyond the six argument registers; saved by hand on 32-bit builds
+ %assign push_num 1
+%endif
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+ sub i_height, 1 ; every height test below works on iHeight - 1
+ push i_height ; keep iHeight - 1 on the stack; reloaded through [r7] for each strip
+ lea i_srcstride3, [3 * i_srcstride]
+ test i_width, 1
+ jz .width_loop ; even width: only whole 8-column strips
+ push p_src ; odd width: filter the last column first, then restore the pointers
+ push p_dst
+ lea p_src, [p_src + 2 * i_width - 2] ; address of the last int16 column
+ add p_dst, i_width ; one past the last output byte; the stores below end at p_dst - 1
+ movd xmm0, [p_src] ; gather this column's samples from successive rows into consecutive words
+ punpcklwd xmm0, [p_src + i_srcstride] ; a memory operand here must be 16-byte aligned - assumes the odd column starts a 16-byte group
+ movd xmm1, [p_src + 2 * i_srcstride]
+ add p_src, i_srcstride3
+ punpcklwd xmm1, [p_src]
+ punpckldq xmm0, xmm1 ; low four words = source rows 0-3
+ movd xmm1, [p_src + i_srcstride] ; source row 4
+ cmp i_height, 4 ; iHeight - 1 == 4, i.e. a 5-row block
+ je .filter5_unalign
+ punpcklwd xmm1, [p_src + 2 * i_srcstride]
+ movd xmm2, [p_src + i_srcstride3]
+ lea p_src, [p_src + 4 * i_srcstride]
+ punpcklwd xmm2, [p_src]
+ punpckldq xmm1, xmm2
+ punpcklqdq xmm0, xmm1 ; xmm0 = source rows 0-7 of the column, one per word
+.height_loop_unalign: ; eight output rows per pass
+ movd xmm1, [p_src + i_srcstride]
+ palignr xmm1, xmm0, 2 ; window moved down one row: oldest word dropped, next row's sample appended on top
+ movd xmm2, [p_src + 2 * i_srcstride]
+ palignr xmm2, xmm1, 2
+ movd xmm3, [p_src + i_srcstride3]
+ palignr xmm3, xmm2, 2
+ lea p_src, [p_src + 4 * i_srcstride]
+ movd xmm4, [p_src]
+ palignr xmm4, xmm3, 2
+ movd xmm5, [p_src + i_srcstride]
+ palignr xmm5, xmm4, 2
+ SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7 ; eight vertically adjacent outputs, read from xmm0; xmm7 looks like scratch - TODO confirm
+ packuswb xmm0, xmm0 ; byte k = output row k of this pass
+ movdqa xmm6, xmm0
+ pslld xmm6, 24 ; bytes 0 and 4 moved to the top of their dwords
+ movd [p_dst - 4], xmm6 ; row 0 lands at p_dst - 1; the 3 bytes to its left are clobbered (presumably rewritten later by .width_loop - TODO confirm)
+ movlps [p_dst + 4 * i_dststride - 8], xmm6 ; row 4 lands at p_dst - 1 four rows down; 7 bytes to its left are clobbered
+ add p_dst, i_dststride
+ movdqa xmm6, xmm0
+ pslld xmm6, 16 ; bytes 1 and 5 on top
+ movd [p_dst - 4], xmm6 ; row 1
+ movlps [p_dst + 4 * i_dststride - 8], xmm6 ; row 5
+ add p_dst, i_dststride
+ movdqa xmm6, xmm0
+ pslld xmm6, 8 ; bytes 2 and 6 on top
+ movd [p_dst - 4], xmm6 ; row 2
+ movd [p_dst + i_dststride - 4], xmm0 ; row 3: byte 3 is already the top byte of the low dword
+ lea p_dst, [p_dst + 4 * i_dststride]
+ movlps [p_dst - 8], xmm6 ; row 6
+ movlps [p_dst + i_dststride - 8], xmm0 ; row 7
+ lea p_dst, [p_dst + 2 * i_dststride]
+ sub i_height, 8
+ jle .height_loop_unalign_exit
+ movd xmm1, [p_src + 2 * i_srcstride] ; rebuild xmm0 as the next eight source rows of the column
+ palignr xmm1, xmm5, 2
+ movd xmm0, [p_src + i_srcstride3]
+ lea p_src, [p_src + 4 * i_srcstride]
+ punpcklwd xmm0, [p_src]
+ palignr xmm0, xmm1, 4
+ jmp .height_loop_unalign
+.height_loop_unalign_exit: ; one final row; only the top word of each register matters here
+ movddup xmm6, [p_src + 2 * i_srcstride - 6] ; 8 bytes ending with the column sample, duplicated so the sample is also in word 7
+ SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ packuswb xmm1, xmm1
+ movlps [p_dst - 8], xmm1 ; byte 7 lands at p_dst - 1
+ jmp .unalign_done
+.filter5_unalign: ; 5-row block: the five outputs end up in words 3-7
+ pslldq xmm0, 8 ; source rows 0-3 moved to the upper half
+ palignr xmm1, xmm0, 2
+ movd xmm2, [p_src + 2 * i_srcstride]
+ palignr xmm2, xmm1, 2
+ movd xmm3, [p_src + i_srcstride3]
+ lea p_src, [p_src + 4 * i_srcstride]
+ palignr xmm3, xmm2, 2
+ movd xmm4, [p_src]
+ palignr xmm4, xmm3, 2
+ movd xmm5, [p_src + i_srcstride]
+ palignr xmm5, xmm4, 2
+ movd xmm6, [p_src + 2 * i_srcstride]
+ palignr xmm6, xmm5, 2
+ SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ packuswb xmm1, xmm1
+ movdqa xmm0, xmm1
+ psrlq xmm1, 8 ; byte 4 moved to position 3 (row 1)
+ movdqa xmm2, xmm0
+ psrlq xmm2, 16 ; byte 5 moved to position 3 (row 2)
+ movdqa xmm3, xmm0
+ psrlq xmm3, 24 ; byte 6 moved to position 3 (row 3)
+ movd [p_dst - 4], xmm0 ; row 0: byte 3 lands at p_dst - 1
+ movd [p_dst + i_dststride - 4], xmm1
+ lea p_dst, [p_dst + 2 * i_dststride]
+ movd [p_dst - 4], xmm2
+ movd [p_dst + i_dststride - 4], xmm3
+ movlps [p_dst + 2 * i_dststride - 8], xmm0 ; row 4: byte 7 lands at p_dst - 1
+.unalign_done:
+ pop p_dst
+ pop p_src
+ mov i_height, [r7] ; reload iHeight - 1 pushed above (r7 addresses the top of the stack here)
+ sub i_width, 1 ; the remaining width is even
+.width_loop: ; one 8-column strip per pass
+ push p_src
+ push p_dst
+ movdqa xmm0, [p_src] ; source rows 0-4 of the strip
+ movdqa xmm1, [p_src + i_srcstride]
+ movdqa xmm2, [p_src + 2 * i_srcstride]
+ movdqa xmm3, [p_src + i_srcstride3]
+ lea p_src, [p_src + 4 * i_srcstride]
+ movdqa xmm4, [p_src]
+.height_loop: ; up to eight output rows per pass; registers rotate as a six-row window
+ movdqa xmm5, [p_src + i_srcstride]
+ SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+ movdqa xmm6, [p_src + 2 * i_srcstride]
+ SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ movdqa xmm7, [p_src + i_srcstride3]
+ lea p_src, [p_src + 4 * i_srcstride]
+ packuswb xmm0, xmm1 ; low 8 bytes = first row, high 8 bytes = second row
+ movlps [p_dst], xmm0
+ movhps [p_dst + i_dststride], xmm0
+ lea p_dst, [p_dst + 2 * i_dststride]
+ SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0
+ movdqa xmm0, [p_src]
+ SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
+ packuswb xmm2, xmm3
+ movlps [p_dst], xmm2
+ movhps [p_dst + i_dststride], xmm2
+ cmp i_height, 4 ; these flags feed both the jl below and the je four lines further down
+ jl .x_loop_dec ; fewer than 5 rows were pending for this pass: strip finished
+ lea p_dst, [p_dst + 2 * i_dststride]
+ movdqa xmm1, [p_src + i_srcstride]
+ SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2
+ je .store_xmm4_exit ; still the cmp result: relies on lea, movdqa and the filter macro leaving the flags untouched
+ movdqa xmm2, [p_src + 2 * i_srcstride]
+ SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3
+ movdqa xmm3, [p_src + i_srcstride3]
+ lea p_src, [p_src + 4 * i_srcstride]
+ packuswb xmm4, xmm5
+ movlps [p_dst], xmm4
+ movhps [p_dst + i_dststride], xmm4
+ lea p_dst, [p_dst + 2 * i_dststride]
+ SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4
+ movdqa xmm4, [p_src]
+ SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ packuswb xmm6, xmm7
+ movlps [p_dst], xmm6
+ movhps [p_dst + i_dststride], xmm6
+ lea p_dst, [p_dst + 2 * i_dststride]
+ sub i_height, 8
+ jg .height_loop ; more rows pending
+ jl .x_loop_dec ; nothing left for this strip
+ movdqa xmm5, [p_src + i_srcstride] ; counter == 0: exactly one row left (iHeight = 8k + 1)
+ SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+ packuswb xmm0, xmm0
+ movlps [p_dst], xmm0
+.x_loop_dec:
+ pop p_dst
+ pop p_src
+ sub i_width, 8
+ jle .done
+ mov i_height, [r7] ; fresh copy of iHeight - 1 for the next strip
+ add p_src, 16 ; next 8 int16 columns
+ add p_dst, 8 ; next 8 output bytes
+ jmp .width_loop
+.store_xmm4_exit: ; fifth row of a pass; NOTE(review): falls into .done, so no further 8-column strip runs after this path - confirm callers never need one
+ packuswb xmm4, xmm4
+ movlps [p_dst], xmm4
+ pop p_dst
+ pop p_src
+.done:
+ pop i_height ; drop the saved iHeight - 1
+ POP_XMM
+ LOAD_6_PARA_POP
+%ifdef X86_32
+ pop r6
+%endif
+ ret
+%undef p_src
+%undef i_srcstride
+%undef p_dst
+%undef i_dststride
+%undef i_width
+%undef i_height
+%undef i_srcstride3
--- a/test/encoder/EncUT_MotionCompensation.cpp
+++ b/test/encoder/EncUT_MotionCompensation.cpp
@@ -168,8 +168,8 @@
DEF_MCCOPYTEST (8, 16)
DEF_MCCOPYTEST (16, 16)
-#define DEF_LUMA_MCTEST(iW,iH) \
-TEST(McHorVer,iW##x##iH) \
+#define DEF_LUMA_MCTEST(iW, iH, cpu_flags, name_suffix) \
+TEST(McHorVer, iW##x##iH##_##name_suffix) \
{ \
for (int32_t a = 0; a < 4; a++) { \
for (int32_t b = 0; b < 4; b++) { \
@@ -191,43 +191,38 @@
uSrcAnchor[0][j][i] = uSrcTest[j][i] = rand()%256; \
}\
}\
- int32_t iCpuCores = 1; \
- uint32_t uiCpuFlag;\
- for(int32_t k =0; k<2; k++)\
- {\
- if(k==0)\
- {\
- uiCpuFlag = 0;\
- }else \
- {\
- uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores); \
- }\
- InitMcFunc(&sMcFunc,uiCpuFlag);\
- memset(uDstAnchor,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \
- memset(uDstTest,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \
- MCHalfPelFilterAnchor(uSrcInputAnchor[1],uSrcInputAnchor[2],uSrcInputAnchor[3],uSrcInputAnchor[0],MC_BUFF_SRC_STRIDE,iW+1,iH+1,pBuf+4); \
- MCLumaAnchor(uDstAnchor[0],MC_BUFF_DST_STRIDE,uSrcInputAnchor,MC_BUFF_SRC_STRIDE,a,b,iW,iH); \
- sMcFunc.pMcLumaFunc(&uSrcTest[4][4],MC_BUFF_SRC_STRIDE,uDstTest[0],MC_BUFF_DST_STRIDE,a,b,iW,iH);\
- for(int32_t j=0;j<MC_BUFF_HEIGHT;j++) \
- { \
- for(int32_t i=0;i<MC_BUFF_DST_STRIDE;i++) \
- { \
- ASSERT_EQ(uDstAnchor[j][i],uDstTest[j][i]); \
- } \
- } \
- }\
+ InitMcFunc(&sMcFunc, WelsCPUFeatureDetect (0) & (cpu_flags)); \
+ memset(uDstAnchor,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \
+ memset(uDstTest,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \
+ MCHalfPelFilterAnchor(uSrcInputAnchor[1],uSrcInputAnchor[2],uSrcInputAnchor[3],uSrcInputAnchor[0],MC_BUFF_SRC_STRIDE,iW+1,iH+1,pBuf+4); \
+ MCLumaAnchor(uDstAnchor[0],MC_BUFF_DST_STRIDE,uSrcInputAnchor,MC_BUFF_SRC_STRIDE,a,b,iW,iH); \
+ sMcFunc.pMcLumaFunc(&uSrcTest[4][4],MC_BUFF_SRC_STRIDE,uDstTest[0],MC_BUFF_DST_STRIDE,a,b,iW,iH);\
+ for(int32_t j=0;j<MC_BUFF_HEIGHT;j++) \
+ { \
+ for(int32_t i=0;i<MC_BUFF_DST_STRIDE;i++) \
+ { \
+ ASSERT_EQ(uDstAnchor[j][i],uDstTest[j][i]); \
+ } \
+ } \
}\
}\
}
+#define DEF_LUMA_MCTESTS(cpu_flags, name_suffix) /* one DEF_LUMA_MCTEST per block size listed below, all sharing cpu_flags and name_suffix */ \
+ DEF_LUMA_MCTEST ( 4, 4, cpu_flags, name_suffix) \
+ DEF_LUMA_MCTEST ( 4, 8, cpu_flags, name_suffix) \
+ DEF_LUMA_MCTEST ( 8, 4, cpu_flags, name_suffix) \
+ DEF_LUMA_MCTEST ( 8, 8, cpu_flags, name_suffix) \
+ DEF_LUMA_MCTEST (16, 8, cpu_flags, name_suffix) \
+ DEF_LUMA_MCTEST ( 8, 16, cpu_flags, name_suffix) \
+ DEF_LUMA_MCTEST (16, 16, cpu_flags, name_suffix)
-DEF_LUMA_MCTEST (4, 4)
-DEF_LUMA_MCTEST (4, 8)
-DEF_LUMA_MCTEST (8, 4)
-DEF_LUMA_MCTEST (8, 8)
-DEF_LUMA_MCTEST (16, 8)
-DEF_LUMA_MCTEST (8, 16)
-DEF_LUMA_MCTEST (16, 16)
+DEF_LUMA_MCTESTS(0, c) // mask 0: InitMcFunc receives no CPU feature bits
+DEF_LUMA_MCTESTS(~0, native) // full mask: every feature WelsCPUFeatureDetect reports
+#ifdef X86_ASM
+DEF_LUMA_MCTESTS(WELS_CPU_SSE2, sse2) // at most the SSE2 bit, and only if the CPU reports it
+DEF_LUMA_MCTESTS(WELS_CPU_SSE2 | WELS_CPU_SSSE3, ssse3) // at most SSE2 + SSSE3; the mask is ANDed with the detected features, so older CPUs run with fewer bits
+#endif
#define DEF_CHROMA_MCTEST(iW,iH) \
TEST(McChroma,iW##x##iH) \
@@ -315,81 +310,86 @@
}
}
-#define DEF_HALFPEL_MCTEST(iW,iH) \
-TEST (EncMcHalfpel, iW##x##iH) { \
+#define DEF_HALFPEL_MCTEST(iW, iH, cpu_flags, name_suffix) /* defines TEST(EncMcHalfpel, <iW>x<iH>_<name_suffix>) comparing the three half-pel functions with MCHalfPelFilterAnchor */ \
+TEST (EncMcHalfpel, iW##x##iH##_##name_suffix) { \
SMcFunc sMcFunc; \
- for (int32_t k = 0; k < 2; k++) { \
- for (int32_t w = 0; w < 2; w++) { \
- int32_t width = iW ; \
- int32_t height = iH; \
- uint8_t uAnchor[4][MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \
- uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \
- uint8_t uRand[MC_BUFF_HEIGHT][MC_BUFF_DST_STRIDE]; \
- ENFORCE_STACK_ALIGN_2D (uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
- uint8_t* uAnchors[4]; \
- int16_t pBuf[MC_BUFF_DST_STRIDE]; \
- uAnchors[0] = &uAnchor[0][4][4]; \
- uAnchors[1] = &uAnchor[1][4][4]; \
- uAnchors[2] = &uAnchor[2][4][4]; \
- uAnchors[3] = &uAnchor[3][4][4]; \
- \
- memset (uAnchor, 0, 4 * sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_SRC_STRIDE); \
- memset (uDstTest, 0, sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_DST_STRIDE); \
- for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \
- for (int32_t i = 0; i < MC_BUFF_SRC_STRIDE; i++) { \
- uAnchor[0][j][i] = uSrcTest[j][i] = rand() % 256; \
- uRand[j][i] = rand() % 256; \
- } \
+ for (int32_t w = 0; w < 2; w++) { /* w is not referenced in the body: the whole check simply runs twice with fresh random data */ \
+ int32_t width = iW ; \
+ int32_t height = iH; \
+ uint8_t uAnchor[4][MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \
+ uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \
+ uint8_t uRand[MC_BUFF_HEIGHT][MC_BUFF_DST_STRIDE]; \
+ ENFORCE_STACK_ALIGN_2D (uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
+ uint8_t* uAnchors[4]; \
+ int16_t pBuf[MC_BUFF_DST_STRIDE]; \
+ uAnchors[0] = &uAnchor[0][4][4]; \
+ uAnchors[1] = &uAnchor[1][4][4]; \
+ uAnchors[2] = &uAnchor[2][4][4]; \
+ uAnchors[3] = &uAnchor[3][4][4]; \
+ \
+ memset (uAnchor, 0, 4 * sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_SRC_STRIDE); \
+ memset (uDstTest, 0, sizeof (uint8_t)*MC_BUFF_HEIGHT * MC_BUFF_DST_STRIDE); \
+ for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \
+ for (int32_t i = 0; i < MC_BUFF_SRC_STRIDE; i++) { \
+ uAnchor[0][j][i] = uSrcTest[j][i] = rand() % 256; \
+ uRand[j][i] = rand() % 256; \
} \
- \
- uint32_t uiCpuFlag = k == 0 ? 0 : WelsCPUFeatureDetect (NULL); \
- InitMcFunc (&sMcFunc, uiCpuFlag); \
- \
- MCHalfPelFilterAnchor (uAnchors[1], uAnchors[2], uAnchors[3], uAnchors[0], MC_BUFF_SRC_STRIDE, width + 1, height + 1, pBuf + 4); \
- memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); \
- sMcFunc.pfLumaHalfpelHor (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height); \
- for (int32_t j = 0; j < height; j++) { \
- for (int32_t i = 0; i < width + 1; i++) { \
- ASSERT_EQ (uAnchor[1][4 + j][4 + i], uDstTest[j][i]); \
- } \
+ } \
+ \
+ InitMcFunc (&sMcFunc, WelsCPUFeatureDetect (0) & (cpu_flags)); /* only feature bits that are both detected and present in cpu_flags */ \
+ \
+ MCHalfPelFilterAnchor (uAnchors[1], uAnchors[2], uAnchors[3], uAnchors[0], MC_BUFF_SRC_STRIDE, width + 1, height + 1, pBuf + 4); \
+ memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); /* pre-fill the destination with random bytes so stray writes show up below */ \
+ sMcFunc.pfLumaHalfpelHor (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height); \
+ for (int32_t j = 0; j < height; j++) { \
+ for (int32_t i = 0; i < width + 1; i++) { \
+ ASSERT_EQ (uAnchor[1][4 + j][4 + i], uDstTest[j][i]); \
} \
- for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \
- for (int32_t i = j < height ? width + 1 : 0; i < MC_BUFF_DST_STRIDE; i++) { \
- ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \
- } \
+ } \
+ for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { /* every byte outside the (width + 1) x height output area must still hold the random fill */ \
+ for (int32_t i = j < height ? width + 1 : 0; i < MC_BUFF_DST_STRIDE; i++) { \
+ ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \
} \
- memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); \
- sMcFunc.pfLumaHalfpelVer (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width, height + 1); \
- for (int32_t j = 0; j < height + 1; j++) { \
- for (int32_t i = 0; i < width; i++) { \
- ASSERT_EQ (uAnchor[2][4 + j][4 + i], uDstTest[j][i]); \
- } \
+ } \
+ memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); \
+ sMcFunc.pfLumaHalfpelVer (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width, height + 1); \
+ for (int32_t j = 0; j < height + 1; j++) { \
+ for (int32_t i = 0; i < width; i++) { \
+ ASSERT_EQ (uAnchor[2][4 + j][4 + i], uDstTest[j][i]); \
} \
- for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \
- for (int32_t i = j < height + 1 ? width : 0; i < MC_BUFF_DST_STRIDE; i++) { \
- ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \
- } \
+ } \
+ for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \
+ for (int32_t i = j < height + 1 ? width : 0; i < MC_BUFF_DST_STRIDE; i++) { \
+ ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \
} \
- memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); \
- sMcFunc.pfLumaHalfpelCen (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height + 1); \
- for (int32_t j = 0; j < height + 1; j++) { \
- for (int32_t i = 0; i < width + 1; i++) { \
- ASSERT_EQ (uAnchor[3][4 + j][4 + i], uDstTest[j][i]); \
- } \
+ } \
+ memcpy (&uDstTest[0][0], &uRand[0][0], sizeof uRand); \
+ sMcFunc.pfLumaHalfpelCen (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height + 1); \
+ for (int32_t j = 0; j < height + 1; j++) { \
+ for (int32_t i = 0; i < width + 1; i++) { \
+ ASSERT_EQ (uAnchor[3][4 + j][4 + i], uDstTest[j][i]); \
} \
- for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \
- for (int32_t i = j < height + 1 ? width + 1 : 0; i < MC_BUFF_DST_STRIDE; i++) { \
- ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \
- } \
+ } \
+ for (int32_t j = 0; j < MC_BUFF_HEIGHT; j++) { \
+ for (int32_t i = j < height + 1 ? width + 1 : 0; i < MC_BUFF_DST_STRIDE; i++) { \
+ ASSERT_EQ (uRand[j][i], uDstTest[j][i]); \
} \
} \
} \
}
-DEF_HALFPEL_MCTEST(4,4)
-DEF_HALFPEL_MCTEST(4,8)
-DEF_HALFPEL_MCTEST(8,4)
-DEF_HALFPEL_MCTEST(8,8)
-DEF_HALFPEL_MCTEST(8,16)
-DEF_HALFPEL_MCTEST(16,8)
-DEF_HALFPEL_MCTEST(16,16)
+#define DEF_HALFPEL_MCTESTS(cpu_flags, name_suffix) /* one DEF_HALFPEL_MCTEST per block size listed below, all sharing cpu_flags and name_suffix */ \
+ DEF_HALFPEL_MCTEST( 4 , 4, cpu_flags, name_suffix) \
+ DEF_HALFPEL_MCTEST( 4, 8, cpu_flags, name_suffix) \
+ DEF_HALFPEL_MCTEST( 8, 4, cpu_flags, name_suffix) \
+ DEF_HALFPEL_MCTEST( 8, 8, cpu_flags, name_suffix) \
+ DEF_HALFPEL_MCTEST( 8, 16, cpu_flags, name_suffix) \
+ DEF_HALFPEL_MCTEST(16, 8, cpu_flags, name_suffix) \
+ DEF_HALFPEL_MCTEST(16, 16, cpu_flags, name_suffix)
+
+DEF_HALFPEL_MCTESTS(0, c) // mask 0: InitMcFunc receives no CPU feature bits
+DEF_HALFPEL_MCTESTS(~0, native) // full mask: every feature WelsCPUFeatureDetect reports
+#ifdef X86_ASM
+DEF_HALFPEL_MCTESTS(WELS_CPU_SSE2, sse2) // at most the SSE2 bit, and only if the CPU reports it
+DEF_HALFPEL_MCTESTS(WELS_CPU_SSE2 | WELS_CPU_SSSE3, ssse3) // at most SSE2 + SSSE3; the mask is ANDed with the detected features, so older CPUs run with fewer bits
+#endif