ref: 9909c306f1172661c2b09e4bee2428fd953e868a
parent: 040974f7355a2829ada1fe7451bead94eaa6aec1
author: Sindre Aamås <saamas@cisco.com>
date: Thu Feb 25 10:57:20 EST 2016
[Common/x86] DeblockChromaLt4H_ssse3 optimizations Use packed 8-bit operations rather than unpack to 16-bit. ~5.72x speedup on Haswell (x86-64). ~1.85x speedup on Haswell (x86 32-bit).
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -113,6 +113,7 @@
%define r1w dx
%define r2w r8w
%define r3w r9w
+%define r4w ax
%define r6w r11w
%define r0b cl
@@ -182,6 +183,7 @@
%define r1w si
%define r2w dx
%define r3w cx
+%define r4w r8w
%define r6w r10w
%define r0b dil
@@ -249,6 +251,7 @@
%define r1w cx
%define r2w dx
%define r3w bx
+%define r4w si
%define r6w bp
%define r0b al
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -130,7 +130,45 @@
por %1, %3
%endmacro
+; Compute
+; p0 = clip(p0 + clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
+; q0 = clip(q0 - clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255)
+; for 16 pixels in parallel, using packed unsigned bytes in xmm registers.
+; The results replace p0 (%2) and q0 (%3). p1 (%1), q1 (%4) and FFh (%6) are overwritten; iTc (%5) is only read.
+; p1=%1 p0=%2 q0=%3 q1=%4 iTc=%5 FFh=%6 xmmclobber=%7,%8
+%macro SSE2_DeblockP0Q0_Lt4 8
+ ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
+ ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
+ ; Bias so that unsigned saturation can be used.
+ ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
+ ; q0 - p0 is split into a non-negative and non-positive part. The latter is
+ ; subtracted from the biased value.
+ movdqa %7, %2 ; copy of p0
+ psubusb %7, %3 ; clip(p0 - q0, 0, 255)
+ ; ((p1 - q1) >> 2) + 0xc0
+ pxor %4, %6 ; q1 ^ 0xff, i.e. (-q1 - 1) & 0xff
+ pavgb %1, %4 ; (((p1 - q1 + 0x100) >> 1)
+ pavgb %1, %6 ; + 0x100) >> 1
+ psubusb %1, %7 ; -= clip(p0 - q0, 0, 255) saturate.
+ movdqa %8, %3 ; copy of q0
+ psubusb %8, %2 ; (clip(q0 - p0, 0, 255)
+ pavgb %8, %1 ; + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1
+ ; Unbias and split into a non-negative and a non-positive part.
+ ; Clip each part to iTc via minub.
+ ; Add/subtract each part to/from p0/q0 and clip.
+ movdqa %6, [WELS_DB96_16] ; bias constant, defined elsewhere (presumably 96 in every byte); FFh is no longer needed
+ psubusb %6, %8 ; max(bias - biased value, 0): magnitude of the non-positive part
+ psubusb %8, [WELS_DB96_16] ; max(biased value - bias, 0): the non-negative part
+ pminub %6, %5 ; clip magnitude of the non-positive part to iTc
+ pminub %8, %5 ; clip the non-negative part to iTc
+ psubusb %2, %6 ; p0 minus the non-positive magnitude, saturating at 0
+ paddusb %2, %8 ; p0 plus the non-negative part, saturating at 255: final p0
+ paddusb %3, %6 ; q0 plus the non-positive magnitude, saturating at 255
+ psubusb %3, %8 ; q0 minus the non-negative part, saturating at 0: final q0
+%endmacro
+
+
;*******************************************************************************
; void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
; int32_t iBeta, int8_t * pTC)
@@ -208,38 +246,11 @@
SSE2_ClipUB xmm1, xmm6, xmm5, xmm7 ; clip q1.
MOVDQ [r0 + 1 * r2], xmm1 ; store q1.
- ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
- ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
- ; Bias so that unsigned saturation can be used.
- ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
- ; q0 - p0 is split into a non-negative and non-positive part. The latter is
- ; subtracted from the biased value.
- MOVDQ xmm1, [r3 + 0 * r1] ; p0
- MOVDQ xmm0, [r0 + 0 * r2] ; q0
- movdqa xmm7, xmm1
- psubusb xmm7, xmm0 ; clip(p0 - q0, 0, 255)
- ; ((p1 - q1) >> 2) + 0xc0
- pxor xmm6, xmm2 ; q1 ^ 0xff aka -q1 - 1 & 0xff
- pavgb xmm4, xmm6 ; (((p1 - q1 + 0x100) >> 1)
- pavgb xmm4, xmm2 ; + 0x100) >> 1
- psubusb xmm4, xmm7 ; -= clip(p0 - q0, 0, 255) saturate.
- psubusb xmm0, xmm1 ; (clip(q0 - p0, 0, 255)
- pavgb xmm0, xmm4 ; + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1
-
- ; Unbias and split into a non-negative and a non-positive part.
- ; Clip each part to iTc via minub.
- ; Add/subtract each part to/from p0/q0 and clip.
- movdqa xmm6, [WELS_DB96_16]
- psubusb xmm6, xmm0
- psubusb xmm0, [WELS_DB96_16]
- pminub xmm6, xmm3
- pminub xmm0, xmm3
- psubusb xmm1, xmm6
- paddusb xmm1, xmm0
- paddusb xmm6, [r0 + 0 * r2]
- psubusb xmm6, xmm0
+ MOVDQ xmm1, [r3 + 0 * r1] ; p0
+ MOVDQ xmm0, [r0 + 0 * r2] ; q0
+ SSE2_DeblockP0Q0_Lt4 xmm4, xmm1, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7
MOVDQ [r3 + 0 * r1], xmm1 ; store p0.
- MOVDQ [r0 + 0 * r2], xmm6 ; store q0.
+ MOVDQ [r0 + 0 * r2], xmm0 ; store q0.
POP_XMM
LOAD_5_PARA_POP
@@ -375,6 +386,130 @@
ret
+; Load [p1,p0,q0,q1] (4 bytes at offset -2) from 8 lines each of Cb and Cr and transpose them into one xmm register per pixel position; %5 and %6 are left advanced by iStride. [out:p1,p0,q0,q1]=%1,%2,%3,%4 pPixCb=%5 pPixCr=%6 iStride=%7 3*iStride-1=%8 xmmclobber=%9,%10,%11
+%macro SSE2_LoadCbCr_4x16H 11
+ movd %1, [%5 + 0 * %7 - 2] ; [p1,p0,q0,q1] cb line 0
+ movd %2, [%5 + 2 * %7 - 2] ; [p1,p0,q0,q1] cb line 2
+ punpcklbw %1, %2 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 0,2
+ movd %2, [%5 + 4 * %7 - 2] ; [p1,p0,q0,q1] cb line 4
+ movd %9, [%5 + 2 * %8] ; [p1,p0,q0,q1] cb line 6 (2 * (3*iStride-1) = 6*iStride - 2)
+ punpcklbw %2, %9 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 4,6
+ punpcklwd %1, %2 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 0,2,4,6
+ movd %2, [%6 + 0 * %7 - 2] ; [p1,p0,q0,q1] cr line 0
+ movd %9, [%6 + 2 * %7 - 2] ; [p1,p0,q0,q1] cr line 2
+ punpcklbw %2, %9 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 0,2
+ movd %9, [%6 + 4 * %7 - 2] ; [p1,p0,q0,q1] cr line 4
+ movd %10, [%6 + 2 * %8] ; [p1,p0,q0,q1] cr line 6
+ punpcklbw %9, %10 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 4,6
+ punpcklwd %2, %9 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 0,2,4,6
+ add %5, %7 ; pPixCb += iStride (not undone by this macro)
+ add %6, %7 ; pPixCr += iStride (not undone by this macro)
+ movd %9, [%5 + 0 * %7 - 2] ; [p1,p0,q0,q1] cb line 1
+ movd %10, [%5 + 2 * %7 - 2] ; [p1,p0,q0,q1] cb line 3
+ punpcklbw %9, %10 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 1,3
+ movd %10, [%5 + 4 * %7 - 2] ; [p1,p0,q0,q1] cb line 5
+ movd %3, [%5 + 2 * %8] ; [p1,p0,q0,q1] cb line 7
+ punpcklbw %10, %3 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 5,7
+ punpcklwd %9, %10 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 1,3,5,7
+ movd %10, [%6 + 0 * %7 - 2] ; [p1,p0,q0,q1] cr line 1
+ movd %3, [%6 + 2 * %7 - 2] ; [p1,p0,q0,q1] cr line 3
+ punpcklbw %10, %3 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 1,3
+ movd %3, [%6 + 4 * %7 - 2] ; [p1,p0,q0,q1] cr line 5
+ movd %4, [%6 + 2 * %8] ; [p1,p0,q0,q1] cr line 7
+ punpcklbw %3, %4 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 5,7
+ punpcklwd %10, %3 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 1,3,5,7
+ movdqa %3, %1 ; keep a copy: the high half (q0,q1) is still needed
+ punpckldq %1, %2 ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 0,2,4,6
+ punpckhdq %3, %2 ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 0,2,4,6
+ movdqa %11, %9 ; keep a copy: the high half (q0,q1) is still needed
+ punpckldq %9, %10 ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 1,3,5,7
+ punpckhdq %11, %10 ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 1,3,5,7
+ movdqa %2, %1 ; keep a copy: the high half (p0) is still needed
+ punpcklqdq %1, %9 ; [p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1] cb/cr line 0,2,4,6,1,3,5,7
+ punpckhqdq %2, %9 ; [p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 0,2,4,6,1,3,5,7
+ movdqa %4, %3 ; keep a copy: the high half (q1) is still needed
+ punpcklqdq %3, %11 ; [q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0] cb/cr line 0,2,4,6,1,3,5,7
+ punpckhqdq %4, %11 ; [q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 0,2,4,6,1,3,5,7
+%endmacro
+
+; Store the [p0,q0] byte pairs (2 bytes at offset -1) to 8 lines each of Cb and Cr via a 32-byte, 16-byte-aligned stack buffer; p0 (%5) is overwritten, %1 and %2 are left decremented by iStride, and r7 is restored on exit. pPixCb+iStride=%1 pPixCr+iStride=%2 iStride=%3 3*iStride-1=%4 p0=%5 q0=%6 rclobber=%7 (holds the saved r7) dwclobber={%8,%9} (dword and word names of one scratch register, e.g. r4d, r4w) xmmclobber=%10
+%macro SSE2_StoreCbCr_4x16H 10
+ movdqa %10, %5 ; copy p0 so that both interleaves can be formed
+ punpcklbw %10, %6 ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 0,2,4,6
+ punpckhbw %5, %6 ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 1,3,5,7
+ mov %7, r7 ; preserve stack pointer
+ and r7, -16 ; align stack pointer
+ sub r7, 32 ; allocate stack space
+ movdqa [r7 ], %10 ; store [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 0,2,4,6 on the stack
+ movdqa [r7 + 16], %5 ; store [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 1,3,5,7 on the stack
+ mov %8, [r7 + 16] ; [p0,q0,p0,q0] cb line 1,3
+ mov [%1 + 0 * %3 - 1], %9 ; store [p0,q0] cb line 1
+ shr %8, 16 ; [p0,q0] cb line 3
+ mov [%1 + 2 * %3 - 1], %9 ; store [p0,q0] cb line 3
+ mov %8, [r7 + 20] ; [p0,q0,p0,q0] cb line 5,7
+ mov [%1 + 4 * %3 - 1], %9 ; store [p0,q0] cb line 5
+ shr %8, 16 ; [p0,q0] cb line 7
+ mov [%1 + 2 * %4 + 1], %9 ; store [p0,q0] cb line 7 (2 * (3*iStride-1) + 1 = 6*iStride - 1)
+ mov %8, [r7 + 24] ; [p0,q0,p0,q0] cr line 1,3
+ mov [%2 + 0 * %3 - 1], %9 ; store [p0,q0] cr line 1
+ shr %8, 16 ; [p0,q0] cr line 3
+ mov [%2 + 2 * %3 - 1], %9 ; store [p0,q0] cr line 3
+ mov %8, [r7 + 28] ; [p0,q0,p0,q0] cr line 5,7
+ mov [%2 + 4 * %3 - 1], %9 ; store [p0,q0] cr line 5
+ shr %8, 16 ; [p0,q0] cr line 7
+ mov [%2 + 2 * %4 + 1], %9 ; store [p0,q0] cr line 7
+ sub %1, %3 ; pPixCb -= iStride (now addresses the even lines)
+ sub %2, %3 ; pPixCr -= iStride (now addresses the even lines)
+ mov %8, [r7 ] ; [p0,q0,p0,q0] cb line 0,2
+ mov [%1 + 0 * %3 - 1], %9 ; store [p0,q0] cb line 0
+ shr %8, 16 ; [p0,q0] cb line 2
+ mov [%1 + 2 * %3 - 1], %9 ; store [p0,q0] cb line 2
+ mov %8, [r7 + 4] ; [p0,q0,p0,q0] cb line 4,6
+ mov [%1 + 4 * %3 - 1], %9 ; store [p0,q0] cb line 4
+ shr %8, 16 ; [p0,q0] cb line 6
+ mov [%1 + 2 * %4 + 1], %9 ; store [p0,q0] cb line 6
+ mov %8, [r7 + 8] ; [p0,q0,p0,q0] cr line 0,2
+ mov [%2 + 0 * %3 - 1], %9 ; store [p0,q0] cr line 0
+ shr %8, 16 ; [p0,q0] cr line 2
+ mov [%2 + 2 * %3 - 1], %9 ; store [p0,q0] cr line 2
+ mov %8, [r7 + 12] ; [p0,q0,p0,q0] cr line 4,6
+ mov [%2 + 4 * %3 - 1], %9 ; store [p0,q0] cr line 4
+ shr %8, 16 ; [p0,q0] cr line 6
+ mov [%2 + 2 * %4 + 1], %9 ; store [p0,q0] cr line 6
+ mov r7, %7 ; restore stack pointer
+%endmacro
+
+; Build the per-byte filter mask from iAlpha, iBeta and iTc, then filter p0 (%2) and q0 (%3) in place with SSE2_DeblockP0Q0_Lt4; %1, %4 and %5 are overwritten. p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 (xmm; reused for iBeta and then iTc) iBeta=%6 (dword operand; byte 0 is broadcast) pTC=%7 (address of 4 iTc bytes) xmmclobber=%8,%9,%10 interleaveTC=%11 (nonzero: bytes become t0,t0,t1,t1,t2,t2,t3,t3 in each 8-byte half; zero: t0,t1,t2,t3 repeated four times)
+%macro SSSE3_DeblockChromaLt4 11
+ movdqa %8, %3 ; copy of q0
+ SSE2_AbsDiffUB %8, %2, %9 ; |p0 - q0|
+ SSE2_CmpgeUB %8, %5 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
+ movdqa %9, %4 ; copy of q1
+ SSE2_AbsDiffUB %9, %3, %5 ; |q1 - q0|
+ movdqa %10, %1 ; copy of p1
+ SSE2_AbsDiffUB %10, %2, %5 ; |p1 - p0|
+ pmaxub %9, %10 ; max(|q1 - q0|, |p1 - p0|)
+ pxor %10, %10 ; all-zero shuffle mask: pshufb then broadcasts byte 0
+ movd %5, %6 ; iBeta in the low dword
+ pshufb %5, %10 ; iBeta
+ SSE2_CmpgeUB %9, %5 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
+ por %8, %9 ; | !bDeltaP0Q0
+ movd %5, [%7] ; the 4 iTc bytes read from pTC
+%if %11
+ punpckldq %5, %5 ; duplicate the 4 iTc bytes into the next dword
+ punpcklbw %5, %5 ; iTc
+%else
+ pshufd %5, %5, 0 ; iTc
+%endif
+ pcmpeqw %10, %10 ; FFh
+ movdqa %9, %5 ; copy of iTc for the sign test
+ pcmpgtb %9, %10 ; iTc > -1 ? FFh : 00h
+ pandn %8, %5 ; iTc & bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
+ pand %8, %9 ; &= (iTc > -1 ? FFh : 00h)
+ SSE2_DeblockP0Q0_Lt4 %1, %2, %3, %4, %8, %10, %5, %9 ; masked iTc in %8, FFh in %10; %5 and %9 serve as scratch
+%endmacro
+
+
;******************************************************************************
; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
@@ -395,73 +530,18 @@
movhps xmm0, [r1 + 0 * r2] ; q0 cr
movq xmm2, [r0 + 1 * r3] ; p0 cb
movhps xmm2, [r1 + 1 * r3] ; p0 cr
-
- movdqa xmm4, xmm0
- SSE2_AbsDiffUB xmm4, xmm2, xmm5 ; |p0 - q0|
- SSE2_CmpgeUB xmm4, xmm7 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
-
movq xmm1, [r0 + 1 * r2] ; q1 cb
movhps xmm1, [r1 + 1 * r2] ; q1 cr
movq xmm3, [r0 + 2 * r3] ; p1 cb
movhps xmm3, [r1 + 2 * r3] ; p1 cr
- movdqa xmm5, xmm1
- SSE2_AbsDiffUB xmm5, xmm0, xmm7 ; |q1 - q0|
- movdqa xmm6, xmm3
- SSE2_AbsDiffUB xmm6, xmm2, xmm7 ; |p1 - p0|
- pmaxub xmm5, xmm6 ; max(|q1 - q0|, |p1 - p0|)
-
- pxor xmm6, xmm6
- movd xmm7, arg5d
- pshufb xmm7, xmm6 ; iBeta
-
- SSE2_CmpgeUB xmm5, xmm7 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
- por xmm4, xmm5 ; | !bDeltaP0Q0
-
%ifidni arg6, r5
- movd xmm7, [arg6]
+ SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, arg6, xmm4, xmm5, xmm6, 1
%else
mov r2, arg6
- movd xmm7, [r2]
+ SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, r2, xmm4, xmm5, xmm6, 1
%endif
- punpckldq xmm7, xmm7
- punpcklbw xmm7, xmm7 ; iTc
- pcmpeqw xmm6, xmm6 ; FFh
- movdqa xmm5, xmm7
- pcmpgtb xmm5, xmm6 ; iTc > -1 ? FFh : FFh
- pandn xmm4, xmm7 ; iTc & bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
- pand xmm4, xmm5 ; &= (iTc > -1 ? FFh : 00h)
- ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
- ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
- ; Bias so that unsigned saturation can be used.
- ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
- ; q0 - p0 is split into a non-negative and non-positive part. The latter is
- ; subtracted from the biased value.
- movdqa xmm7, xmm2
- psubusb xmm7, xmm0 ; clip(p0 - q0, 0, 255)
- ; ((p1 - q1) >> 2) + 0xc0
- pxor xmm1, xmm6 ; q1 ^ 0xff aka -q1 - 1 & 0xff
- pavgb xmm3, xmm1 ; (((p1 - q1 + 0x100) >> 1)
- pavgb xmm3, xmm6 ; + 0x100) >> 1
- psubusb xmm3, xmm7 ; -= clip(p0 - q0, 0, 255) saturate.
- movdqa xmm5, xmm0
- psubusb xmm5, xmm2 ; (clip(q0 - p0, 0, 255)
- pavgb xmm5, xmm3 ; + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1
-
- ; Unbias and split into a non-negative and a non-positive part.
- ; Clip each part to iTc via minub.
- ; Add/subtract each part to/from p0/q0 and clip.
- movdqa xmm6, [WELS_DB96_16]
- psubusb xmm6, xmm5
- psubusb xmm5, [WELS_DB96_16]
- pminub xmm6, xmm4
- pminub xmm5, xmm4
- psubusb xmm2, xmm6
- paddusb xmm2, xmm5
- paddusb xmm0, xmm6
- psubusb xmm0, xmm5
-
movlps [r0 + 1 * r3], xmm2 ; store p0 cb
movhps [r1 + 1 * r3], xmm2 ; store p0 cr
movlps [r0 ], xmm0 ; store q0 cb
@@ -536,6 +616,30 @@
ret
+;*******************************************************************************
+; void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+
+WELS_EXTERN DeblockChromaLt4H_ssse3 ; loads 8 lines of Cb and Cr around the edge, filters p0/q0, stores them back
+ %assign push_num 0
+ LOAD_6_PARA ; macro defined elsewhere; presumably makes the six arguments available in r0-r5
+ PUSH_XMM 8 ; macro defined elsewhere; presumably preserves xmm registers where the ABI requires it
+ SIGN_EXTENSION r2, r2d ; macro defined elsewhere; presumably widens the int32_t iStride for use in addressing
+ movd xmm7, arg4d ; iAlpha in the low dword
+ pxor xmm0, xmm0 ; all-zero shuffle mask: pshufb then broadcasts byte 0
+ pshufb xmm7, xmm0 ; iAlpha
+ lea r3, [3 * r2 - 1] ; 3 * iStride - 1
+
+ SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6 ; p1=xmm0 p0=xmm1 q0=xmm4 q1=xmm5; r0 and r1 are advanced by iStride
+ SSSE3_DeblockChromaLt4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, r5, xmm2, xmm3, xmm6, 0 ; filtered p0 in xmm1, q0 in xmm4; pTC is read through r5
+ SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0 ; r5 (pTC no longer needed) holds the saved stack pointer; r4 is scratch
+
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+
%ifdef WIN64
@@ -802,289 +906,6 @@
-WELS_EXTERN DeblockChromaLt4H_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push rsi
- push rdi
- push r12
- PUSH_XMM 16
- sub rsp,170h
-
- movsxd rsi,r8d
- lea eax,[r8*4]
- mov r11d,r9d
- movsxd r10,eax
- mov eax, [rcx-2]
- mov r12,rdx
- mov [rsp+40h],eax
- mov eax, [rsi+rcx-2]
- lea rbx,[r10+rcx-2]
- movdqa xmm5,[rsp+40h]
- mov [rsp+50h],eax
- mov eax, [rcx+rsi*2-2]
- lea rbp,[r10+rdx-2]
- movdqa xmm2, [rsp+50h]
- mov [rsp+60h],eax
- lea r10,[rsi+rsi*2]
- mov rdi,rcx
- mov eax,[r10+rcx-2]
- movdqa xmm4,[rsp+60h]
- mov [rsp+70h],eax
- mov eax,[rdx-2]
- mov [rsp+80h],eax
- mov eax, [rsi+rdx-2]
- movdqa xmm3,[rsp+70h]
- mov [rsp+90h],eax
- mov eax,[rdx+rsi*2-2]
- punpckldq xmm5,[rsp+80h]
- mov [rsp+0A0h],eax
- mov eax, [r10+rdx-2]
- punpckldq xmm2,[rsp+90h]
- mov [rsp+0B0h],eax
- mov eax, [rbx]
- punpckldq xmm4,[rsp+0A0h]
- mov [rsp+80h],eax
- mov eax,[rbp]
- punpckldq xmm3,[rsp+0B0h]
- mov [rsp+90h],eax
- mov eax,[rsi+rbx]
- movdqa xmm0,[rsp+80h]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rsi+rbp]
- movdqa xmm0,[rsp+80h]
- movdqa xmm1,xmm5
- mov [rsp+90h],eax
- mov eax,[rbx+rsi*2]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rbp+rsi*2]
- movdqa xmm0, [rsp+80h]
- mov [rsp+90h],eax
- mov eax,[r10+rbx]
- movdqa xmm7,xmm1
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax, [r10+rbp]
- movdqa xmm0,[rsp+80h]
- mov [rsp+90h],eax
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm7,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm6,xmm7
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm6,xmm0
- punpckhdq xmm7,xmm0
- movdqa xmm0,xmm1
- punpckldq xmm0,xmm5
- mov rax, [rsp+1C8h+160] ; pTC
- punpckhdq xmm1,xmm5
- movdqa xmm9,xmm6
- punpckhqdq xmm6,xmm0
- punpcklqdq xmm9,xmm0
- movdqa xmm2,xmm7
- movdqa xmm13,xmm6
- movdqa xmm4,xmm9
- movdqa [rsp+10h],xmm9
- punpcklqdq xmm2,xmm1
- punpckhqdq xmm7,xmm1
- pxor xmm1,xmm1
- movsx ecx,byte [rax+3]
- movsx edx,byte [rax+2]
- movsx r8d,byte [rax+1]
- movsx r9d,byte [rax]
- movdqa xmm10,xmm1
- movdqa xmm15,xmm2
- punpckhbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm4,xmm1
- movsx eax,r11w
- mov word [rsp+0Eh],cx
- mov word [rsp+0Ch],cx
- movdqa xmm3,xmm7
- movdqa xmm8,xmm7
- movdqa [rsp+20h],xmm7
- punpcklbw xmm15,xmm1
- punpcklbw xmm13,xmm1
- punpcklbw xmm3,xmm1
- mov word [rsp+0Ah],dx
- mov word [rsp+8],dx
- mov word [rsp+6],r8w
- movd xmm0,eax
- movdqa [rsp+30h],xmm6
- punpckhbw xmm9,xmm1
- punpckhbw xmm8,xmm1
- punpcklwd xmm0,xmm0
- movsx eax,word [rsp+1C0h+160] ; iBeta
- mov word [rsp+4],r8w
- mov word [rsp+2],r9w
- pshufd xmm12,xmm0,0
- mov word [rsp],r9w
- movd xmm0,eax
- mov eax,4
- cwde
- movdqa xmm14, [rsp]
- movdqa [rsp],xmm2
- movdqa xmm2,xmm12
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- psubw xmm10,xmm14
- movd xmm0,eax
- movdqa xmm7,xmm14
- movdqa xmm6,xmm14
- pcmpgtw xmm7,xmm1
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- movdqa xmm0,xmm4
- movdqa xmm1,xmm15
- psubw xmm4,xmm13
- psubw xmm0,xmm3
- psubw xmm1,xmm13
- psubw xmm3,xmm15
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm10
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm11
- movdqa xmm0,xmm13
- psubw xmm0,xmm15
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- pand xmm2,xmm1
- movdqa xmm1,xmm11
- movdqa xmm3,[rsp+30h]
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm9
- pand xmm2,xmm1
- psubw xmm0,xmm8
- psubw xmm9,xmm3
- pand xmm2,xmm7
- pand xmm6,xmm2
- psubw xmm15,xmm6
- paddw xmm13,xmm6
- movdqa xmm2,[rsp]
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- psubw xmm8,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm3
- movdqa xmm5,[rsp+10h]
- psubw xmm0,xmm2
- psraw xmm1,3
- movdqa xmm4,xmm5
- pabsw xmm0,xmm0
- pmaxsw xmm10,xmm1
- movdqa xmm1,xmm11
- pcmpgtw xmm12,xmm0
- pabsw xmm0,xmm9
- pminsw xmm14,xmm10
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm8
- pcmpgtw xmm11,xmm0
- pand xmm12,xmm1
- movdqa xmm1,[rsp+20h]
- pand xmm12,xmm11
- pand xmm12,xmm7
- pand xmm14,xmm12
- paddw xmm3,xmm14
- psubw xmm2,xmm14
- packuswb xmm13,xmm3
- packuswb xmm15,xmm2
- punpcklbw xmm4,xmm13
- punpckhbw xmm5,xmm13
- movdqa xmm0,xmm15
- punpcklbw xmm0,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm4
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm5
- movdqa xmm2,xmm3
- movdqa xmm1,xmm4
- punpcklwd xmm0,xmm15
- punpckhwd xmm5,xmm15
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm5
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm5
- punpckhqdq xmm2,xmm1
- movdqa [rsp+40h],xmm0
- movdqa xmm0,xmm3
- movdqa [rsp+90h],xmm2
- mov eax,[rsp+40h]
- mov [rdi-2],eax
- mov eax, [rsp+90h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [rsi+rdi-2],eax
- movdqa [rsp+50h],xmm0
- mov eax,[rsp+50h]
- movdqa [rsp+0A0h],xmm3
- mov [rdi+rsi*2-2],eax
- mov eax,[rsp+0A0h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+48h]
- mov [rbx],eax
- mov eax,[rsp+98h]
- mov [rsi+rbx],eax
- mov eax,[rsp+58h]
- mov [rbx+rsi*2],eax
- mov eax, [rsp+0A8h]
- mov [r10+rbx],eax
- mov eax, [rsp+44h]
- mov [r12-2],eax
- mov eax,[rsp+94h]
- mov [rsi+r12-2],eax
- mov eax,[rsp+54h]
- mov [r12+rsi*2-2],eax
- mov eax, [rsp+0A4h]
- mov [r10+r12-2],eax
- mov eax,[rsp+4Ch]
- mov [rbp],eax
- mov eax,[rsp+9Ch]
- mov [rsi+rbp],eax
- mov eax, [rsp+5Ch]
- mov [rbp+rsi*2],eax
- mov eax,[rsp+0ACh]
- mov [r10+rbp],eax
- lea r11,[rsp+170h]
- mov rsp,r11
- POP_XMM
- pop r12
- pop rdi
- pop rsi
- pop rbp
- pop rbx
- ret
-
-
-
%elifdef UNIX64
@@ -1360,294 +1181,7 @@
ret
-WELS_EXTERN DeblockChromaLt4H_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push r12
- push r13
- push r14
- sub rsp,170h
- mov r13, r8
- mov r14, r9
- mov r8, rdx
- mov r9, rcx
- mov rdx, rdi
- mov rcx, rsi
-
- movsxd rsi,r8d
- lea eax,[r8*4]
- mov r11d,r9d
- movsxd r10,eax
- mov eax, [rcx-2]
- mov r12,rdx
- mov [rsp+40h],eax
- mov eax, [rsi+rcx-2]
- lea rbx,[r10+rcx-2]
- movdqa xmm5,[rsp+40h]
- mov [rsp+50h],eax
- mov eax, [rcx+rsi*2-2]
- lea rbp,[r10+rdx-2]
- movdqa xmm2, [rsp+50h]
- mov [rsp+60h],eax
- lea r10,[rsi+rsi*2]
- mov rdi,rcx
- mov eax,[r10+rcx-2]
- movdqa xmm4,[rsp+60h]
- mov [rsp+70h],eax
- mov eax,[rdx-2]
- mov [rsp+80h],eax
- mov eax, [rsi+rdx-2]
- movdqa xmm3,[rsp+70h]
- mov [rsp+90h],eax
- mov eax,[rdx+rsi*2-2]
- punpckldq xmm5,[rsp+80h]
- mov [rsp+0A0h],eax
- mov eax, [r10+rdx-2]
- punpckldq xmm2,[rsp+90h]
- mov [rsp+0B0h],eax
- mov eax, [rbx]
- punpckldq xmm4,[rsp+0A0h]
- mov [rsp+80h],eax
- mov eax,[rbp]
- punpckldq xmm3,[rsp+0B0h]
- mov [rsp+90h],eax
- mov eax,[rsi+rbx]
- movdqa xmm0,[rsp+80h]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rsi+rbp]
- movdqa xmm0,[rsp+80h]
- movdqa xmm1,xmm5
- mov [rsp+90h],eax
- mov eax,[rbx+rsi*2]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rbp+rsi*2]
- movdqa xmm0, [rsp+80h]
- mov [rsp+90h],eax
- mov eax,[r10+rbx]
- movdqa xmm7,xmm1
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax, [r10+rbp]
- movdqa xmm0,[rsp+80h]
- mov [rsp+90h],eax
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm7,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm6,xmm7
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm6,xmm0
- punpckhdq xmm7,xmm0
- movdqa xmm0,xmm1
- punpckldq xmm0,xmm5
- mov rax, r14 ; pTC
- punpckhdq xmm1,xmm5
- movdqa xmm9,xmm6
- punpckhqdq xmm6,xmm0
- punpcklqdq xmm9,xmm0
- movdqa xmm2,xmm7
- movdqa xmm13,xmm6
- movdqa xmm4,xmm9
- movdqa [rsp+10h],xmm9
- punpcklqdq xmm2,xmm1
- punpckhqdq xmm7,xmm1
- pxor xmm1,xmm1
- movsx ecx,byte [rax+3]
- movsx edx,byte [rax+2]
- movsx r8d,byte [rax+1]
- movsx r9d,byte [rax]
- movdqa xmm10,xmm1
- movdqa xmm15,xmm2
- punpckhbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm4,xmm1
- movsx eax,r11w
- mov word [rsp+0Eh],cx
- mov word [rsp+0Ch],cx
- movdqa xmm3,xmm7
- movdqa xmm8,xmm7
- movdqa [rsp+20h],xmm7
- punpcklbw xmm15,xmm1
- punpcklbw xmm13,xmm1
- punpcklbw xmm3,xmm1
- mov word [rsp+0Ah],dx
- mov word [rsp+8],dx
- mov word [rsp+6],r8w
- movd xmm0,eax
- movdqa [rsp+30h],xmm6
- punpckhbw xmm9,xmm1
- punpckhbw xmm8,xmm1
- punpcklwd xmm0,xmm0
- mov eax, r13d ; iBeta
- mov word [rsp+4],r8w
- mov word [rsp+2],r9w
- pshufd xmm12,xmm0,0
- mov word [rsp],r9w
- movd xmm0,eax
- mov eax,4
- cwde
- movdqa xmm14, [rsp]
- movdqa [rsp],xmm2
- movdqa xmm2,xmm12
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- psubw xmm10,xmm14
- movd xmm0,eax
- movdqa xmm7,xmm14
- movdqa xmm6,xmm14
- pcmpgtw xmm7,xmm1
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- movdqa xmm0,xmm4
- movdqa xmm1,xmm15
- psubw xmm4,xmm13
- psubw xmm0,xmm3
- psubw xmm1,xmm13
- psubw xmm3,xmm15
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm10
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm11
- movdqa xmm0,xmm13
- psubw xmm0,xmm15
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- pand xmm2,xmm1
- movdqa xmm1,xmm11
- movdqa xmm3,[rsp+30h]
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm9
- pand xmm2,xmm1
- psubw xmm0,xmm8
- psubw xmm9,xmm3
- pand xmm2,xmm7
- pand xmm6,xmm2
- psubw xmm15,xmm6
- paddw xmm13,xmm6
- movdqa xmm2,[rsp]
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- psubw xmm8,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm3
- movdqa xmm5,[rsp+10h]
- psubw xmm0,xmm2
- psraw xmm1,3
- movdqa xmm4,xmm5
- pabsw xmm0,xmm0
- pmaxsw xmm10,xmm1
- movdqa xmm1,xmm11
- pcmpgtw xmm12,xmm0
- pabsw xmm0,xmm9
- pminsw xmm14,xmm10
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm8
- pcmpgtw xmm11,xmm0
- pand xmm12,xmm1
- movdqa xmm1,[rsp+20h]
- pand xmm12,xmm11
- pand xmm12,xmm7
- pand xmm14,xmm12
- paddw xmm3,xmm14
- psubw xmm2,xmm14
- packuswb xmm13,xmm3
- packuswb xmm15,xmm2
- punpcklbw xmm4,xmm13
- punpckhbw xmm5,xmm13
- movdqa xmm0,xmm15
- punpcklbw xmm0,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm4
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm5
- movdqa xmm2,xmm3
- movdqa xmm1,xmm4
- punpcklwd xmm0,xmm15
- punpckhwd xmm5,xmm15
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm5
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm5
- punpckhqdq xmm2,xmm1
- movdqa [rsp+40h],xmm0
- movdqa xmm0,xmm3
- movdqa [rsp+90h],xmm2
- mov eax,[rsp+40h]
- mov [rdi-2],eax
- mov eax, [rsp+90h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [rsi+rdi-2],eax
- movdqa [rsp+50h],xmm0
- mov eax,[rsp+50h]
- movdqa [rsp+0A0h],xmm3
- mov [rdi+rsi*2-2],eax
- mov eax,[rsp+0A0h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+48h]
- mov [rbx],eax
- mov eax,[rsp+98h]
- mov [rsi+rbx],eax
- mov eax,[rsp+58h]
- mov [rbx+rsi*2],eax
- mov eax, [rsp+0A8h]
- mov [r10+rbx],eax
- mov eax, [rsp+44h]
- mov [r12-2],eax
- mov eax,[rsp+94h]
- mov [rsi+r12-2],eax
- mov eax,[rsp+54h]
- mov [r12+rsi*2-2],eax
- mov eax, [rsp+0A4h]
- mov [r10+r12-2],eax
- mov eax,[rsp+4Ch]
- mov [rbp],eax
- mov eax,[rsp+9Ch]
- mov [rsi+rbp],eax
- mov eax, [rsp+5Ch]
- mov [rbp+rsi*2],eax
- mov eax,[rsp+0ACh]
- mov [r10+rbp],eax
- lea r11,[rsp+170h]
- mov rsp,r11
- pop r14
- pop r13
- pop r12
- pop rbp
- pop rbx
- ret
-
-
-
%elifdef X86_32
;***************************************************************************
@@ -1921,315 +1455,6 @@
psrldq xmm1,4
psrldq xmm6,4
mov edi,dword [esp+0Ch]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;*******************************************************************************
-; void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4H_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,108h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+10h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+6Ch]
- push edi
- mov dword [esp+0Ch],esi
- mov dword [esp+18h],ecx
- mov dword [esp+10h],edx
- mov dword [esp+1Ch],eax
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+10h]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+1Ch]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- mov eax,dword [ebp+1Ch]
- movsx cx,byte [eax+3]
- movsx dx,byte [eax+2]
- movsx si,byte [eax+1]
- movsx ax,byte [eax]
- movzx edi,cx
- movzx ecx,cx
- movd xmm2,ecx
- movzx ecx,dx
- movzx edx,dx
- movd xmm3,ecx
- movd xmm4,edx
- movzx ecx,si
- movzx edx,si
- movd xmm5,ecx
- pxor xmm0,xmm0
- movd xmm6,edx
- movzx ecx,ax
- movdqa [esp+60h],xmm0
- movzx edx,ax
- movsx eax,word [ebp+14h]
- punpcklwd xmm6,xmm2
- movd xmm1,edi
- movd xmm7,ecx
- movsx ecx,word [ebp+18h]
- movd xmm0,edx
- punpcklwd xmm7,xmm3
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+60h]
- punpcklwd xmm7,xmm5
- movdqa xmm5,[esp+0A0h]
- punpcklwd xmm0,xmm4
- punpcklwd xmm0,xmm6
- movdqa xmm6, [esp+70h]
- punpcklwd xmm0,xmm7
- movdqa xmm7,[esp+80h]
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+0D0h],xmm2
- movd xmm2,eax
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm4,xmm3,0
- movd xmm2,ecx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3, [esp+90h]
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- movdqa [esp+40h],xmm2
- movdqa [esp+0B0h],xmm6
- movdqa xmm6,[esp+90h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm2,xmm1
- punpcklbw xmm3,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0F0h],xmm7
- movdqa [esp+0C0h],xmm6
- movdqa xmm6, [esp+0A0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+0E0h],xmm6
- mov edx,4
- movsx eax,dx
- movd xmm6,eax
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa [esp+30h],xmm6
- movdqa xmm7, [esp+40h]
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa [esp+60h],xmm6
- movdqa xmm1, [esp+0D0h]
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6,[esp+30h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa xmm7,[esp+50h]
- movdqa [esp+20h],xmm0
- movdqa xmm6, [esp+20h]
- pminsw xmm6,xmm1
- movdqa [esp+20h],xmm6
- movdqa xmm6,xmm4
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+40h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1, [esp+50h]
- pand xmm6,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5, [esp+0B0h]
- psubw xmm5,[esp+0E0h]
- pand xmm6,xmm1
- pand xmm6, [esp+60h]
- movdqa xmm1, [esp+20h]
- pand xmm1,xmm6
- movdqa xmm6, [esp+0C0h]
- movdqa [esp+40h],xmm1
- movdqa xmm1, [esp+0F0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6, [esp+30h]
- movdqa xmm5, [esp+0D0h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+0C0h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+0B0h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6, [esp+0E0h]
- pand xmm4,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+60h]
- pand xmm0,xmm4
- movdqa xmm4, [esp+40h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm2,xmm1
- packuswb xmm3,xmm5
- movdqa [esp+80h],xmm2
- movdqa [esp+90h],xmm3
- mov esi,dword [esp+1Ch]
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+10h]
- movdqa xmm2, [esi+20h]
- movdqa xmm3, [esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+10h]
movd dword [edi],xmm0
movd dword [edi+ecx],xmm5
movd dword [edi+ecx*2],xmm1