ref: 65b339815efb168f48e256231299a932f397fb51
parent: ddcfc09c495c81dd6cf4824d1f3f345481c75d61
author: Martin Storsjö <martin@martin.st>
date: Sun Jan 5 09:16:22 EST 2014
Get rid of trailing whitespace in the assembly source files
--- a/codec/common/asm_inc.asm
+++ b/codec/common/asm_inc.asm
@@ -154,7 +154,7 @@
%define PUSHRFLAGS pushfq
%define POPRFLAGS popfq
%define retrq rax
-%define retrd eax
+%define retrd eax
%elifdef X86_32 ; X86_32 ;************************************
@@ -233,7 +233,7 @@
%macro LOAD_4_PARA 0
%ifdef X86_32
push r3
- %assign push_num push_num+1
+ %assign push_num push_num+1
mov r0, [esp + push_num*4 + 4]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
@@ -245,7 +245,7 @@
%ifdef X86_32
push r3
push r4
- %assign push_num push_num+2
+ %assign push_num push_num+2
mov r0, [esp + push_num*4 + 4]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
@@ -261,7 +261,7 @@
push r3
push r4
push r5
- %assign push_num push_num+3
+ %assign push_num push_num+3
mov r0, [esp + push_num*4 + 4]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
@@ -280,7 +280,7 @@
push r4
push r5
push r6
- %assign push_num push_num+4
+ %assign push_num push_num+4
mov r0, [esp + push_num*4 + 4]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
@@ -334,7 +334,7 @@
movsx %1, %2
%endif
%endmacro
-
+
%macro WELS_EXTERN 1
%ifdef PREFIX
global _%1
--- a/codec/common/cpuid.asm
+++ b/codec/common/cpuid.asm
@@ -81,17 +81,17 @@
%ifdef WIN64
WelsCPUId:
- push rbx
- push rdx
-
+ push rbx
+ push rdx
+
mov eax, ecx
mov rcx, [r9]
- cpuid
+ cpuid
mov [r9], ecx
mov [r8], ebx
- mov rcx, [rsp + 2*8 + 40]
+ mov rcx, [rsp + 2*8 + 40]
mov [rcx], edx
- pop rdx
+ pop rdx
mov [rdx], eax
pop rbx
@@ -103,8 +103,8 @@
push rcx
push rdx
- mov eax, edi
- mov rcx, [rcx]
+ mov eax, edi
+ mov rcx, [rcx]
cpuid
mov [r8], edx
pop rdx
@@ -156,9 +156,9 @@
%elifdef UNIX64
mov eax, edi
mov ecx, esi
-%else
+%else
mov eax, [esp+4]
- mov ecx, [esp+8]
+ mov ecx, [esp+8]
%endif
; refer to detection of AVX addressed in INTEL AVX manual document
--- a/codec/common/deblock.asm
+++ b/codec/common/deblock.asm
@@ -57,264 +57,264 @@
SECTION .text
-%ifdef WIN64
+%ifdef WIN64
WELS_EXTERN DeblockLumaLt4V_sse2
DeblockLumaLt4V_sse2:
- push rbp
+ push rbp
mov r11,[rsp + 16 + 20h] ; pTC
- sub rsp,1B0h
- lea rbp,[rsp+20h]
- movd xmm4,r8d
- movd xmm2,r9d
- mov qword [rbp+180h],r12
- mov r10,rcx
- movsxd r12,edx
- add edx,edx
- movsxd rdx,edx
- sub r10,r12
- movsx r8d,byte [r11]
- pxor xmm3,xmm3
- punpcklwd xmm2,xmm2
- movaps [rbp+50h],xmm14
- lea rax,[r12+r12*2]
- movdqa xmm14,[rdx+rcx]
- neg rax
- pshufd xmm0,xmm2,0
- movd xmm2,r8d
- movsx edx,byte [r11+1]
- movsx r8d,byte [r11+2]
- movsx r11d,byte [r11+3]
- movaps [rbp+70h],xmm12
- movd xmm1,edx
- movaps [rbp+80h],xmm11
- movd xmm12,r8d
- movd xmm11,r11d
- movdqa xmm5, [rax+rcx]
- lea rax,[r12+r12]
- punpcklwd xmm12,xmm12
- neg rax
- punpcklwd xmm11,xmm11
- movaps [rbp],xmm8
- movdqa xmm8, [r10]
- punpcklwd xmm2,xmm2
- punpcklwd xmm1,xmm1
- punpcklqdq xmm12,xmm12
- punpcklqdq xmm11,xmm11
- punpcklqdq xmm2,xmm2
- punpcklqdq xmm1,xmm1
- shufps xmm12,xmm11,88h
- movdqa xmm11,xmm8
- movaps [rbp+30h],xmm9
- movdqa xmm9,[rcx]
- shufps xmm2,xmm1,88h
- movdqa xmm1,xmm5
- punpcklbw xmm11,xmm3
- movaps [rbp+20h],xmm6
- movaps [rbp+60h],xmm13
- movdqa xmm13,xmm11
- movaps [rbp+90h],xmm10
- movdqa xmm10,xmm9
- movdqa xmm6,[rax+rcx]
- punpcklbw xmm1,xmm3
- movaps [rbp+0A0h],xmm12
- psubw xmm13,xmm1
- movaps [rbp+40h],xmm15
- movdqa xmm15,xmm14
- movaps [rbp+10h],xmm7
- movdqa xmm7,xmm6
- punpcklbw xmm10,xmm3
- movdqa xmm12,[r12+rcx]
- punpcklbw xmm7,xmm3
- punpcklbw xmm12,xmm3
- punpcklbw xmm15,xmm3
- pabsw xmm3,xmm13
- movdqa xmm13,xmm10
- psubw xmm13,xmm15
- movdqa [rbp+0F0h],xmm15
- pabsw xmm15,xmm13
- movdqa xmm13,xmm11
- movdqa [rbp+0B0h],xmm1
- movdqa xmm1,xmm0
- pavgw xmm13,xmm10
- pcmpgtw xmm1,xmm3
- movdqa [rbp+120h],xmm13
- movaps xmm13,xmm2
- punpcklwd xmm4,xmm4
- movdqa xmm3,xmm0
- movdqa [rbp+100h],xmm1
- psubw xmm13,xmm1
- movdqa xmm1,xmm10
- pcmpgtw xmm3,xmm15
- pshufd xmm4,xmm4,0
- psubw xmm1,xmm11
- movdqa [rbp+0D0h],xmm10
- psubw xmm13,xmm3
- movdqa [rbp+110h],xmm3
- pabsw xmm15,xmm1
- movdqa xmm3,xmm4
- psubw xmm10,xmm12
- pcmpgtw xmm3,xmm15
- pabsw xmm15,xmm10
- movdqa xmm10,xmm0
- psllw xmm1,2
- movdqa [rbp+0C0h],xmm11
- psubw xmm11,xmm7
- pcmpgtw xmm10,xmm15
- pabsw xmm11,xmm11
- movdqa xmm15,xmm0
- pand xmm3,xmm10
- pcmpgtw xmm15,xmm11
- movaps xmm11,xmm2
- pxor xmm10,xmm10
- pand xmm3,xmm15
- pcmpgtw xmm11,xmm10
- pcmpeqw xmm10,xmm2
- por xmm11,xmm10
- pand xmm3,xmm11
- movdqa xmm11,xmm7
- psubw xmm11,xmm12
- pxor xmm15,xmm15
- paddw xmm11,xmm1
- psubw xmm15,xmm13
- movdqa [rbp+0E0h],xmm12
- paddw xmm11,[FOUR_16B_SSE2]
- pxor xmm12,xmm12
- psraw xmm11,3
- punpckhbw xmm8,xmm12
- pmaxsw xmm15,xmm11
- punpckhbw xmm5,xmm12
- movdqa xmm11,xmm8
- pminsw xmm13,xmm15
- psubw xmm11,xmm5
- punpckhbw xmm9,xmm12
- pand xmm13,xmm3
- movdqa [rbp+130h],xmm13
- pabsw xmm13,xmm11
- punpckhbw xmm14,xmm12
- movdqa xmm11,xmm9
- psubw xmm11,xmm14
- movdqa xmm15,xmm0
- movdqa [rbp+140h],xmm14
- pabsw xmm14,xmm11
- movdqa xmm11,xmm8
- pcmpgtw xmm15,xmm14
- movdqa xmm1,[r12+rcx]
- pavgw xmm11,xmm9
- movdqa [rbp+170h],xmm11
- movdqa xmm10,xmm9
- punpckhbw xmm6,xmm12
- psubw xmm10,xmm8
- punpckhbw xmm1,xmm12
- movdqa xmm12,xmm0
- movaps xmm11,[rbp+0A0h]
- pcmpgtw xmm12,xmm13
- movaps xmm13,xmm11
- psubw xmm13,xmm12
- movdqa [rbp+160h],xmm15
- psubw xmm13,xmm15
- movdqa xmm15,xmm9
- psubw xmm15,xmm1
- movdqa [rbp+150h],xmm12
- pabsw xmm12,xmm10
- pabsw xmm14,xmm15
- movdqa xmm15,xmm8
- pcmpgtw xmm4,xmm12
- movdqa xmm12,xmm0
- psubw xmm15,xmm6
- pcmpgtw xmm12,xmm14
- pabsw xmm14,xmm15
- psllw xmm10,2
- pcmpgtw xmm0,xmm14
- movdqa xmm14,xmm6
- psubw xmm14,xmm1
- pand xmm4,xmm12
- paddw xmm14,xmm10
- pand xmm4,xmm0
- paddw xmm14,[FOUR_16B_SSE2]
- pxor xmm15,xmm15
- movaps xmm12,xmm11
- psubw xmm15,xmm13
- pxor xmm0,xmm0
- psraw xmm14,3
- pcmpgtw xmm12,xmm0
- pcmpeqw xmm0,xmm11
- pmaxsw xmm15,xmm14
- por xmm12,xmm0
- movdqa xmm0,[rbp+120h]
- pminsw xmm13,xmm15
- movdqa xmm15,[rbp+0B0h]
- movdqa xmm10,xmm7
- pand xmm4,xmm12
- paddw xmm15,xmm0
- pxor xmm12,xmm12
- paddw xmm10,xmm7
- movdqa xmm14,xmm12
- psubw xmm15,xmm10
- psubw xmm14,xmm2
- psraw xmm15,1
- pmaxsw xmm15,xmm14
- movdqa xmm10,xmm6
- pminsw xmm15,xmm2
- paddw xmm10,xmm6
- pand xmm15,xmm3
- psubw xmm12,xmm11
- pand xmm15,[rbp+100h]
- pand xmm13,xmm4
- paddw xmm7,xmm15
- paddw xmm8,xmm13
- movdqa xmm15,[rbp+170h]
- psubw xmm9,xmm13
- paddw xmm5,xmm15
- psubw xmm5,xmm10
- psraw xmm5,1
- pmaxsw xmm5,xmm12
- pminsw xmm5,xmm11
- pand xmm5,xmm4
- pand xmm5,[rbp+150h]
- paddw xmm6,xmm5
- movdqa xmm5,[rbp+0C0h]
- packuswb xmm7,xmm6
- movdqa xmm6,[rbp+130h]
- paddw xmm5,xmm6
- packuswb xmm5,xmm8
- movdqa xmm8,[rbp+0D0h]
- psubw xmm8,xmm6
- movdqa xmm6,[rbp+0F0h]
- paddw xmm6,xmm0
- movdqa xmm0,[rbp+0E0h]
- packuswb xmm8,xmm9
- movdqa xmm9,xmm0
- paddw xmm9,xmm0
- psubw xmm6,xmm9
- psraw xmm6,1
- pmaxsw xmm14,xmm6
- pminsw xmm2,xmm14
- pand xmm2,xmm3
- pand xmm2,[rbp+110h]
- paddw xmm0,xmm2
- movdqa xmm2,[rbp+140h]
- paddw xmm2,xmm15
- movdqa xmm15,xmm1
- paddw xmm15,xmm1
- psubw xmm2,xmm15
- psraw xmm2,1
- pmaxsw xmm12,xmm2
- pminsw xmm11,xmm12
- pand xmm11,xmm4
- pand xmm11,[rbp+160h]
- paddw xmm1,xmm11
- movdqa [rax+rcx],xmm7
- movdqa [r10],xmm5
- packuswb xmm0,xmm1
- movdqa [rcx],xmm8
- movdqa [r12+rcx],xmm0
- mov r12,qword [rbp+180h]
- lea rsp,[rbp+190h]
- pop rbp
- ret
+ sub rsp,1B0h
+ lea rbp,[rsp+20h]
+ movd xmm4,r8d
+ movd xmm2,r9d
+ mov qword [rbp+180h],r12
+ mov r10,rcx
+ movsxd r12,edx
+ add edx,edx
+ movsxd rdx,edx
+ sub r10,r12
+ movsx r8d,byte [r11]
+ pxor xmm3,xmm3
+ punpcklwd xmm2,xmm2
+ movaps [rbp+50h],xmm14
+ lea rax,[r12+r12*2]
+ movdqa xmm14,[rdx+rcx]
+ neg rax
+ pshufd xmm0,xmm2,0
+ movd xmm2,r8d
+ movsx edx,byte [r11+1]
+ movsx r8d,byte [r11+2]
+ movsx r11d,byte [r11+3]
+ movaps [rbp+70h],xmm12
+ movd xmm1,edx
+ movaps [rbp+80h],xmm11
+ movd xmm12,r8d
+ movd xmm11,r11d
+ movdqa xmm5, [rax+rcx]
+ lea rax,[r12+r12]
+ punpcklwd xmm12,xmm12
+ neg rax
+ punpcklwd xmm11,xmm11
+ movaps [rbp],xmm8
+ movdqa xmm8, [r10]
+ punpcklwd xmm2,xmm2
+ punpcklwd xmm1,xmm1
+ punpcklqdq xmm12,xmm12
+ punpcklqdq xmm11,xmm11
+ punpcklqdq xmm2,xmm2
+ punpcklqdq xmm1,xmm1
+ shufps xmm12,xmm11,88h
+ movdqa xmm11,xmm8
+ movaps [rbp+30h],xmm9
+ movdqa xmm9,[rcx]
+ shufps xmm2,xmm1,88h
+ movdqa xmm1,xmm5
+ punpcklbw xmm11,xmm3
+ movaps [rbp+20h],xmm6
+ movaps [rbp+60h],xmm13
+ movdqa xmm13,xmm11
+ movaps [rbp+90h],xmm10
+ movdqa xmm10,xmm9
+ movdqa xmm6,[rax+rcx]
+ punpcklbw xmm1,xmm3
+ movaps [rbp+0A0h],xmm12
+ psubw xmm13,xmm1
+ movaps [rbp+40h],xmm15
+ movdqa xmm15,xmm14
+ movaps [rbp+10h],xmm7
+ movdqa xmm7,xmm6
+ punpcklbw xmm10,xmm3
+ movdqa xmm12,[r12+rcx]
+ punpcklbw xmm7,xmm3
+ punpcklbw xmm12,xmm3
+ punpcklbw xmm15,xmm3
+ pabsw xmm3,xmm13
+ movdqa xmm13,xmm10
+ psubw xmm13,xmm15
+ movdqa [rbp+0F0h],xmm15
+ pabsw xmm15,xmm13
+ movdqa xmm13,xmm11
+ movdqa [rbp+0B0h],xmm1
+ movdqa xmm1,xmm0
+ pavgw xmm13,xmm10
+ pcmpgtw xmm1,xmm3
+ movdqa [rbp+120h],xmm13
+ movaps xmm13,xmm2
+ punpcklwd xmm4,xmm4
+ movdqa xmm3,xmm0
+ movdqa [rbp+100h],xmm1
+ psubw xmm13,xmm1
+ movdqa xmm1,xmm10
+ pcmpgtw xmm3,xmm15
+ pshufd xmm4,xmm4,0
+ psubw xmm1,xmm11
+ movdqa [rbp+0D0h],xmm10
+ psubw xmm13,xmm3
+ movdqa [rbp+110h],xmm3
+ pabsw xmm15,xmm1
+ movdqa xmm3,xmm4
+ psubw xmm10,xmm12
+ pcmpgtw xmm3,xmm15
+ pabsw xmm15,xmm10
+ movdqa xmm10,xmm0
+ psllw xmm1,2
+ movdqa [rbp+0C0h],xmm11
+ psubw xmm11,xmm7
+ pcmpgtw xmm10,xmm15
+ pabsw xmm11,xmm11
+ movdqa xmm15,xmm0
+ pand xmm3,xmm10
+ pcmpgtw xmm15,xmm11
+ movaps xmm11,xmm2
+ pxor xmm10,xmm10
+ pand xmm3,xmm15
+ pcmpgtw xmm11,xmm10
+ pcmpeqw xmm10,xmm2
+ por xmm11,xmm10
+ pand xmm3,xmm11
+ movdqa xmm11,xmm7
+ psubw xmm11,xmm12
+ pxor xmm15,xmm15
+ paddw xmm11,xmm1
+ psubw xmm15,xmm13
+ movdqa [rbp+0E0h],xmm12
+ paddw xmm11,[FOUR_16B_SSE2]
+ pxor xmm12,xmm12
+ psraw xmm11,3
+ punpckhbw xmm8,xmm12
+ pmaxsw xmm15,xmm11
+ punpckhbw xmm5,xmm12
+ movdqa xmm11,xmm8
+ pminsw xmm13,xmm15
+ psubw xmm11,xmm5
+ punpckhbw xmm9,xmm12
+ pand xmm13,xmm3
+ movdqa [rbp+130h],xmm13
+ pabsw xmm13,xmm11
+ punpckhbw xmm14,xmm12
+ movdqa xmm11,xmm9
+ psubw xmm11,xmm14
+ movdqa xmm15,xmm0
+ movdqa [rbp+140h],xmm14
+ pabsw xmm14,xmm11
+ movdqa xmm11,xmm8
+ pcmpgtw xmm15,xmm14
+ movdqa xmm1,[r12+rcx]
+ pavgw xmm11,xmm9
+ movdqa [rbp+170h],xmm11
+ movdqa xmm10,xmm9
+ punpckhbw xmm6,xmm12
+ psubw xmm10,xmm8
+ punpckhbw xmm1,xmm12
+ movdqa xmm12,xmm0
+ movaps xmm11,[rbp+0A0h]
+ pcmpgtw xmm12,xmm13
+ movaps xmm13,xmm11
+ psubw xmm13,xmm12
+ movdqa [rbp+160h],xmm15
+ psubw xmm13,xmm15
+ movdqa xmm15,xmm9
+ psubw xmm15,xmm1
+ movdqa [rbp+150h],xmm12
+ pabsw xmm12,xmm10
+ pabsw xmm14,xmm15
+ movdqa xmm15,xmm8
+ pcmpgtw xmm4,xmm12
+ movdqa xmm12,xmm0
+ psubw xmm15,xmm6
+ pcmpgtw xmm12,xmm14
+ pabsw xmm14,xmm15
+ psllw xmm10,2
+ pcmpgtw xmm0,xmm14
+ movdqa xmm14,xmm6
+ psubw xmm14,xmm1
+ pand xmm4,xmm12
+ paddw xmm14,xmm10
+ pand xmm4,xmm0
+ paddw xmm14,[FOUR_16B_SSE2]
+ pxor xmm15,xmm15
+ movaps xmm12,xmm11
+ psubw xmm15,xmm13
+ pxor xmm0,xmm0
+ psraw xmm14,3
+ pcmpgtw xmm12,xmm0
+ pcmpeqw xmm0,xmm11
+ pmaxsw xmm15,xmm14
+ por xmm12,xmm0
+ movdqa xmm0,[rbp+120h]
+ pminsw xmm13,xmm15
+ movdqa xmm15,[rbp+0B0h]
+ movdqa xmm10,xmm7
+ pand xmm4,xmm12
+ paddw xmm15,xmm0
+ pxor xmm12,xmm12
+ paddw xmm10,xmm7
+ movdqa xmm14,xmm12
+ psubw xmm15,xmm10
+ psubw xmm14,xmm2
+ psraw xmm15,1
+ pmaxsw xmm15,xmm14
+ movdqa xmm10,xmm6
+ pminsw xmm15,xmm2
+ paddw xmm10,xmm6
+ pand xmm15,xmm3
+ psubw xmm12,xmm11
+ pand xmm15,[rbp+100h]
+ pand xmm13,xmm4
+ paddw xmm7,xmm15
+ paddw xmm8,xmm13
+ movdqa xmm15,[rbp+170h]
+ psubw xmm9,xmm13
+ paddw xmm5,xmm15
+ psubw xmm5,xmm10
+ psraw xmm5,1
+ pmaxsw xmm5,xmm12
+ pminsw xmm5,xmm11
+ pand xmm5,xmm4
+ pand xmm5,[rbp+150h]
+ paddw xmm6,xmm5
+ movdqa xmm5,[rbp+0C0h]
+ packuswb xmm7,xmm6
+ movdqa xmm6,[rbp+130h]
+ paddw xmm5,xmm6
+ packuswb xmm5,xmm8
+ movdqa xmm8,[rbp+0D0h]
+ psubw xmm8,xmm6
+ movdqa xmm6,[rbp+0F0h]
+ paddw xmm6,xmm0
+ movdqa xmm0,[rbp+0E0h]
+ packuswb xmm8,xmm9
+ movdqa xmm9,xmm0
+ paddw xmm9,xmm0
+ psubw xmm6,xmm9
+ psraw xmm6,1
+ pmaxsw xmm14,xmm6
+ pminsw xmm2,xmm14
+ pand xmm2,xmm3
+ pand xmm2,[rbp+110h]
+ paddw xmm0,xmm2
+ movdqa xmm2,[rbp+140h]
+ paddw xmm2,xmm15
+ movdqa xmm15,xmm1
+ paddw xmm15,xmm1
+ psubw xmm2,xmm15
+ psraw xmm2,1
+ pmaxsw xmm12,xmm2
+ pminsw xmm11,xmm12
+ pand xmm11,xmm4
+ pand xmm11,[rbp+160h]
+ paddw xmm1,xmm11
+ movdqa [rax+rcx],xmm7
+ movdqa [r10],xmm5
+ packuswb xmm0,xmm1
+ movdqa [rcx],xmm8
+ movdqa [r12+rcx],xmm0
+ mov r12,qword [rbp+180h]
+ lea rsp,[rbp+190h]
+ pop rbp
+ ret
WELS_EXTERN DeblockLumaEq4V_sse2
@@ -321,462 +321,462 @@
ALIGN 16
DeblockLumaEq4V_sse2:
- mov rax,rsp
- push rbx
- push rbp
- push rsi
- push rdi
- sub rsp,1D8h
- movaps [rax-38h],xmm6
- movaps [rax-48h],xmm7
- movaps [rax-58h],xmm8
- pxor xmm1,xmm1
- movsxd r10,edx
- mov rbp,rcx
- mov r11d,r8d
- mov rdx,rcx
- mov rdi,rbp
- mov rbx,rbp
- movdqa xmm5,[rbp]
- movaps [rax-68h],xmm9
- movaps [rax-78h],xmm10
- punpcklbw xmm5,xmm1
- movaps [rax-88h],xmm11
- movaps [rax-98h],xmm12
- movaps [rax-0A8h],xmm13
- movaps [rax-0B8h],xmm14
- movdqa xmm14,[r10+rbp]
- movaps [rax-0C8h],xmm15
- lea eax,[r10*4]
- movsxd r8,eax
- lea eax,[r10+r10*2]
- movsxd rcx,eax
- lea eax,[r10+r10]
- sub rdx,r8
- punpcklbw xmm14,xmm1
- movdqa [rsp+90h],xmm5
- movdqa [rsp+30h],xmm14
- movsxd rsi,eax
- movsx eax,r11w
- sub rdi,rcx
- sub rbx,rsi
- mov r8,rbp
- sub r8,r10
- movd xmm0,eax
- movsx eax,r9w
- movdqa xmm12,[rdi]
- movdqa xmm6, [rsi+rbp]
- movdqa xmm13,[rbx]
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- punpcklbw xmm13,xmm1
- punpcklbw xmm6,xmm1
- movdqa xmm8,[r8]
- movd xmm0,eax
- movdqa xmm10,xmm11
- mov eax,2
- punpcklbw xmm8,xmm1
- punpcklbw xmm12,xmm1
- cwde
- punpcklwd xmm0,xmm0
- psraw xmm10,2
- movdqa xmm1,xmm8
- movdqa [rsp+0F0h],xmm13
- movdqa [rsp+0B0h],xmm8
- pshufd xmm7,xmm0,0
- psubw xmm1,xmm13
- movdqa xmm0,xmm5
- movdqa xmm4,xmm7
- movdqa xmm2,xmm7
- psubw xmm0,xmm8
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm5
- movdqa [rsp+40h],xmm7
- movdqa [rsp+60h],xmm6
- pcmpgtw xmm4,xmm0
- psubw xmm1,xmm14
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm4,xmm2
- movdqa xmm0,xmm11
- pcmpgtw xmm0,xmm3
- pand xmm4,xmm0
- movd xmm0,eax
- movdqa [rsp+20h],xmm4
- punpcklwd xmm0,xmm0
- pshufd xmm2,xmm0,0
- paddw xmm10,xmm2
- movdqa [rsp+0A0h],xmm2
- movdqa xmm15,xmm7
- pxor xmm4,xmm4
- movdqa xmm0,xmm8
- psubw xmm0,xmm12
- mov eax,4
- pabsw xmm0,xmm0
- movdqa xmm1,xmm10
- cwde
- pcmpgtw xmm15,xmm0
- pcmpgtw xmm1,xmm3
- movdqa xmm3,xmm7
- movdqa xmm7,[rdx]
- movdqa xmm0,xmm5
- psubw xmm0,xmm6
- pand xmm15,xmm1
- punpcklbw xmm7,xmm4
- movdqa xmm9,xmm15
- pabsw xmm0,xmm0
- psllw xmm7,1
- pandn xmm9,xmm12
- pcmpgtw xmm3,xmm0
- paddw xmm7,xmm12
- movd xmm0,eax
- pand xmm3,xmm1
- paddw xmm7,xmm12
- punpcklwd xmm0,xmm0
- paddw xmm7,xmm12
- pshufd xmm1,xmm0,0
- paddw xmm7,xmm13
- movdqa xmm0,xmm3
- pandn xmm0,xmm6
- paddw xmm7,xmm8
- movdqa [rsp+70h],xmm1
- paddw xmm7,xmm5
- movdqa [rsp+120h],xmm0
- movdqa xmm0,[rcx+rbp]
- punpcklbw xmm0,xmm4
- paddw xmm7,xmm1
- movdqa xmm4,xmm15
- psllw xmm0,1
- psraw xmm7,3
- paddw xmm0,xmm6
- pand xmm7,xmm15
- paddw xmm0,xmm6
- paddw xmm0,xmm6
- paddw xmm0,xmm14
- movdqa xmm6,xmm15
- paddw xmm0,xmm5
- pandn xmm6,xmm13
- paddw xmm0,xmm8
- paddw xmm0,xmm1
- psraw xmm0,3
- movdqa xmm1,xmm12
- paddw xmm1,xmm13
- pand xmm0,xmm3
- movdqa [rsp+100h],xmm0
- movdqa xmm0,xmm8
- paddw xmm0,xmm5
- paddw xmm1,xmm0
- movdqa xmm0,xmm3
- paddw xmm1,xmm2
- psraw xmm1,2
- pandn xmm0,xmm14
- pand xmm4,xmm1
- movdqa [rsp+0E0h],xmm0
- movdqa xmm0,xmm5
- paddw xmm0,xmm8
- movdqa xmm1,[rsp+60h]
- paddw xmm1,xmm14
- movdqa xmm14,xmm3
- paddw xmm1,xmm0
- movdqa xmm0,xmm8
- paddw xmm0,[rsp+30h]
- paddw xmm1,xmm2
- psraw xmm1,2
- pand xmm14,xmm1
- movdqa xmm1,xmm13
- paddw xmm1,xmm13
- paddw xmm1,xmm0
- paddw xmm1,xmm2
- psraw xmm1,2
- movdqa xmm0,[rsp+30h]
- movdqa xmm2,xmm13
- movdqa xmm5,xmm15
- paddw xmm0,[rsp+70h]
- pandn xmm5,xmm1
- paddw xmm2,xmm8
- movdqa xmm8,[rsp+90h]
- movdqa xmm1,xmm12
- paddw xmm2,xmm8
- psllw xmm2,1
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,xmm8
- movdqa xmm8,xmm3
- movdqa xmm2,[rsp+30h]
- paddw xmm0,xmm13
- psraw xmm1,3
- pand xmm15,xmm1
- movdqa xmm1,xmm2
- paddw xmm1,xmm2
- paddw xmm2,[rsp+90h]
- paddw xmm2,[rsp+0B0h]
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- movdqa xmm13,[r8]
- paddw xmm0, [rsp+70h]
- paddw xmm1, [rsp+0A0h]
- psllw xmm2,1
- paddw xmm2,xmm0
- psraw xmm1,2
- movdqa xmm0, [rdi]
- pandn xmm8,xmm1
- movdqa xmm1, [rsp+60h]
- paddw xmm1,xmm2
- movdqa xmm2, [rbx]
- psraw xmm1,3
- pand xmm3,xmm1
- movdqa xmm1, [rbp]
- movdqa [rsp+0D0h],xmm3
- pxor xmm3,xmm3
- punpckhbw xmm0,xmm3
- punpckhbw xmm1,xmm3
- punpckhbw xmm13,xmm3
- movdqa [rsp+0C0h],xmm0
- movdqa xmm0,[r10+rbp]
- movdqa [rsp],xmm1
- punpckhbw xmm0,xmm3
- punpckhbw xmm2,xmm3
- movdqa [rsp+80h],xmm0
- movdqa xmm0,[rsi+rbp]
- movdqa [rsp+10h],xmm13
- punpckhbw xmm0,xmm3
- movdqa [rsp+50h],xmm0
- movdqa xmm0,xmm1
- movdqa xmm1,xmm13
- psubw xmm0,xmm13
- psubw xmm1,xmm2
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,[rsp]
- movdqa xmm13,[rsp+40h]
- movdqa [rsp+110h],xmm2
- psubw xmm1, [rsp+80h]
- pcmpgtw xmm13,xmm0
- pcmpgtw xmm11,xmm3
- pabsw xmm0,xmm1
- pcmpgtw xmm10,xmm3
- movdqa xmm1, [rsp+40h]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1
- pcmpgtw xmm2,xmm0
- movdqa xmm0, [rsp+10h]
- pand xmm13,xmm2
- pand xmm13,xmm11
- movdqa xmm11,[rsp+0C0h]
- psubw xmm0,xmm11
- pabsw xmm0,xmm0
- pcmpgtw xmm3,xmm0
- pand xmm3,xmm10
- movdqa xmm0,[rsp]
- psubw xmm0,[rsp+50h]
- movdqa xmm2,[rdx]
- pabsw xmm0,xmm0
- por xmm7,xmm9
- movdqa xmm9,[rsp+20h]
- pcmpgtw xmm1,xmm0
- pand xmm9,xmm7
- movdqa xmm7,[rsp+20h]
- movdqa xmm0,xmm7
- pandn xmm0,xmm12
- movdqa xmm12,[rsp+110h]
- pand xmm1,xmm10
- movdqa xmm10,[rsp+70h]
- movdqa [rsp+40h],xmm1
- movdqa xmm1,xmm13
- por xmm9,xmm0
- pxor xmm0,xmm0
- por xmm4,xmm6
- movdqa xmm6,xmm7
- punpckhbw xmm2,xmm0
- por xmm15,xmm5
- movdqa xmm5,[rsp+20h]
- movdqa xmm0,xmm3
- psllw xmm2,1
- pandn xmm0,xmm11
- pand xmm6,xmm4
- movdqa xmm4,[rsp]
- paddw xmm2,xmm11
- pand xmm5,xmm15
- movdqa xmm15,[rsp+20h]
- paddw xmm2,xmm11
- paddw xmm2,xmm11
- paddw xmm2,xmm12
- paddw xmm2,[rsp+10h]
- paddw xmm2,[rsp]
- paddw xmm2,xmm10
- psraw xmm2,3
- pand xmm2,xmm3
- por xmm2,xmm0
- pand xmm1,xmm2
- movdqa xmm0,xmm13
- movdqa xmm2,xmm11
- pandn xmm0,xmm11
- paddw xmm2,xmm12
- por xmm1,xmm0
- packuswb xmm9,xmm1
- movdqa xmm0,xmm7
- movdqa xmm7,[rsp+0A0h]
- pandn xmm0,[rsp+0F0h]
- movdqa xmm1,xmm3
- por xmm6,xmm0
- movdqa xmm0,[rsp+10h]
- paddw xmm0,xmm4
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm3
- pandn xmm0,xmm12
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- pandn xmm0,xmm12
- movdqa xmm1,xmm12
- paddw xmm1,[rsp+10h]
- por xmm2,xmm0
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+0B0h]
- paddw xmm1,xmm4
- packuswb xmm6,xmm2
- movdqa xmm2,xmm3
- psllw xmm1,1
- por xmm5,xmm0
- movdqa xmm0,[rsp+80h]
- paddw xmm0,xmm10
- paddw xmm1,xmm0
- paddw xmm11,xmm1
- psraw xmm11,3
- movdqa xmm1,xmm12
- pand xmm2,xmm11
- paddw xmm1,xmm12
- movdqa xmm11,[rsp+80h]
- movdqa xmm0, [rsp+10h]
- por xmm14,[rsp+0E0h]
- paddw xmm0,xmm11
- movdqa xmm4,xmm15
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- paddw xmm1,xmm7
- psraw xmm1,2
- pandn xmm3,xmm1
- por xmm2,xmm3
- movdqa xmm1,xmm13
- movdqa xmm3,[rsp+10h]
- pandn xmm0,xmm3
- pand xmm1,xmm2
- movdqa xmm2,xmm11
- paddw xmm2,[rsp]
- por xmm1,xmm0
- movdqa xmm0,[rsp+0D0h]
- por xmm0,xmm8
- paddw xmm2,xmm3
- packuswb xmm5,xmm1
- movdqa xmm8,[rsp+40h]
- movdqa xmm1,[rsp+50h]
- movdqa xmm3,xmm8
- pand xmm4,xmm0
- psllw xmm2,1
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+90h]
- por xmm4,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm10
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,[rsp]
- movdqa xmm2,xmm11
- paddw xmm0,xmm12
- movdqa xmm12,[rsp]
- paddw xmm2,xmm11
- paddw xmm2,xmm0
- psraw xmm1,3
- movdqa xmm0,xmm8
- pand xmm3,xmm1
- paddw xmm2,xmm7
- movdqa xmm1,xmm13
- psraw xmm2,2
- pandn xmm0,xmm2
- por xmm3,xmm0
- movdqa xmm2,[rsp+50h]
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm3
- paddw xmm2,xmm11
- movdqa xmm3,xmm15
- por xmm1,xmm0
- pand xmm3,xmm14
- movdqa xmm14,[rsp+10h]
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+30h]
- packuswb xmm4,xmm1
- movdqa xmm1,xmm8
- por xmm3,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm14
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm8
- pandn xmm0,xmm11
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm11
- pand xmm2,xmm1
- movdqa xmm1,xmm15
- por xmm2,xmm0
- packuswb xmm3,xmm2
- movdqa xmm0,[rsp+100h]
- por xmm0,[rsp+120h]
- pand xmm1,xmm0
- movdqa xmm2,[rcx+rbp]
- movdqa xmm7,[rsp+50h]
- pandn xmm15,[rsp+60h]
- lea r11,[rsp+1D8h]
- pxor xmm0,xmm0
- por xmm1,xmm15
- movaps xmm15,[r11-0A8h]
- movdqa [rdi],xmm9
- movaps xmm9,[r11-48h]
- punpckhbw xmm2,xmm0
- psllw xmm2,1
- paddw xmm2,xmm7
- paddw xmm2,xmm7
- movdqa [rbx],xmm6
- movaps xmm6,[r11-18h]
- paddw xmm2,xmm7
- paddw xmm2,xmm11
- movaps xmm11,[r11-68h]
- paddw xmm2,xmm12
- movaps xmm12,[r11-78h]
- paddw xmm2,xmm14
- paddw xmm2,xmm10
- psraw xmm2,3
- movaps xmm10,[r11-58h]
- movaps xmm14,[r11-98h]
- movdqa xmm0,xmm13
- pand xmm2,xmm8
- pandn xmm8,xmm7
- pandn xmm13,xmm7
- por xmm2,xmm8
- movaps xmm7,[r11-28h]
- movaps xmm8,[r11-38h]
- movdqa [r8],xmm5
- pand xmm0,xmm2
- por xmm0,xmm13
- packuswb xmm1,xmm0
- movaps xmm13,[r11-88h]
- movdqa [rbp],xmm4
- movdqa [r10+rbp],xmm3
- movdqa [rsi+rbp],xmm1
- mov rsp,r11
- pop rdi
- pop rsi
- pop rbp
- pop rbx
+ mov rax,rsp
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ sub rsp,1D8h
+ movaps [rax-38h],xmm6
+ movaps [rax-48h],xmm7
+ movaps [rax-58h],xmm8
+ pxor xmm1,xmm1
+ movsxd r10,edx
+ mov rbp,rcx
+ mov r11d,r8d
+ mov rdx,rcx
+ mov rdi,rbp
+ mov rbx,rbp
+ movdqa xmm5,[rbp]
+ movaps [rax-68h],xmm9
+ movaps [rax-78h],xmm10
+ punpcklbw xmm5,xmm1
+ movaps [rax-88h],xmm11
+ movaps [rax-98h],xmm12
+ movaps [rax-0A8h],xmm13
+ movaps [rax-0B8h],xmm14
+ movdqa xmm14,[r10+rbp]
+ movaps [rax-0C8h],xmm15
+ lea eax,[r10*4]
+ movsxd r8,eax
+ lea eax,[r10+r10*2]
+ movsxd rcx,eax
+ lea eax,[r10+r10]
+ sub rdx,r8
+ punpcklbw xmm14,xmm1
+ movdqa [rsp+90h],xmm5
+ movdqa [rsp+30h],xmm14
+ movsxd rsi,eax
+ movsx eax,r11w
+ sub rdi,rcx
+ sub rbx,rsi
+ mov r8,rbp
+ sub r8,r10
+ movd xmm0,eax
+ movsx eax,r9w
+ movdqa xmm12,[rdi]
+ movdqa xmm6, [rsi+rbp]
+ movdqa xmm13,[rbx]
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm6,xmm1
+ movdqa xmm8,[r8]
+ movd xmm0,eax
+ movdqa xmm10,xmm11
+ mov eax,2
+ punpcklbw xmm8,xmm1
+ punpcklbw xmm12,xmm1
+ cwde
+ punpcklwd xmm0,xmm0
+ psraw xmm10,2
+ movdqa xmm1,xmm8
+ movdqa [rsp+0F0h],xmm13
+ movdqa [rsp+0B0h],xmm8
+ pshufd xmm7,xmm0,0
+ psubw xmm1,xmm13
+ movdqa xmm0,xmm5
+ movdqa xmm4,xmm7
+ movdqa xmm2,xmm7
+ psubw xmm0,xmm8
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm5
+ movdqa [rsp+40h],xmm7
+ movdqa [rsp+60h],xmm6
+ pcmpgtw xmm4,xmm0
+ psubw xmm1,xmm14
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm4,xmm2
+ movdqa xmm0,xmm11
+ pcmpgtw xmm0,xmm3
+ pand xmm4,xmm0
+ movd xmm0,eax
+ movdqa [rsp+20h],xmm4
+ punpcklwd xmm0,xmm0
+ pshufd xmm2,xmm0,0
+ paddw xmm10,xmm2
+ movdqa [rsp+0A0h],xmm2
+ movdqa xmm15,xmm7
+ pxor xmm4,xmm4
+ movdqa xmm0,xmm8
+ psubw xmm0,xmm12
+ mov eax,4
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm10
+ cwde
+ pcmpgtw xmm15,xmm0
+ pcmpgtw xmm1,xmm3
+ movdqa xmm3,xmm7
+ movdqa xmm7,[rdx]
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm6
+ pand xmm15,xmm1
+ punpcklbw xmm7,xmm4
+ movdqa xmm9,xmm15
+ pabsw xmm0,xmm0
+ psllw xmm7,1
+ pandn xmm9,xmm12
+ pcmpgtw xmm3,xmm0
+ paddw xmm7,xmm12
+ movd xmm0,eax
+ pand xmm3,xmm1
+ paddw xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ paddw xmm7,xmm12
+ pshufd xmm1,xmm0,0
+ paddw xmm7,xmm13
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm6
+ paddw xmm7,xmm8
+ movdqa [rsp+70h],xmm1
+ paddw xmm7,xmm5
+ movdqa [rsp+120h],xmm0
+ movdqa xmm0,[rcx+rbp]
+ punpcklbw xmm0,xmm4
+ paddw xmm7,xmm1
+ movdqa xmm4,xmm15
+ psllw xmm0,1
+ psraw xmm7,3
+ paddw xmm0,xmm6
+ pand xmm7,xmm15
+ paddw xmm0,xmm6
+ paddw xmm0,xmm6
+ paddw xmm0,xmm14
+ movdqa xmm6,xmm15
+ paddw xmm0,xmm5
+ pandn xmm6,xmm13
+ paddw xmm0,xmm8
+ paddw xmm0,xmm1
+ psraw xmm0,3
+ movdqa xmm1,xmm12
+ paddw xmm1,xmm13
+ pand xmm0,xmm3
+ movdqa [rsp+100h],xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,xmm5
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm3
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pandn xmm0,xmm14
+ pand xmm4,xmm1
+ movdqa [rsp+0E0h],xmm0
+ movdqa xmm0,xmm5
+ paddw xmm0,xmm8
+ movdqa xmm1,[rsp+60h]
+ paddw xmm1,xmm14
+ movdqa xmm14,xmm3
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,[rsp+30h]
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pand xmm14,xmm1
+ movdqa xmm1,xmm13
+ paddw xmm1,xmm13
+ paddw xmm1,xmm0
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ movdqa xmm0,[rsp+30h]
+ movdqa xmm2,xmm13
+ movdqa xmm5,xmm15
+ paddw xmm0,[rsp+70h]
+ pandn xmm5,xmm1
+ paddw xmm2,xmm8
+ movdqa xmm8,[rsp+90h]
+ movdqa xmm1,xmm12
+ paddw xmm2,xmm8
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,xmm8
+ movdqa xmm8,xmm3
+ movdqa xmm2,[rsp+30h]
+ paddw xmm0,xmm13
+ psraw xmm1,3
+ pand xmm15,xmm1
+ movdqa xmm1,xmm2
+ paddw xmm1,xmm2
+ paddw xmm2,[rsp+90h]
+ paddw xmm2,[rsp+0B0h]
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ movdqa xmm13,[r8]
+ paddw xmm0, [rsp+70h]
+ paddw xmm1, [rsp+0A0h]
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ psraw xmm1,2
+ movdqa xmm0, [rdi]
+ pandn xmm8,xmm1
+ movdqa xmm1, [rsp+60h]
+ paddw xmm1,xmm2
+ movdqa xmm2, [rbx]
+ psraw xmm1,3
+ pand xmm3,xmm1
+ movdqa xmm1, [rbp]
+ movdqa [rsp+0D0h],xmm3
+ pxor xmm3,xmm3
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm1,xmm3
+ punpckhbw xmm13,xmm3
+ movdqa [rsp+0C0h],xmm0
+ movdqa xmm0,[r10+rbp]
+ movdqa [rsp],xmm1
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm2,xmm3
+ movdqa [rsp+80h],xmm0
+ movdqa xmm0,[rsi+rbp]
+ movdqa [rsp+10h],xmm13
+ punpckhbw xmm0,xmm3
+ movdqa [rsp+50h],xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm13
+ psubw xmm0,xmm13
+ psubw xmm1,xmm2
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,[rsp]
+ movdqa xmm13,[rsp+40h]
+ movdqa [rsp+110h],xmm2
+ psubw xmm1, [rsp+80h]
+ pcmpgtw xmm13,xmm0
+ pcmpgtw xmm11,xmm3
+ pabsw xmm0,xmm1
+ pcmpgtw xmm10,xmm3
+ movdqa xmm1, [rsp+40h]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1
+ pcmpgtw xmm2,xmm0
+ movdqa xmm0, [rsp+10h]
+ pand xmm13,xmm2
+ pand xmm13,xmm11
+ movdqa xmm11,[rsp+0C0h]
+ psubw xmm0,xmm11
+ pabsw xmm0,xmm0
+ pcmpgtw xmm3,xmm0
+ pand xmm3,xmm10
+ movdqa xmm0,[rsp]
+ psubw xmm0,[rsp+50h]
+ movdqa xmm2,[rdx]
+ pabsw xmm0,xmm0
+ por xmm7,xmm9
+ movdqa xmm9,[rsp+20h]
+ pcmpgtw xmm1,xmm0
+ pand xmm9,xmm7
+ movdqa xmm7,[rsp+20h]
+ movdqa xmm0,xmm7
+ pandn xmm0,xmm12
+ movdqa xmm12,[rsp+110h]
+ pand xmm1,xmm10
+ movdqa xmm10,[rsp+70h]
+ movdqa [rsp+40h],xmm1
+ movdqa xmm1,xmm13
+ por xmm9,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm6
+ movdqa xmm6,xmm7
+ punpckhbw xmm2,xmm0
+ por xmm15,xmm5
+ movdqa xmm5,[rsp+20h]
+ movdqa xmm0,xmm3
+ psllw xmm2,1
+ pandn xmm0,xmm11
+ pand xmm6,xmm4
+ movdqa xmm4,[rsp]
+ paddw xmm2,xmm11
+ pand xmm5,xmm15
+ movdqa xmm15,[rsp+20h]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm11
+ paddw xmm2,xmm12
+ paddw xmm2,[rsp+10h]
+ paddw xmm2,[rsp]
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ pand xmm2,xmm3
+ por xmm2,xmm0
+ pand xmm1,xmm2
+ movdqa xmm0,xmm13
+ movdqa xmm2,xmm11
+ pandn xmm0,xmm11
+ paddw xmm2,xmm12
+ por xmm1,xmm0
+ packuswb xmm9,xmm1
+ movdqa xmm0,xmm7
+ movdqa xmm7,[rsp+0A0h]
+ pandn xmm0,[rsp+0F0h]
+ movdqa xmm1,xmm3
+ por xmm6,xmm0
+ movdqa xmm0,[rsp+10h]
+ paddw xmm0,xmm4
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm12
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ pandn xmm0,xmm12
+ movdqa xmm1,xmm12
+ paddw xmm1,[rsp+10h]
+ por xmm2,xmm0
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+0B0h]
+ paddw xmm1,xmm4
+ packuswb xmm6,xmm2
+ movdqa xmm2,xmm3
+ psllw xmm1,1
+ por xmm5,xmm0
+ movdqa xmm0,[rsp+80h]
+ paddw xmm0,xmm10
+ paddw xmm1,xmm0
+ paddw xmm11,xmm1
+ psraw xmm11,3
+ movdqa xmm1,xmm12
+ pand xmm2,xmm11
+ paddw xmm1,xmm12
+ movdqa xmm11,[rsp+80h]
+ movdqa xmm0, [rsp+10h]
+ por xmm14,[rsp+0E0h]
+ paddw xmm0,xmm11
+ movdqa xmm4,xmm15
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ paddw xmm1,xmm7
+ psraw xmm1,2
+ pandn xmm3,xmm1
+ por xmm2,xmm3
+ movdqa xmm1,xmm13
+ movdqa xmm3,[rsp+10h]
+ pandn xmm0,xmm3
+ pand xmm1,xmm2
+ movdqa xmm2,xmm11
+ paddw xmm2,[rsp]
+ por xmm1,xmm0
+ movdqa xmm0,[rsp+0D0h]
+ por xmm0,xmm8
+ paddw xmm2,xmm3
+ packuswb xmm5,xmm1
+ movdqa xmm8,[rsp+40h]
+ movdqa xmm1,[rsp+50h]
+ movdqa xmm3,xmm8
+ pand xmm4,xmm0
+ psllw xmm2,1
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+90h]
+ por xmm4,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm10
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,[rsp]
+ movdqa xmm2,xmm11
+ paddw xmm0,xmm12
+ movdqa xmm12,[rsp]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm0
+ psraw xmm1,3
+ movdqa xmm0,xmm8
+ pand xmm3,xmm1
+ paddw xmm2,xmm7
+ movdqa xmm1,xmm13
+ psraw xmm2,2
+ pandn xmm0,xmm2
+ por xmm3,xmm0
+ movdqa xmm2,[rsp+50h]
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm3
+ paddw xmm2,xmm11
+ movdqa xmm3,xmm15
+ por xmm1,xmm0
+ pand xmm3,xmm14
+ movdqa xmm14,[rsp+10h]
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+30h]
+ packuswb xmm4,xmm1
+ movdqa xmm1,xmm8
+ por xmm3,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm14
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm8
+ pandn xmm0,xmm11
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm11
+ pand xmm2,xmm1
+ movdqa xmm1,xmm15
+ por xmm2,xmm0
+ packuswb xmm3,xmm2
+ movdqa xmm0,[rsp+100h]
+ por xmm0,[rsp+120h]
+ pand xmm1,xmm0
+ movdqa xmm2,[rcx+rbp]
+ movdqa xmm7,[rsp+50h]
+ pandn xmm15,[rsp+60h]
+ lea r11,[rsp+1D8h]
+ pxor xmm0,xmm0
+ por xmm1,xmm15
+ movaps xmm15,[r11-0A8h]
+ movdqa [rdi],xmm9
+ movaps xmm9,[r11-48h]
+ punpckhbw xmm2,xmm0
+ psllw xmm2,1
+ paddw xmm2,xmm7
+ paddw xmm2,xmm7
+ movdqa [rbx],xmm6
+ movaps xmm6,[r11-18h]
+ paddw xmm2,xmm7
+ paddw xmm2,xmm11
+ movaps xmm11,[r11-68h]
+ paddw xmm2,xmm12
+ movaps xmm12,[r11-78h]
+ paddw xmm2,xmm14
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ movaps xmm10,[r11-58h]
+ movaps xmm14,[r11-98h]
+ movdqa xmm0,xmm13
+ pand xmm2,xmm8
+ pandn xmm8,xmm7
+ pandn xmm13,xmm7
+ por xmm2,xmm8
+ movaps xmm7,[r11-28h]
+ movaps xmm8,[r11-38h]
+ movdqa [r8],xmm5
+ pand xmm0,xmm2
+ por xmm0,xmm13
+ packuswb xmm1,xmm0
+ movaps xmm13,[r11-88h]
+ movdqa [rbp],xmm4
+ movdqa [r10+rbp],xmm3
+ movdqa [rsi+rbp],xmm1
+ mov rsp,r11
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
ret
@@ -784,161 +784,161 @@
ALIGN 16
DeblockChromaLt4V_sse2:
- mov rax,rsp
- push rbx
- push rdi
- sub rsp,0C8h
+ mov rax,rsp
+ push rbx
+ push rdi
+ sub rsp,0C8h
mov r10,qword [rax + 30h] ; pTC
- pxor xmm1,xmm1
- mov rbx,rcx
- movsxd r11,r8d
- movsx ecx,byte [r10]
- movsx r8d,byte [r10+2]
- mov rdi,rdx
- movq xmm2,[rbx]
- movq xmm9,[r11+rbx]
- movsx edx,byte [r10+1]
- mov word [rsp+2],cx
- mov word [rsp],cx
- movsx eax,byte [r10+3]
- mov word [rsp+6],dx
- mov word [rsp+4],dx
- movdqa xmm11,xmm1
- mov word [rsp+0Eh],ax
- mov word [rsp+0Ch],ax
- lea eax,[r11+r11]
- movsxd rcx,eax
- mov rax,rbx
- mov rdx,rdi
- sub rax,rcx
- mov word [rsp+0Ah],r8w
- mov word [rsp+8],r8w
- movdqa xmm6,[rsp]
- movdqa xmm7,xmm6
- movq xmm13, [rax]
- mov rax,rdi
- sub rax,rcx
- mov rcx,rbx
- pcmpgtw xmm7,xmm1
- psubw xmm11,xmm6
- sub rcx,r11
- sub rdx,r11
- movq xmm0,[rax]
- movsx eax,r9w
- movq xmm15,[rcx]
- punpcklqdq xmm13,xmm0
- movq xmm0, [rdx]
- movdqa xmm4,xmm13
- punpcklqdq xmm15,xmm0
- movq xmm0, [rdi]
- punpcklbw xmm4,xmm1
- movdqa xmm12,xmm15
- punpcklqdq xmm2,xmm0
- movq xmm0, [r11+rdi]
- punpcklbw xmm12,xmm1
- movdqa xmm14,xmm2
- punpcklqdq xmm9,xmm0
- punpckhbw xmm2,xmm1
- punpcklbw xmm14,xmm1
- movd xmm0,eax
+ pxor xmm1,xmm1
+ mov rbx,rcx
+ movsxd r11,r8d
+ movsx ecx,byte [r10]
+ movsx r8d,byte [r10+2]
+ mov rdi,rdx
+ movq xmm2,[rbx]
+ movq xmm9,[r11+rbx]
+ movsx edx,byte [r10+1]
+ mov word [rsp+2],cx
+ mov word [rsp],cx
+ movsx eax,byte [r10+3]
+ mov word [rsp+6],dx
+ mov word [rsp+4],dx
+ movdqa xmm11,xmm1
+ mov word [rsp+0Eh],ax
+ mov word [rsp+0Ch],ax
+ lea eax,[r11+r11]
+ movsxd rcx,eax
+ mov rax,rbx
+ mov rdx,rdi
+ sub rax,rcx
+ mov word [rsp+0Ah],r8w
+ mov word [rsp+8],r8w
+ movdqa xmm6,[rsp]
+ movdqa xmm7,xmm6
+ movq xmm13, [rax]
+ mov rax,rdi
+ sub rax,rcx
+ mov rcx,rbx
+ pcmpgtw xmm7,xmm1
+ psubw xmm11,xmm6
+ sub rcx,r11
+ sub rdx,r11
+ movq xmm0,[rax]
+ movsx eax,r9w
+ movq xmm15,[rcx]
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rdx]
+ movdqa xmm4,xmm13
+ punpcklqdq xmm15,xmm0
+ movq xmm0, [rdi]
+ punpcklbw xmm4,xmm1
+ movdqa xmm12,xmm15
+ punpcklqdq xmm2,xmm0
+ movq xmm0, [r11+rdi]
+ punpcklbw xmm12,xmm1
+ movdqa xmm14,xmm2
+ punpcklqdq xmm9,xmm0
+ punpckhbw xmm2,xmm1
+ punpcklbw xmm14,xmm1
+ movd xmm0,eax
movsx eax,word [rsp + 0C8h + 38h] ; iBeta
- punpckhbw xmm13,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm9
- movdqa [rsp+10h],xmm2
- punpcklwd xmm0,xmm0
- punpckhbw xmm9,xmm1
- punpcklbw xmm3,xmm1
- movdqa xmm1,xmm14
- pshufd xmm10,xmm0,0
- movd xmm0,eax
- mov eax,4
- cwde
- punpcklwd xmm0,xmm0
- pshufd xmm8,xmm0,0
- movd xmm0,eax
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- psubw xmm1,xmm12
- movdqa xmm2,xmm10
- lea r11,[rsp+0C8h]
- psllw xmm1,2
- movdqa xmm0,xmm4
- psubw xmm4,xmm12
- psubw xmm0,xmm3
- psubw xmm3,xmm14
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm11
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm12
- psubw xmm0,xmm14
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- movdqa xmm3,[rsp]
- pand xmm2,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- psubw xmm0,xmm9
- psubw xmm13,xmm15
- pand xmm2,xmm7
- pand xmm6,xmm2
- paddw xmm12,xmm6
- psubw xmm14,xmm6
- movdqa xmm2,[rsp+10h]
- movaps xmm6,[r11-18h]
- movdqa xmm1,xmm2
- psubw xmm1,xmm15
- psubw xmm9,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm15
- psubw xmm0,xmm2
- psraw xmm1,3
- pmaxsw xmm11,xmm1
- pabsw xmm0,xmm0
- movdqa xmm1,xmm8
- pcmpgtw xmm10,xmm0
- pabsw xmm0,xmm13
- pminsw xmm3,xmm11
- movaps xmm11,[r11-68h]
- movaps xmm13,[rsp+40h]
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm9
- movaps xmm9, [r11-48h]
- pand xmm10,xmm1
- pcmpgtw xmm8,xmm0
- pand xmm10,xmm8
- pand xmm10,xmm7
- movaps xmm8,[r11-38h]
- movaps xmm7,[r11-28h]
- pand xmm3,xmm10
- paddw xmm15,xmm3
- psubw xmm2,xmm3
- movaps xmm10,[r11-58h]
- packuswb xmm12,xmm15
- movaps xmm15,[rsp+20h]
- packuswb xmm14,xmm2
- movq [rcx],xmm12
- movq [rbx],xmm14
- psrldq xmm12,8
- psrldq xmm14,8
- movq [rdx],xmm12
- movaps xmm12,[r11-78h]
- movq [rdi],xmm14
- movaps xmm14,[rsp+30h]
- mov rsp,r11
- pop rdi
- pop rbx
+ punpckhbw xmm13,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm9
+ movdqa [rsp+10h],xmm2
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm9,xmm1
+ punpcklbw xmm3,xmm1
+ movdqa xmm1,xmm14
+ pshufd xmm10,xmm0,0
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ punpcklwd xmm0,xmm0
+ pshufd xmm8,xmm0,0
+ movd xmm0,eax
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ psubw xmm1,xmm12
+ movdqa xmm2,xmm10
+ lea r11,[rsp+0C8h]
+ psllw xmm1,2
+ movdqa xmm0,xmm4
+ psubw xmm4,xmm12
+ psubw xmm0,xmm3
+ psubw xmm3,xmm14
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm11
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm12
+ psubw xmm0,xmm14
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ movdqa xmm3,[rsp]
+ pand xmm2,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ psubw xmm0,xmm9
+ psubw xmm13,xmm15
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ paddw xmm12,xmm6
+ psubw xmm14,xmm6
+ movdqa xmm2,[rsp+10h]
+ movaps xmm6,[r11-18h]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm15
+ psubw xmm9,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm15
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ pmaxsw xmm11,xmm1
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm8
+ pcmpgtw xmm10,xmm0
+ pabsw xmm0,xmm13
+ pminsw xmm3,xmm11
+ movaps xmm11,[r11-68h]
+ movaps xmm13,[rsp+40h]
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm9
+ movaps xmm9, [r11-48h]
+ pand xmm10,xmm1
+ pcmpgtw xmm8,xmm0
+ pand xmm10,xmm8
+ pand xmm10,xmm7
+ movaps xmm8,[r11-38h]
+ movaps xmm7,[r11-28h]
+ pand xmm3,xmm10
+ paddw xmm15,xmm3
+ psubw xmm2,xmm3
+ movaps xmm10,[r11-58h]
+ packuswb xmm12,xmm15
+ movaps xmm15,[rsp+20h]
+ packuswb xmm14,xmm2
+ movq [rcx],xmm12
+ movq [rbx],xmm14
+ psrldq xmm12,8
+ psrldq xmm14,8
+ movq [rdx],xmm12
+ movaps xmm12,[r11-78h]
+ movq [rdi],xmm14
+ movaps xmm14,[rsp+30h]
+ mov rsp,r11
+ pop rdi
+ pop rbx
ret
@@ -945,151 +945,151 @@
WELS_EXTERN DeblockChromaEq4V_sse2
ALIGN 16
DeblockChromaEq4V_sse2:
- mov rax,rsp
- push rbx
- sub rsp,90h
- pxor xmm1,xmm1
- mov r11,rcx
- mov rbx,rdx
- mov r10d,r9d
- movq xmm13,[r11]
- lea eax,[r8+r8]
- movsxd r9,eax
- mov rax,rcx
- sub rax,r9
- movq xmm14,[rax]
- mov rax,rdx
- sub rax,r9
- movq xmm0,[rax]
- movsxd rax,r8d
- sub rcx,rax
- sub rdx,rax
- movq xmm12,[rax+r11]
- movq xmm10,[rcx]
- punpcklqdq xmm14,xmm0
- movdqa xmm8,xmm14
- movq xmm0,[rdx]
- punpcklbw xmm8,xmm1
- punpckhbw xmm14,xmm1
- punpcklqdq xmm10,xmm0
- movq xmm0,[rbx]
- movdqa xmm5,xmm10
- punpcklqdq xmm13,xmm0
- movq xmm0, [rax+rbx]
- punpcklbw xmm5,xmm1
- movsx eax,r10w
- movdqa xmm9,xmm13
- punpcklqdq xmm12,xmm0
- punpcklbw xmm9,xmm1
- punpckhbw xmm10,xmm1
- movd xmm0,eax
+ mov rax,rsp
+ push rbx
+ sub rsp,90h
+ pxor xmm1,xmm1
+ mov r11,rcx
+ mov rbx,rdx
+ mov r10d,r9d
+ movq xmm13,[r11]
+ lea eax,[r8+r8]
+ movsxd r9,eax
+ mov rax,rcx
+ sub rax,r9
+ movq xmm14,[rax]
+ mov rax,rdx
+ sub rax,r9
+ movq xmm0,[rax]
+ movsxd rax,r8d
+ sub rcx,rax
+ sub rdx,rax
+ movq xmm12,[rax+r11]
+ movq xmm10,[rcx]
+ punpcklqdq xmm14,xmm0
+ movdqa xmm8,xmm14
+ movq xmm0,[rdx]
+ punpcklbw xmm8,xmm1
+ punpckhbw xmm14,xmm1
+ punpcklqdq xmm10,xmm0
+ movq xmm0,[rbx]
+ movdqa xmm5,xmm10
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rax+rbx]
+ punpcklbw xmm5,xmm1
+ movsx eax,r10w
+ movdqa xmm9,xmm13
+ punpcklqdq xmm12,xmm0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm10,xmm1
+ movd xmm0,eax
movsx eax,word [rsp + 90h + 8h + 28h] ; iBeta
- punpckhbw xmm13,xmm1
- movdqa xmm7,xmm12
- punpcklwd xmm0,xmm0
- punpckhbw xmm12,xmm1
- pshufd xmm11,xmm0,0
- punpcklbw xmm7,xmm1
- movd xmm0,eax
- movdqa xmm1,xmm8
- psubw xmm1,xmm5
- punpcklwd xmm0,xmm0
- movdqa xmm6,xmm11
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm5
- psubw xmm0,xmm9
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm10
- movdqa xmm1,xmm14
- psubw xmm0,xmm13
- psubw xmm1,xmm10
- pabsw xmm0,xmm0
- pcmpgtw xmm11,xmm0
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm11,xmm2
- movdqa xmm0,xmm12
- movdqa xmm4,xmm6
- movdqa xmm1,xmm8
- mov eax,2
- cwde
- paddw xmm1,xmm8
- psubw xmm0,xmm13
- paddw xmm1,xmm5
- pabsw xmm0,xmm0
- movdqa xmm2,xmm14
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm14
- movd xmm0,eax
- pand xmm11,xmm3
- paddw xmm7,xmm7
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- paddw xmm2,xmm12
- paddw xmm12,xmm12
- pshufd xmm3,xmm0,0
- paddw xmm7,xmm9
- paddw xmm12,xmm13
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm5
- paddw xmm7,xmm8
- psraw xmm1,2
- paddw xmm12,xmm14
- paddw xmm7,xmm3
- movaps xmm14,[rsp]
- pand xmm4,xmm1
- paddw xmm12,xmm3
- psraw xmm7,2
- movdqa xmm1,xmm11
- por xmm4,xmm0
- psraw xmm12,2
- paddw xmm2,xmm3
- movdqa xmm0,xmm11
- pandn xmm0,xmm10
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- packuswb xmm4,xmm1
- movdqa xmm0,xmm11
- movdqa xmm1,xmm6
- pand xmm1,xmm7
- movaps xmm7,[rsp+70h]
- movq [rcx],xmm4
- pandn xmm6,xmm9
- pandn xmm11,xmm13
- pand xmm0,xmm12
- por xmm1,xmm6
- por xmm0,xmm11
- psrldq xmm4,8
- packuswb xmm1,xmm0
- movq [r11],xmm1
- psrldq xmm1,8
- movq [rdx],xmm4
- lea r11,[rsp+90h]
- movaps xmm6,[r11-10h]
- movaps xmm8,[r11-30h]
- movaps xmm9,[r11-40h]
- movq [rbx],xmm1
- movaps xmm10,[r11-50h]
- movaps xmm11,[r11-60h]
- movaps xmm12,[r11-70h]
- movaps xmm13,[r11-80h]
- mov rsp,r11
- pop rbx
+ punpckhbw xmm13,xmm1
+ movdqa xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm12,xmm1
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm7,xmm1
+ movd xmm0,eax
+ movdqa xmm1,xmm8
+ psubw xmm1,xmm5
+ punpcklwd xmm0,xmm0
+ movdqa xmm6,xmm11
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm9
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm10
+ movdqa xmm1,xmm14
+ psubw xmm0,xmm13
+ psubw xmm1,xmm10
+ pabsw xmm0,xmm0
+ pcmpgtw xmm11,xmm0
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm11,xmm2
+ movdqa xmm0,xmm12
+ movdqa xmm4,xmm6
+ movdqa xmm1,xmm8
+ mov eax,2
+ cwde
+ paddw xmm1,xmm8
+ psubw xmm0,xmm13
+ paddw xmm1,xmm5
+ pabsw xmm0,xmm0
+ movdqa xmm2,xmm14
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm14
+ movd xmm0,eax
+ pand xmm11,xmm3
+ paddw xmm7,xmm7
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ paddw xmm2,xmm12
+ paddw xmm12,xmm12
+ pshufd xmm3,xmm0,0
+ paddw xmm7,xmm9
+ paddw xmm12,xmm13
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm5
+ paddw xmm7,xmm8
+ psraw xmm1,2
+ paddw xmm12,xmm14
+ paddw xmm7,xmm3
+ movaps xmm14,[rsp]
+ pand xmm4,xmm1
+ paddw xmm12,xmm3
+ psraw xmm7,2
+ movdqa xmm1,xmm11
+ por xmm4,xmm0
+ psraw xmm12,2
+ paddw xmm2,xmm3
+ movdqa xmm0,xmm11
+ pandn xmm0,xmm10
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ packuswb xmm4,xmm1
+ movdqa xmm0,xmm11
+ movdqa xmm1,xmm6
+ pand xmm1,xmm7
+ movaps xmm7,[rsp+70h]
+ movq [rcx],xmm4
+ pandn xmm6,xmm9
+ pandn xmm11,xmm13
+ pand xmm0,xmm12
+ por xmm1,xmm6
+ por xmm0,xmm11
+ psrldq xmm4,8
+ packuswb xmm1,xmm0
+ movq [r11],xmm1
+ psrldq xmm1,8
+ movq [rdx],xmm4
+ lea r11,[rsp+90h]
+ movaps xmm6,[r11-10h]
+ movaps xmm8,[r11-30h]
+ movaps xmm9,[r11-40h]
+ movq [rbx],xmm1
+ movaps xmm10,[r11-50h]
+ movaps xmm11,[r11-60h]
+ movaps xmm12,[r11-70h]
+ movaps xmm13,[r11-80h]
+ mov rsp,r11
+ pop rbx
ret
@@ -1099,263 +1099,263 @@
WELS_EXTERN DeblockChromaEq4H_sse2
ALIGN 16
DeblockChromaEq4H_sse2:
- mov rax,rsp
- mov [rax+20h],rbx
- push rdi
- sub rsp,140h
- mov rdi,rdx
- lea eax,[r8*4]
- movsxd r10,eax
- mov eax,[rcx-2]
- mov [rsp+10h],eax
- lea rbx,[r10+rdx-2]
- lea r11,[r10+rcx-2]
- movdqa xmm5,[rsp+10h]
- movsxd r10,r8d
- mov eax,[r10+rcx-2]
- lea rdx,[r10+r10*2]
- mov [rsp+20h],eax
- mov eax,[rcx+r10*2-2]
- mov [rsp+30h],eax
- mov eax,[rdx+rcx-2]
- movdqa xmm2,[rsp+20h]
- mov [rsp+40h],eax
- mov eax, [rdi-2]
- movdqa xmm4,[rsp+30h]
- mov [rsp+50h],eax
- mov eax,[r10+rdi-2]
- movdqa xmm3,[rsp+40h]
- mov [rsp+60h],eax
- mov eax,[rdi+r10*2-2]
- punpckldq xmm5,[rsp+50h]
- mov [rsp+70h],eax
- mov eax, [rdx+rdi-2]
- punpckldq xmm2, [rsp+60h]
- mov [rsp+80h],eax
- mov eax,[r11]
- punpckldq xmm4, [rsp+70h]
- mov [rsp+50h],eax
- mov eax,[rbx]
- punpckldq xmm3,[rsp+80h]
- mov [rsp+60h],eax
- mov eax,[r10+r11]
- movdqa xmm0, [rsp+50h]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[r10+rbx]
- movdqa xmm0,[rsp+50h]
- movdqa xmm1,xmm5
- mov [rsp+60h],eax
- mov eax,[r11+r10*2]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[rbx+r10*2]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- mov eax, [rdx+r11]
- movdqa xmm15,xmm1
- punpckldq xmm0,[rsp+60h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax, [rdx+rbx]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm15,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm12,xmm15
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm12,xmm0
- punpckhdq xmm15,xmm0
- movdqa xmm0,xmm1
- movdqa xmm11,xmm12
- punpckldq xmm0,xmm5
- punpckhdq xmm1,xmm5
- punpcklqdq xmm11,xmm0
- punpckhqdq xmm12,xmm0
- movsx eax,r9w
- movdqa xmm14,xmm15
- punpcklqdq xmm14,xmm1
- punpckhqdq xmm15,xmm1
- pxor xmm1,xmm1
- movd xmm0,eax
- movdqa xmm4,xmm12
- movdqa xmm8,xmm11
- movsx eax,word [rsp+170h] ; iBeta
- punpcklwd xmm0,xmm0
- punpcklbw xmm4,xmm1
- punpckhbw xmm12,xmm1
- movdqa xmm9,xmm14
- movdqa xmm7,xmm15
- movdqa xmm10,xmm15
- pshufd xmm13,xmm0,0
- punpcklbw xmm9,xmm1
- punpckhbw xmm14,xmm1
- movdqa xmm6,xmm13
- movd xmm0,eax
- movdqa [rsp],xmm11
- mov eax,2
- cwde
- punpckhbw xmm11,xmm1
- punpckhbw xmm10,xmm1
- punpcklbw xmm7,xmm1
- punpcklwd xmm0,xmm0
- punpcklbw xmm8,xmm1
- pshufd xmm3,xmm0,0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm4
- psubw xmm0,xmm9
- psubw xmm1,xmm4
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm12
- movdqa xmm1,xmm11
- psubw xmm0,xmm14
- psubw xmm1,xmm12
- movdqa xmm5,xmm6
- pabsw xmm0,xmm0
- pcmpgtw xmm13,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm2,xmm0
- paddw xmm1,xmm8
- movdqa xmm0,xmm10
- pand xmm13,xmm2
- psubw xmm0,xmm14
- paddw xmm1,xmm4
- movdqa xmm2,xmm11
- pabsw xmm0,xmm0
- paddw xmm2,xmm11
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm12
- movd xmm0,eax
- pand xmm13,xmm3
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm4
- paddw xmm2,xmm3
- psraw xmm1,2
- pand xmm5,xmm1
- por xmm5,xmm0
- paddw xmm7,xmm7
- paddw xmm10,xmm10
- psraw xmm2,2
- movdqa xmm1,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm2
- paddw xmm7,xmm9
- por xmm1,xmm0
- paddw xmm10,xmm14
- paddw xmm7,xmm8
- movdqa xmm0,xmm13
- packuswb xmm5,xmm1
- paddw xmm7,xmm3
- paddw xmm10,xmm11
- movdqa xmm1,xmm6
- paddw xmm10,xmm3
- pandn xmm6,xmm9
- psraw xmm7,2
- pand xmm1,xmm7
- psraw xmm10,2
- pandn xmm13,xmm14
- pand xmm0,xmm10
- por xmm1,xmm6
- movdqa xmm6,[rsp]
- movdqa xmm4,xmm6
- por xmm0,xmm13
- punpcklbw xmm4,xmm5
- punpckhbw xmm6,xmm5
- movdqa xmm3,xmm4
- packuswb xmm1,xmm0
- movdqa xmm0,xmm1
- punpckhbw xmm1,xmm15
- punpcklbw xmm0,xmm15
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm6
- movdqa xmm2,xmm3
- punpcklwd xmm0,xmm1
- punpckhwd xmm6,xmm1
- movdqa xmm1,xmm4
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm6
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm6
- punpckhqdq xmm2,xmm1
- movdqa [rsp+10h],xmm0
- movdqa [rsp+60h],xmm2
- movdqa xmm0,xmm3
- mov eax,[rsp+10h]
- mov [rcx-2],eax
- mov eax,[rsp+60h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [r10+rcx-2],eax
- movdqa [rsp+20h],xmm0
- mov eax, [rsp+20h]
- movdqa [rsp+70h],xmm3
- mov [rcx+r10*2-2],eax
- mov eax,[rsp+70h]
- mov [rdx+rcx-2],eax
- mov eax,[rsp+18h]
- mov [r11],eax
- mov eax,[rsp+68h]
- mov [r10+r11],eax
- mov eax,[rsp+28h]
- mov [r11+r10*2],eax
- mov eax,[rsp+78h]
- mov [rdx+r11],eax
- mov eax,[rsp+14h]
- mov [rdi-2],eax
- mov eax,[rsp+64h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+24h]
- mov [rdi+r10*2-2],eax
- mov eax, [rsp+74h]
- mov [rdx+rdi-2],eax
- mov eax, [rsp+1Ch]
- mov [rbx],eax
- mov eax, [rsp+6Ch]
- mov [r10+rbx],eax
- mov eax,[rsp+2Ch]
- mov [rbx+r10*2],eax
- mov eax,[rsp+7Ch]
- mov [rdx+rbx],eax
- lea r11,[rsp+140h]
- mov rbx, [r11+28h]
- mov rsp,r11
- pop rdi
+ mov rax,rsp
+ mov [rax+20h],rbx
+ push rdi
+ sub rsp,140h
+ mov rdi,rdx
+ lea eax,[r8*4]
+ movsxd r10,eax
+ mov eax,[rcx-2]
+ mov [rsp+10h],eax
+ lea rbx,[r10+rdx-2]
+ lea r11,[r10+rcx-2]
+ movdqa xmm5,[rsp+10h]
+ movsxd r10,r8d
+ mov eax,[r10+rcx-2]
+ lea rdx,[r10+r10*2]
+ mov [rsp+20h],eax
+ mov eax,[rcx+r10*2-2]
+ mov [rsp+30h],eax
+ mov eax,[rdx+rcx-2]
+ movdqa xmm2,[rsp+20h]
+ mov [rsp+40h],eax
+ mov eax, [rdi-2]
+ movdqa xmm4,[rsp+30h]
+ mov [rsp+50h],eax
+ mov eax,[r10+rdi-2]
+ movdqa xmm3,[rsp+40h]
+ mov [rsp+60h],eax
+ mov eax,[rdi+r10*2-2]
+ punpckldq xmm5,[rsp+50h]
+ mov [rsp+70h],eax
+ mov eax, [rdx+rdi-2]
+ punpckldq xmm2, [rsp+60h]
+ mov [rsp+80h],eax
+ mov eax,[r11]
+ punpckldq xmm4, [rsp+70h]
+ mov [rsp+50h],eax
+ mov eax,[rbx]
+ punpckldq xmm3,[rsp+80h]
+ mov [rsp+60h],eax
+ mov eax,[r10+r11]
+ movdqa xmm0, [rsp+50h]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm0,[rsp+50h]
+ movdqa xmm1,xmm5
+ mov [rsp+60h],eax
+ mov eax,[r11+r10*2]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[rbx+r10*2]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ mov eax, [rdx+r11]
+ movdqa xmm15,xmm1
+ punpckldq xmm0,[rsp+60h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax, [rdx+rbx]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm15,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm12,xmm15
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm12,xmm0
+ punpckhdq xmm15,xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm11,xmm12
+ punpckldq xmm0,xmm5
+ punpckhdq xmm1,xmm5
+ punpcklqdq xmm11,xmm0
+ punpckhqdq xmm12,xmm0
+ movsx eax,r9w
+ movdqa xmm14,xmm15
+ punpcklqdq xmm14,xmm1
+ punpckhqdq xmm15,xmm1
+ pxor xmm1,xmm1
+ movd xmm0,eax
+ movdqa xmm4,xmm12
+ movdqa xmm8,xmm11
+ movsx eax,word [rsp+170h] ; iBeta
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm4,xmm1
+ punpckhbw xmm12,xmm1
+ movdqa xmm9,xmm14
+ movdqa xmm7,xmm15
+ movdqa xmm10,xmm15
+ pshufd xmm13,xmm0,0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm14,xmm1
+ movdqa xmm6,xmm13
+ movd xmm0,eax
+ movdqa [rsp],xmm11
+ mov eax,2
+ cwde
+ punpckhbw xmm11,xmm1
+ punpckhbw xmm10,xmm1
+ punpcklbw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm8,xmm1
+ pshufd xmm3,xmm0,0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm4
+ psubw xmm0,xmm9
+ psubw xmm1,xmm4
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm12
+ movdqa xmm1,xmm11
+ psubw xmm0,xmm14
+ psubw xmm1,xmm12
+ movdqa xmm5,xmm6
+ pabsw xmm0,xmm0
+ pcmpgtw xmm13,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm2,xmm0
+ paddw xmm1,xmm8
+ movdqa xmm0,xmm10
+ pand xmm13,xmm2
+ psubw xmm0,xmm14
+ paddw xmm1,xmm4
+ movdqa xmm2,xmm11
+ pabsw xmm0,xmm0
+ paddw xmm2,xmm11
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm12
+ movd xmm0,eax
+ pand xmm13,xmm3
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm4
+ paddw xmm2,xmm3
+ psraw xmm1,2
+ pand xmm5,xmm1
+ por xmm5,xmm0
+ paddw xmm7,xmm7
+ paddw xmm10,xmm10
+ psraw xmm2,2
+ movdqa xmm1,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm2
+ paddw xmm7,xmm9
+ por xmm1,xmm0
+ paddw xmm10,xmm14
+ paddw xmm7,xmm8
+ movdqa xmm0,xmm13
+ packuswb xmm5,xmm1
+ paddw xmm7,xmm3
+ paddw xmm10,xmm11
+ movdqa xmm1,xmm6
+ paddw xmm10,xmm3
+ pandn xmm6,xmm9
+ psraw xmm7,2
+ pand xmm1,xmm7
+ psraw xmm10,2
+ pandn xmm13,xmm14
+ pand xmm0,xmm10
+ por xmm1,xmm6
+ movdqa xmm6,[rsp]
+ movdqa xmm4,xmm6
+ por xmm0,xmm13
+ punpcklbw xmm4,xmm5
+ punpckhbw xmm6,xmm5
+ movdqa xmm3,xmm4
+ packuswb xmm1,xmm0
+ movdqa xmm0,xmm1
+ punpckhbw xmm1,xmm15
+ punpcklbw xmm0,xmm15
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm3
+ punpcklwd xmm0,xmm1
+ punpckhwd xmm6,xmm1
+ movdqa xmm1,xmm4
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm6
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm6
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+10h],xmm0
+ movdqa [rsp+60h],xmm2
+ movdqa xmm0,xmm3
+ mov eax,[rsp+10h]
+ mov [rcx-2],eax
+ mov eax,[rsp+60h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [r10+rcx-2],eax
+ movdqa [rsp+20h],xmm0
+ mov eax, [rsp+20h]
+ movdqa [rsp+70h],xmm3
+ mov [rcx+r10*2-2],eax
+ mov eax,[rsp+70h]
+ mov [rdx+rcx-2],eax
+ mov eax,[rsp+18h]
+ mov [r11],eax
+ mov eax,[rsp+68h]
+ mov [r10+r11],eax
+ mov eax,[rsp+28h]
+ mov [r11+r10*2],eax
+ mov eax,[rsp+78h]
+ mov [rdx+r11],eax
+ mov eax,[rsp+14h]
+ mov [rdi-2],eax
+ mov eax,[rsp+64h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+24h]
+ mov [rdi+r10*2-2],eax
+ mov eax, [rsp+74h]
+ mov [rdx+rdi-2],eax
+ mov eax, [rsp+1Ch]
+ mov [rbx],eax
+ mov eax, [rsp+6Ch]
+ mov [r10+rbx],eax
+ mov eax,[rsp+2Ch]
+ mov [rbx+r10*2],eax
+ mov eax,[rsp+7Ch]
+ mov [rdx+rbx],eax
+ lea r11,[rsp+140h]
+ mov rbx, [r11+28h]
+ mov rsp,r11
+ pop rdi
ret
@@ -1363,283 +1363,283 @@
WELS_EXTERN DeblockChromaLt4H_sse2
ALIGN 16
DeblockChromaLt4H_sse2:
- mov rax,rsp
- push rbx
- push rbp
- push rsi
- push rdi
- push r12
- sub rsp,170h
-
- movsxd rsi,r8d
- lea eax,[r8*4]
- mov r11d,r9d
- movsxd r10,eax
- mov eax, [rcx-2]
- mov r12,rdx
- mov [rsp+40h],eax
- mov eax, [rsi+rcx-2]
- lea rbx,[r10+rcx-2]
- movdqa xmm5,[rsp+40h]
- mov [rsp+50h],eax
- mov eax, [rcx+rsi*2-2]
- lea rbp,[r10+rdx-2]
- movdqa xmm2, [rsp+50h]
- mov [rsp+60h],eax
- lea r10,[rsi+rsi*2]
- mov rdi,rcx
- mov eax,[r10+rcx-2]
- movdqa xmm4,[rsp+60h]
- mov [rsp+70h],eax
- mov eax,[rdx-2]
- mov [rsp+80h],eax
- mov eax, [rsi+rdx-2]
- movdqa xmm3,[rsp+70h]
- mov [rsp+90h],eax
- mov eax,[rdx+rsi*2-2]
- punpckldq xmm5,[rsp+80h]
- mov [rsp+0A0h],eax
- mov eax, [r10+rdx-2]
- punpckldq xmm2,[rsp+90h]
- mov [rsp+0B0h],eax
- mov eax, [rbx]
- punpckldq xmm4,[rsp+0A0h]
- mov [rsp+80h],eax
- mov eax,[rbp]
- punpckldq xmm3,[rsp+0B0h]
- mov [rsp+90h],eax
- mov eax,[rsi+rbx]
- movdqa xmm0,[rsp+80h]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rsi+rbp]
- movdqa xmm0,[rsp+80h]
- movdqa xmm1,xmm5
- mov [rsp+90h],eax
- mov eax,[rbx+rsi*2]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rbp+rsi*2]
- movdqa xmm0, [rsp+80h]
- mov [rsp+90h],eax
- mov eax,[r10+rbx]
- movdqa xmm7,xmm1
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax, [r10+rbp]
- movdqa xmm0,[rsp+80h]
- mov [rsp+90h],eax
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm7,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm6,xmm7
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm6,xmm0
- punpckhdq xmm7,xmm0
- movdqa xmm0,xmm1
- punpckldq xmm0,xmm5
+ mov rax,rsp
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ push r12
+ sub rsp,170h
+
+ movsxd rsi,r8d
+ lea eax,[r8*4]
+ mov r11d,r9d
+ movsxd r10,eax
+ mov eax, [rcx-2]
+ mov r12,rdx
+ mov [rsp+40h],eax
+ mov eax, [rsi+rcx-2]
+ lea rbx,[r10+rcx-2]
+ movdqa xmm5,[rsp+40h]
+ mov [rsp+50h],eax
+ mov eax, [rcx+rsi*2-2]
+ lea rbp,[r10+rdx-2]
+ movdqa xmm2, [rsp+50h]
+ mov [rsp+60h],eax
+ lea r10,[rsi+rsi*2]
+ mov rdi,rcx
+ mov eax,[r10+rcx-2]
+ movdqa xmm4,[rsp+60h]
+ mov [rsp+70h],eax
+ mov eax,[rdx-2]
+ mov [rsp+80h],eax
+ mov eax, [rsi+rdx-2]
+ movdqa xmm3,[rsp+70h]
+ mov [rsp+90h],eax
+ mov eax,[rdx+rsi*2-2]
+ punpckldq xmm5,[rsp+80h]
+ mov [rsp+0A0h],eax
+ mov eax, [r10+rdx-2]
+ punpckldq xmm2,[rsp+90h]
+ mov [rsp+0B0h],eax
+ mov eax, [rbx]
+ punpckldq xmm4,[rsp+0A0h]
+ mov [rsp+80h],eax
+ mov eax,[rbp]
+ punpckldq xmm3,[rsp+0B0h]
+ mov [rsp+90h],eax
+ mov eax,[rsi+rbx]
+ movdqa xmm0,[rsp+80h]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rsi+rbp]
+ movdqa xmm0,[rsp+80h]
+ movdqa xmm1,xmm5
+ mov [rsp+90h],eax
+ mov eax,[rbx+rsi*2]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rbp+rsi*2]
+ movdqa xmm0, [rsp+80h]
+ mov [rsp+90h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm7,xmm1
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax, [r10+rbp]
+ movdqa xmm0,[rsp+80h]
+ mov [rsp+90h],eax
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm7,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm6,xmm7
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm6,xmm0
+ punpckhdq xmm7,xmm0
+ movdqa xmm0,xmm1
+ punpckldq xmm0,xmm5
mov rax, [rsp+1C8h] ; pTC
- punpckhdq xmm1,xmm5
- movdqa xmm9,xmm6
- punpckhqdq xmm6,xmm0
- punpcklqdq xmm9,xmm0
- movdqa xmm2,xmm7
- movdqa xmm13,xmm6
- movdqa xmm4,xmm9
- movdqa [rsp+10h],xmm9
- punpcklqdq xmm2,xmm1
- punpckhqdq xmm7,xmm1
- pxor xmm1,xmm1
- movsx ecx,byte [rax+3]
- movsx edx,byte [rax+2]
- movsx r8d,byte [rax+1]
- movsx r9d,byte [rax]
- movdqa xmm10,xmm1
- movdqa xmm15,xmm2
- punpckhbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm4,xmm1
- movsx eax,r11w
- mov word [rsp+0Eh],cx
- mov word [rsp+0Ch],cx
- movdqa xmm3,xmm7
- movdqa xmm8,xmm7
- movdqa [rsp+20h],xmm7
- punpcklbw xmm15,xmm1
- punpcklbw xmm13,xmm1
- punpcklbw xmm3,xmm1
- mov word [rsp+0Ah],dx
- mov word [rsp+8],dx
- mov word [rsp+6],r8w
- movd xmm0,eax
- movdqa [rsp+30h],xmm6
- punpckhbw xmm9,xmm1
- punpckhbw xmm8,xmm1
- punpcklwd xmm0,xmm0
+ punpckhdq xmm1,xmm5
+ movdqa xmm9,xmm6
+ punpckhqdq xmm6,xmm0
+ punpcklqdq xmm9,xmm0
+ movdqa xmm2,xmm7
+ movdqa xmm13,xmm6
+ movdqa xmm4,xmm9
+ movdqa [rsp+10h],xmm9
+ punpcklqdq xmm2,xmm1
+ punpckhqdq xmm7,xmm1
+ pxor xmm1,xmm1
+ movsx ecx,byte [rax+3]
+ movsx edx,byte [rax+2]
+ movsx r8d,byte [rax+1]
+ movsx r9d,byte [rax]
+ movdqa xmm10,xmm1
+ movdqa xmm15,xmm2
+ punpckhbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm4,xmm1
+ movsx eax,r11w
+ mov word [rsp+0Eh],cx
+ mov word [rsp+0Ch],cx
+ movdqa xmm3,xmm7
+ movdqa xmm8,xmm7
+ movdqa [rsp+20h],xmm7
+ punpcklbw xmm15,xmm1
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm3,xmm1
+ mov word [rsp+0Ah],dx
+ mov word [rsp+8],dx
+ mov word [rsp+6],r8w
+ movd xmm0,eax
+ movdqa [rsp+30h],xmm6
+ punpckhbw xmm9,xmm1
+ punpckhbw xmm8,xmm1
+ punpcklwd xmm0,xmm0
movsx eax,word [rsp+1C0h] ; iBeta
- mov word [rsp+4],r8w
- mov word [rsp+2],r9w
- pshufd xmm12,xmm0,0
- mov word [rsp],r9w
- movd xmm0,eax
- mov eax,4
- cwde
- movdqa xmm14, [rsp]
- movdqa [rsp],xmm2
- movdqa xmm2,xmm12
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- psubw xmm10,xmm14
- movd xmm0,eax
- movdqa xmm7,xmm14
- movdqa xmm6,xmm14
- pcmpgtw xmm7,xmm1
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- movdqa xmm0,xmm4
- movdqa xmm1,xmm15
- psubw xmm4,xmm13
- psubw xmm0,xmm3
- psubw xmm1,xmm13
- psubw xmm3,xmm15
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm10
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm11
- movdqa xmm0,xmm13
- psubw xmm0,xmm15
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- pand xmm2,xmm1
- movdqa xmm1,xmm11
- movdqa xmm3,[rsp+30h]
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm9
- pand xmm2,xmm1
- psubw xmm0,xmm8
- psubw xmm9,xmm3
- pand xmm2,xmm7
- pand xmm6,xmm2
- psubw xmm15,xmm6
- paddw xmm13,xmm6
- movdqa xmm2,[rsp]
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- psubw xmm8,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm3
- movdqa xmm5,[rsp+10h]
- psubw xmm0,xmm2
- psraw xmm1,3
- movdqa xmm4,xmm5
- pabsw xmm0,xmm0
- pmaxsw xmm10,xmm1
- movdqa xmm1,xmm11
- pcmpgtw xmm12,xmm0
- pabsw xmm0,xmm9
- pminsw xmm14,xmm10
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm8
- pcmpgtw xmm11,xmm0
- pand xmm12,xmm1
- movdqa xmm1,[rsp+20h]
- pand xmm12,xmm11
- pand xmm12,xmm7
- pand xmm14,xmm12
- paddw xmm3,xmm14
- psubw xmm2,xmm14
- packuswb xmm13,xmm3
- packuswb xmm15,xmm2
- punpcklbw xmm4,xmm13
- punpckhbw xmm5,xmm13
- movdqa xmm0,xmm15
- punpcklbw xmm0,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm4
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm5
- movdqa xmm2,xmm3
- movdqa xmm1,xmm4
- punpcklwd xmm0,xmm15
- punpckhwd xmm5,xmm15
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm5
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm5
- punpckhqdq xmm2,xmm1
- movdqa [rsp+40h],xmm0
- movdqa xmm0,xmm3
- movdqa [rsp+90h],xmm2
- mov eax,[rsp+40h]
- mov [rdi-2],eax
- mov eax, [rsp+90h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [rsi+rdi-2],eax
- movdqa [rsp+50h],xmm0
- mov eax,[rsp+50h]
- movdqa [rsp+0A0h],xmm3
- mov [rdi+rsi*2-2],eax
- mov eax,[rsp+0A0h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+48h]
- mov [rbx],eax
- mov eax,[rsp+98h]
- mov [rsi+rbx],eax
- mov eax,[rsp+58h]
- mov [rbx+rsi*2],eax
- mov eax, [rsp+0A8h]
- mov [r10+rbx],eax
- mov eax, [rsp+44h]
- mov [r12-2],eax
- mov eax,[rsp+94h]
- mov [rsi+r12-2],eax
- mov eax,[rsp+54h]
- mov [r12+rsi*2-2],eax
- mov eax, [rsp+0A4h]
- mov [r10+r12-2],eax
- mov eax,[rsp+4Ch]
- mov [rbp],eax
- mov eax,[rsp+9Ch]
- mov [rsi+rbp],eax
- mov eax, [rsp+5Ch]
- mov [rbp+rsi*2],eax
- mov eax,[rsp+0ACh]
- mov [r10+rbp],eax
- lea r11,[rsp+170h]
- mov rsp,r11
- pop r12
- pop rdi
- pop rsi
- pop rbp
- pop rbx
- ret
+ mov word [rsp+4],r8w
+ mov word [rsp+2],r9w
+ pshufd xmm12,xmm0,0
+ mov word [rsp],r9w
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ movdqa xmm14, [rsp]
+ movdqa [rsp],xmm2
+ movdqa xmm2,xmm12
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ psubw xmm10,xmm14
+ movd xmm0,eax
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pcmpgtw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm15
+ psubw xmm4,xmm13
+ psubw xmm0,xmm3
+ psubw xmm1,xmm13
+ psubw xmm3,xmm15
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm10
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm11
+ movdqa xmm0,xmm13
+ psubw xmm0,xmm15
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ pand xmm2,xmm1
+ movdqa xmm1,xmm11
+ movdqa xmm3,[rsp+30h]
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm9
+ pand xmm2,xmm1
+ psubw xmm0,xmm8
+ psubw xmm9,xmm3
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ psubw xmm15,xmm6
+ paddw xmm13,xmm6
+ movdqa xmm2,[rsp]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ psubw xmm8,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm3
+ movdqa xmm5,[rsp+10h]
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ movdqa xmm4,xmm5
+ pabsw xmm0,xmm0
+ pmaxsw xmm10,xmm1
+ movdqa xmm1,xmm11
+ pcmpgtw xmm12,xmm0
+ pabsw xmm0,xmm9
+ pminsw xmm14,xmm10
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm8
+ pcmpgtw xmm11,xmm0
+ pand xmm12,xmm1
+ movdqa xmm1,[rsp+20h]
+ pand xmm12,xmm11
+ pand xmm12,xmm7
+ pand xmm14,xmm12
+ paddw xmm3,xmm14
+ psubw xmm2,xmm14
+ packuswb xmm13,xmm3
+ packuswb xmm15,xmm2
+ punpcklbw xmm4,xmm13
+ punpckhbw xmm5,xmm13
+ movdqa xmm0,xmm15
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm4
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm2,xmm3
+ movdqa xmm1,xmm4
+ punpcklwd xmm0,xmm15
+ punpckhwd xmm5,xmm15
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm5
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm5
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+40h],xmm0
+ movdqa xmm0,xmm3
+ movdqa [rsp+90h],xmm2
+ mov eax,[rsp+40h]
+ mov [rdi-2],eax
+ mov eax, [rsp+90h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [rsi+rdi-2],eax
+ movdqa [rsp+50h],xmm0
+ mov eax,[rsp+50h]
+ movdqa [rsp+0A0h],xmm3
+ mov [rdi+rsi*2-2],eax
+ mov eax,[rsp+0A0h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+48h]
+ mov [rbx],eax
+ mov eax,[rsp+98h]
+ mov [rsi+rbx],eax
+ mov eax,[rsp+58h]
+ mov [rbx+rsi*2],eax
+ mov eax, [rsp+0A8h]
+ mov [r10+rbx],eax
+ mov eax, [rsp+44h]
+ mov [r12-2],eax
+ mov eax,[rsp+94h]
+ mov [rsi+r12-2],eax
+ mov eax,[rsp+54h]
+ mov [r12+rsi*2-2],eax
+ mov eax, [rsp+0A4h]
+ mov [r10+r12-2],eax
+ mov eax,[rsp+4Ch]
+ mov [rbp],eax
+ mov eax,[rsp+9Ch]
+ mov [rsi+rbp],eax
+ mov eax, [rsp+5Ch]
+ mov [rbp+rsi*2],eax
+ mov eax,[rsp+0ACh]
+ mov [r10+rbp],eax
+ lea r11,[rsp+170h]
+ mov rsp,r11
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+ ret
@@ -1649,258 +1649,258 @@
WELS_EXTERN DeblockLumaLt4V_sse2
DeblockLumaLt4V_sse2:
- push rbp
- mov r11,r8 ; pTC
- sub rsp,1B0h
- lea rbp,[rsp+20h]
- movd xmm4,edx
- movd xmm2,ecx
- mov qword [rbp+180h],r12
- mov r10,rdi
- movsxd r12,esi
+ push rbp
+ mov r11,r8 ; pTC
+ sub rsp,1B0h
+ lea rbp,[rsp+20h]
+ movd xmm4,edx
+ movd xmm2,ecx
+ mov qword [rbp+180h],r12
+ mov r10,rdi
+ movsxd r12,esi
add rsi,rsi
- movsxd rdx,esi
- sub r10,r12
- movsx r8d,byte [r11]
- pxor xmm3,xmm3
- punpcklwd xmm2,xmm2
- movaps [rbp+50h],xmm14
- lea rax,[r12+r12*2]
- movdqa xmm14,[rdx+rdi]
- neg rax
- pshufd xmm0,xmm2,0
- movd xmm2,r8d
- movsx rsi,byte [r11+1]
- movsx r8d,byte [r11+2]
- movsx r11d,byte [r11+3]
- movaps [rbp+70h],xmm12
- movd xmm1,esi
- movaps [rbp+80h],xmm11
- movd xmm12,r8d
- movd xmm11,r11d
- movdqa xmm5, [rax+rdi]
- lea rax,[r12+r12]
- punpcklwd xmm12,xmm12
- neg rax
- punpcklwd xmm11,xmm11
- movaps [rbp],xmm8
- movdqa xmm8, [r10]
- punpcklwd xmm2,xmm2
- punpcklwd xmm1,xmm1
- punpcklqdq xmm12,xmm12
- punpcklqdq xmm11,xmm11
- punpcklqdq xmm2,xmm2
- punpcklqdq xmm1,xmm1
- shufps xmm12,xmm11,88h
- movdqa xmm11,xmm8
- movaps [rbp+30h],xmm9
- movdqa xmm9,[rdi]
- shufps xmm2,xmm1,88h
- movdqa xmm1,xmm5
- punpcklbw xmm11,xmm3
- movaps [rbp+20h],xmm6
- movaps [rbp+60h],xmm13
- movdqa xmm13,xmm11
- movaps [rbp+90h],xmm10
- movdqa xmm10,xmm9
- movdqa xmm6,[rax+rdi]
- punpcklbw xmm1,xmm3
- movaps [rbp+0A0h],xmm12
- psubw xmm13,xmm1
- movaps [rbp+40h],xmm15
- movdqa xmm15,xmm14
- movaps [rbp+10h],xmm7
- movdqa xmm7,xmm6
- punpcklbw xmm10,xmm3
- movdqa xmm12,[r12+rdi]
- punpcklbw xmm7,xmm3
- punpcklbw xmm12,xmm3
- punpcklbw xmm15,xmm3
- pabsw xmm3,xmm13
- movdqa xmm13,xmm10
- psubw xmm13,xmm15
- movdqa [rbp+0F0h],xmm15
- pabsw xmm15,xmm13
- movdqa xmm13,xmm11
- movdqa [rbp+0B0h],xmm1
- movdqa xmm1,xmm0
- pavgw xmm13,xmm10
- pcmpgtw xmm1,xmm3
- movdqa [rbp+120h],xmm13
- movaps xmm13,xmm2
- punpcklwd xmm4,xmm4
- movdqa xmm3,xmm0
- movdqa [rbp+100h],xmm1
- psubw xmm13,xmm1
- movdqa xmm1,xmm10
- pcmpgtw xmm3,xmm15
- pshufd xmm4,xmm4,0
- psubw xmm1,xmm11
- movdqa [rbp+0D0h],xmm10
- psubw xmm13,xmm3
- movdqa [rbp+110h],xmm3
- pabsw xmm15,xmm1
- movdqa xmm3,xmm4
- psubw xmm10,xmm12
- pcmpgtw xmm3,xmm15
- pabsw xmm15,xmm10
- movdqa xmm10,xmm0
- psllw xmm1,2
- movdqa [rbp+0C0h],xmm11
- psubw xmm11,xmm7
- pcmpgtw xmm10,xmm15
- pabsw xmm11,xmm11
- movdqa xmm15,xmm0
- pand xmm3,xmm10
- pcmpgtw xmm15,xmm11
- movaps xmm11,xmm2
- pxor xmm10,xmm10
- pand xmm3,xmm15
- pcmpgtw xmm11,xmm10
- pcmpeqw xmm10,xmm2
- por xmm11,xmm10
- pand xmm3,xmm11
- movdqa xmm11,xmm7
- psubw xmm11,xmm12
- pxor xmm15,xmm15
- paddw xmm11,xmm1
- psubw xmm15,xmm13
- movdqa [rbp+0E0h],xmm12
- paddw xmm11,[FOUR_16B_SSE2]
- pxor xmm12,xmm12
- psraw xmm11,3
- punpckhbw xmm8,xmm12
- pmaxsw xmm15,xmm11
- punpckhbw xmm5,xmm12
- movdqa xmm11,xmm8
- pminsw xmm13,xmm15
- psubw xmm11,xmm5
- punpckhbw xmm9,xmm12
- pand xmm13,xmm3
- movdqa [rbp+130h],xmm13
- pabsw xmm13,xmm11
- punpckhbw xmm14,xmm12
- movdqa xmm11,xmm9
- psubw xmm11,xmm14
- movdqa xmm15,xmm0
- movdqa [rbp+140h],xmm14
- pabsw xmm14,xmm11
- movdqa xmm11,xmm8
- pcmpgtw xmm15,xmm14
- movdqa xmm1,[r12+rdi]
- pavgw xmm11,xmm9
- movdqa [rbp+170h],xmm11
- movdqa xmm10,xmm9
- punpckhbw xmm6,xmm12
- psubw xmm10,xmm8
- punpckhbw xmm1,xmm12
- movdqa xmm12,xmm0
- movaps xmm11,[rbp+0A0h]
- pcmpgtw xmm12,xmm13
- movaps xmm13,xmm11
- psubw xmm13,xmm12
- movdqa [rbp+160h],xmm15
- psubw xmm13,xmm15
- movdqa xmm15,xmm9
- psubw xmm15,xmm1
- movdqa [rbp+150h],xmm12
- pabsw xmm12,xmm10
- pabsw xmm14,xmm15
- movdqa xmm15,xmm8
- pcmpgtw xmm4,xmm12
- movdqa xmm12,xmm0
- psubw xmm15,xmm6
- pcmpgtw xmm12,xmm14
- pabsw xmm14,xmm15
- psllw xmm10,2
- pcmpgtw xmm0,xmm14
- movdqa xmm14,xmm6
- psubw xmm14,xmm1
- pand xmm4,xmm12
- paddw xmm14,xmm10
- pand xmm4,xmm0
- paddw xmm14,[FOUR_16B_SSE2]
- pxor xmm15,xmm15
- movaps xmm12,xmm11
- psubw xmm15,xmm13
- pxor xmm0,xmm0
- psraw xmm14,3
- pcmpgtw xmm12,xmm0
- pcmpeqw xmm0,xmm11
- pmaxsw xmm15,xmm14
- por xmm12,xmm0
- movdqa xmm0,[rbp+120h]
- pminsw xmm13,xmm15
- movdqa xmm15,[rbp+0B0h]
- movdqa xmm10,xmm7
- pand xmm4,xmm12
- paddw xmm15,xmm0
- pxor xmm12,xmm12
- paddw xmm10,xmm7
- movdqa xmm14,xmm12
- psubw xmm15,xmm10
- psubw xmm14,xmm2
- psraw xmm15,1
- pmaxsw xmm15,xmm14
- movdqa xmm10,xmm6
- pminsw xmm15,xmm2
- paddw xmm10,xmm6
- pand xmm15,xmm3
- psubw xmm12,xmm11
- pand xmm15,[rbp+100h]
- pand xmm13,xmm4
- paddw xmm7,xmm15
- paddw xmm8,xmm13
- movdqa xmm15,[rbp+170h]
- psubw xmm9,xmm13
- paddw xmm5,xmm15
- psubw xmm5,xmm10
- psraw xmm5,1
- pmaxsw xmm5,xmm12
- pminsw xmm5,xmm11
- pand xmm5,xmm4
- pand xmm5,[rbp+150h]
- paddw xmm6,xmm5
- movdqa xmm5,[rbp+0C0h]
- packuswb xmm7,xmm6
- movdqa xmm6,[rbp+130h]
- paddw xmm5,xmm6
- packuswb xmm5,xmm8
- movdqa xmm8,[rbp+0D0h]
- psubw xmm8,xmm6
- movdqa xmm6,[rbp+0F0h]
- paddw xmm6,xmm0
- movdqa xmm0,[rbp+0E0h]
- packuswb xmm8,xmm9
- movdqa xmm9,xmm0
- paddw xmm9,xmm0
- psubw xmm6,xmm9
- psraw xmm6,1
- pmaxsw xmm14,xmm6
- pminsw xmm2,xmm14
- pand xmm2,xmm3
- pand xmm2,[rbp+110h]
- paddw xmm0,xmm2
- movdqa xmm2,[rbp+140h]
- paddw xmm2,xmm15
- movdqa xmm15,xmm1
- paddw xmm15,xmm1
- psubw xmm2,xmm15
- psraw xmm2,1
- pmaxsw xmm12,xmm2
- pminsw xmm11,xmm12
- pand xmm11,xmm4
- pand xmm11,[rbp+160h]
- paddw xmm1,xmm11
- movdqa [rax+rdi],xmm7
- movdqa [r10],xmm5
- packuswb xmm0,xmm1
- movdqa [rdi],xmm8
- movdqa [r12+rdi],xmm0
- mov r12,qword [rbp+180h]
- lea rsp,[rbp+190h]
- pop rbp
- ret
+ movsxd rdx,esi
+ sub r10,r12
+ movsx r8d,byte [r11]
+ pxor xmm3,xmm3
+ punpcklwd xmm2,xmm2
+ movaps [rbp+50h],xmm14
+ lea rax,[r12+r12*2]
+ movdqa xmm14,[rdx+rdi]
+ neg rax
+ pshufd xmm0,xmm2,0
+ movd xmm2,r8d
+ movsx rsi,byte [r11+1]
+ movsx r8d,byte [r11+2]
+ movsx r11d,byte [r11+3]
+ movaps [rbp+70h],xmm12
+ movd xmm1,esi
+ movaps [rbp+80h],xmm11
+ movd xmm12,r8d
+ movd xmm11,r11d
+ movdqa xmm5, [rax+rdi]
+ lea rax,[r12+r12]
+ punpcklwd xmm12,xmm12
+ neg rax
+ punpcklwd xmm11,xmm11
+ movaps [rbp],xmm8
+ movdqa xmm8, [r10]
+ punpcklwd xmm2,xmm2
+ punpcklwd xmm1,xmm1
+ punpcklqdq xmm12,xmm12
+ punpcklqdq xmm11,xmm11
+ punpcklqdq xmm2,xmm2
+ punpcklqdq xmm1,xmm1
+ shufps xmm12,xmm11,88h
+ movdqa xmm11,xmm8
+ movaps [rbp+30h],xmm9
+ movdqa xmm9,[rdi]
+ shufps xmm2,xmm1,88h
+ movdqa xmm1,xmm5
+ punpcklbw xmm11,xmm3
+ movaps [rbp+20h],xmm6
+ movaps [rbp+60h],xmm13
+ movdqa xmm13,xmm11
+ movaps [rbp+90h],xmm10
+ movdqa xmm10,xmm9
+ movdqa xmm6,[rax+rdi]
+ punpcklbw xmm1,xmm3
+ movaps [rbp+0A0h],xmm12
+ psubw xmm13,xmm1
+ movaps [rbp+40h],xmm15
+ movdqa xmm15,xmm14
+ movaps [rbp+10h],xmm7
+ movdqa xmm7,xmm6
+ punpcklbw xmm10,xmm3
+ movdqa xmm12,[r12+rdi]
+ punpcklbw xmm7,xmm3
+ punpcklbw xmm12,xmm3
+ punpcklbw xmm15,xmm3
+ pabsw xmm3,xmm13
+ movdqa xmm13,xmm10
+ psubw xmm13,xmm15
+ movdqa [rbp+0F0h],xmm15
+ pabsw xmm15,xmm13
+ movdqa xmm13,xmm11
+ movdqa [rbp+0B0h],xmm1
+ movdqa xmm1,xmm0
+ pavgw xmm13,xmm10
+ pcmpgtw xmm1,xmm3
+ movdqa [rbp+120h],xmm13
+ movaps xmm13,xmm2
+ punpcklwd xmm4,xmm4
+ movdqa xmm3,xmm0
+ movdqa [rbp+100h],xmm1
+ psubw xmm13,xmm1
+ movdqa xmm1,xmm10
+ pcmpgtw xmm3,xmm15
+ pshufd xmm4,xmm4,0
+ psubw xmm1,xmm11
+ movdqa [rbp+0D0h],xmm10
+ psubw xmm13,xmm3
+ movdqa [rbp+110h],xmm3
+ pabsw xmm15,xmm1
+ movdqa xmm3,xmm4
+ psubw xmm10,xmm12
+ pcmpgtw xmm3,xmm15
+ pabsw xmm15,xmm10
+ movdqa xmm10,xmm0
+ psllw xmm1,2
+ movdqa [rbp+0C0h],xmm11
+ psubw xmm11,xmm7
+ pcmpgtw xmm10,xmm15
+ pabsw xmm11,xmm11
+ movdqa xmm15,xmm0
+ pand xmm3,xmm10
+ pcmpgtw xmm15,xmm11
+ movaps xmm11,xmm2
+ pxor xmm10,xmm10
+ pand xmm3,xmm15
+ pcmpgtw xmm11,xmm10
+ pcmpeqw xmm10,xmm2
+ por xmm11,xmm10
+ pand xmm3,xmm11
+ movdqa xmm11,xmm7
+ psubw xmm11,xmm12
+ pxor xmm15,xmm15
+ paddw xmm11,xmm1
+ psubw xmm15,xmm13
+ movdqa [rbp+0E0h],xmm12
+ paddw xmm11,[FOUR_16B_SSE2]
+ pxor xmm12,xmm12
+ psraw xmm11,3
+ punpckhbw xmm8,xmm12
+ pmaxsw xmm15,xmm11
+ punpckhbw xmm5,xmm12
+ movdqa xmm11,xmm8
+ pminsw xmm13,xmm15
+ psubw xmm11,xmm5
+ punpckhbw xmm9,xmm12
+ pand xmm13,xmm3
+ movdqa [rbp+130h],xmm13
+ pabsw xmm13,xmm11
+ punpckhbw xmm14,xmm12
+ movdqa xmm11,xmm9
+ psubw xmm11,xmm14
+ movdqa xmm15,xmm0
+ movdqa [rbp+140h],xmm14
+ pabsw xmm14,xmm11
+ movdqa xmm11,xmm8
+ pcmpgtw xmm15,xmm14
+ movdqa xmm1,[r12+rdi]
+ pavgw xmm11,xmm9
+ movdqa [rbp+170h],xmm11
+ movdqa xmm10,xmm9
+ punpckhbw xmm6,xmm12
+ psubw xmm10,xmm8
+ punpckhbw xmm1,xmm12
+ movdqa xmm12,xmm0
+ movaps xmm11,[rbp+0A0h]
+ pcmpgtw xmm12,xmm13
+ movaps xmm13,xmm11
+ psubw xmm13,xmm12
+ movdqa [rbp+160h],xmm15
+ psubw xmm13,xmm15
+ movdqa xmm15,xmm9
+ psubw xmm15,xmm1
+ movdqa [rbp+150h],xmm12
+ pabsw xmm12,xmm10
+ pabsw xmm14,xmm15
+ movdqa xmm15,xmm8
+ pcmpgtw xmm4,xmm12
+ movdqa xmm12,xmm0
+ psubw xmm15,xmm6
+ pcmpgtw xmm12,xmm14
+ pabsw xmm14,xmm15
+ psllw xmm10,2
+ pcmpgtw xmm0,xmm14
+ movdqa xmm14,xmm6
+ psubw xmm14,xmm1
+ pand xmm4,xmm12
+ paddw xmm14,xmm10
+ pand xmm4,xmm0
+ paddw xmm14,[FOUR_16B_SSE2]
+ pxor xmm15,xmm15
+ movaps xmm12,xmm11
+ psubw xmm15,xmm13
+ pxor xmm0,xmm0
+ psraw xmm14,3
+ pcmpgtw xmm12,xmm0
+ pcmpeqw xmm0,xmm11
+ pmaxsw xmm15,xmm14
+ por xmm12,xmm0
+ movdqa xmm0,[rbp+120h]
+ pminsw xmm13,xmm15
+ movdqa xmm15,[rbp+0B0h]
+ movdqa xmm10,xmm7
+ pand xmm4,xmm12
+ paddw xmm15,xmm0
+ pxor xmm12,xmm12
+ paddw xmm10,xmm7
+ movdqa xmm14,xmm12
+ psubw xmm15,xmm10
+ psubw xmm14,xmm2
+ psraw xmm15,1
+ pmaxsw xmm15,xmm14
+ movdqa xmm10,xmm6
+ pminsw xmm15,xmm2
+ paddw xmm10,xmm6
+ pand xmm15,xmm3
+ psubw xmm12,xmm11
+ pand xmm15,[rbp+100h]
+ pand xmm13,xmm4
+ paddw xmm7,xmm15
+ paddw xmm8,xmm13
+ movdqa xmm15,[rbp+170h]
+ psubw xmm9,xmm13
+ paddw xmm5,xmm15
+ psubw xmm5,xmm10
+ psraw xmm5,1
+ pmaxsw xmm5,xmm12
+ pminsw xmm5,xmm11
+ pand xmm5,xmm4
+ pand xmm5,[rbp+150h]
+ paddw xmm6,xmm5
+ movdqa xmm5,[rbp+0C0h]
+ packuswb xmm7,xmm6
+ movdqa xmm6,[rbp+130h]
+ paddw xmm5,xmm6
+ packuswb xmm5,xmm8
+ movdqa xmm8,[rbp+0D0h]
+ psubw xmm8,xmm6
+ movdqa xmm6,[rbp+0F0h]
+ paddw xmm6,xmm0
+ movdqa xmm0,[rbp+0E0h]
+ packuswb xmm8,xmm9
+ movdqa xmm9,xmm0
+ paddw xmm9,xmm0
+ psubw xmm6,xmm9
+ psraw xmm6,1
+ pmaxsw xmm14,xmm6
+ pminsw xmm2,xmm14
+ pand xmm2,xmm3
+ pand xmm2,[rbp+110h]
+ paddw xmm0,xmm2
+ movdqa xmm2,[rbp+140h]
+ paddw xmm2,xmm15
+ movdqa xmm15,xmm1
+ paddw xmm15,xmm1
+ psubw xmm2,xmm15
+ psraw xmm2,1
+ pmaxsw xmm12,xmm2
+ pminsw xmm11,xmm12
+ pand xmm11,xmm4
+ pand xmm11,[rbp+160h]
+ paddw xmm1,xmm11
+ movdqa [rax+rdi],xmm7
+ movdqa [r10],xmm5
+ packuswb xmm0,xmm1
+ movdqa [rdi],xmm8
+ movdqa [r12+rdi],xmm0
+ mov r12,qword [rbp+180h]
+ lea rsp,[rbp+190h]
+ pop rbp
+ ret
WELS_EXTERN DeblockLumaEq4V_sse2
@@ -1907,637 +1907,637 @@
ALIGN 16
DeblockLumaEq4V_sse2:
- mov rax,rsp
- push rbx
- push rbp
+ mov rax,rsp
+ push rbx
+ push rbp
mov r8, rdx
mov r9, rcx
mov rcx, rdi
mov rdx, rsi
- sub rsp,1D8h
- movaps [rax-38h],xmm6
- movaps [rax-48h],xmm7
- movaps [rax-58h],xmm8
- pxor xmm1,xmm1
- movsxd r10,edx
- mov rbp,rcx
- mov r11d,r8d
- mov rdx,rcx
- mov rdi,rbp
- mov rbx,rbp
- movdqa xmm5,[rbp]
- movaps [rax-68h],xmm9
- movaps [rax-78h],xmm10
- punpcklbw xmm5,xmm1
- movaps [rax-88h],xmm11
- movaps [rax-98h],xmm12
- movaps [rax-0A8h],xmm13
- movaps [rax-0B8h],xmm14
- movdqa xmm14,[r10+rbp]
- movaps [rax-0C8h],xmm15
- lea eax,[r10*4]
- movsxd r8,eax
- lea eax,[r10+r10*2]
- movsxd rcx,eax
- lea eax,[r10+r10]
- sub rdx,r8
- punpcklbw xmm14,xmm1
- movdqa [rsp+90h],xmm5
- movdqa [rsp+30h],xmm14
- movsxd rsi,eax
- movsx eax,r11w
- sub rdi,rcx
- sub rbx,rsi
- mov r8,rbp
- sub r8,r10
- movd xmm0,eax
- movsx eax,r9w
- movdqa xmm12,[rdi]
- movdqa xmm6, [rsi+rbp]
- movdqa xmm13,[rbx]
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- punpcklbw xmm13,xmm1
- punpcklbw xmm6,xmm1
- movdqa xmm8,[r8]
- movd xmm0,eax
- movdqa xmm10,xmm11
- mov eax,2
- punpcklbw xmm8,xmm1
- punpcklbw xmm12,xmm1
- cwde
- punpcklwd xmm0,xmm0
- psraw xmm10,2
- movdqa xmm1,xmm8
- movdqa [rsp+0F0h],xmm13
- movdqa [rsp+0B0h],xmm8
- pshufd xmm7,xmm0,0
- psubw xmm1,xmm13
- movdqa xmm0,xmm5
- movdqa xmm4,xmm7
- movdqa xmm2,xmm7
- psubw xmm0,xmm8
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm5
- movdqa [rsp+40h],xmm7
- movdqa [rsp+60h],xmm6
- pcmpgtw xmm4,xmm0
- psubw xmm1,xmm14
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm4,xmm2
- movdqa xmm0,xmm11
- pcmpgtw xmm0,xmm3
- pand xmm4,xmm0
- movd xmm0,eax
- movdqa [rsp+20h],xmm4
- punpcklwd xmm0,xmm0
- pshufd xmm2,xmm0,0
- paddw xmm10,xmm2
- movdqa [rsp+0A0h],xmm2
- movdqa xmm15,xmm7
- pxor xmm4,xmm4
- movdqa xmm0,xmm8
- psubw xmm0,xmm12
- mov eax,4
- pabsw xmm0,xmm0
- movdqa xmm1,xmm10
- cwde
- pcmpgtw xmm15,xmm0
- pcmpgtw xmm1,xmm3
- movdqa xmm3,xmm7
- movdqa xmm7,[rdx]
- movdqa xmm0,xmm5
- psubw xmm0,xmm6
- pand xmm15,xmm1
- punpcklbw xmm7,xmm4
- movdqa xmm9,xmm15
- pabsw xmm0,xmm0
- psllw xmm7,1
- pandn xmm9,xmm12
- pcmpgtw xmm3,xmm0
- paddw xmm7,xmm12
- movd xmm0,eax
- pand xmm3,xmm1
- paddw xmm7,xmm12
- punpcklwd xmm0,xmm0
- paddw xmm7,xmm12
- pshufd xmm1,xmm0,0
- paddw xmm7,xmm13
- movdqa xmm0,xmm3
- pandn xmm0,xmm6
- paddw xmm7,xmm8
- movdqa [rsp+70h],xmm1
- paddw xmm7,xmm5
- movdqa [rsp+120h],xmm0
- movdqa xmm0,[rcx+rbp]
- punpcklbw xmm0,xmm4
- paddw xmm7,xmm1
- movdqa xmm4,xmm15
- psllw xmm0,1
- psraw xmm7,3
- paddw xmm0,xmm6
- pand xmm7,xmm15
- paddw xmm0,xmm6
- paddw xmm0,xmm6
- paddw xmm0,xmm14
- movdqa xmm6,xmm15
- paddw xmm0,xmm5
- pandn xmm6,xmm13
- paddw xmm0,xmm8
- paddw xmm0,xmm1
- psraw xmm0,3
- movdqa xmm1,xmm12
- paddw xmm1,xmm13
- pand xmm0,xmm3
- movdqa [rsp+100h],xmm0
- movdqa xmm0,xmm8
- paddw xmm0,xmm5
- paddw xmm1,xmm0
- movdqa xmm0,xmm3
- paddw xmm1,xmm2
- psraw xmm1,2
- pandn xmm0,xmm14
- pand xmm4,xmm1
- movdqa [rsp+0E0h],xmm0
- movdqa xmm0,xmm5
- paddw xmm0,xmm8
- movdqa xmm1,[rsp+60h]
- paddw xmm1,xmm14
- movdqa xmm14,xmm3
- paddw xmm1,xmm0
- movdqa xmm0,xmm8
- paddw xmm0,[rsp+30h]
- paddw xmm1,xmm2
- psraw xmm1,2
- pand xmm14,xmm1
- movdqa xmm1,xmm13
- paddw xmm1,xmm13
- paddw xmm1,xmm0
- paddw xmm1,xmm2
- psraw xmm1,2
- movdqa xmm0,[rsp+30h]
- movdqa xmm2,xmm13
- movdqa xmm5,xmm15
- paddw xmm0,[rsp+70h]
- pandn xmm5,xmm1
- paddw xmm2,xmm8
- movdqa xmm8,[rsp+90h]
- movdqa xmm1,xmm12
- paddw xmm2,xmm8
- psllw xmm2,1
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,xmm8
- movdqa xmm8,xmm3
- movdqa xmm2,[rsp+30h]
- paddw xmm0,xmm13
- psraw xmm1,3
- pand xmm15,xmm1
- movdqa xmm1,xmm2
- paddw xmm1,xmm2
- paddw xmm2,[rsp+90h]
- paddw xmm2,[rsp+0B0h]
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- movdqa xmm13,[r8]
- paddw xmm0, [rsp+70h]
- paddw xmm1, [rsp+0A0h]
- psllw xmm2,1
- paddw xmm2,xmm0
- psraw xmm1,2
- movdqa xmm0, [rdi]
- pandn xmm8,xmm1
- movdqa xmm1, [rsp+60h]
- paddw xmm1,xmm2
- movdqa xmm2, [rbx]
- psraw xmm1,3
- pand xmm3,xmm1
- movdqa xmm1, [rbp]
- movdqa [rsp+0D0h],xmm3
- pxor xmm3,xmm3
- punpckhbw xmm0,xmm3
- punpckhbw xmm1,xmm3
- punpckhbw xmm13,xmm3
- movdqa [rsp+0C0h],xmm0
- movdqa xmm0,[r10+rbp]
- movdqa [rsp],xmm1
- punpckhbw xmm0,xmm3
- punpckhbw xmm2,xmm3
- movdqa [rsp+80h],xmm0
- movdqa xmm0,[rsi+rbp]
- movdqa [rsp+10h],xmm13
- punpckhbw xmm0,xmm3
- movdqa [rsp+50h],xmm0
- movdqa xmm0,xmm1
- movdqa xmm1,xmm13
- psubw xmm0,xmm13
- psubw xmm1,xmm2
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,[rsp]
- movdqa xmm13,[rsp+40h]
- movdqa [rsp+110h],xmm2
- psubw xmm1, [rsp+80h]
- pcmpgtw xmm13,xmm0
- pcmpgtw xmm11,xmm3
- pabsw xmm0,xmm1
- pcmpgtw xmm10,xmm3
- movdqa xmm1, [rsp+40h]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1
- pcmpgtw xmm2,xmm0
- movdqa xmm0, [rsp+10h]
- pand xmm13,xmm2
- pand xmm13,xmm11
- movdqa xmm11,[rsp+0C0h]
- psubw xmm0,xmm11
- pabsw xmm0,xmm0
- pcmpgtw xmm3,xmm0
- pand xmm3,xmm10
- movdqa xmm0,[rsp]
- psubw xmm0,[rsp+50h]
- movdqa xmm2,[rdx]
- pabsw xmm0,xmm0
- por xmm7,xmm9
- movdqa xmm9,[rsp+20h]
- pcmpgtw xmm1,xmm0
- pand xmm9,xmm7
- movdqa xmm7,[rsp+20h]
- movdqa xmm0,xmm7
- pandn xmm0,xmm12
- movdqa xmm12,[rsp+110h]
- pand xmm1,xmm10
- movdqa xmm10,[rsp+70h]
- movdqa [rsp+40h],xmm1
- movdqa xmm1,xmm13
- por xmm9,xmm0
- pxor xmm0,xmm0
- por xmm4,xmm6
- movdqa xmm6,xmm7
- punpckhbw xmm2,xmm0
- por xmm15,xmm5
- movdqa xmm5,[rsp+20h]
- movdqa xmm0,xmm3
- psllw xmm2,1
- pandn xmm0,xmm11
- pand xmm6,xmm4
- movdqa xmm4,[rsp]
- paddw xmm2,xmm11
- pand xmm5,xmm15
- movdqa xmm15,[rsp+20h]
- paddw xmm2,xmm11
- paddw xmm2,xmm11
- paddw xmm2,xmm12
- paddw xmm2,[rsp+10h]
- paddw xmm2,[rsp]
- paddw xmm2,xmm10
- psraw xmm2,3
- pand xmm2,xmm3
- por xmm2,xmm0
- pand xmm1,xmm2
- movdqa xmm0,xmm13
- movdqa xmm2,xmm11
- pandn xmm0,xmm11
- paddw xmm2,xmm12
- por xmm1,xmm0
- packuswb xmm9,xmm1
- movdqa xmm0,xmm7
- movdqa xmm7,[rsp+0A0h]
- pandn xmm0,[rsp+0F0h]
- movdqa xmm1,xmm3
- por xmm6,xmm0
- movdqa xmm0,[rsp+10h]
- paddw xmm0,xmm4
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm3
- pandn xmm0,xmm12
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- pandn xmm0,xmm12
- movdqa xmm1,xmm12
- paddw xmm1,[rsp+10h]
- por xmm2,xmm0
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+0B0h]
- paddw xmm1,xmm4
- packuswb xmm6,xmm2
- movdqa xmm2,xmm3
- psllw xmm1,1
- por xmm5,xmm0
- movdqa xmm0,[rsp+80h]
- paddw xmm0,xmm10
- paddw xmm1,xmm0
- paddw xmm11,xmm1
- psraw xmm11,3
- movdqa xmm1,xmm12
- pand xmm2,xmm11
- paddw xmm1,xmm12
- movdqa xmm11,[rsp+80h]
- movdqa xmm0, [rsp+10h]
- por xmm14,[rsp+0E0h]
- paddw xmm0,xmm11
- movdqa xmm4,xmm15
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- paddw xmm1,xmm7
- psraw xmm1,2
- pandn xmm3,xmm1
- por xmm2,xmm3
- movdqa xmm1,xmm13
- movdqa xmm3,[rsp+10h]
- pandn xmm0,xmm3
- pand xmm1,xmm2
- movdqa xmm2,xmm11
- paddw xmm2,[rsp]
- por xmm1,xmm0
- movdqa xmm0,[rsp+0D0h]
- por xmm0,xmm8
- paddw xmm2,xmm3
- packuswb xmm5,xmm1
- movdqa xmm8,[rsp+40h]
- movdqa xmm1,[rsp+50h]
- movdqa xmm3,xmm8
- pand xmm4,xmm0
- psllw xmm2,1
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+90h]
- por xmm4,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm10
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,[rsp]
- movdqa xmm2,xmm11
- paddw xmm0,xmm12
- movdqa xmm12,[rsp]
- paddw xmm2,xmm11
- paddw xmm2,xmm0
- psraw xmm1,3
- movdqa xmm0,xmm8
- pand xmm3,xmm1
- paddw xmm2,xmm7
- movdqa xmm1,xmm13
- psraw xmm2,2
- pandn xmm0,xmm2
- por xmm3,xmm0
- movdqa xmm2,[rsp+50h]
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm3
- paddw xmm2,xmm11
- movdqa xmm3,xmm15
- por xmm1,xmm0
- pand xmm3,xmm14
- movdqa xmm14,[rsp+10h]
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+30h]
- packuswb xmm4,xmm1
- movdqa xmm1,xmm8
- por xmm3,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm14
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm8
- pandn xmm0,xmm11
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm11
- pand xmm2,xmm1
- movdqa xmm1,xmm15
- por xmm2,xmm0
- packuswb xmm3,xmm2
- movdqa xmm0,[rsp+100h]
- por xmm0,[rsp+120h]
- pand xmm1,xmm0
- movdqa xmm2,[rcx+rbp]
- movdqa xmm7,[rsp+50h]
- pandn xmm15,[rsp+60h]
- lea r11,[rsp+1D8h]
- pxor xmm0,xmm0
- por xmm1,xmm15
- movaps xmm15,[r11-0A8h]
- movdqa [rdi],xmm9
- movaps xmm9,[r11-48h]
- punpckhbw xmm2,xmm0
- psllw xmm2,1
- paddw xmm2,xmm7
- paddw xmm2,xmm7
- movdqa [rbx],xmm6
- movaps xmm6,[r11-18h]
- paddw xmm2,xmm7
- paddw xmm2,xmm11
- movaps xmm11,[r11-68h]
- paddw xmm2,xmm12
- movaps xmm12,[r11-78h]
- paddw xmm2,xmm14
- paddw xmm2,xmm10
- psraw xmm2,3
- movaps xmm10,[r11-58h]
- movaps xmm14,[r11-98h]
- movdqa xmm0,xmm13
- pand xmm2,xmm8
- pandn xmm8,xmm7
- pandn xmm13,xmm7
- por xmm2,xmm8
- movaps xmm7,[r11-28h]
- movaps xmm8,[r11-38h]
- movdqa [r8],xmm5
- pand xmm0,xmm2
- por xmm0,xmm13
- packuswb xmm1,xmm0
- movaps xmm13,[r11-88h]
- movdqa [rbp],xmm4
- movdqa [r10+rbp],xmm3
- movdqa [rsi+rbp],xmm1
- mov rsp,r11
- pop rbp
- pop rbx
+ sub rsp,1D8h
+ movaps [rax-38h],xmm6
+ movaps [rax-48h],xmm7
+ movaps [rax-58h],xmm8
+ pxor xmm1,xmm1
+ movsxd r10,edx
+ mov rbp,rcx
+ mov r11d,r8d
+ mov rdx,rcx
+ mov rdi,rbp
+ mov rbx,rbp
+ movdqa xmm5,[rbp]
+ movaps [rax-68h],xmm9
+ movaps [rax-78h],xmm10
+ punpcklbw xmm5,xmm1
+ movaps [rax-88h],xmm11
+ movaps [rax-98h],xmm12
+ movaps [rax-0A8h],xmm13
+ movaps [rax-0B8h],xmm14
+ movdqa xmm14,[r10+rbp]
+ movaps [rax-0C8h],xmm15
+ lea eax,[r10*4]
+ movsxd r8,eax
+ lea eax,[r10+r10*2]
+ movsxd rcx,eax
+ lea eax,[r10+r10]
+ sub rdx,r8
+ punpcklbw xmm14,xmm1
+ movdqa [rsp+90h],xmm5
+ movdqa [rsp+30h],xmm14
+ movsxd rsi,eax
+ movsx eax,r11w
+ sub rdi,rcx
+ sub rbx,rsi
+ mov r8,rbp
+ sub r8,r10
+ movd xmm0,eax
+ movsx eax,r9w
+ movdqa xmm12,[rdi]
+ movdqa xmm6, [rsi+rbp]
+ movdqa xmm13,[rbx]
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm6,xmm1
+ movdqa xmm8,[r8]
+ movd xmm0,eax
+ movdqa xmm10,xmm11
+ mov eax,2
+ punpcklbw xmm8,xmm1
+ punpcklbw xmm12,xmm1
+ cwde
+ punpcklwd xmm0,xmm0
+ psraw xmm10,2
+ movdqa xmm1,xmm8
+ movdqa [rsp+0F0h],xmm13
+ movdqa [rsp+0B0h],xmm8
+ pshufd xmm7,xmm0,0
+ psubw xmm1,xmm13
+ movdqa xmm0,xmm5
+ movdqa xmm4,xmm7
+ movdqa xmm2,xmm7
+ psubw xmm0,xmm8
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm5
+ movdqa [rsp+40h],xmm7
+ movdqa [rsp+60h],xmm6
+ pcmpgtw xmm4,xmm0
+ psubw xmm1,xmm14
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm4,xmm2
+ movdqa xmm0,xmm11
+ pcmpgtw xmm0,xmm3
+ pand xmm4,xmm0
+ movd xmm0,eax
+ movdqa [rsp+20h],xmm4
+ punpcklwd xmm0,xmm0
+ pshufd xmm2,xmm0,0
+ paddw xmm10,xmm2
+ movdqa [rsp+0A0h],xmm2
+ movdqa xmm15,xmm7
+ pxor xmm4,xmm4
+ movdqa xmm0,xmm8
+ psubw xmm0,xmm12
+ mov eax,4
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm10
+ cwde
+ pcmpgtw xmm15,xmm0
+ pcmpgtw xmm1,xmm3
+ movdqa xmm3,xmm7
+ movdqa xmm7,[rdx]
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm6
+ pand xmm15,xmm1
+ punpcklbw xmm7,xmm4
+ movdqa xmm9,xmm15
+ pabsw xmm0,xmm0
+ psllw xmm7,1
+ pandn xmm9,xmm12
+ pcmpgtw xmm3,xmm0
+ paddw xmm7,xmm12
+ movd xmm0,eax
+ pand xmm3,xmm1
+ paddw xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ paddw xmm7,xmm12
+ pshufd xmm1,xmm0,0
+ paddw xmm7,xmm13
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm6
+ paddw xmm7,xmm8
+ movdqa [rsp+70h],xmm1
+ paddw xmm7,xmm5
+ movdqa [rsp+120h],xmm0
+ movdqa xmm0,[rcx+rbp]
+ punpcklbw xmm0,xmm4
+ paddw xmm7,xmm1
+ movdqa xmm4,xmm15
+ psllw xmm0,1
+ psraw xmm7,3
+ paddw xmm0,xmm6
+ pand xmm7,xmm15
+ paddw xmm0,xmm6
+ paddw xmm0,xmm6
+ paddw xmm0,xmm14
+ movdqa xmm6,xmm15
+ paddw xmm0,xmm5
+ pandn xmm6,xmm13
+ paddw xmm0,xmm8
+ paddw xmm0,xmm1
+ psraw xmm0,3
+ movdqa xmm1,xmm12
+ paddw xmm1,xmm13
+ pand xmm0,xmm3
+ movdqa [rsp+100h],xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,xmm5
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm3
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pandn xmm0,xmm14
+ pand xmm4,xmm1
+ movdqa [rsp+0E0h],xmm0
+ movdqa xmm0,xmm5
+ paddw xmm0,xmm8
+ movdqa xmm1,[rsp+60h]
+ paddw xmm1,xmm14
+ movdqa xmm14,xmm3
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,[rsp+30h]
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pand xmm14,xmm1
+ movdqa xmm1,xmm13
+ paddw xmm1,xmm13
+ paddw xmm1,xmm0
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ movdqa xmm0,[rsp+30h]
+ movdqa xmm2,xmm13
+ movdqa xmm5,xmm15
+ paddw xmm0,[rsp+70h]
+ pandn xmm5,xmm1
+ paddw xmm2,xmm8
+ movdqa xmm8,[rsp+90h]
+ movdqa xmm1,xmm12
+ paddw xmm2,xmm8
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,xmm8
+ movdqa xmm8,xmm3
+ movdqa xmm2,[rsp+30h]
+ paddw xmm0,xmm13
+ psraw xmm1,3
+ pand xmm15,xmm1
+ movdqa xmm1,xmm2
+ paddw xmm1,xmm2
+ paddw xmm2,[rsp+90h]
+ paddw xmm2,[rsp+0B0h]
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ movdqa xmm13,[r8]
+ paddw xmm0, [rsp+70h]
+ paddw xmm1, [rsp+0A0h]
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ psraw xmm1,2
+ movdqa xmm0, [rdi]
+ pandn xmm8,xmm1
+ movdqa xmm1, [rsp+60h]
+ paddw xmm1,xmm2
+ movdqa xmm2, [rbx]
+ psraw xmm1,3
+ pand xmm3,xmm1
+ movdqa xmm1, [rbp]
+ movdqa [rsp+0D0h],xmm3
+ pxor xmm3,xmm3
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm1,xmm3
+ punpckhbw xmm13,xmm3
+ movdqa [rsp+0C0h],xmm0
+ movdqa xmm0,[r10+rbp]
+ movdqa [rsp],xmm1
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm2,xmm3
+ movdqa [rsp+80h],xmm0
+ movdqa xmm0,[rsi+rbp]
+ movdqa [rsp+10h],xmm13
+ punpckhbw xmm0,xmm3
+ movdqa [rsp+50h],xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm13
+ psubw xmm0,xmm13
+ psubw xmm1,xmm2
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,[rsp]
+ movdqa xmm13,[rsp+40h]
+ movdqa [rsp+110h],xmm2
+ psubw xmm1, [rsp+80h]
+ pcmpgtw xmm13,xmm0
+ pcmpgtw xmm11,xmm3
+ pabsw xmm0,xmm1
+ pcmpgtw xmm10,xmm3
+ movdqa xmm1, [rsp+40h]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1
+ pcmpgtw xmm2,xmm0
+ movdqa xmm0, [rsp+10h]
+ pand xmm13,xmm2
+ pand xmm13,xmm11
+ movdqa xmm11,[rsp+0C0h]
+ psubw xmm0,xmm11
+ pabsw xmm0,xmm0
+ pcmpgtw xmm3,xmm0
+ pand xmm3,xmm10
+ movdqa xmm0,[rsp]
+ psubw xmm0,[rsp+50h]
+ movdqa xmm2,[rdx]
+ pabsw xmm0,xmm0
+ por xmm7,xmm9
+ movdqa xmm9,[rsp+20h]
+ pcmpgtw xmm1,xmm0
+ pand xmm9,xmm7
+ movdqa xmm7,[rsp+20h]
+ movdqa xmm0,xmm7
+ pandn xmm0,xmm12
+ movdqa xmm12,[rsp+110h]
+ pand xmm1,xmm10
+ movdqa xmm10,[rsp+70h]
+ movdqa [rsp+40h],xmm1
+ movdqa xmm1,xmm13
+ por xmm9,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm6
+ movdqa xmm6,xmm7
+ punpckhbw xmm2,xmm0
+ por xmm15,xmm5
+ movdqa xmm5,[rsp+20h]
+ movdqa xmm0,xmm3
+ psllw xmm2,1
+ pandn xmm0,xmm11
+ pand xmm6,xmm4
+ movdqa xmm4,[rsp]
+ paddw xmm2,xmm11
+ pand xmm5,xmm15
+ movdqa xmm15,[rsp+20h]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm11
+ paddw xmm2,xmm12
+ paddw xmm2,[rsp+10h]
+ paddw xmm2,[rsp]
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ pand xmm2,xmm3
+ por xmm2,xmm0
+ pand xmm1,xmm2
+ movdqa xmm0,xmm13
+ movdqa xmm2,xmm11
+ pandn xmm0,xmm11
+ paddw xmm2,xmm12
+ por xmm1,xmm0
+ packuswb xmm9,xmm1
+ movdqa xmm0,xmm7
+ movdqa xmm7,[rsp+0A0h]
+ pandn xmm0,[rsp+0F0h]
+ movdqa xmm1,xmm3
+ por xmm6,xmm0
+ movdqa xmm0,[rsp+10h]
+ paddw xmm0,xmm4
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm12
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ pandn xmm0,xmm12
+ movdqa xmm1,xmm12
+ paddw xmm1,[rsp+10h]
+ por xmm2,xmm0
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+0B0h]
+ paddw xmm1,xmm4
+ packuswb xmm6,xmm2
+ movdqa xmm2,xmm3
+ psllw xmm1,1
+ por xmm5,xmm0
+ movdqa xmm0,[rsp+80h]
+ paddw xmm0,xmm10
+ paddw xmm1,xmm0
+ paddw xmm11,xmm1
+ psraw xmm11,3
+ movdqa xmm1,xmm12
+ pand xmm2,xmm11
+ paddw xmm1,xmm12
+ movdqa xmm11,[rsp+80h]
+ movdqa xmm0, [rsp+10h]
+ por xmm14,[rsp+0E0h]
+ paddw xmm0,xmm11
+ movdqa xmm4,xmm15
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ paddw xmm1,xmm7
+ psraw xmm1,2
+ pandn xmm3,xmm1
+ por xmm2,xmm3
+ movdqa xmm1,xmm13
+ movdqa xmm3,[rsp+10h]
+ pandn xmm0,xmm3
+ pand xmm1,xmm2
+ movdqa xmm2,xmm11
+ paddw xmm2,[rsp]
+ por xmm1,xmm0
+ movdqa xmm0,[rsp+0D0h]
+ por xmm0,xmm8
+ paddw xmm2,xmm3
+ packuswb xmm5,xmm1
+ movdqa xmm8,[rsp+40h]
+ movdqa xmm1,[rsp+50h]
+ movdqa xmm3,xmm8
+ pand xmm4,xmm0
+ psllw xmm2,1
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+90h]
+ por xmm4,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm10
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,[rsp]
+ movdqa xmm2,xmm11
+ paddw xmm0,xmm12
+ movdqa xmm12,[rsp]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm0
+ psraw xmm1,3
+ movdqa xmm0,xmm8
+ pand xmm3,xmm1
+ paddw xmm2,xmm7
+ movdqa xmm1,xmm13
+ psraw xmm2,2
+ pandn xmm0,xmm2
+ por xmm3,xmm0
+ movdqa xmm2,[rsp+50h]
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm3
+ paddw xmm2,xmm11
+ movdqa xmm3,xmm15
+ por xmm1,xmm0
+ pand xmm3,xmm14
+ movdqa xmm14,[rsp+10h]
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+30h]
+ packuswb xmm4,xmm1
+ movdqa xmm1,xmm8
+ por xmm3,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm14
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm8
+ pandn xmm0,xmm11
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm11
+ pand xmm2,xmm1
+ movdqa xmm1,xmm15
+ por xmm2,xmm0
+ packuswb xmm3,xmm2
+ movdqa xmm0,[rsp+100h]
+ por xmm0,[rsp+120h]
+ pand xmm1,xmm0
+ movdqa xmm2,[rcx+rbp]
+ movdqa xmm7,[rsp+50h]
+ pandn xmm15,[rsp+60h]
+ lea r11,[rsp+1D8h]
+ pxor xmm0,xmm0
+ por xmm1,xmm15
+ movaps xmm15,[r11-0A8h]
+ movdqa [rdi],xmm9
+ movaps xmm9,[r11-48h]
+ punpckhbw xmm2,xmm0
+ psllw xmm2,1
+ paddw xmm2,xmm7
+ paddw xmm2,xmm7
+ movdqa [rbx],xmm6
+ movaps xmm6,[r11-18h]
+ paddw xmm2,xmm7
+ paddw xmm2,xmm11
+ movaps xmm11,[r11-68h]
+ paddw xmm2,xmm12
+ movaps xmm12,[r11-78h]
+ paddw xmm2,xmm14
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ movaps xmm10,[r11-58h]
+ movaps xmm14,[r11-98h]
+ movdqa xmm0,xmm13
+ pand xmm2,xmm8
+ pandn xmm8,xmm7
+ pandn xmm13,xmm7
+ por xmm2,xmm8
+ movaps xmm7,[r11-28h]
+ movaps xmm8,[r11-38h]
+ movdqa [r8],xmm5
+ pand xmm0,xmm2
+ por xmm0,xmm13
+ packuswb xmm1,xmm0
+ movaps xmm13,[r11-88h]
+ movdqa [rbp],xmm4
+ movdqa [r10+rbp],xmm3
+ movdqa [rsi+rbp],xmm1
+ mov rsp,r11
+ pop rbp
+ pop rbx
ret
WELS_EXTERN DeblockChromaLt4V_sse2
-ALIGN 16
-DeblockChromaLt4V_sse2:
- mov rax,rsp
- push rbx
- push rbp
+ALIGN 16
+DeblockChromaLt4V_sse2:
+ mov rax,rsp
+ push rbx
+ push rbp
mov r10, rdx
mov r11, rcx
mov rcx, rdi
- mov rdx, rsi
+ mov rdx, rsi
mov rsi, r10
mov r10, r9
mov rbp, r8
mov r8, rsi
mov r9, r11
- sub rsp,0C8h
- pxor xmm1,xmm1
- mov rbx,rcx
- movsxd r11,r8d
- movsx ecx,byte [r10]
- movsx r8d,byte [r10+2]
- mov rdi,rdx
- movq xmm2,[rbx]
- movq xmm9,[r11+rbx]
- movsx edx,byte [r10+1]
- mov word [rsp+2],cx
- mov word [rsp],cx
- movsx eax,byte [r10+3]
- mov word [rsp+6],dx
- mov word [rsp+4],dx
- movdqa xmm11,xmm1
- mov word [rsp+0Eh],ax
- mov word [rsp+0Ch],ax
- lea eax,[r11+r11]
- movsxd rcx,eax
- mov rax,rbx
- mov rdx,rdi
- sub rax,rcx
- mov word [rsp+0Ah],r8w
- mov word [rsp+8],r8w
- movdqa xmm6,[rsp]
- movdqa xmm7,xmm6
- movq xmm13, [rax]
- mov rax,rdi
- sub rax,rcx
- mov rcx,rbx
- pcmpgtw xmm7,xmm1
- psubw xmm11,xmm6
- sub rcx,r11
- sub rdx,r11
- movq xmm0,[rax]
- movsx eax,r9w
- movq xmm15,[rcx]
- punpcklqdq xmm13,xmm0
- movq xmm0, [rdx]
- movdqa xmm4,xmm13
- punpcklqdq xmm15,xmm0
- movq xmm0, [rdi]
- punpcklbw xmm4,xmm1
- movdqa xmm12,xmm15
- punpcklqdq xmm2,xmm0
- movq xmm0, [r11+rdi]
- punpcklbw xmm12,xmm1
- movdqa xmm14,xmm2
- punpcklqdq xmm9,xmm0
- punpckhbw xmm2,xmm1
- punpcklbw xmm14,xmm1
- movd xmm0,eax
+ sub rsp,0C8h
+ pxor xmm1,xmm1
+ mov rbx,rcx
+ movsxd r11,r8d
+ movsx ecx,byte [r10]
+ movsx r8d,byte [r10+2]
+ mov rdi,rdx
+ movq xmm2,[rbx]
+ movq xmm9,[r11+rbx]
+ movsx edx,byte [r10+1]
+ mov word [rsp+2],cx
+ mov word [rsp],cx
+ movsx eax,byte [r10+3]
+ mov word [rsp+6],dx
+ mov word [rsp+4],dx
+ movdqa xmm11,xmm1
+ mov word [rsp+0Eh],ax
+ mov word [rsp+0Ch],ax
+ lea eax,[r11+r11]
+ movsxd rcx,eax
+ mov rax,rbx
+ mov rdx,rdi
+ sub rax,rcx
+ mov word [rsp+0Ah],r8w
+ mov word [rsp+8],r8w
+ movdqa xmm6,[rsp]
+ movdqa xmm7,xmm6
+ movq xmm13, [rax]
+ mov rax,rdi
+ sub rax,rcx
+ mov rcx,rbx
+ pcmpgtw xmm7,xmm1
+ psubw xmm11,xmm6
+ sub rcx,r11
+ sub rdx,r11
+ movq xmm0,[rax]
+ movsx eax,r9w
+ movq xmm15,[rcx]
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rdx]
+ movdqa xmm4,xmm13
+ punpcklqdq xmm15,xmm0
+ movq xmm0, [rdi]
+ punpcklbw xmm4,xmm1
+ movdqa xmm12,xmm15
+ punpcklqdq xmm2,xmm0
+ movq xmm0, [r11+rdi]
+ punpcklbw xmm12,xmm1
+ movdqa xmm14,xmm2
+ punpcklqdq xmm9,xmm0
+ punpckhbw xmm2,xmm1
+ punpcklbw xmm14,xmm1
+ movd xmm0,eax
mov eax, ebp ; iBeta
- punpckhbw xmm13,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm9
- movdqa [rsp+10h],xmm2
- punpcklwd xmm0,xmm0
- punpckhbw xmm9,xmm1
- punpcklbw xmm3,xmm1
- movdqa xmm1,xmm14
- pshufd xmm10,xmm0,0
- movd xmm0,eax
- mov eax,4
- cwde
- punpcklwd xmm0,xmm0
- pshufd xmm8,xmm0,0
- movd xmm0,eax
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- psubw xmm1,xmm12
- movdqa xmm2,xmm10
- lea r11,[rsp+0C8h]
- psllw xmm1,2
- movdqa xmm0,xmm4
- psubw xmm4,xmm12
- psubw xmm0,xmm3
- psubw xmm3,xmm14
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm11
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm12
- psubw xmm0,xmm14
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- movdqa xmm3,[rsp]
- pand xmm2,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- psubw xmm0,xmm9
- psubw xmm13,xmm15
- pand xmm2,xmm7
- pand xmm6,xmm2
- paddw xmm12,xmm6
- psubw xmm14,xmm6
- movdqa xmm2,[rsp+10h]
- movaps xmm6,[r11-18h]
- movdqa xmm1,xmm2
- psubw xmm1,xmm15
- psubw xmm9,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm15
- psubw xmm0,xmm2
- psraw xmm1,3
- pmaxsw xmm11,xmm1
- pabsw xmm0,xmm0
- movdqa xmm1,xmm8
- pcmpgtw xmm10,xmm0
- pabsw xmm0,xmm13
- pminsw xmm3,xmm11
- movaps xmm11,[r11-68h]
- movaps xmm13,[rsp+40h]
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm9
- movaps xmm9, [r11-48h]
- pand xmm10,xmm1
- pcmpgtw xmm8,xmm0
- pand xmm10,xmm8
- pand xmm10,xmm7
- movaps xmm8,[r11-38h]
- movaps xmm7,[r11-28h]
- pand xmm3,xmm10
- paddw xmm15,xmm3
- psubw xmm2,xmm3
- movaps xmm10,[r11-58h]
- packuswb xmm12,xmm15
- movaps xmm15,[rsp+20h]
- packuswb xmm14,xmm2
- movq [rcx],xmm12
- movq [rbx],xmm14
- psrldq xmm12,8
- psrldq xmm14,8
- movq [rdx],xmm12
- movaps xmm12,[r11-78h]
- movq [rdi],xmm14
- movaps xmm14,[rsp+30h]
- mov rsp,r11
- pop rbp
- pop rbx
+ punpckhbw xmm13,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm9
+ movdqa [rsp+10h],xmm2
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm9,xmm1
+ punpcklbw xmm3,xmm1
+ movdqa xmm1,xmm14
+ pshufd xmm10,xmm0,0
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ punpcklwd xmm0,xmm0
+ pshufd xmm8,xmm0,0
+ movd xmm0,eax
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ psubw xmm1,xmm12
+ movdqa xmm2,xmm10
+ lea r11,[rsp+0C8h]
+ psllw xmm1,2
+ movdqa xmm0,xmm4
+ psubw xmm4,xmm12
+ psubw xmm0,xmm3
+ psubw xmm3,xmm14
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm11
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm12
+ psubw xmm0,xmm14
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ movdqa xmm3,[rsp]
+ pand xmm2,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ psubw xmm0,xmm9
+ psubw xmm13,xmm15
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ paddw xmm12,xmm6
+ psubw xmm14,xmm6
+ movdqa xmm2,[rsp+10h]
+ movaps xmm6,[r11-18h]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm15
+ psubw xmm9,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm15
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ pmaxsw xmm11,xmm1
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm8
+ pcmpgtw xmm10,xmm0
+ pabsw xmm0,xmm13
+ pminsw xmm3,xmm11
+ movaps xmm11,[r11-68h]
+ movaps xmm13,[rsp+40h]
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm9
+ movaps xmm9, [r11-48h]
+ pand xmm10,xmm1
+ pcmpgtw xmm8,xmm0
+ pand xmm10,xmm8
+ pand xmm10,xmm7
+ movaps xmm8,[r11-38h]
+ movaps xmm7,[r11-28h]
+ pand xmm3,xmm10
+ paddw xmm15,xmm3
+ psubw xmm2,xmm3
+ movaps xmm10,[r11-58h]
+ packuswb xmm12,xmm15
+ movaps xmm15,[rsp+20h]
+ packuswb xmm14,xmm2
+ movq [rcx],xmm12
+ movq [rbx],xmm14
+ psrldq xmm12,8
+ psrldq xmm14,8
+ movq [rdx],xmm12
+ movaps xmm12,[r11-78h]
+ movq [rdi],xmm14
+ movaps xmm14,[rsp+30h]
+ mov rsp,r11
+ pop rbp
+ pop rbx
ret
WELS_EXTERN DeblockChromaEq4V_sse2
ALIGN 16
DeblockChromaEq4V_sse2:
- mov rax,rsp
- push rbx
+ mov rax,rsp
+ push rbx
push rbp
mov rbp, r8
@@ -2545,143 +2545,143 @@
mov r9, rcx
mov rcx, rdi
mov rdx, rsi
-
- sub rsp,90h
- pxor xmm1,xmm1
- mov r11,rcx
- mov rbx,rdx
- mov r10d,r9d
- movq xmm13,[r11]
- lea eax,[r8+r8]
- movsxd r9,eax
- mov rax,rcx
- sub rax,r9
- movq xmm14,[rax]
- mov rax,rdx
- sub rax,r9
- movq xmm0,[rax]
- movsxd rax,r8d
- sub rcx,rax
- sub rdx,rax
- movq xmm12,[rax+r11]
- movq xmm10,[rcx]
- punpcklqdq xmm14,xmm0
- movdqa xmm8,xmm14
- movq xmm0,[rdx]
- punpcklbw xmm8,xmm1
- punpckhbw xmm14,xmm1
- punpcklqdq xmm10,xmm0
- movq xmm0,[rbx]
- movdqa xmm5,xmm10
- punpcklqdq xmm13,xmm0
- movq xmm0, [rax+rbx]
- punpcklbw xmm5,xmm1
- movsx eax,r10w
- movdqa xmm9,xmm13
- punpcklqdq xmm12,xmm0
- punpcklbw xmm9,xmm1
- punpckhbw xmm10,xmm1
- movd xmm0,eax
+
+ sub rsp,90h
+ pxor xmm1,xmm1
+ mov r11,rcx
+ mov rbx,rdx
+ mov r10d,r9d
+ movq xmm13,[r11]
+ lea eax,[r8+r8]
+ movsxd r9,eax
+ mov rax,rcx
+ sub rax,r9
+ movq xmm14,[rax]
+ mov rax,rdx
+ sub rax,r9
+ movq xmm0,[rax]
+ movsxd rax,r8d
+ sub rcx,rax
+ sub rdx,rax
+ movq xmm12,[rax+r11]
+ movq xmm10,[rcx]
+ punpcklqdq xmm14,xmm0
+ movdqa xmm8,xmm14
+ movq xmm0,[rdx]
+ punpcklbw xmm8,xmm1
+ punpckhbw xmm14,xmm1
+ punpcklqdq xmm10,xmm0
+ movq xmm0,[rbx]
+ movdqa xmm5,xmm10
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rax+rbx]
+ punpcklbw xmm5,xmm1
+ movsx eax,r10w
+ movdqa xmm9,xmm13
+ punpcklqdq xmm12,xmm0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm10,xmm1
+ movd xmm0,eax
mov eax, ebp ; iBeta
- punpckhbw xmm13,xmm1
- movdqa xmm7,xmm12
- punpcklwd xmm0,xmm0
- punpckhbw xmm12,xmm1
- pshufd xmm11,xmm0,0
- punpcklbw xmm7,xmm1
- movd xmm0,eax
- movdqa xmm1,xmm8
- psubw xmm1,xmm5
- punpcklwd xmm0,xmm0
- movdqa xmm6,xmm11
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm5
- psubw xmm0,xmm9
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm10
- movdqa xmm1,xmm14
- psubw xmm0,xmm13
- psubw xmm1,xmm10
- pabsw xmm0,xmm0
- pcmpgtw xmm11,xmm0
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm11,xmm2
- movdqa xmm0,xmm12
- movdqa xmm4,xmm6
- movdqa xmm1,xmm8
- mov eax,2
- cwde
- paddw xmm1,xmm8
- psubw xmm0,xmm13
- paddw xmm1,xmm5
- pabsw xmm0,xmm0
- movdqa xmm2,xmm14
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm14
- movd xmm0,eax
- pand xmm11,xmm3
- paddw xmm7,xmm7
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- paddw xmm2,xmm12
- paddw xmm12,xmm12
- pshufd xmm3,xmm0,0
- paddw xmm7,xmm9
- paddw xmm12,xmm13
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm5
- paddw xmm7,xmm8
- psraw xmm1,2
- paddw xmm12,xmm14
- paddw xmm7,xmm3
- ;movaps xmm14,[rsp]
- pand xmm4,xmm1
- paddw xmm12,xmm3
- psraw xmm7,2
- movdqa xmm1,xmm11
- por xmm4,xmm0
- psraw xmm12,2
- paddw xmm2,xmm3
- movdqa xmm0,xmm11
- pandn xmm0,xmm10
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- packuswb xmm4,xmm1
- movdqa xmm0,xmm11
- movdqa xmm1,xmm6
- pand xmm1,xmm7
- movq [rcx],xmm4
- pandn xmm6,xmm9
- pandn xmm11,xmm13
- pand xmm0,xmm12
- por xmm1,xmm6
- por xmm0,xmm11
- psrldq xmm4,8
- packuswb xmm1,xmm0
- movq [r11],xmm1
- psrldq xmm1,8
- movq [rdx],xmm4
- lea r11,[rsp+90h]
- movq [rbx],xmm1
- mov rsp,r11
+ punpckhbw xmm13,xmm1
+ movdqa xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm12,xmm1
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm7,xmm1
+ movd xmm0,eax
+ movdqa xmm1,xmm8
+ psubw xmm1,xmm5
+ punpcklwd xmm0,xmm0
+ movdqa xmm6,xmm11
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm9
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm10
+ movdqa xmm1,xmm14
+ psubw xmm0,xmm13
+ psubw xmm1,xmm10
+ pabsw xmm0,xmm0
+ pcmpgtw xmm11,xmm0
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm11,xmm2
+ movdqa xmm0,xmm12
+ movdqa xmm4,xmm6
+ movdqa xmm1,xmm8
+ mov eax,2
+ cwde
+ paddw xmm1,xmm8
+ psubw xmm0,xmm13
+ paddw xmm1,xmm5
+ pabsw xmm0,xmm0
+ movdqa xmm2,xmm14
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm14
+ movd xmm0,eax
+ pand xmm11,xmm3
+ paddw xmm7,xmm7
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ paddw xmm2,xmm12
+ paddw xmm12,xmm12
+ pshufd xmm3,xmm0,0
+ paddw xmm7,xmm9
+ paddw xmm12,xmm13
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm5
+ paddw xmm7,xmm8
+ psraw xmm1,2
+ paddw xmm12,xmm14
+ paddw xmm7,xmm3
+ ;movaps xmm14,[rsp]
+ pand xmm4,xmm1
+ paddw xmm12,xmm3
+ psraw xmm7,2
+ movdqa xmm1,xmm11
+ por xmm4,xmm0
+ psraw xmm12,2
+ paddw xmm2,xmm3
+ movdqa xmm0,xmm11
+ pandn xmm0,xmm10
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ packuswb xmm4,xmm1
+ movdqa xmm0,xmm11
+ movdqa xmm1,xmm6
+ pand xmm1,xmm7
+ movq [rcx],xmm4
+ pandn xmm6,xmm9
+ pandn xmm11,xmm13
+ pand xmm0,xmm12
+ por xmm1,xmm6
+ por xmm0,xmm11
+ psrldq xmm4,8
+ packuswb xmm1,xmm0
+ movq [r11],xmm1
+ psrldq xmm1,8
+ movq [rdx],xmm4
+ lea r11,[rsp+90h]
+ movq [rbx],xmm1
+ mov rsp,r11
pop rbp
- pop rbx
+ pop rbx
ret
@@ -2688,270 +2688,270 @@
WELS_EXTERN DeblockChromaEq4H_sse2
ALIGN 16
DeblockChromaEq4H_sse2:
- mov rax,rsp
- push rbx
- push rbp
+ mov rax,rsp
+ push rbx
+ push rbp
push r12
-
- mov rbp, r8
+
+ mov rbp, r8
mov r8, rdx
mov r9, rcx
mov rcx, rdi
- mov rdx, rsi
+ mov rdx, rsi
mov rdi, rdx
- sub rsp,140h
- lea eax,[r8*4]
- movsxd r10,eax
- mov eax,[rcx-2]
- mov [rsp+10h],eax
- lea rbx,[r10+rdx-2]
- lea r11,[r10+rcx-2]
+ sub rsp,140h
+ lea eax,[r8*4]
+ movsxd r10,eax
+ mov eax,[rcx-2]
+ mov [rsp+10h],eax
+ lea rbx,[r10+rdx-2]
+ lea r11,[r10+rcx-2]
- movdqa xmm5,[rsp+10h]
- movsxd r10,r8d
- mov eax,[r10+rcx-2]
- lea rdx,[r10+r10*2]
- mov [rsp+20h],eax
- mov eax,[rcx+r10*2-2]
- mov [rsp+30h],eax
+ movdqa xmm5,[rsp+10h]
+ movsxd r10,r8d
+ mov eax,[r10+rcx-2]
+ lea rdx,[r10+r10*2]
+ mov [rsp+20h],eax
+ mov eax,[rcx+r10*2-2]
+ mov [rsp+30h],eax
mov eax,[rdx+rcx-2]
- movdqa xmm2,[rsp+20h]
- mov [rsp+40h],eax
- mov eax, [rdi-2]
- movdqa xmm4,[rsp+30h]
- mov [rsp+50h],eax
- mov eax,[r10+rdi-2]
- movdqa xmm3,[rsp+40h]
- mov [rsp+60h],eax
- mov eax,[rdi+r10*2-2]
- punpckldq xmm5,[rsp+50h]
- mov [rsp+70h],eax
- mov eax, [rdx+rdi-2]
- punpckldq xmm2, [rsp+60h]
- mov [rsp+80h],eax
- mov eax,[r11]
- punpckldq xmm4, [rsp+70h]
- mov [rsp+50h],eax
- mov eax,[rbx]
- punpckldq xmm3,[rsp+80h]
- mov [rsp+60h],eax
- mov eax,[r10+r11]
- movdqa xmm0, [rsp+50h]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[r10+rbx]
- movdqa xmm0,[rsp+50h]
- movdqa xmm1,xmm5
- mov [rsp+60h],eax
- mov eax,[r11+r10*2]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[rbx+r10*2]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- mov eax, [rdx+r11]
- movdqa xmm15,xmm1
- punpckldq xmm0,[rsp+60h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax, [rdx+rbx]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm15,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm12,xmm15
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm12,xmm0
- punpckhdq xmm15,xmm0
- movdqa xmm0,xmm1
- movdqa xmm11,xmm12
- punpckldq xmm0,xmm5
- punpckhdq xmm1,xmm5
- punpcklqdq xmm11,xmm0
- punpckhqdq xmm12,xmm0
- movsx eax,r9w
- movdqa xmm14,xmm15
- punpcklqdq xmm14,xmm1
- punpckhqdq xmm15,xmm1
- pxor xmm1,xmm1
- movd xmm0,eax
- movdqa xmm4,xmm12
- movdqa xmm8,xmm11
- mov eax, ebp ; iBeta
- punpcklwd xmm0,xmm0
- punpcklbw xmm4,xmm1
- punpckhbw xmm12,xmm1
- movdqa xmm9,xmm14
- movdqa xmm7,xmm15
- movdqa xmm10,xmm15
- pshufd xmm13,xmm0,0
- punpcklbw xmm9,xmm1
- punpckhbw xmm14,xmm1
- movdqa xmm6,xmm13
- movd xmm0,eax
- movdqa [rsp],xmm11
- mov eax,2
- cwde
- punpckhbw xmm11,xmm1
- punpckhbw xmm10,xmm1
- punpcklbw xmm7,xmm1
- punpcklwd xmm0,xmm0
- punpcklbw xmm8,xmm1
- pshufd xmm3,xmm0,0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm4
- psubw xmm0,xmm9
- psubw xmm1,xmm4
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm12
- movdqa xmm1,xmm11
- psubw xmm0,xmm14
- psubw xmm1,xmm12
- movdqa xmm5,xmm6
- pabsw xmm0,xmm0
- pcmpgtw xmm13,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm2,xmm0
- paddw xmm1,xmm8
- movdqa xmm0,xmm10
- pand xmm13,xmm2
- psubw xmm0,xmm14
- paddw xmm1,xmm4
- movdqa xmm2,xmm11
- pabsw xmm0,xmm0
- paddw xmm2,xmm11
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm12
- movd xmm0,eax
- pand xmm13,xmm3
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm4
- paddw xmm2,xmm3
- psraw xmm1,2
- pand xmm5,xmm1
- por xmm5,xmm0
- paddw xmm7,xmm7
- paddw xmm10,xmm10
- psraw xmm2,2
- movdqa xmm1,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm2
- paddw xmm7,xmm9
- por xmm1,xmm0
- paddw xmm10,xmm14
- paddw xmm7,xmm8
- movdqa xmm0,xmm13
- packuswb xmm5,xmm1
- paddw xmm7,xmm3
- paddw xmm10,xmm11
- movdqa xmm1,xmm6
- paddw xmm10,xmm3
- pandn xmm6,xmm9
- psraw xmm7,2
- pand xmm1,xmm7
- psraw xmm10,2
- pandn xmm13,xmm14
- pand xmm0,xmm10
- por xmm1,xmm6
- movdqa xmm6,[rsp]
- movdqa xmm4,xmm6
- por xmm0,xmm13
- punpcklbw xmm4,xmm5
- punpckhbw xmm6,xmm5
- movdqa xmm3,xmm4
- packuswb xmm1,xmm0
- movdqa xmm0,xmm1
- punpckhbw xmm1,xmm15
- punpcklbw xmm0,xmm15
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm6
- movdqa xmm2,xmm3
- punpcklwd xmm0,xmm1
- punpckhwd xmm6,xmm1
- movdqa xmm1,xmm4
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm6
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm6
- punpckhqdq xmm2,xmm1
- movdqa [rsp+10h],xmm0
- movdqa [rsp+60h],xmm2
- movdqa xmm0,xmm3
- mov eax,[rsp+10h]
- mov [rcx-2],eax
- mov eax,[rsp+60h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [r10+rcx-2],eax
- movdqa [rsp+20h],xmm0
- mov eax, [rsp+20h]
- movdqa [rsp+70h],xmm3
- mov [rcx+r10*2-2],eax
- mov eax,[rsp+70h]
- mov [rdx+rcx-2],eax
- mov eax,[rsp+18h]
- mov [r11],eax
- mov eax,[rsp+68h]
- mov [r10+r11],eax
- mov eax,[rsp+28h]
- mov [r11+r10*2],eax
- mov eax,[rsp+78h]
- mov [rdx+r11],eax
- mov eax,[rsp+14h]
- mov [rdi-2],eax
- mov eax,[rsp+64h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+24h]
- mov [rdi+r10*2-2],eax
- mov eax, [rsp+74h]
- mov [rdx+rdi-2],eax
- mov eax, [rsp+1Ch]
- mov [rbx],eax
- mov eax, [rsp+6Ch]
- mov [r10+rbx],eax
- mov eax,[rsp+2Ch]
- mov [rbx+r10*2],eax
- mov eax,[rsp+7Ch]
- mov [rdx+rbx],eax
- lea r11,[rsp+140h]
- mov rbx, [r11+28h]
+ movdqa xmm2,[rsp+20h]
+ mov [rsp+40h],eax
+ mov eax, [rdi-2]
+ movdqa xmm4,[rsp+30h]
+ mov [rsp+50h],eax
+ mov eax,[r10+rdi-2]
+ movdqa xmm3,[rsp+40h]
+ mov [rsp+60h],eax
+ mov eax,[rdi+r10*2-2]
+ punpckldq xmm5,[rsp+50h]
+ mov [rsp+70h],eax
+ mov eax, [rdx+rdi-2]
+ punpckldq xmm2, [rsp+60h]
+ mov [rsp+80h],eax
+ mov eax,[r11]
+ punpckldq xmm4, [rsp+70h]
+ mov [rsp+50h],eax
+ mov eax,[rbx]
+ punpckldq xmm3,[rsp+80h]
+ mov [rsp+60h],eax
+ mov eax,[r10+r11]
+ movdqa xmm0, [rsp+50h]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm0,[rsp+50h]
+ movdqa xmm1,xmm5
+ mov [rsp+60h],eax
+ mov eax,[r11+r10*2]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[rbx+r10*2]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ mov eax, [rdx+r11]
+ movdqa xmm15,xmm1
+ punpckldq xmm0,[rsp+60h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax, [rdx+rbx]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm15,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm12,xmm15
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm12,xmm0
+ punpckhdq xmm15,xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm11,xmm12
+ punpckldq xmm0,xmm5
+ punpckhdq xmm1,xmm5
+ punpcklqdq xmm11,xmm0
+ punpckhqdq xmm12,xmm0
+ movsx eax,r9w
+ movdqa xmm14,xmm15
+ punpcklqdq xmm14,xmm1
+ punpckhqdq xmm15,xmm1
+ pxor xmm1,xmm1
+ movd xmm0,eax
+ movdqa xmm4,xmm12
+ movdqa xmm8,xmm11
+ mov eax, ebp ; iBeta
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm4,xmm1
+ punpckhbw xmm12,xmm1
+ movdqa xmm9,xmm14
+ movdqa xmm7,xmm15
+ movdqa xmm10,xmm15
+ pshufd xmm13,xmm0,0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm14,xmm1
+ movdqa xmm6,xmm13
+ movd xmm0,eax
+ movdqa [rsp],xmm11
+ mov eax,2
+ cwde
+ punpckhbw xmm11,xmm1
+ punpckhbw xmm10,xmm1
+ punpcklbw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm8,xmm1
+ pshufd xmm3,xmm0,0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm4
+ psubw xmm0,xmm9
+ psubw xmm1,xmm4
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm12
+ movdqa xmm1,xmm11
+ psubw xmm0,xmm14
+ psubw xmm1,xmm12
+ movdqa xmm5,xmm6
+ pabsw xmm0,xmm0
+ pcmpgtw xmm13,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm2,xmm0
+ paddw xmm1,xmm8
+ movdqa xmm0,xmm10
+ pand xmm13,xmm2
+ psubw xmm0,xmm14
+ paddw xmm1,xmm4
+ movdqa xmm2,xmm11
+ pabsw xmm0,xmm0
+ paddw xmm2,xmm11
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm12
+ movd xmm0,eax
+ pand xmm13,xmm3
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm4
+ paddw xmm2,xmm3
+ psraw xmm1,2
+ pand xmm5,xmm1
+ por xmm5,xmm0
+ paddw xmm7,xmm7
+ paddw xmm10,xmm10
+ psraw xmm2,2
+ movdqa xmm1,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm2
+ paddw xmm7,xmm9
+ por xmm1,xmm0
+ paddw xmm10,xmm14
+ paddw xmm7,xmm8
+ movdqa xmm0,xmm13
+ packuswb xmm5,xmm1
+ paddw xmm7,xmm3
+ paddw xmm10,xmm11
+ movdqa xmm1,xmm6
+ paddw xmm10,xmm3
+ pandn xmm6,xmm9
+ psraw xmm7,2
+ pand xmm1,xmm7
+ psraw xmm10,2
+ pandn xmm13,xmm14
+ pand xmm0,xmm10
+ por xmm1,xmm6
+ movdqa xmm6,[rsp]
+ movdqa xmm4,xmm6
+ por xmm0,xmm13
+ punpcklbw xmm4,xmm5
+ punpckhbw xmm6,xmm5
+ movdqa xmm3,xmm4
+ packuswb xmm1,xmm0
+ movdqa xmm0,xmm1
+ punpckhbw xmm1,xmm15
+ punpcklbw xmm0,xmm15
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm3
+ punpcklwd xmm0,xmm1
+ punpckhwd xmm6,xmm1
+ movdqa xmm1,xmm4
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm6
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm6
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+10h],xmm0
+ movdqa [rsp+60h],xmm2
+ movdqa xmm0,xmm3
+ mov eax,[rsp+10h]
+ mov [rcx-2],eax
+ mov eax,[rsp+60h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [r10+rcx-2],eax
+ movdqa [rsp+20h],xmm0
+ mov eax, [rsp+20h]
+ movdqa [rsp+70h],xmm3
+ mov [rcx+r10*2-2],eax
+ mov eax,[rsp+70h]
+ mov [rdx+rcx-2],eax
+ mov eax,[rsp+18h]
+ mov [r11],eax
+ mov eax,[rsp+68h]
+ mov [r10+r11],eax
+ mov eax,[rsp+28h]
+ mov [r11+r10*2],eax
+ mov eax,[rsp+78h]
+ mov [rdx+r11],eax
+ mov eax,[rsp+14h]
+ mov [rdi-2],eax
+ mov eax,[rsp+64h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+24h]
+ mov [rdi+r10*2-2],eax
+ mov eax, [rsp+74h]
+ mov [rdx+rdi-2],eax
+ mov eax, [rsp+1Ch]
+ mov [rbx],eax
+ mov eax, [rsp+6Ch]
+ mov [r10+rbx],eax
+ mov eax,[rsp+2Ch]
+ mov [rbx+r10*2],eax
+ mov eax,[rsp+7Ch]
+ mov [rdx+rbx],eax
+ lea r11,[rsp+140h]
+ mov rbx, [r11+28h]
mov rsp,r11
pop r12
pop rbp
@@ -2962,14 +2962,14 @@
WELS_EXTERN DeblockChromaLt4H_sse2
ALIGN 16
DeblockChromaLt4H_sse2:
- mov rax,rsp
- push rbx
- push rbp
- push r12
+ mov rax,rsp
+ push rbx
+ push rbp
+ push r12
push r13
push r14
- sub rsp,170h
-
+ sub rsp,170h
+
mov r13, r8
mov r14, r9
mov r8, rdx
@@ -2977,275 +2977,275 @@
mov rdx, rdi
mov rcx, rsi
- movsxd rsi,r8d
- lea eax,[r8*4]
- mov r11d,r9d
- movsxd r10,eax
- mov eax, [rcx-2]
- mov r12,rdx
- mov [rsp+40h],eax
- mov eax, [rsi+rcx-2]
- lea rbx,[r10+rcx-2]
- movdqa xmm5,[rsp+40h]
- mov [rsp+50h],eax
- mov eax, [rcx+rsi*2-2]
- lea rbp,[r10+rdx-2]
- movdqa xmm2, [rsp+50h]
- mov [rsp+60h],eax
- lea r10,[rsi+rsi*2]
- mov rdi,rcx
- mov eax,[r10+rcx-2]
- movdqa xmm4,[rsp+60h]
- mov [rsp+70h],eax
- mov eax,[rdx-2]
- mov [rsp+80h],eax
- mov eax, [rsi+rdx-2]
- movdqa xmm3,[rsp+70h]
- mov [rsp+90h],eax
- mov eax,[rdx+rsi*2-2]
- punpckldq xmm5,[rsp+80h]
- mov [rsp+0A0h],eax
- mov eax, [r10+rdx-2]
- punpckldq xmm2,[rsp+90h]
- mov [rsp+0B0h],eax
- mov eax, [rbx]
- punpckldq xmm4,[rsp+0A0h]
- mov [rsp+80h],eax
- mov eax,[rbp]
- punpckldq xmm3,[rsp+0B0h]
- mov [rsp+90h],eax
- mov eax,[rsi+rbx]
- movdqa xmm0,[rsp+80h]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rsi+rbp]
- movdqa xmm0,[rsp+80h]
- movdqa xmm1,xmm5
- mov [rsp+90h],eax
- mov eax,[rbx+rsi*2]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rbp+rsi*2]
- movdqa xmm0, [rsp+80h]
- mov [rsp+90h],eax
- mov eax,[r10+rbx]
- movdqa xmm7,xmm1
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax, [r10+rbp]
- movdqa xmm0,[rsp+80h]
- mov [rsp+90h],eax
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm7,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm6,xmm7
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm6,xmm0
- punpckhdq xmm7,xmm0
- movdqa xmm0,xmm1
- punpckldq xmm0,xmm5
+ movsxd rsi,r8d
+ lea eax,[r8*4]
+ mov r11d,r9d
+ movsxd r10,eax
+ mov eax, [rcx-2]
+ mov r12,rdx
+ mov [rsp+40h],eax
+ mov eax, [rsi+rcx-2]
+ lea rbx,[r10+rcx-2]
+ movdqa xmm5,[rsp+40h]
+ mov [rsp+50h],eax
+ mov eax, [rcx+rsi*2-2]
+ lea rbp,[r10+rdx-2]
+ movdqa xmm2, [rsp+50h]
+ mov [rsp+60h],eax
+ lea r10,[rsi+rsi*2]
+ mov rdi,rcx
+ mov eax,[r10+rcx-2]
+ movdqa xmm4,[rsp+60h]
+ mov [rsp+70h],eax
+ mov eax,[rdx-2]
+ mov [rsp+80h],eax
+ mov eax, [rsi+rdx-2]
+ movdqa xmm3,[rsp+70h]
+ mov [rsp+90h],eax
+ mov eax,[rdx+rsi*2-2]
+ punpckldq xmm5,[rsp+80h]
+ mov [rsp+0A0h],eax
+ mov eax, [r10+rdx-2]
+ punpckldq xmm2,[rsp+90h]
+ mov [rsp+0B0h],eax
+ mov eax, [rbx]
+ punpckldq xmm4,[rsp+0A0h]
+ mov [rsp+80h],eax
+ mov eax,[rbp]
+ punpckldq xmm3,[rsp+0B0h]
+ mov [rsp+90h],eax
+ mov eax,[rsi+rbx]
+ movdqa xmm0,[rsp+80h]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rsi+rbp]
+ movdqa xmm0,[rsp+80h]
+ movdqa xmm1,xmm5
+ mov [rsp+90h],eax
+ mov eax,[rbx+rsi*2]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rbp+rsi*2]
+ movdqa xmm0, [rsp+80h]
+ mov [rsp+90h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm7,xmm1
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax, [r10+rbp]
+ movdqa xmm0,[rsp+80h]
+ mov [rsp+90h],eax
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm7,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm6,xmm7
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm6,xmm0
+ punpckhdq xmm7,xmm0
+ movdqa xmm0,xmm1
+ punpckldq xmm0,xmm5
mov rax, r14 ; pTC
- punpckhdq xmm1,xmm5
- movdqa xmm9,xmm6
- punpckhqdq xmm6,xmm0
- punpcklqdq xmm9,xmm0
- movdqa xmm2,xmm7
- movdqa xmm13,xmm6
- movdqa xmm4,xmm9
- movdqa [rsp+10h],xmm9
- punpcklqdq xmm2,xmm1
- punpckhqdq xmm7,xmm1
- pxor xmm1,xmm1
- movsx ecx,byte [rax+3]
- movsx edx,byte [rax+2]
- movsx r8d,byte [rax+1]
- movsx r9d,byte [rax]
- movdqa xmm10,xmm1
- movdqa xmm15,xmm2
- punpckhbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm4,xmm1
- movsx eax,r11w
- mov word [rsp+0Eh],cx
- mov word [rsp+0Ch],cx
- movdqa xmm3,xmm7
- movdqa xmm8,xmm7
- movdqa [rsp+20h],xmm7
- punpcklbw xmm15,xmm1
- punpcklbw xmm13,xmm1
- punpcklbw xmm3,xmm1
- mov word [rsp+0Ah],dx
- mov word [rsp+8],dx
- mov word [rsp+6],r8w
- movd xmm0,eax
- movdqa [rsp+30h],xmm6
- punpckhbw xmm9,xmm1
- punpckhbw xmm8,xmm1
- punpcklwd xmm0,xmm0
+ punpckhdq xmm1,xmm5
+ movdqa xmm9,xmm6
+ punpckhqdq xmm6,xmm0
+ punpcklqdq xmm9,xmm0
+ movdqa xmm2,xmm7
+ movdqa xmm13,xmm6
+ movdqa xmm4,xmm9
+ movdqa [rsp+10h],xmm9
+ punpcklqdq xmm2,xmm1
+ punpckhqdq xmm7,xmm1
+ pxor xmm1,xmm1
+ movsx ecx,byte [rax+3]
+ movsx edx,byte [rax+2]
+ movsx r8d,byte [rax+1]
+ movsx r9d,byte [rax]
+ movdqa xmm10,xmm1
+ movdqa xmm15,xmm2
+ punpckhbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm4,xmm1
+ movsx eax,r11w
+ mov word [rsp+0Eh],cx
+ mov word [rsp+0Ch],cx
+ movdqa xmm3,xmm7
+ movdqa xmm8,xmm7
+ movdqa [rsp+20h],xmm7
+ punpcklbw xmm15,xmm1
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm3,xmm1
+ mov word [rsp+0Ah],dx
+ mov word [rsp+8],dx
+ mov word [rsp+6],r8w
+ movd xmm0,eax
+ movdqa [rsp+30h],xmm6
+ punpckhbw xmm9,xmm1
+ punpckhbw xmm8,xmm1
+ punpcklwd xmm0,xmm0
mov eax, r13d ; iBeta
- mov word [rsp+4],r8w
- mov word [rsp+2],r9w
- pshufd xmm12,xmm0,0
- mov word [rsp],r9w
- movd xmm0,eax
- mov eax,4
- cwde
- movdqa xmm14, [rsp]
- movdqa [rsp],xmm2
- movdqa xmm2,xmm12
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- psubw xmm10,xmm14
- movd xmm0,eax
- movdqa xmm7,xmm14
- movdqa xmm6,xmm14
- pcmpgtw xmm7,xmm1
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- movdqa xmm0,xmm4
- movdqa xmm1,xmm15
- psubw xmm4,xmm13
- psubw xmm0,xmm3
- psubw xmm1,xmm13
- psubw xmm3,xmm15
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm10
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm11
- movdqa xmm0,xmm13
- psubw xmm0,xmm15
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- pand xmm2,xmm1
- movdqa xmm1,xmm11
- movdqa xmm3,[rsp+30h]
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm9
- pand xmm2,xmm1
- psubw xmm0,xmm8
- psubw xmm9,xmm3
- pand xmm2,xmm7
- pand xmm6,xmm2
- psubw xmm15,xmm6
- paddw xmm13,xmm6
- movdqa xmm2,[rsp]
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- psubw xmm8,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm3
- movdqa xmm5,[rsp+10h]
- psubw xmm0,xmm2
- psraw xmm1,3
- movdqa xmm4,xmm5
- pabsw xmm0,xmm0
- pmaxsw xmm10,xmm1
- movdqa xmm1,xmm11
- pcmpgtw xmm12,xmm0
- pabsw xmm0,xmm9
- pminsw xmm14,xmm10
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm8
- pcmpgtw xmm11,xmm0
- pand xmm12,xmm1
- movdqa xmm1,[rsp+20h]
- pand xmm12,xmm11
- pand xmm12,xmm7
- pand xmm14,xmm12
- paddw xmm3,xmm14
- psubw xmm2,xmm14
- packuswb xmm13,xmm3
- packuswb xmm15,xmm2
- punpcklbw xmm4,xmm13
- punpckhbw xmm5,xmm13
- movdqa xmm0,xmm15
- punpcklbw xmm0,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm4
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm5
- movdqa xmm2,xmm3
- movdqa xmm1,xmm4
- punpcklwd xmm0,xmm15
- punpckhwd xmm5,xmm15
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm5
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm5
- punpckhqdq xmm2,xmm1
- movdqa [rsp+40h],xmm0
- movdqa xmm0,xmm3
- movdqa [rsp+90h],xmm2
- mov eax,[rsp+40h]
- mov [rdi-2],eax
- mov eax, [rsp+90h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [rsi+rdi-2],eax
- movdqa [rsp+50h],xmm0
- mov eax,[rsp+50h]
- movdqa [rsp+0A0h],xmm3
- mov [rdi+rsi*2-2],eax
- mov eax,[rsp+0A0h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+48h]
- mov [rbx],eax
- mov eax,[rsp+98h]
- mov [rsi+rbx],eax
- mov eax,[rsp+58h]
- mov [rbx+rsi*2],eax
- mov eax, [rsp+0A8h]
- mov [r10+rbx],eax
- mov eax, [rsp+44h]
- mov [r12-2],eax
- mov eax,[rsp+94h]
- mov [rsi+r12-2],eax
- mov eax,[rsp+54h]
- mov [r12+rsi*2-2],eax
- mov eax, [rsp+0A4h]
- mov [r10+r12-2],eax
- mov eax,[rsp+4Ch]
- mov [rbp],eax
- mov eax,[rsp+9Ch]
- mov [rsi+rbp],eax
- mov eax, [rsp+5Ch]
- mov [rbp+rsi*2],eax
- mov eax,[rsp+0ACh]
- mov [r10+rbp],eax
- lea r11,[rsp+170h]
- mov rsp,r11
+ mov word [rsp+4],r8w
+ mov word [rsp+2],r9w
+ pshufd xmm12,xmm0,0
+ mov word [rsp],r9w
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ movdqa xmm14, [rsp]
+ movdqa [rsp],xmm2
+ movdqa xmm2,xmm12
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ psubw xmm10,xmm14
+ movd xmm0,eax
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pcmpgtw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm15
+ psubw xmm4,xmm13
+ psubw xmm0,xmm3
+ psubw xmm1,xmm13
+ psubw xmm3,xmm15
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm10
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm11
+ movdqa xmm0,xmm13
+ psubw xmm0,xmm15
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ pand xmm2,xmm1
+ movdqa xmm1,xmm11
+ movdqa xmm3,[rsp+30h]
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm9
+ pand xmm2,xmm1
+ psubw xmm0,xmm8
+ psubw xmm9,xmm3
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ psubw xmm15,xmm6
+ paddw xmm13,xmm6
+ movdqa xmm2,[rsp]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ psubw xmm8,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm3
+ movdqa xmm5,[rsp+10h]
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ movdqa xmm4,xmm5
+ pabsw xmm0,xmm0
+ pmaxsw xmm10,xmm1
+ movdqa xmm1,xmm11
+ pcmpgtw xmm12,xmm0
+ pabsw xmm0,xmm9
+ pminsw xmm14,xmm10
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm8
+ pcmpgtw xmm11,xmm0
+ pand xmm12,xmm1
+ movdqa xmm1,[rsp+20h]
+ pand xmm12,xmm11
+ pand xmm12,xmm7
+ pand xmm14,xmm12
+ paddw xmm3,xmm14
+ psubw xmm2,xmm14
+ packuswb xmm13,xmm3
+ packuswb xmm15,xmm2
+ punpcklbw xmm4,xmm13
+ punpckhbw xmm5,xmm13
+ movdqa xmm0,xmm15
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm4
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm2,xmm3
+ movdqa xmm1,xmm4
+ punpcklwd xmm0,xmm15
+ punpckhwd xmm5,xmm15
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm5
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm5
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+40h],xmm0
+ movdqa xmm0,xmm3
+ movdqa [rsp+90h],xmm2
+ mov eax,[rsp+40h]
+ mov [rdi-2],eax
+ mov eax, [rsp+90h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [rsi+rdi-2],eax
+ movdqa [rsp+50h],xmm0
+ mov eax,[rsp+50h]
+ movdqa [rsp+0A0h],xmm3
+ mov [rdi+rsi*2-2],eax
+ mov eax,[rsp+0A0h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+48h]
+ mov [rbx],eax
+ mov eax,[rsp+98h]
+ mov [rsi+rbx],eax
+ mov eax,[rsp+58h]
+ mov [rbx+rsi*2],eax
+ mov eax, [rsp+0A8h]
+ mov [r10+rbx],eax
+ mov eax, [rsp+44h]
+ mov [r12-2],eax
+ mov eax,[rsp+94h]
+ mov [rsi+r12-2],eax
+ mov eax,[rsp+54h]
+ mov [r12+rsi*2-2],eax
+ mov eax, [rsp+0A4h]
+ mov [r10+r12-2],eax
+ mov eax,[rsp+4Ch]
+ mov [rbp],eax
+ mov eax,[rsp+9Ch]
+ mov [rsi+rbp],eax
+ mov eax, [rsp+5Ch]
+ mov [rbp+rsi*2],eax
+ mov eax,[rsp+0ACh]
+ mov [r10+rbp],eax
+ lea r11,[rsp+170h]
+ mov rsp,r11
pop r14
pop r13
- pop r12
- pop rbp
- pop rbx
- ret
+ pop r12
+ pop rbp
+ pop rbx
+ ret
@@ -5162,7 +5162,7 @@
mov esp, ebp
pop ebp
ret
-
+
%endif
@@ -5178,16 +5178,16 @@
ALIGN 16
DeblockLumaTransposeH2V_sse2:
- push r3
- push r4
+ push r3
+ push r4
push r5
-%assign push_num 3
- LOAD_3_PARA
+%assign push_num 3
+ LOAD_3_PARA
SIGN_EXTENTION r1, r1d
- mov r5, r7
+ mov r5, r7
mov r3, r7
and r3, 0Fh
sub r7, r3
@@ -5229,7 +5229,7 @@
SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
+
movdqa [r2], xmm4
movdqa [r2 + 10h], xmm2
movdqa [r2 + 20h], xmm3
@@ -5258,17 +5258,17 @@
DeblockLumaTransposeV2H_sse2:
push r3
- push r4
+ push r4
%assign push_num 2
LOAD_3_PARA
- SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r1, r1d
mov r4, r7
- mov r3, r7
+ mov r3, r7
and r3, 0Fh
- sub r7, r3
+ sub r7, r3
sub r7, 10h
movdqa xmm0, [r2]
--- a/codec/common/expand_picture.asm
+++ b/codec/common/expand_picture.asm
@@ -244,7 +244,7 @@
%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
;r6 [height]
;r0 [pSrc+0] r5[pSrc-32] r1[stride]
- ;r3 [pSrc+(w-1)] r4[pSrc+w]
+ ;r3 [pSrc+(w-1)] r4[pSrc+w]
%if %1 == 32 ; for luma
.left_right_loops:
@@ -375,13 +375,13 @@
%assign push_num 3
LOAD_4_PARA
-
+
SIGN_EXTENTION r1, r1d
SIGN_EXTENTION r2, r2d
SIGN_EXTENTION r3, r3d
;also prepare for cross border pData top-left:xmm3
-
+
movzx r6d,byte[r0]
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
@@ -395,22 +395,22 @@
dec r3 ;h-1
imul r3,r1 ;(h-1)*stride
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
-
+
mov r6,r1 ;r6 = stride
sal r6,05h ;r6 = 32*stride
lea r4,[r3+r6] ;r4 = dst bottom
-
+
;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-
+
movzx r6d,byte [r3] ;bottom-left
SSE2_Copy16Times xmm5,r6d
-
+
lea r6,[r3+r2-1]
movzx r6d,byte [r6]
SSE2_Copy16Times xmm6,r6d ;bottom-right
-
+
neg r1 ;r1 = -stride
-
+
push r0
push r1
push r2
@@ -419,13 +419,13 @@
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
+
pop r2
pop r1
pop r0
lea r5,[r0-32] ;left border dst luma =32 chroma = -16
-
+
lea r3,[r0+r2-1] ;right border src
lea r4,[r3+1] ;right border dst
@@ -432,7 +432,7 @@
;prepare for cross border data: top-rigth with xmm4
movzx r6d,byte [r3] ;top -rigth
SSE2_Copy16Times xmm4,r6d
-
+
neg r1 ;r1 = stride
@@ -444,7 +444,7 @@
push r1
push r2
push r6
-
+
exp_left_right_sse2 32,a
pop r6
@@ -455,22 +455,22 @@
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-
+
neg r1 ;r1 = -stride
lea r3,[r0-32]
lea r3,[r3+r1] ;last line of top-left border
-
+
lea r4,[r0+r2] ;psrc +width
lea r4,[r4+r1] ;psrc +width -stride
-
-
+
+
neg r1 ;r1 = stride
add r6,32 ;height +32(16) ,luma = 32, chroma = 16
imul r6,r1
-
+
lea r5,[r3+r6] ;last line of bottom-left border
lea r6,[r4+r6] ;last line of botoom-right border
-
+
neg r1 ; r1 = -stride
; for left & right border expanding
@@ -477,11 +477,11 @@
exp_cross_sse2 32,a
LOAD_4_PARA_POP
-
+
pop r6
pop r5
pop r4
-
+
%assign push_num 0
@@ -495,7 +495,7 @@
; const int32_t iHeight );
;***********************************************************************----------------
ExpandPictureChromaAlign_sse2:
-
+
push r4
push r5
push r6
@@ -508,7 +508,7 @@
SIGN_EXTENTION r3,r3d
;also prepare for cross border pData top-left:xmm3
-
+
movzx r6d,byte [r0]
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
@@ -522,24 +522,24 @@
dec r3 ;h-1
imul r3,r1 ;(h-1)*stride
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
-
+
mov r6,r1 ;r6 = stride
sal r6,04h ;r6 = 32*stride
- lea r4,[r3+r6] ;r4 = dst bottom
-
+ lea r4,[r3+r6] ;r4 = dst bottom
+
;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-
+
movzx r6d,byte [r3] ;bottom-left
SSE2_Copy16Times xmm5,r6d
-
+
lea r6,[r3+r2-1]
movzx r6d,byte [r6]
SSE2_Copy16Times xmm6,r6d ;bottom-right
-
+
neg r1 ;r1 = -stride
-
+
push r0
- push r1
+ push r1
push r2
exp_top_bottom_sse2 16
@@ -546,20 +546,20 @@
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
+
pop r2
pop r1
pop r0
lea r5,[r0-16] ;left border dst luma =32 chroma = -16
-
- lea r3,[r0+r2-1] ;right border src
+
+ lea r3,[r0+r2-1] ;right border src
lea r4,[r3+1] ;right border dst
;prepare for cross border data: top-rigth with xmm4
movzx r6d,byte [r3] ;top -rigth
SSE2_Copy16Times xmm4,r6d
-
+
neg r1 ;r1 = stride
@@ -568,7 +568,7 @@
push r0
- push r1
+ push r1
push r2
push r6
exp_left_right_sse2 16,a
@@ -581,22 +581,22 @@
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-
+
neg r1 ;r1 = -stride
lea r3,[r0-16]
lea r3,[r3+r1] ;last line of top-left border
-
+
lea r4,[r0+r2] ;psrc +width
- lea r4,[r4+r1] ;psrc +width -stride
-
-
+ lea r4,[r4+r1] ;psrc +width -stride
+
+
neg r1 ;r1 = stride
add r6,16 ;height +32(16) ,luma = 32, chroma = 16
imul r6,r1
-
+
lea r5,[r3+r6] ;last line of bottom-left border
lea r6,[r4+r6] ;last line of botoom-right border
-
+
neg r1 ; r1 = -stride
; for left & right border expanding
@@ -603,11 +603,11 @@
exp_cross_sse2 16,a
LOAD_4_PARA_POP
-
+
pop r6
pop r5
pop r4
-
+
%assign push_num 0
@@ -633,7 +633,7 @@
SIGN_EXTENTION r3,r3d
;also prepare for cross border pData top-left:xmm3
-
+
movzx r6d,byte [r0]
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
@@ -647,24 +647,24 @@
dec r3 ;h-1
imul r3,r1 ;(h-1)*stride
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
-
+
mov r6,r1 ;r6 = stride
sal r6,04h ;r6 = 32*stride
- lea r4,[r3+r6] ;r4 = dst bottom
-
+ lea r4,[r3+r6] ;r4 = dst bottom
+
;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-
+
movzx r6d,byte [r3] ;bottom-left
SSE2_Copy16Times xmm5,r6d
-
+
lea r6,[r3+r2-1]
movzx r6d,byte [r6]
SSE2_Copy16Times xmm6,r6d ;bottom-right
-
+
neg r1 ;r1 = -stride
-
+
push r0
- push r1
+ push r1
push r2
exp_top_bottom_sse2 16
@@ -671,20 +671,20 @@
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
+
pop r2
pop r1
pop r0
lea r5,[r0-16] ;left border dst luma =32 chroma = -16
-
- lea r3,[r0+r2-1] ;right border src
+
+ lea r3,[r0+r2-1] ;right border src
lea r4,[r3+1] ;right border dst
;prepare for cross border data: top-rigth with xmm4
movzx r6d,byte [r3] ;top -rigth
SSE2_Copy16Times xmm4,r6d
-
+
neg r1 ;r1 = stride
@@ -693,7 +693,7 @@
push r0
- push r1
+ push r1
push r2
push r6
exp_left_right_sse2 16,u
@@ -706,22 +706,22 @@
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-
+
neg r1 ;r1 = -stride
lea r3,[r0-16]
lea r3,[r3+r1] ;last line of top-left border
-
+
lea r4,[r0+r2] ;psrc +width
- lea r4,[r4+r1] ;psrc +width -stride
-
-
+ lea r4,[r4+r1] ;psrc +width -stride
+
+
neg r1 ;r1 = stride
add r6,16 ;height +32(16) ,luma = 32, chroma = 16
imul r6,r1
-
+
lea r5,[r3+r6] ;last line of bottom-left border
lea r6,[r4+r6] ;last line of botoom-right border
-
+
neg r1 ; r1 = -stride
; for left & right border expanding
@@ -728,13 +728,12 @@
exp_cross_sse2 16,u
LOAD_4_PARA_POP
-
+
pop r6
pop r5
pop r4
-
+
%assign push_num 0
ret
-
\ No newline at end of file
--- a/codec/common/mb_copy.asm
+++ b/codec/common/mb_copy.asm
@@ -121,7 +121,7 @@
movdqa [r0+r4], xmm7
LOAD_4_PARA_POP
pop r5
- pop r4
+ pop r4
ret
;***********************************************************************
@@ -141,7 +141,7 @@
;mov eax, [esp+20] ; iStrideD
;mov esi, [esp+24] ; Src
;mov ecx, [esp+28] ; iStrideS
-
+
push r4
push r5
%assign push_num 2
@@ -213,7 +213,7 @@
;mov eax, [esp+20] ; iStrideD
;mov esi, [esp+24] ; Src
;mov ecx, [esp+28] ; iStrideS
-
+
push r4
push r5
%assign push_num 2
@@ -243,7 +243,7 @@
movdqa [r0+r4], xmm7
LOAD_4_PARA_POP
pop r5
- pop r4
+ pop r4
ret
@@ -261,7 +261,7 @@
;mov ecx, [esp + 12] ;iStrideD
;mov ebx, [esp + 16] ;Src
;mov edx, [esp + 20] ;iStrideS
-
+
%assign push_num 0
LOAD_4_PARA
@@ -277,7 +277,7 @@
movq mm6, [r2]
movq mm7, [r2+r3]
lea r2, [r2+2*r3]
-
+
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
@@ -302,7 +302,7 @@
lea r2, [r2+2*r3]
movq mm6, [r2]
movq mm7, [r2+r3]
-
+
movq [r0], mm0
movq [r0+r1], mm1
lea r0, [r0+2*r1]
@@ -333,7 +333,7 @@
;mov ecx, [esp + 16] ;iStrideD
;mov esi, [esp + 20] ;Src
;mov ebx, [esp + 24] ;iStrideS
-
+
push r4
%assign push_num 1
LOAD_4_PARA
@@ -374,7 +374,7 @@
WELSEMMS
;pop esi
- ;pop ebx
+ ;pop ebx
LOAD_4_PARA_POP
pop r4
ret
@@ -388,7 +388,7 @@
%assign push_num 0
LOAD_2_PARA
-
+
;mov eax, [esp+4] ; mv_buffer
;movd xmm0, [esp+8] ; _mv
movd xmm0, r1d ; _mv
@@ -438,7 +438,7 @@
; int iHeight );
;*******************************************************************************
PixelAvgWidthEq4_mmx:
-
+
%assign push_num 0
LOAD_7_PARA
@@ -487,7 +487,7 @@
;mov ebp, [esp+36] ; pSrcB
;mov edx, [esp+40] ; iSrcBStride
;mov ebx, [esp+44] ; iHeight
-
+
%assign push_num 0
LOAD_7_PARA
@@ -497,7 +497,7 @@
movsx r5, r5d
movsx r6, r6d
%endif
-
+
ALIGN 4
.height_loop:
movq mm0, [r2]
@@ -528,7 +528,7 @@
; int iHeight );
;*******************************************************************************
PixelAvgWidthEq16_sse2:
-
+
%assign push_num 0
LOAD_7_PARA
%ifndef X86_32
@@ -567,7 +567,7 @@
lea r2, [r2+2*r3]
lea r4, [r4+2*r5]
lea r0, [r0+2*r1]
-
+
sub r6, 4
jne .height_loop
@@ -591,17 +591,17 @@
;mov edi, [esp+24]
;mov ecx, [esp+28]
;mov edx, [esp+32]
-
+
push r5
%assign push_num 1
LOAD_5_PARA
-
+
%ifndef X86_32
movsx r1, r1d
movsx r3, r3d
movsx r4, r4d
%endif
-
+
ALIGN 4
.height_loop:
mov r5d, [r0]
@@ -629,7 +629,7 @@
;mov edi, [esp+20]
;mov ecx, [esp+24]
;mov edx, [esp+28]
-
+
%assign push_num 0
LOAD_5_PARA
--- a/codec/common/mc_chroma.asm
+++ b/codec/common/mc_chroma.asm
@@ -79,17 +79,17 @@
;push esi
;push edi
;push ebx
-
+
%assign push_num 0
- LOAD_6_PARA
+ LOAD_6_PARA
%ifndef X86_32
movsx r1, r1d
movsx r3, r3d
movsx r5, r5d
%endif
-
+
;mov eax, [esp +12 + 20]
-
+
movd mm3, [r4]; [eax]
WELS_Zero mm7
punpcklbw mm3, mm3
@@ -173,7 +173,7 @@
;push ebx
%assign push_num 0
- LOAD_6_PARA
+ LOAD_6_PARA
%ifndef X86_32
movsx r1, r1d
movsx r3, r3d
@@ -241,7 +241,7 @@
dec r5
jnz near .xloop
-
+
LOAD_6_PARA_POP
;pop ebx
@@ -273,7 +273,7 @@
movsx r3, r3d
movsx r5, r5d
%endif
-
+
;mov eax, [esp + 12 + 20]
pxor xmm7, xmm7
@@ -333,9 +333,9 @@
sub r5, 2
jnz .hloop_chroma
-
+
LOAD_6_PARA_POP
-
+
;pop edi
;pop esi
;pop ebx
--- a/codec/common/mc_luma.asm
+++ b/codec/common/mc_luma.asm
@@ -56,7 +56,7 @@
ALIGN 16
h264_w0x10:
- dw 16, 16, 16, 16
+ dw 16, 16, 16, 16
ALIGN 16
h264_w0x10_1:
dw 16, 16, 16, 16, 16, 16, 16, 16
@@ -91,7 +91,7 @@
;mov edi, [esp+20]
;mov ecx, [esp+24]
;mov edx, [esp+28]
-
+
%assign push_num 0
LOAD_5_PARA
%ifndef X86_32
@@ -99,7 +99,7 @@
movsx r3, r3d
movsx r4, r4d
%endif
-
+
sub r0, 2
WELS_Zero mm7
movq mm6, [h264_w0x10]
@@ -195,7 +195,7 @@
;mov edi, [esp+24] ;pDst
;mov edx, [esp+28] ;iDstStride
;mov ebx, [esp+32] ;iHeight
-
+
%assign push_num 0
LOAD_5_PARA
%ifndef X86_32
@@ -257,7 +257,7 @@
;mov edi, [esp + 20] ;pDst
;mov ecx, [esp + 28] ;iHeight
;mov edx, [esp + 24] ;iDstStride
-
+
%assign push_num 0
LOAD_5_PARA
%ifndef X86_32
@@ -322,7 +322,7 @@
;mov edi, [esp + 20] ;pDst
;mov ecx, [esp + 28] ;iHeight
;mov edx, [esp + 24] ;iDstStride
-
+
%assign push_num 0
LOAD_5_PARA
%ifndef X86_32
@@ -392,7 +392,7 @@
lea r0, [r0+r1]
dec r4
jnz near .y_loop
-
+
LOAD_5_PARA_POP
ret
@@ -523,7 +523,7 @@
;mov eax, [esp + 28]
;mov ecx, [esp + 36]
;mov ebx, [esp + 32]
-
+
%assign push_num 0
LOAD_6_PARA
%ifndef X86_32
@@ -532,7 +532,7 @@
movsx r4, r4d
movsx r5, r5d
%endif
-
+
%ifndef X86_32
push r12
push r13
@@ -541,7 +541,7 @@
mov r13, r2
mov r14, r5
%endif
-
+
shr r4, 3
sub r0, r1
sub r0, r1
@@ -639,7 +639,7 @@
sub r0, r1
sub r0, r1
add r0, 8
- add r2, 8
+ add r2, 8
jmp near .xloop
.xx_exit:
@@ -680,7 +680,7 @@
movsx r3, r3d
movsx r4, r4d
movsx r5, r5d
-%endif
+%endif
sub r0, 2
pxor xmm7, xmm7
@@ -842,7 +842,7 @@
;mov edx, [esp+28]
;mov ecx, [esp+32]
;mov ebx, [esp+36]
-
+
%assign push_num 0
LOAD_6_PARA
%ifndef X86_32
@@ -850,7 +850,7 @@
movsx r3, r3d
movsx r4, r4d
movsx r5, r5d
-%endif
+%endif
pxor xmm7, xmm7
sub r0, r1 ;;;;;;;;need more 5 lines.
sub r0, r1
@@ -1023,7 +1023,7 @@
;mov edx, [esp+32]
;mov ebx, [esp+36]
;mov ecx, [esp+40]
-
+
%assign push_num 0
LOAD_6_PARA
%ifndef X86_32
--- a/codec/common/vaa.asm
+++ b/codec/common/vaa.asm
@@ -160,7 +160,7 @@
AnalysisVaaInfoIntra_sse2:
%assign push_num 0
- LOAD_2_PARA
+ LOAD_2_PARA
SIGN_EXTENTION r1,r1d
%ifdef X86_32
@@ -175,16 +175,16 @@
and r5,0fh
sub r7,r5
sub r7,32
-
-
- mov r2,r1
+
+
+ mov r2,r1
sal r2,$1 ;r2 = 2*iLineSize
mov r3,r2
add r3,r1 ;r3 = 3*iLineSize
-
+
mov r4,r2
sal r4,$1 ;r4 = 4*iLineSize
-
+
pxor xmm7, xmm7
; loops
@@ -225,8 +225,8 @@
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
-
-
+
+
movd r2d, xmm0
and r2, 0ffffh ; effective low work truncated
mov r3, r2
@@ -234,7 +234,7 @@
sar r2, $4
movd retrd, xmm1
sub retrd, r2d
-
+
add r7,32
add r7,r5
@@ -244,7 +244,7 @@
pop r4
pop r3
%endif
-
+
ret
WELS_EXTERN AnalysisVaaInfoIntra_ssse3
@@ -255,7 +255,7 @@
AnalysisVaaInfoIntra_ssse3:
%assign push_num 0
- LOAD_2_PARA
+ LOAD_2_PARA
SIGN_EXTENTION r1,r1d
%ifdef X86_32
@@ -265,41 +265,41 @@
push r6
%assign push_num push_num+4
%endif
-
+
mov r5,r7
and r5,0fh
sub r7,r5
sub r7,32
-
- mov r2,r1
+
+ mov r2,r1
sal r2,$1 ;r2 = 2*iLineSize
mov r3,r2
add r3,r1 ;r3 = 3*iLineSize
-
+
mov r4,r2
sal r4,$1 ;r4 = 4*iLineSize
-
+
pxor xmm7, xmm7
; loops
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7],xmm0
-
+
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+8],xmm1
-
-
+
+
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+16],xmm0
-
+
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+24],xmm1
-
-
+
+
movdqa xmm0,[r7]
movdqa xmm1,[r7+16]
movdqa xmm2, xmm0
@@ -322,7 +322,7 @@
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
-
+
movd r2d, xmm0
and r2, 0ffffh ; effective low work truncated
mov r3, r2
@@ -339,7 +339,7 @@
pop r4
pop r3
%endif
-
+
ret
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
@@ -368,7 +368,7 @@
paddd xmm3, xmm4
movd r0d, xmm3
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
-
+
jb near .threshold_exit
pshufd xmm0, xmm0, 01Bh
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
@@ -412,7 +412,7 @@
paddd xmm4, xmm5
pshufd xmm5, xmm4, 0B1h
paddd xmm5, xmm4
-
+
movd r0d, xmm5
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
jb near .threshold_exit
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -477,7 +477,7 @@
SSE2_Copy8Times xmm4, r2d ; mm4 = c,c,c,c,c,c,c,c
;mov esi, [esp + pushsize + 4]
- mov r0, r4
+ mov r0, r4
add r3, 16
imul r2, -3
add r3, r2 ; s = a + 16 + (-3)*c
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -186,7 +186,7 @@
movsx r1, r1d
movsx r3, r3d
%endif
-; mov eax, [pDct ]
+; mov eax, [pDct ]
movq mm0, [r4+ 0]
movq mm1, [r4+ 8]
movq mm2, [r4+16]
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
;* memzero.asm
;*
;* Abstract
-;*
;*
+;*
;* History
;* 9/16/2009 Created
;*
@@ -45,8 +45,8 @@
; Code
;***********************************************************************
-SECTION .text
-
+SECTION .text
+
ALIGN 16
;***********************************************************************
;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
LOAD_1_PARA
;mov eax,[esp+4]
prefetchnta [r0]
- ret
+ ret
ALIGN 16
@@ -71,7 +71,7 @@
LOAD_2_PARA
SIGN_EXTENTION r1, r1d
neg r1
-
+
pxor xmm0, xmm0
.memzeroa64_sse2_loops:
movdqa [r0], xmm0
@@ -79,12 +79,12 @@
movdqa [r0+32], xmm0
movdqa [r0+48], xmm0
add r0, 0x40
-
+
add r1, 0x40
jnz near .memzeroa64_sse2_loops
-
- ret
+ ret
+
ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -96,7 +96,7 @@
LOAD_2_PARA
SIGN_EXTENTION r1, r1d
neg r1
-
+
pxor mm0, mm0
.memzero64_mmx_loops:
movq [r0], mm0
@@ -106,16 +106,16 @@
movq [r0+32], mm0
movq [r0+40], mm0
movq [r0+48], mm0
- movq [r0+56], mm0
+ movq [r0+56], mm0
add r0, 0x40
-
+
add r1, 0x40
jnz near .memzero64_mmx_loops
-
- WELSEMMS
- ret
-
-ALIGN 16
+
+ WELSEMMS
+ ret
+
+ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
;***********************************************************************
@@ -125,17 +125,17 @@
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENTION r1, r1d
- neg r1
+ neg r1
pxor mm0, mm0
-
+
.memzero8_mmx_loops:
movq [r0], mm0
add r0, 0x08
-
+
add r1, 0x08
jnz near .memzero8_mmx_loops
-
- WELSEMMS
- ret
-
+ WELSEMMS
+ ret
+
+
--- a/codec/encoder/core/asm/satd_sad.asm
+++ b/codec/encoder/core/asm/satd_sad.asm
@@ -163,7 +163,7 @@
;mov ebx, [esp+12]
;mov ecx, [esp+16]
;mov edx, [esp+20]
-
+
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
@@ -243,11 +243,11 @@
;mov ebx, [esp+12]
;mov ecx, [esp+16]
;mov edx, [esp+20]
-
+
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
pxor xmm6, xmm6
pxor xmm7, xmm7
SSE2_GetSatd8x8
@@ -270,11 +270,11 @@
;mov ebx, [esp+12]
;mov ecx, [esp+16]
;mov edx, [esp+20]
-
+
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
pxor xmm6, xmm6
pxor xmm7, xmm7
@@ -302,18 +302,18 @@
;mov ebx, [esp+12]
;mov ecx, [esp+16]
;mov edx, [esp+20]
-
+
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
push r0
- push r2
+ push r2
pxor xmm6, xmm6
pxor xmm7, xmm7
SSE2_GetSatd8x8
-
+
pop r2
pop r0
;mov eax, [esp+8]
@@ -341,13 +341,13 @@
;mov ebx, [esp+12]
;mov ecx, [esp+16]
;mov edx, [esp+20]
-
+
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
SIGN_EXTENTION r3, r3d
push r0
- push r2
+ push r2
pxor xmm6, xmm6
pxor xmm7, xmm7
@@ -1027,11 +1027,11 @@
;mov ebx,[esp+12]
;mov ecx,[esp+16]
;mov edx,[esp+20]
-
+
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
movdqa xmm4,[HSwapSumSubDB1]
movd xmm2,[r2]
movd xmm5,[r2+r3]
@@ -1089,14 +1089,14 @@
;mov ebx, [esp+20]
;mov ecx, [esp+24]
;mov edx, [esp+28]
-%ifdef X86_32
+%ifdef X86_32
push r4
push r5
-%endif
+%endif
%assign push_num 2
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
movdqa xmm7, [HSumSubDB1]
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
@@ -1130,15 +1130,15 @@
;mov ebx, [esp+pushsize+8]
;mov ecx, [esp+pushsize+12]
;mov edx, [esp+pushsize+16]
-%ifdef X86_32
+%ifdef X86_32
push r4
push r5
push r6
-%endif
+%endif
%assign push_num 3
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
movdqa xmm7, [HSumSubDB1]
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
@@ -1175,17 +1175,17 @@
;mov ebx, [esp+20]
;mov ecx, [esp+24]
;mov edx, [esp+28]
-%ifdef X86_32
+%ifdef X86_32
push r4
push r5
-%endif
+%endif
%assign push_num 2
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
push r0
push r2
-
+
movdqa xmm7, [HSumSubDB1]
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
@@ -1194,7 +1194,7 @@
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
SSE41_GetSatd8x4
-
+
pop r2
pop r0
;mov eax, [esp+16]
@@ -1231,19 +1231,19 @@
;mov ebx, [esp+pushsize+8]
;mov ecx, [esp+pushsize+12]
;mov edx, [esp+pushsize+16]
-%ifdef X86_32
+%ifdef X86_32
push r4
push r5
push r6
-%endif
+%endif
%assign push_num 3
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
SIGN_EXTENTION r3, r3d
-
+
push r0
push r2
-
+
movdqa xmm7, [HSumSubDB1]
lea r4, [r1+r1*2]
lea r5, [r3+r3*2]
@@ -1258,7 +1258,7 @@
jl loop_get_satd_16x16_left
pop r2
- pop r0
+ pop r0
;mov eax, [esp+pushsize+4]
;mov ecx, [esp+pushsize+12]
add r0, 8
@@ -1360,13 +1360,13 @@
;push esi
;%define _STACK_SIZE 12
;mov eax, [esp+_STACK_SIZE+4 ]
- ;mov ebx, [esp+_STACK_SIZE+8 ]
+ ;mov ebx, [esp+_STACK_SIZE+8 ]
;mov ecx, [esp+_STACK_SIZE+12]
;mov edx, [esp+_STACK_SIZE+16]
%ifdef X86_32
push r4
push r5
-%endif
+%endif
%assign push_num 2
LOAD_4_PARA
@@ -1411,11 +1411,11 @@
;mov ebx, [esp+12]
;mov ecx, [esp+16]
;mov edx, [esp+20]
-
+
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
movdqu xmm0, [r2]
MOVDQ xmm2, [r0]
psadbw xmm0, xmm2
@@ -1443,7 +1443,7 @@
;mov ebx, [esp+12]
;mov ecx, [esp+16]
;mov edx, [esp+20]
-
+
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
@@ -1483,7 +1483,7 @@
;push edi
;mov eax, [esp+12]
;mov ebx, [esp+16]
-
+
%assign push_num 0
mov r2, arg3
push r2
@@ -1490,7 +1490,7 @@
CACHE_SPLIT_CHECK r2, 8, 64
jle near .pixel_sad_8x8_nsplit
pop r2
-%ifdef X86_32
+%ifdef X86_32
push r3
push r4
push r5
@@ -1497,10 +1497,10 @@
%endif
%assign push_num 3
mov r0, arg1
- mov r1, arg2
+ mov r1, arg2
SIGN_EXTENTION r1, r1d
pxor xmm7, xmm7
-
+
;ecx r2, edx r4, edi r5
mov r5, r2
@@ -1594,18 +1594,18 @@
pop r3
%endif
jmp .return
-
+
.pixel_sad_8x8_nsplit:
;push ebx
;mov eax, [esp+8]
;mov ebx, [esp+12]
;mov edx, [esp+20]
-
+
pop r2
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
pxor xmm6, xmm6
SSE2_GetSad8x4
lea r0, [r0+2*r1]
@@ -1652,11 +1652,11 @@
;mov ebx, [esp+12]
;mov ecx, [esp+16]
;mov edx, [esp+20]
-
+
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
pxor xmm6, xmm6 ;sad pRefMb-1
@@ -1778,11 +1778,11 @@
;mov ebx, [esp+16]
;mov edi, [esp+20]
;mov edx, [esp+24]
-
+
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
pxor xmm6, xmm6 ;sad pRefMb-1
@@ -1871,11 +1871,11 @@
;mov ebx, [esp+16]
;mov edi, [esp+20]
;mov edx, [esp+24]
-
+
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
pxor xmm6, xmm6 ;sad pRefMb-1
@@ -2091,11 +2091,11 @@
;mov ebx, [esp+16]
;mov edi, [esp+20]
;mov edx, [esp+24]
-
+
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
pxor xmm6, xmm6 ;sad pRefMb-1
@@ -2219,11 +2219,11 @@
;mov ebx, [esp+16]
;mov edi, [esp+20]
;mov edx, [esp+24]
-
+
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
movd xmm0, [r0]
movd xmm1, [r0+r1]
lea r0, [r0+2*r1]
@@ -2310,11 +2310,11 @@
;mov ebx, [pix1stride ]
;mov ecx, [pix2address]
;mov edx, [pix2stride ]
-
+
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
movd mm0, [r0]
movd mm1, [r0+r1]
punpckldq mm0, mm1
--- a/codec/processing/src/asm/denoisefilter.asm
+++ b/codec/processing/src/asm/denoisefilter.asm
@@ -179,15 +179,15 @@
;%define stride r1
BilateralLumaFilter8_sse2:
-
- push r3
+
+ push r3
%assign push_num 1
LOAD_2_PARA
pxor xmm7, xmm7
-
+
mov r3, r0
-
+
movq xmm6, [r0]
punpcklbw xmm6, xmm7
movdqa xmm3, [sse2_32]
@@ -218,10 +218,10 @@
packuswb xmm5, xmm5
movq [r3], xmm5
-
+
pop r3
%assign push_num 0
-
+
ret
WELS_EXTERN WaverageChromaFilter8_sse2
@@ -239,11 +239,11 @@
WaverageChromaFilter8_sse2:
push r3
-
+
%assign push_num 1
-
+
LOAD_2_PARA
-
+
mov r3, r1
add r3, r3
sub r0, r3 ; pixels - 2 * stride
@@ -272,8 +272,8 @@
packuswb xmm3, xmm3
movq [r0 + 2], xmm3
-
+
pop r3
-
+
%assign push_num 0
ret
--- a/codec/processing/src/asm/sad.asm
+++ b/codec/processing/src/asm/sad.asm
@@ -84,7 +84,7 @@
;push edi
;mov eax, [esp+12]
;mov ebx, [esp+16]
-
+
%assign push_num 0
mov r2, arg3
push r2
@@ -91,7 +91,7 @@
CACHE_SPLIT_CHECK r2, 8, 64
jle near .pixel_sad_8x8_nsplit
pop r2
-%ifdef X86_32
+%ifdef X86_32
push r3
push r4
push r5
@@ -98,10 +98,10 @@
%endif
%assign push_num 3
mov r0, arg1
- mov r1, arg2
+ mov r1, arg2
SIGN_EXTENTION r1, r1d
pxor xmm7, xmm7
-
+
;ecx r2, edx r4, edi r5
mov r5, r2
@@ -195,18 +195,18 @@
pop r3
%endif
jmp .return
-
+
.pixel_sad_8x8_nsplit:
;push ebx
;mov eax, [esp+8]
;mov ebx, [esp+12]
;mov edx, [esp+20]
-
+
pop r2
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
pxor xmm6, xmm6
SSE2_GetSad8x4
lea r0, [r0+2*r1]