ref: e96a7b5c92472d8c1e4baf044069aa72931411c2
parent: fc1601058379affdd1cc3f6d1b3c3d4be7155edf
author: Sindre Aamås <saamas@cisco.com>
date: Fri Feb 12 16:01:24 EST 2016
[Common/x86] DeblockChromaEq4V_ssse3 optimizations Use packed 8-bit operations rather than unpack to 16-bit. Avoid spills. ~2.07x speedup on Haswell (x86-64). ~2.12x speedup on Haswell (x86 32-bit).
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -472,163 +472,73 @@
ret
-%ifdef WIN64
+;********************************************************************************
+; void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta)
+;********************************************************************************
-
WELS_EXTERN DeblockChromaEq4V_ssse3
- mov rax,rsp
- push rbx
- PUSH_XMM 15
- sub rsp,90h
- pxor xmm1,xmm1
- mov r11,rcx
- mov rbx,rdx
- mov r10d,r9d
- movq xmm13,[r11]
- lea eax,[r8+r8]
- movsxd r9,eax
- mov rax,rcx
- sub rax,r9
- movq xmm14,[rax]
- mov rax,rdx
- sub rax,r9
- movq xmm0,[rax]
- movsxd rax,r8d
- sub rcx,rax
- sub rdx,rax
- movq xmm12,[rax+r11]
- movq xmm10,[rcx]
- punpcklqdq xmm14,xmm0
- movdqa xmm8,xmm14
- movq xmm0,[rdx]
- punpcklbw xmm8,xmm1
- punpckhbw xmm14,xmm1
- punpcklqdq xmm10,xmm0
- movq xmm0,[rbx]
- movdqa xmm5,xmm10
- punpcklqdq xmm13,xmm0
- movq xmm0, [rax+rbx]
- punpcklbw xmm5,xmm1
- movsx eax,r10w
- movdqa xmm9,xmm13
- punpcklqdq xmm12,xmm0
- punpcklbw xmm9,xmm1
- punpckhbw xmm10,xmm1
- movd xmm0,eax
- movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta
- punpckhbw xmm13,xmm1
- movdqa xmm7,xmm12
- punpcklwd xmm0,xmm0
- punpckhbw xmm12,xmm1
- pshufd xmm11,xmm0,0
- punpcklbw xmm7,xmm1
- movd xmm0,eax
- movdqa xmm1,xmm8
- psubw xmm1,xmm5
- punpcklwd xmm0,xmm0
- movdqa xmm6,xmm11
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm5
- psubw xmm0,xmm9
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm10
- movdqa xmm1,xmm14
- psubw xmm0,xmm13
- psubw xmm1,xmm10
- pabsw xmm0,xmm0
- pcmpgtw xmm11,xmm0
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm11,xmm2
- movdqa xmm0,xmm12
- movdqa xmm4,xmm6
- movdqa xmm1,xmm8
- mov eax,2
- cwde
- paddw xmm1,xmm8
- psubw xmm0,xmm13
- paddw xmm1,xmm5
- pabsw xmm0,xmm0
- movdqa xmm2,xmm14
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm14
- movd xmm0,eax
- pand xmm11,xmm3
- paddw xmm7,xmm7
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- paddw xmm2,xmm12
- paddw xmm12,xmm12
- pshufd xmm3,xmm0,0
- paddw xmm7,xmm9
- paddw xmm12,xmm13
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm5
- paddw xmm7,xmm8
- psraw xmm1,2
- paddw xmm12,xmm14
- paddw xmm7,xmm3
- movaps xmm14,[rsp]
- pand xmm4,xmm1
- paddw xmm12,xmm3
- psraw xmm7,2
- movdqa xmm1,xmm11
- por xmm4,xmm0
- psraw xmm12,2
- paddw xmm2,xmm3
- movdqa xmm0,xmm11
- pandn xmm0,xmm10
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- packuswb xmm4,xmm1
- movdqa xmm0,xmm11
- movdqa xmm1,xmm6
- pand xmm1,xmm7
- movaps xmm7,[rsp+70h]
- movq [rcx],xmm4
- pandn xmm6,xmm9
- pandn xmm11,xmm13
- pand xmm0,xmm12
- por xmm1,xmm6
- por xmm0,xmm11
- psrldq xmm4,8
- packuswb xmm1,xmm0
- movq [r11],xmm1
- psrldq xmm1,8
- movq [rdx],xmm4
- lea r11,[rsp+90h]
- movaps xmm6,[r11-10h]
- movaps xmm8,[r11-30h]
- movaps xmm9,[r11-40h]
- movq [rbx],xmm1
- movaps xmm10,[r11-50h]
- movaps xmm11,[r11-60h]
- movaps xmm12,[r11-70h]
- movaps xmm13,[r11-80h]
- mov rsp,r11
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r2, r2d
+ movd xmm7, arg4d
+ pxor xmm0, xmm0
+ pshufb xmm7, xmm0 ; iAlpha
+ mov r3, r2
+ neg r3 ; -iStride
+
+ movq xmm0, [r0 + 0 * r2] ; q0 cb
+ movhps xmm0, [r1 + 0 * r2] ; q0 cr
+ movq xmm2, [r0 + 1 * r3] ; p0 cb
+ movhps xmm2, [r1 + 1 * r3] ; p0 cr
+
+ movdqa xmm4, xmm0
+ SSE2_AbsDiffUB xmm4, xmm2, xmm5 ; |p0 - q0|
+ SSE2_CmpgeUB xmm4, xmm7 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
+
+ movq xmm1, [r0 + 1 * r2] ; q1 cb
+ movhps xmm1, [r1 + 1 * r2] ; q1 cr
+ movq xmm3, [r0 + 2 * r3] ; p1 cb
+ movhps xmm3, [r1 + 2 * r3] ; p1 cr
+
+ movdqa xmm5, xmm1
+ SSE2_AbsDiffUB xmm5, xmm0, xmm7 ; |q1 - q0|
+ movdqa xmm6, xmm3
+ SSE2_AbsDiffUB xmm6, xmm2, xmm7 ; |p1 - p0|
+ pmaxub xmm5, xmm6 ; max(|q1 - q0|, |p1 - p0|)
+
+ pxor xmm6, xmm6
+ movd xmm7, arg5d
+ pshufb xmm7, xmm6 ; iBeta
+
+ SSE2_CmpgeUB xmm5, xmm7 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
+ por xmm4, xmm5 ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0
+
+ WELS_DB1 xmm7
+ movdqa xmm5, xmm2
+ SSE2_AvgbFloor1 xmm2, xmm1, xmm7, xmm6 ; (p0 + q1) >> 1
+ pavgb xmm2, xmm3 ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1
+ movdqa xmm6, xmm4
+ SSE2_Blend xmm5, xmm2, xmm4 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
+
+ SSE2_AvgbFloor1 xmm3, xmm0, xmm7, xmm4 ; (q0 + p1) >> 1
+ pavgb xmm3, xmm1 ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1
+ SSE2_Blend xmm0, xmm3, xmm6 ; q0out = bDeltaP0Q0P1P0Q1Q0 ? q0' : q0
+
+ movlps [r0 + 1 * r3], xmm5 ; store p0 cb
+ movhps [r1 + 1 * r3], xmm5 ; store p0 cr
+ movlps [r0 + 0 * r2], xmm0 ; store q0 cb
+ movhps [r1 + 0 * r2], xmm0 ; store q0 cr
+
POP_XMM
- pop rbx
+ LOAD_4_PARA_POP
ret
+%ifdef WIN64
-
WELS_EXTERN DeblockChromaEq4H_ssse3
mov rax,rsp
mov [rax+20h],rbx
@@ -1178,155 +1088,6 @@
%elifdef UNIX64
-WELS_EXTERN DeblockChromaEq4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
-
- mov rbp, r8
- mov r8, rdx
- mov r9, rcx
- mov rcx, rdi
- mov rdx, rsi
-
- sub rsp,90h
- pxor xmm1,xmm1
- mov r11,rcx
- mov rbx,rdx
- mov r10d,r9d
- movq xmm13,[r11]
- lea eax,[r8+r8]
- movsxd r9,eax
- mov rax,rcx
- sub rax,r9
- movq xmm14,[rax]
- mov rax,rdx
- sub rax,r9
- movq xmm0,[rax]
- movsxd rax,r8d
- sub rcx,rax
- sub rdx,rax
- movq xmm12,[rax+r11]
- movq xmm10,[rcx]
- punpcklqdq xmm14,xmm0
- movdqa xmm8,xmm14
- movq xmm0,[rdx]
- punpcklbw xmm8,xmm1
- punpckhbw xmm14,xmm1
- punpcklqdq xmm10,xmm0
- movq xmm0,[rbx]
- movdqa xmm5,xmm10
- punpcklqdq xmm13,xmm0
- movq xmm0, [rax+rbx]
- punpcklbw xmm5,xmm1
- movsx eax,r10w
- movdqa xmm9,xmm13
- punpcklqdq xmm12,xmm0
- punpcklbw xmm9,xmm1
- punpckhbw xmm10,xmm1
- movd xmm0,eax
- mov eax, ebp ; iBeta
- punpckhbw xmm13,xmm1
- movdqa xmm7,xmm12
- punpcklwd xmm0,xmm0
- punpckhbw xmm12,xmm1
- pshufd xmm11,xmm0,0
- punpcklbw xmm7,xmm1
- movd xmm0,eax
- movdqa xmm1,xmm8
- psubw xmm1,xmm5
- punpcklwd xmm0,xmm0
- movdqa xmm6,xmm11
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm5
- psubw xmm0,xmm9
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm10
- movdqa xmm1,xmm14
- psubw xmm0,xmm13
- psubw xmm1,xmm10
- pabsw xmm0,xmm0
- pcmpgtw xmm11,xmm0
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm11,xmm2
- movdqa xmm0,xmm12
- movdqa xmm4,xmm6
- movdqa xmm1,xmm8
- mov eax,2
- cwde
- paddw xmm1,xmm8
- psubw xmm0,xmm13
- paddw xmm1,xmm5
- pabsw xmm0,xmm0
- movdqa xmm2,xmm14
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm14
- movd xmm0,eax
- pand xmm11,xmm3
- paddw xmm7,xmm7
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- paddw xmm2,xmm12
- paddw xmm12,xmm12
- pshufd xmm3,xmm0,0
- paddw xmm7,xmm9
- paddw xmm12,xmm13
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm5
- paddw xmm7,xmm8
- psraw xmm1,2
- paddw xmm12,xmm14
- paddw xmm7,xmm3
- ;movaps xmm14,[rsp]
- pand xmm4,xmm1
- paddw xmm12,xmm3
- psraw xmm7,2
- movdqa xmm1,xmm11
- por xmm4,xmm0
- psraw xmm12,2
- paddw xmm2,xmm3
- movdqa xmm0,xmm11
- pandn xmm0,xmm10
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- packuswb xmm4,xmm1
- movdqa xmm0,xmm11
- movdqa xmm1,xmm6
- pand xmm1,xmm7
- movq [rcx],xmm4
- pandn xmm6,xmm9
- pandn xmm11,xmm13
- pand xmm0,xmm12
- por xmm1,xmm6
- por xmm0,xmm11
- psrldq xmm4,8
- packuswb xmm1,xmm0
- movq [r11],xmm1
- psrldq xmm1,8
- movq [rdx],xmm4
- lea r11,[rsp+90h]
- movq [rbx],xmm1
- mov rsp,r11
- pop rbp
- pop rbx
- ret
-
WELS_EXTERN DeblockChromaEq4H_ssse3
mov rax,rsp
push rbx
@@ -1888,172 +1649,6 @@
%elifdef X86_32
-
-;********************************************************************************
-; void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN DeblockChromaEq4V_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,68h
- mov edx,[ebp+10h] ; iStride
- mov eax,[ebp+8] ; pPixCb
- mov ecx,[ebp+0Ch] ; pPixCr
- movq xmm4,[ecx]
- movq xmm5,[edx+ecx]
- push esi
- push edi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- movq xmm1,[edi]
- mov edi,ecx
- sub edi,esi
- movq xmm2,[edi]
- punpcklqdq xmm1,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm2,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm3,[edi]
- punpcklqdq xmm2,xmm3
- movq xmm3,[eax]
- punpcklqdq xmm3,xmm4
- movq xmm4,[edx+eax]
- mov edx, [ebp + 14h]
- punpcklqdq xmm4,xmm5
- movd xmm5,edx
- mov edx, [ebp + 18h]
- pxor xmm0,xmm0
- movdqa xmm6,xmm5
- punpcklwd xmm6,xmm5
- pshufd xmm5,xmm6,0
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,xmm1
- punpckhbw xmm1,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+40h],xmm1
- movdqa [esp+60h],xmm7
- movdqa xmm7,xmm2
- punpcklbw xmm7,xmm0
- movdqa [esp+10h],xmm7
- movdqa xmm7,xmm3
- punpcklbw xmm7,xmm0
- punpckhbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm7,xmm4
- punpckhbw xmm4,xmm0
- punpckhbw xmm2,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+30h],xmm3
- movdqa xmm3,[esp+10h]
- movdqa xmm1,xmm3
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa [esp+20h],xmm4
- movdqa xmm0,xmm5
- pcmpgtw xmm0,xmm1
- movdqa xmm1,[esp+60h]
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- pand xmm0,xmm4
- movdqa xmm1,xmm7
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,xmm2
- psubw xmm1,[esp+30h]
- pabsw xmm1,xmm1
- pcmpgtw xmm5,xmm1
- movdqa xmm1,[esp+40h]
- pand xmm0,xmm4
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,[esp+20h]
- psubw xmm1,[esp+30h]
- pand xmm5,xmm4
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- pand xmm5,xmm6
- mov edx,2
- movsx edx,dx
- movd xmm1,edx
- movdqa xmm4,xmm1
- punpcklwd xmm4,xmm1
- pshufd xmm1,xmm4,0
- movdqa xmm4,[esp+60h]
- movdqa xmm6,xmm4
- paddw xmm6,xmm4
- paddw xmm6,xmm3
- paddw xmm6,xmm7
- movdqa [esp+10h],xmm1
- paddw xmm6,[esp+10h]
- psraw xmm6,2
- movdqa xmm4,xmm0
- pandn xmm4,xmm3
- movdqa xmm3,[esp+40h]
- movdqa xmm1,xmm0
- pand xmm1,xmm6
- por xmm1,xmm4
- movdqa xmm6,xmm3
- paddw xmm6,xmm3
- movdqa xmm3,[esp+10h]
- paddw xmm6,xmm2
- paddw xmm6,[esp+20h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm4,xmm5
- pand xmm4,xmm6
- movdqa xmm6,xmm5
- pandn xmm6,xmm2
- por xmm4,xmm6
- packuswb xmm1,xmm4
- movdqa xmm4,[esp+50h]
- movdqa xmm6,xmm7
- paddw xmm6,xmm7
- paddw xmm6,xmm4
- paddw xmm6,[esp+60h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm2,xmm0
- pand xmm2,xmm6
- pandn xmm0,xmm4
- por xmm2,xmm0
- movdqa xmm0,[esp+20h]
- movdqa xmm6,xmm0
- paddw xmm6,xmm0
- movdqa xmm0,[esp+30h]
- paddw xmm6,xmm0
- paddw xmm6,[esp+40h]
- movdqa xmm4,xmm5
- paddw xmm6,xmm3
- movq [esi],xmm1
- psraw xmm6,2
- pand xmm4,xmm6
- pandn xmm5,xmm0
- por xmm4,xmm5
- packuswb xmm2,xmm4
- movq [eax],xmm2
- psrldq xmm1,8
- movq [edi],xmm1
- pop edi
- psrldq xmm2,8
- movq [ecx],xmm2
- pop esi
- mov esp,ebp
- pop ebp
- ret
;***************************************************************************
; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,