shithub: openh264

Download patch

ref: 4dd64f06beec5301c02efce225e9d8ffe66c47ff
parent: 4db01d95183b9a163500177b2920cd99da154dcb
author: Sindre Aamås <saamas@cisco.com>
date: Tue Mar 7 09:10:05 EST 2017

[Common/x86] Simplify deblocking X86_32_PICASM handling

Utilize program counter-relative offsets to simplify X86_32_PICASM
code.

In order for this to work with nasm, data constants are placed in
the text segment when X86_32_PICASM is defined; other builds keep
them in .rodata.

--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -45,7 +45,11 @@
 ; Macros and other preprocessor constants
 ;*******************************************************************************
 
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
 SECTION .rodata align=16
+%endif
 
 ALIGN   16
 FOUR_16B_SSE2:   dw   4, 4, 4, 4, 4, 4, 4, 4
@@ -157,25 +161,9 @@
     ; Unbias and split into a non-negative and a non-positive part.
     ; Clip each part to iTc via minub.
     ; Add/subtract each part to/from p0/q0 and clip.
-%ifdef X86_32_PICASM
-    push       r0
-    mov        r0, esp
-    sub        esp, 16
-    and        esp, -16
-    push       0x60606060    ;WELS_DB96_16
-    push       0x60606060
-    push       0x60606060
-    push       0x60606060
-    movdqa     %6, [esp]
+    movdqa     %6, [pic(WELS_DB96_16)]
     psubusb    %6, %8
-    psubusb    %8, [esp]
-    mov        esp, r0
-    pop        r0
-%else
-    movdqa     %6, [WELS_DB96_16]
-    psubusb    %6, %8
-    psubusb    %8, [WELS_DB96_16]
-%endif
+    psubusb    %8, [pic(WELS_DB96_16)]
     pminub     %6, %5
     pminub     %8, %5
     psubusb    %2, %6
@@ -192,6 +180,7 @@
 
 WELS_EXTERN DeblockLumaLt4V_ssse3
     %assign push_num 0
+    INIT_X86_32_PIC r5
     LOAD_5_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
@@ -198,21 +187,8 @@
     movd     xmm1, arg3d
     movd     xmm2, arg4d
     pxor     xmm3, xmm3
-%ifdef X86_32_PICASM
-    push     r4
-    mov      r4, esp
-    sub      esp, 16
-    and      esp, -16
-    push     0x7f7f7f7f
-    push     0x7f7f7f7f
-    push     0x7f7f7f7f
-    push     0x7f7f7f7f
-    pxor     xmm1, [esp]
-    pxor     xmm2, [esp]
-%else
-    pxor     xmm1, [WELS_DB127_16]
-    pxor     xmm2, [WELS_DB127_16]
-%endif
+    pxor     xmm1, [pic(WELS_DB127_16)]
+    pxor     xmm2, [pic(WELS_DB127_16)]
     pshufb   xmm1, xmm3                       ; iAlpha ^ 0x7f
     pshufb   xmm2, xmm3                       ; iBeta  ^ 0x7f
     mov      r2, r1                           ; iStride
@@ -225,40 +201,22 @@
     MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
     movdqa   xmm4, xmm6
     SSE2_AbsDiffUB xmm6, xmm0, xmm3           ; |p0 - q0|
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm6, xmm1, [esp]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
-%else
-    SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
-%endif
+    SSE2_CmpltUB xmm6, xmm1, [pic(WELS_DB127_16)]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
     MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
     SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p1 - p0|
     SSE2_AbsDiffUB xmm0, xmm1, xmm3           ; |q1 - q0|
     pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
-%else
-    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
-%endif
+    SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
     pand     xmm6, xmm7                       ; bDeltaP0Q0P1P0Q1Q0 = bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
     MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
     movdqa   xmm0, xmm7
     SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p2 - p0|
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP2P0 = |p2 - p0| < iBeta
-%else
-    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP2P0 = |p2 - p0| < iBeta
-%endif
+    SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)]  ; bDeltaP2P0 = |p2 - p0| < iBeta
     MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
     MOVDQ    xmm3, [r0 + 0 * r2]              ; q0
     movdqa   xmm1, xmm5
     SSE2_AbsDiffUB xmm5, xmm3, xmm4           ; |q2 - q0|
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm5, xmm2, [esp]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
-    mov      esp, r4
-    pop      r4
-%else
-    SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
-%endif
+    SSE2_CmpltUB xmm5, xmm2, [pic(WELS_DB127_16)]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
 
     pavgb    xmm3, [r3 + 0 * r1]
     pcmpeqw  xmm2, xmm2  ; FFh
@@ -273,21 +231,7 @@
     pxor     xmm1, xmm2
 
     movd     xmm3, [r4]
-%ifdef X86_32_PICASM
-    push     r0
-    mov      r0, esp
-    sub      esp, 16
-    and      esp, -16
-    push     0x03030303    ;WELS_SHUFB0000111122223333
-    push     0x02020202
-    push     0x01010101
-    push     0x00000000
-    pshufb   xmm3, [esp] ; iTc
-    mov      esp, r0
-    pop      r0
-%else
-    pshufb   xmm3, [WELS_SHUFB0000111122223333] ; iTc
-%endif
+    pshufb   xmm3, [pic(WELS_SHUFB0000111122223333)] ; iTc
     movdqa   xmm4, xmm3  ; iTc0 = iTc
     pcmpgtb  xmm3, xmm2  ; iTc > -1 ? 0xff : 0x00
     pand     xmm6, xmm3  ; bDeltaP0Q0P1P0Q1Q0 &= iTc > -1
@@ -315,6 +259,7 @@
 
     POP_XMM
     LOAD_5_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 
@@ -380,6 +325,7 @@
 
 WELS_EXTERN DeblockLumaEq4V_ssse3
     %assign push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 10
     SIGN_EXTENSION r1, r1d
@@ -389,21 +335,8 @@
     add      r2, 1
     movd     xmm3, r2d
     pxor     xmm4, xmm4
-%ifdef X86_32_PICASM
-    push     r4
-    mov      r4, esp
-    sub      esp, 16
-    and      esp, -16
-    push     0x7f7f7f7f    ;WELS_DB127_16
-    push     0x7f7f7f7f
-    push     0x7f7f7f7f
-    push     0x7f7f7f7f
-    pxor     xmm1, [esp]
-    pxor     xmm2, [esp]
-%else
-    pxor     xmm1, [WELS_DB127_16]
-    pxor     xmm2, [WELS_DB127_16]
-%endif
+    pxor     xmm1, [pic(WELS_DB127_16)]
+    pxor     xmm2, [pic(WELS_DB127_16)]
     pshufb   xmm1, xmm4                       ; iAlpha ^ 0x7f
     pshufb   xmm2, xmm4                       ; iBeta  ^ 0x7f
     pshufb   xmm3, xmm4                       ; (iAlpha >> 2) + 1
@@ -418,41 +351,23 @@
     movdqa   xmm4, xmm6
     SSE2_AbsDiffUB xmm6, xmm0, xmm5           ; |p0 - q0|
     SSE2_CmpgeUB xmm3, xmm6                   ; |p0 - q0| < (iAlpha >> 2) + 2
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm6, xmm1, [esp]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
-%else
-    SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
-%endif
+    SSE2_CmpltUB xmm6, xmm1, [pic(WELS_DB127_16)]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
     MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
     SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p1 - p0|
     SSE2_AbsDiffUB xmm0, xmm1, xmm5           ; |q1 - q0|
     pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
-%else
-    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
-%endif
+    SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
     pand     xmm6, xmm7                       ; & bDeltaP0Q0
 
     MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
     SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p2 - p0|
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP2P0 = |p2 - p0| < iBeta
-%else
-    SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP2P0 = |p2 - p0| < iBeta
-%endif
+    SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)]  ; bDeltaP2P0 = |p2 - p0| < iBeta
     pand     xmm7, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2
 
     MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
     MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
     SSE2_AbsDiffUB xmm5, xmm0, xmm4           ; |q2 - q0|
-%ifdef X86_32_PICASM
-    SSE2_CmpltUB xmm5, xmm2, [esp]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
-    mov      esp, r4
-    pop      r4
-%else
-    SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
-%endif
+    SSE2_CmpltUB xmm5, xmm2, [pic(WELS_DB127_16)]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
     pand     xmm5, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2
 
 %ifdef X86_32
@@ -461,26 +376,12 @@
     mov      r2, esp
     sub      esp,  16
     and      esp, -16
-%ifdef X86_32_PICASM
-    push     0x01010101
-    push     0x01010101
-    push     0x01010101
-    push     0x01010101
-    sub      esp, 16
     movdqa   [esp], xmm5
-    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [esp+16]
+    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [pic(WELS_DB1_16)]
     movdqa   xmm5, [esp]
-    neg      r1
-    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [esp+16]
     mov      esp, r2
-%else
-    movdqa   [esp], xmm5
-    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [WELS_DB1_16]
-    movdqa   xmm5, [esp]
-    mov      esp, r2
     neg      r1
-    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [WELS_DB1_16]
-%endif
+    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [pic(WELS_DB1_16)]
 %else
     movdqa   xmm9, [WELS_DB1_16]
     SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm8, xmm4, 1, xmm9
@@ -489,6 +390,7 @@
 
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 
@@ -649,6 +551,7 @@
 
 WELS_EXTERN DeblockChromaLt4V_ssse3
     %assign push_num 0
+    INIT_X86_32_PIC r4
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r2, r2d
@@ -681,6 +584,7 @@
 
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 
@@ -737,7 +641,9 @@
     lea        r3, [3 * r2 - 1]                 ; 3 * iStride - 1
 
     SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6
+    INIT_X86_32_PIC r1
     SSSE3_DeblockChromaLt4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, r5, xmm2, xmm3, xmm6, 0
+    DEINIT_X86_32_PIC
     SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0
 
     POP_XMM