ref: e65f5987f6eea95bfbb0876cc0a966d1d82f5841
dir: /vp8/common/x86/mask_sse3.asm/
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; vp8_makemask_sse3
;
; Builds a 16x16 byte mask. An output byte is 0xFF when the luma sample
; and its co-located chroma samples are all within tolerance of the given
; centre values, i.e.
;     |y - ys| < yt  AND  |u - us| < ut  AND  |v - vs| < vt
; and 0x00 otherwise. Each chroma sample is shared by two luma columns and
; two luma rows.
;
; Arguments, as this routine actually reads them:
;   arg(0)  unsigned char *y    luma source, 16 bytes read per row
;   arg(1)  unsigned char *u    chroma source, 8 bytes read per row
;   arg(2)  unsigned char *v    chroma source, 8 bytes read per row
;   arg(3)  unsigned char *ym   output mask, 16 rows x 16 bytes, contiguous;
;                               written with movdqa => must be 16-byte aligned
;   arg(4)  int yp              luma stride
;   arg(5)  int uvp             chroma stride
;   arg(6)  int ys              centre y value (low byte used)
;   arg(7)  int us              centre u value (low byte used)
;   arg(8)  int vs              centre v value (low byte used)
;   arg(9)  int yt              y tolerance   (low byte used)
;   arg(10) int ut              u tolerance   (low byte used)
;   arg(11) int vt              v tolerance   (low byte used)
;
; NOTE(review): the previous prototype comment listed an additional
; "unsigned char *uvm" between ym and yp. Nothing in this routine reads
; such an argument; if the C prototype really has it, arg(4)..arg(11) are
; off by one. Confirm against the C declaration. The shadow count of 14 is
; left untouched for the same reason.
;
; NOTE(review): pshufb is an SSSE3 instruction despite the _sse3 suffix.
; NOTE(review): xmm6-xmm12 are modified and not preserved; they are
; callee-saved under the Microsoft x64 convention.
;
; Clobbers: rax, rcx, rdx, r8, r9, r10, xmm0-xmm12, flags.
;-----------------------------------------------------------------------
global sym(vp8_makemask_sse3)
sym(vp8_makemask_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 14
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;y
        mov             rdi,        arg(1) ;u
        mov             rcx,        arg(2) ;v
        mov             rax,        arg(3) ;ym
        ; r10 (volatile) holds the luma stride; rbx is callee-saved and
        ; must not be used without saving it.
        movsxd          r10,        dword arg(4) ;yp
        movsxd          rdx,        dword arg(5) ;uvp

        ; an all-zero shuffle control makes pshufb broadcast byte 0
        pxor            xmm0,xmm0

        ;make 16 copies of the center y value
        movd            xmm1, arg(6)
        pshufb          xmm1, xmm0

        ; make 16 copies of the center u value
        movd            xmm2, arg(7)
        pshufb          xmm2, xmm0

        ; make 16 copies of the center v value
        movd            xmm3, arg(8)
        pshufb          xmm3, xmm0
        unpcklpd        xmm2, xmm3          ; xmm2 = [ us x8 | vs x8 ]

        ;make 16 copies of the y tolerance
        movd            xmm3, arg(9)
        pshufb          xmm3, xmm0

        ;make 16 copies of the u tolerance
        movd            xmm4, arg(10)
        pshufb          xmm4, xmm0

        ;make 16 copies of the v tolerance
        movd            xmm5, arg(11)
        pshufb          xmm5, xmm0
        unpckhpd        xmm4, xmm5          ; xmm4 = [ ut x8 | vt x8 ]

        ; pcmpgtb compares SIGNED bytes, but the absolute differences and
        ; tolerances are unsigned (0..255). Flipping the top bit of both
        ; operands turns the signed compare into an unsigned one, so a
        ; difference >= 128 is no longer mistaken for a negative number.
        mov             r9d, 0x80
        movd            xmm12, r9d
        pshufb          xmm12, xmm0         ; xmm12 = 0x80 in every byte
        pxor            xmm3, xmm12         ; biased y tolerance
        pxor            xmm4, xmm12         ; biased u/v tolerances

        mov             r8,8                ; 8 iterations x 2 luma rows

.next_pair_of_rows:

        ;grab the y source values (first row of the pair)
        movdqu          xmm0, [rsi]

        ;abs difference between source and y target:
        ;(a -sat b) | (b -sat a)
        movdqa          xmm6, xmm1
        movdqa          xmm7, xmm0
        psubusb         xmm0, xmm1
        psubusb         xmm6, xmm7
        por             xmm0, xmm6

        ;0xFF where |y - ys| < yt (unsigned, via the sign-bit bias)
        pxor            xmm0, xmm12
        movdqa          xmm6, xmm3
        pcmpgtb         xmm6, xmm0

        ;grab the y source values (second row of the pair)
        add             rsi, r10
        movdqu          xmm0, [rsi]

        ;abs difference between source and y target
        movdqa          xmm11, xmm1
        movdqa          xmm7, xmm0
        psubusb         xmm0, xmm1
        psubusb         xmm11, xmm7
        por             xmm0, xmm11

        ;0xFF where |y - ys| < yt
        pxor            xmm0, xmm12
        movdqa          xmm11, xmm3
        pcmpgtb         xmm11, xmm0

        ;grab the u and v source values; only 8 bytes of each are used,
        ;so load exactly 8 to avoid reading past the end of the row
        movq            xmm7, [rdi]
        movq            xmm8, [rcx]
        unpcklpd        xmm7, xmm8          ; xmm7 = [ u x8 | v x8 ]

        ;abs difference between source and uv targets
        movdqa          xmm9, xmm2
        movdqa          xmm10, xmm7
        psubusb         xmm7, xmm2
        psubusb         xmm9, xmm10
        por             xmm7, xmm9

        ;0xFF where the chroma difference is < its tolerance
        pxor            xmm7, xmm12
        movdqa          xmm0, xmm4
        pcmpgtb         xmm0, xmm7

        ;double u and v masks so each chroma byte covers two luma columns
        movdqa          xmm8, xmm0
        punpckhbw       xmm0, xmm0          ; v mask, 16 bytes
        punpcklbw       xmm8, xmm8          ; u mask, 16 bytes

        ;mask row 0 and output
        pand            xmm6, xmm8
        pand            xmm6, xmm0
        movdqa          [rax],xmm6

        ;mask row 1 and output
        pand            xmm11, xmm8
        pand            xmm11, xmm0
        movdqa          [rax+16],xmm11

        ; to the next pair of luma rows / next chroma row
        add             rsi, r10
        add             rdi, rdx
        add             rcx, rdx
        add             rax,32
        dec r8
        jnz .next_pair_of_rows

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;GROW_HORIZ (result register, source register or memory operand)
; result = source | (source shifted up one byte) | (source shifted down
; one byte), i.e. every set byte also sets its two horizontal neighbours
; within the 16-byte row.
; A memory source is read with movdqa, so it must be 16-byte aligned.
; Scratch: xmm14 (left on exit as source << 1 byte),
;          xmm15 (left on exit as source >> 1 byte).
%macro GROW_HORIZ 2
    movdqa          %1, %2
    movdqa          xmm15, %1
    movdqa          xmm14, %1
    psrldq          xmm15, 1
    pslldq          xmm14, 1
    por             %1, xmm15
    por             %1, xmm14
%endmacro
;GROW_VERT (result, center row, above row, below row)
; result = center | above | below (bytewise OR of three rows).
%macro GROW_VERT 4
    movdqa          %1, %2
    por             %1, %4
    por             %1, %3
%endmacro
;GROW_NEXTLINE (register receiving the new row, new source row,
;               destination for the finished row)
; Horizontally grows the new source row into %1, then ORs the three rows
; currently held in xmm0, xmm1 and xmm2 into xmm3 and stores xmm3 to %3.
; Scratch: xmm3, plus whatever GROW_HORIZ uses.
%macro GROW_NEXTLINE 3
    GROW_HORIZ      %1, %2
    movdqa          xmm3, xmm0
    por             xmm3, xmm1
    por             xmm3, xmm2
    movdqa          %3, xmm3
%endmacro
;-----------------------------------------------------------------------
; vp8_growmaskmb_sse3(
;    unsigned char *om,     arg(0)  source mask, 16 rows x 16 bytes
;    unsigned char *nm)     arg(1)  result mask, 16 rows x 16 bytes
;
; Grows a 16x16 byte mask by one position in every direction: each output
; row is the OR of the horizontally grown source rows above, at, and below
; it. The first and last output rows only have two contributing rows.
;
; Both buffers are accessed with movdqa and must be 16-byte aligned.
; xmm0/xmm1/xmm2 rotate as a three-row window; xmm3 is the row being
; written; xmm14/xmm15 are scratch inside GROW_HORIZ.
; NOTE(review): xmm14/xmm15 are callee-saved under the Microsoft x64
; convention and are not preserved here.
;-----------------------------------------------------------------------
global sym(vp8_growmaskmb_sse3)
sym(vp8_growmaskmb_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0) ;om (source rows)
    mov             rdi,        arg(1) ;nm (result rows)

    ; prime the window with source rows 0, 1 and 2
    GROW_HORIZ xmm0, [rsi]
    GROW_HORIZ xmm1, [rsi+16]
    GROW_HORIZ xmm2, [rsi+32]

    ; result row 1 = rows 0|1|2
    GROW_VERT xmm3, xmm0, xmm1, xmm2
    movdqa [rdi+16],xmm3

    ; result row 0 has no row above it: rows 0|1 only
    por xmm0,xmm1
    movdqa [rdi], xmm0

    ; each step replaces the oldest window row with the next source row
    ; and writes the result row centred on the previous source row
    GROW_NEXTLINE xmm0,[rsi+48],[rdi+32]        ; src row 3  -> result row 2
    GROW_NEXTLINE xmm1,[rsi+64],[rdi+48]        ; src row 4  -> result row 3
    GROW_NEXTLINE xmm2,[rsi+80],[rdi+64]        ; src row 5  -> result row 4
    GROW_NEXTLINE xmm0,[rsi+96],[rdi+80]        ; src row 6  -> result row 5
    GROW_NEXTLINE xmm1,[rsi+112],[rdi+96]       ; src row 7  -> result row 6
    GROW_NEXTLINE xmm2,[rsi+128],[rdi+112]      ; src row 8  -> result row 7
    GROW_NEXTLINE xmm0,[rsi+144],[rdi+128]      ; src row 9  -> result row 8
    GROW_NEXTLINE xmm1,[rsi+160],[rdi+144]      ; src row 10 -> result row 9
    GROW_NEXTLINE xmm2,[rsi+176],[rdi+160]      ; src row 11 -> result row 10
    GROW_NEXTLINE xmm0,[rsi+192],[rdi+176]      ; src row 12 -> result row 11
    GROW_NEXTLINE xmm1,[rsi+208],[rdi+192]      ; src row 13 -> result row 12
    GROW_NEXTLINE xmm2,[rsi+224],[rdi+208]      ; src row 14 -> result row 13
    GROW_NEXTLINE xmm0,[rsi+240],[rdi+224]      ; src row 15 -> result row 14

    ; result row 15 has no row below it: rows 14|15 only
    por xmm0,xmm2
    movdqa [rdi+240], xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
;unsigned int vp8_sad16x16_masked_wmt(
;    unsigned char *src_ptr,     arg(0)
;    int  src_stride,            arg(1)
;    unsigned char *ref_ptr,     arg(2)
;    int  ref_stride,            arg(3)
;    unsigned char *mask)        arg(4)  16 rows x 16 bytes, contiguous
;
; Sum of absolute differences over a 16x16 block after ANDing both the
; source and the reference row with the corresponding mask row, so bytes
; whose mask is 0x00 contribute nothing.
;
; Returns the sum in rax.
; Clobbers: rax, rcx, rdx, r10, xmm0-xmm4, flags.
;-----------------------------------------------------------------------
global sym(vp8_sad16x16_masked_wmt)
sym(vp8_sad16x16_masked_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0) ;src_ptr
    mov             rdi,        arg(2) ;ref_ptr

    ; the mask pointer lives in r10 (volatile); rbx is callee-saved and
    ; must not be overwritten without saving it
    mov             r10,        arg(4) ;mask

    movsxd          rax,        dword ptr arg(1) ;src_stride
    movsxd          rdx,        dword ptr arg(3) ;ref_stride

    mov             rcx,        16      ; rows remaining

    pxor            xmm3,       xmm3    ; running SAD (two 64-bit halves)

.next_row:
    movdqu          xmm0,       [rsi]
    movdqu          xmm1,       [rdi]
    movdqu          xmm2,       [r10]
    pand            xmm0,       xmm2
    pand            xmm1,       xmm2

    psadbw          xmm0,       xmm1    ; per-half sums of |src - ref|
    paddw           xmm3,       xmm0

    add             rsi, rax
    add             rdi, rdx
    add             r10,  16

    dec rcx
    jnz .next_row

    ; fold the high half into the low half and return it
    movdqa          xmm4 ,     xmm3
    psrldq          xmm4,       8
    paddw           xmm3,      xmm4
    movq            rax,       xmm3

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
;unsigned int vp8_sad16x16_unmasked_wmt(
;    unsigned char *src_ptr,     arg(0)
;    int  src_stride,            arg(1)
;    unsigned char *ref_ptr,     arg(2)
;    int  ref_stride,            arg(3)
;    unsigned char *mask)        arg(4)  16 rows x 16 bytes, contiguous
;
; Sum of absolute differences over a 16x16 block after ORing both the
; source and the reference row with the corresponding mask row. Bytes
; whose mask is 0xFF become 0xFF on both sides and contribute nothing, so
; only the bytes outside the mask are compared.
;
; Returns the sum in rax.
; Clobbers: rax, rcx, rdx, r10, xmm0-xmm4, flags.
;-----------------------------------------------------------------------
global sym(vp8_sad16x16_unmasked_wmt)
sym(vp8_sad16x16_unmasked_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0) ;src_ptr
    mov             rdi,        arg(2) ;ref_ptr

    ; the mask pointer lives in r10 (volatile); rbx is callee-saved and
    ; must not be overwritten without saving it
    mov             r10,        arg(4) ;mask

    movsxd          rax,        dword ptr arg(1) ;src_stride
    movsxd          rdx,        dword ptr arg(3) ;ref_stride

    mov             rcx,        16      ; rows remaining

    pxor            xmm3,       xmm3    ; running SAD (two 64-bit halves)

.next_row:
    movdqu          xmm0,       [rsi]
    movdqu          xmm1,       [rdi]
    movdqu          xmm2,       [r10]
    por             xmm0,       xmm2
    por             xmm1,       xmm2

    psadbw          xmm0,       xmm1    ; per-half sums of |src - ref|
    paddw           xmm3,       xmm0

    add             rsi, rax
    add             rdi, rdx
    add             r10,  16

    dec rcx
    jnz .next_row

    ; fold the high half into the low half and return it
    movdqa          xmm4 ,     xmm3
    psrldq          xmm4,       8
    paddw           xmm3,      xmm4
    movq            rax,        xmm3

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
;unsigned int vp8_masked_predictor_wmt(
;    unsigned char *masked,      arg(0)
;    unsigned char *unmasked,    arg(1)
;    int  src_stride,            arg(2)  stride of both source blocks
;    unsigned char *dst_ptr,     arg(3)
;    int  dst_stride,            arg(4)
;    unsigned char *mask)        arg(5)  16 rows x 16 bytes, contiguous
;
; Blends two 16x16 blocks under a byte mask:
;     dst = (masked AND mask) OR (unmasked AND NOT mask)
;
; No result is computed for the declared return value; rax holds
; src_stride on exit.
; Clobbers: rax, rcx, rdx, r10, r11, xmm0-xmm2, flags.
;-----------------------------------------------------------------------
global sym(vp8_masked_predictor_wmt)
sym(vp8_masked_predictor_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0) ;masked source
    mov             rdi,        arg(1) ;unmasked source

    ; the mask pointer lives in r10 (volatile); rbx is callee-saved and
    ; must not be overwritten without saving it
    mov             r10,        arg(5) ;mask
    movsxd          rax,        dword ptr arg(2) ;src_stride
    mov             r11,        arg(3) ; destination
    movsxd          rdx,        dword ptr arg(4) ;dst_stride

    mov             rcx,        16      ; rows remaining

.next_row:
    movdqu          xmm0,       [rsi]
    movdqu          xmm1,       [rdi]
    movdqu          xmm2,       [r10]

    pand            xmm0,       xmm2    ; masked   AND mask
    pandn           xmm2,       xmm1    ; unmasked AND NOT mask
    por             xmm0,       xmm2
    movdqu          [r11],      xmm0

    add             r11, rdx
    add             rsi, rax
    ; both source blocks advance by src_stride, as in the uv variant;
    ; only the destination uses dst_stride
    add             rdi, rax
    add             r10,  16

    dec rcx
    jnz .next_row

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
;unsigned int vp8_masked_predictor_uv_wmt(
;    unsigned char *masked,      arg(0)
;    unsigned char *unmasked,    arg(1)
;    int  src_stride,            arg(2)  stride of both source blocks
;    unsigned char *dst_ptr,     arg(3)
;    int  dst_stride,            arg(4)
;    unsigned char *mask)        arg(5)  8 rows x 8 bytes, contiguous
;
; Blends two 8x8 blocks under a byte mask:
;     dst = (masked AND mask) OR (unmasked AND NOT mask)
;
; No result is computed for the declared return value; rax holds
; src_stride on exit.
; Clobbers: rax, rcx, rdx, r10, r11, xmm0-xmm2, flags.
;-----------------------------------------------------------------------
global sym(vp8_masked_predictor_uv_wmt)
sym(vp8_masked_predictor_uv_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0) ;masked source
    mov             rdi,        arg(1) ;unmasked source

    ; the mask pointer lives in r10 (volatile); rbx is callee-saved and
    ; must not be overwritten without saving it
    mov             r10,        arg(5) ;mask
    movsxd          rax,        dword ptr arg(2) ;src_stride
    mov             r11,        arg(3) ; destination
    movsxd          rdx,        dword ptr arg(4) ;dst_stride

    mov             rcx,        8       ; rows remaining

.next_row:
    movq            xmm0,       [rsi]
    movq            xmm1,       [rdi]
    movq            xmm2,       [r10]

    pand            xmm0,       xmm2    ; masked   AND mask
    pandn           xmm2,       xmm1    ; unmasked AND NOT mask
    por             xmm0,       xmm2
    movq            [r11],      xmm0

    add             r11, rdx
    add             rsi, rax
    add             rdi, rax
    add             r10,  8

    dec rcx
    jnz .next_row

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
;unsigned int vp8_uv_from_y_mask(
;    unsigned char *ymask,       arg(0)  16 bytes read per row, rows 32 apart
;    unsigned char *uvmask)      arg(1)  8 rows x 8 bytes, contiguous
;
; Subsamples a luma mask 2:1 in both directions: from every second
; 16-byte row of ymask, the even-indexed bytes are kept and written as one
; 8-byte row of uvmask.
;
; No result is computed for the declared return value.
; Clobbers: rcx, xmm0, xmm1, flags.
;-----------------------------------------------------------------------
global sym(vp8_uv_from_y_mask)
sym(vp8_uv_from_y_mask):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2              ; only arg(0) and arg(1) are read
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0) ;src_ptr
    mov             rdi,        arg(1) ;dst_ptr

    ; Load the shuffle control once, RIP-relative so the code stays
    ; position independent (absolute addressing here does not link for
    ; PIE / Mach-O 64-bit targets).
    movdqa          xmm1,       [rel shuf1b]

    mov             rcx,        8       ; output rows remaining

.next_row:
    movdqu          xmm0,       [rsi]
    pshufb          xmm0,       xmm1    ; gather bytes 0,2,4,...,14 into the low half

    movq            [rdi],xmm0
    add             rdi, 8
    add             rsi,32              ; skip one luma row

    dec rcx
    jnz .next_row

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
align 16
; pshufb control used by vp8_uv_from_y_mask: the low 8 lanes pick the
; even-indexed source bytes (0, 2, ..., 14). The high 8 lanes all pick
; byte 0; vp8_uv_from_y_mask only stores the low 8 bytes of the result.
; Kept 16-byte aligned because it is used as a 128-bit memory operand.
shuf1b:
    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0