ref: 18ab22b67c271bbc845ae485094d72cb7527d1d2
dir: /src/x86/mc_ssse3.asm/
; Copyright © 2018, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; Copyright © 2018, VideoLabs ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 16 pw_1024: times 8 dw 1024 pw_2048: times 8 dw 2048 %macro BIDIR_JMP_TABLE 1-* ;evaluated at definition time (in loop below) %xdefine %1_table (%%table - 2*%2) %xdefine %%base %1_table %xdefine %%prefix mangle(private_prefix %+ _%1) ; dynamically generated label %%table: %rep %0 - 1 ; repeat for num args dd %%prefix %+ .w%2 - %%base %rotate 1 %endrep %endmacro BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg_ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask_ssse3, 4, 8, 16, 32, 64, 128 SECTION .text INIT_XMM ssse3 %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif %macro BIDIR_FN 1 ; op %1 0 lea stride3q, [strideq*3] jmp wq .w4_loop: %1_INC_PTR 2 %1 0 lea dstq, [dstq+strideq*4] .w4: ; tile 4x movd [dstq ], m0 ; copy dw[0] pshuflw m1, m0, q1032 ; swap dw[1] and dw[0] movd [dstq+strideq*1], m1 ; copy dw[1] punpckhqdq m0, m0 ; swap dw[3,2] with dw[1,0] movd [dstq+strideq*2], m0 ; dw[2] psrlq m0, 32 ; shift right in dw[3] movd [dstq+stride3q ], m0 ; copy sub hd, 4 jg .w4_loop RET .w8_loop: %1_INC_PTR 2 %1 0 lea dstq, [dstq+strideq*2] .w8: movq [dstq ], m0 movhps [dstq+strideq*1], m0 sub hd, 2 jg .w8_loop RET .w16_loop: %1_INC_PTR 2 %1 0 lea dstq, [dstq+strideq] .w16: mova [dstq ], m0 dec hd jg .w16_loop RET .w32_loop: %1_INC_PTR 4 %1 0 lea dstq, [dstq+strideq] .w32: mova [dstq ], m0 %1 2 mova [dstq + 16 ], m0 dec hd jg .w32_loop RET .w64_loop: %1_INC_PTR 8 %1 0 add dstq, strideq .w64: %assign i 0 %rep 4 mova [dstq + i*16 ], m0 %assign i i+1 %if i < 4 %1 2*i %endif %endrep dec hd jg .w64_loop RET .w128_loop: %1_INC_PTR 16 %1 0 add dstq, strideq .w128: %assign i 0 %rep 8 mova [dstq + i*16 ], m0 %assign i i+1 %if i < 8 %1 2*i %endif %endrep dec hd jg .w128_loop RET %endmacro %macro AVG 1 ; src_offset ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1 paddw m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2 mova m1, [tmp1q+(%1+1)*mmsize] paddw m1, [tmp2q+(%1+1)*mmsize] pmulhrsw m0, m2 pmulhrsw m1, m2 packuswb m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit %endmacro %macro AVG_INC_PTR 1 add tmp1q, %1*mmsize add tmp2q, %1*mmsize %endmacro cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 lea r6, [avg_ssse3_table] tzcnt wd, wm ; leading zeros movifnidn hd, hm ; move h(stack) to h(register) if not already that register movsxd wq, dword [r6+wq*4] ; push table entry matching the tile width (tzcnt) in widen reg mova m2, [pw_1024+r6-avg_ssse3_table] ; fill m2 with shift/align add wq, r6 BIDIR_FN AVG %macro W_AVG 1 ; src_offset ; (a * weight + b * (16 - weight) + 128) >> 8 ; = ((a - b) * weight + (b << 4) + 128) >> 8 ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 mova m0, [tmp2q+(%1+0)*mmsize] psubw m2, m0, [tmp1q+(%1+0)*mmsize] mova m1, [tmp2q+(%1+1)*mmsize] psubw m3, m1, [tmp1q+(%1+1)*mmsize] paddw m2, m2 ; compensate for the weight only being half paddw m3, m3 ; of what it should be pmulhw m2, m4 ; (b-a) * (-weight << 12) pmulhw m3, m4 ; (b-a) * (-weight << 12) paddw m0, m2 ; ((b-a) * -weight) + b paddw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 %endmacro %define W_AVG_INC_PTR AVG_INC_PTR cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 lea r6, [w_avg_ssse3_table] tzcnt wd, wm movifnidn hd, hm movd m0, r6m pshuflw m0, m0, q0000 punpcklqdq m0, m0 movsxd wq, dword [r6+wq*4] pxor m4, m4 psllw m0, 11 ; can't shift by 12, sign bit must be preserved psubw m4, m0 mova m5, [pw_2048+r6-w_avg_ssse3_table] add wq, r6 BIDIR_FN W_AVG %macro MASK 1 ; src_offset ; (a * m + b * (64 - m) + 512) >> 10 ; = ((a - b) * m + (b << 6) + 512) >> 10 ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 mova m3, [maskq+(%1+0)*(mmsize/2)] mova m0, [tmp2q+(%1+0)*mmsize] ; b psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a mova m6, m3 ; m psubb m3, m4, m6 ; -m paddw m1, m1 ; (b - a) << 1 paddb m3, m3 ; -m << 1 punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16) pmulhw m1, m2 ; (-m * (b - a)) << 10 paddw m0, m1 ; + b mova m1, [tmp2q+(%1+1)*mmsize] ; b psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a paddw m2, m2 ; (b - a) << 1 mova m6, m3 ; (-m << 1) punpckhbw m3, m4, m6 ; (-m << 9) pmulhw m2, m3 ; (-m << 9) paddw m1, m2 ; (-m * (b - a)) << 10 pmulhrsw m0, m5 ; round pmulhrsw m1, m5 ; round packuswb m0, m1 ; interleave 16 -> 8 %endmacro %macro MASK_INC_PTR 1 add maskq, %1*mmsize/2 add tmp1q, %1*mmsize add tmp2q, %1*mmsize %endmacro cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3 lea r7, [mask_ssse3_table] tzcnt wd, wm movifnidn hd, hm mov maskq, maskmp movsxd wq, dword [r7+wq*4] pxor m4, m4 mova m5, [pw_2048+r7-mask_ssse3_table] add wq, r7 BIDIR_FN MASK %endif ; ARCH_X86_64