ref: 3497c4c905f8c85d8c65b28c352ff85dfddd66ed
parent: 2737c05eac98c0f4c99572614714a033617f8f3f
author: Henrik Gramner <gramner@twoorioles.com>
date: Fri Dec 4 17:00:11 EST 2020
x86: Rename looprestoration_ssse3.asm to looprestoration_sse.asm

It contains both SSE2 and SSSE3 code.
--- a/src/meson.build
+++ b/src/meson.build
@@ -200,7 +200,7 @@
'x86/ipred_ssse3.asm',
'x86/itx_ssse3.asm',
'x86/loopfilter_ssse3.asm',
- 'x86/looprestoration_ssse3.asm',
+ 'x86/looprestoration_sse.asm',
'x86/mc_sse.asm',
)
endif
--- /dev/null
+++ b/src/x86/looprestoration_sse.asm
@@ -1,0 +1,1950 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+pb_right_ext_mask: times 16 db 0xff
+ times 16 db 0
+pb_14x0_1_2: times 14 db 0
+ db 1, 2
+pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
+pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
+pb_0: times 16 db 0
+pb_2: times 16 db 2
+pb_3: times 16 db 3
+pb_4: times 16 db 4
+pb_15: times 16 db 15
+pb_0_1: times 8 db 0, 1
+pb_6_7: times 8 db 6, 7
+pb_14_15: times 8 db 14, 15
+pw_1: times 8 dw 1
+pw_16: times 8 dw 16
+pw_128: times 8 dw 128
+pw_255: times 8 dw 255
+pw_256: times 8 dw 256
+pw_2048: times 8 dw 2048
+pw_16380: times 8 dw 16380
+pw_5_6: times 4 dw 5, 6
+pd_1024: times 4 dd 1024
+%if ARCH_X86_32
+pd_256: times 4 dd 256
+pd_512: times 4 dd 512
+pd_2048: times 4 dd 2048
+%endif
+pd_0xF0080029: times 4 dd 0xF0080029
+pd_0xF00801C7: times 4 dd 0XF00801C7
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+
+ %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
+ %assign pic_reg_stk_off 4
+ %xdefine PIC_reg %1
+ %if %2 == 1
+ mov [esp], %1
+ %endif
+ LEA PIC_reg, PIC_base_offset
+ %if %3 == 1
+ XCHG_PIC_REG
+ %endif
+ %endmacro
+
+ %macro XCHG_PIC_REG 0
+ mov [esp+pic_reg_stk_off], PIC_reg
+ %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
+ mov PIC_reg, [esp+pic_reg_stk_off]
+ %endmacro
+
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+
+%else
+ %macro XCHG_PIC_REG 0
+ %endmacro
+
+ %define PIC_sym(sym) (sym)
+%endif
+
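+; PALIGNR falls back to shifts on SSE2 (palignr is SSSE3). Roughly:
+;   dst = (src2 >> 8*shift) | (src1 << 8*(16-shift))   ; byte shifts
+; using the register numbered one above dst as scratch, which callers are
+; assumed to keep free at that point.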
+%macro PALIGNR 4 ; dst, src1, src2, shift
+ %if cpuflag(ssse3)
+ palignr %1, %2, %3, %4
+ %else
+ %assign %%i regnumof%+%1 + 1
+ %define %%tmp m %+ %%i
+ psrldq %1, %3, %4
+ pslldq %%tmp, %2, 16-%4
+ por %1, %%tmp
+ %endif
+%endmacro
+
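+; PMADDUBSW emulates pmaddubsw (SSSE3) on SSE2 by widening the unsigned
+; bytes to words and using pmaddwd, so on that path the coefficient operand
+; is expected to hold word-sized taps. Per pair (illustrative):
+;   dst.w[i] = satw(src.b[2*i]*coef[2*i] + src.b[2*i+1]*coef[2*i+1])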
+%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero
+ %if cpuflag(ssse3)
+ pmaddubsw %1, %2
+ %else
+ %if %5 == 1
+ pxor %3, %3
+ %endif
+ punpckhbw %4, %1, %3
+ punpcklbw %1, %3
+ pmaddwd %4, %2
+ pmaddwd %1, %2
+ packssdw %1, %4
+ %endif
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;
+;; wiener ;;
+;;;;;;;;;;;;;;;;;;;;;;
+
+%macro WIENER_H 0
+%if ARCH_X86_64
+cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, flt, w, h, edge
+ mov edged, edgem
+ movifnidn wd, wm
+ mov hd, hm
+%else
+cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, flt, w, h, edge
+ mov r5, edgem
+ mov [esp+12], r5
+ mov wd, wm
+ mov hd, hm
+ SETUP_PIC hd
+ %define m15 m0
+ %define m14 m1
+ %define m13 m2
+ %define m12 m3
+%endif
+
+ movq m15, [fltq]
+%if cpuflag(ssse3)
+ pshufb m12, m15, [PIC_sym(pb_6_7)]
+ pshufb m13, m15, [PIC_sym(pb_4)]
+ pshufb m14, m15, [PIC_sym(pb_2)]
+ pshufb m15, m15, [PIC_sym(pb_0)]
+%else
+ pshuflw m12, m15, q3333
+ punpcklbw m15, m15
+ pshufhw m13, m15, q0000
+ pshuflw m14, m15, q2222
+ pshuflw m15, m15, q0000
+ punpcklqdq m12, m12
+ punpckhqdq m13, m13
+ punpcklqdq m14, m14
+ punpcklqdq m15, m15
+ psraw m13, 8
+ psraw m14, 8
+ psraw m15, 8
+%endif
+
+%if ARCH_X86_64
+ mova m11, [pw_2048]
+ mova m10, [pw_16380]
+ lea r11, [pb_right_ext_mask]
+
+ DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+%else
+ %define m10 [PIC_sym(pw_16380)]
+ %define m11 [PIC_sym(pw_2048)]
+ %define m12 [esp+0x14]
+ %define m13 [esp+0x24]
+ %define m14 [esp+0x34]
+ %define m15 [esp+0x44]
+ mova m12, m3
+ mova m13, m2
+ mova m14, m1
+ mova m15, m0
+
+ DEFINE_ARGS dst, left, src, stride, x, w, h, edge
+ %define srcptrq srcq
+ %define dstptrq dstq
+ %define hd dword [esp+ 0]
+ %define edgeb byte [esp+12]
+ %define xlimd dword [esp+16]
+%endif
+
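+ ; overview: each input row is run through a 7-tap horizontal filter and
+ ; stored as 16-bit intermediates (384-pixel stride) for wiener_filter_v;
+ ; the tap sum is shifted right by 3 and biased by 2048 before the store
+ ; (see the avx2 version for the clipping rationale)
+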
+ ; if (edge & has_right) align_w_to_16
+ ; else w -= 3, and use that as limit in x loop
+ test edgeb, 2 ; has_right
+ jnz .align
+ mov xlimd, -3
+ jmp .loop
+.align:
+ add wd, 15
+ and wd, ~15
+%if ARCH_X86_64
+ xor xlimd, xlimd
+%else
+ mov xlimd, 0
+%endif
+
+ ; main y loop for horizontal filter
+.loop:
+%if ARCH_X86_64
+ mov srcptrq, srcq
+ mov dstptrq, dstq
+ lea xd, [wq+xlimq]
+%else
+ mov [esp+8], srcq
+ mov [esp+4], dstq
+ mov xd, xlimd
+ add xd, wd
+%endif
+
+ ; load left edge pixels
+ test edgeb, 1 ; have_left
+ jz .emu_left
+ test leftq, leftq ; left == NULL for the edge-extended bottom/top
+ jz .load_left_combined
+ movd m0, [leftq]
+ movd m1, [srcq]
+ punpckldq m0, m1
+ pslldq m0, 9
+ add leftq, 4
+ jmp .left_load_done
+.load_left_combined:
+ movq m0, [srcq-3]
+ pslldq m0, 10
+ jmp .left_load_done
+.emu_left:
+ movd m0, [srcq]
+%if cpuflag(ssse3)
+ pshufb m0, [PIC_sym(pb_14x0_1_2)]
+%else
+ pslldq m1, m0, 13
+ punpcklbw m0, m0
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ psrldq m0, 2
+ por m0, m1
+%endif
+
+ ; load right edge pixels
+.left_load_done:
+ cmp xd, 16
+ jg .main_load
+ test xd, xd
+ jg .load_and_splat
+ je .splat_right
+
+ ; for very small images (w=[1-2]), edge-extend the original cache,
+ ; ugly, but only runs in very odd cases
+%if cpuflag(ssse3)
+ add wd, wd
+ %if ARCH_X86_64
+ pshufb m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
+ %else
+ pshufb m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16]
+ %endif
+ shr wd, 1
+%else
+ shl wd, 4
+ pcmpeqd m2, m2
+ movd m3, wd
+ psrldq m2, 2
+ punpckhbw m1, m0, m0
+ pshufhw m1, m1, q1122
+ psllq m1, m3
+ pand m0, m2
+ pandn m2, m1
+ por m0, m2
+ shr wd, 4
+%endif
+
+ ; main x loop, mostly this starts in .main_load
+.splat_right:
+ ; no need to load new pixels, just extend them from the (possibly previously
+ ; extended) previous load into m0
+%if cpuflag(ssse3)
+ pshufb m1, m0, [PIC_sym(pb_15)]
+%else
+ punpckhbw m1, m0, m0
+ pshufhw m1, m1, q3333
+ punpckhqdq m1, m1
+%endif
+ jmp .main_loop
+.load_and_splat:
+ ; load new pixels and extend edge for right-most
+ movu m1, [srcptrq+3]
+%if ARCH_X86_64
+ sub r11, xq
+ movu m2, [r11+16]
+ add r11, xq
+%else
+ sub PIC_reg, xd
+ movu m2, [PIC_sym(pb_right_ext_mask)+16]
+ add PIC_reg, xd
+%endif
+ movd m3, [srcptrq+2+xq]
+%if cpuflag(ssse3)
+ pshufb m3, [PIC_sym(pb_0)]
+%else
+ punpcklbw m3, m3
+ pshuflw m3, m3, q0000
+ punpcklqdq m3, m3
+%endif
+ pand m1, m2
+ pxor m2, [PIC_sym(pb_right_ext_mask)]
+ pand m3, m2
+ pxor m2, [PIC_sym(pb_right_ext_mask)]
+ por m1, m3
+ jmp .main_loop
+.main_load:
+ ; load the next 16 source pixels
+ movu m1, [srcptrq+3]
+.main_loop:
+%if ARCH_X86_64
+ PALIGNR m2, m1, m0, 10
+ PALIGNR m3, m1, m0, 11
+ PALIGNR m4, m1, m0, 12
+ PALIGNR m5, m1, m0, 13
+ PALIGNR m6, m1, m0, 14
+ PALIGNR m7, m1, m0, 15
+
+ punpcklbw m0, m2, m1
+ punpckhbw m2, m1
+ punpcklbw m8, m3, m7
+ punpckhbw m3, m7
+ punpcklbw m7, m4, m6
+ punpckhbw m4, m6
+ PMADDUBSW m0, m15, m6, m9, 1
+ PMADDUBSW m2, m15, m6, m9, 0
+ PMADDUBSW m8, m14, m6, m9, 0
+ PMADDUBSW m3, m14, m6, m9, 0
+ PMADDUBSW m7, m13, m6, m9, 0
+ PMADDUBSW m4, m13, m6, m9, 0
+ paddw m0, m8
+ paddw m2, m3
+ %if cpuflag(ssse3)
+ pxor m6, m6
+ %endif
+ punpcklbw m3, m5, m6
+ punpckhbw m5, m6
+ psllw m8, m3, 7
+ psllw m6, m5, 7
+ psubw m8, m10
+ psubw m6, m10
+ pmullw m3, m12
+ pmullw m5, m12
+ paddw m0, m7
+ paddw m2, m4
+ paddw m0, m3
+ paddw m2, m5
+ paddsw m0, m8 ; see the avx2 for an explanation
+ paddsw m2, m6 ; of how the clipping works here
+ psraw m0, 3
+ psraw m2, 3
+ paddw m0, m11
+ paddw m2, m11
+ mova [dstptrq+ 0], m0
+ mova [dstptrq+16], m2
+%else
+ PALIGNR m2, m1, m0, 10
+ punpcklbw m3, m2, m1
+ punpckhbw m2, m1
+ PMADDUBSW m3, m15, m4, m5, 1
+ PMADDUBSW m2, m15, m4, m5, 0
+ PALIGNR m4, m1, m0, 11
+ PALIGNR m5, m1, m0, 15
+ punpcklbw m6, m4, m5
+ punpckhbw m4, m5
+ PMADDUBSW m6, m14, m5, m7, 1
+ PMADDUBSW m4, m14, m5, m7, 0
+ paddw m3, m6
+ paddw m2, m4
+ PALIGNR m4, m1, m0, 12
+ PALIGNR m5, m1, m0, 14
+ punpcklbw m6, m4, m5
+ punpckhbw m4, m5
+ PMADDUBSW m6, m13, m5, m7, 1
+ PMADDUBSW m4, m13, m5, m7, 0
+ paddw m3, m6
+ paddw m2, m4
+ PALIGNR m6, m1, m0, 13
+ %if cpuflag(ssse3)
+ pxor m5, m5
+ %endif
+ punpcklbw m4, m6, m5
+ punpckhbw m6, m5
+ psllw m5, m4, 7
+ psllw m7, m6, 7
+ psubw m5, m10
+ psubw m7, m10
+ pmullw m4, m12
+ pmullw m6, m12
+ paddw m3, m4
+ paddw m2, m6
+ paddsw m3, m5
+ paddsw m2, m7
+ psraw m3, 3
+ psraw m2, 3
+ paddw m3, m11
+ paddw m2, m11
+ mova [dstptrq+ 0], m3
+ mova [dstptrq+16], m2
+%endif
+
+ mova m0, m1
+ add srcptrq, 16
+ add dstptrq, 32
+ sub xd, 16
+ cmp xd, 16
+ jg .main_load
+ test xd, xd
+ jg .load_and_splat
+ cmp xd, xlimd
+ jg .splat_right
+
+%if ARCH_X86_32
+ mov srcq, [esp+8]
+ mov dstq, [esp+4]
+%endif
+ add srcq, strideq
+ add dstq, 384*2
+ dec hd
+ jg .loop
+ RET
+%endmacro
+
+%macro WIENER_V 0
+%if ARCH_X86_64
+cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, flt, edge
+ mov edged, edgem
+ movifnidn fltq, fltmp
+ movifnidn hd, hm
+ movq m15, [fltq+16]
+ pshufd m14, m15, q1111
+ pshufd m15, m15, q0000
+ mova m12, [pd_1024]
+
+ DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr
+
+ mov ylimd, edged
+ and ylimd, 8 ; have_bottom
+ shr ylimd, 2
+ sub ylimd, 3
+%else
+cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, flt, edge
+ %define ylimd [esp+12]
+
+ mov r5d, edgem
+ and r5d, 8
+ shr r5d, 2
+ sub r5d, 3
+ mov ylimd, r5d
+ mov fltq, fltmp
+ mov edged, edgem
+
+ SETUP_PIC edged
+
+ movq m0, [fltq+16]
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ mova [esp+0x50], m0
+ mova [esp+0x40], m1
+
+ DEFINE_ARGS dst, stride, mid, w, h, y, edge
+ %define mptrq midq
+ %define dstptrq dstq
+ %define edgeb byte [esp]
+%endif
+
+ ; main x loop for vertical filter, does one 8-pixel-wide column at a time
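+ ; rough scalar equivalent of the sum in .loop (the 7-tap filter is applied
+ ; symmetrically, pairing rows equidistant from the center):
+ ;   t = fv[0]*(mid[y-3]+mid[y+3]) + fv[1]*(mid[y-2]+mid[y+2])
+ ;     + fv[2]*(mid[y-1]+mid[y+1]) + fv[3]*mid[y]
+ ;   dst[x] = clip_u8((t + 1024) >> 11)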
+.loop_x:
+ mova m3, [midq] ; middle line
+
+ ; load top pixels
+ test edgeb, 4 ; have_top
+ jz .emu_top
+ mova m0, [midq-384*4]
+ mova m2, [midq-384*2]
+ mova m1, m0
+ jmp .load_bottom_pixels
+.emu_top:
+ mova m0, m3
+ mova m1, m3
+ mova m2, m3
+
+ ; load bottom pixels
+.load_bottom_pixels:
+ mov yd, hd
+%if ARCH_X86_64
+ mov mptrq, midq
+ mov dstptrq, dstq
+ add yd, ylimd
+%else
+ mov [esp+8], midq
+ mov [esp+4], dstq
+ add yd, ylimd
+%endif
+ jg .load_threelines
+
+ ; the remainder here is somewhat messy but only runs in very weird
+ ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
+ ; so performance is not terribly important here...
+ je .load_twolines
+ cmp yd, -1
+ je .load_oneline
+ ; h == 1 case
+ mova m5, m3
+ mova m4, m3
+ mova m6, m3
+ jmp .loop
+.load_oneline:
+ ; h == 2 case
+ mova m4, [midq+384*2]
+ mova m5, m4
+ mova m6, m4
+ jmp .loop
+.load_twolines:
+ ; h == 3 case
+ mova m4, [midq+384*2]
+ mova m5, [midq+384*4]
+ mova m6, m5
+ jmp .loop
+.load_threelines:
+ ; h > 3 case
+ mova m4, [midq+384*2]
+ mova m5, [midq+384*4]
+ ; third line loaded in main loop below
+
+ ; main y loop for vertical filter
+.loop_load:
+ ; load one line into m6. if that line is no longer available, do
+ ; nothing, since m6 still has the data from the previous line in it. We
+ ; try to structure the loop so that the common case is evaluated fastest
+ mova m6, [mptrq+384*6]
+.loop:
+%if ARCH_X86_64
+ paddw m7, m0, m6
+ paddw m8, m1, m5
+ paddw m9, m2, m4
+ punpcklwd m10, m7, m8
+ punpckhwd m7, m8
+ punpcklwd m11, m9, m3
+ punpckhwd m9, m3
+ pmaddwd m10, m15
+ pmaddwd m7, m15
+ pmaddwd m11, m14
+ pmaddwd m9, m14
+ paddd m10, m12
+ paddd m7, m12
+ paddd m10, m11
+ paddd m7, m9
+ psrad m10, 11
+ psrad m7, 11
+ packssdw m10, m7
+ packuswb m10, m10
+ movq [dstptrq], m10
+%else
+ mova [esp+0x30], m1
+ mova [esp+0x20], m2
+ mova [esp+0x10], m3
+ paddw m0, m6
+ paddw m1, m5
+ paddw m2, m4
+ punpcklwd m7, m2, m3
+ punpckhwd m2, m3
+ punpcklwd m3, m0, m1
+ punpckhwd m0, m1
+ mova m1, [esp+0x50]
+ pmaddwd m3, m1
+ pmaddwd m0, m1
+ mova m1, [esp+0x40]
+ pmaddwd m7, m1
+ pmaddwd m2, m1
+ paddd m3, [PIC_sym(pd_1024)]
+ paddd m0, [PIC_sym(pd_1024)]
+ paddd m3, m7
+ paddd m0, m2
+ psrad m3, 11
+ psrad m0, 11
+ packssdw m3, m0
+ packuswb m3, m3
+ movq [dstq], m3
+ mova m1, [esp+0x30]
+ mova m2, [esp+0x20]
+ mova m3, [esp+0x10]
+%endif
+ ; shift pixels one position
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ mova m3, m4
+ mova m4, m5
+ mova m5, m6
+ add mptrq, 384*2
+ add dstptrq, strideq
+ dec yd
+ jg .loop_load
+ ; for the bottom pixels, continue using m6 (as extended edge)
+ cmp yd, ylimd
+ jg .loop
+
+%if ARCH_X86_32
+ mov midq, [esp+8]
+ mov dstq, [esp+4]
+%endif
+ add midq, 16
+ add dstq, 8
+ sub wd, 8
+ jg .loop_x
+ RET
+%endmacro
+
+INIT_XMM sse2
+WIENER_H
+WIENER_V
+
+INIT_XMM ssse3
+WIENER_H
+WIENER_V
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; self-guided ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+
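+; MULLD emulates a 32-bit low multiply (pmulld needs SSE4.1) out of 16-bit
+; multiplies; it assumes the multiplier operand has the same 16-bit value in
+; both halves of every dword, and it clobbers m5.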
+%macro MULLD 2
+ pmulhuw m5, %1, %2
+ pmullw %1, %2
+ pslld m5, 16
+ paddd %1, m5
+%endmacro
+
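+; GATHERDD emulates a 4-lane dword gather from the sgr_x_by_x table with
+; scalar extracts and word inserts; it clobbers m5 and r6 and relies on the
+; caller having zeroed m7.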
+%macro GATHERDD 2
+ mova m5, m7
+ movd r6d, %2
+ %if ARCH_X86_64
+ movd %1, [r5+r6]
+ pextrw r6d, %2, 2
+ pinsrw m5, [r5+r6+2], 3
+ pextrw r6d, %2, 4
+ pinsrw %1, [r5+r6+2], 5
+ pextrw r6d, %2, 6
+ pinsrw m5, [r5+r6+2], 7
+ %else
+ movd %1, [PIC_sym(sgr_x_by_x-0xF03)+r6]
+ pextrw r6d, %2, 2
+ pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3
+ pextrw r6d, %2, 4
+ pinsrw %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5
+ pextrw r6d, %2, 6
+ pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7
+ %endif
+ por %1, m5
+%endmacro
+
+%if ARCH_X86_64
+cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ mov xlimd, edgem
+ movifnidn xd, xm
+ mov hd, hm
+ mov edged, xlimd
+ and xlimd, 2 ; have_right
+ add xd, xlimd
+ xor xlimd, 2 ; 2*!have_right
+%else
+cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ %define wq r0m
+ %define xlimd r1m
+ %define hd hmp
+ %define edgeb byte edgem
+
+ mov r6, edgem
+ and r6, 2 ; have_right
+ add xd, r6
+ xor r6, 2 ; 2*!have_right
+ mov xlimd, r6
+ SETUP_PIC r6, 0
+%endif
+
+ jnz .no_right
+ add xd, 7
+ and xd, ~7
+.no_right:
+ pxor m1, m1
+ lea srcq, [srcq+xq]
+ lea sumq, [sumq+xq*2-2]
+ lea sumsqq, [sumsqq+xq*4-4]
+ neg xq
+ mov wq, xq
+%if ARCH_X86_64
+ lea r10, [pb_right_ext_mask+16]
+%endif
+.loop_y:
+ mov xq, wq
+
+ ; load left
+ test edgeb, 1 ; have_left
+ jz .no_left
+ test leftq, leftq
+ jz .load_left_from_main
+ movd m0, [leftq]
+ pslldq m0, 12
+ add leftq, 4
+ jmp .expand_x
+.no_left:
+ movd m0, [srcq+xq]
+ pshufb m0, [PIC_sym(pb_0)]
+ jmp .expand_x
+.load_left_from_main:
+ movd m0, [srcq+xq-2]
+ pslldq m0, 14
+.expand_x:
+ punpckhbw xm0, xm1
+
+ ; when we reach this, m0 contains left two px in highest words
+ cmp xd, -8
+ jle .loop_x
+.partial_load_and_extend:
+ movd m3, [srcq-4]
+ pshufb m3, [PIC_sym(pb_3)]
+ movq m2, [srcq+xq]
+ punpcklbw m2, m1
+ punpcklbw m3, m1
+%if ARCH_X86_64
+ movu m4, [r10+xq*2]
+%else
+ movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
+%endif
+ pand m2, m4
+ pandn m4, m3
+ por m2, m4
+ jmp .loop_x_noload
+.right_extend:
+ pshufb m2, m0, [PIC_sym(pb_14_15)]
+ jmp .loop_x_noload
+
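+ ; each iteration of .loop_x below emits 8 outputs of the horizontal 3-tap
+ ; box sums, roughly sum[x] = p[x-1]+p[x]+p[x+1] and sumsq[x] = the same
+ ; three pixels squared (via pmaddwd on interleaved pairs); the vertical
+ ; pass in sgr_box3_v then adds three such rows together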
+.loop_x:
+ movq m2, [srcq+xq]
+ punpcklbw m2, m1
+.loop_x_noload:
+ palignr m3, m2, m0, 12
+ palignr m4, m2, m0, 14
+
+ punpcklwd m5, m3, m2
+ punpckhwd m6, m3, m2
+ paddw m3, m4
+ punpcklwd m7, m4, m1
+ punpckhwd m4, m1
+ pmaddwd m5, m5
+ pmaddwd m6, m6
+ pmaddwd m7, m7
+ pmaddwd m4, m4
+ paddd m5, m7
+ paddd m6, m4
+ paddw m3, m2
+ movu [sumq+xq*2], m3
+ movu [sumsqq+xq*4+ 0], m5
+ movu [sumsqq+xq*4+16], m6
+
+ mova m0, m2
+ add xq, 8
+
+ ; if x <= -8 we can reload more pixels
+ ; else if x < 0 we reload and extend (this implies have_right=0)
+ ; else if x < xlimd we extend from previous load (this implies have_right=0)
+ ; else we are done
+
+ cmp xd, -8
+ jle .loop_x
+ test xd, xd
+ jl .partial_load_and_extend
+ cmp xd, xlimd
+ jl .right_extend
+
+ add sumsqq, (384+16)*4
+ add sumq, (384+16)*2
+ add srcq, strideq
+ dec hd
+ jg .loop_y
+ RET
+
+%if ARCH_X86_64
+cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
+ movifnidn edged, edgem
+%else
+cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
+ %define sumsq_baseq dword [esp+0]
+ %define sum_baseq dword [esp+4]
+ %define ylimd dword [esp+8]
+ %define m8 [esp+12]
+ mov edged, r4m
+ mov hd, r3m
+%endif
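+ ; for every 8-wide column, accumulate three consecutive rows of sum/sumsq
+ ; in place (roughly out[y] = in[y-1] + in[y] + in[y+1]), edge-extending
+ ; the rows above/below when have_top/have_bottom are not set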
+ mov xq, -2
+%if ARCH_X86_64
+ mov ylimd, edged
+ and ylimd, 8 ; have_bottom
+ shr ylimd, 2
+ sub ylimd, 2 ; -2 if have_bottom=0, else 0
+ mov sumsq_baseq, sumsqq
+ mov sum_baseq, sumq
+.loop_x:
+ mov sumsqq, sumsq_baseq
+ mov sumq, sum_baseq
+ lea yd, [hq+ylimq+2]
+%else
+ mov yd, edged
+ and yd, 8 ; have_bottom
+ shr yd, 2
+ sub yd, 2 ; -2 if have_bottom=0, else 0
+ mov sumsq_baseq, sumsqq
+ mov sum_baseq, sumq
+ mov ylimd, yd
+.loop_x:
+ mov sumsqd, sumsq_baseq
+ mov sumd, sum_baseq
+ lea yd, [hq+2]
+ add yd, ylimd
+%endif
+ lea sumsqq, [sumsqq+xq*4+4-(384+16)*4]
+ lea sumq, [sumq+xq*2+2-(384+16)*2]
+ test edgeb, 4 ; have_top
+ jnz .load_top
+ movu m0, [sumsqq+(384+16)*4*1]
+ movu m1, [sumsqq+(384+16)*4*1+16]
+ mova m2, m0
+ mova m3, m1
+ mova m4, m0
+ mova m5, m1
+ movu m6, [sumq+(384+16)*2*1]
+ mova m7, m6
+ mova m8, m6
+ jmp .loop_y_noload
+.load_top:
+ movu m0, [sumsqq-(384+16)*4*1] ; l2sq [left]
+ movu m1, [sumsqq-(384+16)*4*1+16] ; l2sq [right]
+ movu m2, [sumsqq-(384+16)*4*0] ; l1sq [left]
+ movu m3, [sumsqq-(384+16)*4*0+16] ; l1sq [right]
+ movu m6, [sumq-(384+16)*2*1] ; l2
+ movu m7, [sumq-(384+16)*2*0] ; l1
+.loop_y:
+%if ARCH_X86_64
+ movu m8, [sumq+(384+16)*2*1] ; l0
+%else
+ movu m4, [sumq+(384+16)*2*1] ; l0
+ mova m8, m4
+%endif
+ movu m4, [sumsqq+(384+16)*4*1] ; l0sq [left]
+ movu m5, [sumsqq+(384+16)*4*1+16] ; l0sq [right]
+.loop_y_noload:
+ paddd m0, m2
+ paddd m1, m3
+ paddw m6, m7
+ paddd m0, m4
+ paddd m1, m5
+ paddw m6, m8
+ movu [sumsqq+ 0], m0
+ movu [sumsqq+16], m1
+ movu [sumq], m6
+
+ ; shift position down by one
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+ mova m6, m7
+ mova m7, m8
+ add sumsqq, (384+16)*4
+ add sumq, (384+16)*2
+ dec yd
+ jg .loop_y
+ cmp yd, ylimd
+ jg .loop_y_noload
+ add xd, 8
+ cmp xd, wd
+ jl .loop_x
+ RET
+
+cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
+ movifnidn sd, sm
+ sub aq, (384+16-1)*4
+ sub bq, (384+16-1)*2
+ add hd, 2
+%if ARCH_X86_64
+ LEA r5, sgr_x_by_x-0xF03
+%else
+ SETUP_PIC r5, 0
+%endif
+ movd m6, sd
+ pshuflw m6, m6, q0000
+ punpcklqdq m6, m6
+ pxor m7, m7
+ DEFINE_ARGS a, b, w, h, x
+%if ARCH_X86_64
+ mova m8, [pd_0xF00801C7]
+ mova m9, [pw_256]
+ psrld m10, m9, 13 ; pd_2048
+ mova m11, [pb_unpcklwdw]
+%else
+ %define m8 [PIC_sym(pd_0xF00801C7)]
+ %define m9 [PIC_sym(pw_256)]
+ %define m10 [PIC_sym(pd_2048)]
+ %define m11 [PIC_sym(pb_unpcklwdw)]
+%endif
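+ ; rough scalar equivalent of one iteration (3x3 boxes, n = 9):
+ ;   p = a*9 - b*b
+ ;   z = min((p*s + (1 << 19)) >> 20, 255) ; via the saturating add below
+ ;   x = sgr_x_by_x[z]
+ ;   b_out = 256 - x
+ ;   a_out = (x*b*455 + (1 << 11)) >> 12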
+.loop_y:
+ mov xq, -2
+.loop_x:
+ movq m0, [bq+xq*2]
+ movq m1, [bq+xq*2+(384+16)*2]
+ punpcklwd m0, m7
+ punpcklwd m1, m7
+ movu m2, [aq+xq*4]
+ movu m3, [aq+xq*4+(384+16)*4]
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m2, m4 ; aa * 9
+ paddd m3, m5
+ pmaddwd m4, m0, m0
+ pmaddwd m5, m1, m1
+ pmaddwd m0, m8
+ pmaddwd m1, m8
+ psubd m2, m4 ; p = aa * 9 - bb * bb
+ psubd m3, m5
+ MULLD m2, m6
+ MULLD m3, m6
+ paddusw m2, m8
+ paddusw m3, m8
+ psrld m2, 20 ; z
+ psrld m3, 20
+ GATHERDD m4, m2 ; xx
+ GATHERDD m2, m3
+ psrld m4, 24
+ psrld m2, 24
+ packssdw m3, m4, m2
+ pshufb m4, m11
+ MULLD m0, m4
+ pshufb m2, m11
+ MULLD m1, m2
+ psubw m5, m9, m3
+ paddd m0, m10
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ movq [bq+xq*2], m5
+ psrldq m5, 8
+ movq [bq+xq*2+(384+16)*2], m5
+ movu [aq+xq*4], m0
+ movu [aq+xq*4+(384+16)*4], m1
+ add xd, 4
+ cmp xd, wd
+ jl .loop_x
+ add aq, (384+16)*4*2
+ add bq, (384+16)*2*2
+ sub hd, 2
+ jg .loop_y
+ RET
+
+%if ARCH_X86_64
+cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
+ tmp_base, src_base, a_base, b_base, x, y
+ movifnidn wd, wm
+ mov hd, hm
+ mova m15, [pw_16]
+ mov tmp_baseq, tq
+ mov src_baseq, srcq
+ mov a_baseq, aq
+ mov b_baseq, bq
+ xor xd, xd
+%else
+cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
+ %define tmp_baseq [esp+8]
+ %define src_baseq [esp+12]
+ %define a_baseq [esp+16]
+ %define b_baseq [esp+20]
+ %define wd [esp+24]
+ %define hd [esp+28]
+ mov tmp_baseq, tq
+ mov src_baseq, srcq
+ mov a_baseq, aq
+ mov b_baseq, bq
+ mov wd, xd
+ mov hd, yd
+ xor xd, xd
+ SETUP_PIC yd, 1, 1
+ jmp .loop_start
+%endif
+
+.loop_x:
+ mov tq, tmp_baseq
+ mov srcq, src_baseq
+ mov aq, a_baseq
+ mov bq, b_baseq
+%if ARCH_X86_32
+.loop_start:
+ movu m0, [bq+xq*2-(384+16)*2-2]
+ movu m2, [bq+xq*2-(384+16)*2+2]
+ mova m1, [bq+xq*2-(384+16)*2] ; b:top
+ paddw m0, m2 ; b:tl+tr
+ movu m2, [bq+xq*2-2]
+ movu m3, [bq+xq*2+2]
+ paddw m1, [bq+xq*2] ; b:top+ctr
+ paddw m2, m3 ; b:l+r
+ mova [esp+0x80], m0
+ mova [esp+0x70], m1
+ mova [esp+0x60], m2
+%endif
+ movu m0, [aq+xq*4-(384+16)*4-4]
+ movu m2, [aq+xq*4-(384+16)*4+4]
+ mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half]
+ paddd m0, m2 ; a:tl+tr [first half]
+ movu m2, [aq+xq*4-(384+16)*4-4+16]
+ movu m4, [aq+xq*4-(384+16)*4+4+16]
+ mova m3, [aq+xq*4-(384+16)*4+16] ; a:top [second half]
+ paddd m2, m4 ; a:tl+tr [second half]
+ movu m4, [aq+xq*4-4]
+ movu m5, [aq+xq*4+4]
+ paddd m1, [aq+xq*4] ; a:top+ctr [first half]
+ paddd m4, m5 ; a:l+r [first half]
+ movu m5, [aq+xq*4+16-4]
+ movu m6, [aq+xq*4+16+4]
+ paddd m3, [aq+xq*4+16] ; a:top+ctr [second half]
+ paddd m5, m6 ; a:l+r [second half]
+%if ARCH_X86_64
+ movu m6, [bq+xq*2-(384+16)*2-2]
+ movu m8, [bq+xq*2-(384+16)*2+2]
+ mova m7, [bq+xq*2-(384+16)*2] ; b:top
+ paddw m6, m8 ; b:tl+tr
+ movu m8, [bq+xq*2-2]
+ movu m9, [bq+xq*2+2]
+ paddw m7, [bq+xq*2] ; b:top+ctr
+ paddw m8, m9 ; b:l+r
+%endif
+
+ lea tq, [tq+xq*2]
+ lea srcq, [srcq+xq*1]
+ lea aq, [aq+xq*4+(384+16)*4]
+ lea bq, [bq+xq*2+(384+16)*2]
+ mov yd, hd
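+ ; the weighted 3x3 neighbourhood sums below ("aa" from the b plane, "bb"
+ ; from the a plane) use weight 4 for the center/edge taps and 3 for the
+ ; diagonals, computed as 4*(sum of all 9) - diagonals; each output is then
+ ; roughly t[x] = (aa*src[x] + bb + (1 << 8)) >> 9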
+.loop_y:
+%if ARCH_X86_64
+ movu m9, [bq-2]
+ movu m10, [bq+2]
+ paddw m7, [bq] ; b:top+ctr+bottom
+ paddw m9, m10 ; b:bl+br
+ paddw m10, m7, m8 ; b:top+ctr+bottom+l+r
+ paddw m6, m9 ; b:tl+tr+bl+br
+ psubw m7, [bq-(384+16)*2*2] ; b:ctr+bottom
+ paddw m10, m6
+ psllw m10, 2
+ psubw m10, m6 ; aa
+ pxor m14, m14
+ movq m12, [srcq]
+ punpcklbw m12, m14
+ punpcklwd m6, m10, m15
+ punpckhwd m10, m15
+ punpcklwd m13, m12, m15
+ punpckhwd m12, m15
+ pmaddwd m6, m13 ; aa*src[x]+256 [first half]
+ pmaddwd m10, m12 ; aa*src[x]+256 [second half]
+%else
+ paddd m1, [aq] ; a:top+ctr+bottom [first half]
+ paddd m3, [aq+16] ; a:top+ctr+bottom [second half]
+ mova [esp+0x50], m1
+ mova [esp+0x40], m3
+ mova [esp+0x30], m4
+ movu m6, [aq-4]
+ movu m7, [aq+4]
+ paddd m1, m4 ; a:top+ctr+bottom+l+r [first half]
+ paddd m3, m5 ; a:top+ctr+bottom+l+r [second half]
+ paddd m6, m7 ; a:bl+br [first half]
+ movu m7, [aq+16-4]
+ movu m4, [aq+16+4]
+ paddd m7, m4 ; a:bl+br [second half]
+ paddd m0, m6 ; a:tl+tr+bl+br [first half]
+ paddd m2, m7 ; a:tl+tr+bl+br [second half]
+ paddd m1, m0
+ paddd m3, m2
+ pslld m1, 2
+ pslld m3, 2
+ psubd m1, m0 ; bb [first half]
+ psubd m3, m2 ; bb [second half]
+%endif
+
+%if ARCH_X86_64
+ movu m11, [aq-4]
+ movu m12, [aq+4]
+ paddd m1, [aq] ; a:top+ctr+bottom [first half]
+ paddd m11, m12 ; a:bl+br [first half]
+ movu m12, [aq+16-4]
+ movu m13, [aq+16+4]
+ paddd m3, [aq+16] ; a:top+ctr+bottom [second half]
+ paddd m12, m13 ; a:bl+br [second half]
+ paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half]
+ paddd m14, m3, m5 ; a:top+ctr+bottom+l+r [second half]
+ paddd m0, m11 ; a:tl+tr+bl+br [first half]
+ paddd m2, m12 ; a:tl+tr+bl+br [second half]
+ paddd m13, m0
+ paddd m14, m2
+ pslld m13, 2
+ pslld m14, 2
+ psubd m13, m0 ; bb [first half]
+ psubd m14, m2 ; bb [second half]
+ psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half]
+ psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half]
+%else
+ mova m4, [esp+0x80]
+ mova [esp+0x80], m5
+ mova m5, [esp+0x70]
+ mova [esp+0x70], m6
+ mova m6, [esp+0x60]
+ mova [esp+0x60], m7
+ mova [esp+0x20], m1
+ movu m7, [bq-2]
+ movu m1, [bq+2]
+ paddw m5, [bq] ; b:top+ctr+bottom
+ paddw m7, m1
+ paddw m1, m5, m6 ; b:top+ctr+bottom+l+r
+ paddw m4, m7 ; b:tl+tr+bl+br
+ psubw m5, [bq-(384+16)*2*2] ; b:ctr+bottom
+ paddw m1, m4
+ psllw m1, 2
+ psubw m1, m4 ; aa
+ movq m0, [srcq]
+ XCHG_PIC_REG
+ punpcklbw m0, [PIC_sym(pb_right_ext_mask)+16]
+ punpcklwd m4, m1, [PIC_sym(pw_16)]
+ punpckhwd m1, [PIC_sym(pw_16)]
+ punpcklwd m2, m0, [PIC_sym(pw_16)]
+ punpckhwd m0, [PIC_sym(pw_16)]
+ XCHG_PIC_REG
+ pmaddwd m4, m2 ; aa*src[x]+256 [first half]
+ pmaddwd m1, m0 ; aa*src[x]+256 [second half]
+%endif
+
+%if ARCH_X86_64
+ paddd m6, m13
+ paddd m10, m14
+ psrad m6, 9
+ psrad m10, 9
+ packssdw m6, m10
+ mova [tq], m6
+%else
+ paddd m4, [esp+0x20]
+ paddd m1, m3
+ psrad m4, 9
+ psrad m1, 9
+ packssdw m4, m1
+ mova [tq], m4
+%endif
+
+ ; shift to next row
+%if ARCH_X86_64
+ mova m0, m4
+ mova m2, m5
+ mova m4, m11
+ mova m5, m12
+ mova m6, m8
+ mova m8, m9
+%else
+ mova m1, [esp+0x50]
+ mova m3, [esp+0x40]
+ mova m0, [esp+0x30]
+ mova m2, [esp+0x80]
+ mova m4, [esp+0x70]
+ mova [esp+0x70], m5
+ mova m5, [esp+0x60]
+ mova [esp+0x80], m6
+ mova [esp+0x60], m7
+ psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half]
+ psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half]
+%endif
+
+ add srcq, strideq
+ add aq, (384+16)*4
+ add bq, (384+16)*2
+ add tq, 384*2
+ dec yd
+ jg .loop_y
+ add xd, 8
+ cmp xd, wd
+ jl .loop_x
+ RET
+
+cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
+ movifnidn hd, hm
+%if ARCH_X86_32
+ SETUP_PIC r6, 0
+%endif
+ movd m0, wtm
+ pshufb m0, [PIC_sym(pb_0_1)]
+ psllw m0, 4
+ pxor m7, m7
+ DEFINE_ARGS dst, stride, t, w, h, idx
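+ ; rough scalar equivalent, with u = dst[x] << 4:
+ ;   dst[x] = clip_u8(dst[x] + (((t[x] - u)*wt + (1 << 10)) >> 11))
+ ; the pmulhrsw below implements the scaled multiply (wt is pre-shifted by 4)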
+.loop_y:
+ xor idxd, idxd
+.loop_x:
+ mova m1, [tq+idxq*2+ 0]
+ mova m4, [tq+idxq*2+16]
+ mova m5, [dstq+idxq]
+ punpcklbw m2, m5, m7
+ punpckhbw m5, m7
+ psllw m3, m2, 4
+ psllw m6, m5, 4
+ psubw m1, m3
+ psubw m4, m6
+ pmulhrsw m1, m0
+ pmulhrsw m4, m0
+ paddw m1, m2
+ paddw m4, m5
+ packuswb m1, m4
+ mova [dstq+idxq], m1
+ add idxd, 16
+ cmp idxd, wd
+ jl .loop_x
+ add dstq, strideq
+ add tq, 384 * 2
+ dec hd
+ jg .loop_y
+ RET
+
+%if ARCH_X86_64
+cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+ mov edged, edgem
+ movifnidn wd, wm
+ mov hd, hm
+ mova m10, [pb_0]
+ mova m11, [pb_0_1]
+%else
+cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
+ %define edgeb byte edgem
+ %define wd xd
+ %define wq wd
+ %define wm r5m
+ %define strideq r4m
+ SUB esp, 8
+ SETUP_PIC sumsqd, 1, 1
+
+ %define m10 [PIC_sym(pb_0)]
+ %define m11 [PIC_sym(pb_0_1)]
+%endif
+
+ test edgeb, 2 ; have_right
+ jz .no_right
+ xor xlimd, xlimd
+ add wd, 2
+ add wd, 15
+ and wd, ~15
+ jmp .right_done
+.no_right:
+ mov xlimd, 3
+ dec wd
+.right_done:
+ pxor m1, m1
+ lea srcq, [srcq+wq+1]
+ lea sumq, [sumq+wq*2-2]
+ lea sumsqq, [sumsqq+wq*4-4]
+ neg wq
+%if ARCH_X86_64
+ lea r10, [pb_right_ext_mask+16]
+%else
+ mov wm, xd
+ %define wq wm
+%endif
+
+.loop_y:
+ mov xq, wq
+ ; load left
+ test edgeb, 1 ; have_left
+ jz .no_left
+ test leftq, leftq
+ jz .load_left_from_main
+ movd m0, [leftq]
+ movd m2, [srcq+xq-1]
+ pslldq m2, 4
+ por m0, m2
+ pslldq m0, 11
+ add leftq, 4
+ jmp .expand_x
+.no_left:
+ movd m0, [srcq+xq-1]
+ XCHG_PIC_REG
+ pshufb m0, m10
+ XCHG_PIC_REG
+ jmp .expand_x
+.load_left_from_main:
+ movd m0, [srcq+xq-4]
+ pslldq m0, 12
+.expand_x:
+ punpckhbw m0, m1
+
+ ; when we reach this, m0 contains left two px in highest words
+ cmp xd, -8
+ jle .loop_x
+ test xd, xd
+ jge .right_extend
+.partial_load_and_extend:
+ XCHG_PIC_REG
+ movd m3, [srcq-1]
+ movq m2, [srcq+xq]
+ pshufb m3, m10
+ punpcklbw m3, m1
+ punpcklbw m2, m1
+%if ARCH_X86_64
+ movu m4, [r10+xq*2]
+%else
+ movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
+ XCHG_PIC_REG
+%endif
+ pand m2, m4
+ pandn m4, m3
+ por m2, m4
+ jmp .loop_x_noload
+.right_extend:
+ psrldq m2, m0, 14
+ XCHG_PIC_REG
+ pshufb m2, m11
+ XCHG_PIC_REG
+ jmp .loop_x_noload
+
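+ ; same idea as sgr_box3_h, but with 5-tap horizontal box sums: each output
+ ; is the sum of five adjacent pixels (and of their squares)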
+.loop_x:
+ movq m2, [srcq+xq]
+ punpcklbw m2, m1
+.loop_x_noload:
+ palignr m3, m2, m0, 8
+ palignr m4, m2, m0, 10
+ palignr m5, m2, m0, 12
+ palignr m6, m2, m0, 14
+
+%if ARCH_X86_64
+ paddw m0, m3, m2
+ punpcklwd m7, m3, m2
+ punpckhwd m3, m2
+ paddw m0, m4
+ punpcklwd m8, m4, m5
+ punpckhwd m4, m5
+ paddw m0, m5
+ punpcklwd m9, m6, m1
+ punpckhwd m5, m6, m1
+ paddw m0, m6
+ pmaddwd m7, m7
+ pmaddwd m3, m3
+ pmaddwd m8, m8
+ pmaddwd m4, m4
+ pmaddwd m9, m9
+ pmaddwd m5, m5
+ paddd m7, m8
+ paddd m3, m4
+ paddd m7, m9
+ paddd m3, m5
+ movu [sumq+xq*2], m0
+ movu [sumsqq+xq*4+ 0], m7
+ movu [sumsqq+xq*4+16], m3
+%else
+ paddw m0, m3, m2
+ paddw m0, m4
+ paddw m0, m5
+ paddw m0, m6
+ movu [sumq+xq*2], m0
+ punpcklwd m7, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m0, m4, m5
+ punpckhwd m4, m5
+ punpckhwd m5, m6, m1
+ pmaddwd m7, m7
+ pmaddwd m3, m3
+ pmaddwd m0, m0
+ pmaddwd m4, m4
+ pmaddwd m5, m5
+ paddd m7, m0
+ paddd m3, m4
+ paddd m3, m5
+ punpcklwd m0, m6, m1
+ pmaddwd m0, m0
+ paddd m7, m0
+ movu [sumsqq+xq*4+ 0], m7
+ movu [sumsqq+xq*4+16], m3
+%endif
+
+ mova m0, m2
+ add xq, 8
+
+ ; if x <= -8 we can reload more pixels
+ ; else if x < 0 we reload and extend (this implies have_right=0)
+ ; else if x < xlimd we extend from previous load (this implies have_right=0)
+ ; else we are done
+
+ cmp xd, -8
+ jle .loop_x
+ test xd, xd
+ jl .partial_load_and_extend
+ cmp xd, xlimd
+ jl .right_extend
+
+ add srcq, strideq
+ add sumsqq, (384+16)*4
+ add sumq, (384+16)*2
+ dec hd
+ jg .loop_y
+%if ARCH_X86_32
+ ADD esp, 8
+%endif
+ RET
+
+%if ARCH_X86_64
+cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+ movifnidn edged, edgem
+ mov ylimd, edged
+%else
+cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
+ %define wm [esp+0]
+ %define hm [esp+4]
+ %define edgem [esp+8]
+ mov wm, xd
+ mov hm, yd
+ mov edgem, ylimd
+%endif
+
+ and ylimd, 8 ; have_bottom
+ shr ylimd, 2
+ sub ylimd, 3 ; -3 if have_bottom=0, else -1
+ mov xq, -2
+%if ARCH_X86_64
+.loop_x:
+ lea yd, [hd+ylimd+2]
+ lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
+ lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2]
+ test edgeb, 4 ; have_top
+ jnz .load_top
+ movu m0, [sumsq_ptrq+(384+16)*4*1]
+ movu m1, [sumsq_ptrq+(384+16)*4*1+16]
+ mova m2, m0
+ mova m3, m1
+ mova m4, m0
+ mova m5, m1
+ mova m6, m0
+ mova m7, m1
+ movu m10, [sum_ptrq+(384+16)*2*1]
+ mova m11, m10
+ mova m12, m10
+ mova m13, m10
+ jmp .loop_y_second_load
+.load_top:
+ movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left]
+ movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right]
+ movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left]
+ movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right]
+ mova m2, m0
+ mova m3, m1
+ movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4
+ movu m12, [sum_ptrq-(384+16)*2*0] ; l2
+ mova m11, m10
+.loop_y:
+ movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left]
+ movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right]
+ movu m13, [sum_ptrq+(384+16)*2*1] ; l1
+.loop_y_second_load:
+ test yd, yd
+ jle .emulate_second_load
+ movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left]
+ movu m9, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right]
+ movu m14, [sum_ptrq+(384+16)*2*2] ; l0
+.loop_y_noload:
+ paddd m0, m2
+ paddd m1, m3
+ paddw m10, m11
+ paddd m0, m4
+ paddd m1, m5
+ paddw m10, m12
+ paddd m0, m6
+ paddd m1, m7
+ paddw m10, m13
+ paddd m0, m8
+ paddd m1, m9
+ paddw m10, m14
+ movu [sumsq_ptrq+ 0], m0
+ movu [sumsq_ptrq+16], m1
+ movu [sum_ptrq], m10
+
+ ; shift position down by one
+ mova m0, m4
+ mova m1, m5
+ mova m2, m6
+ mova m3, m7
+ mova m4, m8
+ mova m5, m9
+ mova m10, m12
+ mova m11, m13
+ mova m12, m14
+ add sumsq_ptrq, (384+16)*4*2
+ add sum_ptrq, (384+16)*2*2
+ sub yd, 2
+ jge .loop_y
+ ; l1 = l0
+ mova m6, m8
+ mova m7, m9
+ mova m13, m14
+ cmp yd, ylimd
+ jg .loop_y_noload
+ add xd, 8
+ cmp xd, wd
+ jl .loop_x
+ RET
+.emulate_second_load:
+ mova m8, m6
+ mova m9, m7
+ mova m14, m13
+ jmp .loop_y_noload
+%else
+.sumsq_loop_x:
+ lea yd, [ylimd+2]
+ add yd, hm
+ lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
+ test byte edgem, 4 ; have_top
+ jnz .sumsq_load_top
+ movu m0, [sumsq_ptrq+(384+16)*4*1]
+ movu m1, [sumsq_ptrq+(384+16)*4*1+16]
+ mova m4, m0
+ mova m5, m1
+ mova m6, m0
+ mova m7, m1
+ mova [esp+0x1c], m0
+ mova [esp+0x0c], m1
+ jmp .sumsq_loop_y_second_load
+.sumsq_load_top:
+ movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left]
+ movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right]
+ movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left]
+ movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right]
+ mova [esp+0x1c], m0
+ mova [esp+0x0c], m1
+.sumsq_loop_y:
+ movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left]
+ movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right]
+.sumsq_loop_y_second_load:
+ test yd, yd
+ jle .sumsq_emulate_second_load
+ movu m2, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left]
+ movu m3, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right]
+.sumsq_loop_y_noload:
+ paddd m0, [esp+0x1c]
+ paddd m1, [esp+0x0c]
+ paddd m0, m4
+ paddd m1, m5
+ paddd m0, m6
+ paddd m1, m7
+ paddd m0, m2
+ paddd m1, m3
+ movu [sumsq_ptrq+ 0], m0
+ movu [sumsq_ptrq+16], m1
+
+ ; shift position down by one
+ mova m0, m4
+ mova m1, m5
+ mova m4, m2
+ mova m5, m3
+ mova [esp+0x1c], m6
+ mova [esp+0x0c], m7
+ add sumsq_ptrq, (384+16)*4*2
+ sub yd, 2
+ jge .sumsq_loop_y
+ ; l1 = l0
+ mova m6, m2
+ mova m7, m3
+ cmp yd, ylimd
+ jg .sumsq_loop_y_noload
+ add xd, 8
+ cmp xd, wm
+ jl .sumsq_loop_x
+
+ mov xd, -2
+.sum_loop_x:
+ lea yd, [ylimd+2]
+ add yd, hm
+ lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
+ test byte edgem, 4 ; have_top
+ jnz .sum_load_top
+ movu m0, [sum_ptrq+(384+16)*2*1]
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ jmp .sum_loop_y_second_load
+.sum_load_top:
+ movu m0, [sum_ptrq-(384+16)*2*1] ; l3/4
+ movu m2, [sum_ptrq-(384+16)*2*0] ; l2
+ mova m1, m0
+.sum_loop_y:
+ movu m3, [sum_ptrq+(384+16)*2*1] ; l1
+.sum_loop_y_second_load:
+ test yd, yd
+ jle .sum_emulate_second_load
+ movu m4, [sum_ptrq+(384+16)*2*2] ; l0
+.sum_loop_y_noload:
+ paddw m0, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movu [sum_ptrq], m0
+
+ ; shift position down by one
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ add sum_ptrq, (384+16)*2*2
+ sub yd, 2
+ jge .sum_loop_y
+ ; l1 = l0
+ mova m3, m4
+ cmp yd, ylimd
+ jg .sum_loop_y_noload
+ add xd, 8
+ cmp xd, wm
+ jl .sum_loop_x
+ RET
+.sumsq_emulate_second_load:
+ mova m2, m6
+ mova m3, m7
+ jmp .sumsq_loop_y_noload
+.sum_emulate_second_load:
+ mova m4, m3
+ jmp .sum_loop_y_noload
+%endif
+
+cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
+ movifnidn sd, sm
+ sub aq, (384+16-1)*4
+ sub bq, (384+16-1)*2
+ add hd, 2
+%if ARCH_X86_64
+ LEA r5, sgr_x_by_x-0xF03
+%else
+ SETUP_PIC r5, 0
+%endif
+ movd m6, sd
+ pshuflw m6, m6, q0000
+ punpcklqdq m6, m6
+ pxor m7, m7
+ DEFINE_ARGS a, b, w, h, x
+%if ARCH_X86_64
+ mova m8, [pd_0xF0080029]
+ mova m9, [pw_256]
+ psrld m10, m9, 15 ; pd_512
+%else
+ %define m8 [PIC_sym(pd_0xF0080029)]
+ %define m9 [PIC_sym(pw_256)]
+ %define m10 [PIC_sym(pd_512)]
+%endif
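+ ; same scheme as sgr_calc_ab1 but for 5x5 boxes: p = a*25 - b*b, and the
+ ; final scale is (x*b*41 + (1 << 9)) >> 10 (i.e. 164/2^12 in reduced form)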
+.loop_y:
+ mov xq, -2
+.loop_x:
+ movq m0, [bq+xq*2+0]
+ movq m1, [bq+xq*2+8]
+ punpcklwd m0, m7
+ punpcklwd m1, m7
+ movu m2, [aq+xq*4+ 0]
+ movu m3, [aq+xq*4+16]
+ pslld m4, m2, 3 ; aa * 8
+ pslld m5, m3, 3
+ paddd m2, m4 ; aa * 9
+ paddd m3, m5
+ paddd m4, m4 ; aa * 16
+ paddd m5, m5
+ paddd m2, m4 ; aa * 25
+ paddd m3, m5
+ pmaddwd m4, m0, m0
+ pmaddwd m5, m1, m1
+ psubd m2, m4 ; p = aa * 25 - bb * bb
+ psubd m3, m5
+ MULLD m2, m6
+ MULLD m3, m6
+ paddusw m2, m8
+ paddusw m3, m8
+ psrld m2, 20 ; z
+ psrld m3, 20
+ GATHERDD m4, m2 ; xx
+ GATHERDD m2, m3
+ psrld m4, 24
+ psrld m2, 24
+ packssdw m3, m4, m2
+ pmullw m4, m8
+ pmullw m2, m8
+ psubw m5, m9, m3
+ pmaddwd m0, m4
+ pmaddwd m1, m2
+ paddd m0, m10
+ paddd m1, m10
+ psrld m0, 10
+ psrld m1, 10
+ movu [bq+xq*2], m5
+ movu [aq+xq*4+ 0], m0
+ movu [aq+xq*4+16], m1
+ add xd, 8
+ cmp xd, wd
+ jl .loop_x
+ add aq, (384+16)*4*2
+ add bq, (384+16)*2*2
+ sub hd, 2
+ jg .loop_y
+ RET
+
+%if ARCH_X86_64
+cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
+ tmp_base, src_base, a_base, b_base, x, y
+ movifnidn wd, wm
+ mov hd, hm
+ mov tmp_baseq, tq
+ mov src_baseq, srcq
+ mov a_baseq, aq
+ mov b_baseq, bq
+ mova m9, [pw_5_6]
+ mova m12, [pw_256]
+ psrlw m10, m12, 8 ; pw_1
+ psrlw m11, m12, 1 ; pw_128
+ pxor m13, m13
+%else
+cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
+ %define tmp_baseq r0m
+ %define src_baseq r1m
+ %define a_baseq r3m
+ %define b_baseq r4m
+ %define wd r5m
+ %define hd r6m
+
+ SUB esp, 8
+ SETUP_PIC yd
+
+ %define m8 m5
+ %define m9 [PIC_sym(pw_5_6)]
+ %define m10 [PIC_sym(pw_1)]
+ %define m11 [PIC_sym(pw_128)]
+ %define m12 [PIC_sym(pw_256)]
+ %define m13 m0
+%endif
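+ ; the 5x5 variant produces two output rows per pass: rows that carry box
+ ; sums are weighted horizontally as 5*(left+right) + 6*center (pw_5_6),
+ ; and the row in between roughly reuses the sum of the two neighbouring
+ ; weighted rows (hence the >> 9 there versus >> 8)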
+ xor xd, xd
+.loop_x:
+ mov tq, tmp_baseq
+ mov srcq, src_baseq
+ mov aq, a_baseq
+ mov bq, b_baseq
+ movu m0, [aq+xq*4-(384+16)*4-4]
+ mova m1, [aq+xq*4-(384+16)*4]
+ movu m2, [aq+xq*4-(384+16)*4+4]
+ movu m3, [aq+xq*4-(384+16)*4-4+16]
+ mova m4, [aq+xq*4-(384+16)*4+16]
+ movu m5, [aq+xq*4-(384+16)*4+4+16]
+ paddd m0, m2
+ paddd m3, m5
+ paddd m0, m1
+ paddd m3, m4
+ pslld m2, m0, 2
+ pslld m5, m3, 2
+ paddd m2, m0
+ paddd m5, m3
+ paddd m0, m2, m1 ; prev_odd_b [first half]
+ paddd m1, m5, m4 ; prev_odd_b [second half]
+ movu m3, [bq+xq*2-(384+16)*2-2]
+ mova m4, [bq+xq*2-(384+16)*2]
+ movu m5, [bq+xq*2-(384+16)*2+2]
+ paddw m3, m5
+ punpcklwd m5, m3, m4
+ punpckhwd m3, m4
+ pmaddwd m5, m9
+ pmaddwd m3, m9
+ mova m2, m5
+ packssdw m2, m3 ; prev_odd_a
+ lea tq, [tq+xq*2]
+ lea srcq, [srcq+xq*1]
+ lea aq, [aq+xq*4+(384+16)*4]
+ lea bq, [bq+xq*2+(384+16)*2]
+%if ARCH_X86_32
+ mov [esp], PIC_reg
+%endif
+ mov yd, hd
+ XCHG_PIC_REG
+.loop_y:
+ movu m3, [aq-4]
+ mova m4, [aq]
+ movu m5, [aq+4]
+ paddd m3, m5
+ paddd m3, m4
+ pslld m5, m3, 2
+ paddd m5, m3
+ paddd m5, m4 ; cur_odd_b [first half]
+ movu m3, [aq+16-4]
+ mova m6, [aq+16]
+ movu m7, [aq+16+4]
+ paddd m3, m7
+ paddd m3, m6
+ pslld m7, m3, 2
+ paddd m7, m3
+ paddd m4, m7, m6 ; cur_odd_b [second half]
+ movu m3, [bq-2]
+ mova m6, [bq]
+ movu m7, [bq+2]
+ paddw m3, m7
+ punpcklwd m7, m3, m6
+ punpckhwd m3, m6
+ pmaddwd m7, m9
+ pmaddwd m3, m9
+ packssdw m6, m7, m3 ; cur_odd_a
+
+ paddd m0, m5 ; cur_even_b [first half]
+ paddd m1, m4 ; cur_even_b [second half]
+ paddw m2, m6 ; cur_even_a
+
+ movq m3, [srcq]
+%if ARCH_X86_64
+ punpcklbw m3, m13
+%else
+ mova [td], m5
+ pxor m7, m7
+ punpcklbw m3, m7
+%endif
+ punpcklwd m7, m3, m10
+ punpckhwd m3, m10
+ punpcklwd m8, m2, m12
+ punpckhwd m2, m12
+ pmaddwd m7, m8
+ pmaddwd m3, m2
+ paddd m7, m0
+ paddd m3, m1
+ psrad m7, 9
+ psrad m3, 9
+
+%if ARCH_X86_32
+ pxor m13, m13
+%endif
+ movq m8, [srcq+strideq]
+ punpcklbw m8, m13
+ punpcklwd m0, m8, m10
+ punpckhwd m8, m10
+ punpcklwd m1, m6, m11
+ punpckhwd m2, m6, m11
+ pmaddwd m0, m1
+ pmaddwd m8, m2
+%if ARCH_X86_64
+ paddd m0, m5
+%else
+ paddd m0, [td]
+%endif
+ paddd m8, m4
+ psrad m0, 8
+ psrad m8, 8
+
+ packssdw m7, m3
+ packssdw m0, m8
+%if ARCH_X86_32
+ mova m5, [td]
+%endif
+ mova [tq+384*2*0], m7
+ mova [tq+384*2*1], m0
+
+ mova m0, m5
+ mova m1, m4
+ mova m2, m6
+ add aq, (384+16)*4*2
+ add bq, (384+16)*2*2
+ add tq, 384*2*2
+ lea srcq, [srcq+strideq*2]
+%if ARCH_X86_64
+ sub yd, 2
+%else
+ sub dword [esp+4], 2
+%endif
+ jg .loop_y
+ add xd, 8
+ cmp xd, wd
+ jl .loop_x
+%if ARCH_X86_32
+ ADD esp, 8
+%endif
+ RET
+
+cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
+ movifnidn wd, wm
+ movd m0, wtm
+%if ARCH_X86_64
+ movifnidn hd, hm
+ mova m10, [pd_1024]
+ pxor m11, m11
+%else
+ SETUP_PIC hd, 0
+ %define m10 [PIC_sym(pd_1024)]
+ %define m11 m7
+%endif
+ pshufd m0, m0, 0
+ DEFINE_ARGS dst, stride, t1, t2, w, h, idx
+%if ARCH_X86_32
+ %define hd hmp
+%endif
+
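+ ; rough scalar equivalent, with u = dst[x] << 4 and (wt0, wt1) being the
+ ; two 16-bit weights packed in wtm:
+ ;   dst[x] = clip_u8(dst[x] +
+ ;            ((wt0*(t1[x] - u) + wt1*(t2[x] - u) + (1 << 10)) >> 11))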
+.loop_y:
+ xor idxd, idxd
+.loop_x:
+ mova m1, [t1q+idxq*2+ 0]
+ mova m2, [t1q+idxq*2+16]
+ mova m3, [t2q+idxq*2+ 0]
+ mova m4, [t2q+idxq*2+16]
+ mova m6, [dstq+idxq]
+%if ARCH_X86_32
+ pxor m11, m11
+%endif
+ punpcklbw m5, m6, m11
+ punpckhbw m6, m11
+ psllw m7, m5, 4
+ psubw m1, m7
+ psubw m3, m7
+ psllw m7, m6, 4
+ psubw m2, m7
+ psubw m4, m7
+ punpcklwd m7, m1, m3
+ punpckhwd m1, m3
+ punpcklwd m3, m2, m4
+ punpckhwd m2, m4
+ pmaddwd m7, m0
+ pmaddwd m1, m0
+ pmaddwd m3, m0
+ pmaddwd m2, m0
+ paddd m7, m10
+ paddd m1, m10
+ paddd m3, m10
+ paddd m2, m10
+ psrad m7, 11
+ psrad m1, 11
+ psrad m3, 11
+ psrad m2, 11
+ packssdw m7, m1
+ packssdw m3, m2
+ paddw m7, m5
+ paddw m3, m6
+ packuswb m7, m3
+ mova [dstq+idxq], m7
+ add idxd, 16
+ cmp idxd, wd
+ jl .loop_x
+ add dstq, strideq
+ add t1q, 384 * 2
+ add t2q, 384 * 2
+ dec hd
+ jg .loop_y
+ RET
--- a/src/x86/looprestoration_ssse3.asm
+++ /dev/null
@@ -1,1950 +1,0 @@
-; Copyright © 2018, VideoLAN and dav1d authors
-; Copyright © 2018, Two Orioles, LLC
-; Copyright © 2018, VideoLabs
-; All rights reserved.
-;
-; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions are met:
-;
-; 1. Redistributions of source code must retain the above copyright notice, this
-; list of conditions and the following disclaimer.
-;
-; 2. Redistributions in binary form must reproduce the above copyright notice,
-; this list of conditions and the following disclaimer in the documentation
-; and/or other materials provided with the distribution.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-%include "config.asm"
-%include "ext/x86/x86inc.asm"
-
-SECTION_RODATA 16
-
-pb_right_ext_mask: times 16 db 0xff
- times 16 db 0
-pb_14x0_1_2: times 14 db 0
- db 1, 2
-pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
- db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
-pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
-pb_0: times 16 db 0
-pb_2: times 16 db 2
-pb_3: times 16 db 3
-pb_4: times 16 db 4
-pb_15: times 16 db 15
-pb_0_1: times 8 db 0, 1
-pb_6_7: times 8 db 6, 7
-pb_14_15: times 8 db 14, 15
-pw_1: times 8 dw 1
-pw_16: times 8 dw 16
-pw_128: times 8 dw 128
-pw_255: times 8 dw 255
-pw_256: times 8 dw 256
-pw_2048: times 8 dw 2048
-pw_16380: times 8 dw 16380
-pw_5_6: times 4 dw 5, 6
-pd_1024: times 4 dd 1024
-%if ARCH_X86_32
-pd_256: times 4 dd 256
-pd_512: times 4 dd 512
-pd_2048: times 4 dd 2048
-%endif
-pd_0xF0080029: times 4 dd 0xF0080029
-pd_0xF00801C7: times 4 dd 0XF00801C7
-
-cextern sgr_x_by_x
-
-SECTION .text
-
-%if ARCH_X86_32
- %define PIC_base_offset $$
-
- %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
- %assign pic_reg_stk_off 4
- %xdefine PIC_reg %1
- %if %2 == 1
- mov [esp], %1
- %endif
- LEA PIC_reg, PIC_base_offset
- %if %3 == 1
- XCHG_PIC_REG
- %endif
- %endmacro
-
- %macro XCHG_PIC_REG 0
- mov [esp+pic_reg_stk_off], PIC_reg
- %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
- mov PIC_reg, [esp+pic_reg_stk_off]
- %endmacro
-
- %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
-
-%else
- %macro XCHG_PIC_REG 0
- %endmacro
-
- %define PIC_sym(sym) (sym)
-%endif
-
-%macro PALIGNR 4 ; dst, src1, src2, shift
- %if cpuflag(ssse3)
- palignr %1, %2, %3, %4
- %else
- %assign %%i regnumof%+%1 + 1
- %define %%tmp m %+ %%i
- psrldq %1, %3, %4
- pslldq %%tmp, %2, 16-%4
- por %1, %%tmp
- %endif
-%endmacro
-
-%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero
- %if cpuflag(ssse3)
- pmaddubsw %1, %2
- %else
- %if %5 == 1
- pxor %3, %3
- %endif
- punpckhbw %4, %1, %3
- punpcklbw %1, %3
- pmaddwd %4, %2
- pmaddwd %1, %2
- packssdw %1, %4
- %endif
-%endmacro
-
-;;;;;;;;;;;;;;;;;;;;;;
-;; wiener ;;
-;;;;;;;;;;;;;;;;;;;;;;
-
-%macro WIENER_H 0
-%if ARCH_X86_64
-cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, flt, w, h, edge
- mov edged, edgem
- movifnidn wd, wm
- mov hd, hm
-%else
-cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, flt, w, h, edge
- mov r5, edgem
- mov [esp+12], r5
- mov wd, wm
- mov hd, hm
- SETUP_PIC hd
- %define m15 m0
- %define m14 m1
- %define m13 m2
- %define m12 m3
-%endif
-
- movq m15, [fltq]
-%if cpuflag(ssse3)
- pshufb m12, m15, [PIC_sym(pb_6_7)]
- pshufb m13, m15, [PIC_sym(pb_4)]
- pshufb m14, m15, [PIC_sym(pb_2)]
- pshufb m15, m15, [PIC_sym(pb_0)]
-%else
- pshuflw m12, m15, q3333
- punpcklbw m15, m15
- pshufhw m13, m15, q0000
- pshuflw m14, m15, q2222
- pshuflw m15, m15, q0000
- punpcklqdq m12, m12
- punpckhqdq m13, m13
- punpcklqdq m14, m14
- punpcklqdq m15, m15
- psraw m13, 8
- psraw m14, 8
- psraw m15, 8
-%endif
-
-%if ARCH_X86_64
- mova m11, [pw_2048]
- mova m10, [pw_16380]
- lea r11, [pb_right_ext_mask]
-
- DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
-%else
- %define m10 [PIC_sym(pw_16380)]
- %define m11 [PIC_sym(pw_2048)]
- %define m12 [esp+0x14]
- %define m13 [esp+0x24]
- %define m14 [esp+0x34]
- %define m15 [esp+0x44]
- mova m12, m3
- mova m13, m2
- mova m14, m1
- mova m15, m0
-
- DEFINE_ARGS dst, left, src, stride, x, w, h, edge
- %define srcptrq srcq
- %define dstptrq dstq
- %define hd dword [esp+ 0]
- %define edgeb byte [esp+12]
- %define xlimd dword [esp+16]
-%endif
-
- ; if (edge & has_right) align_w_to_16
- ; else w -= 3, and use that as limit in x loop
- test edgeb, 2 ; has_right
- jnz .align
- mov xlimd, -3
- jmp .loop
-.align:
- add wd, 15
- and wd, ~15
-%if ARCH_X86_64
- xor xlimd, xlimd
-%else
- mov xlimd, 0
-%endif
-
- ; main y loop for vertical filter
-.loop:
-%if ARCH_X86_64
- mov srcptrq, srcq
- mov dstptrq, dstq
- lea xd, [wq+xlimq]
-%else
- mov [esp+8], srcq
- mov [esp+4], dstq
- mov xd, xlimd
- add xd, wd
-%endif
-
- ; load left edge pixels
- test edgeb, 1 ; have_left
- jz .emu_left
- test leftq, leftq ; left == NULL for the edge-extended bottom/top
- jz .load_left_combined
- movd m0, [leftq]
- movd m1, [srcq]
- punpckldq m0, m1
- pslldq m0, 9
- add leftq, 4
- jmp .left_load_done
-.load_left_combined:
- movq m0, [srcq-3]
- pslldq m0, 10
- jmp .left_load_done
-.emu_left:
- movd m0, [srcq]
-%if cpuflag(ssse3)
- pshufb m0, [PIC_sym(pb_14x0_1_2)]
-%else
- pslldq m1, m0, 13
- punpcklbw m0, m0
- pshuflw m0, m0, q0000
- punpcklqdq m0, m0
- psrldq m0, 2
- por m0, m1
-%endif
-
- ; load right edge pixels
-.left_load_done:
- cmp xd, 16
- jg .main_load
- test xd, xd
- jg .load_and_splat
- je .splat_right
-
- ; for very small images (w=[1-2]), edge-extend the original cache,
- ; ugly, but only runs in very odd cases
-%if cpuflag(ssse3)
- add wd, wd
- %if ARCH_X86_64
- pshufb m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
- %else
- pshufb m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16]
- %endif
- shr wd, 1
-%else
- shl wd, 4
- pcmpeqd m2, m2
- movd m3, wd
- psrldq m2, 2
- punpckhbw m1, m0, m0
- pshufhw m1, m1, q1122
- psllq m1, m3
- pand m0, m2
- pandn m2, m1
- por m0, m2
- shr wd, 4
-%endif
-
- ; main x loop, mostly this starts in .main_load
-.splat_right:
- ; no need to load new pixels, just extend them from the (possibly previously
- ; extended) previous load into m0
-%if cpuflag(ssse3)
- pshufb m1, m0, [PIC_sym(pb_15)]
-%else
- punpckhbw m1, m0, m0
- pshufhw m1, m1, q3333
- punpckhqdq m1, m1
-%endif
- jmp .main_loop
-.load_and_splat:
- ; load new pixels and extend edge for right-most
- movu m1, [srcptrq+3]
-%if ARCH_X86_64
- sub r11, xq
- movu m2, [r11+16]
- add r11, xq
-%else
- sub PIC_reg, xd
- movu m2, [PIC_sym(pb_right_ext_mask)+16]
- add PIC_reg, xd
-%endif
- movd m3, [srcptrq+2+xq]
-%if cpuflag(ssse3)
- pshufb m3, [PIC_sym(pb_0)]
-%else
- punpcklbw m3, m3
- pshuflw m3, m3, q0000
- punpcklqdq m3, m3
-%endif
- pand m1, m2
- pxor m2, [PIC_sym(pb_right_ext_mask)]
- pand m3, m2
- pxor m2, [PIC_sym(pb_right_ext_mask)]
- por m1, m3
- jmp .main_loop
-.main_load:
- ; load subsequent line
- movu m1, [srcptrq+3]
-.main_loop:
-%if ARCH_X86_64
- PALIGNR m2, m1, m0, 10
- PALIGNR m3, m1, m0, 11
- PALIGNR m4, m1, m0, 12
- PALIGNR m5, m1, m0, 13
- PALIGNR m6, m1, m0, 14
- PALIGNR m7, m1, m0, 15
-
- punpcklbw m0, m2, m1
- punpckhbw m2, m1
- punpcklbw m8, m3, m7
- punpckhbw m3, m7
- punpcklbw m7, m4, m6
- punpckhbw m4, m6
- PMADDUBSW m0, m15, m6, m9, 1
- PMADDUBSW m2, m15, m6, m9, 0
- PMADDUBSW m8, m14, m6, m9, 0
- PMADDUBSW m3, m14, m6, m9, 0
- PMADDUBSW m7, m13, m6, m9, 0
- PMADDUBSW m4, m13, m6, m9, 0
- paddw m0, m8
- paddw m2, m3
- %if cpuflag(ssse3)
- pxor m6, m6
- %endif
- punpcklbw m3, m5, m6
- punpckhbw m5, m6
- psllw m8, m3, 7
- psllw m6, m5, 7
- psubw m8, m10
- psubw m6, m10
- pmullw m3, m12
- pmullw m5, m12
- paddw m0, m7
- paddw m2, m4
- paddw m0, m3
- paddw m2, m5
- paddsw m0, m8 ; see the avx2 for an explanation
- paddsw m2, m6 ; of how the clipping works here
- psraw m0, 3
- psraw m2, 3
- paddw m0, m11
- paddw m2, m11
- mova [dstptrq+ 0], m0
- mova [dstptrq+16], m2
-%else
- PALIGNR m2, m1, m0, 10
- punpcklbw m3, m2, m1
- punpckhbw m2, m1
- PMADDUBSW m3, m15, m4, m5, 1
- PMADDUBSW m2, m15, m4, m5, 0
- PALIGNR m4, m1, m0, 11
- PALIGNR m5, m1, m0, 15
- punpcklbw m6, m4, m5
- punpckhbw m4, m5
- PMADDUBSW m6, m14, m5, m7, 1
- PMADDUBSW m4, m14, m5, m7, 0
- paddw m3, m6
- paddw m2, m4
- PALIGNR m4, m1, m0, 12
- PALIGNR m5, m1, m0, 14
- punpcklbw m6, m4, m5
- punpckhbw m4, m5
- PMADDUBSW m6, m13, m5, m7, 1
- PMADDUBSW m4, m13, m5, m7, 0
- paddw m3, m6
- paddw m2, m4
- PALIGNR m6, m1, m0, 13
- %if cpuflag(ssse3)
- pxor m5, m5
- %endif
- punpcklbw m4, m6, m5
- punpckhbw m6, m5
- psllw m5, m4, 7
- psllw m7, m6, 7
- psubw m5, m10
- psubw m7, m10
- pmullw m4, m12
- pmullw m6, m12
- paddw m3, m4
- paddw m2, m6
- paddsw m3, m5
- paddsw m2, m7
- psraw m3, 3
- psraw m2, 3
- paddw m3, m11
- paddw m2, m11
- mova [dstptrq+ 0], m3
- mova [dstptrq+16], m2
-%endif
-
- mova m0, m1
- add srcptrq, 16
- add dstptrq, 32
- sub xd, 16
- cmp xd, 16
- jg .main_load
- test xd, xd
- jg .load_and_splat
- cmp xd, xlimd
- jg .splat_right
-
-%if ARCH_X86_32
- mov srcq, [esp+8]
- mov dstq, [esp+4]
-%endif
- add srcq, strideq
- add dstq, 384*2
- dec hd
- jg .loop
- RET
-%endmacro
-
-%macro WIENER_V 0
-%if ARCH_X86_64
-cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, flt, edge
- mov edged, edgem
- movifnidn fltq, fltmp
- movifnidn hd, hm
- movq m15, [fltq+16]
- pshufd m14, m15, q1111
- pshufd m15, m15, q0000
- mova m12, [pd_1024]
-
- DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr
-
- mov ylimd, edged
- and ylimd, 8 ; have_bottom
- shr ylimd, 2
- sub ylimd, 3
-%else
-cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, flt, edge
- %define ylimd [esp+12]
-
- mov r5d, edgem
- and r5d, 8
- shr r5d, 2
- sub r5d, 3
- mov ylimd, r5d
- mov fltq, fltmp
- mov edged, edgem
-
- SETUP_PIC edged
-
- movq m0, [fltq+16]
- pshufd m1, m0, q1111
- pshufd m0, m0, q0000
- mova [esp+0x50], m0
- mova [esp+0x40], m1
-
- DEFINE_ARGS dst, stride, mid, w, h, y, edge
- %define mptrq midq
- %define dstptrq dstq
- %define edgeb byte [esp]
-%endif
-
- ; main x loop for vertical filter, does one column of 16 pixels
-.loop_x:
- mova m3, [midq] ; middle line
-
- ; load top pixels
- test edgeb, 4 ; have_top
- jz .emu_top
- mova m0, [midq-384*4]
- mova m2, [midq-384*2]
- mova m1, m0
- jmp .load_bottom_pixels
-.emu_top:
- mova m0, m3
- mova m1, m3
- mova m2, m3
-
- ; load bottom pixels
-.load_bottom_pixels:
- mov yd, hd
-%if ARCH_X86_64
- mov mptrq, midq
- mov dstptrq, dstq
- add yd, ylimd
-%else
- mov [esp+8], midq
- mov [esp+4], dstq
- add yd, ylimd
-%endif
- jg .load_threelines
-
- ; the remainder here is somewhat messy but only runs in very weird
- ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
- ; so performance is not terribly important here...
- je .load_twolines
- cmp yd, -1
- je .load_oneline
- ; h == 1 case
- mova m5, m3
- mova m4, m3
- mova m6, m3
- jmp .loop
-.load_oneline:
- ; h == 2 case
- mova m4, [midq+384*2]
- mova m5, m4
- mova m6, m4
- jmp .loop
-.load_twolines:
- ; h == 3 case
- mova m4, [midq+384*2]
- mova m5, [midq+384*4]
- mova m6, m5
- jmp .loop
-.load_threelines:
- ; h > 3 case
- mova m4, [midq+384*2]
- mova m5, [midq+384*4]
- ; third line loaded in main loop below
-
- ; main y loop for vertical filter
-.loop_load:
- ; load one line into m6. if that pixel is no longer available, do
- ; nothing, since m6 still has the data from the previous line in it. We
- ; try to structure the loop so that the common case is evaluated fastest
- mova m6, [mptrq+384*6]
-.loop:
-%if ARCH_X86_64
- paddw m7, m0, m6
- paddw m8, m1, m5
- paddw m9, m2, m4
- punpcklwd m10, m7, m8
- punpckhwd m7, m8
- punpcklwd m11, m9, m3
- punpckhwd m9, m3
- pmaddwd m10, m15
- pmaddwd m7, m15
- pmaddwd m11, m14
- pmaddwd m9, m14
- paddd m10, m12
- paddd m7, m12
- paddd m10, m11
- paddd m7, m9
- psrad m10, 11
- psrad m7, 11
- packssdw m10, m7
- packuswb m10, m10
- movq [dstptrq], m10
-%else
- mova [esp+0x30], m1
- mova [esp+0x20], m2
- mova [esp+0x10], m3
- paddw m0, m6
- paddw m1, m5
- paddw m2, m4
- punpcklwd m7, m2, m3
- punpckhwd m2, m3
- punpcklwd m3, m0, m1
- punpckhwd m0, m1
- mova m1, [esp+0x50]
- pmaddwd m3, m1
- pmaddwd m0, m1
- mova m1, [esp+0x40]
- pmaddwd m7, m1
- pmaddwd m2, m1
- paddd m3, [PIC_sym(pd_1024)]
- paddd m0, [PIC_sym(pd_1024)]
- paddd m3, m7
- paddd m0, m2
- psrad m3, 11
- psrad m0, 11
- packssdw m3, m0
- packuswb m3, m3
- movq [dstq], m3
- mova m1, [esp+0x30]
- mova m2, [esp+0x20]
- mova m3, [esp+0x10]
-%endif
- ; shift pixels one position
- mova m0, m1
- mova m1, m2
- mova m2, m3
- mova m3, m4
- mova m4, m5
- mova m5, m6
- add mptrq, 384*2
- add dstptrq, strideq
- dec yd
- jg .loop_load
- ; for the bottom pixels, continue using m6 (as extended edge)
- cmp yd, ylimd
- jg .loop
-
-%if ARCH_X86_32
- mov midq, [esp+8]
- mov dstq, [esp+4]
-%endif
- add midq, 16
- add dstq, 8
- sub wd, 8
- jg .loop_x
- RET
-%endmacro
-
-INIT_XMM sse2
-WIENER_H
-WIENER_V
-
-INIT_XMM ssse3
-WIENER_H
-WIENER_V
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; self-guided ;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-%macro MULLD 2
- pmulhuw m5, %1, %2
- pmullw %1, %2
- pslld m5, 16
- paddd %1, m5
-%endmacro
-
-%macro GATHERDD 2
- mova m5, m7
- movd r6d, %2
- %if ARCH_X86_64
- movd %1, [r5+r6]
- pextrw r6d, %2, 2
- pinsrw m5, [r5+r6+2], 3
- pextrw r6d, %2, 4
- pinsrw %1, [r5+r6+2], 5
- pextrw r6d, %2, 6
- pinsrw m5, [r5+r6+2], 7
- %else
- movd %1, [PIC_sym(sgr_x_by_x-0xF03)+r6]
- pextrw r6d, %2, 2
- pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3
- pextrw r6d, %2, 4
- pinsrw %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5
- pextrw r6d, %2, 6
- pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7
- %endif
- por %1, m5
-%endmacro
-
-%if ARCH_X86_64
-cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- mov xlimd, edgem
- movifnidn xd, xm
- mov hd, hm
- mov edged, xlimd
- and xlimd, 2 ; have_right
- add xd, xlimd
- xor xlimd, 2 ; 2*!have_right
-%else
-cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- %define wq r0m
- %define xlimd r1m
- %define hd hmp
- %define edgeb byte edgem
-
- mov r6, edgem
- and r6, 2 ; have_right
- add xd, r6
- xor r6, 2 ; 2*!have_right
- mov xlimd, r6
- SETUP_PIC r6, 0
-%endif
-
- jnz .no_right
- add xd, 7
- and xd, ~7
-.no_right:
- pxor m1, m1
- lea srcq, [srcq+xq]
- lea sumq, [sumq+xq*2-2]
- lea sumsqq, [sumsqq+xq*4-4]
- neg xq
- mov wq, xq
-%if ARCH_X86_64
- lea r10, [pb_right_ext_mask+16]
-%endif
-.loop_y:
- mov xq, wq
-
- ; load left
- test edgeb, 1 ; have_left
- jz .no_left
- test leftq, leftq
- jz .load_left_from_main
- movd m0, [leftq]
- pslldq m0, 12
- add leftq, 4
- jmp .expand_x
-.no_left:
- movd m0, [srcq+xq]
- pshufb m0, [PIC_sym(pb_0)]
- jmp .expand_x
-.load_left_from_main:
- movd m0, [srcq+xq-2]
- pslldq m0, 14
-.expand_x:
- punpckhbw xm0, xm1
-
- ; at this point, m0 contains the left two px in its highest words
- cmp xd, -8
- jle .loop_x
-.partial_load_and_extend:
- movd m3, [srcq-4]
- pshufb m3, [PIC_sym(pb_3)]
- movq m2, [srcq+xq]
- punpcklbw m2, m1
- punpcklbw m3, m1
-%if ARCH_X86_64
- movu m4, [r10+xq*2]
-%else
- movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
-%endif
- pand m2, m4
- pandn m4, m3
- por m2, m4
- jmp .loop_x_noload
-.right_extend:
- pshufb m2, m0, [PIC_sym(pb_14_15)]
- jmp .loop_x_noload
-
-.loop_x:
- movq m2, [srcq+xq]
- punpcklbw m2, m1
-.loop_x_noload:
- palignr m3, m2, m0, 12
- palignr m4, m2, m0, 14
-
- punpcklwd m5, m3, m2
- punpckhwd m6, m3, m2
- paddw m3, m4
- punpcklwd m7, m4, m1
- punpckhwd m4, m1
- pmaddwd m5, m5
- pmaddwd m6, m6
- pmaddwd m7, m7
- pmaddwd m4, m4
- paddd m5, m7
- paddd m6, m4
- paddw m3, m2
- movu [sumq+xq*2], m3
- movu [sumsqq+xq*4+ 0], m5
- movu [sumsqq+xq*4+16], m6
-
- mova m0, m2
- add xq, 8
-
- ; if x <= -8 we can reload more pixels
- ; else if x < 0 we reload and extend (this implies have_right=0)
- ; else if x < xlimd we extend from previous load (this implies have_right=0)
- ; else we are done
-
- cmp xd, -8
- jle .loop_x
- test xd, xd
- jl .partial_load_and_extend
- cmp xd, xlimd
- jl .right_extend
-
- add sumsqq, (384+16)*4
- add sumq, (384+16)*2
- add srcq, strideq
- dec hd
- jg .loop_y
- RET
-
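-; sgr_box3_v: vertical pass of the 3x3 box sums; accumulates 3 vertically
-; adjacent rows of sum/sumsq back into the same buffers, duplicating the
-; edge row when have_top/have_bottom is not set.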
-%if ARCH_X86_64
-cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
- movifnidn edged, edgem
-%else
-cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
- %define sumsq_baseq dword [esp+0]
- %define sum_baseq dword [esp+4]
- %define ylimd dword [esp+8]
- %define m8 [esp+12]
- mov edged, r4m
- mov hd, r3m
-%endif
- mov xq, -2
-%if ARCH_X86_64
- mov ylimd, edged
- and ylimd, 8 ; have_bottom
- shr ylimd, 2
- sub ylimd, 2 ; -2 if have_bottom=0, else 0
- mov sumsq_baseq, sumsqq
- mov sum_baseq, sumq
-.loop_x:
- mov sumsqq, sumsq_baseq
- mov sumq, sum_baseq
- lea yd, [hq+ylimq+2]
-%else
- mov yd, edged
- and yd, 8 ; have_bottom
- shr yd, 2
- sub yd, 2 ; -2 if have_bottom=0, else 0
- mov sumsq_baseq, sumsqq
- mov sum_baseq, sumq
- mov ylimd, yd
-.loop_x:
- mov sumsqd, sumsq_baseq
- mov sumd, sum_baseq
- lea yd, [hq+2]
- add yd, ylimd
-%endif
- lea sumsqq, [sumsqq+xq*4+4-(384+16)*4]
- lea sumq, [sumq+xq*2+2-(384+16)*2]
- test edgeb, 4 ; have_top
- jnz .load_top
- movu m0, [sumsqq+(384+16)*4*1]
- movu m1, [sumsqq+(384+16)*4*1+16]
- mova m2, m0
- mova m3, m1
- mova m4, m0
- mova m5, m1
- movu m6, [sumq+(384+16)*2*1]
- mova m7, m6
- mova m8, m6
- jmp .loop_y_noload
-.load_top:
- movu m0, [sumsqq-(384+16)*4*1] ; l2sq [left]
- movu m1, [sumsqq-(384+16)*4*1+16] ; l2sq [right]
- movu m2, [sumsqq-(384+16)*4*0] ; l1sq [left]
- movu m3, [sumsqq-(384+16)*4*0+16] ; l1sq [right]
- movu m6, [sumq-(384+16)*2*1] ; l2
- movu m7, [sumq-(384+16)*2*0] ; l1
-.loop_y:
-%if ARCH_X86_64
- movu m8, [sumq+(384+16)*2*1] ; l0
-%else
- movu m4, [sumq+(384+16)*2*1] ; l0
- mova m8, m4
-%endif
- movu m4, [sumsqq+(384+16)*4*1] ; l0sq [left]
- movu m5, [sumsqq+(384+16)*4*1+16] ; l0sq [right]
-.loop_y_noload:
- paddd m0, m2
- paddd m1, m3
- paddw m6, m7
- paddd m0, m4
- paddd m1, m5
- paddw m6, m8
- movu [sumsqq+ 0], m0
- movu [sumsqq+16], m1
- movu [sumq], m6
-
- ; shift position down by one
- mova m0, m2
- mova m1, m3
- mova m2, m4
- mova m3, m5
- mova m6, m7
- mova m7, m8
- add sumsqq, (384+16)*4
- add sumq, (384+16)*2
- dec yd
- jg .loop_y
- cmp yd, ylimd
- jg .loop_y_noload
- add xd, 8
- cmp xd, wd
- jl .loop_x
- RET
-
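-; sgr_calc_ab1: turn the 3x3 box sums into the radius-1 filter coefficients:
-; p = 9*sumsq - sum^2, z = (p*s) >> 20 with rounding and clamping folded into
-; the 0xF00801C7 bias, x = sgr_x_by_x[z]; b is then overwritten with 256-x
-; and a with (x * sum * 455 + 2048) >> 12. Two rows are handled per iteration.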
-cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
- movifnidn sd, sm
- sub aq, (384+16-1)*4
- sub bq, (384+16-1)*2
- add hd, 2
-%if ARCH_X86_64
- LEA r5, sgr_x_by_x-0xF03
-%else
- SETUP_PIC r5, 0
-%endif
- movd m6, sd
- pshuflw m6, m6, q0000
- punpcklqdq m6, m6
- pxor m7, m7
- DEFINE_ARGS a, b, w, h, x
-%if ARCH_X86_64
- mova m8, [pd_0xF00801C7]
- mova m9, [pw_256]
- psrld m10, m9, 13 ; pd_2048
- mova m11, [pb_unpcklwdw]
-%else
- %define m8 [PIC_sym(pd_0xF00801C7)]
- %define m9 [PIC_sym(pw_256)]
- %define m10 [PIC_sym(pd_2048)]
- %define m11 [PIC_sym(pb_unpcklwdw)]
-%endif
-.loop_y:
- mov xq, -2
-.loop_x:
- movq m0, [bq+xq*2]
- movq m1, [bq+xq*2+(384+16)*2]
- punpcklwd m0, m7
- punpcklwd m1, m7
- movu m2, [aq+xq*4]
- movu m3, [aq+xq*4+(384+16)*4]
- pslld m4, m2, 3
- pslld m5, m3, 3
- paddd m2, m4 ; aa * 9
- paddd m3, m5
- pmaddwd m4, m0, m0
- pmaddwd m5, m1, m1
- pmaddwd m0, m8
- pmaddwd m1, m8
- psubd m2, m4 ; p = aa * 9 - bb * bb
- psubd m3, m5
- MULLD m2, m6
- MULLD m3, m6
- paddusw m2, m8
- paddusw m3, m8
- psrld m2, 20 ; z
- psrld m3, 20
- GATHERDD m4, m2 ; xx
- GATHERDD m2, m3
- psrld m4, 24
- psrld m2, 24
- packssdw m3, m4, m2
- pshufb m4, m11
- MULLD m0, m4
- pshufb m2, m11
- MULLD m1, m2
- psubw m5, m9, m3
- paddd m0, m10
- paddd m1, m10
- psrld m0, 12
- psrld m1, 12
- movq [bq+xq*2], m5
- psrldq m5, 8
- movq [bq+xq*2+(384+16)*2], m5
- movu [aq+xq*4], m0
- movu [aq+xq*4+(384+16)*4], m1
- add xd, 4
- cmp xd, wd
- jl .loop_x
- add aq, (384+16)*4*2
- add bq, (384+16)*2*2
- sub hd, 2
- jg .loop_y
- RET
-
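-; sgr_finish_filter1: apply the radius-1 neighborhood weights (4 for the
-; center cross, 3 for the diagonals) to the coefficient planes and combine
-; them with the source: t[x] = (A*src[x] + B + 256) >> 9, where A is taken
-; from the b plane and B from the a plane.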
-%if ARCH_X86_64
-cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
- tmp_base, src_base, a_base, b_base, x, y
- movifnidn wd, wm
- mov hd, hm
- mova m15, [pw_16]
- mov tmp_baseq, tq
- mov src_baseq, srcq
- mov a_baseq, aq
- mov b_baseq, bq
- xor xd, xd
-%else
-cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
- %define tmp_baseq [esp+8]
- %define src_baseq [esp+12]
- %define a_baseq [esp+16]
- %define b_baseq [esp+20]
- %define wd [esp+24]
- %define hd [esp+28]
- mov tmp_baseq, tq
- mov src_baseq, srcq
- mov a_baseq, aq
- mov b_baseq, bq
- mov wd, xd
- mov hd, yd
- xor xd, xd
- SETUP_PIC yd, 1, 1
- jmp .loop_start
-%endif
-
-.loop_x:
- mov tq, tmp_baseq
- mov srcq, src_baseq
- mov aq, a_baseq
- mov bq, b_baseq
-%if ARCH_X86_32
-.loop_start:
- movu m0, [bq+xq*2-(384+16)*2-2]
- movu m2, [bq+xq*2-(384+16)*2+2]
- mova m1, [bq+xq*2-(384+16)*2] ; b:top
- paddw m0, m2 ; b:tl+tr
- movu m2, [bq+xq*2-2]
- movu m3, [bq+xq*2+2]
- paddw m1, [bq+xq*2] ; b:top+ctr
- paddw m2, m3 ; b:l+r
- mova [esp+0x80], m0
- mova [esp+0x70], m1
- mova [esp+0x60], m2
-%endif
- movu m0, [aq+xq*4-(384+16)*4-4]
- movu m2, [aq+xq*4-(384+16)*4+4]
- mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half]
- paddd m0, m2 ; a:tl+tr [first half]
- movu m2, [aq+xq*4-(384+16)*4-4+16]
- movu m4, [aq+xq*4-(384+16)*4+4+16]
- mova m3, [aq+xq*4-(384+16)*4+16] ; a:top [second half]
- paddd m2, m4 ; a:tl+tr [second half]
- movu m4, [aq+xq*4-4]
- movu m5, [aq+xq*4+4]
- paddd m1, [aq+xq*4] ; a:top+ctr [first half]
- paddd m4, m5 ; a:l+r [first half]
- movu m5, [aq+xq*4+16-4]
- movu m6, [aq+xq*4+16+4]
- paddd m3, [aq+xq*4+16] ; a:top+ctr [second half]
- paddd m5, m6 ; a:l+r [second half]
-%if ARCH_X86_64
- movu m6, [bq+xq*2-(384+16)*2-2]
- movu m8, [bq+xq*2-(384+16)*2+2]
- mova m7, [bq+xq*2-(384+16)*2] ; b:top
- paddw m6, m8 ; b:tl+tr
- movu m8, [bq+xq*2-2]
- movu m9, [bq+xq*2+2]
- paddw m7, [bq+xq*2] ; b:top+ctr
- paddw m8, m9 ; b:l+r
-%endif
-
- lea tq, [tq+xq*2]
- lea srcq, [srcq+xq*1]
- lea aq, [aq+xq*4+(384+16)*4]
- lea bq, [bq+xq*2+(384+16)*2]
- mov yd, hd
-.loop_y:
-%if ARCH_X86_64
- movu m9, [bq-2]
- movu m10, [bq+2]
- paddw m7, [bq] ; b:top+ctr+bottom
- paddw m9, m10 ; b:bl+br
- paddw m10, m7, m8 ; b:top+ctr+bottom+l+r
- paddw m6, m9 ; b:tl+tr+bl+br
- psubw m7, [bq-(384+16)*2*2] ; b:ctr+bottom
- paddw m10, m6
- psllw m10, 2
- psubw m10, m6 ; aa
- pxor m14, m14
- movq m12, [srcq]
- punpcklbw m12, m14
- punpcklwd m6, m10, m15
- punpckhwd m10, m15
- punpcklwd m13, m12, m15
- punpckhwd m12, m15
- pmaddwd m6, m13 ; aa*src[x]+256 [first half]
- pmaddwd m10, m12 ; aa*src[x]+256 [second half]
-%else
- paddd m1, [aq] ; a:top+ctr+bottom [first half]
- paddd m3, [aq+16] ; a:top+ctr+bottom [second half]
- mova [esp+0x50], m1
- mova [esp+0x40], m3
- mova [esp+0x30], m4
- movu m6, [aq-4]
- movu m7, [aq+4]
- paddd m1, m4 ; a:top+ctr+bottom+l+r [first half]
- paddd m3, m5 ; a:top+ctr+bottom+l+r [second half]
- paddd m6, m7 ; a:bl+br [first half]
- movu m7, [aq+16-4]
- movu m4, [aq+16+4]
- paddd m7, m4 ; a:bl+br [second half]
- paddd m0, m6 ; a:tl+tr+bl+br [first half]
- paddd m2, m7 ; a:tl+tr+bl+br [second half]
- paddd m1, m0
- paddd m3, m2
- pslld m1, 2
- pslld m3, 2
- psubd m1, m0 ; bb [first half]
- psubd m3, m2 ; bb [second half]
-%endif
-
-%if ARCH_X86_64
- movu m11, [aq-4]
- movu m12, [aq+4]
- paddd m1, [aq] ; a:top+ctr+bottom [first half]
- paddd m11, m12 ; a:bl+br [first half]
- movu m12, [aq+16-4]
- movu m13, [aq+16+4]
- paddd m3, [aq+16] ; a:top+ctr+bottom [second half]
- paddd m12, m13 ; a:bl+br [second half]
- paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half]
- paddd m14, m3, m5 ; a:top+ctr+bottom+l+r [second half]
- paddd m0, m11 ; a:tl+tr+bl+br [first half]
- paddd m2, m12 ; a:tl+tr+bl+br [second half]
- paddd m13, m0
- paddd m14, m2
- pslld m13, 2
- pslld m14, 2
- psubd m13, m0 ; bb [first half]
- psubd m14, m2 ; bb [second half]
- psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half]
- psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half]
-%else
- mova m4, [esp+0x80]
- mova [esp+0x80], m5
- mova m5, [esp+0x70]
- mova [esp+0x70], m6
- mova m6, [esp+0x60]
- mova [esp+0x60], m7
- mova [esp+0x20], m1
- movu m7, [bq-2]
- movu m1, [bq+2]
- paddw m5, [bq] ; b:top+ctr+bottom
- paddw m7, m1
- paddw m1, m5, m6 ; b:top+ctr+bottom+l+r
- paddw m4, m7 ; b:tl+tr+bl+br
- psubw m5, [bq-(384+16)*2*2] ; b:ctr+bottom
- paddw m1, m4
- psllw m1, 2
- psubw m1, m4 ; aa
- movq m0, [srcq]
- XCHG_PIC_REG
- punpcklbw m0, [PIC_sym(pb_right_ext_mask)+16]
- punpcklwd m4, m1, [PIC_sym(pw_16)]
- punpckhwd m1, [PIC_sym(pw_16)]
- punpcklwd m2, m0, [PIC_sym(pw_16)]
- punpckhwd m0, [PIC_sym(pw_16)]
- XCHG_PIC_REG
- pmaddwd m4, m2 ; aa*src[x]+256 [first half]
- pmaddwd m1, m0 ; aa*src[x]+256 [second half]
-%endif
-
-%if ARCH_X86_64
- paddd m6, m13
- paddd m10, m14
- psrad m6, 9
- psrad m10, 9
- packssdw m6, m10
- mova [tq], m6
-%else
- paddd m4, [esp+0x20]
- paddd m1, m3
- psrad m4, 9
- psrad m1, 9
- packssdw m4, m1
- mova [tq], m4
-%endif
-
- ; shift to next row
-%if ARCH_X86_64
- mova m0, m4
- mova m2, m5
- mova m4, m11
- mova m5, m12
- mova m6, m8
- mova m8, m9
-%else
- mova m1, [esp+0x50]
- mova m3, [esp+0x40]
- mova m0, [esp+0x30]
- mova m2, [esp+0x80]
- mova m4, [esp+0x70]
- mova [esp+0x70], m5
- mova m5, [esp+0x60]
- mova [esp+0x80], m6
- mova [esp+0x60], m7
- psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half]
- psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half]
-%endif
-
- add srcq, strideq
- add aq, (384+16)*4
- add bq, (384+16)*2
- add tq, 384*2
- dec yd
- jg .loop_y
- add xd, 8
- cmp xd, wd
- jl .loop_x
- RET
-
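-; sgr_weighted1: blend the filtered intermediate t back into the source:
-; dst[x] = clip_pixel(src[x] + (((t[x] - 16*src[x]) * wt + 1024) >> 11))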
-cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
- movifnidn hd, hm
-%if ARCH_X86_32
- SETUP_PIC r6, 0
-%endif
- movd m0, wtm
- pshufb m0, [PIC_sym(pb_0_1)]
- psllw m0, 4
- pxor m7, m7
- DEFINE_ARGS dst, stride, t, w, h, idx
-.loop_y:
- xor idxd, idxd
-.loop_x:
- mova m1, [tq+idxq*2+ 0]
- mova m4, [tq+idxq*2+16]
- mova m5, [dstq+idxq]
- punpcklbw m2, m5, m7
- punpckhbw m5, m7
- psllw m3, m2, 4
- psllw m6, m5, 4
- psubw m1, m3
- psubw m4, m6
- pmulhrsw m1, m0
- pmulhrsw m4, m0
- paddw m1, m2
- paddw m4, m5
- packuswb m1, m4
- mova [dstq+idxq], m1
- add idxd, 16
- cmp idxd, wd
- jl .loop_x
- add dstq, strideq
- add tq, 384 * 2
- dec hd
- jg .loop_y
- RET
-
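-; sgr_box5_h: horizontal pass of the 5x5 box sums; for each pixel, write the
-; sum of 5 horizontally adjacent pixels and the sum of their squares, with
-; edge extension handled as in the box3 version.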
-%if ARCH_X86_64
-cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
- mov edged, edgem
- movifnidn wd, wm
- mov hd, hm
- mova m10, [pb_0]
- mova m11, [pb_0_1]
-%else
-cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
- %define edgeb byte edgem
- %define wd xd
- %define wq wd
- %define wm r5m
- %define strideq r4m
- SUB esp, 8
- SETUP_PIC sumsqd, 1, 1
-
- %define m10 [PIC_sym(pb_0)]
- %define m11 [PIC_sym(pb_0_1)]
-%endif
-
- test edgeb, 2 ; have_right
- jz .no_right
- xor xlimd, xlimd
- add wd, 2
- add wd, 15
- and wd, ~15
- jmp .right_done
-.no_right:
- mov xlimd, 3
- dec wd
-.right_done:
- pxor m1, m1
- lea srcq, [srcq+wq+1]
- lea sumq, [sumq+wq*2-2]
- lea sumsqq, [sumsqq+wq*4-4]
- neg wq
-%if ARCH_X86_64
- lea r10, [pb_right_ext_mask+16]
-%else
- mov wm, xd
- %define wq wm
-%endif
-
-.loop_y:
- mov xq, wq
- ; load left
- test edgeb, 1 ; have_left
- jz .no_left
- test leftq, leftq
- jz .load_left_from_main
- movd m0, [leftq]
- movd m2, [srcq+xq-1]
- pslldq m2, 4
- por m0, m2
- pslldq m0, 11
- add leftq, 4
- jmp .expand_x
-.no_left:
- movd m0, [srcq+xq-1]
- XCHG_PIC_REG
- pshufb m0, m10
- XCHG_PIC_REG
- jmp .expand_x
-.load_left_from_main:
- movd m0, [srcq+xq-4]
- pslldq m0, 12
-.expand_x:
- punpckhbw m0, m1
-
- ; at this point, m0 contains the left two px in its highest words
- cmp xd, -8
- jle .loop_x
- test xd, xd
- jge .right_extend
-.partial_load_and_extend:
- XCHG_PIC_REG
- movd m3, [srcq-1]
- movq m2, [srcq+xq]
- pshufb m3, m10
- punpcklbw m3, m1
- punpcklbw m2, m1
-%if ARCH_X86_64
- movu m4, [r10+xq*2]
-%else
- movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
- XCHG_PIC_REG
-%endif
- pand m2, m4
- pandn m4, m3
- por m2, m4
- jmp .loop_x_noload
-.right_extend:
- psrldq m2, m0, 14
- XCHG_PIC_REG
- pshufb m2, m11
- XCHG_PIC_REG
- jmp .loop_x_noload
-
-.loop_x:
- movq m2, [srcq+xq]
- punpcklbw m2, m1
-.loop_x_noload:
- palignr m3, m2, m0, 8
- palignr m4, m2, m0, 10
- palignr m5, m2, m0, 12
- palignr m6, m2, m0, 14
-
-%if ARCH_X86_64
- paddw m0, m3, m2
- punpcklwd m7, m3, m2
- punpckhwd m3, m2
- paddw m0, m4
- punpcklwd m8, m4, m5
- punpckhwd m4, m5
- paddw m0, m5
- punpcklwd m9, m6, m1
- punpckhwd m5, m6, m1
- paddw m0, m6
- pmaddwd m7, m7
- pmaddwd m3, m3
- pmaddwd m8, m8
- pmaddwd m4, m4
- pmaddwd m9, m9
- pmaddwd m5, m5
- paddd m7, m8
- paddd m3, m4
- paddd m7, m9
- paddd m3, m5
- movu [sumq+xq*2], m0
- movu [sumsqq+xq*4+ 0], m7
- movu [sumsqq+xq*4+16], m3
-%else
- paddw m0, m3, m2
- paddw m0, m4
- paddw m0, m5
- paddw m0, m6
- movu [sumq+xq*2], m0
- punpcklwd m7, m3, m2
- punpckhwd m3, m2
- punpcklwd m0, m4, m5
- punpckhwd m4, m5
- punpckhwd m5, m6, m1
- pmaddwd m7, m7
- pmaddwd m3, m3
- pmaddwd m0, m0
- pmaddwd m4, m4
- pmaddwd m5, m5
- paddd m7, m0
- paddd m3, m4
- paddd m3, m5
- punpcklwd m0, m6, m1
- pmaddwd m0, m0
- paddd m7, m0
- movu [sumsqq+xq*4+ 0], m7
- movu [sumsqq+xq*4+16], m3
-%endif
-
- mova m0, m2
- add xq, 8
-
- ; if x <= -8 we can reload more pixels
- ; else if x < 0 we reload and extend (this implies have_right=0)
- ; else if x < xlimd we extend from previous load (this implies have_right=0)
- ; else we are done
-
- cmp xd, -8
- jle .loop_x
- test xd, xd
- jl .partial_load_and_extend
- cmp xd, xlimd
- jl .right_extend
-
- add srcq, strideq
- add sumsqq, (384+16)*4
- add sumq, (384+16)*2
- dec hd
- jg .loop_y
-%if ARCH_X86_32
- ADD esp, 8
-%endif
- RET
-
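-; sgr_box5_v: vertical pass of the 5x5 box sums; accumulates 5 vertically
-; adjacent rows of sum/sumsq, advancing two rows per iteration and
-; duplicating edge rows as needed.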
-%if ARCH_X86_64
-cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
- movifnidn edged, edgem
- mov ylimd, edged
-%else
-cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
- %define wm [esp+0]
- %define hm [esp+4]
- %define edgem [esp+8]
- mov wm, xd
- mov hm, yd
- mov edgem, ylimd
-%endif
-
- and ylimd, 8 ; have_bottom
- shr ylimd, 2
- sub ylimd, 3 ; -3 if have_bottom=0, else -1
- mov xq, -2
-%if ARCH_X86_64
-.loop_x:
- lea yd, [hd+ylimd+2]
- lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
- lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2]
- test edgeb, 4 ; have_top
- jnz .load_top
- movu m0, [sumsq_ptrq+(384+16)*4*1]
- movu m1, [sumsq_ptrq+(384+16)*4*1+16]
- mova m2, m0
- mova m3, m1
- mova m4, m0
- mova m5, m1
- mova m6, m0
- mova m7, m1
- movu m10, [sum_ptrq+(384+16)*2*1]
- mova m11, m10
- mova m12, m10
- mova m13, m10
- jmp .loop_y_second_load
-.load_top:
- movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left]
- movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right]
- movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left]
- movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right]
- mova m2, m0
- mova m3, m1
- movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4
- movu m12, [sum_ptrq-(384+16)*2*0] ; l2
- mova m11, m10
-.loop_y:
- movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left]
- movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right]
- movu m13, [sum_ptrq+(384+16)*2*1] ; l1
-.loop_y_second_load:
- test yd, yd
- jle .emulate_second_load
- movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left]
- movu m9, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right]
- movu m14, [sum_ptrq+(384+16)*2*2] ; l0
-.loop_y_noload:
- paddd m0, m2
- paddd m1, m3
- paddw m10, m11
- paddd m0, m4
- paddd m1, m5
- paddw m10, m12
- paddd m0, m6
- paddd m1, m7
- paddw m10, m13
- paddd m0, m8
- paddd m1, m9
- paddw m10, m14
- movu [sumsq_ptrq+ 0], m0
- movu [sumsq_ptrq+16], m1
- movu [sum_ptrq], m10
-
- ; shift position down by one
- mova m0, m4
- mova m1, m5
- mova m2, m6
- mova m3, m7
- mova m4, m8
- mova m5, m9
- mova m10, m12
- mova m11, m13
- mova m12, m14
- add sumsq_ptrq, (384+16)*4*2
- add sum_ptrq, (384+16)*2*2
- sub yd, 2
- jge .loop_y
- ; l1 = l0
- mova m6, m8
- mova m7, m9
- mova m13, m14
- cmp yd, ylimd
- jg .loop_y_noload
- add xd, 8
- cmp xd, wd
- jl .loop_x
- RET
-.emulate_second_load:
- mova m8, m6
- mova m9, m7
- mova m14, m13
- jmp .loop_y_noload
-%else
-.sumsq_loop_x:
- lea yd, [ylimd+2]
- add yd, hm
- lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
- test byte edgem, 4 ; have_top
- jnz .sumsq_load_top
- movu m0, [sumsq_ptrq+(384+16)*4*1]
- movu m1, [sumsq_ptrq+(384+16)*4*1+16]
- mova m4, m0
- mova m5, m1
- mova m6, m0
- mova m7, m1
- mova [esp+0x1c], m0
- mova [esp+0x0c], m1
- jmp .sumsq_loop_y_second_load
-.sumsq_load_top:
- movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left]
- movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right]
- movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left]
- movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right]
- mova [esp+0x1c], m0
- mova [esp+0x0c], m1
-.sumsq_loop_y:
- movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left]
- movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right]
-.sumsq_loop_y_second_load:
- test yd, yd
- jle .sumsq_emulate_second_load
- movu m2, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left]
- movu m3, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right]
-.sumsq_loop_y_noload:
- paddd m0, [esp+0x1c]
- paddd m1, [esp+0x0c]
- paddd m0, m4
- paddd m1, m5
- paddd m0, m6
- paddd m1, m7
- paddd m0, m2
- paddd m1, m3
- movu [sumsq_ptrq+ 0], m0
- movu [sumsq_ptrq+16], m1
-
- ; shift position down by one
- mova m0, m4
- mova m1, m5
- mova m4, m2
- mova m5, m3
- mova [esp+0x1c], m6
- mova [esp+0x0c], m7
- add sumsq_ptrq, (384+16)*4*2
- sub yd, 2
- jge .sumsq_loop_y
- ; l1 = l0
- mova m6, m2
- mova m7, m3
- cmp yd, ylimd
- jg .sumsq_loop_y_noload
- add xd, 8
- cmp xd, wm
- jl .sumsq_loop_x
-
- mov xd, -2
-.sum_loop_x:
- lea yd, [ylimd+2]
- add yd, hm
- lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
- test byte edgem, 4 ; have_top
- jnz .sum_load_top
- movu m0, [sum_ptrq+(384+16)*2*1]
- mova m1, m0
- mova m2, m0
- mova m3, m0
- jmp .sum_loop_y_second_load
-.sum_load_top:
- movu m0, [sum_ptrq-(384+16)*2*1] ; l3/4
- movu m2, [sum_ptrq-(384+16)*2*0] ; l2
- mova m1, m0
-.sum_loop_y:
- movu m3, [sum_ptrq+(384+16)*2*1] ; l1
-.sum_loop_y_second_load:
- test yd, yd
- jle .sum_emulate_second_load
- movu m4, [sum_ptrq+(384+16)*2*2] ; l0
-.sum_loop_y_noload:
- paddw m0, m1
- paddw m0, m2
- paddw m0, m3
- paddw m0, m4
- movu [sum_ptrq], m0
-
- ; shift position down by one
- mova m0, m2
- mova m1, m3
- mova m2, m4
- add sum_ptrq, (384+16)*2*2
- sub yd, 2
- jge .sum_loop_y
- ; l1 = l0
- mova m3, m4
- cmp yd, ylimd
- jg .sum_loop_y_noload
- add xd, 8
- cmp xd, wm
- jl .sum_loop_x
- RET
-.sumsq_emulate_second_load:
- mova m2, m6
- mova m3, m7
- jmp .sumsq_loop_y_noload
-.sum_emulate_second_load:
- mova m4, m3
- jmp .sum_loop_y_noload
-%endif
-
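-; sgr_calc_ab2: radius-2 counterpart of sgr_calc_ab1: p = 25*sumsq - sum^2,
-; b is overwritten with 256-x and a with (x * sum * 164 + 2048) >> 12
-; (computed as x*41 with a final shift by 10). Processes every other row.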
-cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
- movifnidn sd, sm
- sub aq, (384+16-1)*4
- sub bq, (384+16-1)*2
- add hd, 2
-%if ARCH_X86_64
- LEA r5, sgr_x_by_x-0xF03
-%else
- SETUP_PIC r5, 0
-%endif
- movd m6, sd
- pshuflw m6, m6, q0000
- punpcklqdq m6, m6
- pxor m7, m7
- DEFINE_ARGS a, b, w, h, x
-%if ARCH_X86_64
- mova m8, [pd_0xF0080029]
- mova m9, [pw_256]
- psrld m10, m9, 15 ; pd_512
-%else
- %define m8 [PIC_sym(pd_0xF0080029)]
- %define m9 [PIC_sym(pw_256)]
- %define m10 [PIC_sym(pd_512)]
-%endif
-.loop_y:
- mov xq, -2
-.loop_x:
- movq m0, [bq+xq*2+0]
- movq m1, [bq+xq*2+8]
- punpcklwd m0, m7
- punpcklwd m1, m7
- movu m2, [aq+xq*4+ 0]
- movu m3, [aq+xq*4+16]
- pslld m4, m2, 3 ; aa * 8
- pslld m5, m3, 3
- paddd m2, m4 ; aa * 9
- paddd m3, m5
- paddd m4, m4 ; aa * 16
- paddd m5, m5
- paddd m2, m4 ; aa * 25
- paddd m3, m5
- pmaddwd m4, m0, m0
- pmaddwd m5, m1, m1
- psubd m2, m4 ; p = aa * 25 - bb * bb
- psubd m3, m5
- MULLD m2, m6
- MULLD m3, m6
- paddusw m2, m8
- paddusw m3, m8
- psrld m2, 20 ; z
- psrld m3, 20
- GATHERDD m4, m2 ; xx
- GATHERDD m2, m3
- psrld m4, 24
- psrld m2, 24
- packssdw m3, m4, m2
- pmullw m4, m8
- pmullw m2, m8
- psubw m5, m9, m3
- pmaddwd m0, m4
- pmaddwd m1, m2
- paddd m0, m10
- paddd m1, m10
- psrld m0, 10
- psrld m1, 10
- movu [bq+xq*2], m5
- movu [aq+xq*4+ 0], m0
- movu [aq+xq*4+16], m1
- add xd, 8
- cmp xd, wd
- jl .loop_x
- add aq, (384+16)*4*2
- add bq, (384+16)*2*2
- sub hd, 2
- jg .loop_y
- RET
-
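-; sgr_finish_filter2: radius-2 finish filter; weights the coefficient planes
-; with the [5, 6, 5] kernel (pw_5_6), combines them with the source, and
-; writes two rows of t per iteration: (A*src + B + 256) >> 9 for the first
-; row of each pair and (A*src + B + 128) >> 8 for the second.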
-%if ARCH_X86_64
-cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
- tmp_base, src_base, a_base, b_base, x, y
- movifnidn wd, wm
- mov hd, hm
- mov tmp_baseq, tq
- mov src_baseq, srcq
- mov a_baseq, aq
- mov b_baseq, bq
- mova m9, [pw_5_6]
- mova m12, [pw_256]
- psrlw m10, m12, 8 ; pw_1
- psrlw m11, m12, 1 ; pw_128
- pxor m13, m13
-%else
-cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
- %define tmp_baseq r0m
- %define src_baseq r1m
- %define a_baseq r3m
- %define b_baseq r4m
- %define wd r5m
- %define hd r6m
-
- SUB esp, 8
- SETUP_PIC yd
-
- %define m8 m5
- %define m9 [PIC_sym(pw_5_6)]
- %define m10 [PIC_sym(pw_1)]
- %define m11 [PIC_sym(pw_128)]
- %define m12 [PIC_sym(pw_256)]
- %define m13 m0
-%endif
- xor xd, xd
-.loop_x:
- mov tq, tmp_baseq
- mov srcq, src_baseq
- mov aq, a_baseq
- mov bq, b_baseq
- movu m0, [aq+xq*4-(384+16)*4-4]
- mova m1, [aq+xq*4-(384+16)*4]
- movu m2, [aq+xq*4-(384+16)*4+4]
- movu m3, [aq+xq*4-(384+16)*4-4+16]
- mova m4, [aq+xq*4-(384+16)*4+16]
- movu m5, [aq+xq*4-(384+16)*4+4+16]
- paddd m0, m2
- paddd m3, m5
- paddd m0, m1
- paddd m3, m4
- pslld m2, m0, 2
- pslld m5, m3, 2
- paddd m2, m0
- paddd m5, m3
- paddd m0, m2, m1 ; prev_odd_b [first half]
- paddd m1, m5, m4 ; prev_odd_b [second half]
- movu m3, [bq+xq*2-(384+16)*2-2]
- mova m4, [bq+xq*2-(384+16)*2]
- movu m5, [bq+xq*2-(384+16)*2+2]
- paddw m3, m5
- punpcklwd m5, m3, m4
- punpckhwd m3, m4
- pmaddwd m5, m9
- pmaddwd m3, m9
- mova m2, m5
- packssdw m2, m3 ; prev_odd_a
- lea tq, [tq+xq*2]
- lea srcq, [srcq+xq*1]
- lea aq, [aq+xq*4+(384+16)*4]
- lea bq, [bq+xq*2+(384+16)*2]
-%if ARCH_X86_32
- mov [esp], PIC_reg
-%endif
- mov yd, hd
- XCHG_PIC_REG
-.loop_y:
- movu m3, [aq-4]
- mova m4, [aq]
- movu m5, [aq+4]
- paddd m3, m5
- paddd m3, m4
- pslld m5, m3, 2
- paddd m5, m3
- paddd m5, m4 ; cur_odd_b [first half]
- movu m3, [aq+16-4]
- mova m6, [aq+16]
- movu m7, [aq+16+4]
- paddd m3, m7
- paddd m3, m6
- pslld m7, m3, 2
- paddd m7, m3
- paddd m4, m7, m6 ; cur_odd_b [second half]
- movu m3, [bq-2]
- mova m6, [bq]
- movu m7, [bq+2]
- paddw m3, m7
- punpcklwd m7, m3, m6
- punpckhwd m3, m6
- pmaddwd m7, m9
- pmaddwd m3, m9
- packssdw m6, m7, m3 ; cur_odd_a
-
- paddd m0, m5 ; cur_even_b [first half]
- paddd m1, m4 ; cur_even_b [second half]
- paddw m2, m6 ; cur_even_a
-
- movq m3, [srcq]
-%if ARCH_X86_64
- punpcklbw m3, m13
-%else
- mova [td], m5
- pxor m7, m7
- punpcklbw m3, m7
-%endif
- punpcklwd m7, m3, m10
- punpckhwd m3, m10
- punpcklwd m8, m2, m12
- punpckhwd m2, m12
- pmaddwd m7, m8
- pmaddwd m3, m2
- paddd m7, m0
- paddd m3, m1
- psrad m7, 9
- psrad m3, 9
-
-%if ARCH_X86_32
- pxor m13, m13
-%endif
- movq m8, [srcq+strideq]
- punpcklbw m8, m13
- punpcklwd m0, m8, m10
- punpckhwd m8, m10
- punpcklwd m1, m6, m11
- punpckhwd m2, m6, m11
- pmaddwd m0, m1
- pmaddwd m8, m2
-%if ARCH_X86_64
- paddd m0, m5
-%else
- paddd m0, [td]
-%endif
- paddd m8, m4
- psrad m0, 8
- psrad m8, 8
-
- packssdw m7, m3
- packssdw m0, m8
-%if ARCH_X86_32
- mova m5, [td]
-%endif
- mova [tq+384*2*0], m7
- mova [tq+384*2*1], m0
-
- mova m0, m5
- mova m1, m4
- mova m2, m6
- add aq, (384+16)*4*2
- add bq, (384+16)*2*2
- add tq, 384*2*2
- lea srcq, [srcq+strideq*2]
-%if ARCH_X86_64
- sub yd, 2
-%else
- sub dword [esp+4], 2
-%endif
- jg .loop_y
- add xd, 8
- cmp xd, wd
- jl .loop_x
-%if ARCH_X86_32
- ADD esp, 8
-%endif
- RET
-
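-; sgr_weighted2: blend two filtered intermediates into the source using the
-; weight pair packed in wt: dst[x] = clip_pixel(src[x] +
-; ((wt0*(t1[x] - 16*src[x]) + wt1*(t2[x] - 16*src[x]) + 1024) >> 11))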
-cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
- movifnidn wd, wm
- movd m0, wtm
-%if ARCH_X86_64
- movifnidn hd, hm
- mova m10, [pd_1024]
- pxor m11, m11
-%else
- SETUP_PIC hd, 0
- %define m10 [PIC_sym(pd_1024)]
- %define m11 m7
-%endif
- pshufd m0, m0, 0
- DEFINE_ARGS dst, stride, t1, t2, w, h, idx
-%if ARCH_X86_32
- %define hd hmp
-%endif
-
-.loop_y:
- xor idxd, idxd
-.loop_x:
- mova m1, [t1q+idxq*2+ 0]
- mova m2, [t1q+idxq*2+16]
- mova m3, [t2q+idxq*2+ 0]
- mova m4, [t2q+idxq*2+16]
- mova m6, [dstq+idxq]
-%if ARCH_X86_32
- pxor m11, m11
-%endif
- punpcklbw m5, m6, m11
- punpckhbw m6, m11
- psllw m7, m5, 4
- psubw m1, m7
- psubw m3, m7
- psllw m7, m6, 4
- psubw m2, m7
- psubw m4, m7
- punpcklwd m7, m1, m3
- punpckhwd m1, m3
- punpcklwd m3, m2, m4
- punpckhwd m2, m4
- pmaddwd m7, m0
- pmaddwd m1, m0
- pmaddwd m3, m0
- pmaddwd m2, m0
- paddd m7, m10
- paddd m1, m10
- paddd m3, m10
- paddd m2, m10
- psrad m7, 11
- psrad m1, 11
- psrad m3, 11
- psrad m2, 11
- packssdw m7, m1
- packssdw m3, m2
- paddw m7, m5
- paddw m3, m6
- packuswb m7, m3
- mova [dstq+idxq], m7
- add idxd, 16
- cmp idxd, wd
- jl .loop_x
- add dstq, strideq
- add t1q, 384 * 2
- add t2q, 384 * 2
- dec hd
- jg .loop_y
- RET