ref: 8ff894639bbfaf849f54424f0d8241b076f0fd97
parent: e79e5ceb2cb74fc466e2868c4725d98ccca7cac7
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Sat Dec 21 11:06:45 EST 2019
SSSE3 implementations of film grain gen_grain_y_ar0_8bpc_c: 84853.3 gen_grain_y_ar0_8bpc_ssse3: 23528.0 gen_grain_y_ar1_8bpc_c: 140775.5 gen_grain_y_ar1_8bpc_ssse3: 70410.2 gen_grain_y_ar2_8bpc_c: 251311.3 gen_grain_y_ar2_8bpc_ssse3: 95222.2 gen_grain_y_ar3_8bpc_c: 394763.0 gen_grain_y_ar3_8bpc_ssse3: 103541.9 gen_grain_uv_ar0_8bpc_420_c: 29773.7 gen_grain_uv_ar0_8bpc_420_ssse3: 7068.9 gen_grain_uv_ar1_8bpc_420_c: 46113.2 gen_grain_uv_ar1_8bpc_420_ssse3: 22148.1 gen_grain_uv_ar2_8bpc_420_c: 70061.4 gen_grain_uv_ar2_8bpc_420_ssse3: 25479.0 gen_grain_uv_ar3_8bpc_420_c: 113826.0 gen_grain_uv_ar3_8bpc_420_ssse3: 30004.9 fguv_32x32xn_8bpc_420_csfl0_c: 8148.9 fguv_32x32xn_8bpc_420_csfl0_ssse3: 1371.3 fguv_32x32xn_8bpc_420_csfl1_c: 6391.9 fguv_32x32xn_8bpc_420_csfl1_ssse3: 1034.8 fgy_32x32xn_8bpc_c: 14201.3 fgy_32x32xn_8bpc_ssse3: 3443.0
--- a/include/dav1d/headers.h
+++ b/include/dav1d/headers.h
@@ -318,7 +318,7 @@
int scaling_shift;
int ar_coeff_lag;
int8_t ar_coeffs_y[24];
- int8_t ar_coeffs_uv[2][25];
+ int8_t ar_coeffs_uv[2][25 + 3 /* padding for alignment purposes */];
uint64_t ar_coeff_shift;
int grain_scale_shift;
int uv_mult[2];
--- a/src/meson.build
+++ b/src/meson.build
@@ -151,6 +151,7 @@
'x86/looprestoration.asm',
'x86/mc.asm',
'x86/cdef_sse.asm',
+ 'x86/film_grain_ssse3.asm',
'x86/ipred_ssse3.asm',
'x86/itx_ssse3.asm',
'x86/loopfilter_ssse3.asm',
--- a/src/x86/film_grain.asm
+++ b/src/x86/film_grain.asm
@@ -469,7 +469,7 @@
.ar0:
INIT_YMM avx2
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
- imul uvd, 25
+ imul uvd, 28
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
movd xm3, [base+hmul_bits+shiftq*2]
@@ -539,7 +539,7 @@
.ar1:
INIT_XMM avx2
DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
- imul uvd, 25
+ imul uvd, 28
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
@@ -606,7 +606,7 @@
.ar2:
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
- imul uvd, 25
+ imul uvd, 28
vpbroadcastw xm15, [base+round_vals-12+shiftq*2]
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
@@ -696,7 +696,7 @@
%assign stack_size_padded (stack_size_padded+16*12)
%assign stack_size (stack_size+16*12)
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
- imul uvd, 25
+ imul uvd, 28
vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7
pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15
--- a/src/x86/film_grain_init_tmpl.c
+++ b/src/x86/film_grain_init_tmpl.c
@@ -28,6 +28,11 @@
#include "src/cpu.h"
#include "src/film_grain.h"
+decl_generate_grain_y_fn(dav1d_generate_grain_y_ssse3);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_ssse3);
+decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);
+
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
@@ -35,6 +40,15 @@
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+#if BITDEPTH == 8
+ c->generate_grain_y = dav1d_generate_grain_y_ssse3;
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_ssse3;
+ c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
+#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--- /dev/null
+++ b/src/x86/film_grain_ssse3.asm
@@ -1,0 +1,2938 @@
+; Copyright © 2019, VideoLAN and dav1d authors
+; Copyright © 2019, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+pw_1024: times 8 dw 1024
+pb_27_17: times 8 db 27, 17
+pb_17_27: times 8 db 17, 27
+pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 ; parity LUT: entry i is 0x80 when i has an odd number of set bits; used as the pshufb source in the seed update
+rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 ; the same bit pattern shifted left by 0..3; ANDed with the broadcast seed
+byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 ; loaded at +0 to select byte 3, at +1 to select byte 2
+pw_seed_xor: times 2 dw 0xb524 ; read at pw_seed_xor+uv*4 and XORed into the seed by generate_grain_uv_420
+ times 2 dw 0x49d8
+pb_23_22: times 2 db 23, 22
+pb_1: times 4 db 1 ; pmaddubsw multiplier: sums pairs of adjacent bytes
+hmul_bits: dw 32768, 16384, 8192, 4096 ; pmulhuw by these = logical shift right by 1, 2, 3, 4
+round: dw 2048, 1024, 512 ; pmulhrsw multipliers, read at round+grain_scale_shift*2
+mul_bits: dw 256, 128, 64, 32, 16 ; NOTE(review): hmul_bits+shift*2 (uv .ar0) and mul_bits+r6*2-14 (fgy) index across these adjacent tables, presumably on purpose; keep hmul_bits/round/mul_bits in this order
+round_vals: dw 32, 64, 128, 256, 512 ; addressed as round_vals+shift*2-12, so entry 0 corresponds to shift == 6
+max: dw 255, 240, 235 ; fgy reads max+clip_to_restricted_range*4: 255 for 0, 235 for 1
+min: dw 0, 16 ; fgy reads min+clip_to_restricted_range*2: 0 for 0, 16 for 1
+pw_1: dw 1
+; pb_27_17_17_27 = last (27, 17) pair of pb_27_17 followed by the first (17, 27) pair of pb_17_27; those two tables must stay adjacent
+%define pb_27_17_17_27 pb_17_27 - 2
+
+%macro JMP_TABLE 1-* ; function_name, lag...: emits one 32-bit entry per listed lag, each holding (target label - table address)
+ %xdefine %1_table %%table
+ %xdefine %%base %1_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %%table:
+ %rep %0 - 1
+ dd %%prefix %+ .ar%2 - %%base ; offset of the function-local .arN label relative to the table itself
+ %rotate 1
+ %endrep
+%endmacro
+; Dispatch tables indexed by FGData.ar_coeff_lag; consumed by the "auto-regression code" jumps in the two generate_grain functions
+JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3
+
+struc FGData ; field offsets of the film grain parameter struct passed as fg_data
+ .seed: resd 1
+ .num_y_points: resd 1
+ .y_points: resb 14 * 2
+ .chroma_scaling_from_luma: resd 1
+ .num_uv_points: resd 2
+ .uv_points: resb 2 * 10 * 2
+ .scaling_shift: resd 1
+ .ar_coeff_lag: resd 1
+ .ar_coeffs_y: resb 24
+ .ar_coeffs_uv: resb 2 * 28 ; 2 planes * (25 coefficients + 3 padding bytes); the code multiplies uv by 28 to select a plane
+ .ar_coeff_shift: resq 1 ; 64-bit: also used directly as the memory count operand of psrad; its offset in this struc is 176 (a multiple of 16)
+ .grain_scale_shift: resd 1
+ .uv_mult: resd 2
+ .uv_luma_mult: resd 2
+ .uv_offset: resd 2
+ .overlap_flag: resd 1
+ .clip_to_restricted_range: resd 1
+endstruc
+
+cextern gaussian_sequence
+
+SECTION .text
+
+%macro SCRATCH 3 ; src_reg_idx, dst_reg_idx, stack_slot: make the value of m(src) available under the name m(dst)
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1 ; x86-32: spill to the given stack slot; m(dst) becomes an alias for that memory operand
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2 ; x86-64: uses the x86inc SWAP macro (presumably a pure register rename)
+%endif
+%endmacro
+
+INIT_XMM ssse3 ; generate_grain_y(buf, fg_data): fill buf with pseudo-random grain, then run the auto-regression filter selected by ar_coeff_lag
+cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
+ LEA r4, $$
+%define base r4-$$
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r2d, [fg_dataq+FGData.grain_scale_shift]
+ movd m2, [base+round+r2*2] ; pmulhrsw multiplier: 2048, 1024 or 512 depending on grain_scale_shift
+ movd m0, [fg_dataq+FGData.seed]
+ mova m5, [base+pb_mask]
+ pshuflw m2, m2, q0000
+ pshuflw m0, m0, q0000 ; seed broadcast to the 4 low words
+ mov r2, -73*82 ; negative byte counter: 73 rows of 82 bytes
+ sub bufq, r2 ; bufq = end of that region; the loop addresses [bufq+r2]
+ lea r3, [base+gaussian_sequence]
+.loop: ; each iteration advances the seed four times and emits 4 grain bytes
+ pand m6, m0, m1
+ psrlw m3, m6, 10
+ por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m6, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m6 ; set 15th bit for next 4 seeds
+ psllq m6, m3, 30
+ por m3, m6
+ psllq m6, m3, 15
+ por m3, m6 ; aggregate each bit into next seed's high bit
+ pmulhuw m6, m0, m7 ; seed >> 1, >> 2, >> 3, >> 4 in words 0..3
+ por m3, m6 ; 4 next output seeds
+ pshuflw m0, m3, q3333 ; the 4th new seed is the input of the next iteration
+ psrlw m3, 5 ; top 11 bits of each seed = index into gaussian_sequence (word table, hence *2 below)
+%if ARCH_X86_64
+ movq r6, m3 ; all four 16-bit indices in one GPR, split into r5..r8
+ mov r8, r6
+ movzx r5d, r6w
+ shr r6d, 16
+ shr r8, 32
+ movzx r7, r8w
+ shr r8, 16
+
+ movd m6, [r3+r5*2]
+ pinsrw m6, [r3+r6*2], 1
+ pinsrw m6, [r3+r7*2], 2
+ pinsrw m6, [r3+r8*2], 3
+%else
+ movd r6, m3 ; x86-32: fetch the indices two at a time through r5/r6
+ pshuflw m3, m3, q3232
+ movzx r5, r6w
+ shr r6, 16
+
+ movd m6, [r3+r5*2]
+ pinsrw m6, [r3+r6*2], 1
+
+ movd r6, m3
+ movzx r5, r6w
+ shr r6, 16
+
+ pinsrw m6, [r3+r5*2], 2
+ pinsrw m6, [r3+r6*2], 3
+%endif
+ pmulhrsw m6, m2 ; rounded right shift by 4 + grain_scale_shift
+ packsswb m6, m6 ; saturate to signed bytes
+ movd [bufq+r2], m6 ; NOTE(review): 73*82 is not a multiple of 4, so the final store ends 2 bytes past the 73*82 region; assumes the caller's buffer has that slack
+ add r2, 4
+ jl .loop
+
+ ; auto-regression code
+ movsxd r2, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r2, [base+generate_grain_y_ssse3_table+r2*4] ; table entry = target label - table address
+ lea r2, [r2+base+generate_grain_y_ssse3_table]
+ jmp r2 ; continues at .ar0 / .ar1 / .ar2 / .ar3
+
+.ar1: ; lag 1: three neighbours from the row above are summed with SIMD, the left neighbour is handled in scalar code (serial dependency)
+%if ARCH_X86_32
+ DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
+%elif WIN64
+ DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
+ mov bufq, r0
+%else
+ DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
+%endif
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] ; coefficient applied to the left neighbour
+ movd m4, [fg_dataq+FGData.ar_coeffs_y] ; coefficients 0..3 as bytes
+ mov ecx, [fg_dataq+FGData.ar_coeff_shift] ; the sar below shifts by shiftb; NOTE(review): the DEFINE_ARGS variants presumably place shift on rcx/ecx for every ABI
+%if ARCH_X86_32
+ mov r1m, cf3d
+ DEFINE_ARGS buf, shift, val3, min, max, x, val0
+%define hd r0mp
+%define cf3d r1mp
+%elif WIN64
+ DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
+%else
+ DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
+%endif
+ pxor m6, m6
+ pcmpgtb m7, m6, m4
+ punpcklbw m4, m7 ; sign-extend the coefficients to words
+ pinsrw m4, [base+pw_1], 3 ; words: cf0, cf1, cf2, 1
+ pshufd m5, m4, q1111 ; (cf2, 1) pairs, multiplied with (top/right, rnd)
+ pshufd m4, m4, q0000 ; (cf0, cf1) pairs, multiplied with (top/left, top)
+ movd m3, [base+round_vals+shiftq*2-12] ; rnd
+ pshuflw m3, m3, q0000
+ sub bufq, 82*73-(82*3+79) ; bufq was buf+73*82; now buf+82*3+79, so x = -76..-1 addresses columns 3..78 of row 3
+ mov hd, 70 ; 70 rows
+ mov mind, -128
+ mov maxd, 127
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, byte [bufq+xq-1] ; left neighbour of the first pixel in this row
+.x_loop_ar1: ; vector part: weighted top-row sums (plus rnd) for 4 consecutive pixels, one per dword of m0
+ movq m0, [bufq+xq-82-1] ; top/left
+ pcmpgtb m7, m6, m0
+ punpcklbw m0, m7
+ psrldq m2, m0, 2 ; top
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+.x_loop_ar1_inner: ; scalar part, one pixel per iteration
+ movd val0d, m0 ; top-row sum for this pixel
+ psrldq m0, 4
+ imul val3d, cf3d ; left * cf3
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, byte [bufq+xq]
+ add val3d, val0d ; add the unfiltered grain value
+ cmp val3d, maxd
+ cmovg val3d, maxd ; clamp to [-128, 127]
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov byte [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3 ; every 4 pixels the vector sums are exhausted and must be recomputed
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82 ; next row
+ dec hd
+ jg .y_loop_ar1
+.ar0: ; lag 0 returns immediately (no filtering); .ar1 also falls through to here when done
+ RET
+
+.ar2: ; lag 2: 5 neighbours in each of the two rows above plus 2 to the left (12 coefficients)
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -16*8
+%endif
+ DEFINE_ARGS buf, fg_data, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m6, [base+round_vals-12+shiftq*2] ; rnd
+ movd m7, [base+byte_blend+1] ; mask with 0xff in byte 2
+ SCRATCH 7, 15, 7
+ movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
+ movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
+ pxor m7, m7 ; m7 stays zero for the sign-extension compares below
+ pshuflw m6, m6, q0000
+ punpcklwd m6, m7 ; rnd as dwords
+ pcmpgtb m4, m7, m0
+ pcmpgtb m5, m7, m1
+ punpcklbw m0, m4 ; coefficients sign-extended to words
+ punpcklbw m1, m5
+ DEFINE_ARGS buf, fg_data, h, x
+ pshufd m4, m1, q0000 ; (cf8, cf9)
+ pshufd m5, m1, q1111 ; (cf10, cf11)
+ pshufd m3, m0, q3333 ; (cf6, cf7)
+ pshufd m2, m0, q2222 ; (cf4, cf5)
+ pshufd m1, m0, q1111 ; (cf2, cf3)
+ pshufd m0, m0, q0000 ; (cf0, cf1)
+ SCRATCH 0, 8, 0
+ SCRATCH 1, 9, 1
+ SCRATCH 2, 10, 2
+ SCRATCH 3, 11, 3
+ SCRATCH 4, 12, 4
+ SCRATCH 5, 13, 5
+ SCRATCH 6, 14, 6
+ sub bufq, 82*73-(82*3+79) ; bufq = buf+82*3+79: x = -76..-1 addresses columns 3..78 of row 3
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+
+.x_loop_ar2: ; sum the two rows above for 4 consecutive pixels into the dwords of m2
+ movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pcmpgtb m2, m7, m0
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2
+ psrldq m5, m0, 2 ; y=-2,x=[-1,+5]
+ psrldq m3, m1, 2 ; y=-1,x=[-1,+5]
+ psrldq m4, m1, 4 ; y=-1,x=[+0,+5]
+ punpcklwd m2, m0, m5
+ punpcklwd m3, m4
+ pmaddwd m2, m8
+ pmaddwd m3, m11
+ paddd m2, m3
+
+ psrldq m4, m0, 4 ; y=-2,x=[+0,+5]
+ psrldq m5, m0, 6 ; y=-2,x=[+1,+5]
+ psrldq m6, m0, 8 ; y=-2,x=[+2,+5]
+ punpcklwd m4, m5
+ punpcklwd m6, m1
+ psrldq m5, m1, 6 ; y=-1,x=[+1,+5]
+ psrldq m1, m1, 8 ; y=-1,x=[+2,+5]
+ punpcklwd m5, m1
+ pmaddwd m4, m9
+ pmaddwd m6, m10
+ pmaddwd m5, m12
+ paddd m4, m6
+ paddd m2, m5
+ paddd m2, m4
+ paddd m2, m14 ; add rnd
+
+ movq m0, [bufq+xq-2] ; y=0,x=[-2,+5]
+.x_loop_ar2_inner: ; one pixel per iteration; m0 holds the current row as bytes
+ pcmpgtb m4, m7, m0
+ punpcklbw m1, m0, m4
+ pmaddwd m3, m1, m13 ; dword 0 = cf10*x[-2] + cf11*x[-1]
+ paddd m3, m2
+ psrldq m1, 4 ; y=0,x=0
+ psrldq m2, 4 ; shift top to next pixel
+ psrad m3, [fg_dataq+FGData.ar_coeff_shift] ; shift count read straight from memory (full 16-byte operand)
+ ; don't packssdw since we only care about one value
+ paddw m3, m1
+ packsswb m3, m3
+ pslldq m3, 2 ; move the result to byte 2, the x=0 position in m0
+ pand m3, m15
+ pandn m1, m15, m0
+ por m0, m1, m3 ; m0 with only the x=0 byte replaced
+ psrldq m0, 1 ; slide the window by one pixel
+ ; overwrite 2 pixels, but that's ok
+ movd [bufq+xq-1], m0
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3: ; lag 3: 7 neighbours in each of the three rows above plus 3 to the left (24 coefficients)
+ DEFINE_ARGS buf, fg_data, shift
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+ ALLOC_STACK -16*14
+%elif WIN64
+ SUB rsp, 16*6
+%assign stack_size_padded (stack_size_padded+16*6)
+%assign stack_size (stack_size+16*6)
+%else
+ ALLOC_STACK -16*6
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m6, [base+round_vals-12+shiftq*2] ; rnd
+ movd m7, [base+byte_blend] ; mask with 0xff in byte 3
+ movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
+ movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
+ pxor m3, m3
+ pcmpgtb m4, m3, m0
+ pcmpgtb m3, m2
+ pshuflw m6, m6, q0000
+ SCRATCH 6, 14, 12
+ SCRATCH 7, 15, 13
+ punpckhbw m1, m0, m4 ; cf8-15 as words
+ punpcklbw m0, m4 ; cf0-7 as words
+ punpcklbw m2, m3 ; cf16-23 as words
+ pshufd m3, m0, q1111
+ pshufd m4, m0, q2222
+ pshufd m5, m0, q3333
+ pshufd m0, m0, q0000
+ mova [rsp+ 0*16], m0 ; (cf0, cf1)
+ mova [rsp+ 1*16], m3 ; (cf2, cf3)
+ mova [rsp+ 2*16], m4 ; (cf4, cf5)
+ mova [rsp+ 3*16], m5 ; (cf6, cf7)
+ pshufd m6, m1, q1111
+ pshufd m7, m1, q2222
+ pshufd m5, m1, q3333
+ pshufd m1, m1, q0000
+ pshufd m3, m2, q1111
+ psrldq m0, m2, 10 ; words: cf21, cf22, cf23, 0, ...
+ pinsrw m2, [base+pw_1], 5 ; word 5 (cf21 slot) <- 1, so q2222 below yields (cf20, 1)
+ pshufd m4, m2, q2222
+ pshufd m2, m2, q0000
+ pinsrw m0, [base+round_vals+shiftq*2-10], 3 ; word 3 <- the round_vals entry one above rnd, i.e. 1 << shift: scales the current pixel so it survives the psrad unchanged
+ mova [rsp+ 4*16], m1 ; (cf8, cf9)
+ mova [rsp+ 5*16], m6 ; (cf10, cf11)
+ SCRATCH 7, 8, 6
+ SCRATCH 5, 9, 7
+ SCRATCH 2, 10, 8
+ SCRATCH 3, 11, 9
+ SCRATCH 4, 12, 10
+ SCRATCH 0, 13, 11
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 82*73-(82*3+79) ; bufq = buf+82*3+79: x = -76..-1 addresses columns 3..78 of row 3
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+
+.x_loop_ar3: ; sum the three rows above for 4 consecutive pixels into the dwords of m0
+ movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
+ pxor m3, m3
+ pcmpgtb m3, m0
+ punpckhbw m2, m0, m3
+ punpcklbw m0, m3
+
+ psrldq m5, m0, 2
+ psrldq m6, m0, 4
+ psrldq m7, m0, 6
+ punpcklwd m4, m0, m5
+ punpcklwd m6, m7
+ pmaddwd m4, [rsp+ 0*16]
+ pmaddwd m6, [rsp+ 1*16]
+ paddd m4, m6
+
+ movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
+ pxor m5, m5
+ pcmpgtb m5, m1
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ palignr m6, m2, m0, 10
+ palignr m7, m2, m0, 12
+ psrldq m0, 8
+ punpcklwd m0, m6
+ punpcklwd m7, m1 ; pairs (y=-3,x=+3) with (y=-2,x=-3)
+ pmaddwd m0, [rsp+ 2*16]
+ pmaddwd m7, [rsp+ 3*16]
+ paddd m0, m7
+ paddd m0, m4
+
+ psrldq m4, m1, 2
+ psrldq m5, m1, 4
+ psrldq m6, m1, 6
+ psrldq m7, m1, 8
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ pmaddwd m4, [rsp+ 4*16]
+ pmaddwd m6, [rsp+ 5*16]
+ paddd m4, m6
+ paddd m0, m4
+
+ movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ pxor m7, m7
+ pcmpgtb m7, m2
+ punpckhbw m5, m2, m7
+ punpcklbw m2, m7
+ palignr m7, m3, m1, 10
+ palignr m3, m1, 12
+ psrldq m1, m2, 2
+ punpcklwd m7, m3
+ punpcklwd m3, m2, m1
+ pmaddwd m7, m8
+ pmaddwd m3, m9
+ paddd m7, m3
+ paddd m0, m7
+
+ psrldq m6, m2, 4
+ psrldq m1, m2, 6
+ psrldq m3, m2, 8
+ palignr m4, m5, m2, 10
+ palignr m5, m5, m2, 12
+
+ punpcklwd m6, m1
+ punpcklwd m3, m4
+ punpcklwd m5, m14 ; pairs (y=-1,x=+3) with rnd, multiplied by (cf20, 1)
+ pmaddwd m6, m10
+ pmaddwd m3, m11
+ pmaddwd m5, m12
+ paddd m0, m6
+ paddd m3, m5
+ paddd m0, m3
+
+ movq m1, [bufq+xq-3] ; y=0,x=[-3,+4]
+.x_loop_ar3_inner: ; one pixel per iteration; m1 holds the current row as bytes
+ pxor m5, m5
+ pcmpgtb m5, m1
+ punpcklbw m2, m1, m5
+ pmaddwd m2, m13 ; dword 0 = cf21*x[-3] + cf22*x[-2], dword 1 = cf23*x[-1] + (x[0] << shift)
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift] ; shift count read straight from memory (full 16-byte operand)
+ ; don't packssdw since we only care about one value
+ packsswb m2, m2
+ pslldq m2, 3 ; move the result to byte 3, the x=0 position in m1
+ pand m2, m15
+ pandn m3, m15, m1
+ por m1, m2, m3 ; m1 with only the x=0 byte replaced
+ movd [bufq+xq-3], m1 ; stores x=-3..0; only x=0 actually changed
+ psrldq m1, 1 ; slide the window by one pixel
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar3
+ RET
+
+INIT_XMM ssse3 ; generate_grain_uv_420(buf, bufy, fg_data, uv): fill buf with pseudo-random chroma grain, then run the auto-regression filter selected by ar_coeff_lag (the filters also read the luma grain in bufy)
+cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
+ movifnidn r2, r2mp
+ movifnidn r3, r3mp
+ LEA r4, $$
+%define base r4-$$
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r5d, [fg_dataq+FGData.grain_scale_shift]
+ movd m6, [base+round+r5*2] ; pmulhrsw multiplier: 2048, 1024 or 512 depending on grain_scale_shift
+ mova m5, [base+pb_mask]
+ movd m0, [fg_dataq+FGData.seed]
+ movd m2, [base+pw_seed_xor+uvq*4] ; per-plane constant: 0xb524 for uv == 0, 0x49d8 for uv == 1
+ pxor m0, m2
+ pshuflw m6, m6, q0000
+ pshuflw m0, m0, q0000 ; seed broadcast to the 4 low words
+ lea r6, [base+gaussian_sequence]
+%if ARCH_X86_64
+ mov r7d, 38 ; row counter
+%else
+ mov r3mp, 38 ; x86-32: row counter kept in the uv argument's stack slot (uv itself is already in r3)
+%endif
+ add bufq, 44 ; point at the end of the 44 bytes written per row
+.loop_y:
+ mov r5, -44
+.loop_x: ; each iteration advances the seed four times and emits 4 grain bytes
+ pand m2, m0, m1
+ psrlw m3, m2, 10
+ por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m2, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
+ psllq m2, m3, 30
+ por m3, m2
+ psllq m2, m3, 15
+ por m3, m2 ; aggregate each bit into next seed's high bit
+ pmulhuw m2, m0, m7 ; seed >> 1, >> 2, >> 3, >> 4 in words 0..3
+ por m2, m3 ; 4 next output seeds
+ pshuflw m0, m2, q3333 ; the 4th new seed is the input of the next iteration
+ psrlw m2, 5 ; top 11 bits of each seed = index into gaussian_sequence (word table, hence *2 below)
+%if ARCH_X86_64
+ movd r9d, m2
+ pshuflw m2, m2, q3232
+ movzx r8, r9w
+ shr r9, 16
+
+ movd m3, [r6+r8*2]
+ pinsrw m3, [r6+r9*2], 1
+
+ movd r9d, m2
+ movzx r8, r9w
+ shr r9, 16
+
+ pinsrw m3, [r6+r8*2], 2
+ pinsrw m3, [r6+r9*2], 3
+%else
+ movd r2, m2 ; x86-32: r1/r2 are used as scratch here; r2 (fg_data) is reloaded after the loops
+ pshuflw m2, m2, q3232
+ movzx r1, r2w
+ shr r2, 16
+
+ movd m3, [r6+r1*2]
+ pinsrw m3, [r6+r2*2], 1
+
+ movd r2, m2
+ movzx r1, r2w
+ shr r2, 16
+
+ pinsrw m3, [r6+r1*2], 2
+ pinsrw m3, [r6+r2*2], 3
+%endif
+ pmulhrsw m3, m6 ; rounded right shift by 4 + grain_scale_shift
+ packsswb m3, m3 ; saturate to signed bytes
+ movd [bufq+r5], m3
+ add r5, 4
+ jl .loop_x
+ add bufq, 82 ; next row (82-byte row stride)
+%if ARCH_X86_64
+ dec r7d
+%else
+ dec r3mp
+%endif
+ jg .loop_y
+
+%if ARCH_X86_32
+ mov r2, r2mp ; restore fg_data, clobbered as scratch above
+%endif
+
+ ; auto-regression code
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r5, [base+generate_grain_uv_420_ssse3_table+r5*4] ; table entry = target label - table address
+ lea r5, [r5+base+generate_grain_uv_420_ssse3_table]
+ jmp r5 ; continues at .ar0 / .ar1 / .ar2 / .ar3
+
+.ar0: ; lag 0: no chroma neighbours; only the 2x2-averaged luma grain, scaled by coefficient 0, is added to each chroma value
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ movifnidn bufyq, bufymp
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -2*16
+%endif
+ imul uvd, 28 ; byte offset of this plane's coefficients
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] ; only byte 0 (coefficient 0) is used
+ movd m4, [base+hmul_bits+shiftq*2] ; pmulhrsw multiplier for the final rounded shift; NOTE(review): for shift >= 4 this indexes past hmul_bits into the adjacent round/mul_bits tables (e.g. shift 6 -> 512), which looks intentional
+ movd m1, [base+byte_blend]
+ DEFINE_ARGS buf, bufy, h
+ pxor m0, m0
+ pcmpgtb m0, m5
+ punpcklbw m5, m0 ; sign-extend the coefficient to a word
+ movd m7, [base+pb_1]
+ movd m6, [base+hmul_bits+4] ; 8192: pmulhrsw by this = rounded shift right by 2
+ pshuflw m5, m5, q0000
+ pshuflw m4, m4, q0000
+ pshufd m7, m7, q0000
+ pshuflw m6, m6, q0000
+ punpcklqdq m5, m5
+ punpcklqdq m4, m4
+ punpcklqdq m6, m6
+ punpcklbw m1, m1 ; mask with 0xff in bytes 6 and 7
+ SCRATCH 1, 8, 0
+ SCRATCH 4, 9, 1
+ sub bufq, 82*38+82-(82*3+41) ; bufq was buf+44+38*82; now buf+82*3+3 (row 3, column 3)
+ add bufyq, 3+82*3 ; luma row 3, column 3
+ mov hd, 35 ; 35 rows
+.y_loop_ar0:
+ ; first 32 pixels
+ movu m1, [bufyq] ; two luma rows, 32 bytes each, per 16 chroma values
+ movu m2, [bufyq+82]
+ movu m3, [bufyq+16]
+ movu m4, [bufyq+82+16]
+ pmaddubsw m0, m7, m1 ; sum horizontally adjacent luma bytes
+ pmaddubsw m1, m7, m2
+ pmaddubsw m2, m7, m3
+ pmaddubsw m3, m7, m4
+ paddw m0, m1 ; add the two rows: 2x2 sums
+ paddw m2, m3
+ pmulhrsw m0, m6 ; rounded average of the 2x2 block
+ pmulhrsw m2, m6
+ pmullw m0, m5 ; times the coefficient
+ pmullw m2, m5
+ pmulhrsw m0, m9 ; rounded shift
+ pmulhrsw m2, m9
+ packsswb m0, m2
+ movu m1, [bufq]
+ punpckhbw m2, m0, m1 ; interleave luma term with chroma grain...
+ punpcklbw m0, m1
+ pmaddubsw m1, m7, m2 ; ...and add each pair as signed bytes
+ pmaddubsw m2, m7, m0
+ packsswb m2, m1 ; saturate back to signed bytes
+ movu [bufq], m2
+ add bufyq, 32
+ add bufq, 16
+ xor hd, 0x10000 ; bit 16 of hd toggles so the 16-pixel body runs exactly twice per row
+ test hd, 0x10000
+ jnz .y_loop_ar0
+
+ ; last 6 pixels
+ movu m1, [bufyq]
+ movu m2, [bufyq+82]
+ pmaddubsw m0, m7, m1
+ pmaddubsw m1, m7, m2
+ paddw m0, m1
+ pmulhrsw m0, m6
+ pmullw m0, m5
+ pmulhrsw m0, m9
+ packsswb m0, m0
+ movq m1, [bufq]
+ punpcklbw m0, m1
+ pmaddubsw m2, m7, m0
+ packsswb m2, m2
+ pandn m0, m8, m2 ; new values for bytes 0..5
+ pand m1, m8 ; original values for bytes 6..7
+ por m0, m1
+ movq [bufq], m0 ; 8-byte store, but only 6 bytes change
+
+ add bufq, 82-32 ; back to column 3 of the next chroma row
+ add bufyq, 82*2-64 ; back to column 3, two luma rows down
+ dec hd
+ jg .y_loop_ar0
+ RET
+
+.ar1: ; lag 1: three chroma neighbours above plus the averaged luma (SIMD), left neighbour in scalar code
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
+ imul uvd, 28 ; byte offset of this plane's coefficients
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] ; coefficient applied to the left neighbour
+ movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] ; starts one byte early: bytes are (previous byte), cf0, cf1, cf2
+ pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 ; bytes 4-5 <- cf4, cf5
+%if ARCH_X86_32
+ mov r3mp, cf3d
+ DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
+%elif WIN64
+ DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
+ mov bufq, r0
+%else
+ DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m3, [base+round_vals+shiftq*2-12] ; rnd
+ movd m7, [base+pb_1]
+ movd m6, [base+hmul_bits+4] ; 8192: pmulhrsw by this = rounded shift right by 2
+ psrldq m4, 1 ; drop the extra leading byte: bytes are now cf0, cf1, cf2, cf4, ...
+%if ARCH_X86_32
+ DEFINE_ARGS buf, shift, val0, val3, min, max, x
+%elif WIN64
+ DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
+%else
+ DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
+%endif
+ pxor m5, m5
+ punpcklwd m3, m5 ; rnd as a dword
+ punpcklwd m6, m6
+ pcmpgtb m5, m4
+ punpcklbw m4, m5 ; sign-extend the coefficients to words
+ pshufd m5, m4, q1111 ; (cf2, cf4) pairs, multiplied with (top/right, luma average)
+ pshufd m4, m4, q0000 ; (cf0, cf1) pairs, multiplied with (top/left, top)
+ pshufd m3, m3, q0000
+ pshufd m7, m7, q0000
+ pshufd m6, m6, q0000
+ sub bufq, 82*38+44-(82*3+41) ; bufq was buf+44+38*82; now buf+82*3+41, so x = -38..-1 addresses columns 3..40 of row 3
+%if ARCH_X86_32
+ add r1mp, 79+82*3 ; x86-32: the bufy pointer and the row counter live in stack slots
+ mov r0mp, 35
+%else
+ add bufyq, 79+82*3 ; addressed as [bufyq+x*2]: luma row 3, columns 3..
+ mov hd, 35
+%endif
+ mov mind, -128
+ mov maxd, 127
+.y_loop_ar1:
+ mov xq, -38
+ movsx val3d, byte [bufq+xq-1] ; left neighbour of the first pixel in this row
+.x_loop_ar1: ; vector part: top-row and luma sums (plus rnd) for 4 consecutive pixels, one per dword of m0
+%if ARCH_X86_32
+ mov r2, r1mp
+ movq m0, [r2+xq*2]
+ movq m1, [r2+xq*2+82]
+%else
+ movq m0, [bufyq+xq*2]
+ movq m1, [bufyq+xq*2+82]
+%endif
+ pmaddubsw m2, m7, m0 ; sum horizontally adjacent luma bytes
+ pmaddubsw m0, m7, m1
+ paddw m2, m0 ; 2x2 sums
+ pmulhrsw m2, m6 ; rounded average of the 2x2 block
+
+ movq m0, [bufq+xq-82-1] ; top/left
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpcklbw m0, m1
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m1, m2
+ psrldq m2, m0, 2 ; top
+ punpcklwd m0, m2
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+ paddd m0, m3
+.x_loop_ar1_inner: ; scalar part, one pixel per iteration
+ movd val0d, m0 ; vector sum for this pixel
+ psrldq m0, 4
+%if ARCH_X86_32
+ imul val3d, r3mp
+%else
+ imul val3d, cf3d
+%endif
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, byte [bufq+xq]
+ add val3d, val0d ; add the unfiltered grain value
+ cmp val3d, maxd
+ cmovg val3d, maxd ; clamp to [-128, 127]
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov byte [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3 ; every 4 increments of x the vector sums are recomputed
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82 ; next chroma row
+%if ARCH_X86_32
+ add r1mp, 82*2
+ dec r0mp
+%else
+ add bufyq, 82*2 ; two luma rows per chroma row
+ dec hd
+%endif
+ jg .y_loop_ar1
+ RET
+
+.ar2: ; lag 2: 5 chroma neighbours in each of the two rows above, 2 to the left, plus the averaged luma (13 coefficients)
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+ ALLOC_STACK -8*16
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ movifnidn bufyq, bufymp
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28 ; byte offset of this plane's coefficients
+ movd m7, [base+round_vals-12+shiftq*2] ; rnd
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12 (16 bytes are loaded; only 0-12 are used)
+ pxor m2, m2
+ pcmpgtb m2, m0
+ punpckhbw m1, m0, m2 ; cf8-15 as words
+ punpcklbw m0, m2 ; cf0-7 as words
+ pinsrw m1, [base+pw_1], 5 ; word 5 (cf13 slot) <- 1, so q2222 below yields (cf12, 1)
+ punpcklwd m7, m7
+ pshufd m7, m7, q0000 ; rnd in every word
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+ pshufd m4, m1, q0000 ; (cf8, cf9)
+ pshufd m5, m1, q1111 ; (cf10, cf11)
+ pshufd m6, m1, q2222 ; (cf12, 1), multiplied with (luma average, rnd)
+ pshufd m3, m0, q3333 ; (cf6, cf7)
+ pshufd m2, m0, q2222 ; (cf4, cf5)
+ pshufd m1, m0, q1111 ; (cf2, cf3)
+ pshufd m0, m0, q0000 ; (cf0, cf1)
+ SCRATCH 0, 8, 0
+ SCRATCH 1, 9, 1
+ SCRATCH 2, 10, 2
+ SCRATCH 3, 11, 3
+ SCRATCH 4, 12, 4
+ SCRATCH 5, 13, 5
+ SCRATCH 6, 14, 6
+ SCRATCH 7, 15, 7
+ movd m7, [base+hmul_bits+4] ; 8192: pmulhrsw by this = rounded shift right by 2
+ movd m6, [base+pb_1]
+ punpcklwd m7, m7
+ pshufd m6, m6, q0000
+ pshufd m7, m7, q0000
+ sub bufq, 82*38+44-(82*3+41) ; bufq = buf+82*3+41: x = -38..-1 addresses columns 3..40 of row 3
+ add bufyq, 79+82*3 ; addressed as [bufyq+x*2]: luma row 3, columns 3..
+ mov hd, 35
+.y_loop_ar2:
+ mov xq, -38
+
+.x_loop_ar2: ; sum the two rows above and the luma term for 4 consecutive pixels into the dwords of m2
+ pxor m2, m2
+ movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pcmpgtb m2, m0
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2
+ psrldq m5, m0, 2 ; y=-2,x=[-1,+5]
+ psrldq m3, m1, 2 ; y=-1,x=[-1,+5]
+ psrldq m4, m1, 4 ; y=-1,x=[+0,+5]
+ punpcklwd m2, m0, m5
+ punpcklwd m3, m4
+ pmaddwd m2, m8
+ pmaddwd m3, m11
+ paddd m2, m3
+
+ psrldq m4, m0, 4 ; y=-2,x=[+0,+5]
+ psrldq m5, m0, 6 ; y=-2,x=[+1,+5]
+ psrldq m0, 8 ; y=-2,x=[+2,+5]
+ punpcklwd m4, m5
+ punpcklwd m0, m1
+ psrldq m3, m1, 6 ; y=-1,x=[+1,+5]
+ psrldq m1, m1, 8 ; y=-1,x=[+2,+5]
+ punpcklwd m3, m1
+ pmaddwd m4, m9
+ pmaddwd m0, m10
+ pmaddwd m3, m12
+ paddd m4, m0
+ paddd m2, m3
+ paddd m2, m4
+
+ movq m0, [bufyq+xq*2]
+ movq m3, [bufyq+xq*2+82]
+ pmaddubsw m1, m6, m0 ; sum horizontally adjacent luma bytes
+ pmaddubsw m0, m6, m3
+ paddw m0, m1 ; 2x2 sums
+ pmulhrsw m0, m7 ; rounded average of the 2x2 block
+ punpcklwd m0, m15 ; (luma average, rnd) pairs
+ pmaddwd m0, m14 ; luma * cf12 + rnd
+ paddd m2, m0
+
+ movq m0, [bufq+xq-2] ; y=0,x=[-2,+5]
+ pxor m4, m4
+ movd m5, [base+byte_blend+1]
+ punpcklbw m5, m5 ; mask selecting word 2, the x=0 position
+.x_loop_ar2_inner: ; one pixel per iteration; m0 holds the current row as bytes
+ pcmpgtb m1, m4, m0
+ punpcklbw m0, m1
+ pmaddwd m3, m0, m13 ; dword 0 = cf10*x[-2] + cf11*x[-1]
+ paddd m3, m2
+ psrldq m2, 4 ; shift top to next pixel
+ psrad m3, [fg_dataq+FGData.ar_coeff_shift] ; shift count read straight from memory (full 16-byte operand)
+ pslldq m3, 4 ; move the result from dword 0 to dword 1 (word 2)
+ pand m3, m5
+ paddw m0, m3 ; add it to the x=0 word only
+ packsswb m0, m0
+ movd [bufq+xq-2], m0 ; stores x=-2..+1; only x=0 actually changed
+ psrldq m0, 1 ; slide the window by one pixel
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82 ; next chroma row
+ add bufyq, 82*2 ; two luma rows per chroma row
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3: ; lag 3: 7 chroma neighbours in each of the three rows above, 3 to the left, plus the averaged luma (25 coefficients)
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ movifnidn bufyq, bufymp
+%if ARCH_X86_32
+ ALLOC_STACK -15*16
+%else
+ SUB rsp, 16*7
+%assign stack_size_padded (stack_size_padded+16*7)
+%assign stack_size (stack_size+16*7)
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28 ; byte offset of this plane's coefficients
+
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
+ pxor m3, m3
+ pcmpgtb m3, m0
+ punpckhbw m1, m0, m3 ; cf8-15 as words
+ punpcklbw m0, m3 ; cf0-7 as words
+ pshufd m2, m0, q1111
+ pshufd m3, m0, q2222
+ pshufd m4, m0, q3333
+ pshufd m0, m0, q0000
+ pshufd m5, m1, q1111
+ pshufd m6, m1, q2222
+ pshufd m7, m1, q3333
+ pshufd m1, m1, q0000
+ mova [rsp+ 0*16], m0 ; (cf0, cf1)
+ mova [rsp+ 1*16], m2 ; (cf2, cf3)
+ mova [rsp+ 2*16], m3 ; (cf4, cf5)
+ mova [rsp+ 3*16], m4 ; (cf6, cf7)
+ mova [rsp+ 4*16], m1 ; (cf8, cf9)
+ mova [rsp+ 5*16], m5 ; (cf10, cf11)
+ mova [rsp+ 6*16], m6 ; (cf12, cf13)
+ SCRATCH 7, 8, 7
+
+ movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma]
+ pxor m4, m4
+ pcmpgtb m4, m2
+ punpckhbw m5, m2, m4 ; cf24.. as words
+ punpcklbw m2, m4 ; cf16-23 as words
+ pshufd m4, m2, q3232 ; words: cf20, cf21, cf22, cf23, ...
+ punpcklwd m3, m4, m5 ; words: cf20, cf24, cf21, ...
+ pshuflw m5, m4, q3321 ; words: cf21, cf22, cf23, cf23
+ pshufd m4, m3, q0000 ; (cf20, cf24), multiplied with (y=-1 x=+3, luma average)
+ pshufd m3, m2, q1111 ; (cf18, cf19)
+ pshufd m2, m2, q0000 ; (cf16, cf17)
+ pinsrw m5, [base+round_vals+shiftq*2-10], 3 ; word 3 <- the round_vals entry one above rnd, i.e. 1 << shift: scales the current pixel so it survives the psrad unchanged
+ SCRATCH 2, 9, 8
+ SCRATCH 3, 10, 9
+ SCRATCH 4, 11, 10
+ SCRATCH 5, 12, 11
+
+ movd m2, [base+round_vals-12+shiftq*2] ; rnd
+ movd m1, [base+pb_1]
+ movd m3, [base+hmul_bits+4] ; 8192: pmulhrsw by this = rounded shift right by 2
+ pxor m0, m0
+ punpcklwd m2, m0 ; rnd as a dword
+ punpcklwd m3, m3
+ pshufd m2, m2, q0000
+ pshufd m1, m1, q0000
+ pshufd m3, m3, q0000
+ SCRATCH 1, 13, 12
+ SCRATCH 2, 14, 13
+ SCRATCH 3, 15, 14
+
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+ sub bufq, 82*38+44-(82*3+41) ; bufq = buf+82*3+41: x = -38..-1 addresses columns 3..40 of row 3
+ add bufyq, 79+82*3 ; addressed as [bufyq+x*2]: luma row 3, columns 3..
+ mov hd, 35
+.y_loop_ar3:
+ mov xq, -38
+
+.x_loop_ar3: ; sum the three rows above and the luma term for 4 consecutive pixels into the dwords of m0
+ movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
+ pxor m4, m4
+ pcmpgtb m4, m0
+ punpckhbw m3, m0, m4
+ punpcklbw m0, m4
+
+ psrldq m5, m0, 2
+ psrldq m6, m0, 4
+ psrldq m7, m0, 6
+ punpcklwd m4, m0, m5
+ punpcklwd m6, m7
+ pmaddwd m4, [rsp+ 0*16]
+ pmaddwd m6, [rsp+ 1*16]
+ paddd m4, m6
+
+ palignr m2, m3, m0, 10
+ palignr m3, m0, 12
+ psrldq m0, 8
+
+ movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
+ pxor m6, m6
+ pcmpgtb m6, m1
+ punpckhbw m5, m1, m6
+ punpcklbw m1, m6
+
+ punpcklwd m0, m2
+ punpcklwd m3, m1 ; pairs (y=-3,x=+3) with (y=-2,x=-3)
+ pmaddwd m0, [rsp+ 2*16]
+ pmaddwd m3, [rsp+ 3*16]
+ paddd m0, m3
+ paddd m0, m4
+
+ movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ pxor m7, m7
+ pcmpgtb m7, m2
+ punpckhbw m6, m2, m7
+ punpcklbw m2, m7
+
+ palignr m3, m5, m1, 10
+ palignr m5, m1, 12
+ psrldq m4, m2, 2
+
+ punpcklwd m3, m5
+ punpcklwd m5, m2, m4
+ pmaddwd m3, [rsp+ 6*16]
+ pmaddwd m5, m8
+ paddd m3, m5
+ paddd m0, m3
+
+ psrldq m3, m1, 2
+ psrldq m4, m1, 4
+ psrldq m5, m1, 6
+ psrldq m1, 8
+
+ punpcklwd m3, m4
+ punpcklwd m5, m1
+ pmaddwd m3, [rsp+ 4*16]
+ pmaddwd m5, [rsp+ 5*16]
+ paddd m3, m5
+ paddd m0, m3
+
+ movq m1, [bufyq+xq*2]
+ movq m3, [bufyq+xq*2+82]
+ pmaddubsw m5, m13, m1 ; sum horizontally adjacent luma bytes
+ pmaddubsw m7, m13, m3
+ paddw m7, m5 ; 2x2 sums
+ pmulhrsw m7, m15 ; rounded average of the 2x2 block
+
+ psrldq m1, m2, 4
+ psrldq m3, m2, 6
+ palignr m4, m6, m2, 10
+ palignr m6, m2, 12
+ psrldq m2, 8
+
+ punpcklwd m1, m3
+ punpcklwd m2, m4
+ punpcklwd m6, m7 ; pairs (y=-1,x=+3) with the luma average
+ pmaddwd m1, m9
+ pmaddwd m2, m10
+ pmaddwd m6, m11
+ paddd m1, m2
+ paddd m0, m6
+ paddd m0, m1
+ paddd m0, m14 ; add rnd
+
+ movq m1, [bufq+xq-3] ; y=0,x=[-3,+4]
+ pxor m4, m4
+ movd m5, [base+byte_blend] ; mask with 0xff in byte 3, the x=0 position
+.x_loop_ar3_inner: ; one pixel per iteration; m1 holds the current row as bytes
+ pcmpgtb m2, m4, m1
+ punpcklbw m3, m1, m2
+ pmaddwd m2, m3, m12 ; dword 0 = cf21*x[-3] + cf22*x[-2], dword 1 = cf23*x[-1] + (x[0] << shift)
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift] ; shift count read straight from memory (full 16-byte operand)
+ ; don't packssdw, we only care about one value
+ packsswb m2, m2
+ pandn m3, m5, m1
+ pslld m2, 24 ; move the result byte to byte 3
+ pand m2, m5
+ por m1, m2, m3 ; m1 with only the x=0 byte replaced
+ movd [bufq+xq-3], m1 ; stores x=-3..0; only x=0 actually changed
+ psrldq m1, 1 ; slide the window by one pixel
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82 ; next chroma row
+ add bufyq, 82*2 ; two luma rows per chroma row
+ dec hd
+ jg .y_loop_ar3
+ RET
+
+%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
+%assign %%idx 0 ; emulated gather: for each of the 8 words in src, load the word at [base + that value] into the same lane of dst
+%define %%tmp %2
+%if %0 == 6
+%define %%tmp %6
+%endif
+%rep 4
+%if %%idx == 0
+ movd %5 %+ d, %2 ; words 0-1 of src
+ pshuflw %%tmp, %2, q3232 ; words 2-3 to the bottom; without the optional 6th argument this overwrites src
+%else
+ movd %5 %+ d, %%tmp
+%if %%idx == 2
+ punpckhqdq %%tmp, %%tmp ; words 4-7 to the bottom
+%elif %%idx == 4
+ psrlq %%tmp, 32 ; words 6-7 to the bottom
+%endif
+%endif
+ movzx %4 %+ d, %5 %+ w ; even index of the pair
+ shr %5 %+ d, 16 ; odd index of the pair
+
+%if %%idx == 0
+ movd %1, [%3+%4] ; indices are used as byte offsets (not scaled)
+%else
+ pinsrw %1, [%3+%4], %%idx + 0
+%endif
+ pinsrw %1, [%3+%5], %%idx + 1
+%assign %%idx %%idx+2
+%endrep
+%endmacro
+
+INIT_XMM ssse3
+; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+ ; copy stack arguments to new position post-alignment, so that we
+ ; don't have to keep the old stack location in a separate register
+ mov r0, r0m
+ mov r1, r2m
+ mov r2, r4m
+ mov r3, r6m
+ mov r4, r7m
+ mov r5, r8m
+
+ mov [rsp+6*mmsize+ 3*gprsize], r0
+ mov [rsp+6*mmsize+ 5*gprsize], r1
+ mov [rsp+6*mmsize+ 7*gprsize], r2
+ mov [rsp+6*mmsize+ 9*gprsize], r3
+ mov [rsp+6*mmsize+10*gprsize], r4
+ mov [rsp+6*mmsize+11*gprsize], r5
+%else
+cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+%endif
+ mov srcq, srcm
+ mov fg_dataq, r3m
+ mov scalingq, r5m
+%if STACK_ALIGNMENT < mmsize
+%define r0m [rsp+6*mmsize+ 3*gprsize]
+%define r1m [rsp+6*mmsize+ 4*gprsize]
+%define r2m [rsp+6*mmsize+ 5*gprsize]
+%define r3m [rsp+6*mmsize+ 6*gprsize]
+%define r4m [rsp+6*mmsize+ 7*gprsize]
+%define r5m [rsp+6*mmsize+ 8*gprsize]
+%define r6m [rsp+6*mmsize+ 9*gprsize]
+%define r7m [rsp+6*mmsize+10*gprsize]
+%define r8m [rsp+6*mmsize+11*gprsize]
+%endif
+ LEA r5, pb_mask
+%define base r5-pb_mask
+ mov r5m, picptrq
+%else
+cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
+ lea r7, [pb_mask]
+%define base r7-pb_mask
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ movd m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ pcmpeqw m2, m2
+ psrldq m2, 14
+ movd m4, [base+max+r6*4]
+ movd m5, [base+min+r6*2]
+ punpcklwd m3, m3
+ punpcklwd m4, m4
+ punpcklwd m5, m5
+ pshufd m3, m3, q0000
+ pshufd m4, m4, q0000
+ pshufd m5, m5, q0000
+ SCRATCH 2, 10, 0
+ SCRATCH 3, 11, 1
+ SCRATCH 4, 12, 2
+ SCRATCH 5, 13, 3
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+%endif
+
+ mov sbyd, r8m
+ mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
+ test overlapd, overlapd
+ jz .no_vertical_overlap
+ mova m6, [base+pw_1024]
+ movd m7, [base+pb_27_17_17_27]
+ SCRATCH 6, 14, 4
+ SCRATCH 7, 15, 5
+ test sbyd, sbyd
+ jnz .vertical_overlap
+ ; fall-through
+
+.no_vertical_overlap:
+ mov r8m, overlapd
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ unused1, unused2, see, unused3
+%endif
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r4m, wq
+ DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
+%endif
+
+.loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, unused
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
+ ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, unused
+%endif
+
+.loop_x_odd:
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y:
+ ; src
+ mova m0, [srcq]
+ pxor m2, m2
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq, r0, r5, m3
+ vpgatherdw m5, m1, scalingq, r0, r5, m3
+%else
+ vpgatherdw m4, m0, scalingq, r12, r13, m3
+ vpgatherdw m5, m1, scalingq, r12, r13, m3
+%endif
+ pcmpeqw m3, m3
+ psrlw m3, 8
+ pand m4, m3
+ pand m5, m3
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+ pcmpgtb m7, m2, m3
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m2, m4
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r1mp
+ add srcq, r4mp
+ xor r8mp, 4
+ test r8mp, 4
+%else
+ lea srcq, [src_bakq+wq]
+ test srcq, 16 ; this relies on buffer alignment...
+%endif
+ jz .next_blk
+
+ add offxyd, 16
+ test dword r8m, 2 ; r8m & 2 = have_top_overlap
+ jz .loop_x_odd
+
+%if ARCH_X86_32
+ add dword [rsp+6*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxyd
+%endif
+ jnz .loop_x_odd_v_overlap
+
+.next_blk:
+ test dword r8m, 1
+ jz .loop_x
+
+ test dword r8m, 2
+ jnz .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+%if ARCH_X86_32
+ ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
+ ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
+ DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
+
+ add offxyd, 16 ; left_offxyd
+ mov [rsp+6*mmsize+0*gprsize], offxyd
+
+ DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
+
+ mov seed, r3m
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy
+
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+%endif
+
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_h_overlap:
+ ; src
+ mova m0, [srcq]
+ pxor m2, m2
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq, r0, r5, m3
+ vpgatherdw m5, m1, scalingq, r0, r5, m3
+%else
+ vpgatherdw m4, m0, scalingq, r12, r13, m3
+ vpgatherdw m5, m1, scalingq, r12, r13, m3
+%endif
+ pcmpeqw m3, m3
+ psrlw m3, 8
+ pand m4, m3
+ pand m5, m3
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r5, [rsp+6*mmsize+0*gprsize]
+ movd m7, [grain_lutq+r5]
+%else
+ movd m7, [grain_lutq+left_offxyq]
+%endif
+ punpcklbw m7, m3
+ pmaddubsw m6, m15, m7
+ pmulhrsw m6, m14
+ packsswb m6, m6
+ pand m6, m10
+ pandn m7, m10, m3
+ por m6, m7
+ pcmpgtb m2, m6
+ punpcklbw m7, m6, m2
+ punpckhbw m6, m2
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m7, m4
+ pmullw m6, m5
+ pmulhrsw m7, m11
+ pmulhrsw m6, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m7
+ paddw m1, m6
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y_h_overlap
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r1m
+ add srcq, r4m
+ xor r8mp, 4
+%else
+ lea srcq, [src_bakq+wq]
+%endif
+ ; assert(srcq & 16) != 0
+ add offxyd, 16
+
+ ; since this half-block had left-overlap, the next does not
+ test dword r8m, 2 ; have_top_overlap
+ jz .loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+6*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxyd
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end:
+ RET
+
+.vertical_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
+%endif
+
+ or overlapd, 2 ; top_overlap: overlap & 2
+ mov r8m, overlapd
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+ DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul tmpd, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add tmpd, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and tmpd, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, tmpd
+%if ARCH_X86_32
+ xor sbyd, seed ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ tmp, unused2, see, unused3
+%endif
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r4m, wq
+ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%endif
+
+.loop_x_v_overlap:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ ; we assume from the block above that bits 8-15 of tmpd are zero'ed,
+ ; because of the 'and tmpd, 0x00ff00ff' above
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, unused, top_offxy
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, unused, top_offxy
+%endif
+
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+6*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+.loop_x_odd_v_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+ lea r5, [base+pb_27_17]
+ mov [rsp+5*mmsize+8], r5
+%else
+ mova m8, [pb_27_17]
+%endif
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_v_overlap:
+ ; src
+ mova m0, [srcq]
+ pxor m2, m2
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq, r0, r5, m3
+ vpgatherdw m5, m1, scalingq, r0, r5, m3
+%else
+ vpgatherdw m4, m0, scalingq, r12, r13, m3
+ vpgatherdw m5, m1, scalingq, r12, r13, m3
+%endif
+ pcmpeqw m3, m3
+ psrlw m3, 8
+ pand m4, m3
+ pand m5, m3
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r5, [rsp+6*mmsize+1*gprsize]
+ movu m7, [grain_lutq+r5]
+%else
+ movu m7, [grain_lutq+top_offxyq]
+%endif
+ punpckhbw m6, m7, m3
+ punpcklbw m7, m3
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+8]
+ pmaddubsw m3, [r5], m6
+ pmaddubsw m6, [r5], m7
+%else
+ pmaddubsw m3, m8, m6
+ pmaddubsw m6, m8, m7
+%endif
+ pmulhrsw m3, m14
+ pmulhrsw m6, m14
+ packsswb m6, m3
+ pcmpgtb m7, m2, m6
+ punpcklbw m2, m6, m7
+ punpckhbw m6, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m2, m4
+ pmullw m6, m5
+ pmulhrsw m2, m11
+ pmulhrsw m6, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m6
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+8], mmsize
+%else
+ mova m8, [pb_17_27]
+%endif
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hw
+ jz .end_y_v_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ xor hd, 0x10000
+ test hd, 0x10000
+ jnz .loop_y_v_overlap
+ jmp .loop_y
+
+.end_y_v_overlap:
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+ add srcq, r4mp
+ xor r8mp, 4
+ test r8mp, 4
+%else
+ lea srcq, [src_bakq+wq]
+ test srcq, 16
+%endif
+ jz .loop_x_hv_overlap
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+6*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.loop_x_hv_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+ lea r5, [base+pb_27_17]
+ mov [rsp+5*mmsize+8], r5
+
+ DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
+
+ mov r5, [rsp+6*mmsize+1*gprsize]
+ mov r4, offxyd
+ add r5, 16
+ add r4, 16
+ mov [rsp+6*mmsize+2*gprsize], r5 ; topleft_offxy
+ mov [rsp+6*mmsize+0*gprsize], r4 ; left_offxy
+
+ DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
+
+ xor tmpd, tmpd
+ mov seed, r3m
+%else
+ mova m8, [pb_27_17]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ tmp, unused2, see, unused3
+
+ ; we assume from the block above that bits 8-15 of tmpd are zero'ed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offyq+16]
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+
+ movzx r5, offxyw ; top_offxy
+ mov [rsp+6*mmsize+1*gprsize], r5
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy
+
+ movzx top_offxyd, offxyw
+%endif
+ shr offxyd, 16
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r5, [rsp+6*mmsize+1*gprsize] ; top_offxy
+ mov r0, [rsp+6*mmsize+0*gprsize] ; left_offxy
+ movu m6, [grain_lutq+r5]
+ mov r5, [rsp+6*mmsize+2*gprsize] ; topleft_offxy
+ movd m4, [grain_lutq+r0]
+ movd m7, [grain_lutq+r5]
+%else
+ movu m6, [grain_lutq+top_offxyq]
+ movd m4, [grain_lutq+left_offxyq]
+ movd m7, [grain_lutq+topleft_offxyq]
+%endif
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m4, m3
+ punpcklbw m7, m6
+ pmaddubsw m2, m15, m4
+ pmaddubsw m4, m15, m7
+ pmulhrsw m2, m14
+ pmulhrsw m4, m14
+ packsswb m2, m2
+ packsswb m4, m4
+ pand m2, m10
+ pand m4, m10
+ pandn m7, m10, m3
+ pandn m3, m10, m6
+ por m7, m2
+ por m3, m4
+ ; followed by v interpolation (top | cur -> cur)
+ punpckhbw m4, m3, m7
+ punpcklbw m3, m7
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+8]
+ pmaddubsw m7, [r5], m4
+ pmaddubsw m4, [r5], m3
+%else
+ pmaddubsw m7, m8, m4
+ pmaddubsw m4, m8, m3
+%endif
+ pmulhrsw m7, m14
+ pmulhrsw m4, m14
+ packsswb m4, m7
+ pxor m2, m2
+ pcmpgtb m7, m2, m4
+ punpcklbw m3, m4, m7
+ punpckhbw m4, m7
+
+ ; src
+ mova m0, [srcq]
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m5, m0, scalingq, r0, r5, m7
+ vpgatherdw m6, m1, scalingq, r0, r5, m7
+%else
+ vpgatherdw m5, m0, scalingq, r13, r14, m7
+ vpgatherdw m6, m1, scalingq, r13, r14, m7
+%endif
+ pcmpeqw m7, m7
+ psrlw m7, 8
+ pand m5, m7
+ pand m6, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m3, m5
+ pmullw m4, m6
+ pmulhrsw m3, m11
+ pmulhrsw m4, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m3
+ paddw m1, m4
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+8], mmsize
+%else
+ mova m8, [pb_17_27]
+%endif
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hw
+ jz .end_y_hv_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ xor hd, 0x10000
+ test hd, 0x10000
+ jnz .loop_y_hv_overlap
+ jmp .loop_y_h_overlap
+
+.end_y_hv_overlap:
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov srcq, r1m
+ add srcq, r4m
+ xor r8mp, 4
+%else
+ lea srcq, [src_bakq+wq]
+%endif
+ ; assert(srcq & 16) != 0
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+6*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end_hv:
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_32
+; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
+; sby, luma, lstride, uv_pl, is_id) -- listed in argument order, i.e. r0m..r12m
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
+cglobal fguv_32x32xn_i420, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
+ tmp, src, scaling, h, fg_data, picptr, unused
+ mov r0, r0m ; dst
+ mov r1, r2m ; stride
+ mov r2, r4m ; w
+ mov r3, r6m ; grain_lut
+ mov r4, r7m ; h
+ mov [rsp+8*mmsize+3*gprsize], r0 ; slot that the %defines below name r0m
+ mov [rsp+8*mmsize+5*gprsize], r1 ; slot that the %defines below name r2m
+ mov [rsp+8*mmsize+7*gprsize], r2 ; slot that the %defines below name r4m
+ mov [rsp+8*mmsize+9*gprsize], r3 ; slot that the %defines below name r6m
+ mov [rsp+8*mmsize+10*gprsize], r4 ; slot that the %defines below name r7m
+
+ mov r0, r8m ; sby
+ mov r1, r9m ; luma
+ mov r2, r10m ; lstride
+ mov r4, r11m ; uv_pl
+ mov r3, r12m ; is_id: not stored, stays in r3 for the 'test r3, r3' below
+ mov [rsp+8*mmsize+11*gprsize], r0 ; slot that the %defines below name r8m
+ mov [rsp+8*mmsize+12*gprsize], r1 ; slot that the %defines below name r9m
+ mov [rsp+8*mmsize+13*gprsize], r2 ; slot that the %defines below name r10m
+ mov [rsp+8*mmsize+14*gprsize], r4 ; slot that the %defines below name r11m
+%else
+cglobal fguv_32x32xn_i420, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
+ tmp, src, scaling, h, fg_data, picptr, unused
+%endif
+ mov srcq, srcm
+ mov fg_dataq, r3m ; still the incoming argument: read before r3m is redefined below
+ mov scalingq, r5m ; likewise read before r5m is redefined below
+%if STACK_ALIGNMENT < mmsize
+%define r0m [rsp+8*mmsize+ 3*gprsize]
+%define r1m [rsp+8*mmsize+ 4*gprsize]
+%define r2m [rsp+8*mmsize+ 5*gprsize]
+%define r3m [rsp+8*mmsize+ 6*gprsize]
+%define r4m [rsp+8*mmsize+ 7*gprsize]
+%define r5m [rsp+8*mmsize+ 8*gprsize]
+%define r6m [rsp+8*mmsize+ 9*gprsize]
+%define r7m [rsp+8*mmsize+10*gprsize]
+%define r8m [rsp+8*mmsize+11*gprsize]
+%define r9m [rsp+8*mmsize+12*gprsize]
+%define r10m [rsp+8*mmsize+13*gprsize]
+%define r11m [rsp+8*mmsize+14*gprsize]
+%define r12m [rsp+8*mmsize+15*gprsize]
+%endif
+ LEA r5, pb_mask
+%define base r5-pb_mask
+ mov r5m, r5 ; keep the constant-table pointer in the r5m slot; the loops reload it with 'mov r5, r5m'
+%else
+cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
+ lea r8, [pb_mask]
+%define base r8-pb_mask
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ movd m2, [base+byte_blend+3] ; referenced as m10 after SCRATCH below (pand/pandn mask in the overlap code)
+ movd m3, [base+mul_bits+r6*2-14] ; referenced as m11 after SCRATCH below (pmulhrsw factor selected by scaling_shift)
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ lea tmpd, [r6d*2] ; alternative index into 'max', selected by the cmovne below
+%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
+ test r3, r3 ; is_id, kept in r3 by the prologue above
+%else
+ cmp dword r12m, 0 ; is_idm
+%endif
+ movd m5, [base+min+r6*2] ; referenced as m13 after SCRATCH below (lower clip bound for pmaxsw)
+ cmovne r6d, tmpd ; is_id != 0: use the doubled index (movd leaves the flags untouched)
+ movd m4, [base+max+r6*2] ; referenced as m12 after SCRATCH below (upper clip bound for pminsw)
+ punpcklwd m3, m3
+ punpcklwd m5, m5
+ punpcklwd m4, m4
+ pshufd m3, m3, q0000 ; broadcast the low word to all eight lanes
+ pshufd m5, m5, q0000
+ pshufd m4, m4, q0000
+ SCRATCH 2, 10, 0
+ SCRATCH 3, 11, 1
+ SCRATCH 4, 12, 2
+ SCRATCH 5, 13, 3
+
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl ; chroma_scaling_from_luma set: use the second macro expansion (%1 = 0)
+
+%macro FGUV_32x32xN_LOOP 1 ; %1 = not-csfl (1: combine luma and chroma via uv_luma_mult/uv_mult/uv_offset, 0: use luma directly)
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+%endif
+
+%if %1
+ mov r6d, dword r11m ; uv_pl
+ movd m0, [fg_dataq+FGData.uv_mult+r6*4]
+ movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
+ punpcklbw m6, m1, m0 ; byte pairs { uv_luma_mult, uv_mult }, matching the { luma, chroma } pixel pairs
+ movd m7, [fg_dataq+FGData.uv_offset+r6*4]
+ punpcklwd m6, m6
+ punpcklwd m7, m7
+ pshufd m6, m6, q0000 ; broadcast the multiplier pair to all lanes
+ pshufd m7, m7, q0000 ; broadcast the low word of uv_offset to all lanes
+ SCRATCH 6, 14, 4 ; multipliers are referenced as m14 from here on
+ SCRATCH 7, 15, 5 ; offset is referenced as m15 from here on
+%endif
+
+ mov sbyd, r8m
+ mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
+ test overlapd, overlapd
+ jz %%no_vertical_overlap ; no overlap at all: the blend constants below are not needed
+%if ARCH_X86_32
+ movd m1, [base+pb_23_22]
+ mova m0, [base+pw_1024]
+%else
+ movd m1, [pb_23_22]
+ mova m0, [pw_1024]
+%endif
+ pshufd m1, m1, q0000
+ SCRATCH 0, 8, 6 ; pw_1024 is referenced as m8 (pmulhrsw by 1024 = rounded shift right by 5)
+ SCRATCH 1, 9, 7 ; pb_23_22 is referenced as m9 (pmaddubsw blend weights)
+ test sbyd, sbyd
+ jnz %%vertical_overlap ; sby != 0: take the path that also blends with the row above
+ ; fall-through
+
+%%no_vertical_overlap:
+ mov r8m, overlapd ; from here on r8m holds the overlap flags, not sby
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8 ; bring the top byte down so both derived bytes sit in the low 16 bits
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed] ; combine the sby-derived 16-bit value with fg_data->seed
+
+%if ARCH_X86_32
+ mov r3m, seed ; x86-32 keeps the seed in the r3m slot between uses
+
+ DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
+%define luma_bakq lumaq
+
+ mov wq, r4m
+ shl r10mp, 1 ; lstride *= 2 in place (x86-64 scales by 2 in the per-row lea instead)
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
+
+ mov lstrideq, r10mp
+%endif
+
+ mov lumaq, r9mp
+ lea src_bakq, [srcq+wq] ; end of the chroma row
+ lea luma_bakq, [lumaq+wq*2] ; luma advances two bytes per chroma pixel
+ neg wq ; w counts up from -w towards zero in steps of 16
+ sub r0mp, srcq ; store dst as (dst - src) so that [dstq+srcq] addresses the output
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r11m, luma_bakq ; r11m slot is reused (uv_pl was read at the top of the macro when %1 is set)
+ mov r4m, wq
+
+ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%else
+ mov r11mp, src_bakq ; r12 (src_bak) is passed to vpgatherdw in the row loops; columns reload it from r11m
+ mov r10mp, strideq ; r2 (stride) is passed to vpgatherdw in the row loops; rows advance via r10m
+%endif
+
+%%loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4 ; force every bit except 0, 1, 3 and 12 (the feedback taps) to 1
+ shr r6d, 1
+ test seeb, seeh ; PF = parity of the tap bits (the forced-1 bits contribute an even count)
+ lea seed, [r6+0x8000] ; candidate with the feedback bit set
+ cmovp seed, r6d ; updated seed (even tap parity: feedback bit stays clear)
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd ; offy is the register that held the seed
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, unused1, unused2, lstride
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12 ; offx = seed >> 12
+ and offyd, 0xf ; offy = (seed >> 8) & 0xf
+ imul offyd, 82 ; 82 = bytes per grain_lut row (same step as 'add grain_lutq, 82')
+ lea offyq, [offyq+offxq+498] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y:
+ ; src
+%if ARCH_X86_32
+ mov lumaq, r9mp
+%endif
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp ; r10m was doubled during setup
+ mov r9mp, lumaq
+ mov r5, r5m ; reload the constant-table pointer
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7 ; sum each horizontal pair of luma bytes
+ pmaddubsw m6, m7
+ pavgw m4, m2 ; (sum + 1) >> 1, since m2 is zero
+ pavgw m6, m2
+
+%if %1
+ packuswb m4, m6 ; luma
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14 ; luma * uv_luma_mult + chroma * uv_mult
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15 ; + uv_offset
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq, r0, r5 ; NOTE(review): macro defined outside this chunk; r0/r5 presumably scratch
+ vpgatherdw m5, m6, scalingq, r0, r5
+%else
+ vpgatherdw m7, m4, scalingq, r12, r2 ; NOTE(review): macro defined outside this chunk; r12/r2 presumably scratch
+ vpgatherdw m5, m6, scalingq, r12, r2
+%endif
+ pcmpeqw m1, m1
+ psrlw m1, 8 ; m1 = 0x00ff in every word
+ pand m7, m1 ; keep only the low byte of each gathered word
+ pand m5, m1
+
+ ; unpack chroma_source
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq+ 0]
+ pcmpgtb m6, m2, m3 ; byte mask of negative grain values (0 > grain)
+ punpcklbw m2, m3, m6 ; sign-extend grain bytes to words
+ punpckhbw m3, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0 ; dst is kept as an offset relative to src
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; we already incremented lumaq above
+%else
+ add srcq, r10mp ; stride, saved in the r10m slot during setup
+ lea lumaq, [lumaq+lstrideq*2] ; advance by two luma strides per chroma row
+%endif
+ add grain_lutq, 82
+ dec hw
+ jg %%loop_y
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16 ; next 16-pixel column; w reaches zero at the end of the row
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp ; luma_bak (luma_bakq is %defined to lumaq on x86-32)
+%else
+ mov srcq, r11mp ; src_bak
+%endif
+ lea lumaq, [luma_bakq+wq*2]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+ test dword r8m, 1 ; bit 0 = left overlap
+ jz %%loop_x
+
+ ; r8m = overlap flags (sby was overwritten); bit 1 = top overlap
+ test dword r8m, 2
+ jne %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+%if ARCH_X86_32
+ lea r6, [offxyd+16]
+ mov [rsp+8*mmsize+0*gprsize], r6 ; left_offxy: previous column's offset, 16 bytes further right
+
+ DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
+
+ mov seed, r3m
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, lstride
+
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4 ; force every bit except 0, 1, 3 and 12 (the feedback taps) to 1
+ shr r6d, 1
+ test seeb, seeh ; PF = parity of the tap bits
+ lea seed, [r6+0x8000] ; candidate with the feedback bit set
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd ; offy is the register that held the seed
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, lstride
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12 ; offx = seed >> 12
+ and offyd, 0xf ; offy = (seed >> 8) & 0xf
+ imul offyd, 82 ; 82 = bytes per grain_lut row
+ lea offyq, [offyq+offxq+498] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_h_overlap:
+ ; src
+%if ARCH_X86_32
+ mov lumaq, r9mp
+%endif
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp ; r10m was doubled during setup
+ mov r9mp, lumaq
+ mov r5, r5m ; reload the constant-table pointer
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7 ; sum each horizontal pair of luma bytes
+ pmaddubsw m6, m7
+ pavgw m4, m2 ; (sum + 1) >> 1, since m2 is zero
+ pavgw m6, m2
+
+%if %1
+ packuswb m4, m6 ; luma
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14 ; luma * uv_luma_mult + chroma * uv_mult
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15 ; + uv_offset
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq, r0, r5 ; NOTE(review): macro defined outside this chunk; r0/r5 presumably scratch
+ vpgatherdw m5, m6, scalingq, r0, r5
+%else
+ vpgatherdw m7, m4, scalingq, r12, r2 ; NOTE(review): macro defined outside this chunk; r12/r2 presumably scratch
+ vpgatherdw m5, m6, scalingq, r12, r2
+%endif
+ pcmpeqw m1, m1
+ psrlw m1, 8 ; m1 = 0x00ff in every word
+ pand m7, m1 ; keep only the low byte of each gathered word
+ pand m5, m1
+
+ ; unpack chroma_source
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq+ 0]
+%if ARCH_X86_32
+ mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy
+ movd m4, [grain_lutq+r0+ 0]
+%else
+ movd m4, [grain_lutq+left_offxyq+ 0]
+%endif
+ punpcklbw m2, m4, m3 ; byte pairs { left, cur }
+ pmaddubsw m4, m9, m2 ; weighted sum of each pair using the pb_23_22 weights
+ pmulhrsw m4, m8 ; pw_1024: rounded shift right by 5
+ packsswb m4, m4 ; saturate back to signed bytes
+ pand m4, m10 ; blended bytes, where the m10 mask is set
+ pandn m2, m10, m3 ; unblended grain everywhere else
+ por m3, m4, m2
+ pxor m4, m4
+ pcmpgtb m4, m3 ; byte mask of negative grain values
+ punpcklbw m2, m3, m4 ; sign-extend grain bytes to words
+ punpckhbw m3, m4
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0 ; dst is kept as an offset relative to src
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; lumaq has already been incremented above
+%else
+ add srcq, r10mp ; stride, saved in the r10m slot during setup
+ lea lumaq, [lumaq+lstrideq*2] ; advance by two luma strides per chroma row
+%endif
+ add grain_lutq, 82
+ dec hw
+ jg %%loop_y_h_overlap
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16 ; next 16-pixel column; w reaches zero at the end of the row
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp ; luma_bak (luma_bakq is %defined to lumaq on x86-32)
+%else
+ mov srcq, r11mp ; src_bak
+%endif
+ lea lumaq, [luma_bakq+wq*2]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+
+ ; r8m = overlap flags (sby was overwritten); bit 1 = top overlap
+ test dword r8m, 2
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%end:
+ RET
+
+%%vertical_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
+%endif
+
+ or overlapd, 2 ; top_overlap: overlap & 2
+ mov r8m, overlapd ; from here on r8m holds the overlap flags, not sby
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001 ; r4 is renamed 'see' by the DEFINE_ARGS below
+ DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001 ; replicate the 16-bit seed into both halves
+%endif
+ imul tmpd, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add tmpd, (105 << 16) | 188 ; 188 = (105 - 173) & 0xff: the low half gets the value for row sby-1
+ add sbyd, (178 << 24) | (141 << 8) ; 141 = 178 - 37: again the low half is for row sby-1
+ and tmpd, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, tmpd
+%if ARCH_X86_32
+ xor sbyd, seed ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed ; 'see' is now the register that held sby, i.e. the combined value
+ mov wq, r4m
+ shl r10mp, 1 ; lstride *= 2 in place (x86-64 scales by 2 in the per-row lea instead)
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak
+
+ mov lstrideq, r10mp
+%endif
+
+ mov lumaq, r9mp
+ lea src_bakq, [srcq+wq] ; end of the chroma row
+ lea luma_bakq, [lumaq+wq*2] ; luma advances two bytes per chroma pixel
+ neg wq ; w counts up from -w towards zero in steps of 16
+ sub r0mp, srcq ; store dst as (dst - src) so that [dstq+srcq] addresses the output
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r11m, luma_bakq ; r11m slot is reused for luma_bak
+ mov r4m, wq
+
+ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%else
+ mov r11mp, src_bakq ; r12 (src_bak) is passed to vpgatherdw in the row loops; columns reload it from r11m
+ mov r10mp, strideq ; r2 (stride) is passed to vpgatherdw in the row loops; rows advance via r10m
+%endif
+
+%%loop_x_v_overlap:
+%if ARCH_X86_32
+ mov seed, r3m
+ xor tmpd, tmpd
+%endif
+ ; we assume from the block above that bits 8-15 of tmpd are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4 ; in both halves, force every bit except the taps (0, 1, 3, 12) to 1
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16 ; move the top_seed parity flag up to bit 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001 ; set bit 0 of both halves ...
+ xor tmpd, r6d ; ... so bit 0 of each half becomes the inverted parity flag
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
+
+ mov offxd, offyd ; offy is the register that held the seed pair
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, top_offxy, unused, lstride
+
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f ; 4-bit offy in each 16-bit half
+ and offxd, 0xf000f ; 4-bit offx in each 16-bit half
+ imul offyd, 82 ; 82 = bytes per grain_lut row
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq+0x10001*498+16*82] ; the low half (top_offxy) additionally skips 16 rows
+
+%if ARCH_X86_32
+ DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
+%endif
+
+ movzx top_offxyd, offxyw ; low half = top_offxy
+ shr offxyd, 16 ; high half = cur_offxy
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_v_overlap:
+%if ARCH_X86_32
+ mov lumaq, r9mp
+%endif
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp ; r10m was doubled during setup
+ mov r9mp, lumaq
+ mov r5, r5m ; reload the constant-table pointer
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7 ; sum each horizontal pair of luma bytes
+ pmaddubsw m6, m7
+ pavgw m4, m2 ; (sum + 1) >> 1, since m2 is zero
+ pavgw m6, m2
+
+%if %1
+ packuswb m4, m6 ; luma
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14 ; luma * uv_luma_mult + chroma * uv_mult
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15 ; + uv_offset
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq, r0, r5 ; NOTE(review): macro defined outside this chunk; r0/r5 presumably scratch
+ vpgatherdw m5, m6, scalingq, r0, r5
+%else
+ vpgatherdw m7, m4, scalingq, r12, r2 ; NOTE(review): macro defined outside this chunk; r12/r2 presumably scratch
+ vpgatherdw m5, m6, scalingq, r12, r2
+%endif
+ pcmpeqw m1, m1
+ psrlw m1, 8 ; m1 = 0x00ff in every word
+ pand m7, m1 ; keep only the low byte of each gathered word
+ pand m5, m1
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
+ movu m4, [grain_lutq+r0]
+%else
+ movu m4, [grain_lutq+top_offxyq]
+%endif
+ punpckhbw m1, m4, m3 ; byte pairs { top, cur }, upper 8 pixels
+ punpcklbw m4, m3 ; byte pairs { top, cur }, lower 8 pixels
+ pmaddubsw m2, m9, m1 ; weighted sum of each pair using the pb_23_22 weights
+ pmaddubsw m3, m9, m4
+ pmulhrsw m2, m8 ; pw_1024: rounded shift right by 5
+ pmulhrsw m3, m8
+ packsswb m3, m2 ; saturate back to signed bytes
+ pxor m1, m1
+ pcmpgtb m1, m3 ; byte mask of negative grain values
+ punpcklbw m2, m3, m1 ; sign-extend grain bytes to words
+ punpckhbw m3, m1
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+ ; unpack chroma_source
+ pxor m4, m4
+ punpckhbw m1, m0, m4
+ punpcklbw m0, m4 ; m0-1: src as word
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0 ; dst is kept as an offset relative to src
+
+ dec hw
+ je %%end_y_v_overlap ; h == 1: the column is already finished
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; lumaq has already been incremented above
+%else
+ add srcq, r10mp ; stride, saved in the r10m slot during setup
+ lea lumaq, [lumaq+lstrideq*2] ; advance by two luma strides per chroma row
+%endif
+ add grain_lutq, 82
+ jmp %%loop_y ; only the first row is blended with the top block; the rest use the plain loop
+
+%%end_y_v_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16 ; next 16-pixel column; w reaches zero at the end of the row
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp ; luma_bak (luma_bakq is %defined to lumaq on x86-32)
+%else
+ mov srcq, r11mp ; src_bak
+%endif
+ lea lumaq, [luma_bakq+wq*2]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall-through to
+ ; h+v overlap
+
+%%loop_x_hv_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
+
+ mov r6, [rsp+8*mmsize+1*gprsize] ; previous column's top_offxy
+ lea r0, [r3d+16] ; r3 = offxy of the previous column
+ add r6, 16
+ mov [rsp+8*mmsize+0*gprsize], r0 ; left_offxy
+ mov [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy
+
+ DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
+
+ mov seed, r3m
+ xor tmpd, tmpd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+ lea topleft_offxyq, [top_offxyq+16] ; previous column's top offset, 16 bytes further right
+ lea left_offxyq, [offxyq+16] ; previous column's offset, 16 bytes further right
+
+ ; we assume from the block above that bits 8-15 of tmpd are zero'ed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4 ; in both halves, force every bit except the taps (0, 1, 3, 12) to 1
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16 ; move the top_seed parity flag up to bit 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001 ; set bit 0 of both halves ...
+ xor tmpd, r6d ; ... so bit 0 of each half becomes the inverted parity flag
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd ; offy is the register that held the seed pair
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f ; 4-bit offy in each 16-bit half
+ and offxd, 0xf000f ; 4-bit offx in each 16-bit half
+ imul offyd, 82 ; 82 = bytes per grain_lut row
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq+0x10001*498+16*82] ; the low half (top_offxy) additionally skips 16 rows
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
+%endif
+
+ movzx top_offxyd, offxyw ; low half = top_offxy
+ shr offxyd, 16 ; high half = cur_offxy
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_hv_overlap:
+ ; src
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov lumaq, r9mp
+%endif
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp ; r10m was doubled during setup
+ mov r9mp, lumaq
+ mov r5, r5m ; reload the constant-table pointer
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7 ; sum each horizontal pair of luma bytes
+ pmaddubsw m6, m7
+ pavgw m4, m2 ; (sum + 1) >> 1, since m2 is zero
+ pavgw m6, m2
+
+%if %1
+ packuswb m4, m6 ; luma
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14 ; luma * uv_luma_mult + chroma * uv_mult
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15 ; + uv_offset
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq, r0, r5 ; NOTE(review): macro defined outside this chunk; r0/r5 presumably scratch
+ vpgatherdw m5, m6, scalingq, r0, r5
+%else
+ movd m1, [grain_lutq+topleft_offxyq] ; read first: topleft_offxy is r12, which is passed to vpgatherdw next
+ vpgatherdw m7, m4, scalingq, r2, r12 ; NOTE(review): macro defined outside this chunk; r2/r12 presumably scratch
+ vpgatherdw m5, m6, scalingq, r2, r12
+%endif
+ pcmpeqw m2, m2
+ psrlw m2, 8 ; m2 = 0x00ff in every word
+ pand m7, m2 ; keep only the low byte of each gathered word
+ pand m5, m2
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if ARCH_X86_32
+ mov r0, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
+ mov r5, [rsp+8*mmsize+1*gprsize] ; top_offxy
+ movd m1, [grain_lutq+r0]
+ mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy
+%endif
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ movu m6, [grain_lutq+r5]
+ movd m4, [grain_lutq+r0]
+%else
+ movu m6, [grain_lutq+top_offxyq]
+ movd m4, [grain_lutq+left_offxyq]
+%endif
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m1, m6 ; byte pairs { topleft, top }
+ punpcklbw m4, m3 ; byte pairs { left, cur }
+ punpcklwd m4, m1 ; even words: { left, cur }, odd words: { topleft, top }
+ pmaddubsw m1, m9, m4 ; one multiply blends both rows
+ pmulhrsw m1, m8 ; pw_1024: rounded shift right by 5
+ packsswb m1, m1 ; bytes now alternate cur-row / top-row results
+ pandn m4, m10, m3 ; unblended cur-row grain outside the m10 mask
+ pandn m3, m10, m6 ; unblended top-row grain outside the m10 mask
+ psrldq m6, m1, 1 ; shift by one byte so the top-row result starts at byte 0
+ pand m1, m10
+ pand m6, m10
+ por m4, m1 ; m4 = cur row after horizontal blend
+ por m3, m6 ; m3 = top row after horizontal blend
+ ; followed by v interpolation (top | cur -> cur)
+ punpckhbw m1, m3, m4 ; byte pairs { top, cur }, upper 8 pixels
+ punpcklbw m3, m4 ; byte pairs { top, cur }, lower 8 pixels
+ pmaddubsw m4, m9, m1
+ pmaddubsw m1, m9, m3
+ pmulhrsw m4, m8
+ pmulhrsw m1, m8
+ packsswb m1, m4 ; saturate back to signed bytes
+ pxor m4, m4
+ pcmpgtb m4, m1 ; byte mask of negative grain values
+ punpcklbw m2, m1, m4 ; sign-extend grain bytes to words
+ punpckhbw m1, m4
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m1, m5
+ pmulhrsw m2, m11
+ pmulhrsw m1, m11
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; unpack chroma source
+ pxor m4, m4
+ punpckhbw m3, m0, m4
+ punpcklbw m0, m4 ; m0-1: src as word
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m3, m1
+ pmaxsw m0, m13
+ pmaxsw m3, m13
+ pminsw m0, m12
+ pminsw m3, m12
+ packuswb m0, m3
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0 ; dst is kept as an offset relative to src
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; lumaq has been adjusted above already
+%else
+ add srcq, r10mp ; stride, saved in the r10m slot during setup
+ lea lumaq, [lumaq+lstrideq*2] ; advance by two luma strides per chroma row
+%endif
+ add grain_lutq, 82
+ dec hw
+ jg %%loop_y_h_overlap ; only the first row is blended with the top block; the rest need left overlap only
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16 ; next 16-pixel column; w reaches zero at the end of the row
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp ; luma_bak (luma_bakq is %defined to lumaq on x86-32)
+%else
+ mov srcq, r11mp ; src_bak
+%endif
+ lea lumaq, [luma_bakq+wq*2]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+ jmp %%loop_x_hv_overlap
+
+%%end_hv:
+ RET
+%endmacro
+
+ FGUV_32x32xN_LOOP 1 ; chroma_scaling_from_luma == 0: fall-through from the 'jne .csfl' before the macro
+.csfl:
+ FGUV_32x32xN_LOOP 0 ; chroma_scaling_from_luma != 0
--- a/tests/checkasm/filmgrain.c
+++ b/tests/checkasm/filmgrain.c
@@ -49,22 +49,22 @@
for (int i = 0; i < 4; i++) {
if (check_func(dsp->generate_grain_y, "gen_grain_y_ar%d_%dbpc", i, BITDEPTH)) {
- Dav1dFilmGrainData fg_data;
- fg_data.seed = rnd() & 0xFFFF;
+ ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,); /* fg_data is now a 1-element array; presumably 16-byte stack-aligned for the SIMD code -- confirm against the ALIGN_STK_16 definition */
+ fg_data[0].seed = rnd() & 0xFFFF; /* random 16-bit seed */
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#endif
- fg_data.grain_scale_shift = rnd() & 3;
- fg_data.ar_coeff_shift = (rnd() & 3) + 6;
- fg_data.ar_coeff_lag = i;
- const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
+ fg_data[0].grain_scale_shift = rnd() & 3; /* 0..3 */
+ fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; /* 6..9 */
+ fg_data[0].ar_coeff_lag = i; /* one checked variant per lag value 0..3 */
+ const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1); /* 0, 4, 12 or 24 coefficients for lag 0..3 */
for (int n = 0; n < num_y_pos; n++)
- fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
+ fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128; /* random value in -128..127 */
- call_ref(grain_lut_c, &fg_data HIGHBD_TAIL_SUFFIX);
- call_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
+ call_ref(grain_lut_c, fg_data HIGHBD_TAIL_SUFFIX); /* the array name decays to a pointer to element 0, replacing &fg_data */
+ call_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX); /* same parameters for the implementation under test */
if (memcmp(grain_lut_c, grain_lut_a,
GRAIN_WIDTH * GRAIN_HEIGHT * sizeof(entry)))
{
@@ -71,7 +71,7 @@
fail();
}
- bench_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
+ bench_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX); /* benchmark with the same fg_data array used for the comparison above */
}
}
@@ -97,38 +97,38 @@
"gen_grain_uv_ar%d_%dbpc_%s",
i, BITDEPTH, ss_name[layout_idx]))
{
- Dav1dFilmGrainData fg_data;
- fg_data.seed = rnd() & 0xFFFF;
+ ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,); /* fg_data is now a 1-element array; presumably 16-byte stack-aligned for the SIMD code -- confirm against the ALIGN_STK_16 definition */
+ fg_data[0].seed = rnd() & 0xFFFF; /* random 16-bit seed */
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#endif
- fg_data.num_y_points = rnd() & 1;
- fg_data.grain_scale_shift = rnd() & 3;
- fg_data.ar_coeff_shift = (rnd() & 3) + 6;
- fg_data.ar_coeff_lag = i;
- const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
+ fg_data[0].num_y_points = rnd() & 1; /* 0 or 1; decides below whether one extra chroma coefficient is randomized */
+ fg_data[0].grain_scale_shift = rnd() & 3; /* 0..3 */
+ fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; /* 6..9 */
+ fg_data[0].ar_coeff_lag = i; /* one checked variant per lag value */
+ const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1); /* 2 * lag * (lag + 1) luma coefficients */
for (int n = 0; n < num_y_pos; n++)
- fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
- dsp->generate_grain_y(grain_lut_y, &fg_data HIGHBD_TAIL_SUFFIX);
+ fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128; /* random value in -128..127 */
+ dsp->generate_grain_y(grain_lut_y, fg_data HIGHBD_TAIL_SUFFIX); /* build the luma grain that is passed to the chroma generator below */
const int uv = rnd() & 1;
- const int num_uv_pos = num_y_pos + !!fg_data.num_y_points;
+ const int num_uv_pos = num_y_pos + !!fg_data[0].num_y_points; /* one more coefficient than luma when num_y_points is non-zero */
for (int n = 0; n < num_uv_pos; n++)
- fg_data.ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
- if (!fg_data.num_y_points)
- fg_data.ar_coeffs_uv[uv][num_uv_pos] = 0;
+ fg_data[0].ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128; /* random value in -128..127 */
+ if (!fg_data[0].num_y_points) /* no luma points: the slot after the randomized coefficients is forced to 0 */
+ fg_data[0].ar_coeffs_uv[uv][num_uv_pos] = 0; /* index equals num_y_pos in this branch */
memset(grain_lut_c, 0xff, sizeof(grain_lut_c));
memset(grain_lut_a, 0xff, sizeof(grain_lut_a));
- call_ref(grain_lut_c, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
- call_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
+ call_ref(grain_lut_c, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX); /* the array name decays to a pointer to element 0, replacing &fg_data */
+ call_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX); /* same parameters for the implementation under test */
int diff = 0, w = ss_x ? 44 : GRAIN_WIDTH;
for (int y = 0; y < (ss_y ? 38 : GRAIN_HEIGHT); y++)
diff |= memcmp(grain_lut_a[y], grain_lut_c[y], w * sizeof(entry));
if (diff) fail();
- bench_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
+ bench_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX); /* benchmark with the same parameters */
}
}
}
@@ -149,8 +149,8 @@
int bh, int row_num HIGHBD_DECL_SUFFIX);
if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) {
- Dav1dFilmGrainData fg_data;
- fg_data.seed = rnd() & 0xFFFF;
+ ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,); /* a single element, matching the other film grain checks; only fg_data[0] is used */
+ fg_data[0].seed = rnd() & 0xFFFF; /* random 16-bit seed */
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
@@ -160,23 +160,23 @@
uint8_t scaling[SCALING_SIZE];
entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
- fg_data.grain_scale_shift = rnd() & 3;
- fg_data.ar_coeff_shift = (rnd() & 3) + 6;
- fg_data.ar_coeff_lag = rnd() & 3;
- const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
+ fg_data[0].grain_scale_shift = rnd() & 3; /* 0..3 */
+ fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; /* 6..9 */
+ fg_data[0].ar_coeff_lag = rnd() & 3; /* random lag 0..3 */
+ const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1); /* 0, 4, 12 or 24 coefficients for lag 0..3 */
for (int n = 0; n < num_y_pos; n++)
- fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
- dsp->generate_grain_y(grain_lut, &fg_data HIGHBD_TAIL_SUFFIX);
+ fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128; /* random value in -128..127 */
+ dsp->generate_grain_y(grain_lut, fg_data HIGHBD_TAIL_SUFFIX); /* fill the grain table that is passed to the calls under test */
- fg_data.num_y_points = 2 + (rnd() % 13);
- const int pad = 0xff / fg_data.num_y_points;
- for (int n = 0; n < fg_data.num_y_points; n++) {
- fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
- fg_data.y_points[n][0] += rnd() % pad;
- fg_data.y_points[n][1] = rnd() & 0xff;
+ fg_data[0].num_y_points = 2 + (rnd() % 13); /* 2..14 scaling points */
+ const int pad = 0xff / fg_data[0].num_y_points; /* width of one of num_y_points equal slices of 0..0xff */
+ for (int n = 0; n < fg_data[0].num_y_points; n++) {
+ fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points; /* start of the n-th slice */
+ fg_data[0].y_points[n][0] += rnd() % pad; /* plus a random offset smaller than one slice */
+ fg_data[0].y_points[n][1] = rnd() & 0xff; /* random value 0..255 for this point */
}
- generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
- fg_data.num_y_points, scaling);
+ generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points, /* fill scaling[] from the points generated above */
+ fg_data[0].num_y_points, scaling);
const int w = 1 + (rnd() & 127);
const int h = 1 + (rnd() & 31);
@@ -186,20 +186,20 @@
src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;
- fg_data.clip_to_restricted_range = rnd() & 1;
- fg_data.scaling_shift = (rnd() & 3) + 8;
- for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
- fg_data.overlap_flag++)
+ fg_data[0].clip_to_restricted_range = rnd() & 1; /* 0 or 1 */
+ fg_data[0].scaling_shift = (rnd() & 3) + 8; /* 8..11 */
+ for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1; /* compare reference and new code with overlap off, then on */
+ fg_data[0].overlap_flag++)
{
- call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
+ call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut, h, /* the array name decays to a pointer to element 0, replacing &fg_data */
row_num HIGHBD_TAIL_SUFFIX);
- call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
+ call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut, h, /* same inputs for the implementation under test */
row_num HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
}
- fg_data.overlap_flag = 1;
- bench_new(a_dst, src, stride, &fg_data, 64, scaling, grain_lut, 32,
+ fg_data[0].overlap_flag = 1; /* benchmark with overlap enabled */
+ bench_new(a_dst, src, stride, fg_data, 64, scaling, grain_lut, 32, /* fixed w = 64, h = 32 for benchmarking */
row_num HIGHBD_TAIL_SUFFIX);
}
@@ -231,9 +231,9 @@
"fguv_32x32xn_%dbpc_%s_csfl%d",
BITDEPTH, ss_name[layout_idx], csfl))
{
- Dav1dFilmGrainData fg_data;
+ ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,); /* fg_data is now a 1-element array; presumably 16-byte stack-aligned for the SIMD code -- confirm against the ALIGN_STK_16 definition */
- fg_data.seed = rnd() & 0xFFFF;
+ fg_data[0].seed = rnd() & 0xFFFF; /* random 16-bit seed */
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
@@ -245,15 +245,18 @@
uint8_t scaling[SCALING_SIZE];
entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
- fg_data.grain_scale_shift = rnd() & 3;
- fg_data.ar_coeff_shift = (rnd() & 3) + 6;
- fg_data.ar_coeff_lag = rnd() & 3;
- const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
+ fg_data[0].grain_scale_shift = rnd() & 3; /* 0..3 */
+ fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; /* 6..9 */
+ fg_data[0].ar_coeff_lag = rnd() & 3; /* random lag 0..3 */
+ const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1); /* 0, 4, 12 or 24 coefficients for lag 0..3 */
for (int n = 0; n < num_y_pos; n++)
- fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
- dsp->generate_grain_y(grain_lut[0], &fg_data HIGHBD_TAIL_SUFFIX);
+ fg_data[0].ar_coeffs_y[n] = (rnd() & 0xff) - 128; /* random value in -128..127 */
+ const int num_uv_pos = num_y_pos + 1; /* always one more coefficient than luma here */
+ for (int n = 0; n < num_uv_pos; n++) /* new in this patch: ar_coeffs_uv was not written by the removed lines of this hunk */
+ fg_data[0].ar_coeffs_uv[uv_pl][n] = (rnd() & 0xff) - 128; /* random value in -128..127; presumably read by generate_grain_uv below -- confirm */
+ dsp->generate_grain_y(grain_lut[0], fg_data HIGHBD_TAIL_SUFFIX); /* luma grain goes into grain_lut[0] */
dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0],
- &fg_data, uv_pl HIGHBD_TAIL_SUFFIX);
+ fg_data, uv_pl HIGHBD_TAIL_SUFFIX); /* the array name decays to a pointer to element 0, replacing &fg_data */
const int w = 1 + (rnd() & (127 >> ss_x));
const int h = 1 + (rnd() & (31 >> ss_y));
@@ -268,47 +271,47 @@
const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;
if (csfl) {
- fg_data.num_y_points = 2 + (rnd() % 13);
- const int pad = 0xff / fg_data.num_y_points;
- for (int n = 0; n < fg_data.num_y_points; n++) {
- fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
- fg_data.y_points[n][0] += rnd() % pad;
- fg_data.y_points[n][1] = rnd() & 0xff;
+ fg_data[0].num_y_points = 2 + (rnd() % 13); /* 2..14 luma scaling points */
+ const int pad = 0xff / fg_data[0].num_y_points; /* width of one of num_y_points equal slices of 0..0xff */
+ for (int n = 0; n < fg_data[0].num_y_points; n++) {
+ fg_data[0].y_points[n][0] = 0xff * n / fg_data[0].num_y_points; /* start of the n-th slice */
+ fg_data[0].y_points[n][0] += rnd() % pad; /* plus a random offset smaller than one slice */
+ fg_data[0].y_points[n][1] = rnd() & 0xff; /* random value 0..255 for this point */
}
- generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
- fg_data.num_y_points, scaling);
+ generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points, /* csfl: scaling[] is built from the luma points */
+ fg_data[0].num_y_points, scaling);
} else {
- fg_data.num_uv_points[uv_pl] = 2 + (rnd() % 9);
- const int pad = 0xff / fg_data.num_uv_points[uv_pl];
- for (int n = 0; n < fg_data.num_uv_points[uv_pl]; n++) {
- fg_data.uv_points[uv_pl][n][0] = 0xff * n / fg_data.num_uv_points[uv_pl];
- fg_data.uv_points[uv_pl][n][0] += rnd() % pad;
- fg_data.uv_points[uv_pl][n][1] = rnd() & 0xff;
+ fg_data[0].num_uv_points[uv_pl] = 2 + (rnd() % 9); /* 2..10 chroma scaling points */
+ const int pad = 0xff / fg_data[0].num_uv_points[uv_pl]; /* width of one of num_uv_points equal slices of 0..0xff */
+ for (int n = 0; n < fg_data[0].num_uv_points[uv_pl]; n++) {
+ fg_data[0].uv_points[uv_pl][n][0] = 0xff * n / fg_data[0].num_uv_points[uv_pl]; /* start of the n-th slice */
+ fg_data[0].uv_points[uv_pl][n][0] += rnd() % pad; /* plus a random offset smaller than one slice */
+ fg_data[0].uv_points[uv_pl][n][1] = rnd() & 0xff; /* random value 0..255 for this point */
}
- generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.uv_points[uv_pl],
- fg_data.num_uv_points[uv_pl], scaling);
+ generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].uv_points[uv_pl], /* non-csfl: scaling[] is built from this plane's chroma points */
+ fg_data[0].num_uv_points[uv_pl], scaling);
- fg_data.uv_mult[uv_pl] = (rnd() & 0xff) - 128;
- fg_data.uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
- fg_data.uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
+ fg_data[0].uv_mult[uv_pl] = (rnd() & 0xff) - 128; /* -128..127 */
+ fg_data[0].uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128; /* -128..127 */
+ fg_data[0].uv_offset[uv_pl] = (rnd() & 0x1ff) - 256; /* -256..255 */
}
- fg_data.clip_to_restricted_range = rnd() & 1;
- fg_data.scaling_shift = (rnd() & 3) + 8;
- fg_data.chroma_scaling_from_luma = csfl;
- for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
- fg_data.overlap_flag++)
+ fg_data[0].clip_to_restricted_range = rnd() & 1; /* 0 or 1 */
+ fg_data[0].scaling_shift = (rnd() & 3) + 8; /* 8..11 */
+ fg_data[0].chroma_scaling_from_luma = csfl; /* matches the csfl variant named in check_func */
+ for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1; /* compare reference and new code with overlap off, then on */
+ fg_data[0].overlap_flag++)
{
- call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
+ call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut[1], h, /* the array name decays to a pointer to element 0, replacing &fg_data */
row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
- call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
+ call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut[1], h, /* same inputs for the implementation under test */
row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
}
- fg_data.overlap_flag = 1;
- bench_new(a_dst, src, stride, &fg_data, 32, scaling, grain_lut[1], 16,
+ fg_data[0].overlap_flag = 1; /* benchmark with overlap enabled */
+ bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16, /* fixed w = 32, h = 16 for benchmarking */
row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
}
}