ref: 556890be42d8affef280188c1a5d22cf299b2197
parent: 6d363223dc979c393217c86eda54848531d69b9e
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Tue Sep 10 07:00:54 EDT 2019
AVX2 for chroma 4:2:0 film grain reconstruction

fguv_32x32xn_8bpc_420_csfl0_c: 8945.4
fguv_32x32xn_8bpc_420_csfl0_avx2: 1001.6
fguv_32x32xn_8bpc_420_csfl1_c: 6363.4
fguv_32x32xn_8bpc_420_csfl1_avx2: 1299.5
--- a/src/fg_apply_tmpl.c
+++ b/src/fg_apply_tmpl.c
@@ -136,6 +136,8 @@
// Synthesize grain for the affected planes
const int rows = (out->p.h + 31) >> 5;
const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cpw = (out->p.w + ss_x) >> ss_x;
const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
for (int row = 0; row < rows; row++) {
const pixel *const luma_src =
@@ -144,7 +146,7 @@
if (data->num_y_points) {
const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
- luma_src, out->stride[0], &out->frame_hdr->film_grain.data,
+ luma_src, out->stride[0], data,
out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
}
@@ -154,22 +156,19 @@
for (int pl = 0; pl < 2; pl++)
dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
((const pixel *) in->data[1 + pl]) + uv_off,
- in->stride[1], luma_src,
- in->stride[0], out->p.w, bh,
- &out->frame_hdr->film_grain.data,
- grain_lut[1 + pl], scaling[0],
- pl, row, is_id HIGHBD_TAIL_SUFFIX);
+ in->stride[1], data, cpw,
+ scaling[0], grain_lut[1 + pl],
+ bh, row, luma_src, in->stride[0],
+ pl, is_id HIGHBD_TAIL_SUFFIX);
} else {
for (int pl = 0; pl < 2; pl++)
if (data->num_uv_points[pl])
dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
((const pixel *) in->data[1 + pl]) + uv_off,
- in->stride[1], luma_src,
- in->stride[0], out->p.w, bh,
- &out->frame_hdr->film_grain.data,
- grain_lut[1 + pl],
- scaling[1 + pl], pl, row, is_id
- HIGHBD_TAIL_SUFFIX);
+ in->stride[1], data, cpw,
+ scaling[1 + pl], grain_lut[1 + pl],
+ bh, row, luma_src, in->stride[0],
+ pl, is_id HIGHBD_TAIL_SUFFIX);
}
}
}
--- a/src/film_grain.h
+++ b/src/film_grain.h
@@ -64,11 +64,11 @@
+// Prototype of a chroma film-grain kernel; one call processes bh rows.
+//   dst_row, src_row  output / input chroma rows, `stride` bytes apart
+//   data              film grain parameters of the frame
+//   pw                width in chroma pixels (callers pass the already
+//                     subsampled width). Declared size_t because the x86
+//                     assembly uses it as a full-width register in address
+//                     arithmetic; the upper half of an int argument is not
+//                     defined by the x86-64 calling conventions.
+//   scaling           scaling lookup table
+//   grain_lut         grain template of the plane being processed
+//   bh                number of chroma rows to process
+//   row_num           index of the block row; vertical overlap is only
+//                     applied when it is > 0 and data->overlap_flag is set
+//   luma_row,
+//   luma_stride       co-located luma rows and their stride
+//   uv_pl             chroma plane index (0 or 1)
+//   is_id             non-zero when the matrix coefficients are identity
#define decl_fguv_32x32xn_fn(name) \
void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
- const pixel *luma_row, ptrdiff_t luma_stride, int pw, int bh, \
- const Dav1dFilmGrainData *data, \
- const entry grain_lut[][GRAIN_WIDTH], \
+ const Dav1dFilmGrainData *data, size_t pw, \
const uint8_t scaling[SCALING_SIZE], \
- int uv_pl, int row_num, int is_id HIGHBD_DECL_SUFFIX)
+ const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \
+ const pixel *luma_row, ptrdiff_t luma_stride, \
+ int uv_pl, int is_id HIGHBD_DECL_SUFFIX)
typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn);
typedef struct Dav1dFilmGrainDSPContext {
--- a/src/film_grain_tmpl.c
+++ b/src/film_grain_tmpl.c
@@ -281,12 +281,11 @@
static NOINLINE void
fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
- const ptrdiff_t stride, const pixel *const luma_row,
- const ptrdiff_t luma_stride, const int pw, const int bh,
- const Dav1dFilmGrainData *const data,
- const entry grain_lut[][GRAIN_WIDTH],
- const uint8_t scaling[SCALING_SIZE],
- const int uv, const int row_num, const int is_id,
+ const ptrdiff_t stride, const Dav1dFilmGrainData *const data,
+ const int pw, const uint8_t scaling[SCALING_SIZE],
+ const entry grain_lut[][GRAIN_WIDTH], const int bh,
+ const int row_num, const pixel *const luma_row,
+ const ptrdiff_t luma_stride, const int uv, const int is_id,
const int sx, const int sy HIGHBD_DECL_SUFFIX)
{
const int rows = 1 + (data->overlap_flag && row_num > 0);
@@ -320,8 +319,8 @@
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks (subsampled)
- for (int bx = 0; bx < (pw + sx) >> sx; bx += BLOCK_SIZE >> sx) {
- const int bw = (imin(BLOCK_SIZE, pw - (bx << sx)) + sx) >> sx;
+ for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
+ const int bw = imin(BLOCK_SIZE >> sx, pw - bx);
if (data->overlap_flag && bx) {
// shift previous offsets left
for (int i = 0; i < rows; i++)
@@ -412,8 +411,8 @@
+// Defines fguv_32x32xn_<nm>_c with the decl_fguv_32x32xn_fn signature.
+// The wrapper only forwards its arguments to fguv_32x32xn_c and appends
+// the subsampling flags ss_x / ss_y as the trailing parameters.
#define fguv_ss_fn(nm, ss_x, ss_y) \
static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \
- fguv_32x32xn_c(dst_row, src_row, stride, luma_row, luma_stride, pw, bh, \
- data, grain_lut, scaling, uv_pl, row_num, is_id, ss_x, ss_y \
+ fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \
+ row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \
HIGHBD_TAIL_SUFFIX); \
}
--- a/src/x86/film_grain.asm
+++ b/src/x86/film_grain.asm
@@ -28,19 +28,20 @@
%if ARCH_X86_64
SECTION_RODATA
+pw_1024: times 16 dw 1024
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
-pw_1024: times 2 dw 1024
pd_m65536: dd ~0xffff
+pb_23_22: times 2 db 23, 22
+pb_1: times 4 db 1
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512
mul_bits: dw 256, 128, 64, 32, 16
round_vals: dw 32, 64, 128, 256, 512
-max: dw 255, 235
+max: dw 255, 240, 235
min: dw 0, 16
pb_27_17_17_27: db 27, 17, 17, 27
-pb_1: db 1
%macro JMP_TABLE 1-*
%xdefine %1_table %%table
@@ -417,7 +418,7 @@
%define base r8-pb_mask
vpbroadcastw m11, [base+mul_bits+r7*2-14]
mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
- vpbroadcastw m12, [base+max+r7*2]
+ vpbroadcastw m12, [base+max+r7*4]
vpbroadcastw m13, [base+min+r7*2]
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
@@ -435,7 +436,6 @@
movzx seed, seew
xor seed, [fg_dataq+FGData.seed]
-
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
unused1, unused2, see, overlap
@@ -816,7 +816,7 @@
; scaling[src]
pcmpeqw m3, m3
-; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel
+ ; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel
vpgatherdd m9, [scalingq+m4], m3
pcmpeqw m3, m3
vpgatherdd m4, [scalingq+m5], m3
@@ -896,5 +896,687 @@
.end_hv:
RET
+
+; Film grain application for a row of 4:2:0 chroma blocks (8 bpc, AVX2).
+; Named arguments, in order: dst, src, stride, fg_data, w, scaling,
+; grain_lut, h, sby, luma, lstride, uv_pl, is_id.
+; Constants set up here and kept for the whole function:
+;   m10 = 0x000000ff in every dword (mask for the gathered scaling bytes)
+;   m11 = pmulhrsw multiplier selected by FGData.scaling_shift
+;   m12 = upper clip bound, m13 = lower clip bound
+cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, lstride, uv_pl, is_id
+ pcmpeqw m10, m10
+ psrld m10, 24
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ lea r8, [pb_mask]
+%define base r8-pb_mask
+ ; word index into mul_bits is scaling_shift - 7
+ vpbroadcastw m11, [base+mul_bits+r7*2-14]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r9d, dword is_idm
+ vpbroadcastw m13, [base+min+r7*2]
+ ; index into max is clip << is_id: 0 when not clipping, 1 when clipping
+ ; with is_id == 0, 2 when clipping with is_id == 1
+ shlx r7d, r7d, r9d
+ vpbroadcastw m12, [base+max+r7*2]
+
+ ; a non-zero chroma_scaling_from_luma selects the variant emitted at .csfl
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+; Body of the kernel. %1 = 1 emits the variant that does not scale chroma
+; from luma (it mixes luma and chroma with uv_luma_mult / uv_mult /
+; uv_offset before the scaling lookup); %1 = 0 emits the
+; chroma_scaling_from_luma variant.
+%macro FGUV_32x32xN_LOOP 1 ; not-csfl
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+
+%if %1
+ ; r11m is the uv_pl argument; it indexes the per-plane parameters
+ mov r7d, dword r11m
+ vpbroadcastb m0, [fg_dataq+FGData.uv_mult+r7*4]
+ vpbroadcastb m1, [fg_dataq+FGData.uv_luma_mult+r7*4]
+ ; m14 = byte pairs { uv_luma_mult, uv_mult }, m15 = uv_offset as words
+ punpcklbw m14, m1, m0
+ vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4]
+%else
+ ; m14/m15 are free in this variant, so they hold the overlap blending
+ ; constants (1024 as words, { 23, 22 } as bytes) for the whole function
+ vpbroadcastd m14, [pw_1024]
+ vpbroadcastd m15, [pb_23_22]
+%endif
+
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+ movifnidn sbyd, sbym
+ ; vertical overlap is needed only when sby != 0 and overlap_flag is set
+ test sbyd, sbyd
+ setnz r7b
+ test r7b, overlapb
+ jnz %%vertical_overlap
+
+ ; derive the 16-bit seed of this block row from sby and FGData.seed
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ unused2, unused3, see, overlap, unused4, unused5, lstride
+
+ ; r9m = luma argument, r10m = lstride argument.
+ ; The end-of-row src/dst pointers are parked in the r11m/r12m argument
+ ; slots and the end-of-row luma pointer stays in r14; w is negated so it
+ ; counts up towards zero as columns are consumed.
+ mov lumaq, r9mp
+ lea r12, [srcq+wq]
+ lea r13, [dstq+wq]
+ lea r14, [lumaq+wq*2]
+ mov r11mp, r12
+ mov r12mp, r13
+ mov lstrideq, r10mp
+ neg wq
+
+%%loop_x:
+ ; one shift-register step: seed >>= 1, and bit 15 is set unless the
+ ; parity flag reports an even bit count for the masked seed bits
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, unused1, unused2, lstride
+
+ ; offx = seed bits 12-15, offy = seed bits 8-11; 82 is the byte distance
+ ; between two grain_lut rows
+ mov offxd, seed
+ rorx offyd, seed, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 82
+ lea offyq, [offyq+offxq+498] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, unused1, unused2, lstride
+
+ ; reload the row count and the grain_lut base for this column
+ mov hd, hm
+ mov grain_lutq, grain_lutmp
+; No-overlap inner loop. Each iteration handles 16 chroma pixels on two
+; rows: the low 128-bit lane holds the first row, the high lane the second.
+%%loop_y:
+ ; src
+ ; 32 luma pixels are loaded from lstride*0 and lstride*2, i.e. one luma
+ ; row per chroma row
+ mova xm4, [lumaq+lstrideq*0+ 0]
+ mova xm6, [lumaq+lstrideq*0+16]
+ mova xm0, [srcq]
+ vpbroadcastd m7, [pb_1]
+ vinserti128 m4, [lumaq+lstrideq*2 +0], 1
+ vinserti128 m6, [lumaq+lstrideq*2+16], 1
+ vinserti128 m0, [srcq+strideq], 1
+ pxor m2, m2
+ ; horizontal 2:1 downsample: sum neighbouring luma bytes, then
+ ; pavgw against zero gives (sum + 1) >> 1
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+
+%if %1
+ ; (luma * uv_luma_mult + chroma * uv_mult) >> 6, plus uv_offset
+ packuswb m4, m6 ; luma
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ punpckhwd m5, m4, m2
+ punpcklwd m4, m2
+ punpckhwd m7, m6, m2
+ punpcklwd m6, m2 ; m4-7: luma_src as dword
+
+ ; scaling[luma_src]
+ ; vpgatherdd clears its mask operand, so the all-ones masks are rebuilt
+ ; before every pair of gathers; each gather fetches a dword and m10 then
+ ; keeps only its low byte
+ pcmpeqw m3, m3
+ pcmpeqw m9, m9
+ vpgatherdd m8, [scalingq+m4], m3
+ vpgatherdd m4, [scalingq+m5], m9
+ pcmpeqw m3, m3
+ pcmpeqw m9, m9
+ vpgatherdd m5, [scalingq+m6], m3
+ vpgatherdd m6, [scalingq+m7], m9
+ pand m8, m10
+ pand m4, m10
+ pand m5, m10
+ pand m6, m10
+ packusdw m8, m4
+ packusdw m5, m6
+
+ ; unpack chroma_source
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu xm3, [grain_lutq+offxyq+ 0]
+ vinserti128 m3, [grain_lutq+offxyq+82], 1
+ ; sign-extend the grain bytes to words (m7 = sign mask)
+ pcmpgtb m7, m2, m3
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m8
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+
+ ; advance two chroma rows, four luma rows and two grain_lut rows
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*4]
+ add grain_lutq, 82*2
+ sub hb, 2
+ jg %%loop_y
+
+ ; next column of 16 chroma pixels; w is negative and counts up to zero
+ add wq, 16
+ jge %%end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*2]
+ add srcq, wq
+ add dstq, wq
+ ; without overlap every column is independent
+ test overlapd, overlapd
+ jz %%loop_x
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+ ; Used for every column after the first one when overlap_flag is set and
+ ; sby == 0; the first pixel of each row is blended with the grain that
+ ; the previous column's block would have continued with.
+%%loop_x_h_overlap:
+ ; same seed update as in %%loop_x
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, lstride
+
+ ; offy still holds the previous column's offxy at this point
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+ mov offxd, seed
+ rorx offyd, seed, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 82
+ lea offyq, [offyq+offxq+498] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, lstride
+
+ mov hd, hm
+ mov grain_lutq, grain_lutmp
+%%loop_y_h_overlap:
+ ; src
+ ; luma/chroma loading and scaling lookup are the same as in %%loop_y
+ mova xm4, [lumaq+lstrideq*0+ 0]
+ mova xm6, [lumaq+lstrideq*0+16]
+ mova xm0, [srcq]
+ vpbroadcastd m7, [pb_1]
+ vinserti128 m4, [lumaq+lstrideq*2 +0], 1
+ vinserti128 m6, [lumaq+lstrideq*2+16], 1
+ vinserti128 m0, [srcq+strideq], 1
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+
+%if %1
+ packuswb m4, m6 ; luma
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ punpckhwd m5, m4, m2
+ punpcklwd m4, m2
+ punpckhwd m7, m6, m2
+ punpcklwd m6, m2 ; m4-7: luma_src as dword
+
+ ; scaling[luma_src]
+ pcmpeqw m3, m3
+ pcmpeqw m9, m9
+ vpgatherdd m8, [scalingq+m4], m3
+ vpgatherdd m4, [scalingq+m5], m9
+ pcmpeqw m3, m3
+ pcmpeqw m9, m9
+ vpgatherdd m5, [scalingq+m6], m3
+ vpgatherdd m6, [scalingq+m7], m9
+ pand m8, m10
+ pand m4, m10
+ pand m5, m10
+ pand m6, m10
+ packusdw m8, m4
+ packusdw m5, m6
+
+ ; unpack chroma_source
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; grain = grain_lut[offy+y][offx+x]
+ ; in the not-csfl variant m14/m15 hold the uv multipliers, so the
+ ; blending constants are fetched from memory instead
+%if %1
+ vpbroadcastd m6, [pb_23_22] ; FIXME
+%endif
+ movu xm3, [grain_lutq+offxyq+ 0]
+ movd xm4, [grain_lutq+left_offxyq+ 0]
+ vinserti128 m3, [grain_lutq+offxyq+82], 1
+ vinserti128 m4, [grain_lutq+left_offxyq+82], 1
+ ; interleave { left, cur } and weight them with { 23, 22 };
+ ; pmulhrsw by 1024 is a rounding shift right by 5
+ punpcklbw m4, m3
+%if %1
+ pmaddubsw m4, m6, m4
+ pmulhrsw m4, [pw_1024]
+%else
+ pmaddubsw m4, m15, m4
+ pmulhrsw m4, m14
+%endif
+ packsswb m4, m4
+ ; mask with only byte 0 of each lane set: just the first pixel of
+ ; each row takes the blended value
+ pcmpeqw m6, m6 ; FIXME
+ psrldq m6, 15 ; FIXME
+ vpblendvb m3, m3, m4, m6
+ pcmpgtb m7, m2, m3
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m8
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*4]
+ add grain_lutq, 82*2
+ sub hb, 2
+ jg %%loop_y_h_overlap
+
+ ; next column of 16 chroma pixels
+ add wq, 16
+ jge %%end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*2]
+ add srcq, wq
+ add dstq, wq
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+
+; common exit of the paths without vertical overlap
+%%end:
+ RET
+
+; Entered when sby != 0 and overlap_flag is set. Two row seeds (this block
+; row and the one above it) are computed together and kept packed in one
+; 32-bit register.
+%%vertical_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
+ sby, see, overlap, unused1, unused2, lstride
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ unused1, unused2, see, overlap, unused3, unused4, lstride
+
+ ; same pointer setup as on the non-overlapped path: end-of-row src/dst
+ ; pointers go to r11m/r12m, the luma one stays in r14, w counts up to 0
+ mov lumaq, r9mp
+ lea r12, [srcq+wq]
+ lea r13, [dstq+wq]
+ lea r14, [lumaq+wq*2]
+ mov r11mp, r12
+ mov r12mp, r13
+ mov lstrideq, r10mp
+ neg wq
+
+%%loop_x_v_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ ; both packed seeds are advanced by one step at once
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, top_offxy, unused, lstride
+
+ ; offsets for both seeds are derived in parallel in the two 16-bit halves
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 82
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ ; the extra 16*82 lands in the low half, moving top_offxy 16 rows
+ ; further into grain_lut
+ lea offyq, [offyq+offxq+0x10001*498+16*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, top_offxy, unused, lstride
+
+ ; split the packed value: low half -> top_offxy, high half -> offxy
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ mov hd, hm
+ mov grain_lutq, grain_lutmp
+; First pair of rows of a column when only vertical overlap applies. The
+; first row is blended with the grain of the block row above, the second
+; row is used unmodified; all remaining rows are handled by %%loop_y.
+%%loop_y_v_overlap:
+ ; src
+ ; luma/chroma loading and scaling lookup are the same as in %%loop_y
+ mova xm4, [lumaq+lstrideq*0+ 0]
+ mova xm6, [lumaq+lstrideq*0+16]
+ mova xm0, [srcq]
+ vpbroadcastd m7, [pb_1]
+ vinserti128 m4, [lumaq+lstrideq*2 +0], 1
+ vinserti128 m6, [lumaq+lstrideq*2+16], 1
+ vinserti128 m0, [srcq+strideq], 1
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+
+%if %1
+ packuswb m4, m6 ; luma
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ punpckhwd m5, m4, m2
+ punpcklwd m4, m2
+ punpckhwd m7, m6, m2
+ punpcklwd m6, m2 ; m4-7: luma_src as dword
+
+ ; scaling[luma_src]
+ pcmpeqw m3, m3
+ pcmpeqw m9, m9
+ vpgatherdd m8, [scalingq+m4], m3
+ vpgatherdd m4, [scalingq+m5], m9
+ pcmpeqw m3, m3
+ pcmpeqw m9, m9
+ vpgatherdd m5, [scalingq+m6], m3
+ vpgatherdd m6, [scalingq+m7], m9
+ pand m8, m10
+ pand m4, m10
+ pand m5, m10
+ pand m6, m10
+ packusdw m8, m4
+ packusdw m5, m6
+
+ ; unpack chroma_source
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %1
+ vpbroadcastd m6, [pb_23_22]
+%endif
+ ; the 16 grain bytes of the first row are split 8 + 8 over both lanes
+ ; so that { top, cur } can be interleaved with punpcklbw
+ movq xm3, [grain_lutq+offxyq]
+ movq xm4, [grain_lutq+top_offxyq]
+ vinserti128 m3, [grain_lutq+offxyq+8], 1
+ vinserti128 m4, [grain_lutq+top_offxyq+8], 1
+ punpcklbw m4, m3
+%if %1
+ pmaddubsw m4, m6, m4
+ pmulhrsw m4, [pw_1024]
+%else
+ pmaddubsw m4, m15, m4
+ pmulhrsw m4, m14
+%endif
+ packsswb m4, m4
+ vpermq m4, m4, q3120
+ ; only interpolate first line, insert second line unmodified
+ vinserti128 m3, m4, [grain_lutq+offxyq+82], 1
+ pcmpgtb m7, m2, m3
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m8
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+
+ ; %%loop_y processes two rows before it tests h, so it may only be
+ ; entered while rows remain: leave when h has reached zero as well as
+ ; when it went negative, otherwise h == 2 would process two rows too many
+ sub hb, 2
+ jle %%end_y_v_overlap
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*4]
+ add grain_lutq, 82*2
+ jmp %%loop_y
+
+; reached when a vertically overlapped column had no rows left after its
+; first pair; advances to the next column of 16 chroma pixels
+%%end_y_v_overlap:
+ add wq, 16
+ jge %%end_hv
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*2]
+ add srcq, wq
+ add dstq, wq
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall-through to
+ ; h+v overlap
+
+; Column setup when both horizontal and vertical overlap apply (every
+; column after the first one when sby != 0 and overlap_flag is set).
+%%loop_x_hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+ ; the previous column's offsets, moved 16 bytes to the right, become the
+ ; left and top-left neighbours of this column
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offyq+16]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 82
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq+0x10001*498+16*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+ ; split the packed value: low half -> top_offxy, high half -> offxy
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ mov hd, hm
+ mov grain_lutq, grain_lutmp
+; First pair of rows of a column with horizontal and vertical overlap.
+; Remaining rows only need horizontal overlap and are handled by
+; %%loop_y_h_overlap.
+%%loop_y_hv_overlap:
+ ; src
+ ; luma/chroma loading and scaling lookup are the same as in %%loop_y
+ mova xm4, [lumaq+lstrideq*0+ 0]
+ mova xm6, [lumaq+lstrideq*0+16]
+ mova xm0, [srcq]
+ vpbroadcastd m7, [pb_1]
+ vinserti128 m4, [lumaq+lstrideq*2 +0], 1
+ vinserti128 m6, [lumaq+lstrideq*2+16], 1
+ vinserti128 m0, [srcq+strideq], 1
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+
+%if %1
+ packuswb m4, m6 ; luma
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ punpckhwd m5, m4, m2
+ punpcklwd m4, m2
+ punpckhwd m7, m6, m2
+ punpcklwd m6, m2 ; m4-7: src as dword
+
+ ; scaling[src]
+ pcmpeqw m9, m9
+ pcmpeqw m3, m3
+ vpgatherdd m8, [scalingq+m4], m9
+ vpgatherdd m4, [scalingq+m5], m3
+ pcmpeqw m9, m9
+ pcmpeqw m3, m3
+ vpgatherdd m5, [scalingq+m6], m9
+ vpgatherdd m6, [scalingq+m7], m3
+ pand m8, m10
+ pand m4, m10
+ pand m5, m10
+ pand m6, m10
+ packusdw m8, m4
+ packusdw m5, m6
+
+ ; unpack chroma source
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %1
+ vpbroadcastd m9, [pb_23_22]
+%endif
+ ; m3 = current grain (two rows), m6 = top row split 8 + 8 over the
+ ; lanes, m4 = left neighbour (two rows), xm7 = top-left neighbour
+ movu xm3, [grain_lutq+offxyq]
+ movq xm6, [grain_lutq+top_offxyq]
+ vinserti128 m3, [grain_lutq+offxyq+82], 1
+ vinserti128 m6, [grain_lutq+top_offxyq+8], 1
+ movd xm4, [grain_lutq+left_offxyq]
+ movd xm7, [grain_lutq+topleft_offxyq]
+ vinserti128 m4, [grain_lutq+left_offxyq+82], 1
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m4, m3
+ punpcklbw xm7, xm6
+%if %1
+ pmaddubsw m4, m9, m4
+ pmaddubsw xm7, xm9, xm7
+ pmulhrsw m4, [pw_1024]
+ pmulhrsw xm7, [pw_1024]
+%else
+ pmaddubsw m4, m15, m4
+ pmaddubsw xm7, xm15, xm7
+ pmulhrsw m4, m14
+ pmulhrsw xm7, xm14
+%endif
+ packsswb m4, m4
+ packsswb xm7, xm7
+ ; byte-0-of-each-lane mask for the current rows, then reduced to
+ ; byte 0 of the low lane only for the top row
+ pcmpeqw m9, m9 ; this is kind of ugly
+ psrldq m9, 15
+ vpblendvb m3, m3, m4, m9
+ shufpd m9, m9, m9, 1110b
+ vpblendvb m6, m6, m7, m9
+ vpermq m9, m3, q3120
+ ; followed by v interpolation (top | cur -> cur)
+ punpcklbw m6, m9
+%if %1
+ vpbroadcastd m9, [pb_23_22]
+ pmaddubsw m6, m9, m6
+ pmulhrsw m6, [pw_1024]
+%else
+ pmaddubsw m6, m15, m6
+ pmulhrsw m6, m14
+%endif
+ packsswb m6, m6
+ vpermq m6, m6, q3120
+ ; replace only the low lane (first row) with the vertically blended grain
+ vpblendd m3, m3, m6, 00001111b
+ pcmpgtb m7, m2, m3
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m2, m8
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+
+ ; continue with horizontal-only overlap while rows remain
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*4]
+ add grain_lutq, 82*2
+ sub hb, 2
+ jg %%loop_y_h_overlap
+
+%%end_y_hv_overlap:
+ ; next column of 16 chroma pixels
+ add wq, 16
+ jge %%end_hv
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*2]
+ add srcq, wq
+ add dstq, wq
+ jmp %%loop_x_hv_overlap
+
+; common exit of the vertical-overlap paths
+%%end_hv:
+ RET
+%endmacro
+
+ ; fall-through from the prologue: chroma_scaling_from_luma == 0
+ FGUV_32x32xN_LOOP 1
+; target of the prologue's jne: chroma_scaling_from_luma != 0
+.csfl:
+ FGUV_32x32xN_LOOP 0
%endif ; ARCH_X86_64
--- a/src/x86/film_grain_init_tmpl.c
+++ b/src/x86/film_grain_init_tmpl.c
@@ -30,6 +30,7 @@
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@@ -39,5 +40,6 @@
#if BITDEPTH == 8 && ARCH_X86_64
c->generate_grain_y = dav1d_generate_grain_y_avx2;
c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
#endif
}
--- a/tests/checkasm/filmgrain.c
+++ b/tests/checkasm/filmgrain.c
@@ -120,9 +120,7 @@
for (int y = 0; y < h; y++)
for (int x = 0; x < w; x++)
src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
- memcpy(a_dst, src, stride * h);
- memcpy(c_dst, src, stride * h);
- const int row_num = rnd() & 0x7ff;
+ const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;
fg_data.clip_to_restricted_range = rnd() & 1;
fg_data.scaling_shift = (rnd() & 3) + 8;
@@ -144,6 +142,122 @@
report("fgy_32x32xn");
}
+/*
+ * Checks every fguv_32x32xn kernel (one per chroma layout) against the
+ * reference, once with and once without chroma_scaling_from_luma. Film
+ * grain parameters, block size and pixel data are randomized; each
+ * configuration is compared with overlap_flag off and on, then benchmarked
+ * on a 32x16 block with overlap enabled.
+ */
+static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
+    ALIGN_STK_32(pixel, c_dst,    128 * 32,);
+    ALIGN_STK_32(pixel, a_dst,    128 * 32,);
+    ALIGN_STK_32(pixel, src,      128 * 32,);
+    ALIGN_STK_32(pixel, luma_src, 128 * 32,);
+    const ptrdiff_t lstride = 128 * sizeof(pixel);
+
+    declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
+                 const Dav1dFilmGrainData *data, size_t pw,
+                 const uint8_t scaling[SCALING_SIZE],
+                 const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num,
+                 const pixel *luma_row, ptrdiff_t luma_stride, int uv_pl,
+                 int is_identity HIGHBD_DECL_SUFFIX);
+
+    /* layout suffix used in the reported function names */
+    static const char ss_name[][4] = {
+        [DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
+        [DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
+        [DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
+    };
+
+    for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
+        const enum Dav1dPixelLayout layout = layout_idx + 1;
+        const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
+        const ptrdiff_t stride = (ss_x ? 96 : 128) * sizeof(pixel);
+
+        for (int csfl = 0; csfl <= 1; csfl++) {
+            if (!check_func(dsp->fguv_32x32xn[layout_idx],
+                            "fguv_32x32xn_%dbpc_%s_csfl%d",
+                            BITDEPTH, ss_name[layout_idx], csfl))
+            {
+                continue;
+            }
+
+            Dav1dFilmGrainData fg_data;
+
+            fg_data.seed = rnd() & 0xFFFF;
+
+#if BITDEPTH == 16
+            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+            const int bitdepth_max = 0xff;
+#endif
+            const int uv_pl = rnd() & 1;
+            const int is_identity = rnd() & 1;
+
+            /* grain templates: luma first, the chroma one derives from it */
+            uint8_t scaling[SCALING_SIZE];
+            entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
+            fg_data.grain_scale_shift = rnd() & 3;
+            fg_data.ar_coeff_shift = (rnd() & 3) + 6;
+            fg_data.ar_coeff_lag = rnd() & 3;
+            const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
+            for (int i = 0; i < num_y_pos; i++)
+                fg_data.ar_coeffs_y[i] = (rnd() & 0xff) - 128;
+            dsp->generate_grain_y(grain_lut[0], &fg_data HIGHBD_TAIL_SUFFIX);
+            dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0],
+                                               &fg_data, uv_pl HIGHBD_TAIL_SUFFIX);
+
+            /* random chroma block size and the matching luma size */
+            const int w = 1 + (rnd() & (127 >> ss_x));
+            const int h = 1 + (rnd() & (31 >> ss_y));
+            const int lw = w << ss_x, lh = h << ss_y;
+
+            for (int y = 0; y < h; y++) {
+                pixel *const row = &src[y * PXSTRIDE(stride)];
+                for (int x = 0; x < w; x++)
+                    row[x] = rnd() & bitdepth_max;
+            }
+            for (int y = 0; y < lh; y++) {
+                pixel *const row = &luma_src[y * PXSTRIDE(lstride)];
+                for (int x = 0; x < lw; x++)
+                    row[x] = rnd() & bitdepth_max;
+            }
+
+            /* half of the runs use the first block row (no vertical overlap) */
+            int row_num = 0;
+            if (rnd() & 1)
+                row_num = rnd() & 0x7ff;
+
+            if (csfl) {
+                /* scaling function comes from the luma points */
+                fg_data.num_y_points = 2 + (rnd() % 13);
+                const int pad = 0xff / fg_data.num_y_points;
+                for (int i = 0; i < fg_data.num_y_points; i++) {
+                    fg_data.y_points[i][0] = 0xff * i / fg_data.num_y_points;
+                    fg_data.y_points[i][0] += rnd() % pad;
+                    fg_data.y_points[i][1] = rnd() & 0xff;
+                }
+                generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
+                                 fg_data.num_y_points, scaling);
+            } else {
+                /* scaling function comes from this plane's own points */
+                fg_data.num_uv_points[uv_pl] = 2 + (rnd() % 9);
+                const int pad = 0xff / fg_data.num_uv_points[uv_pl];
+                for (int i = 0; i < fg_data.num_uv_points[uv_pl]; i++) {
+                    fg_data.uv_points[uv_pl][i][0] = 0xff * i / fg_data.num_uv_points[uv_pl];
+                    fg_data.uv_points[uv_pl][i][0] += rnd() % pad;
+                    fg_data.uv_points[uv_pl][i][1] = rnd() & 0xff;
+                }
+                generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.uv_points[uv_pl],
+                                 fg_data.num_uv_points[uv_pl], scaling);
+
+                fg_data.uv_mult[uv_pl] = (rnd() & 0xff) - 128;
+                fg_data.uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
+                fg_data.uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
+            }
+
+            fg_data.clip_to_restricted_range = rnd() & 1;
+            fg_data.scaling_shift = (rnd() & 3) + 8;
+            fg_data.chroma_scaling_from_luma = csfl;
+
+            /* compare reference and candidate without and with overlap */
+            for (int overlap = 0; overlap <= 1; overlap++) {
+                fg_data.overlap_flag = overlap;
+
+                call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
+                         row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
+                call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
+                         row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
+
+                checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
+            }
+
+            fg_data.overlap_flag = 1;
+            bench_new(a_dst, src, stride, &fg_data, 32, scaling, grain_lut[1], 16,
+                      row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
+        }
+    }
+
+    report("fguv_32x32xn");
+}
+
void bitfn(checkasm_check_filmgrain)(void) {
Dav1dFilmGrainDSPContext c;
@@ -151,4 +265,5 @@
check_gen_grny(&c);
check_fgy_sbrow(&c);
+ check_fguv_sbrow(&c);
}