ref: 275e91de9e3c6ec03a08e617c81923d938eaaa7f
parent: fcc94fa905ba4d87f3383d517fbf6fcb08006ffc
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Sat Mar 28 05:22:15 EDT 2020
x86: add AVX2 versions for filmgrain.fguv_32x32xn[422/444] fguv_32x32xn_8bpc_420_csfl0_c: 14568.2 fguv_32x32xn_8bpc_420_csfl0_avx2: 940.2 fguv_32x32xn_8bpc_420_csfl1_c: 10682.0 fguv_32x32xn_8bpc_420_csfl1_avx2: 783.3 fguv_32x32xn_8bpc_422_csfl0_c: 16370.5 fguv_32x32xn_8bpc_422_csfl0_avx2: 1557.3 fguv_32x32xn_8bpc_422_csfl1_c: 11333.8 fguv_32x32xn_8bpc_422_csfl1_avx2: 902.1 fguv_32x32xn_8bpc_444_csfl0_c: 12950.1 fguv_32x32xn_8bpc_444_csfl0_avx2: 822.9 fguv_32x32xn_8bpc_444_csfl1_c: 8806.7 fguv_32x32xn_8bpc_444_csfl1_avx2: 708.2
--- a/src/x86/film_grain.asm
+++ b/src/x86/film_grain.asm
@@ -28,6 +28,8 @@
%if ARCH_X86_64
SECTION_RODATA 32
+pb_8x_27_17_8x_17_27: times 8 db 27, 17 ; bytes 0-15: (27, 17) byte pairs x8, pmaddubsw weights for the vertical overlap blend
+ times 8 db 17, 27 ; bytes 16-31: (17, 27) byte pairs x8, addressed as label+16
pw_1024: times 16 dw 1024
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
@@ -1457,8 +1459,9 @@
.end_hv:
RET
-cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
- grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, lstride, uv_pl, is_id
pcmpeqw m10, m10
psrld m10, 24
mov r7d, [fg_dataq+FGData.scaling_shift]
@@ -1474,7 +1477,7 @@
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
jne .csfl
-%macro FGUV_32x32xN_LOOP 1 ; not-csfl
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
%if %1
@@ -1485,8 +1488,12 @@
vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4]
%else
vpbroadcastd m14, [pw_1024]
+%if %2
vpbroadcastd m15, [pb_23_22]
+%else
+ vpbroadcastd xm15, [pb_27_17_17_27]
%endif
+%endif
mov overlapd, [fg_dataq+FGData.overlap_flag]
movifnidn sbyd, sbym
@@ -1507,7 +1514,7 @@
mov lumaq, r9mp
lea r12, [srcq+wq]
lea r13, [dstq+wq]
- lea r14, [lumaq+wq*2]
+ lea r14, [lumaq+wq*(1+%2)]
mov r11mp, r12
mov r12mp, r13
mov lstrideq, r10mp
@@ -1528,8 +1535,8 @@
rorx offyd, seed, 8
shr offxd, 12
and offyd, 0xf
- imul offyd, 82
- lea offyq, [offyq+offxq+498] ; offy*stride+offx
+ imul offyd, 164>>%3 ; 164 = 2*82: offy counts pairs of 82-byte grain rows, single rows when ss_ver (%3) is 1
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
h, offxy, see, overlap, unused1, unused2, lstride
@@ -1538,12 +1545,13 @@
mov grain_lutq, grain_lutmp
%%loop_y:
; src
+%if %2
mova xm4, [lumaq+lstrideq*0+ 0]
mova xm6, [lumaq+lstrideq*0+16]
mova xm0, [srcq]
vpbroadcastd m7, [pb_1]
- vinserti128 m4, [lumaq+lstrideq*2 +0], 1
- vinserti128 m6, [lumaq+lstrideq*2+16], 1
+ vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1
+ vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1
vinserti128 m0, [srcq+strideq], 1
pxor m2, m2
pmaddubsw m4, m7
@@ -1550,9 +1558,16 @@
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
+%else
+ pxor m2, m2
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%endif
%if %1
+%if %2
packuswb m4, m6 ; luma
+%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -1564,6 +1579,9 @@
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
%endif
punpckhwd m5, m4, m2
@@ -1592,8 +1610,12 @@
punpcklbw m0, m2 ; m0-1: src as word
; grain = grain_lut[offy+y][offx+x]
+%if %2
movu xm3, [grain_lutq+offxyq+ 0]
vinserti128 m3, [grain_lutq+offxyq+82], 1
+%else
+ movu m3, [grain_lutq+offxyq]
+%endif
pcmpgtb m7, m2, m3
punpcklbw m2, m3, m7
punpckhbw m3, m7
@@ -1612,21 +1634,31 @@
pminsw m0, m12
pminsw m1, m12
packuswb m0, m1
+%if %2
mova [dstq], xm0
vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+%if %2
lea srcq, [srcq+strideq*2]
lea dstq, [dstq+strideq*2]
- lea lumaq, [lumaq+lstrideq*4]
- add grain_lutq, 82*2
- sub hb, 2
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82<<%2
+ sub hb, 1+%2
jg %%loop_y
- add wq, 16
+ add wq, 32>>%2
jge %%end
mov srcq, r11mp
mov dstq, r12mp
- lea lumaq, [r14+wq*2]
+ lea lumaq, [r14+wq*(1+%2)]
add srcq, wq
add dstq, wq
test overlapd, overlapd
@@ -1648,13 +1680,13 @@
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
offx, offy, see, left_offxy, unused1, unused2, lstride
- lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+ lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx
mov offxd, seed
rorx offyd, seed, 8
shr offxd, 12
and offyd, 0xf
- imul offyd, 82
- lea offyq, [offyq+offxq+498] ; offy*stride+offx
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
h, offxy, see, left_offxy, unused1, unused2, lstride
@@ -1663,12 +1695,13 @@
mov grain_lutq, grain_lutmp
%%loop_y_h_overlap:
; src
+%if %2
mova xm4, [lumaq+lstrideq*0+ 0]
mova xm6, [lumaq+lstrideq*0+16]
mova xm0, [srcq]
vpbroadcastd m7, [pb_1]
- vinserti128 m4, [lumaq+lstrideq*2 +0], 1
- vinserti128 m6, [lumaq+lstrideq*2+16], 1
+ vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1
+ vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1
vinserti128 m0, [srcq+strideq], 1
pxor m2, m2
pmaddubsw m4, m7
@@ -1675,9 +1708,16 @@
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+ pxor m2, m2
+%endif
%if %1
+%if %2
packuswb m4, m6 ; luma
+%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -1689,6 +1729,9 @@
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
%endif
punpckhwd m5, m4, m2
@@ -1717,6 +1760,7 @@
punpcklbw m0, m2 ; m0-1: src as word
; grain = grain_lut[offy+y][offx+x]
+%if %2
%if %1
vpbroadcastd m6, [pb_23_22] ; FIXME
%endif
@@ -1736,6 +1780,25 @@
pcmpeqw m6, m6 ; FIXME
psrldq m6, 15 ; FIXME
vpblendvb m3, m3, m4, m6
+%else
+%if %1
+ vpbroadcastd xm6, [pb_27_17_17_27]
+%endif
+ movu m3, [grain_lutq+offxyq]
+ movd xm4, [grain_lutq+left_offxyq]
+ punpcklbw xm4, xm3
+%if %1
+ pmaddubsw xm4, xm6, xm4
+ pmulhrsw xm4, [pw_1024]
+%else
+ pmaddubsw xm4, xm15, xm4
+ pmulhrsw xm4, xm14
+%endif
+ packsswb xm4, xm4
+ pcmpeqw xm6, xm6
+ psrldq xm6, 14
+ vpblendvb m3, m3, m4, m6
+%endif
pcmpgtb m7, m2, m3
punpcklbw m2, m3, m7
punpckhbw m3, m7
@@ -1754,21 +1817,31 @@
pminsw m0, m12
pminsw m1, m12
packuswb m0, m1
+%if %2
mova [dstq], xm0
vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+%if %2
lea srcq, [srcq+strideq*2]
lea dstq, [dstq+strideq*2]
- lea lumaq, [lumaq+lstrideq*4]
- add grain_lutq, 82*2
- sub hb, 2
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82<<%2 ; advance 1+ss_hor grain rows of 82 bytes; same value as 82*(1+%2), written like the other loops
+ sub hb, 1+%2 ; each iteration consumes 1+ss_hor rows
jg %%loop_y_h_overlap
- add wq, 16
+ add wq, 32>>%2
jge %%end
mov srcq, r11mp
mov dstq, r12mp
- lea lumaq, [r14+wq*2]
+ lea lumaq, [r14+wq*(1+%2)]
add srcq, wq
add dstq, wq
@@ -1801,7 +1874,7 @@
mov lumaq, r9mp
lea r12, [srcq+wq]
lea r13, [dstq+wq]
- lea r14, [lumaq+wq*2]
+ lea r14, [lumaq+wq*(1+%2)]
mov r11mp, r12
mov r12mp, r13
mov lstrideq, r10mp
@@ -1828,9 +1901,9 @@
rorx offxd, seed, 12
and offyd, 0xf000f
and offxd, 0xf000f
- imul offyd, 82
+ imul offyd, 164>>%3
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
- lea offyq, [offyq+offxq+0x10001*498+16*82]
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
h, offxy, see, overlap, top_offxy, unused, lstride
@@ -1840,14 +1913,18 @@
mov hd, hm
mov grain_lutq, grain_lutmp
+%if %2 == 0
+ vbroadcasti128 m1, [pb_8x_27_17_8x_17_27] ; weights for the first overlap row (top*27 + cur*17); the +16 half is loaded before the second row
+%endif
%%loop_y_v_overlap:
; src
+%if %2
mova xm4, [lumaq+lstrideq*0+ 0]
mova xm6, [lumaq+lstrideq*0+16]
mova xm0, [srcq]
vpbroadcastd m7, [pb_1]
- vinserti128 m4, [lumaq+lstrideq*2 +0], 1
- vinserti128 m6, [lumaq+lstrideq*2+16], 1
+ vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1
+ vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1
vinserti128 m0, [srcq+strideq], 1
pxor m2, m2
pmaddubsw m4, m7
@@ -1854,9 +1931,16 @@
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+ pxor m2, m2
+%endif
%if %1
+%if %2
packuswb m4, m6 ; luma
+%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -1868,6 +1952,9 @@
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
%endif
punpckhwd m5, m4, m2
@@ -1891,12 +1978,43 @@
packusdw m8, m4
packusdw m5, m6
+%if %2
; unpack chroma_source
punpckhbw m1, m0, m2
punpcklbw m0, m2 ; m0-1: src as word
+%endif
; grain = grain_lut[offy+y][offx+x]
+%if %3 == 0
+%if %2
+ mova m6, [pb_8x_27_17_8x_17_27]
+ movu xm3, [grain_lutq+offxyq]
+ movu xm4, [grain_lutq+top_offxyq]
+ vinserti128 m3, [grain_lutq+offxyq+82], 1
+ vinserti128 m4, [grain_lutq+top_offxyq+82], 1
+%else
+ movu m3, [grain_lutq+offxyq]
+ movu m4, [grain_lutq+top_offxyq]
+%endif
+ punpckhbw m9, m4, m3
+ punpcklbw m4, m3
+%if %2
+ pmaddubsw m9, m6, m9
+ pmaddubsw m4, m6, m4
+%else
+ pmaddubsw m9, m1, m9
+ pmaddubsw m4, m1, m4
+%endif
%if %1
+ pmulhrsw m9, [pw_1024]
+ pmulhrsw m4, [pw_1024]
+%else
+ pmulhrsw m9, m14
+ pmulhrsw m4, m14
+%endif
+ packsswb m3, m4, m9
+%else
+%if %1
vpbroadcastd m6, [pb_23_22]
%endif
movq xm3, [grain_lutq+offxyq]
@@ -1915,6 +2033,7 @@
vpermq m4, m4, q3120
; only interpolate first line, insert second line unmodified
vinserti128 m3, m4, [grain_lutq+offxyq+82], 1
+%endif
pcmpgtb m7, m2, m3
punpcklbw m2, m3, m7
punpckhbw m3, m7
@@ -1926,6 +2045,7 @@
pmulhrsw m3, m11
; dst = clip_pixel(src, noise)
+%if %2
paddw m0, m2
paddw m1, m3
pmaxsw m0, m13
@@ -1935,21 +2055,46 @@
packuswb m0, m1
mova [dstq], xm0
vextracti128 [dstq+strideq], m0, 1
+%else
+ pxor m6, m6
+ punpckhbw m9, m0, m6
+ punpcklbw m0, m6 ; m0-1: src as word
- sub hb, 2
+ paddw m0, m2
+ paddw m9, m3
+ pmaxsw m0, m13
+ pmaxsw m9, m13
+ pminsw m0, m12
+ pminsw m9, m12
+ packuswb m0, m9
+ mova [dstq], m0
+%endif
+
+ sub hb, 1+%2 ; rows still to do after this iteration (1+ss_hor rows were just written)
- jl %%end_y_v_overlap
+ jle %%end_y_v_overlap ; leave on zero as well as negative: with jl a count of exactly 0 ran on and processed rows past h
+%if %2
lea srcq, [srcq+strideq*2]
lea dstq, [dstq+strideq*2]
- lea lumaq, [lumaq+lstrideq*4]
- add grain_lutq, 82*2
+ lea lumaq, [lumaq+lstrideq*(2<<%3)] ; two chroma rows span 2<<ss_ver luma rows
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82<<%2 ; advance 1+ss_hor grain rows of 82 bytes
+%if %2 == 0
+ vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] ; weights for the second overlap row (top*17 + cur*27)
+ btc hd, 16 ; toggle bit 16 of hd; CF receives its previous value
+ jnc %%loop_y_v_overlap ; bit was clear: only the first overlap row is done, run the second
+%endif
jmp %%loop_y
%%end_y_v_overlap:
- add wq, 16
+ add wq, 32>>%2
jge %%end_hv
mov srcq, r11mp
mov dstq, r12mp
- lea lumaq, [r14+wq*2]
+ lea lumaq, [r14+wq*(1+%2)]
add srcq, wq
add dstq, wq
@@ -1974,15 +2119,15 @@
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
- lea topleft_offxyq, [top_offxyq+16]
- lea left_offxyq, [offyq+16]
+ lea topleft_offxyq, [top_offxyq+(32>>%2)]
+ lea left_offxyq, [offyq+(32>>%2)]
rorx offyd, seed, 8
rorx offxd, seed, 12
and offyd, 0xf000f
and offxd, 0xf000f
- imul offyd, 82
+ imul offyd, 164>>%3
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
- lea offyq, [offyq+offxq+0x10001*498+16*82]
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
@@ -1992,14 +2137,18 @@
mov hd, hm
mov grain_lutq, grain_lutmp
+%if %2 == 0
+ vbroadcasti128 m1, [pb_8x_27_17_8x_17_27]
+%endif
%%loop_y_hv_overlap:
; src
+%if %2
mova xm4, [lumaq+lstrideq*0+ 0]
mova xm6, [lumaq+lstrideq*0+16]
mova xm0, [srcq]
vpbroadcastd m7, [pb_1]
- vinserti128 m4, [lumaq+lstrideq*2 +0], 1
- vinserti128 m6, [lumaq+lstrideq*2+16], 1
+ vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1
+ vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1
vinserti128 m0, [srcq+strideq], 1
pxor m2, m2
pmaddubsw m4, m7
@@ -2006,9 +2155,16 @@
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+ pxor m2, m2
+%endif
%if %1
+%if %2
packuswb m4, m6 ; luma
+%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -2020,6 +2176,9 @@
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
%endif
punpckhwd m5, m4, m2
@@ -2043,44 +2202,94 @@
packusdw m8, m4
packusdw m5, m6
+%if %2
; unpack chroma source
punpckhbw m1, m0, m2
punpcklbw m0, m2 ; m0-1: src as word
+%endif
; grain = grain_lut[offy+y][offx+x]
%if %1
+%if %2
vpbroadcastd m9, [pb_23_22]
+%else
+ vpbroadcastd xm9, [pb_27_17_17_27]
%endif
+%endif
+
+%if %2
movu xm3, [grain_lutq+offxyq]
+%if %3
movq xm6, [grain_lutq+top_offxyq]
+%else
+ movu xm6, [grain_lutq+top_offxyq]
+%endif
vinserti128 m3, [grain_lutq+offxyq+82], 1
+%if %3
vinserti128 m6, [grain_lutq+top_offxyq+8], 1
+%else
+ vinserti128 m6, [grain_lutq+top_offxyq+82], 1
+%endif
+%else
+ movu m3, [grain_lutq+offxyq]
+ movu m6, [grain_lutq+top_offxyq]
+%endif
movd xm4, [grain_lutq+left_offxyq]
movd xm7, [grain_lutq+topleft_offxyq]
+%if %2
vinserti128 m4, [grain_lutq+left_offxyq+82], 1
+%if %3 == 0
+ vinserti128 m7, [grain_lutq+topleft_offxyq+82], 1
+%endif
+%endif
+
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+%if %2
punpcklbw m4, m3
+%if %3
punpcklbw xm7, xm6
+%else
+ punpcklbw m7, m6
+%endif
+ punpcklwd m4, m7
%if %1
pmaddubsw m4, m9, m4
- pmaddubsw xm7, xm9, xm7
pmulhrsw m4, [pw_1024]
- pmulhrsw xm7, [pw_1024]
%else
pmaddubsw m4, m15, m4
- pmaddubsw xm7, xm15, xm7
pmulhrsw m4, m14
- pmulhrsw xm7, xm14
%endif
packsswb m4, m4
- packsswb xm7, xm7
pcmpeqw m9, m9 ; this is kind of ugly
psrldq m9, 15
vpblendvb m3, m3, m4, m9
- shufpd m9, m9, m9, 1110b
- vpblendvb m6, m6, m7, m9
- vpermq m9, m3, q3120
+ psrldq m4, 1
+%if %3
+ shufpd m9, m9, m9, 1110b ; clear upper lane
+%endif
+ vpblendvb m6, m6, m4, m9
+%else
+ punpcklbw xm4, xm3
+ punpcklbw xm7, xm6
+ punpckldq xm4, xm7
+%if %1
+ pmaddubsw xm4, xm9, xm4
+ pmulhrsw xm4, [pw_1024]
+%else
+ pmaddubsw xm4, xm15, xm4
+ pmulhrsw xm4, xm14
+%endif
+ packsswb xm4, xm4
+ pcmpeqw xm9, xm9 ; this is kind of ugly
+ psrldq xm9, 14
+ vpblendvb m3, m3, m4, m9
+ psrldq xm4, 2
+ vpblendvb m6, m6, m4, m9
+%endif
+
; followed by v interpolation (top | cur -> cur)
+%if %3
+ vpermq m9, m3, q3120
punpcklbw m6, m9
%if %1
vpbroadcastd m9, [pb_23_22]
@@ -2093,6 +2302,26 @@
packsswb m6, m6
vpermq m6, m6, q3120
vpblendd m3, m3, m6, 00001111b
+%else
+ punpckhbw m9, m6, m3
+ punpcklbw m6, m3
+%if %2
+ mova m3, [pb_8x_27_17_8x_17_27]
+ pmaddubsw m9, m3, m9
+ pmaddubsw m6, m3, m6
+%else
+ pmaddubsw m9, m1, m9
+ pmaddubsw m6, m1, m6
+%endif
+%if %1
+ pmulhrsw m9, [pw_1024]
+ pmulhrsw m6, [pw_1024]
+%else
+ pmulhrsw m9, m14
+ pmulhrsw m6, m14
+%endif
+ packsswb m3, m6, m9
+%endif
pcmpgtb m7, m2, m3
punpcklbw m2, m3, m7
punpckhbw m3, m7
@@ -2104,6 +2333,7 @@
pmulhrsw m3, m11
; dst = clip_pixel(src, noise)
+%if %2
paddw m0, m2
paddw m1, m3
pmaxsw m0, m13
@@ -2113,20 +2343,47 @@
packuswb m0, m1
mova [dstq], xm0
vextracti128 [dstq+strideq], m0, 1
+%else
+ pxor m6, m6
+ punpckhbw m9, m0, m6
+ punpcklbw m0, m6 ; m0-1: src as word
+ paddw m0, m2
+ paddw m9, m3
+ pmaxsw m0, m13
+ pmaxsw m9, m13
+ pminsw m0, m12
+ pminsw m9, m12
+ packuswb m0, m9
+ mova [dstq], m0
+%endif
+%if %2
lea srcq, [srcq+strideq*2]
lea dstq, [dstq+strideq*2]
- lea lumaq, [lumaq+lstrideq*4]
- add grain_lutq, 82*2
- sub hb, 2
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82<<%2
+ sub hb, 1+%2
+%if %2
jg %%loop_y_h_overlap
+%else
+ je %%end_y_hv_overlap ; ZF from the preceding sub hb: no rows left in this column
+ vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] ; weights for the second overlap row (top*17 + cur*27)
+ btc hd, 16 ; toggle bit 16 of hd; CF receives its previous value
+ jnc %%loop_y_hv_overlap ; bit was clear: first overlap row just finished, do the second with h+v overlap
+ jmp %%loop_y_h_overlap ; bit was set: both overlap rows done, continue with horizontal overlap only
+%endif
%%end_y_hv_overlap:
- add wq, 16
+ add wq, 32>>%2
jge %%end_hv
mov srcq, r11mp
mov dstq, r12mp
- lea lumaq, [r14+wq*2]
+ lea lumaq, [r14+wq*(1+%2)]
add srcq, wq
add dstq, wq
jmp %%loop_x_hv_overlap
@@ -2135,8 +2392,13 @@
RET
%endmacro
- FGUV_32x32xN_LOOP 1
+ %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
- FGUV_32x32xN_LOOP 0
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+%endmacro
+
+FGUV_FN 420, 1, 1 ; fguv_32x32xn_i420: ss_hor=1, ss_ver=1
+FGUV_FN 422, 1, 0 ; fguv_32x32xn_i422: ss_hor=1, ss_ver=0
+FGUV_FN 444, 0, 0 ; fguv_32x32xn_i444: ss_hor=0, ss_ver=0
%endif ; ARCH_X86_64
--- a/src/x86/film_grain_init_tmpl.c
+++ b/src/x86/film_grain_init_tmpl.c
@@ -41,6 +41,8 @@
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_avx2);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_avx2);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_avx2);
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@@ -65,5 +67,7 @@
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_avx2;
c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_avx2;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_avx2;
#endif
}