shithub: dav1d

Download patch

ref: 6b85daf042d3c7d88c6e3be8a273a6ed1fcbf1a2
parent: dab82163cceb61a2b734189488f5da9e90e98f6f
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Wed Mar 25 05:00:03 EDT 2020

x86: add SSSE3 SIMD for generate_grain_uv_{422,444}

gen_grain_uv_ar0_8bpc_420_c: 72275.4
gen_grain_uv_ar0_8bpc_420_ssse3: 7274.8
gen_grain_uv_ar0_8bpc_422_c: 111742.9
gen_grain_uv_ar0_8bpc_422_ssse3: 13724.8
gen_grain_uv_ar0_8bpc_444_c: 205688.5
gen_grain_uv_ar0_8bpc_444_ssse3: 26218.3
gen_grain_uv_ar1_8bpc_420_c: 100682.5
gen_grain_uv_ar1_8bpc_420_ssse3: 20168.4
gen_grain_uv_ar1_8bpc_422_c: 167931.4
gen_grain_uv_ar1_8bpc_422_ssse3: 39524.7
gen_grain_uv_ar1_8bpc_444_c: 323812.2
gen_grain_uv_ar1_8bpc_444_ssse3: 77930.3
gen_grain_uv_ar2_8bpc_420_c: 159545.7
gen_grain_uv_ar2_8bpc_420_ssse3: 25849.7
gen_grain_uv_ar2_8bpc_422_c: 295959.9
gen_grain_uv_ar2_8bpc_422_ssse3: 49286.6
gen_grain_uv_ar2_8bpc_444_c: 571862.2
gen_grain_uv_ar2_8bpc_444_ssse3: 98814.2
gen_grain_uv_ar3_8bpc_420_c: 243445.9
gen_grain_uv_ar3_8bpc_420_ssse3: 28806.2
gen_grain_uv_ar3_8bpc_422_c: 458189.9
gen_grain_uv_ar3_8bpc_422_ssse3: 56629.9
gen_grain_uv_ar3_8bpc_444_c: 883627.3
gen_grain_uv_ar3_8bpc_444_ssse3: 114761.2

Also contains slight fixes to generate_grain_uv.ar0 to not pack before
adding the current grain value. Fixes overflows in e.g. seed=1115072968.

--- a/src/x86/film_grain_init_tmpl.c
+++ b/src/x86/film_grain_init_tmpl.c
@@ -30,6 +30,8 @@
 
 decl_generate_grain_y_fn(dav1d_generate_grain_y_ssse3);
 decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_ssse3);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_ssse3);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_ssse3);
 decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
 decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);
 
@@ -48,6 +50,8 @@
 #if BITDEPTH == 8
     c->generate_grain_y = dav1d_generate_grain_y_ssse3;
     c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_ssse3;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_ssse3;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_ssse3;
     c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
     c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
 #endif
--- a/src/x86/film_grain_ssse3.asm
+++ b/src/x86/film_grain_ssse3.asm
@@ -60,6 +60,8 @@
 
 JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3
 JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444_ssse3, 0, 1, 2, 3
 
 struc FGData
     .seed:                      resd 1
@@ -502,8 +504,9 @@
     jg .y_loop_ar3
     RET
 
+%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
 INIT_XMM ssse3
-cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
+cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
     movifnidn        r2, r2mp
     movifnidn        r3, r3mp
     LEA              r4, $$
@@ -520,15 +523,21 @@
     pshuflw          m6, m6, q0000
     pshuflw          m0, m0, q0000
     lea              r6, [base+gaussian_sequence]
+%if %2
 %if ARCH_X86_64
-    mov             r7d, 38
+    mov             r7d, 73-35*%3
 %else
-    mov            r3mp, 38
+    mov            r3mp, 73-35*%3
 %endif
     add            bufq, 44
 .loop_y:
     mov              r5, -44
 .loop_x:
+%else
+    mov              r5, -82*73
+    sub            bufq, r5
+.loop:
+%endif
     pand             m2, m0, m1
     psrlw            m3, m2, 10
     por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
@@ -577,6 +586,7 @@
     packsswb         m3, m3
     movd      [bufq+r5], m3
     add              r5, 4
+%if %2
     jl .loop_x
     add            bufq, 82
 %if ARCH_X86_64
@@ -585,6 +595,9 @@
     dec            r3mp
 %endif
     jg .loop_y
+%else
+    jl .loop
+%endif
 
 %if ARCH_X86_32
     mov              r2, r2mp
@@ -592,8 +605,8 @@
 
     ; auto-regression code
     movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
-    movsxd           r5, [base+generate_grain_uv_420_ssse3_table+r5*4]
-    lea              r5, [r5+base+generate_grain_uv_420_ssse3_table]
+    movsxd           r5, [base+generate_grain_uv_%1_ssse3_table+r5*4]
+    lea              r5, [r5+base+generate_grain_uv_%1_ssse3_table]
     jmp              r5
 
 .ar0:
@@ -607,79 +620,130 @@
     mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
     movd             m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
     movd             m4, [base+hmul_bits+shiftq*2]
-    movd             m1, [base+byte_blend]
-    DEFINE_ARGS buf, bufy, h
+    DEFINE_ARGS buf, bufy, h, x
     pxor             m0, m0
     pcmpgtb          m0, m5
     punpcklbw        m5, m0
     movd             m7, [base+pb_1]
-    movd             m6, [base+hmul_bits+4]
+%if %2
+    movd             m6, [base+hmul_bits+2+%3*2]
+%endif
     pshuflw          m5, m5, q0000
     pshuflw          m4, m4, q0000
     pshufd           m7, m7, q0000
+%if %2
     pshuflw          m6, m6, q0000
+%endif
     punpcklqdq       m5, m5
     punpcklqdq       m4, m4
+%if %2
     punpcklqdq       m6, m6
-    punpcklbw        m1, m1
+%endif
+    pcmpeqw          m1, m1
+    pslldq           m1, 12>>%2
     SCRATCH           1, 8, 0
     SCRATCH           4, 9, 1
-    sub            bufq, 82*38+82-(82*3+41)
+%if %2
+    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
+%else
+    sub            bufq, 82*70-3
+%endif
     add           bufyq, 3+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 .y_loop_ar0:
+    xor              xd, xd
+.x_loop_ar0:
     ; first 32 pixels
-    movu             m1, [bufyq]
-    movu             m2, [bufyq+82]
-    movu             m3, [bufyq+16]
-    movu             m4, [bufyq+82+16]
+%if %2
+    movu             m1, [bufyq+xq*2]
+%if %3
+    movu             m2, [bufyq+xq*2+82]
+%endif
+    movu             m3, [bufyq+xq*2+16]
+%if %3
+    movu             m4, [bufyq+xq*2+82+16]
+%endif
     pmaddubsw        m0, m7, m1
+%if %3
     pmaddubsw        m1, m7, m2
+%endif
     pmaddubsw        m2, m7, m3
+%if %3
     pmaddubsw        m3, m7, m4
     paddw            m0, m1
     paddw            m2, m3
+%endif
     pmulhrsw         m0, m6
     pmulhrsw         m2, m6
+%else
+    movu             m0, [bufyq+xq]
+    pxor             m6, m6
+    pcmpgtb          m6, m0
+    punpckhbw        m2, m0, m6
+    punpcklbw        m0, m6
+%endif
     pmullw           m0, m5
     pmullw           m2, m5
     pmulhrsw         m0, m9
     pmulhrsw         m2, m9
+    movu             m1, [bufq+xq]
+    pxor             m4, m4
+    pcmpgtb          m4, m1
+    punpckhbw        m3, m1, m4
+%if %2
+    punpcklbw        m1, m4
+    paddw            m2, m3
+    paddw            m0, m1
+%else
+    punpcklbw        m6, m1, m4
+    paddw            m2, m3
+    paddw            m0, m6
+%endif
     packsswb         m0, m2
-    movu             m1, [bufq]
-    punpckhbw        m2, m0, m1
-    punpcklbw        m0, m1
-    pmaddubsw        m1, m7, m2
-    pmaddubsw        m2, m7, m0
-    packsswb         m2, m1
-    movu         [bufq], m2
-    add           bufyq, 32
-    add            bufq, 16
-    xor              hd, 0x10000
-    test             hd, 0x10000
-    jnz .y_loop_ar0
+%if %2
+    movu      [bufq+xq], m0
+    add              xd, 16
+    cmp              xd, 32
+    jl .x_loop_ar0
 
-    ; last 6 pixels
-    movu             m1, [bufyq]
-    movu             m2, [bufyq+82]
+    ; last 6/12 pixels
+    movu             m1, [bufyq+xq*(1+%2)]
+%if %3
+    movu             m2, [bufyq+xq*2+82]
+%endif
     pmaddubsw        m0, m7, m1
+%if %3
     pmaddubsw        m1, m7, m2
     paddw            m0, m1
+%endif
     pmulhrsw         m0, m6
     pmullw           m0, m5
     pmulhrsw         m0, m9
+    movq             m1, [bufq+xq]
+    pxor             m4, m4
+    pcmpgtb          m4, m1
+    punpcklbw        m2, m1, m4
+    paddw            m0, m2
     packsswb         m0, m0
-    movq             m1, [bufq]
-    punpcklbw        m0, m1
-    pmaddubsw        m2, m7, m0
-    packsswb         m2, m2
-    pandn            m0, m8, m2
+    pandn            m2, m8, m0
     pand             m1, m8
-    por              m0, m1
-    movq         [bufq], m0
+    por              m2, m1
+    movq      [bufq+xq], m2
+%else
+    add              xd, 16
+    cmp              xd, 80
+    je .y_loop_final_ar0
+    movu   [bufq+xq-16], m0
+    jmp .x_loop_ar0
+.y_loop_final_ar0:
+    pandn            m2, m8, m0
+    pand             m1, m8
+    por              m2, m1
+    movu   [bufq+xq-16], m2
+%endif
 
-    add            bufq, 82-32
-    add           bufyq, 82*2-64
+    add            bufq, 82
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar0
     RET
@@ -706,8 +770,10 @@
 %endif
     mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
     movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
+%if %2
     movd             m7, [base+pb_1]
-    movd             m6, [base+hmul_bits+4]
+    movd             m6, [base+hmul_bits+2+%3*2]
+%endif
     psrldq           m4, 1
 %if ARCH_X86_32
     DEFINE_ARGS buf, shift, val0, val3, min, max, x
@@ -718,40 +784,64 @@
 %endif
     pxor             m5, m5
     punpcklwd        m3, m5
+%if %2
     punpcklwd        m6, m6
+%endif
     pcmpgtb          m5, m4
     punpcklbw        m4, m5
     pshufd           m5, m4, q1111
     pshufd           m4, m4, q0000
     pshufd           m3, m3, q0000
+%if %2
     pshufd           m7, m7, q0000
     pshufd           m6, m6, q0000
-    sub            bufq, 82*38+44-(82*3+41)
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*69+3
+%endif
 %if ARCH_X86_32
     add            r1mp, 79+82*3
-    mov            r0mp, 35
+    mov            r0mp, 70-35*%3
 %else
     add           bufyq, 79+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 %endif
     mov            mind, -128
     mov            maxd, 127
 .y_loop_ar1:
-    mov              xq, -38
+    mov              xq, -(76>>%2)
     movsx         val3d, byte [bufq+xq-1]
 .x_loop_ar1:
+%if %2
 %if ARCH_X86_32
     mov              r2, r1mp
     movq             m0, [r2+xq*2]
+%if %3
     movq             m1, [r2+xq*2+82]
+%endif
 %else
     movq             m0, [bufyq+xq*2]
+%if %3
     movq             m1, [bufyq+xq*2+82]
 %endif
+%endif
     pmaddubsw        m2, m7, m0
+%if %3
     pmaddubsw        m0, m7, m1
     paddw            m2, m0
+%endif
     pmulhrsw         m2, m6
+%else
+%if ARCH_X86_32
+    mov              r2, r1mp
+    movd             m2, [r2+xq]
+%else
+    movd             m2, [bufyq+xq]
+%endif
+    pxor             m0, m0
+    pcmpgtb          m0, m2
+    punpcklbw        m2, m0
+%endif
 
     movq             m0, [bufq+xq-82-1]     ; top/left
     pxor             m1, m1
@@ -792,10 +882,10 @@
 .x_loop_ar1_end:
     add            bufq, 82
 %if ARCH_X86_32
-    add            r1mp, 82*2
+    add            r1mp, 82<<%3
     dec            r0mp
 %else
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
 %endif
     jg .y_loop_ar1
@@ -837,16 +927,20 @@
     SCRATCH           5, 13, 5
     SCRATCH           6, 14, 6
     SCRATCH           7, 15, 7
-    movd             m7, [base+hmul_bits+4]
+%if %2
+    movd             m7, [base+hmul_bits+2+%3*2]
     movd             m6, [base+pb_1]
     punpcklwd        m7, m7
     pshufd           m6, m6, q0000
     pshufd           m7, m7, q0000
-    sub            bufq, 82*38+44-(82*3+41)
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*69+3
+%endif
     add           bufyq, 79+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 .y_loop_ar2:
-    mov              xq, -38
+    mov              xq, -(76>>%2)
 
 .x_loop_ar2:
     pxor             m2, m2
@@ -879,12 +973,23 @@
     paddd            m2, m3
     paddd            m2, m4
 
-    movq             m0, [bufyq+xq*2]
+%if %2
+    movq             m1, [bufyq+xq*2]
+%if %3
     movq             m3, [bufyq+xq*2+82]
-    pmaddubsw        m1, m6, m0
-    pmaddubsw        m0, m6, m3
+%endif
+    pmaddubsw        m0, m6, m1
+%if %3
+    pmaddubsw        m1, m6, m3
     paddw            m0, m1
+%endif
     pmulhrsw         m0, m7
+%else
+    movd             m0, [bufyq+xq]
+    pxor             m1, m1
+    pcmpgtb          m1, m0
+    punpcklbw        m0, m1
+%endif
     punpcklwd        m0, m15
     pmaddwd          m0, m14
     paddd            m2, m0
@@ -914,7 +1019,7 @@
 
 .x_loop_ar2_end:
     add            bufq, 82
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar2
     RET
@@ -977,24 +1082,36 @@
     SCRATCH           5, 12, 11
 
     movd             m2, [base+round_vals-12+shiftq*2]
+%if %2
     movd             m1, [base+pb_1]
-    movd             m3, [base+hmul_bits+4]
+    movd             m3, [base+hmul_bits+2+%3*2]
+%endif
     pxor             m0, m0
     punpcklwd        m2, m0
+%if %2
     punpcklwd        m3, m3
+%endif
     pshufd           m2, m2, q0000
+%if %2
     pshufd           m1, m1, q0000
     pshufd           m3, m3, q0000
     SCRATCH           1, 13, 12
+%endif
     SCRATCH           2, 14, 13
+%if %2
     SCRATCH           3, 15, 14
+%endif
 
     DEFINE_ARGS buf, bufy, fg_data, h, unused, x
-    sub            bufq, 82*38+44-(82*3+41)
+%if %2
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*69+3
+%endif
     add           bufyq, 79+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 .y_loop_ar3:
-    mov              xq, -38
+    mov              xq, -(76>>%2)
 
 .x_loop_ar3:
     movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
@@ -1058,12 +1175,23 @@
     paddd            m3, m5
     paddd            m0, m3
 
+%if %2
     movq             m1, [bufyq+xq*2]
+%if %3
     movq             m3, [bufyq+xq*2+82]
-    pmaddubsw        m5, m13, m1
-    pmaddubsw        m7, m13, m3
+%endif
+    pmaddubsw        m7, m13, m1
+%if %3
+    pmaddubsw        m5, m13, m3
     paddw            m7, m5
+%endif
     pmulhrsw         m7, m15
+%else
+    movd             m7, [bufyq+xq]
+    pxor             m1, m1
+    pcmpgtb          m1, m7
+    punpcklbw        m7, m1
+%endif
 
     psrldq           m1, m2, 4
     psrldq           m3, m2, 6
@@ -1110,10 +1238,15 @@
 
 .x_loop_ar3_end:
     add            bufq, 82
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar3
     RET
+%endmacro
+
+generate_grain_uv_fn 420, 1, 1
+generate_grain_uv_fn 422, 1, 0
+generate_grain_uv_fn 444, 0, 0
 
 %macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
 %assign %%idx 0