ref: a1647a59bdba00a2b7385f4e345804a6399703b8
parent: 07dab8cb71771a5d8506067beb67c50f0a7ec2d7
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Fri Nov 1 07:22:16 EDT 2019
Tiny improvements to generate_grain_uv_420. Before: gen_grain_uv_ar2_8bpc_420_avx2: 29176.2; After: gen_grain_uv_ar2_8bpc_420_avx2: 26794.0
--- a/src/x86/film_grain.asm
+++ b/src/x86/film_grain.asm
@@ -609,6 +609,8 @@
movd xm15, [base+hmul_bits-10+shiftq*2]
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
+ vpbroadcastw xm7, [base+hmul_bits+4]
+ vpbroadcastd xm6, [base+pb_1]
DEFINE_ARGS buf, bufy, h, x
pshufd xm12, xm9, q0000
pshufd xm13, xm9, q1111
@@ -639,31 +641,28 @@
psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5]
psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5]
- psrldq xm6, xm0, 8 ; y=-2,x=[+2,+5]
+ psrldq xm0, 8 ; y=-2,x=[+2,+5]
punpcklwd xm4, xm5
- punpcklwd xm6, xm1
- psrldq xm7, xm1, 6 ; y=-1,x=[+1,+5]
+ punpcklwd xm0, xm1
+ psrldq xm3, xm1, 6 ; y=-1,x=[+1,+5]
psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5]
- punpcklwd xm7, xm1
+ punpcklwd xm3, xm1
pmaddwd xm4, xm9
- pmaddwd xm6, xm10
- pmaddwd xm7, xm12
- paddd xm4, xm6
- paddd xm2, xm7
+ pmaddwd xm0, xm10
+ pmaddwd xm3, xm12
+ paddd xm4, xm0
+ paddd xm2, xm3
paddd xm2, xm4
- vpbroadcastd xm4, [base+pb_1]
- movq xm6, [bufyq+xq*2]
- movq xm7, [bufyq+xq*2+82]
- pmaddubsw xm6, xm4, xm6
- pmaddubsw xm7, xm4, xm7
- vpbroadcastw xm4, [base+hmul_bits+4]
- paddw xm6, xm7
- pmulhrsw xm6, xm4
- pxor xm7, xm7
- punpcklwd xm6, xm7
- pmaddwd xm6, xm14
- paddd xm2, xm6
+ movq xm0, [bufyq+xq*2]
+ movq xm3, [bufyq+xq*2+82]
+ pmaddubsw xm0, xm6, xm0
+ pmaddubsw xm3, xm6, xm3
+ paddw xm0, xm3
+ pmulhrsw xm0, xm7
+ punpcklwd xm0, xm0
+ pmaddwd xm0, xm14
+ paddd xm2, xm0
movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
@@ -807,8 +806,7 @@
pmaddubsw xm1, xm13, xm1
pmaddubsw xm2, xm13, xm2
paddw xm1, xm2
- vpbroadcastw xm3, xm15
- pmulhrsw xm1, xm3
+ pmulhrsw xm1, xm15
punpcklwd xm6, xm7
punpcklwd xm8, xm9