shithub: dav1d

Download patch

ref: 19b4c9c076c5542901871be2b50d9084980cffce
parent: df51117648f0cd5d5a79439ac0f8687b683f335e
author: Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
date: Sun Feb 24 10:14:15 EST 2019

x86: add AVX2 cdef_filter_4x8

used for YUV 4:2:2 chroma blocks

cdef_filter_4x8_8bpc_c: 2711.6
cdef_filter_4x8_8bpc_avx2: 189.1

--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -112,8 +112,13 @@
 
 %macro cdef_filter_fn 3 ; w, h, stride
 INIT_YMM avx2
+%if %1 != 4 || %2 != 8
 cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \
                            dst, stride, left, top, pri, sec, stride3, dst4, edge
+%else
+cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
+                           dst, stride, left, top, pri, sec, stride3, dst4, edge
+%endif
 %define px rsp+2*16+2*%3
     pcmpeqw        m14, m14
     psrlw          m14, 1                   ; 0x7fff
@@ -175,7 +180,20 @@
     movd [px+2*%3+%1*2], xm14
     movd [px+3*%3+%1*2], xm14
 %if %2 == 8
-    ; FIXME w == 4
+ %if %1 == 4
+    movd           xm1, [dst4q+strideq*0]
+    movd           xm2, [dst4q+strideq*1]
+    movd           xm3, [dst4q+strideq*2]
+    movd           xm4, [dst4q+stride3q]
+    pmovzxbw       xm1, xm1
+    pmovzxbw       xm2, xm2
+    pmovzxbw       xm3, xm3
+    pmovzxbw       xm4, xm4
+    movq     [px+4*%3], xm1
+    movq     [px+5*%3], xm2
+    movq     [px+6*%3], xm3
+    movq     [px+7*%3], xm4
+ %else
     pmovzxbw       xm1, [dst4q+strideq*0]
     pmovzxbw       xm2, [dst4q+strideq*1]
     pmovzxbw       xm3, [dst4q+strideq*2]
@@ -184,6 +202,7 @@
     mova     [px+5*%3], xm2
     mova     [px+6*%3], xm3
     mova     [px+7*%3], xm4
+ %endif
     movd [px+4*%3+%1*2], xm14
     movd [px+5*%3+%1*2], xm14
     movd [px+6*%3+%1*2], xm14
@@ -376,7 +395,11 @@
     mov           dird, r6m
     lea           tapq, [tapq+dirq*2+12]
 %if %1*%2*2/mmsize > 1
+ %if %1 == 4
+    DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
+ %else
     DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
+ %endif
     mov             hd, %1*%2*2/mmsize
 %else
     DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
@@ -431,8 +454,9 @@
 %endif
 
 %if %1*%2*2/mmsize > 1
-    lea           dstq, [dstq+strideq*2]
-    add           stkq, %3*2
+ %define vloop_lines (mmsize/(%1*2))
+    lea           dstq, [dstq+strideq*vloop_lines]
+    add           stkq, %3*vloop_lines
     dec             hd
     jg .v_loop
 %endif
@@ -441,6 +465,7 @@
 %endmacro
 
 cdef_filter_fn 8, 8, 32
+cdef_filter_fn 4, 8, 32
 cdef_filter_fn 4, 4, 32
 
 INIT_YMM avx2
--- a/src/x86/cdef_init_tmpl.c
+++ b/src/x86/cdef_init_tmpl.c
@@ -29,6 +29,7 @@
 #include "src/cdef.h"
 
 decl_cdef_fn(dav1d_cdef_filter_8x8_avx2);
+decl_cdef_fn(dav1d_cdef_filter_4x8_avx2);
 decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
 
 decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
@@ -41,6 +42,7 @@
 #if BITDEPTH == 8 && ARCH_X86_64
     c->dir = dav1d_cdef_dir_avx2;
     c->fb[0] = dav1d_cdef_filter_8x8_avx2;
+    c->fb[1] = dav1d_cdef_filter_4x8_avx2;
     c->fb[2] = dav1d_cdef_filter_4x4_avx2;
 #endif
 }