shithub: dav1d

Download patch

ref: 80650d4ce3ff977420fab8fc9deeb3b4de98fb70
parent: 3811665793d30d62523855f87112d1e267996d53
author: Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
date: Mon Feb 25 12:42:51 EST 2019

x86: optimize AVX2 cdef filters

before: cdef_filter_4x4_8bpc_avx2: 110.4
 after: cdef_filter_4x4_8bpc_avx2: 106.0

before: cdef_filter_4x8_8bpc_avx2: 188.3
 after: cdef_filter_4x8_8bpc_avx2: 182.2

before: cdef_filter_8x8_8bpc_avx2: 276.7
 after: cdef_filter_8x8_8bpc_avx2: 252.5

Credit to Gramner.

--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -94,20 +94,18 @@
     psubw           m6, m4                      ; diff_p1(p1 - px)
     pabsw           m9, m5
     pabsw          m10, m6
-    psraw          m11, m9,  %2
-    psraw          m12, m10, %2
-    psubw          m11, %3, m11
-    psubw          m12, %3, m12
-    pmaxsw         m11, m13
-    pmaxsw         m12, m13
-    pminsw         m11, m9
-    pminsw         m12, m10
-    psignw         m11, m5                      ; constrain(diff_p0)
-    psignw         m12, m6                      ; constrain(diff_p1)
-    pmullw         m11, %4                      ; constrain(diff_p0) * taps
-    pmullw         m12, %4                      ; constrain(diff_p1) * taps
-    paddw          m15, m11
-    paddw          m15, m12
+    psignw         m11, %4, m5
+    psignw         m12, %4, m6
+    psrlw           m5, m9, %2
+    psrlw           m6, m10, %2
+    psubusw         m5, %3, m5
+    psubusw         m6, %3, m6
+    pminsw          m5, m9                      ; constrain(diff_p0)
+    pminsw          m6, m10                     ; constrain(diff_p1)
+    pmullw          m5, m11                     ; constrain(diff_p0) * taps
+    pmullw          m6, m12                     ; constrain(diff_p1) * taps
+    paddw          m15, m5
+    paddw          m15, m6
 %endmacro
 
 %macro cdef_filter_fn 3 ; w, h, stride