ref: 80650d4ce3ff977420fab8fc9deeb3b4de98fb70
parent: 3811665793d30d62523855f87112d1e267996d53
author: Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
date: Mon Feb 25 12:42:51 EST 2019
x86: optimize AVX2 cdef filters. before: cdef_filter_4x4_8bpc_avx2: 110.4, after: cdef_filter_4x4_8bpc_avx2: 106.0; before: cdef_filter_4x8_8bpc_avx2: 188.3, after: cdef_filter_4x8_8bpc_avx2: 182.2; before: cdef_filter_8x8_8bpc_avx2: 276.7, after: cdef_filter_8x8_8bpc_avx2: 252.5. Credit to Gramner.
--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -94,20 +94,18 @@
psubw m6, m4 ; diff_p1(p1 - px)
pabsw m9, m5
pabsw m10, m6
- psraw m11, m9, %2
- psraw m12, m10, %2
- psubw m11, %3, m11
- psubw m12, %3, m12
- pmaxsw m11, m13
- pmaxsw m12, m13
- pminsw m11, m9
- pminsw m12, m10
- psignw m11, m5 ; constrain(diff_p0)
- psignw m12, m6 ; constrain(diff_p1)
- pmullw m11, %4 ; constrain(diff_p0) * taps
- pmullw m12, %4 ; constrain(diff_p1) * taps
- paddw m15, m11
- paddw m15, m12
+ psignw m11, %4, m5
+ psignw m12, %4, m6
+ psrlw m5, m9, %2
+ psrlw m6, m10, %2
+ psubusw m5, %3, m5
+ psubusw m6, %3, m6
+ pminsw m5, m9 ; constrain(diff_p0)
+ pminsw m6, m10 ; constrain(diff_p1)
+ pmullw m5, m11 ; constrain(diff_p0) * taps
+ pmullw m6, m12 ; constrain(diff_p1) * taps
+ paddw m15, m5
+ paddw m15, m6
%endmacro
%macro cdef_filter_fn 3 ; w, h, stride