shithub: dav1d

Download patch

ref: 6b5e8cc587d544ab399488fa032640268856da9c
parent: 3b14f94924987fc66302bb8b803ad9065816f93e
author: Sigrid Solveig Haflínudóttir <sigrid@ftrv.se>
date: Wed Dec 31 20:50:10 EST 1969

inline and unroll more

--- a/include/common/intops.h
+++ b/include/common/intops.h
@@ -31,54 +31,19 @@
 #include <stdint.h>
 
 #include "common/attributes.h"
+#define imax(a, b) ((int)((int)(a) > (int)(b) ? (a) : (b))) /* NOTE: the selected argument is evaluated twice; pass side-effect-free expressions */
+#define imin(a, b) ((int)((int)(a) < (int)(b) ? (a) : (b))) /* NOTE: the selected argument is evaluated twice */
+#define umax(a, b) ((unsigned)((unsigned)(a) > (unsigned)(b) ? (a) : (b))) /* NOTE: the selected argument is evaluated twice */
+#define umin(a, b) ((unsigned)((unsigned)(a) < (unsigned)(b) ? (a) : (b))) /* NOTE: the selected argument is evaluated twice */
+#define iclip_u8(v) iclip((v), 0, 255)
+#define apply_sign(v, s) ((int)(s) < 0 ? -(int)(v) : (int)(v))
+#define apply_sign64(v, s) ((int64_t)(s) < 0 ? -(int)(v) : (int)(v))
+#define ulog2(v) ((int)(31 - clz((unsigned)(v))))
+#define u64log2(v) ((int)(63 - clzll((uint64_t)(v))))
+#define inv_recenter(r, v) ((unsigned)((unsigned)(v) > ((unsigned)(r)<<1) ? (v) : (((v)&1) == 0) ? (((unsigned)(v)>>1) + (unsigned)(r)) : ((unsigned)(r) - (((unsigned)(v)+1)>>1)))) /* NOTE: r and v are evaluated more than once */
 
-static inline int imax(const int a, const int b) {
-    return a > b ? a : b;
-}
-
-static inline int imin(const int a, const int b) {
-    return a < b ? a : b;
-}
-
-static inline unsigned umax(const unsigned a, const unsigned b) {
-    return a > b ? a : b;
-}
-
-static inline unsigned umin(const unsigned a, const unsigned b) {
-    return a < b ? a : b;
-}
-
 static inline int iclip(const int v, const int min, const int max) {
     return v < min ? min : v > max ? max : v;
-}
-
-static inline int iclip_u8(const int v) {
-    return iclip(v, 0, 255);
-}
-
-static inline int apply_sign(const int v, const int s) {
-    return s < 0 ? -v : v;
-}
-
-static inline int apply_sign64(const int v, const int64_t s) {
-    return s < 0 ? -v : v;
-}
-
-static inline int ulog2(const unsigned v) {
-    return 31 - clz(v);
-}
-
-static inline int u64log2(const uint64_t v) {
-    return 63 - clzll(v);
-}
-
-static inline unsigned inv_recenter(const unsigned r, const unsigned v) {
-    if (v > (r << 1))
-        return v;
-    else if ((v & 1) == 0)
-        return (v >> 1) + r;
-    else
-        return r - ((v + 1) >> 1);
 }
 
 #endif /* DAV1D_COMMON_INTOPS_H */
--- a/src/looprestoration_tmpl.c
+++ b/src/looprestoration_tmpl.c
@@ -156,15 +156,12 @@
     const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
     for (int j = 0; j < h + 6; j++) {
         for (int i = 0; i < w; i++) {
-            int sum = (1 << (bitdepth + 6));
+            int sum = (1 << (bitdepth + 6)) +
 #if BITDEPTH == 8
+                      tmp_ptr[i + 3] * 128 +
 #endif
+                      (((tmp_ptr[i + 0] * filter[0][0] + tmp_ptr[i + 1] * filter[0][1]) + (tmp_ptr[i + 2] * filter[0][2] + tmp_ptr[i + 3] * filter[0][3])) + (tmp_ptr[i + 4] * filter[0][4] + tmp_ptr[i + 5] * filter[0][5])) + tmp_ptr[i + 6] * filter[0][6]; /* unrolled 7-tap sum over tmp_ptr[i..i+6] with filter[0] */
 
-            for (int k = 0; k < 7; k++) {
-                sum += tmp_ptr[i + k] * filter[0][k];
-            }
-
             hor_ptr[i] =
                 iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
         }
@@ -177,11 +174,7 @@
     const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
     for (int j = 0; j < h; j++) {
         for (int i = 0; i < w; i++) {
-            int sum = -round_offset;
-
-            for (int k = 0; k < 7; k++) {
-                sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filter[1][k];
-            }
+            int sum = -round_offset + hor[(j + 0) * REST_UNIT_STRIDE + i] * filter[1][0] + hor[(j + 1) * REST_UNIT_STRIDE + i] * filter[1][1] + hor[(j + 2) * REST_UNIT_STRIDE + i] * filter[1][2] + hor[(j + 3) * REST_UNIT_STRIDE + i] * filter[1][3] + hor[(j + 4) * REST_UNIT_STRIDE + i] * filter[1][4] + hor[(j + 5) * REST_UNIT_STRIDE + i] * filter[1][5] + hor[(j + 6) * REST_UNIT_STRIDE + i] * filter[1][6]; /* unrolled 7-tap sum over rows j..j+6 of hor with filter[1], added left to right */
 
             p[j * PXSTRIDE(p_stride) + i] =
                 iclip_pixel((sum + rounding_off_v) >> round_bits_v);
--- a/src/mc_tmpl.c
+++ b/src/mc_tmpl.c
@@ -809,13 +809,17 @@
 
     src -= 3 * PXSTRIDE(src_stride);
     for (int y = 0; y < 15; y++, mx += abcd[1]) {
-        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
-            const int8_t *const filter =
-                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
+        int tmx = mx;
 
-            mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1,
-                                         7 - intermediate_bits);
-        }
+        mid_ptr[0] = FILTER_WARP_RND(src, 0, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0]; /* tmx selects the filter row for this column, then steps by abcd[0] */
+        mid_ptr[1] = FILTER_WARP_RND(src, 1, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+        mid_ptr[2] = FILTER_WARP_RND(src, 2, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+        mid_ptr[3] = FILTER_WARP_RND(src, 3, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+        mid_ptr[4] = FILTER_WARP_RND(src, 4, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+        mid_ptr[5] = FILTER_WARP_RND(src, 5, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+        mid_ptr[6] = FILTER_WARP_RND(src, 6, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+        mid_ptr[7] = FILTER_WARP_RND(src, 7, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); /* last column: tmx is not read again in this row, so no step */
+
         src += PXSTRIDE(src_stride);
         mid_ptr += 8;
     }
@@ -822,12 +826,17 @@
 
     mid_ptr = &mid[3 * 8];
     for (int y = 0; y < 8; y++, my += abcd[3]) {
-        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
-            const int8_t *const filter =
-                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
+        int tmy = my;
 
-            tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7) - PREP_BIAS;
-        }
+        tmp[0] = FILTER_WARP_RND(mid_ptr, 0, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2]; /* tmy selects the filter row for this column, then steps by abcd[2] */
+        tmp[1] = FILTER_WARP_RND(mid_ptr, 1, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+        tmp[2] = FILTER_WARP_RND(mid_ptr, 2, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+        tmp[3] = FILTER_WARP_RND(mid_ptr, 3, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+        tmp[4] = FILTER_WARP_RND(mid_ptr, 4, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+        tmp[5] = FILTER_WARP_RND(mid_ptr, 5, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+        tmp[6] = FILTER_WARP_RND(mid_ptr, 6, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+        tmp[7] = FILTER_WARP_RND(mid_ptr, 7, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; /* last column: tmy is not read again in this row, so no step */
+
         mid_ptr += 8;
         tmp += tmp_stride;
     }