ref: 77c52be0076549020e75c259a10c6fd817b0996c
parent: 93c4bea2d45d7caf5cc6ab712d938dc6f74b98a2
author: Henrik Gramner <gramner@twoorioles.com>
date: Fri Oct 19 20:11:11 EDT 2018
Reorder the dav1d_filter_intra_taps array. Ordering the elements this way is more SIMD-friendly.
--- a/src/ipred.c
+++ b/src/ipred.c
@@ -553,7 +553,7 @@
filt_idx &= 511;
assert(filt_idx < 5);
- const int8_t (*const filter)[8] = dav1d_filter_intra_taps[filt_idx];
+ const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
int x, y;
ptrdiff_t left_stride;
const pixel *left, *topleft, *top;
@@ -568,19 +568,18 @@
const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
pixel *ptr = &dst[x];
- const int8_t (*flt_ptr)[8] = filter;
+ const int8_t *flt_ptr = filter;
for (int yy = 0; yy < 2; yy++) {
- for (int xx = 0; xx < 4; xx++, flt_ptr++) {
- int acc = flt_ptr[0][0] * p0 + flt_ptr[0][1] * p1 +
- flt_ptr[0][2] * p2 + flt_ptr[0][3] * p3 +
- flt_ptr[0][4] * p4 + flt_ptr[0][5] * p5 +
- flt_ptr[0][6] * p6;
+ for (int xx = 0; xx < 4; xx++, flt_ptr += 2) {
+ int acc = flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +
+ flt_ptr[16] * p2 + flt_ptr[17] * p3 +
+ flt_ptr[32] * p4 + flt_ptr[33] * p5 +
+ flt_ptr[48] * p6;
ptr[xx] = iclip_pixel((acc + 8) >> 4);
}
ptr += PXSTRIDE(stride);
}
-
left = &dst[x + 4 - 1];
left_stride = PXSTRIDE(stride);
top += 4;
--- a/src/tables.c
+++ b/src/tables.c
@@ -781,51 +781,51 @@
3, 0, 0, // 87, ...
};
-const int8_t dav1d_filter_intra_taps[5][8][8] = {
+const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
{
- { -6, 10, 0, 0, 0, 12, 0, 0 },
- { -5, 2, 10, 0, 0, 9, 0, 0 },
- { -3, 1, 1, 10, 0, 7, 0, 0 },
- { -3, 1, 1, 2, 10, 5, 0, 0 },
- { -4, 6, 0, 0, 0, 2, 12, 0 },
- { -3, 2, 6, 0, 0, 2, 9, 0 },
- { -3, 2, 2, 6, 0, 2, 7, 0 },
- { -3, 1, 2, 2, 6, 3, 5, 0 },
+ -6, 10, -5, 2, -3, 1, -3, 1,
+ -4, 6, -3, 2, -3, 2, -3, 1,
+ 0, 0, 10, 0, 1, 10, 1, 2,
+ 0, 0, 6, 0, 2, 6, 2, 2,
+ 0, 12, 0, 9, 0, 7, 10, 5,
+ 0, 2, 0, 2, 0, 2, 6, 3,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 12, 0, 9, 0, 7, 0, 5, 0
}, {
- { -10, 16, 0, 0, 0, 10, 0, 0 },
- { -6, 0, 16, 0, 0, 6, 0, 0 },
- { -4, 0, 0, 16, 0, 4, 0, 0 },
- { -2, 0, 0, 0, 16, 2, 0, 0 },
- { -10, 16, 0, 0, 0, 0, 10, 0 },
- { -6, 0, 16, 0, 0, 0, 6, 0 },
- { -4, 0, 0, 16, 0, 0, 4, 0 },
- { -2, 0, 0, 0, 16, 0, 2, 0 },
+ -10, 16, -6, 0, -4, 0, -2, 0,
+ -10, 16, -6, 0, -4, 0, -2, 0,
+ 0, 0, 16, 0, 0, 16, 0, 0,
+ 0, 0, 16, 0, 0, 16, 0, 0,
+ 0, 10, 0, 6, 0, 4, 16, 2,
+ 0, 0, 0, 0, 0, 0, 16, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 10, 0, 6, 0, 4, 0, 2, 0
}, {
- { -8, 8, 0, 0, 0, 16, 0, 0 },
- { -8, 0, 8, 0, 0, 16, 0, 0 },
- { -8, 0, 0, 8, 0, 16, 0, 0 },
- { -8, 0, 0, 0, 8, 16, 0, 0 },
- { -4, 4, 0, 0, 0, 0, 16, 0 },
- { -4, 0, 4, 0, 0, 0, 16, 0 },
- { -4, 0, 0, 4, 0, 0, 16, 0 },
- { -4, 0, 0, 0, 4, 0, 16, 0 },
+ -8, 8, -8, 0, -8, 0, -8, 0,
+ -4, 4, -4, 0, -4, 0, -4, 0,
+ 0, 0, 8, 0, 0, 8, 0, 0,
+ 0, 0, 4, 0, 0, 4, 0, 0,
+ 0, 16, 0, 16, 0, 16, 8, 16,
+ 0, 0, 0, 0, 0, 0, 4, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 16, 0, 16, 0, 16, 0, 16, 0
}, {
- { -2, 8, 0, 0, 0, 10, 0, 0 },
- { -1, 3, 8, 0, 0, 6, 0, 0 },
- { -1, 2, 3, 8, 0, 4, 0, 0 },
- { 0, 1, 2, 3, 8, 2, 0, 0 },
- { -1, 4, 0, 0, 0, 3, 10, 0 },
- { -1, 3, 4, 0, 0, 4, 6, 0 },
- { -1, 2, 3, 4, 0, 4, 4, 0 },
- { -1, 2, 2, 3, 4, 3, 3, 0 },
+ -2, 8, -1, 3, -1, 2, 0, 1,
+ -1, 4, -1, 3, -1, 2, -1, 2,
+ 0, 0, 8, 0, 3, 8, 2, 3,
+ 0, 0, 4, 0, 3, 4, 2, 3,
+ 0, 10, 0, 6, 0, 4, 8, 2,
+ 0, 3, 0, 4, 0, 4, 4, 3,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 10, 0, 6, 0, 4, 0, 3, 0
}, {
- { -12, 14, 0, 0, 0, 14, 0, 0 },
- { -10, 0, 14, 0, 0, 12, 0, 0 },
- { -9, 0, 0, 14, 0, 11, 0, 0 },
- { -8, 0, 0, 0, 14, 10, 0, 0 },
- { -10, 12, 0, 0, 0, 0, 14, 0 },
- { -9, 1, 12, 0, 0, 0, 12, 0 },
- { -8, 0, 0, 12, 0, 1, 11, 0 },
- { -7, 0, 0, 1, 12, 1, 9, 0 },
+ -12, 14, -10, 0, -9, 0, -8, 0,
+ -10, 12, -9, 1, -8, 0, -7, 0,
+ 0, 0, 14, 0, 0, 14, 0, 0,
+ 0, 0, 12, 0, 0, 12, 0, 1,
+ 0, 14, 0, 12, 0, 11, 14, 10,
+ 0, 0, 0, 0, 0, 1, 12, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 14, 0, 12, 0, 11, 0, 9, 0
}
};
--- a/src/tables.h
+++ b/src/tables.h
@@ -115,6 +115,6 @@
extern const uint8_t dav1d_sm_weights[128];
extern const int16_t dav1d_dr_intra_derivative[90];
-extern const int8_t dav1d_filter_intra_taps[5][8][8];
+extern const int8_t dav1d_filter_intra_taps[5][64];
#endif /* __DAV1D_SRC_TABLES_H__ */