ref: e3b5d4d044506f9e0e95e79b3de42fd94386cc61
parent: 01386d4c486f2892eec6cd7552d20bfec376b2ee
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Mon Nov 5 04:27:35 EST 2018
Use grouped context setting. Decreases runtime of decoding first 1000 frames of Chimera (1080p, 8bit) from 12.227 to 12.075s (average of 6 runs) after changing decode.c, and further down to 12.027s (1.67%) with the changes to recon_tmpl.c included. After the changes to lf_mask.c, it goes down to 11.842s.
--- a/include/common/attributes.h
+++ b/include/common/attributes.h
@@ -32,6 +32,12 @@
#include <stddef.h>
+/* ATTR_ALIAS: type attribute that lets objects of the annotated type alias
+ * objects of any other type (GCC-style may_alias). It expands to nothing
+ * when __GNUC__ is not defined. */
+#ifdef __GNUC__
+#define ATTR_ALIAS __attribute__((may_alias))
+#else
+#define ATTR_ALIAS
+#endif
+
#if ARCH_X86
#define ALIGN_32_VAL 32
#define ALIGN_16_VAL 16
--- /dev/null
+++ b/src/ctx.h
@@ -1,0 +1,91 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __DAV1D_SRC_CTX_H__
+#define __DAV1D_SRC_CTX_H__
+
+#include <stdint.h>
+
+#include "common/attributes.h"
+
+/* Store-width helper types: each union overlays one N-bit integer with the
+ * same number of bytes, so a run of N/8 byte-sized context entries can be
+ * written with a single N-bit store. The multi-byte variants carry
+ * ATTR_ALIAS; alias8 is a plain byte and does not. */
+union alias64 { uint64_t u64; uint8_t u8[8]; } ATTR_ALIAS;
+union alias32 { uint32_t u32; uint8_t u8[4]; } ATTR_ALIAS;
+union alias16 { uint16_t u16; uint8_t u8[2]; } ATTR_ALIAS;
+union alias8 { uint8_t u8; };
+
+/* Replicated stores into the byte array expression `var`, starting at
+ * element `off`:
+ *   set_ctx_rep4: four 64-bit stores of `val` (elements off .. off+31)
+ *   set_ctx_rep2: two 64-bit stores of `val`  (elements off .. off+15)
+ *   set_ctx_rep1: one store of `val`, `typesz` bits wide (8/16/32/64)
+ * The first argument of rep4/rep2 is unused; it only keeps the three macros
+ * call-compatible. `off` and `val` are expanded without parentheses, so
+ * `off` must be safe next to `+ N` (no shifts or bitwise operators).
+ * NOTE(review): the stores go through a cast pointer and therefore assume
+ * &var[off] is suitably aligned for the access width - confirm per caller. */
+#define set_ctx_rep4(type, var, off, val) do { \
+ const uint64_t const_val = val; \
+ ((union alias64 *) &var[off + 0])->u64 = const_val; \
+ ((union alias64 *) &var[off + 8])->u64 = const_val; \
+ ((union alias64 *) &var[off + 16])->u64 = const_val; \
+ ((union alias64 *) &var[off + 24])->u64 = const_val; \
+ } while (0)
+#define set_ctx_rep2(type, var, off, val) do { \
+ const uint64_t const_val = val; \
+ ((union alias64 *) &var[off + 0])->u64 = const_val; \
+ ((union alias64 *) &var[off + 8])->u64 = const_val; \
+ } while (0)
+#define set_ctx_rep1(typesz, var, off, val) \
+ ((union alias##typesz *) &var[off])->u##typesz = val
+/* Size dispatchers. Each macro switches on `var` (number of byte entries to
+ * fill) and expands a macro named set_ctx that the CALLER must #define
+ * before use (and normally #undef afterwards):
+ *
+ *   set_ctx(type, dir, diridx, off, mul, rep_macro)
+ *
+ *   type      - store width in bits for set_ctx_rep1 (8/16/32/64); empty
+ *               for the 16- and 32-entry cases, which use rep2/rep4
+ *   dir       - forwarded verbatim from the case_set*() call
+ *   diridx    - forwarded verbatim from the case_set*() call
+ *   off       - forwarded verbatim from the case_set*() call
+ *   mul       - 0x01 repeated once per byte of the store, so `mul * v`
+ *               broadcasts the byte value v (v must fit in 8 bits) to every
+ *               entry; passing v without `mul *` only sets the lowest byte
+ *               of each store and zeroes the others
+ *   rep_macro - the set_ctx_rep* macro to invoke for this size
+ *
+ * case_set handles sizes 1, 2, 4, 8, 16 and 32; case_set_upto16 omits 32.
+ * Neither has a default label, so any other size does nothing. The
+ * *_with_default variants additionally expand a caller-defined
+ * default_memset(dir, diridx, off, var) for every other size. */
+#define case_set(var, dir, diridx, off) \
+ switch (var) { \
+ case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+ case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+ case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+ case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+ case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+ case 32: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
+ }
+#define case_set_upto16(var, dir, diridx, off) \
+ switch (var) { \
+ case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+ case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+ case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+ case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+ case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+ }
+#define case_set_upto32_with_default(var, dir, diridx, off) \
+ switch (var) { \
+ case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+ case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+ case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+ case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+ case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+ case 32: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
+ default: default_memset(dir, diridx, off, var); break; \
+ }
+#define case_set_upto16_with_default(var, dir, diridx, off) \
+ switch (var) { \
+ case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+ case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+ case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+ case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+ case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+ default: default_memset(dir, diridx, off, var); break; \
+ }
+
+#endif /* __DAV1D_SRC_CTX_H__ */
--- a/src/decode.c
+++ b/src/decode.c
@@ -38,6 +38,7 @@
#include "common/intops.h"
#include "common/mem.h"
+#include "src/ctx.h"
#include "src/decode.h"
#include "src/dequant_tables.h"
#include "src/env.h"
@@ -171,8 +172,14 @@
}
t->by -= txsh;
} else {
- memset(&t->a->tx[bx4], is_split ? TX_4X4 : txw, t_dim->w);
- memset(&t->l.tx[by4], is_split ? TX_4X4 : txh, t_dim->h);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txh)
+ case_set_upto16(t_dim->h, l., 1, by4);
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txw)
+ case_set_upto16(t_dim->w, a->, 0, bx4);
+#undef set_ctx
}
}
@@ -611,13 +618,19 @@
{
b->max_ytx = b->uvtx = TX_4X4;
if (f->frame_hdr.txfm_mode == TX_SWITCHABLE) {
- memset(&t->a->tx[bx4], TX_4X4, bw4);
- memset(&t->l.tx[by4], TX_4X4, bh4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx, off, TX_4X4)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
}
} else if (f->frame_hdr.txfm_mode != TX_SWITCHABLE || b->skip) {
if (f->frame_hdr.txfm_mode == TX_SWITCHABLE) {
- memset(&t->a->tx[bx4], b_dim[2], bw4);
- memset(&t->l.tx[by4], b_dim[3], bh4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx, off, mul * b_dim[2 + diridx])
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
} else {
assert(f->frame_hdr.txfm_mode == TX_LARGEST);
}
@@ -694,14 +707,22 @@
if (b->intra) {
f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
- if (has_chroma) {
- memset(&t->l.uvmode[cby4], b->uv_mode, cbh4);
- memset(&t->a->uvmode[cbx4], b->uv_mode, cbw4);
- }
const enum IntraPredMode y_mode_nofilt =
b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
- memset(&t->l.mode[by4], y_mode_nofilt, bh4);
- memset(&t->a->mode[bx4], y_mode_nofilt, bw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
+ rep_macro(type, t->dir intra, off, mul)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ }
} else {
if (b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP) {
uint64_t mask[2] = { 0, 0 };
@@ -712,17 +733,22 @@
f->bd_fn.recon_b_inter(t, bs, b);
const uint8_t *const filter = dav1d_filter_dir[b->filter2d];
- memset(&t->l.filter[0][by4], filter[0], bh4);
- memset(&t->a->filter[0][bx4], filter[0], bw4);
- memset(&t->l.filter[1][by4], filter[1], bh4);
- memset(&t->a->filter[1][bx4], filter[1], bw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
+ rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
+ rep_macro(type, t->dir intra, off, 0)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
if (has_chroma) {
- memset(&t->l.uvmode[cby4], DC_PRED, cbh4);
- memset(&t->a->uvmode[cbx4], DC_PRED, cbw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
}
}
- memset(&t->l.intra[by4], b->intra, bh4);
- memset(&t->a->intra[bx4], b->intra, bw4);
return 0;
}
@@ -1106,14 +1132,29 @@
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
// update contexts
- memset(&t->a->tx_intra[bx4], t_dim->lw, bw4);
- memset(&t->l.tx_intra[by4], t_dim->lh, bh4);
+/* Context update for an intra-coded block. Expanded once per direction by
+ * case_set(): diridx 0 = above (t->a->, uses t_dim->lw), diridx 1 = left
+ * (t->l., uses t_dim->lh). `mul` is the 0x01-replicated multiplier supplied
+ * by case_set, so `mul * v` writes the byte v into every covered entry.
+ * In inter frames (frame_type & 1) the inter-only contexts are reset to
+ * their "intra neighbour" values: comp_type = COMP_INTER_NONE, ref = -1,
+ * filter = N_SWITCHABLE_FILTERS - the same values the replaced memset()
+ * calls wrote. */
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx_intra, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
+ rep_macro(type, t->dir tx, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
+ rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
+ rep_macro(type, t->dir pal_sz, off, mul * b->pal_sz[0]); \
+ rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+ rep_macro(type, t->dir skip_mode, off, 0); \
+ rep_macro(type, t->dir intra, off, mul); \
+ rep_macro(type, t->dir skip, off, mul * b->skip); \
+ /* see aomedia bug 2183 for why we use luma coordinates here */ \
+ rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \
+ if (f->frame_hdr.frame_type & 1) { \
+ /* must be COMP_INTER_NONE, not the skip flag */ \
+ rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \
+ rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \
+ rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \
+ rep_macro(type, t->dir filter[0], off, mul * N_SWITCHABLE_FILTERS); \
+ rep_macro(type, t->dir filter[1], off, mul * N_SWITCHABLE_FILTERS); \
+ }
const enum IntraPredMode y_mode_nofilt =
b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
- memset(&t->l.mode[by4], y_mode_nofilt, bh4);
- memset(&t->a->mode[bx4], y_mode_nofilt, bw4);
- memset(&t->l.pal_sz[by4], b->pal_sz[0], bh4);
- memset(&t->a->pal_sz[bx4], b->pal_sz[0], bw4);
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
if (b->pal_sz[0]) {
uint16_t *const pal = f->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
@@ -1124,11 +1165,11 @@
memcpy(t->al_pal[1][by4 + y][0], pal, 16);
}
if (has_chroma) {
- memset(&t->l.uvmode[cby4], b->uv_mode, cbh4);
- memset(&t->a->uvmode[cbx4], b->uv_mode, cbw4);
- // see aomedia bug 2183 for why we use luma coordinates here
- memset(&t->pal_sz_uv[1][by4], b->pal_sz[1], bh4);
- memset(&t->pal_sz_uv[0][bx4], b->pal_sz[1], bw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
if (b->pal_sz[1]) for (int pl = 1; pl < 3; pl++) {
uint16_t *const pal = f->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
@@ -1139,28 +1180,11 @@
for (int y = 0; y < bh4; y++)
memcpy(t->al_pal[1][by4 + y][pl], pal, 16);
}
- } else { // see aomedia bug 2183 for why we reset this
- memset(&t->pal_sz_uv[1][by4], 0, bh4);
- memset(&t->pal_sz_uv[0][bx4], 0, bw4);
}
if ((f->frame_hdr.frame_type & 1) || f->frame_hdr.allow_intrabc) {
- memset(&t->a->tx[bx4], t_dim->lw, bw4);
- memset(&t->l.tx[by4], t_dim->lh, bh4);
splat_intraref(f->mvs, f->b4_stride, t->by, t->bx, bs,
y_mode_nofilt);
}
- if (f->frame_hdr.frame_type & 1) {
- memset(&t->l.comp_type[by4], COMP_INTER_NONE, bh4);
- memset(&t->a->comp_type[bx4], COMP_INTER_NONE, bw4);
- memset(&t->l.ref[0][by4], -1, bh4);
- memset(&t->a->ref[0][bx4], -1, bw4);
- memset(&t->l.ref[1][by4], -1, bh4);
- memset(&t->a->ref[1][bx4], -1, bw4);
- memset(&t->l.filter[0][by4], N_SWITCHABLE_FILTERS, bh4);
- memset(&t->a->filter[0][bx4], N_SWITCHABLE_FILTERS, bw4);
- memset(&t->l.filter[1][by4], N_SWITCHABLE_FILTERS, bh4);
- memset(&t->a->filter[1][bx4], N_SWITCHABLE_FILTERS, bw4);
- }
} else if (!(f->frame_hdr.frame_type & 1)) {
// intra block copy
candidate_mv mvstack[8];
@@ -1259,18 +1283,25 @@
splat_intrabc_mv(f->mvs, f->b4_stride, t->by, t->bx, bs, b->mv[0]);
- memset(&t->a->tx_intra[bx4], b_dim[2], bw4);
- memset(&t->l.tx_intra[by4], b_dim[3], bh4);
- memset(&t->l.mode[by4], DC_PRED, bh4);
- memset(&t->a->mode[bx4], DC_PRED, bw4);
- memset(&t->l.pal_sz[by4], 0, bh4);
- memset(&t->a->pal_sz[bx4], 0, bw4);
- // see aomedia bug 2183 for why this is outside if (has_chroma)
- memset(&t->pal_sz_uv[1][by4], 0, bh4);
- memset(&t->pal_sz_uv[0][bx4], 0, bw4);
+/* Context update for an intra-block-copy block. Expanded once per direction
+ * by case_set(): diridx 0 = above (t->a->, b_dim[2]), diridx 1 = left
+ * (t->l., b_dim[3]). Every non-zero value must be multiplied by `mul` (the
+ * 0x01-replicated multiplier from case_set) so that it lands in every
+ * covered entry; without it only the first entry of each store is set and
+ * the rest are cleared. */
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
+ rep_macro(type, t->dir mode, off, mul * DC_PRED); \
+ rep_macro(type, t->dir pal_sz, off, 0); \
+ /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+ rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
+ rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+ rep_macro(type, t->dir skip_mode, off, 0); \
+ rep_macro(type, t->dir intra, off, 0); \
+ rep_macro(type, t->dir skip, off, mul * b->skip)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
if (has_chroma) {
- memset(&t->l.uvmode[cby4], DC_PRED, cbh4);
- memset(&t->a->uvmode[cbx4], DC_PRED, cbw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
}
} else {
// inter-specific mode/mv coding
@@ -1764,29 +1795,33 @@
b->inter_mode, b->ref[0], b->mv[0],
b->interintra_type);
}
- memset(&t->l.pal_sz[by4], 0, bh4);
- memset(&t->a->pal_sz[bx4], 0, bw4);
- // see aomedia bug 2183 for why this is outside if (has_chroma)
- memset(&t->pal_sz_uv[1][by4], 0, bh4);
- memset(&t->pal_sz_uv[0][bx4], 0, bw4);
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+ rep_macro(type, t->dir skip_mode, off, mul * b->skip_mode); \
+ rep_macro(type, t->dir intra, off, 0); \
+ rep_macro(type, t->dir skip, off, mul * b->skip); \
+ rep_macro(type, t->dir pal_sz, off, 0); \
+ /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+ rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
+ rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
+ rep_macro(type, t->dir comp_type, off, mul * b->comp_type); \
+ rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
+ rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
+ rep_macro(type, t->dir mode, off, mul * b->inter_mode); \
+ rep_macro(type, t->dir ref[0], off, mul * b->ref[0]); \
+ rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) b->ref[1]))
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
if (has_chroma) {
- memset(&t->l.uvmode[cby4], DC_PRED, cbh4);
- memset(&t->a->uvmode[cbx4], DC_PRED, cbw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
}
- memset(&t->a->tx_intra[bx4], b_dim[2], bw4);
- memset(&t->l.tx_intra[by4], b_dim[3], bh4);
- memset(&t->l.comp_type[by4], b->comp_type, bh4);
- memset(&t->a->comp_type[bx4], b->comp_type, bw4);
- memset(&t->l.filter[0][by4], filter[0], bh4);
- memset(&t->a->filter[0][bx4], filter[0], bw4);
- memset(&t->l.filter[1][by4], filter[1], bh4);
- memset(&t->a->filter[1][bx4], filter[1], bw4);
- memset(&t->l.mode[by4], b->inter_mode, bh4);
- memset(&t->a->mode[bx4], b->inter_mode, bw4);
- memset(&t->l.ref[0][by4], b->ref[0], bh4);
- memset(&t->a->ref[0][bx4], b->ref[0], bw4);
- memset(&t->l.ref[1][by4], b->ref[1], bh4);
- memset(&t->a->ref[1][bx4], b->ref[1], bw4);
}
// update contexts
@@ -1794,19 +1829,14 @@
f->frame_hdr.segmentation.update_map)
{
uint8_t *seg_ptr = &f->cur_segmap[t->by * f->b4_stride + t->bx];
- for (int y = 0; y < bh4; y++) {
- memset(seg_ptr, b->seg_id, bw4);
- seg_ptr += f->b4_stride;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ for (int y = 0; y < bh4; y++) { \
+ rep_macro(type, seg_ptr, 0, mul * b->seg_id); \
+ seg_ptr += f->b4_stride; \
}
+ case_set(bw4, NULL, 0, 0);
+#undef set_ctx
}
- memset(&t->l.seg_pred[by4], seg_pred, bh4);
- memset(&t->a->seg_pred[bx4], seg_pred, bw4);
- memset(&t->l.skip_mode[by4], b->skip_mode, bh4);
- memset(&t->a->skip_mode[bx4], b->skip_mode, bw4);
- memset(&t->l.intra[by4], b->intra, bh4);
- memset(&t->a->intra[bx4], b->intra, bw4);
- memset(&t->l.skip[by4], b->skip, bh4);
- memset(&t->a->skip[bx4], b->skip, bw4);
if (!b->skip) {
uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4];
const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15);
@@ -2081,8 +2111,11 @@
}
if (f->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) {
- memset(&t->a->partition[bx8], dav1d_al_part_ctx[0][bl][bp], hsz);
- memset(&t->l.partition[by8], dav1d_al_part_ctx[1][bl][bp], hsz);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->a->partition, bx8, mul * dav1d_al_part_ctx[0][bl][bp]); \
+ rep_macro(type, t->l.partition, by8, mul * dav1d_al_part_ctx[1][bl][bp])
+ case_set_upto16(hsz,,,);
+#undef set_ctx
}
return 0;
--- a/src/env.h
+++ b/src/env.h
@@ -38,23 +38,23 @@
#include "src/tables.h"
typedef struct BlockContext {
- uint8_t mode[32];
- uint8_t lcoef[32];
- uint8_t ccoef[2][32];
- uint8_t seg_pred[32];
- uint8_t skip[32];
- uint8_t skip_mode[32];
- uint8_t intra[32];
- uint8_t comp_type[32];
- int8_t ref[2][32]; // -1 means intra
- uint8_t filter[2][32]; // 3 means unset
- int8_t tx_intra[32];
- int8_t tx[32];
- uint8_t tx_lpf_y[32];
- uint8_t tx_lpf_uv[32];
- uint8_t partition[16];
- uint8_t uvmode[32];
- uint8_t pal_sz[32];
+ /* NOTE(review): every array is now declared through ALIGN(..., 8),
+  * presumably so the grouped stores from src/ctx.h (up to 64 bits wide)
+  * can target them - confirm against the definition of ALIGN(). */
+ uint8_t ALIGN(mode[32], 8);
+ uint8_t ALIGN(lcoef[32], 8);
+ uint8_t ALIGN(ccoef[2][32], 8);
+ uint8_t ALIGN(seg_pred[32], 8);
+ uint8_t ALIGN(skip[32], 8);
+ uint8_t ALIGN(skip_mode[32], 8);
+ uint8_t ALIGN(intra[32], 8);
+ uint8_t ALIGN(comp_type[32], 8);
+ int8_t ALIGN(ref[2][32], 8); // -1 means intra
+ uint8_t ALIGN(filter[2][32], 8); // 3 means unset
+ int8_t ALIGN(tx_intra[32], 8);
+ int8_t ALIGN(tx[32], 8);
+ uint8_t ALIGN(tx_lpf_y[32], 8);
+ uint8_t ALIGN(tx_lpf_uv[32], 8);
+ uint8_t ALIGN(partition[16], 8);
+ uint8_t ALIGN(uvmode[32], 8);
+ uint8_t ALIGN(pal_sz[32], 8);
} BlockContext;
static inline int get_intra_ctx(const BlockContext *const a,
--- a/src/lf_mask.c
+++ b/src/lf_mask.c
@@ -32,6 +32,7 @@
#include "common/intops.h"
+#include "src/ctx.h"
#include "src/levels.h"
#include "src/lf_mask.h"
#include "src/tables.h"
@@ -64,12 +65,18 @@
} else {
const int lw = imin(2, t_dim->lw), lh = imin(2, t_dim->lh);
- for (int y = 0; y < t_dim->h; y++) {
- memset(txa[0][0][y], lw, t_dim->w);
- memset(txa[1][0][y], lh, t_dim->w);
- txa[0][1][y][0] = t_dim->w;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ for (int y = 0; y < t_dim->h; y++) { \
+ rep_macro(type, txa[0][0][y], off, mul * lw); \
+ rep_macro(type, txa[1][0][y], off, mul * lh); \
+ txa[0][1][y][0] = t_dim->w; \
}
- memset(txa[1][1][0], t_dim->h, t_dim->w);
+ case_set_upto16(t_dim->w,,, 0);
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, txa[1][1][0], off, mul * t_dim->h)
+ case_set_upto16(t_dim->w,,, 0);
+#undef set_ctx
}
}
@@ -190,8 +197,20 @@
if (inner2) masks[1][by4 + y][thl4c][1] |= inner2;
}
- memset(a, thl4c, w4);
- memset(l, twl4c, h4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, a, off, mul * thl4c)
+#define default_memset(dir, diridx, off, var) \
+ memset(a, thl4c, var)
+ case_set_upto32_with_default(w4,,, 0);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, l, off, mul * twl4c)
+#define default_memset(dir, diridx, off, var) \
+ memset(l, twl4c, var)
+ case_set_upto32_with_default(h4,,, 0);
+#undef default_memset
+#undef set_ctx
}
static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
@@ -249,8 +268,20 @@
}
}
- memset(a, thl4c, cw4);
- memset(l, twl4c, ch4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, a, off, mul * thl4c)
+#define default_memset(dir, diridx, off, var) \
+ memset(a, thl4c, var)
+ case_set_upto32_with_default(cw4,,, 0);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, l, off, mul * twl4c)
+#define default_memset(dir, diridx, off, var) \
+ memset(l, twl4c, var)
+ case_set_upto32_with_default(ch4,,, 0);
+#undef default_memset
+#undef set_ctx
}
void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -37,6 +37,7 @@
#include "common/mem.h"
#include "src/cdef_apply.h"
+#include "src/ctx.h"
#include "src/ipred_prepare.h"
#include "src/lf_apply.h"
#include "src/lr_apply.h"
@@ -315,10 +316,22 @@
if (DEBUG_BLOCK_INFO)
printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
ytx, txtp, eob, ts->msac.rng);
- memset(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
- memset(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
- for (int y = 0; y < txh; y++)
- memset(&t->txtp_map[(by4 + y) * 32 + bx4], txtp, txw);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir lcoef[off], cf_ctx, sz)
+ case_set_upto16_with_default(imin(txh, f->bh - t->by), l., 1, by4);
+ case_set_upto16_with_default(imin(txw, f->bw - t->bx), a->, 0, bx4);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ for (int y = 0; y < txh; y++) { \
+ rep_macro(type, txtp_map, 0, mul * txtp); \
+ txtp_map += 32; \
+ }
+ uint8_t *txtp_map = &t->txtp_map[by4 * 32 + bx4];
+ case_set_upto16(txw,,,);
+#undef set_ctx
if (f->frame_thread.pass == 1) {
cbi->eob[0] = eob;
cbi->txtp[0] = txtp;
@@ -356,11 +369,18 @@
(bh4 > ss_ver || t->by & 1);
if (b->skip) {
- memset(&t->a->lcoef[bx4], 0x40, bw4);
- memset(&t->l.lcoef[by4], 0x40, bh4);
- if (has_chroma) for (int pl = 0; pl < 2; pl++) {
- memset(&t->a->ccoef[pl][cbx4], 0x40, cbw4);
- memset(&t->l.ccoef[pl][cby4], 0x40, cbh4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * 0x40)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
+ rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
}
return;
}
@@ -402,10 +422,16 @@
b->tx, txtp, eob, ts->msac.rng);
cbi[t->bx].txtp[0] = txtp;
ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
- memset(&t->a->lcoef[bx4 + x], cf_ctx,
- imin(t_dim->w, f->bw - t->bx));
- memset(&t->l.lcoef[by4 + y], cf_ctx,
- imin(t_dim->h, f->bh - t->by));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir lcoef[off], cf_ctx, sz)
+ case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by),
+ l., 1, by4 + y);
+ case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx),
+ a->, 0, bx4 + x);
+#undef default_memset
+#undef set_ctx
}
}
t->bx -= x;
@@ -441,10 +467,18 @@
pl, b->uvtx, txtp, eob, ts->msac.rng);
cbi[t->bx].txtp[1 + pl] = txtp;
ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
- memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
- imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
- memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
- imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+ case_set_upto16_with_default( \
+ imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
+ l., 1, cby4 + y);
+ case_set_upto16_with_default( \
+ imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+ a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
}
t->bx -= x << ss_hor;
}
@@ -763,10 +797,16 @@
if (DEBUG_BLOCK_INFO)
printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
b->tx, txtp, eob, ts->msac.rng);
- memset(&t->a->lcoef[bx4 + x], cf_ctx,
- imin(t_dim->w, f->bw - t->bx));
- memset(&t->l.lcoef[by4 + y], cf_ctx,
- imin(t_dim->h, f->bh - t->by));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir lcoef[off], cf_ctx, sz)
+ case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by), \
+ l., 1, by4 + y);
+ case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx), \
+ a->, 0, bx4 + x);
+#undef default_memset
+#undef set_ctx
}
if (eob >= 0) {
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
@@ -781,8 +821,11 @@
t_dim->w * 4, t_dim->h * 4, "recon");
}
} else if (!f->frame_thread.pass) {
- memset(&t->a->lcoef[bx4 + x], 0x40, t_dim->w);
- memset(&t->l.lcoef[by4 + y], 0x40, t_dim->h);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * 0x40)
+ case_set_upto16(t_dim->h, l., 1, by4 + y);
+ case_set_upto16(t_dim->w, a->, 0, bx4 + x);
+#undef set_ctx
}
dst += 4 * t_dim->w;
}
@@ -970,10 +1013,18 @@
printf("Post-uv-cf-blk[pl=%d,tx=%d,"
"txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
- memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
- imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
- memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
- imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+ case_set_upto16_with_default( \
+ imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
+ l., 1, cby4 + y);
+ case_set_upto16_with_default( \
+ imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+ a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
}
if (eob >= 0) {
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
@@ -987,8 +1038,11 @@
uv_t_dim->h * 4, "recon");
}
} else if (!f->frame_thread.pass) {
- memset(&t->a->ccoef[pl][cbx4 + x], 0x40, uv_t_dim->w);
- memset(&t->l.ccoef[pl][cby4 + y], 0x40, uv_t_dim->h);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[pl], off, mul * 0x40)
+ case_set_upto16(uv_t_dim->h, l., 1, cby4 + y);
+ case_set_upto16(uv_t_dim->w, a->, 0, cbx4 + x);
+#undef set_ctx
}
dst += uv_t_dim->w * 4;
}
@@ -1301,13 +1355,18 @@
if (b->skip) {
// reset coef contexts
- memset(&t->a->lcoef[bx4], 0x40, w4);
- memset(&t->l.lcoef[by4], 0x40, h4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * 0x40)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
if (has_chroma) {
- memset(&t->a->ccoef[0][cbx4], 0x40, cw4);
- memset(&t->l.ccoef[0][cby4], 0x40, ch4);
- memset(&t->a->ccoef[1][cbx4], 0x40, cw4);
- memset(&t->l.ccoef[1][cby4], 0x40, ch4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
+ rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
}
return;
}
@@ -1372,10 +1431,18 @@
printf("Post-uv-cf-blk[pl=%d,tx=%d,"
"txtp=%d,eob=%d]: r=%d\n",
pl, b->uvtx, txtp, eob, ts->msac.rng);
- memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
- imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor));
- memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
- imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+ case_set_upto16_with_default( \
+ imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver),
+ l., 1, cby4 + y);
+ case_set_upto16_with_default( \
+ imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+ a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
}
if (eob >= 0) {
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)