shithub: dav1d

Download patch

ref: e3b5d4d044506f9e0e95e79b3de42fd94386cc61
parent: 01386d4c486f2892eec6cd7552d20bfec376b2ee
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Mon Nov 5 04:27:35 EST 2018

Use grouped context setting

Decreases runtime of decoding first 1000 frames of Chimera (1080p, 8bit)
from 12.227 to 12.075s (average of 6 runs) after changing decode.c, and
further down to 12.027s (1.67%) with the changes to recon_tmpl.c included.
After the changes to lf_mask.c, it goes down to 11.842s.

--- a/include/common/attributes.h
+++ b/include/common/attributes.h
@@ -32,6 +32,12 @@
 
 #include <stddef.h>
 
+#ifdef __GNUC__
+#define ATTR_ALIAS __attribute__((may_alias)) /* type may alias any other type, like a char type */
+#else
+#define ATTR_ALIAS /* expands to nothing when __GNUC__ is not defined */
+#endif
+
 #if ARCH_X86
 #define ALIGN_32_VAL 32
 #define ALIGN_16_VAL 16
--- /dev/null
+++ b/src/ctx.h
@@ -1,0 +1,91 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __DAV1D_SRC_CTX_H__
+#define __DAV1D_SRC_CTX_H__
+
+#include <stdint.h>
+
+#include "common/attributes.h"
+
+union alias64 { uint64_t u64; uint8_t u8[8]; } ATTR_ALIAS; /* 8 bytes viewed as one 64-bit word; target of set_ctx_rep1/2/4 stores */
+union alias32 { uint32_t u32; uint8_t u8[4]; } ATTR_ALIAS; /* 4 bytes viewed as one 32-bit word */
+union alias16 { uint16_t u16; uint8_t u8[2]; } ATTR_ALIAS; /* 2 bytes viewed as one 16-bit word */
+union alias8 { uint8_t u8; }; /* single byte; declared without ATTR_ALIAS */
+
+#define set_ctx_rep4(type, var, off, val) do { /* four 64-bit stores of val at &var[off + 0/8/16/24]; type is unused */ \
+        const uint64_t const_val = val; /* val is evaluated once */ \
+        ((union alias64 *) &var[off +  0])->u64 = const_val; \
+        ((union alias64 *) &var[off +  8])->u64 = const_val; \
+        ((union alias64 *) &var[off + 16])->u64 = const_val; \
+        ((union alias64 *) &var[off + 24])->u64 = const_val; \
+    } while (0)
+#define set_ctx_rep2(type, var, off, val) do { /* two 64-bit stores of val at &var[off + 0/8]; type is unused */ \
+        const uint64_t const_val = val; /* val is evaluated once */ \
+        ((union alias64 *) &var[off + 0])->u64 = const_val; \
+        ((union alias64 *) &var[off + 8])->u64 = const_val; \
+    } while (0)
+#define set_ctx_rep1(typesz, var, off, val) /* one typesz-bit store (8/16/32/64) of val at &var[off] */ \
+    ((union alias##typesz *) &var[off])->u##typesz = val
+#define case_set(var, dir, diridx, off) /* dispatch on var in {1,2,4,8,16,32}; other values do nothing. Caller must #define set_ctx(type, dir, diridx, off, mul, rep_macro) first; mul holds 0x01 in every byte of the store */ \
+    switch (var) { \
+    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+    case 32: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
+    }
+#define case_set_upto16(var, dir, diridx, off) /* like case_set but without the 32 case; var outside {1,2,4,8,16} does nothing. Caller must #define set_ctx first */ \
+    switch (var) { \
+    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+    }
+#define case_set_upto32_with_default(var, dir, diridx, off) /* var in {1,2,4,8,16,32} uses set_ctx; any other value falls back to default_memset(dir, diridx, off, var). Caller must #define both set_ctx and default_memset first */ \
+    switch (var) { \
+    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+    case 32: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
+    default: default_memset(dir, diridx, off, var); break; \
+    }
+#define case_set_upto16_with_default(var, dir, diridx, off) /* var in {1,2,4,8,16} uses set_ctx; any other value (including 32) falls back to default_memset(dir, diridx, off, var). Caller must #define both set_ctx and default_memset first */ \
+    switch (var) { \
+    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+    default: default_memset(dir, diridx, off, var); break; \
+    }
+
+#endif /* __DAV1D_SRC_CTX_H__ */
--- a/src/decode.c
+++ b/src/decode.c
@@ -38,6 +38,7 @@
 #include "common/intops.h"
 #include "common/mem.h"
 
+#include "src/ctx.h"
 #include "src/decode.h"
 #include "src/dequant_tables.h"
 #include "src/env.h"
@@ -171,8 +172,14 @@
         }
         t->by -= txsh;
     } else {
-        memset(&t->a->tx[bx4], is_split ? TX_4X4 : txw, t_dim->w);
-        memset(&t->l.tx[by4], is_split ? TX_4X4 : txh, t_dim->h);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txh)
+        case_set_upto16(t_dim->h, l., 1, by4);
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txw)
+        case_set_upto16(t_dim->w, a->, 0, bx4);
+#undef set_ctx
     }
 }
 
@@ -611,13 +618,19 @@
     {
         b->max_ytx = b->uvtx = TX_4X4;
         if (f->frame_hdr.txfm_mode == TX_SWITCHABLE) {
-            memset(&t->a->tx[bx4], TX_4X4, bw4);
-            memset(&t->l.tx[by4], TX_4X4, bh4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir tx, off, TX_4X4)
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
         }
     } else if (f->frame_hdr.txfm_mode != TX_SWITCHABLE || b->skip) {
         if (f->frame_hdr.txfm_mode == TX_SWITCHABLE) {
-            memset(&t->a->tx[bx4], b_dim[2], bw4);
-            memset(&t->l.tx[by4], b_dim[3], bh4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir tx, off, mul * b_dim[2 + diridx])
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
         } else {
             assert(f->frame_hdr.txfm_mode == TX_LARGEST);
         }
@@ -694,14 +707,22 @@
         if (b->intra) {
             f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
 
-            if (has_chroma) {
-                memset(&t->l.uvmode[cby4], b->uv_mode, cbh4);
-                memset(&t->a->uvmode[cbx4], b->uv_mode, cbw4);
-            }
             const enum IntraPredMode y_mode_nofilt =
                 b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
-            memset(&t->l.mode[by4], y_mode_nofilt, bh4);
-            memset(&t->a->mode[bx4], y_mode_nofilt, bw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
+            rep_macro(type, t->dir intra, off, mul)
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
+            if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
+                case_set(cbh4, l., 1, cby4);
+                case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+            }
         } else {
             if (b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP) {
                 uint64_t mask[2] = { 0, 0 };
@@ -712,17 +733,22 @@
             f->bd_fn.recon_b_inter(t, bs, b);
 
             const uint8_t *const filter = dav1d_filter_dir[b->filter2d];
-            memset(&t->l.filter[0][by4], filter[0], bh4);
-            memset(&t->a->filter[0][bx4], filter[0], bw4);
-            memset(&t->l.filter[1][by4], filter[1], bh4);
-            memset(&t->a->filter[1][bx4], filter[1], bw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
+            rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
+            rep_macro(type, t->dir intra, off, 0)
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
             if (has_chroma) {
-                memset(&t->l.uvmode[cby4], DC_PRED, cbh4);
-                memset(&t->a->uvmode[cbx4], DC_PRED, cbw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+                case_set(cbh4, l., 1, cby4);
+                case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
             }
         }
-        memset(&t->l.intra[by4], b->intra, bh4);
-        memset(&t->a->intra[bx4], b->intra, bw4);
         return 0;
     }
 
@@ -1106,14 +1132,29 @@
                                    has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
 
         // update contexts
-        memset(&t->a->tx_intra[bx4], t_dim->lw, bw4);
-        memset(&t->l.tx_intra[by4], t_dim->lh, bh4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) /* edge-context update; expanded below for l. (diridx 1) and a-> (diridx 0). mul replicates a byte value across the store width */ \
+        rep_macro(type, t->dir tx_intra, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); /* diridx 1 reads the byte after lw; presumably lh -- confirm struct layout */ \
+        rep_macro(type, t->dir tx, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
+        rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
+        rep_macro(type, t->dir pal_sz, off, mul * b->pal_sz[0]); \
+        rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+        rep_macro(type, t->dir skip_mode, off, 0); \
+        rep_macro(type, t->dir intra, off, mul); \
+        rep_macro(type, t->dir skip, off, mul * b->skip); \
+        /* see aomedia bug 2183 for why we use luma coordinates here */ \
+        rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \
+        if (f->frame_hdr.frame_type & 1) { \
+            rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); /* constant, as in the memset() this replaces */ \
+            rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \
+            rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \
+            rep_macro(type, t->dir filter[0], off, mul * N_SWITCHABLE_FILTERS); \
+            rep_macro(type, t->dir filter[1], off, mul * N_SWITCHABLE_FILTERS); \
+        }
         const enum IntraPredMode y_mode_nofilt =
             b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
-        memset(&t->l.mode[by4], y_mode_nofilt, bh4);
-        memset(&t->a->mode[bx4], y_mode_nofilt, bw4);
-        memset(&t->l.pal_sz[by4], b->pal_sz[0], bh4);
-        memset(&t->a->pal_sz[bx4], b->pal_sz[0], bw4);
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
         if (b->pal_sz[0]) {
             uint16_t *const pal = f->frame_thread.pass ?
                 f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
@@ -1124,11 +1165,11 @@
                 memcpy(t->al_pal[1][by4 + y][0], pal, 16);
         }
         if (has_chroma) {
-            memset(&t->l.uvmode[cby4], b->uv_mode, cbh4);
-            memset(&t->a->uvmode[cbx4], b->uv_mode, cbw4);
-            // see aomedia bug 2183 for why we use luma coordinates here
-            memset(&t->pal_sz_uv[1][by4], b->pal_sz[1], bh4);
-            memset(&t->pal_sz_uv[0][bx4], b->pal_sz[1], bw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
+                case_set(cbh4, l., 1, cby4);
+                case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
             if (b->pal_sz[1]) for (int pl = 1; pl < 3; pl++) {
                 uint16_t *const pal = f->frame_thread.pass ?
                     f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
@@ -1139,28 +1180,11 @@
                 for (int y = 0; y < bh4; y++)
                     memcpy(t->al_pal[1][by4 + y][pl], pal, 16);
             }
-        } else { // see aomedia bug 2183 for why we reset this
-            memset(&t->pal_sz_uv[1][by4], 0, bh4);
-            memset(&t->pal_sz_uv[0][bx4], 0, bw4);
         }
         if ((f->frame_hdr.frame_type & 1) || f->frame_hdr.allow_intrabc) {
-            memset(&t->a->tx[bx4], t_dim->lw, bw4);
-            memset(&t->l.tx[by4], t_dim->lh, bh4);
             splat_intraref(f->mvs, f->b4_stride, t->by, t->bx, bs,
                            y_mode_nofilt);
         }
-        if (f->frame_hdr.frame_type & 1) {
-            memset(&t->l.comp_type[by4], COMP_INTER_NONE, bh4);
-            memset(&t->a->comp_type[bx4], COMP_INTER_NONE, bw4);
-            memset(&t->l.ref[0][by4], -1, bh4);
-            memset(&t->a->ref[0][bx4], -1, bw4);
-            memset(&t->l.ref[1][by4], -1, bh4);
-            memset(&t->a->ref[1][bx4], -1, bw4);
-            memset(&t->l.filter[0][by4], N_SWITCHABLE_FILTERS, bh4);
-            memset(&t->a->filter[0][bx4], N_SWITCHABLE_FILTERS, bw4);
-            memset(&t->l.filter[1][by4], N_SWITCHABLE_FILTERS, bh4);
-            memset(&t->a->filter[1][bx4], N_SWITCHABLE_FILTERS, bw4);
-        }
     } else if (!(f->frame_hdr.frame_type & 1)) {
         // intra block copy
         candidate_mv mvstack[8];
@@ -1259,18 +1283,25 @@
 
         splat_intrabc_mv(f->mvs, f->b4_stride, t->by, t->bx, bs, b->mv[0]);
 
-        memset(&t->a->tx_intra[bx4], b_dim[2], bw4);
-        memset(&t->l.tx_intra[by4], b_dim[3], bh4);
-        memset(&t->l.mode[by4], DC_PRED, bh4);
-        memset(&t->a->mode[bx4], DC_PRED, bw4);
-        memset(&t->l.pal_sz[by4], 0, bh4);
-        memset(&t->a->pal_sz[bx4], 0, bw4);
-        // see aomedia bug 2183 for why this is outside if (has_chroma)
-        memset(&t->pal_sz_uv[1][by4], 0, bh4);
-        memset(&t->pal_sz_uv[0][bx4], 0, bw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) /* edge-context update; expanded below for l. (diridx 1) and a-> (diridx 0). mul replicates a byte value across the store width */ \
+        rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
+        rep_macro(type, t->dir mode, off, mul * DC_PRED); \
+        rep_macro(type, t->dir pal_sz, off, 0); \
+        /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+        rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
+        rep_macro(type, t->dir seg_pred, off, mul * seg_pred); /* mul needed: every entry gets seg_pred, as the replaced memset() did */ \
+        rep_macro(type, t->dir skip_mode, off, 0); \
+        rep_macro(type, t->dir intra, off, 0); \
+        rep_macro(type, t->dir skip, off, mul * b->skip) /* mul needed: every entry gets b->skip */
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
         if (has_chroma) {
-            memset(&t->l.uvmode[cby4], DC_PRED, cbh4);
-            memset(&t->a->uvmode[cbx4], DC_PRED, cbw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+            case_set(cbh4, l., 1, cby4);
+            case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
         }
     } else {
         // inter-specific mode/mv coding
@@ -1764,29 +1795,33 @@
                             b->inter_mode, b->ref[0], b->mv[0],
                             b->interintra_type);
         }
-        memset(&t->l.pal_sz[by4], 0, bh4);
-        memset(&t->a->pal_sz[bx4], 0, bw4);
-        // see aomedia bug 2183 for why this is outside if (has_chroma)
-        memset(&t->pal_sz_uv[1][by4], 0, bh4);
-        memset(&t->pal_sz_uv[0][bx4], 0, bw4);
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+        rep_macro(type, t->dir skip_mode, off, mul * b->skip_mode); \
+        rep_macro(type, t->dir intra, off, 0); \
+        rep_macro(type, t->dir skip, off, mul * b->skip); \
+        rep_macro(type, t->dir pal_sz, off, 0); \
+        /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+        rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
+        rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
+        rep_macro(type, t->dir comp_type, off, mul * b->comp_type); \
+        rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
+        rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
+        rep_macro(type, t->dir mode, off, mul * b->inter_mode); \
+        rep_macro(type, t->dir ref[0], off, mul * b->ref[0]); \
+        rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) b->ref[1]))
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
         if (has_chroma) {
-            memset(&t->l.uvmode[cby4], DC_PRED, cbh4);
-            memset(&t->a->uvmode[cbx4], DC_PRED, cbw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+            case_set(cbh4, l., 1, cby4);
+            case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
         }
-        memset(&t->a->tx_intra[bx4], b_dim[2], bw4);
-        memset(&t->l.tx_intra[by4], b_dim[3], bh4);
-        memset(&t->l.comp_type[by4], b->comp_type, bh4);
-        memset(&t->a->comp_type[bx4], b->comp_type, bw4);
-        memset(&t->l.filter[0][by4], filter[0], bh4);
-        memset(&t->a->filter[0][bx4], filter[0], bw4);
-        memset(&t->l.filter[1][by4], filter[1], bh4);
-        memset(&t->a->filter[1][bx4], filter[1], bw4);
-        memset(&t->l.mode[by4], b->inter_mode, bh4);
-        memset(&t->a->mode[bx4], b->inter_mode, bw4);
-        memset(&t->l.ref[0][by4], b->ref[0], bh4);
-        memset(&t->a->ref[0][bx4], b->ref[0], bw4);
-        memset(&t->l.ref[1][by4], b->ref[1], bh4);
-        memset(&t->a->ref[1][bx4], b->ref[1], bw4);
     }
 
     // update contexts
@@ -1794,19 +1829,14 @@
         f->frame_hdr.segmentation.update_map)
     {
         uint8_t *seg_ptr = &f->cur_segmap[t->by * f->b4_stride + t->bx];
-        for (int y = 0; y < bh4; y++) {
-            memset(seg_ptr, b->seg_id, bw4);
-            seg_ptr += f->b4_stride;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        for (int y = 0; y < bh4; y++) { \
+            rep_macro(type, seg_ptr, 0, mul * b->seg_id); \
+            seg_ptr += f->b4_stride; \
         }
+        case_set(bw4, NULL, 0, 0);
+#undef set_ctx
     }
-    memset(&t->l.seg_pred[by4], seg_pred, bh4);
-    memset(&t->a->seg_pred[bx4], seg_pred, bw4);
-    memset(&t->l.skip_mode[by4], b->skip_mode, bh4);
-    memset(&t->a->skip_mode[bx4], b->skip_mode, bw4);
-    memset(&t->l.intra[by4], b->intra, bh4);
-    memset(&t->a->intra[bx4], b->intra, bw4);
-    memset(&t->l.skip[by4], b->skip, bh4);
-    memset(&t->a->skip[bx4], b->skip, bw4);
     if (!b->skip) {
         uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4];
         const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15);
@@ -2081,8 +2111,11 @@
     }
 
     if (f->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) {
-        memset(&t->a->partition[bx8], dav1d_al_part_ctx[0][bl][bp], hsz);
-        memset(&t->l.partition[by8], dav1d_al_part_ctx[1][bl][bp], hsz);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->a->partition, bx8, mul * dav1d_al_part_ctx[0][bl][bp]); \
+        rep_macro(type, t->l.partition, by8, mul * dav1d_al_part_ctx[1][bl][bp])
+        case_set_upto16(hsz,,,);
+#undef set_ctx
     }
 
     return 0;
--- a/src/env.h
+++ b/src/env.h
@@ -38,23 +38,23 @@
 #include "src/tables.h"
 
 typedef struct BlockContext {
-    uint8_t mode[32];
-    uint8_t lcoef[32];
-    uint8_t ccoef[2][32];
-    uint8_t seg_pred[32];
-    uint8_t skip[32];
-    uint8_t skip_mode[32];
-    uint8_t intra[32];
-    uint8_t comp_type[32];
-    int8_t ref[2][32]; // -1 means intra
-    uint8_t filter[2][32]; // 3 means unset
-    int8_t tx_intra[32];
-    int8_t tx[32];
-    uint8_t tx_lpf_y[32];
-    uint8_t tx_lpf_uv[32];
-    uint8_t partition[16];
-    uint8_t uvmode[32];
-    uint8_t pal_sz[32];
+    uint8_t ALIGN(mode[32], 8);
+    uint8_t ALIGN(lcoef[32], 8);
+    uint8_t ALIGN(ccoef[2][32], 8);
+    uint8_t ALIGN(seg_pred[32], 8);
+    uint8_t ALIGN(skip[32], 8);
+    uint8_t ALIGN(skip_mode[32], 8);
+    uint8_t ALIGN(intra[32], 8);
+    uint8_t ALIGN(comp_type[32], 8);
+    int8_t ALIGN(ref[2][32], 8); // -1 means intra
+    uint8_t ALIGN(filter[2][32], 8); // 3 means unset
+    int8_t ALIGN(tx_intra[32], 8);
+    int8_t ALIGN(tx[32], 8);
+    uint8_t ALIGN(tx_lpf_y[32], 8);
+    uint8_t ALIGN(tx_lpf_uv[32], 8);
+    uint8_t ALIGN(partition[16], 8);
+    uint8_t ALIGN(uvmode[32], 8);
+    uint8_t ALIGN(pal_sz[32], 8);
 } BlockContext;
 
 static inline int get_intra_ctx(const BlockContext *const a,
--- a/src/lf_mask.c
+++ b/src/lf_mask.c
@@ -32,6 +32,7 @@
 
 #include "common/intops.h"
 
+#include "src/ctx.h"
 #include "src/levels.h"
 #include "src/lf_mask.h"
 #include "src/tables.h"
@@ -64,12 +65,18 @@
     } else {
         const int lw = imin(2, t_dim->lw), lh = imin(2, t_dim->lh);
 
-        for (int y = 0; y < t_dim->h; y++) {
-            memset(txa[0][0][y], lw, t_dim->w);
-            memset(txa[1][0][y], lh, t_dim->w);
-            txa[0][1][y][0] = t_dim->w;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        for (int y = 0; y < t_dim->h; y++) { \
+            rep_macro(type, txa[0][0][y], off, mul * lw); \
+            rep_macro(type, txa[1][0][y], off, mul * lh); \
+            txa[0][1][y][0] = t_dim->w; \
         }
-        memset(txa[1][1][0], t_dim->h, t_dim->w);
+        case_set_upto16(t_dim->w,,, 0);
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, txa[1][1][0], off, mul * t_dim->h)
+        case_set_upto16(t_dim->w,,, 0);
+#undef set_ctx
     }
 }
 
@@ -190,8 +197,20 @@
         if (inner2) masks[1][by4 + y][thl4c][1] |= inner2;
     }
 
-    memset(a, thl4c, w4);
-    memset(l, twl4c, h4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+    rep_macro(type, a, off, mul * thl4c)
+#define default_memset(dir, diridx, off, var) \
+    memset(a, thl4c, var)
+    case_set_upto32_with_default(w4,,, 0);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+    rep_macro(type, l, off, mul * twl4c)
+#define default_memset(dir, diridx, off, var) \
+    memset(l, twl4c, var)
+    case_set_upto32_with_default(h4,,, 0);
+#undef default_memset
+#undef set_ctx
 }
 
 static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
@@ -249,8 +268,20 @@
         }
     }
 
-    memset(a, thl4c, cw4);
-    memset(l, twl4c, ch4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+    rep_macro(type, a, off, mul * thl4c)
+#define default_memset(dir, diridx, off, var) \
+    memset(a, thl4c, var)
+    case_set_upto32_with_default(cw4,,, 0);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+    rep_macro(type, l, off, mul * twl4c)
+#define default_memset(dir, diridx, off, var) \
+    memset(l, twl4c, var)
+    case_set_upto32_with_default(ch4,,, 0);
+#undef default_memset
+#undef set_ctx
 }
 
 void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -37,6 +37,7 @@
 #include "common/mem.h"
 
 #include "src/cdef_apply.h"
+#include "src/ctx.h"
 #include "src/ipred_prepare.h"
 #include "src/lf_apply.h"
 #include "src/lr_apply.h"
@@ -315,10 +316,22 @@
             if (DEBUG_BLOCK_INFO)
                 printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                        ytx, txtp, eob, ts->msac.rng);
-            memset(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
-            memset(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
-            for (int y = 0; y < txh; y++)
-                memset(&t->txtp_map[(by4 + y) * 32 + bx4], txtp, txw);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+            memset(&t->dir lcoef[off], cf_ctx, sz)
+            case_set_upto16_with_default(imin(txh, f->bh - t->by), l., 1, by4);
+            case_set_upto16_with_default(imin(txw, f->bw - t->bx), a->, 0, bx4);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            for (int y = 0; y < txh; y++) { \
+                rep_macro(type, txtp_map, 0, mul * txtp); \
+                txtp_map += 32; \
+            }
+            uint8_t *txtp_map = &t->txtp_map[by4 * 32 + bx4];
+            case_set_upto16(txw,,,);
+#undef set_ctx
             if (f->frame_thread.pass == 1) {
                 cbi->eob[0] = eob;
                 cbi->txtp[0] = txtp;
@@ -356,11 +369,18 @@
                            (bh4 > ss_ver || t->by & 1);
 
     if (b->skip) {
-        memset(&t->a->lcoef[bx4], 0x40, bw4);
-        memset(&t->l.lcoef[by4], 0x40, bh4);
-        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
-            memset(&t->a->ccoef[pl][cbx4], 0x40, cbw4);
-            memset(&t->l.ccoef[pl][cby4], 0x40, cbh4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir lcoef, off, mul * 0x40)
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+        if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
+            rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
+            case_set(cbh4, l., 1, cby4);
+            case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
         }
         return;
     }
@@ -402,10 +422,16 @@
                                    b->tx, txtp, eob, ts->msac.rng);
                         cbi[t->bx].txtp[0] = txtp;
                         ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
-                        memset(&t->a->lcoef[bx4 + x], cf_ctx,
-                               imin(t_dim->w, f->bw - t->bx));
-                        memset(&t->l.lcoef[by4 + y], cf_ctx,
-                               imin(t_dim->h, f->bh - t->by));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                        rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                        memset(&t->dir lcoef[off], cf_ctx, sz)
+                        case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by),
+                                                     l., 1, by4 + y);
+                        case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx),
+                                                     a->, 0, bx4 + x);
+#undef default_memset
+#undef set_ctx
                     }
                 }
                 t->bx -= x;
@@ -441,10 +467,18 @@
                                    pl, b->uvtx, txtp, eob, ts->msac.rng);
                         cbi[t->bx].txtp[1 + pl] = txtp;
                         ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
-                        memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                               imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                        memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                               imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                        rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                        memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+                        case_set_upto16_with_default( \
+                                 imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
+                                 l., 1, cby4 + y);
+                        case_set_upto16_with_default( \
+                                 imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+                                 a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
                     }
                     t->bx -= x << ss_hor;
                 }
@@ -763,10 +797,16 @@
                             if (DEBUG_BLOCK_INFO)
                                 printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                                        b->tx, txtp, eob, ts->msac.rng);
-                            memset(&t->a->lcoef[bx4 + x], cf_ctx,
-                                   imin(t_dim->w, f->bw - t->bx));
-                            memset(&t->l.lcoef[by4 + y], cf_ctx,
-                                   imin(t_dim->h, f->bh - t->by));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                            rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                            memset(&t->dir lcoef[off], cf_ctx, sz)
+                            case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by), \
+                                                         l., 1, by4 + y);
+                            case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx), \
+                                                         a->, 0, bx4 + x);
+#undef default_memset
+#undef set_ctx
                         }
                         if (eob >= 0) {
                             if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
@@ -781,8 +821,11 @@
                                          t_dim->w * 4, t_dim->h * 4, "recon");
                         }
                     } else if (!f->frame_thread.pass) {
-                        memset(&t->a->lcoef[bx4 + x], 0x40, t_dim->w);
-                        memset(&t->l.lcoef[by4 + y], 0x40, t_dim->h);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                        rep_macro(type, t->dir lcoef, off, mul * 0x40)
+                        case_set_upto16(t_dim->h, l., 1, by4 + y);
+                        case_set_upto16(t_dim->w, a->, 0, bx4 + x);
+#undef set_ctx
                     }
                     dst += 4 * t_dim->w;
                 }
@@ -970,10 +1013,18 @@
                                     printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                            "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
                                            pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
-                                memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                                       imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                                memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                                       imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                                rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                                memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+                                case_set_upto16_with_default( \
+                                         imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
+                                         l., 1, cby4 + y);
+                                case_set_upto16_with_default( \
+                                         imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+                                         a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
                             }
                             if (eob >= 0) {
                                 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
@@ -987,8 +1038,11 @@
                                              uv_t_dim->h * 4, "recon");
                             }
                         } else if (!f->frame_thread.pass) {
-                            memset(&t->a->ccoef[pl][cbx4 + x], 0x40, uv_t_dim->w);
-                            memset(&t->l.ccoef[pl][cby4 + y], 0x40, uv_t_dim->h);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                            rep_macro(type, t->dir ccoef[pl], off, mul * 0x40)
+                            case_set_upto16(uv_t_dim->h, l., 1, cby4 + y);
+                            case_set_upto16(uv_t_dim->w, a->, 0, cbx4 + x);
+#undef set_ctx
                         }
                         dst += uv_t_dim->w * 4;
                     }
@@ -1301,13 +1355,18 @@
 
     if (b->skip) {
         // reset coef contexts
-        memset(&t->a->lcoef[bx4], 0x40, w4);
-        memset(&t->l.lcoef[by4], 0x40, h4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir lcoef, off, mul * 0x40)
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
         if (has_chroma) {
-            memset(&t->a->ccoef[0][cbx4], 0x40, cw4);
-            memset(&t->l.ccoef[0][cby4], 0x40, ch4);
-            memset(&t->a->ccoef[1][cbx4], 0x40, cw4);
-            memset(&t->l.ccoef[1][cby4], 0x40, ch4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
+            rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
+            case_set(cbh4, l., 1, cby4);
+            case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
         }
         return;
     }
@@ -1372,10 +1431,18 @@
                                 printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                        "txtp=%d,eob=%d]: r=%d\n",
                                        pl, b->uvtx, txtp, eob, ts->msac.rng);
-                            memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                                   imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                            memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                                   imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                            rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                            memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+                            case_set_upto16_with_default( \
+                                     imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver),
+                                     l., 1, cby4 + y);
+                            case_set_upto16_with_default( \
+                                     imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+                                     a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
                         }
                         if (eob >= 0) {
                             if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)