shithub: dav1d

Download patch

ref: 65ba279b5393382a98ddd6844e0c0753f63e749f
parent: 0276455de73c4a520df12a3d6f80574b988d219a
author: Henrik Gramner <gramner@twoorioles.com>
date: Thu Jun 27 10:32:32 EDT 2019

Reduce the size of frame threading buffers

Avoid allocating significantly more memory than what is actually used.

--- a/src/cdef_apply_tmpl.c
+++ b/src/cdef_apply_tmpl.c
@@ -109,7 +109,7 @@
 
         if (edges & CDEF_HAVE_BOTTOM) {
             // backup pre-filter data for next iteration
-            backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.stride,
+            backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride,
                          8, f->bw * 4, layout);
         }
 
@@ -173,8 +173,8 @@
                 if (y_lvl) {
                     dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
                                     (pixel *const [2]) {
-                                        &f->lf.cdef_line_ptr[tf][0][0][bx * 4],
-                                        &f->lf.cdef_line_ptr[tf][0][1][bx * 4],
+                                        &f->lf.cdef_line[tf][0][0][bx * 4],
+                                        &f->lf.cdef_line[tf][0][1][bx * 4],
                                     },
                                     adjust_strength(y_pri_lvl, variance),
                                     y_sec_lvl, y_pri_lvl ? dir : 0,
@@ -188,8 +188,8 @@
                         dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
                                              lr_bak[bit][pl],
                                              (pixel *const [2]) {
-                                                 &f->lf.cdef_line_ptr[tf][pl][0][bx * 4 >> ss_hor],
-                                                 &f->lf.cdef_line_ptr[tf][pl][1][bx * 4 >> ss_hor],
+                                                 &f->lf.cdef_line[tf][pl][0][bx * 4 >> ss_hor],
+                                                 &f->lf.cdef_line[tf][pl][1][bx * 4 >> ss_hor],
                                              },
                                              uv_pri_lvl, uv_sec_lvl,
                                              uv_pri_lvl ? uvdir : 0,
--- a/src/decode.c
+++ b/src/decode.c
@@ -2289,6 +2289,14 @@
     memset(ctx->pal_sz, 0, sizeof(ctx->pal_sz));
 }
 
+// Per-layout plane size sums in units of 1/4 luma plane: { Y+U+V, Y+U }
+static const uint8_t ss_size_mul[4][2] = {
+    [DAV1D_PIXEL_LAYOUT_I400] = {  4, 4 },
+    [DAV1D_PIXEL_LAYOUT_I420] = {  6, 5 },
+    [DAV1D_PIXEL_LAYOUT_I422] = {  8, 6 },
+    [DAV1D_PIXEL_LAYOUT_I444] = { 12, 8 },
+};
+
 static void setup_tile(Dav1dTileState *const ts,
                        const Dav1dFrameContext *const f,
                        const uint8_t *const data, const size_t sz,
@@ -2302,8 +2310,11 @@
     const int row_sb_end = f->frame_hdr->tiling.row_start_sb[tile_row + 1];
     const int sb_shift = f->sb_shift;
 
-    ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * 2];
-    ts->frame_thread.cf = &((int32_t *) f->frame_thread.cf)[tile_start_off * 3];
+    const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
+    ts->frame_thread.pal_idx =
+        &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4];
+    ts->frame_thread.cf = (uint8_t*)f->frame_thread.cf +
+        (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd);
     dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);
     ts->last_qidx = f->frame_hdr->quant.yac;
     memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf));
@@ -2573,13 +2584,14 @@
     int retval = DAV1D_ERR(ENOMEM);
 
     if (f->n_tc > 1) {
-        if (f->frame_hdr->tiling.cols * f->sbh > f->tile_thread.titsati_sz) {
+        const int titsati_sz = f->frame_hdr->tiling.cols * f->sbh;
+        if (titsati_sz != f->tile_thread.titsati_sz) {
             freep(&f->tile_thread.task_idx_to_sby_and_tile_idx);
             f->tile_thread.task_idx_to_sby_and_tile_idx =
                 malloc(sizeof(*f->tile_thread.task_idx_to_sby_and_tile_idx) *
-                       f->frame_hdr->tiling.cols * f->sbh);
+                       titsati_sz);
             if (!f->tile_thread.task_idx_to_sby_and_tile_idx) goto error;
-            f->tile_thread.titsati_sz = f->frame_hdr->tiling.cols * f->sbh;
+            f->tile_thread.titsati_sz = titsati_sz;
         }
         if (f->tile_thread.titsati_init[0] != f->frame_hdr->tiling.cols ||
             f->tile_thread.titsati_init[1] != f->sbh ||
@@ -2606,31 +2618,51 @@
         }
     }
 
-    if (f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows > f->n_ts) {
-        f->ts = realloc(f->ts, f->frame_hdr->tiling.cols *
-                               f->frame_hdr->tiling.rows * sizeof(*f->ts));
-        if (!f->ts) goto error;
-        for (int n = f->n_ts;
-             n < f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; n++)
-        {
-            Dav1dTileState *const ts = &f->ts[n];
-            if (pthread_mutex_init(&ts->tile_thread.lock, NULL)) goto error;
-            if (pthread_cond_init(&ts->tile_thread.cond, NULL)) {
+    const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
+    if (n_ts != f->n_ts) {
+        if (n_ts > f->n_ts) {
+            Dav1dTileState *ts_new = realloc(f->ts, sizeof(*f->ts) * n_ts);
+            if (!ts_new) goto error;
+            f->ts = ts_new;
+            for (int n = f->n_ts; n < n_ts; n++) {
+                Dav1dTileState *const ts = &f->ts[n];
+                if (pthread_mutex_init(&ts->tile_thread.lock, NULL)) goto error;
+                if (pthread_cond_init(&ts->tile_thread.cond, NULL)) {
+                    pthread_mutex_destroy(&ts->tile_thread.lock);
+                    goto error;
+                }
+            }
+            f->n_ts = n_ts;
+        } else {
+            for (int n = n_ts; n < f->n_ts; n++) {
+                Dav1dTileState *const ts = &f->ts[n];
+                pthread_cond_destroy(&ts->tile_thread.cond);
                 pthread_mutex_destroy(&ts->tile_thread.lock);
-                goto error;
             }
-            f->n_ts = n + 1;
+            f->n_ts = n_ts;
+            Dav1dTileState *ts_new = realloc(f->ts, sizeof(*f->ts) * n_ts);
+            if (!ts_new) goto error;
+            f->ts = ts_new;
         }
         if (c->n_fc > 1) {
             freep(&f->frame_thread.tile_start_off);
             f->frame_thread.tile_start_off =
-                malloc(sizeof(*f->frame_thread.tile_start_off) *
-                       f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows);
+                malloc(sizeof(*f->frame_thread.tile_start_off) * n_ts);
             if (!f->frame_thread.tile_start_off) goto error;
         }
-        f->n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
     }
 
+    const int a_sz = f->sb128w * f->frame_hdr->tiling.rows;
+    if (a_sz != f->a_sz) {
+        freep(&f->a);
+        f->a = malloc(sizeof(*f->a) * a_sz);
+        if (!f->a) goto error;
+        f->a_sz = a_sz;
+    }
+
+    const int num_sb128 = f->sb128w * f->sb128h;
+    const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
+    const int hbd = !!f->seq_hdr->hbd;
     if (c->n_fc > 1) {
         int tile_idx = 0;
         for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
@@ -2643,100 +2675,105 @@
                     f->frame_hdr->tiling.col_start_sb[tile_col] * f->sb_step * 4;
             }
         }
-    }
 
-    if (f->sb128w * f->frame_hdr->tiling.rows > f->a_sz) {
-        freep(&f->a);
-        f->a = malloc(f->sb128w * f->frame_hdr->tiling.rows * sizeof(*f->a));
-        if (!f->a) goto error;
-        f->a_sz = f->sb128w * f->frame_hdr->tiling.rows;
+        const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
+        if (cf_sz != f->frame_thread.cf_sz) {
+            dav1d_freep_aligned(&f->frame_thread.cf);
+            f->frame_thread.cf =
+                dav1d_alloc_aligned((size_t)cf_sz * 128 * 128 / 2, 32);
+            if (!f->frame_thread.cf) goto error;
+            memset(f->frame_thread.cf, 0, (size_t)cf_sz * 128 * 128 / 2);
+            f->frame_thread.cf_sz = cf_sz;
+        }
+
+        if (f->frame_hdr->allow_screen_content_tools) {
+            if (num_sb128 != f->frame_thread.pal_sz) {
+                dav1d_freep_aligned(&f->frame_thread.pal);
+                f->frame_thread.pal =
+                    dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) *
+                                        num_sb128 * 16 * 16, 32);
+                if (!f->frame_thread.pal)
+                    goto error;
+                f->frame_thread.pal_sz = num_sb128;
+            }
+
+            const int pal_idx_sz = num_sb128 * size_mul[1];
+            if (pal_idx_sz != f->frame_thread.pal_idx_sz) {
+                dav1d_freep_aligned(&f->frame_thread.pal_idx);
+                f->frame_thread.pal_idx =
+                    dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) *
+                                        pal_idx_sz * 128 * 128 / 4, 32);
+                if (!f->frame_thread.pal_idx)
+                    goto error;
+                f->frame_thread.pal_idx_sz = pal_idx_sz;
+            }
+        } else if (f->frame_thread.pal) {
+            dav1d_freep_aligned(&f->frame_thread.pal);
+            dav1d_freep_aligned(&f->frame_thread.pal_idx);
+            f->frame_thread.pal_sz = f->frame_thread.pal_idx_sz = 0;
+        }
     }
 
     // update allocation of block contexts for above
-    if (f->sb128w > f->lf.line_sz) {
-        dav1d_freep_aligned(&f->lf.cdef_line);
-
-        // note that we allocate all pixel arrays as if we were dealing with
-        // 10 bits/component data
-        uint16_t *ptr = f->lf.cdef_line =
-            dav1d_alloc_aligned(f->b4_stride * 4 * 12 * sizeof(uint16_t), 32);
+    const int line_sz = (int)f->b4_stride << hbd;
+    if (line_sz != f->lf.line_sz) {
+        dav1d_freep_aligned(&f->lf.cdef_line[0][0][0]);
+        uint8_t *ptr = dav1d_alloc_aligned(line_sz * 4 * 12, 32);
         if (!ptr) goto error;
 
         for (int pl = 0; pl <= 2; pl++) {
-            f->lf.cdef_line_ptr[0][pl][0] = ptr + f->b4_stride * 4 * 0;
-            f->lf.cdef_line_ptr[0][pl][1] = ptr + f->b4_stride * 4 * 1;
-            f->lf.cdef_line_ptr[1][pl][0] = ptr + f->b4_stride * 4 * 2;
-            f->lf.cdef_line_ptr[1][pl][1] = ptr + f->b4_stride * 4 * 3;
-            ptr += f->b4_stride * 4 * 4;
+            f->lf.cdef_line[0][pl][0] = ptr + line_sz * 4 * 0;
+            f->lf.cdef_line[0][pl][1] = ptr + line_sz * 4 * 1;
+            f->lf.cdef_line[1][pl][0] = ptr + line_sz * 4 * 2;
+            f->lf.cdef_line[1][pl][1] = ptr + line_sz * 4 * 3;
+            ptr += line_sz * 4 * 4;
         }
 
-        f->lf.line_sz = f->sb128w;
+        f->lf.line_sz = line_sz;
     }
 
-    const ptrdiff_t lr_stride = (f->sr_cur.p.p.w + 31) & ~31;
-    if (lr_stride > f->lf.lr_line_sz) {
-        dav1d_freep_aligned(&f->lf.lr_lpf_line);
-
-        uint16_t *lr_ptr = f->lf.lr_lpf_line =
-            dav1d_alloc_aligned(lr_stride * 3 * 12 * sizeof(uint16_t), 32);
-
+    const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd;
+    if (lr_line_sz != f->lf.lr_line_sz) {
+        dav1d_freep_aligned(&f->lf.lr_lpf_line[0]);
+        uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * 3 * 12, 32);
         if (!lr_ptr) goto error;
 
         for (int pl = 0; pl <= 2; pl++) {
-            f->lf.lr_lpf_line_ptr[pl] = lr_ptr;
-            lr_ptr += lr_stride * 12;
+            f->lf.lr_lpf_line[pl] = lr_ptr;
+            lr_ptr += lr_line_sz * 12;
         }
 
-        f->lf.lr_line_sz = (int) lr_stride;
+        f->lf.lr_line_sz = lr_line_sz;
     }
 
     // update allocation for loopfilter masks
-    if (f->sb128w * f->sb128h > f->lf.mask_sz) {
+    if (num_sb128 != f->lf.mask_sz) {
         freep(&f->lf.mask);
         freep(&f->lf.level);
-        freep(&f->frame_thread.b);
-        f->lf.mask = malloc(f->sb128w * f->sb128h * sizeof(*f->lf.mask));
+        f->lf.mask = malloc(sizeof(*f->lf.mask) * num_sb128);
         // over-allocate by 3 bytes since some of the SIMD implementations
         // index this from the level type and can thus over-read by up to 3
-        f->lf.level = malloc(3 + f->sb128w * f->sb128h * 32 * 32 *
-                             sizeof(*f->lf.level));
+        f->lf.level = malloc(sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3);
         if (!f->lf.mask || !f->lf.level) goto error;
         if (c->n_fc > 1) {
             freep(&f->frame_thread.b);
             freep(&f->frame_thread.cbi);
-            dav1d_freep_aligned(&f->frame_thread.cf);
-            dav1d_freep_aligned(&f->frame_thread.pal_idx);
-            dav1d_freep_aligned(&f->frame_thread.pal);
             f->frame_thread.b = malloc(sizeof(*f->frame_thread.b) *
-                                       f->sb128w * f->sb128h * 32 * 32);
-            f->frame_thread.pal =
-                dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) *
-                                    f->sb128w * f->sb128h * 16 * 16, 32);
-            f->frame_thread.pal_idx =
-                dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) *
-                                    f->sb128w * f->sb128h * 128 * 128 * 2, 32);
+                                       num_sb128 * 32 * 32);
             f->frame_thread.cbi = malloc(sizeof(*f->frame_thread.cbi) *
-                                         f->sb128w * f->sb128h * 32 * 32);
-            f->frame_thread.cf =
-                dav1d_alloc_aligned(sizeof(int32_t) * 3 *
-                                    f->sb128w * f->sb128h * 128 * 128, 32);
-            if (!f->frame_thread.b || !f->frame_thread.pal_idx ||
-                !f->frame_thread.pal || !f->frame_thread.cbi ||
-                !f->frame_thread.cf)
-            {
-                goto error;
-            }
-            memset(f->frame_thread.cf, 0,
-                   sizeof(int32_t) * 3 * f->sb128w * f->sb128h * 128 * 128);
+                                         num_sb128 * 32 * 32);
+            if (!f->frame_thread.b || !f->frame_thread.cbi) goto error;
         }
-        f->lf.mask_sz = f->sb128w * f->sb128h;
+        f->lf.mask_sz = num_sb128;
     }
+
     f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7;
-    if (f->sr_sb128w * f->sb128h > f->lf.lr_mask_sz) {
+    const int lr_mask_sz = f->sr_sb128w * f->sb128h;
+    if (lr_mask_sz != f->lf.lr_mask_sz) {
         freep(&f->lf.lr_mask);
-        f->lf.lr_mask = malloc(f->sr_sb128w * f->sb128h * sizeof(*f->lf.lr_mask));
+        f->lf.lr_mask = malloc(sizeof(*f->lf.lr_mask) * lr_mask_sz);
         if (!f->lf.lr_mask) goto error;
-        f->lf.lr_mask_sz = f->sr_sb128w * f->sb128h;
+        f->lf.lr_mask_sz = lr_mask_sz;
     }
     if (f->frame_hdr->loopfilter.sharpness != f->lf.last_sharpness) {
         dav1d_calc_eih(&f->lf.lim_lut, f->frame_hdr->loopfilter.sharpness);
@@ -2743,26 +2780,26 @@
         f->lf.last_sharpness = f->frame_hdr->loopfilter.sharpness;
     }
     dav1d_calc_lf_values(f->lf.lvl, f->frame_hdr, (int8_t[4]) { 0, 0, 0, 0 });
-    memset(f->lf.mask, 0, sizeof(*f->lf.mask) * f->sb128w * f->sb128h);
+    memset(f->lf.mask, 0, sizeof(*f->lf.mask) * num_sb128);
 
-    if (f->sbh * f->sb128w * 128 > f->ipred_edge_sz) {
+    const int ipred_edge_sz = f->sbh * f->sb128w << hbd;
+    if (ipred_edge_sz != f->ipred_edge_sz) {
         dav1d_freep_aligned(&f->ipred_edge[0]);
-        uint16_t *ptr = f->ipred_edge[0] =
-            dav1d_alloc_aligned(f->sb128w * 128 * f->sbh * 3 * sizeof(uint16_t), 32);
-        if (!f->ipred_edge[0]) goto error;
-        f->ipred_edge_sz = f->sbh * f->sb128w * 128;
-        f->ipred_edge[1] = &ptr[f->ipred_edge_sz];
-        f->ipred_edge[2] = &ptr[f->ipred_edge_sz * 2];
+        uint8_t *ptr = f->ipred_edge[0] =
+            dav1d_alloc_aligned(ipred_edge_sz * 128 * 3, 32);
+        if (!ptr) goto error;
+        f->ipred_edge[1] = ptr + ipred_edge_sz * 128 * 1;
+        f->ipred_edge[2] = ptr + ipred_edge_sz * 128 * 2;
+        f->ipred_edge_sz = ipred_edge_sz;
     }
 
-    if (f->sb128h * f->frame_hdr->tiling.cols > f->lf.re_sz) {
+    const int re_sz = f->sb128h * f->frame_hdr->tiling.cols;
+    if (re_sz != f->lf.re_sz) {
         freep(&f->lf.tx_lpf_right_edge[0]);
-        f->lf.tx_lpf_right_edge[0] = malloc((f->sb128h * 32 * 2) *
-                                            f->frame_hdr->tiling.cols);
+        f->lf.tx_lpf_right_edge[0] = malloc(re_sz * 32 * 2);
         if (!f->lf.tx_lpf_right_edge[0]) goto error;
-        f->lf.tx_lpf_right_edge[1] = f->lf.tx_lpf_right_edge[0] +
-                                     f->sb128h * 32 * f->frame_hdr->tiling.cols;
-        f->lf.re_sz = f->sb128h * f->frame_hdr->tiling.cols;
+        f->lf.tx_lpf_right_edge[1] = f->lf.tx_lpf_right_edge[0] + re_sz * 32;
+        f->lf.re_sz = re_sz;
     }
 
     // init ref mvs
@@ -3010,8 +3047,9 @@
             {
                 Dav1dTileState *const ts = &f->ts[tile_idx];
                 const int tile_start_off = f->frame_thread.tile_start_off[tile_idx];
-                ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * 2];
-                ts->frame_thread.cf = &((int32_t *) f->frame_thread.cf)[tile_start_off * 3];
+                ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * size_mul[1] / 4];
+                ts->frame_thread.cf = (uint8_t*)f->frame_thread.cf +
+                    ((tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd);
                 if (f->n_tc > 0) {
                     unsigned row_sb_start = f->frame_hdr->tiling.row_start_sb[ts->tiling.row];
                     atomic_init(&ts->progress, row_sb_start);
--- a/src/internal.h
+++ b/src/internal.h
@@ -196,12 +196,12 @@
             int16_t eob[3 /* plane */];
             uint8_t txtp[3 /* plane */];
         } *cbi;
-        int8_t *txtp;
         // indexed using (t->by >> 1) * (f->b4_stride >> 1) + (t->bx >> 1)
         uint16_t (*pal)[3 /* plane */][8 /* idx */];
         // iterated over inside tile state
         uint8_t *pal_idx;
         coef *cf;
+        int pal_sz, pal_idx_sz, cf_sz;
         // start offsets per tile
         int *tile_start_off;
     } frame_thread;
@@ -217,10 +217,8 @@
         int last_sharpness;
         uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
         uint8_t *tx_lpf_right_edge[2];
-        pixel *cdef_line;
-        pixel *cdef_line_ptr[2 /* pre, post */][3 /* plane */][2 /* y */];
-        pixel *lr_lpf_line;
-        pixel *lr_lpf_line_ptr[3 /* plane */];
+        pixel *cdef_line[2 /* pre, post */][3 /* plane */][2 /* y */];
+        pixel *lr_lpf_line[3 /* plane */];
 
         // in-loop filter per-frame state keeping
         int tile_row; // for carry-over at tile row edges
--- a/src/lib.c
+++ b/src/lib.c
@@ -509,8 +509,8 @@
         free(f->lf.level);
         free(f->lf.tx_lpf_right_edge[0]);
         if (f->libaom_cm) dav1d_free_ref_mv_common(f->libaom_cm);
-        dav1d_free_aligned(f->lf.cdef_line);
-        dav1d_free_aligned(f->lf.lr_lpf_line);
+        dav1d_free_aligned(f->lf.cdef_line[0][0][0]);
+        dav1d_free_aligned(f->lf.lr_lpf_line[0]);
     }
     dav1d_free_aligned(c->fc);
     dav1d_data_unref_internal(&c->in);
--- a/src/lr_apply_tmpl.c
+++ b/src/lr_apply_tmpl.c
@@ -122,7 +122,7 @@
         const int w = f->bw << 2;
         const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);
         const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
-        backup_lpf(f, f->lf.lr_lpf_line_ptr[0], lr_stride,
+        backup_lpf(f, f->lf.lr_lpf_line[0], lr_stride,
                    src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
                    0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0);
     }
@@ -137,12 +137,12 @@
             (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
 
         if (restore_planes & LR_RESTORE_U) {
-            backup_lpf(f, f->lf.lr_lpf_line_ptr[1], lr_stride,
+            backup_lpf(f, f->lf.lr_lpf_line[1], lr_stride,
                        src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
                        ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
         }
         if (restore_planes & LR_RESTORE_V) {
-            backup_lpf(f, f->lf.lr_lpf_line_ptr[2], lr_stride,
+            backup_lpf(f, f->lf.lr_lpf_line[2], lr_stride,
                        src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
                        ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
         }
@@ -158,7 +158,7 @@
     const int chroma = !!plane;
     const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
     const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM);
-    const pixel *lpf = f->lf.lr_lpf_line_ptr[plane] + x;
+    const pixel *lpf = f->lf.lr_lpf_line[plane] + x;
     const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
     const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) & ~31);
 
--- a/src/thread_task.c
+++ b/src/thread_task.c
@@ -44,7 +44,7 @@
         const int res = dav1d_decode_frame(f);
         if (res)
             memset(f->frame_thread.cf, 0,
-                   sizeof(int32_t) * 3 * f->lf.mask_sz * 128 * 128);
+                   (size_t)f->frame_thread.cf_sz * 128 * 128 / 2);
 
         pthread_mutex_lock(&f->frame_thread.td.lock);
         f->n_tile_data = 0;