ref: 33a8fa870cd187c911072a1b093765948f103028
parent: f216dba55792f772006ab4397b898a593ab492eb
author: Vitaly Buka <vitalybuka@chromium.org>
date: Wed Nov 13 06:45:53 EST 2019
Move buffer from extend_and_predict into TileWorkerData

This avoids unneeded initializations. extend_and_predict is called from
multiple nested loops; it allocates a large buffer on the stack but uses
only a portion of it. -ftrivial-auto-var-init= inserts initializations
that run on every iteration of those loops, causing a 258.5% regression
on webrtc_perf_tests decode_time/pc_vp9svc_3sl_low_alice-video.

Bug: 1020220, 977230
Change-Id: I7e5bb3c3780adab74dd8b5c8bd2a96bf45e0c231
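For illustration only, a minimal standalone sketch of the pattern this
change applies (WorkerCtx, fill_block, and SCRATCH_ELEMS are hypothetical
names, not part of the patch): the large array moves out of the hot
function into a per-worker context that is allocated once, so the
compiler-inserted zero-initialization no longer runs on every call.

    #include <stdint.h>
    #include <string.h>

    #define SCRATCH_ELEMS (80 * 2 * 80 * 2)

    /* Per-worker scratch space: allocated once and reused across calls,
     * analogous to extend_and_predict_buf in TileWorkerData. */
    typedef struct {
      uint16_t scratch[SCRATCH_ELEMS];
    } WorkerCtx;

    /* Hot function: touches only the b_w * b_h portion it needs. With
     * the buffer hoisted into the context, -ftrivial-auto-var-init= has
     * no local array left to zero on each call. */
    static void fill_block(WorkerCtx *ctx, int b_w, int b_h) {
      memset(ctx->scratch, 0x80, (size_t)b_w * b_h * sizeof(uint16_t));
    }

    int main(void) {
      static WorkerCtx ctx; /* lives for the worker's lifetime */
      for (int i = 0; i < 1000; ++i) fill_block(&ctx, 32, 32);
      return 0;
    }

Sharing one buffer per worker is safe here because each worker decodes
its rows serially, so no two calls use the scratch space concurrently.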
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -529,16 +529,15 @@
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_VP9_HIGHBITDEPTH
-static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
- int x0, int y0, int b_w, int b_h,
- int frame_width, int frame_height,
+static void extend_and_predict(TileWorkerData *twd, const uint8_t *buf_ptr1,
+ int pre_buf_stride, int x0, int y0, int b_w,
+ int b_h, int frame_width, int frame_height,
int border_offset, uint8_t *const dst,
int dst_buf_stride, int subpel_x, int subpel_y,
const InterpKernel *kernel,
const struct scale_factors *sf, MACROBLOCKD *xd,
int w, int h, int ref, int xs, int ys) {
- DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
-
+ uint16_t *mc_buf_high = twd->extend_and_predict_buf;
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w, x0, y0,
b_w, b_h, frame_width, frame_height);
@@ -554,15 +553,15 @@
}
}
#else
-static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
- int x0, int y0, int b_w, int b_h,
- int frame_width, int frame_height,
+static void extend_and_predict(TileWorkerData *twd, const uint8_t *buf_ptr1,
+ int pre_buf_stride, int x0, int y0, int b_w,
+ int b_h, int frame_width, int frame_height,
int border_offset, uint8_t *const dst,
int dst_buf_stride, int subpel_x, int subpel_y,
const InterpKernel *kernel,
const struct scale_factors *sf, int w, int h,
int ref, int xs, int ys) {
- DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
+ uint8_t *mc_buf = (uint8_t *)twd->extend_and_predict_buf;
const uint8_t *buf_ptr;
build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w, x0, y0, b_w, b_h,
@@ -575,8 +574,8 @@
#endif // CONFIG_VP9_HIGHBITDEPTH
static void dec_build_inter_predictors(
- MACROBLOCKD *xd, int plane, int bw, int bh, int x, int y, int w, int h,
- int mi_x, int mi_y, const InterpKernel *kernel,
+ TileWorkerData *twd, MACROBLOCKD *xd, int plane, int bw, int bh, int x,
+ int y, int w, int h, int mi_x, int mi_y, const InterpKernel *kernel,
const struct scale_factors *sf, struct buf_2d *pre_buf,
struct buf_2d *dst_buf, const MV *mv, RefCntBuffer *ref_frame_buf,
int is_scaled, int ref) {
@@ -687,9 +686,9 @@
const int b_h = y1 - y0 + 1;
const int border_offset = y_pad * 3 * b_w + x_pad * 3;
- extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h, frame_width,
- frame_height, border_offset, dst, dst_buf->stride,
- subpel_x, subpel_y, kernel, sf,
+ extend_and_predict(twd, buf_ptr1, buf_stride, x0, y0, b_w, b_h,
+ frame_width, frame_height, border_offset, dst,
+ dst_buf->stride, subpel_x, subpel_y, kernel, sf,
#if CONFIG_VP9_HIGHBITDEPTH
xd,
#endif
@@ -712,7 +711,8 @@
#endif // CONFIG_VP9_HIGHBITDEPTH
}
-static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
+static void dec_build_inter_predictors_sb(TileWorkerData *twd,
+ VP9Decoder *const pbi,
MACROBLOCKD *xd, int mi_row,
int mi_col) {
int plane;
@@ -755,10 +755,10 @@
for (y = 0; y < num_4x4_h; ++y) {
for (x = 0; x < num_4x4_w; ++x) {
const MV mv = average_split_mvs(pd, mi, ref, i++);
- dec_build_inter_predictors(xd, plane, n4w_x4, n4h_x4, 4 * x, 4 * y,
- 4, 4, mi_x, mi_y, kernel, sf, pre_buf,
- dst_buf, &mv, ref_frame_buf, is_scaled,
- ref);
+ dec_build_inter_predictors(twd, xd, plane, n4w_x4, n4h_x4, 4 * x,
+ 4 * y, 4, 4, mi_x, mi_y, kernel, sf,
+ pre_buf, dst_buf, &mv, ref_frame_buf,
+ is_scaled, ref);
}
}
}
@@ -772,7 +772,7 @@
const int n4w_x4 = 4 * num_4x4_w;
const int n4h_x4 = 4 * num_4x4_h;
struct buf_2d *const pre_buf = &pd->pre[ref];
- dec_build_inter_predictors(xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4,
+ dec_build_inter_predictors(twd, xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4,
n4h_x4, mi_x, mi_y, kernel, sf, pre_buf,
dst_buf, &mv, ref_frame_buf, is_scaled, ref);
}
@@ -964,7 +964,7 @@
}
} else {
// Prediction
- dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
+ dec_build_inter_predictors_sb(twd, pbi, xd, mi_row, mi_col);
#if CONFIG_MISMATCH_DEBUG
{
int plane;
@@ -1048,7 +1048,7 @@
predict_and_reconstruct_intra_block_row_mt);
} else {
// Prediction
- dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
+ dec_build_inter_predictors_sb(twd, pbi, xd, mi_row, mi_col);
// Reconstruction
if (!mi->skip) {
@@ -1905,6 +1905,7 @@
LFWorkerData *lf_data = thread_data->lf_data;
VP9LfSync *lf_sync = thread_data->lf_sync;
volatile int corrupted = 0;
+ TileWorkerData *volatile tile_data_recon = NULL;
while (!vp9_jobq_dequeue(&row_mt_worker_data->jobq, &job, sizeof(job), 1)) {
int mi_col;
@@ -1921,9 +1922,10 @@
} else if (job.job_type == RECON_JOB) {
const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
const int is_last_row = sb_rows - 1 == cur_sb_row;
- TileWorkerData twd_recon;
- TileWorkerData *const tile_data_recon = &twd_recon;
int mi_col_start, mi_col_end;
+ if (!tile_data_recon)
+ CHECK_MEM_ERROR(cm, tile_data_recon,
+ vpx_memalign(32, sizeof(TileWorkerData)));
tile_data_recon->xd = pbi->mb;
vp9_tile_init(&tile_data_recon->xd.tile, cm, 0, job.tile_col);
@@ -2006,6 +2008,7 @@
}
}
+ vpx_free(tile_data_recon);
return !corrupted;
}
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -55,6 +55,7 @@
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
/* dqcoeff are shared by all the planes. So planes must be decoded serially */
DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+ DECLARE_ALIGNED(16, uint16_t, extend_and_predict_buf[80 * 2 * 80 * 2]);
struct vpx_internal_error_info error_info;
} TileWorkerData;