ref: 20c2892693b3be46cb53b8c9262d41d7a61999aa
parent: 2fac50fa0ee099c632d97a941a9871e7cecbe720
	author: Jerome Jiang <jianj@google.com>
	date: Tue Mar 14 09:45:20 EDT 2017
	
vp9: Enable adaptive_rd_threshold for row mt for realtime speed 8. Change it to a row-based array to avoid the slowdown caused by sync. row-mt on, speed 8, 2 threads: ~4% speedup for VGA on ARM, benefiting from adaptive_rd_threshold. Change-Id: I887e65a53af20a6c4f48d293daaee09dab3512cf
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -4285,7 +4285,7 @@
int i, j;
         for (i = 0; i < BLOCK_SIZES; ++i) {           for (j = 0; j < MAX_MODES; ++j) {- tile_data->thresh_freq_fact[i][j] = 32;
+ tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT;
tile_data->mode_map[i][j] = j;
}
}
@@ -4292,6 +4292,7 @@
#if CONFIG_MULTITHREAD
tile_data->search_count_mutex = NULL;
tile_data->enc_row_mt_mutex = NULL;
+ tile_data->row_base_thresh_freq_fact = NULL;
#endif
}
}
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -284,6 +284,9 @@
int ex_search_count;
FIRSTPASS_DATA fp_data;
VP9RowMTSync row_mt_sync;
+
+ // Used for adaptive_rd_thresh with row multithreading
+ int *row_base_thresh_freq_fact;
#if CONFIG_MULTITHREAD
pthread_mutex_t *search_count_mutex;
pthread_mutex_t *enc_row_mt_mutex;
--- a/vp9/encoder/vp9_multi_thread.c
+++ b/vp9/encoder/vp9_multi_thread.c
@@ -82,6 +82,16 @@
   for (tile_col = 0; tile_col < tile_cols; tile_col++) {TileDataEnc *this_tile = &cpi->tile_data[tile_col];
vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col);
+    if (cpi->sf.adaptive_rd_thresh_row_mt) {+ const int sb_rows =
+ (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1;
+ int i;
+ this_tile->row_base_thresh_freq_fact =
+ (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES,
+ sizeof(*(this_tile->row_base_thresh_freq_fact)));
+ for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++)
+ this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT;
+ }
}
// Assign the sync pointer of tile row zero for every tile row > 0
@@ -154,10 +164,15 @@
TileDataEnc *this_tile =
&cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols +
tile_col];
+      if (cpi->sf.adaptive_rd_thresh_row_mt) {+        if (this_tile->row_base_thresh_freq_fact != NULL) {+ vpx_free(this_tile->row_base_thresh_freq_fact);
+ this_tile->row_base_thresh_freq_fact = NULL;
+ }
+ }
pthread_mutex_destroy(this_tile->search_count_mutex);
vpx_free(this_tile->search_count_mutex);
this_tile->search_count_mutex = NULL;
-
pthread_mutex_destroy(this_tile->enc_row_mt_mutex);
vpx_free(this_tile->enc_row_mt_mutex);
this_tile->enc_row_mt_mutex = NULL;
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1016,6 +1016,32 @@
}
}
+// Row-mt variant of the RD threshold test. Unlike rd_less_than_thresh() it
+// takes no mutex argument and reads *thresh_fact directly.
+// Returns nonzero when best_rd is below (thresh * *thresh_fact) >> 5, or when
+// thresh is INT_MAX; callers skip the candidate mode on a nonzero result.
+static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh,
+                                             const int *const thresh_fact) {
+  // Widen before multiplying so the product is computed in 64 bits.
+  const int64_t scaled_thresh = ((int64_t)thresh * (*thresh_fact)) >> 5;
+  return best_rd < scaled_thresh || thresh == INT_MAX;
+}
+
+// Adapts one entry of tile_data->row_base_thresh_freq_fact for the given
+// (ref_frame, mode) pair.
+//   thresh_freq_fact_idx: base offset into the row-based array; the mode's
+//                         THR_MODES index is added to it to select the entry.
+//   best_mode_idx:        THR_MODES index of the mode that was chosen.
+// If this mode is the chosen one, its factor shrinks by 1/16 of its value.
+// Otherwise it grows by RD_THRESH_INC up to a cap: 32 for NEWMV on LAST_FRAME
+// with source_variance < 5 when sf.limit_newmv_early_exit is set, and
+// sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT in every other case.
+static INLINE void update_thresh_freq_fact_row_mt(
+    VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance,
+    int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame,
+    THR_MODES best_mode_idx, PREDICTION_MODE mode) {
+  const THR_MODES this_thr_mode = mode_idx[ref_frame][mode_offset(mode)];
+  int *const fact = tile_data->row_base_thresh_freq_fact +
+                    (thresh_freq_fact_idx + this_thr_mode);
+  int cap;
+
+  if (this_thr_mode == best_mode_idx) {
+    *fact -= (*fact >> 4);
+    return;
+  }
+
+  if (cpi->sf.limit_newmv_early_exit && mode == NEWMV &&
+      ref_frame == LAST_FRAME && source_variance < 5)
+    cap = 32;
+  else
+    cap = cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
+
+  *fact = VPXMIN(*fact + RD_THRESH_INC, cap);
+}
+
static INLINE void update_thresh_freq_fact(
VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance,
BLOCK_SIZE bsize, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx,
@@ -1398,7 +1424,13 @@
int64_t inter_mode_thresh =
RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0);
const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize];
- const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+ const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+ int thresh_freq_fact_idx = (sb_row * BLOCK_SIZES + bsize) * MAX_MODES;
+ const int *const rd_thresh_freq_fact =
+ (cpi->sf.adaptive_rd_thresh_row_mt)
+ ? &(tile_data->row_base_thresh_freq_fact[thresh_freq_fact_idx])
+ : tile_data->thresh_freq_fact[bsize];
+
INTERP_FILTER filter_ref;
const int bsl = mi_width_log2_lookup[bsize];
const int pred_filter_search =
@@ -1687,14 +1719,19 @@
cpi->rc.frames_since_golden > 4)
mode_rd_thresh = mode_rd_thresh << 3;
- if (rd_less_than_thresh(
- best_rdc.rdcost, mode_rd_thresh,
+ if ((cpi->sf.adaptive_rd_thresh_row_mt &&
+ rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh,
+ &rd_thresh_freq_fact[mode_index])) ||
+ (!cpi->sf.adaptive_rd_thresh_row_mt &&
+ rd_less_than_thresh(
+ best_rdc.rdcost, mode_rd_thresh,
#if CONFIG_MULTITHREAD
- // Synchronization of this function is only necessary when
- // adaptive_rd_thresh is > 0.
- cpi->sf.adaptive_rd_thresh ? tile_data->enc_row_mt_mutex : NULL,
+ // Synchronization of this function
+ // is only necessary when
+ // adaptive_rd_thresh is > 0.
+ cpi->sf.adaptive_rd_thresh ? tile_data->enc_row_mt_mutex : NULL,
#endif
- &rd_thresh_freq_fact[mode_index]))
+ &rd_thresh_freq_fact[mode_index])))
continue;
     if (this_mode == NEWMV) {@@ -2053,14 +2090,19 @@
if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize]))
continue;
- if (rd_less_than_thresh(
- best_rdc.rdcost, mode_rd_thresh,
+ if ((cpi->sf.adaptive_rd_thresh_row_mt &&
+ rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh,
+ &rd_thresh_freq_fact[mode_index])) ||
+ (!cpi->sf.adaptive_rd_thresh_row_mt &&
+ rd_less_than_thresh(
+ best_rdc.rdcost, mode_rd_thresh,
#if CONFIG_MULTITHREAD
- // Synchronization of this function is only necessary when
- // adaptive_rd_thresh is > 0.
- cpi->sf.adaptive_rd_thresh ? tile_data->enc_row_mt_mutex : NULL,
+ // Synchronization of this function
+ // is only necessary when
+ // adaptive_rd_thresh is > 0.
+ cpi->sf.adaptive_rd_thresh ? tile_data->enc_row_mt_mutex : NULL,
#endif
- &rd_thresh_freq_fact[mode_index]))
+ &rd_thresh_freq_fact[mode_index])))
continue;
mi->mode = this_mode;
@@ -2168,8 +2210,14 @@
// TODO(yunqingwang): Check intra mode mask and only update freq_fact
// for those valid modes.
       for (i = 0; i < intra_modes; i++) {- update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
- INTRA_FRAME, best_mode_idx, intra_mode_list[i]);
+ if (cpi->sf.adaptive_rd_thresh_row_mt)
+ update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance,
+ thresh_freq_fact_idx, INTRA_FRAME,
+ best_mode_idx, intra_mode_list[i]);
+ else
+ update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
+ INTRA_FRAME, best_mode_idx,
+ intra_mode_list[i]);
}
     } else {       for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {@@ -2176,8 +2224,13 @@
PREDICTION_MODE this_mode;
if (best_ref_frame != ref_frame) continue;
         for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {- update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
- ref_frame, best_mode_idx, this_mode);
+ if (cpi->sf.adaptive_rd_thresh_row_mt)
+ update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance,
+ thresh_freq_fact_idx, ref_frame,
+ best_mode_idx, this_mode);
+ else
+ update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
+ ref_frame, best_mode_idx, this_mode);
}
}
}
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -38,6 +38,7 @@
#define MAX_MODES 30
#define MAX_REFS 6
+#define RD_THRESH_INIT_FACT 32
#define RD_THRESH_MAX_FACT 64
#define RD_THRESH_INC 1
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -323,6 +323,7 @@
const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
sf->static_segmentation = 0;
sf->adaptive_rd_thresh = 1;
+ sf->adaptive_rd_thresh_row_mt = 0;
sf->use_fast_coef_costing = 1;
sf->allow_exhaustive_searches = 0;
sf->exhaustive_searches_thresh = INT_MAX;
@@ -551,6 +552,9 @@
}
}
+ if (cpi->row_mt && cpi->oxcf.max_threads > 1)
+ sf->adaptive_rd_thresh_row_mt = 1;
+
sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2;
if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
// Only keep INTRA_DC mode for speed 8.
@@ -578,11 +582,10 @@
sf->limit_newmv_early_exit = 0;
if (cm->width > 640 && cm->height > 480) sf->use_simple_block_yrd = 1;
}
- // Turn off adaptive_rd_thresh if row_mt is on for all the non-rd paths. This
- // causes too many locks in realtime mode in certain platforms (Android ARM,
- // Mac).
-  if (speed >= 5 && cpi->row_mt && cpi->num_workers > 1) {+ // Turn off adaptive_rd_thresh if row_mt is on for speed 5, 6, 7.
+  if (speed >= 5 && speed < 8 && cpi->row_mt && cpi->num_workers > 1) {sf->adaptive_rd_thresh = 0;
+ sf->adaptive_rd_thresh_row_mt = 0;
}
}
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -233,6 +233,9 @@
// mode to be evaluated. A high value means we will be faster.
int adaptive_rd_thresh;
+ // Flag to use adaptive_rd_thresh when row-mt is enabled.
+ int adaptive_rd_thresh_row_mt;
+
// Enables skipping the reconstruction step (idct, recon) in the
// intermediate steps assuming the last frame didn't have too many intra
// blocks and the q is less than a threshold.
--
⑨