ref: b54cdcc3de62390dc438a36425665a89958aefea
parent: e858863dda2e242ede57916dae4086a991f618dd
	author: Hui Su <huisu@google.com>
	date: Fri Jul 20 11:29:14 EDT 2018
	
Add prune_ref_frame_for_rect_partitions feature
Add a speed feature to prune reference frames for rectangular
partitions. Rectangular partition RD search happens after square
partition RD search. With this feature, we keep record of the ref
frames picked by square partitions, and only consider those ref
frames during rect partition RD search.
With this feature on, the computation cost of rect partition RD
search is greatly reduced, so we can afford to skip rect partition
RD search less aggressively.
Overall, both compression and encoding speed are improved. Only
speed 0 is affected.
Coding gains:
              lowres    midres    hdres
ovr psnr      0.00%    -0.36%    -0.37%
avg psnr      0.00%    -0.36%    -0.36%
Tested encoding speed with QP=40 on about 30 sequences.
Speed gains:
              lowres    midres    hdres
average       13.4%      7.1%     6.1%
max           28.0%     12.0%     9.8%
Change-Id: Id5f36dd2ac75028ae98550d67b0a524aa251b692
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -75,6 +75,8 @@
// Used for the machine learning-based early termination
int32_t sum_y_eobs;
+ // Skip certain ref frames during RD search of rectangular partitions.
+ uint8_t skip_ref_frame_mask;
} PICK_MODE_CONTEXT;
 typedef struct PC_TREE {--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2633,6 +2633,7 @@
ctx, INT64_MAX);
break;
case PARTITION_HORZ:
+ pc_tree->horizontal[0].skip_ref_frame_mask = 0;
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
subsize, &pc_tree->horizontal[0], INT64_MAX);
if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
@@ -2642,6 +2643,7 @@
vp9_rd_cost_init(&tmp_rdc);
update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+ pc_tree->horizontal[1].skip_ref_frame_mask = 0;
rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col,
&tmp_rdc, subsize, &pc_tree->horizontal[1], INT64_MAX);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {@@ -2654,6 +2656,7 @@
}
break;
case PARTITION_VERT:
+ pc_tree->vertical[0].skip_ref_frame_mask = 0;
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
subsize, &pc_tree->vertical[0], INT64_MAX);
if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
@@ -2663,6 +2666,7 @@
vp9_rd_cost_init(&tmp_rdc);
update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+ pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0;
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1),
&tmp_rdc, subsize,
&pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX);
@@ -3712,10 +3716,12 @@
int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_thr.dist;
int rate_breakout_thr = cpi->sf.partition_search_breakout_thr.rate;
int must_split = 0;
-
int partition_mul = cpi->sf.enable_tpl_model && cpi->oxcf.aq_mode == NO_AQ
? x->cb_rdmult
: cpi->rd.RDMULT;
+ // Ref frames picked in the [i_th] quarter subblock during square partition
+ // RD search. It may be used to prune ref frame selection of rect partitions.
+  uint8_t ref_frames_used[4] = { 0, 0, 0, 0 };(void)*tp_orig;
@@ -3846,6 +3852,14 @@
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx,
best_rdc.rdcost);
     if (this_rdc.rate != INT_MAX) {+      if (cpi->sf.prune_ref_frame_for_rect_partitions) {+ const int ref1 = ctx->mic.ref_frame[0];
+ const int ref2 = ctx->mic.ref_frame[1];
+        for (i = 0; i < 4; ++i) {+ ref_frames_used[i] |= (1 << ref1);
+ if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
+ }
+ }
       if (bsize >= BLOCK_8X8) {this_rdc.rdcost += RDCOST(partition_mul, x->rddiv,
cpi->partition_cost[pl][PARTITION_NONE], 0);
@@ -3970,8 +3984,18 @@
pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter;
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
pc_tree->leaf_split[0], best_rdc.rdcost);
-
- if (sum_rdc.rate == INT_MAX) sum_rdc.rdcost = INT64_MAX;
+      if (sum_rdc.rate == INT_MAX) {+ sum_rdc.rdcost = INT64_MAX;
+      } else {+        if (cpi->sf.prune_ref_frame_for_rect_partitions) {+ const int ref1 = pc_tree->leaf_split[0]->mic.ref_frame[0];
+ const int ref2 = pc_tree->leaf_split[0]->mic.ref_frame[1];
+          for (i = 0; i < 4; ++i) {+ ref_frames_used[i] |= (1 << ref1);
+ if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
+ }
+ }
+ }
     } else {for (i = 0; (i < 4) && ((sum_rdc.rdcost < best_rdc.rdcost) || must_split);
            ++i) {@@ -3999,6 +4023,13 @@
sum_rdc.rdcost = INT64_MAX;
break;
         } else {+ if (cpi->sf.prune_ref_frame_for_rect_partitions &&
+              pc_tree->split[i]->none.rate != INT_MAX) {+ const int ref1 = pc_tree->split[i]->none.mic.ref_frame[0];
+ const int ref2 = pc_tree->split[i]->none.mic.ref_frame[1];
+ ref_frames_used[i] |= (1 << ref1);
+ if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
+ }
sum_rdc.rate += this_rdc.rate;
sum_rdc.dist += this_rdc.dist;
sum_rdc.rdcost += this_rdc.rdcost;
@@ -4034,6 +4065,22 @@
do_rect &= !partition_none_allowed;
}
restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
+
+ pc_tree->horizontal[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontal[1].skip_ref_frame_mask = 0;
+ pc_tree->vertical[0].skip_ref_frame_mask = 0;
+ pc_tree->vertical[1].skip_ref_frame_mask = 0;
+  if (cpi->sf.prune_ref_frame_for_rect_partitions) {+ uint8_t used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[1];
+ if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2] | ref_frames_used[3];
+ if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[2];
+ if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1] | ref_frames_used[3];
+ if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames;
}
// PARTITION_HORZ
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -3073,6 +3073,8 @@
// lock mechanism involved with reads from
// tile_mode_map
const int mode_search_skip_flags = sf->mode_search_skip_flags;
+ const int is_rect_partition =
+ num_4x4_blocks_wide_lookup[bsize] != num_4x4_blocks_high_lookup[bsize];
int64_t mask_filter = 0;
int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
@@ -3223,6 +3225,13 @@
second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
vp9_zero(x->sum_y_eobs);
+
+    if (is_rect_partition) {+ if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue;
+ if (second_ref_frame > 0 &&
+ (ctx->skip_ref_frame_mask & (1 << second_ref_frame)))
+ continue;
+ }
// Look at the reference frame of the best mode so far and set the
// skip mask to look at a subset of the remaining modes.
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -70,11 +70,14 @@
// speed 0 features
sf->partition_search_breakout_thr.dist = (1 << 20);
sf->partition_search_breakout_thr.rate = 80;
+ sf->use_square_only_threshold = BLOCK_SIZES;
- // Currently, the machine-learning based partition search early termination
- // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0.
   if (is_480p_or_larger) {+ // Currently, the machine-learning based partition search early termination
+ // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0.
sf->ml_partition_search_early_termination = 1;
+  } else {+ sf->use_square_only_threshold = BLOCK_32X32;
}
   if (!is_1080p_or_larger) {@@ -92,6 +95,7 @@
   if (speed >= 1) {sf->ml_partition_search_early_termination = 0;
+ sf->use_square_only_threshold = BLOCK_4X4;
     if (is_720p_or_larger) {sf->disable_split_mask =
@@ -193,7 +197,7 @@
sf->allow_skip_recode = 1;
sf->less_rectangular_check = 1;
sf->use_square_partition_only = !frame_is_boosted(cpi);
- sf->use_square_only_threshold = BLOCK_16X16;
+ sf->prune_ref_frame_for_rect_partitions = 1;
   if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {sf->exhaustive_searches_thresh = (1 << 22);
@@ -210,6 +214,7 @@
   if (speed >= 1) {sf->enable_tpl_model = 0;
+ sf->prune_ref_frame_for_rect_partitions = 0;
     if (oxcf->pass == 2) {TWO_PASS *const twopass = &cpi->twopass;
if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) ||
@@ -226,10 +231,7 @@
sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5];
sf->allow_quant_coeff_opt = sf->optimize_coefficients;
sf->quant_opt_thresh = qopt_thresholds[(speed < 6) ? speed : 5];
-
- sf->use_square_only_threshold = BLOCK_4X4;
sf->less_rectangular_check = 1;
-
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
sf->mv.auto_mv_step_size = 1;
@@ -848,6 +850,7 @@
#else
sf->enable_tpl_model = 1;
#endif
+ sf->prune_ref_frame_for_rect_partitions = 0;
   for (i = 0; i < TX_SIZES; i++) {sf->intra_y_mode_mask[i] = INTRA_ALL;
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -319,6 +319,9 @@
int use_square_partition_only;
BLOCK_SIZE use_square_only_threshold;
+ // Prune reference frames for rectangular partitions.
+ int prune_ref_frame_for_rect_partitions;
+
// Sets min and max partition sizes for this 64x64 region based on the
// same 64x64 in last encoded frame, and the left and above neighbor.
AUTO_MIN_MAX_MODE auto_min_max_partition_size;
--
⑨