shithub: libvpx

--- a/vp9/encoder/vp9_encoder.c

+++ b/vp9/encoder/vp9_encoder.c

@@ -2361,6 +2361,17 @@

   if (cpi->sf.enable_tpl_model) {

     const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);

     const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);

+#if CONFIG_NON_GREEDY_MV

+    CHECK_MEM_ERROR(

+        cm, cpi->feature_score_loc_arr,

+        vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->feature_score_loc_arr)));

+    CHECK_MEM_ERROR(

+        cm, cpi->feature_score_loc_sort,

+        vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->feature_score_loc_sort)));

+    CHECK_MEM_ERROR(

+        cm, cpi->feature_score_loc_heap,

+        vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->feature_score_loc_heap)));

+#endif

     // TODO(jingning): Reduce the actual memory use for tpl model build up.

     for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {

       CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr,

@@ -2577,6 +2588,11 @@

   vp9_denoiser_free(&(cpi->denoiser));

 #endif

+#if CONFIG_NON_GREEDY_MV

+  vpx_free(cpi->feature_score_loc_arr);

+  vpx_free(cpi->feature_score_loc_sort);

+  vpx_free(cpi->feature_score_loc_heap);

+#endif

   for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {

     vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);

     cpi->tpl_stats[frame].is_valid = 0;

@@ -5484,6 +5500,13 @@

   int frame_idx;

   for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {

     TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];

+#if CONFIG_NON_GREEDY_MV

+    int rf_idx;

+    for (rf_idx = 0; rf_idx < 3; ++rf_idx) {

+      tpl_frame->mv_dist_sum[rf_idx] = 0;

+      tpl_frame->mv_cost_sum[rf_idx] = 0;

+    }

+#endif

     memset(tpl_frame->tpl_stats_ptr, 0,

            tpl_frame->height * tpl_frame->width *

                sizeof(*tpl_frame->tpl_stats_ptr));

@@ -5947,6 +5970,131 @@

   tpl_stats->mv.as_int = best_mv.as_int;

+#if CONFIG_NON_GREEDY_MV

+static int compare_feature_score(const void *a, const void *b) {

+  const FEATURE_SCORE_LOC *aa = *(FEATURE_SCORE_LOC *const *)a;

+  const FEATURE_SCORE_LOC *bb = *(FEATURE_SCORE_LOC *const *)b;

+  if (aa->feature_score < bb->feature_score) {

+    return 1;

+  } else if (aa->feature_score > bb->feature_score) {

+    return -1;

+  } else {

+    return 0;

+  }

+}

+static void do_motion_search(VP9_COMP *cpi, ThreadData *td, int frame_idx,

+                             YV12_BUFFER_CONFIG **ref_frame, BLOCK_SIZE bsize,

+                             int mi_row, int mi_col) {

+  VP9_COMMON *cm = &cpi->common;

+  MACROBLOCK *x = &td->mb;

+  MACROBLOCKD *xd = &x->e_mbd;

+  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];

+  TplDepStats *tpl_stats =

+      &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];

+  const int mb_y_offset =

+      mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;

+  int rf_idx;

+  set_mv_limits(cm, x, mi_row, mi_col);

+  for (rf_idx = 0; rf_idx < 3; ++rf_idx) {

+    if (ref_frame[rf_idx] == NULL) {

+      tpl_stats->ready[rf_idx] = 0;

+      continue;

+    } else {

+      tpl_stats->ready[rf_idx] = 1;

+    }

+    motion_compensated_prediction(

+        cpi, td, frame_idx, xd->cur_buf->y_buffer + mb_y_offset,

+        ref_frame[rf_idx]->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bsize,

+        mi_row, mi_col, tpl_stats, rf_idx);

+  }

+}

+#define CHANGE_MV_SEARCH_ORDER 1

+#define USE_PQSORT 1

+#if CHANGE_MV_SEARCH_ORDER

+#if USE_PQSORT

+static void max_heap_pop(FEATURE_SCORE_LOC **heap, int *size,

+                         FEATURE_SCORE_LOC **output) {

+  if (*size > 0) {

+    *output = heap[0];

+    --*size;

+    if (*size > 0) {

+      int p, l, r;

+      heap[0] = heap[*size];

+      p = 0;

+      l = 2 * p + 1;

+      r = 2 * p + 2;

+      while (l < *size) {

+        FEATURE_SCORE_LOC *tmp;

+        int c = l;

+        if (r < *size && heap[r]->feature_score > heap[l]->feature_score) {

+          c = r;

+        }

+        if (heap[p]->feature_score >= heap[c]->feature_score) {

+          break;

+        }

+        tmp = heap[p];

+        heap[p] = heap[c];

+        heap[c] = tmp;

+        p = c;

+        l = 2 * p + 1;

+        r = 2 * p + 2;

+      }

+    }

+  } else {

+    assert(0);

+  }

+}

+static void max_heap_push(FEATURE_SCORE_LOC **heap, int *size,

+                          FEATURE_SCORE_LOC *input) {

+  int c, p;

+  FEATURE_SCORE_LOC *tmp;

+  heap[*size] = input;

+  ++*size;

+  c = *size - 1;

+  p = c >> 1;

+  while (c > 0) {

+    if (heap[c]->feature_score > heap[p]->feature_score) {

+      tmp = heap[p];

+      heap[p] = heap[c];

+      heap[c] = tmp;

+      c = p;

+      p = c >> 1;

+    } else {

+      break;

+    }

+  }

+}

+static void add_nb_blocks_to_heap(VP9_COMP *cpi, const TplDepFrame *tpl_frame,

+                                  BLOCK_SIZE bsize, int mi_row, int mi_col,

+                                  int *heap_size) {

+  const int mi_unit = num_8x8_blocks_wide_lookup[bsize];

+  const int dirs[NB_MVS_NUM][2] = { { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 } };

+  int i;

+  for (i = 0; i < NB_MVS_NUM; ++i) {

+    int r = dirs[i][0] * mi_unit;

+    int c = dirs[i][1] * mi_unit;

+    if (mi_row + r >= 0 && mi_row + r < tpl_frame->mi_rows && mi_col + c >= 0 &&

+        mi_col + c < tpl_frame->mi_cols) {

+      FEATURE_SCORE_LOC *fs_loc =

+          &cpi->feature_score_loc_arr[(mi_row + r) * tpl_frame->stride +

+                                      (mi_col + c)];

+      if (fs_loc->visited == 0) {

+        max_heap_push(cpi->feature_score_loc_heap, heap_size, fs_loc);

+      }

+    }

+  }

+}

+#endif  // USE_PQSORT

+#endif  // CHANGE_MV_SEARCH_ORDER

+#endif  // CONFIG_NON_GREEDY_MV

 void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx,

                        BLOCK_SIZE bsize) {

   TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];

@@ -5979,7 +6127,15 @@

   int64_t recon_error, sse;

 #if CONFIG_NON_GREEDY_MV

   int rf_idx;

-#endif

+  int fs_loc_sort_size;

+#if CHANGE_MV_SEARCH_ORDER

+#if USE_PQSORT

+  int fs_loc_heap_size;

+#else

+  int i;

+#endif  // USE_PQSORT

+#endif  // CHANGE_MV_SEARCH_ORDER

+#endif  // CONFIG_NON_GREEDY_MV

   // Setup scaling factor

 #if CONFIG_VP9_HIGHBITDEPTH

@@ -6023,6 +6179,7 @@

 #if CONFIG_NON_GREEDY_MV

   tpl_frame->lambda = 250;

+  fs_loc_sort_size = 0;

   for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {

     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {

@@ -6032,41 +6189,55 @@

       const int bh = 4 << b_height_log2_lookup[bsize];

       TplDepStats *tpl_stats =

           &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];

+      FEATURE_SCORE_LOC *fs_loc =

+          &cpi->feature_score_loc_arr[mi_row * tpl_frame->stride + mi_col];

       tpl_stats->feature_score = get_feature_score(

           xd->cur_buf->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bw, bh);

+      fs_loc->visited = 0;

+      fs_loc->feature_score = tpl_stats->feature_score;

+      fs_loc->mi_row = mi_row;

+      fs_loc->mi_col = mi_col;

+      cpi->feature_score_loc_sort[fs_loc_sort_size] = fs_loc;

+      ++fs_loc_sort_size;

-  for (rf_idx = 0; rf_idx < 3; ++rf_idx) {

-    tpl_frame->mv_dist_sum[rf_idx] = 0;

-    tpl_frame->mv_cost_sum[rf_idx] = 0;

-  }

+  qsort(cpi->feature_score_loc_sort, fs_loc_sort_size,

+        sizeof(*cpi->feature_score_loc_sort), compare_feature_score);

+#if CHANGE_MV_SEARCH_ORDER

+#if !USE_PQSORT

+  for (i = 0; i < fs_loc_sort_size; ++i) {

+    FEATURE_SCORE_LOC *fs_loc = cpi->feature_score_loc_sort[i];

+    do_motion_search(cpi, td, frame_idx, ref_frame, bsize, fs_loc->mi_row,

+                     fs_loc->mi_col);

+  }

+#else   // !USE_PQSORT

+  fs_loc_heap_size = 0;

+  max_heap_push(cpi->feature_score_loc_heap, &fs_loc_heap_size,

+                cpi->feature_score_loc_sort[0]);

-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {

-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {

-      const int mb_y_offset =

-          mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;

-      TplDepStats *tpl_stats =

-          &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];

+  while (fs_loc_heap_size > 0) {

+    FEATURE_SCORE_LOC *fs_loc;

+    max_heap_pop(cpi->feature_score_loc_heap, &fs_loc_heap_size, &fs_loc);

-      set_mv_limits(cm, x, mi_row, mi_col);

+    fs_loc->visited = 1;

-      for (rf_idx = 0; rf_idx < 3; ++rf_idx) {

-        if (ref_frame[rf_idx] == NULL) {

-          tpl_stats->ready[rf_idx] = 0;

-          continue;

-        } else {

-          tpl_stats->ready[rf_idx] = 1;

-        }

-        motion_compensated_prediction(

-            cpi, td, frame_idx, xd->cur_buf->y_buffer + mb_y_offset,

-            ref_frame[rf_idx]->y_buffer + mb_y_offset, xd->cur_buf->y_stride,

-            bsize, mi_row, mi_col, tpl_stats, rf_idx);

-      }

+    do_motion_search(cpi, td, frame_idx, ref_frame, bsize, fs_loc->mi_row,

+                     fs_loc->mi_col);

+    add_nb_blocks_to_heap(cpi, tpl_frame, bsize, fs_loc->mi_row, fs_loc->mi_col,

+                          &fs_loc_heap_size);

+  }

+#endif  // !USE_PQSORT

+#else   // CHANGE_MV_SEARCH_ORDER

+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {

+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {

+      do_motion_search(cpi, td, frame_idx, ref_frame, bsize, mi_row, mi_col);

-#endif

+#endif  // CHANGE_MV_SEARCH_ORDER

+#endif  // CONFIG_NON_GREEDY_MV

   for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {

     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {

       mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame,

--- a/vp9/encoder/vp9_encoder.h

+++ b/vp9/encoder/vp9_encoder.h

@@ -503,6 +503,14 @@

 // Maximum operating frame buffer size needed for a GOP using ARF reference.

 #define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS)

+#if CONFIG_NON_GREEDY_MV

+typedef struct FEATURE_SCORE_LOC {

+  int visited;

+  double feature_score;

+  int mi_row;

+  int mi_col;

+} FEATURE_SCORE_LOC;

+#endif

 typedef struct VP9_COMP {

   QUANTS quants;

@@ -530,6 +538,11 @@

   TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE];

   YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES];

   EncFrameBuf enc_frame_buf[REF_FRAMES];

+#if CONFIG_NON_GREEDY_MV

+  FEATURE_SCORE_LOC *feature_score_loc_arr;

+  FEATURE_SCORE_LOC **feature_score_loc_sort;

+  FEATURE_SCORE_LOC **feature_score_loc_heap;

+#endif

   TileDataEnc *tile_data;

   int allocated_tiles;  // Keep track of memory allocated for tiles.

--

⑨