ref: 037d67f684683ffad22e38ab9a6381ccfedd813f
parent: 719fe0bc5f8af9cd27314e96a6eead2710666050
author: Angie Chiang <angiebird@google.com>
date: Fri Jul 12 12:15:55 EDT 2019
Use sdx8f in exhaustive_mesh_search_single_step

This speeds up non_greedy_mv by 4%.

Change-Id: I9288c88db56ea4201a7ec4493ca5c567d76af0f1
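For context: an sdx8f kernel computes the SAD of one source block against eight reference blocks at consecutive horizontal offsets. Unlike sdx4df, which takes an array of four independent reference pointers, sdx8f takes a single reference pointer and scans eight adjacent columns, which is exactly the access pattern of one row of an exhaustive mesh search. A minimal sketch of the semantics (the shipped vpx_sad16x16x8 etc. are SIMD kernels; sad_x8_ref is an illustrative name, not a libvpx function):

#include <stdint.h>
#include <stdlib.h>

// Reference semantics of an sdx8f kernel: SAD of a width x height source
// block against the refs starting at column offsets 0..7 of one row.
static void sad_x8_ref(const uint8_t *src, int src_stride,
                       const uint8_t *ref, int ref_stride,
                       int width, int height, uint32_t *sads) {
  int k, r, c;
  for (k = 0; k < 8; ++k) {
    uint32_t sad = 0;
    for (r = 0; r < height; ++r)
      for (c = 0; c < width; ++c)
        sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c + k]);
    sads[k] = sad;
  }
}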
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1513,6 +1513,7 @@
}
#if CONFIG_VP9_HIGHBITDEPTH
+// TODO(angiebird): make sdx8f available for highbitdepth if needed
#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
cpi->fn_ptr[BT].sdf = SDF; \
cpi->fn_ptr[BT].sdaf = SDAF; \
@@ -1519,7 +1520,8 @@
cpi->fn_ptr[BT].vf = VF; \
cpi->fn_ptr[BT].svf = SVF; \
cpi->fn_ptr[BT].svaf = SVAF; \
- cpi->fn_ptr[BT].sdx4df = SDX4DF;
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].sdx8f = NULL;
#define MAKE_BFP_SAD_WRAPPER(fnname) \
static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
@@ -2418,62 +2420,67 @@
CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff)));
cpi->source_var_thresh = 0;
cpi->frames_till_next_var_check = 0;
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
- cpi->fn_ptr[BT].sdf = SDF; \
- cpi->fn_ptr[BT].sdaf = SDAF; \
- cpi->fn_ptr[BT].vf = VF; \
- cpi->fn_ptr[BT].svf = SVF; \
- cpi->fn_ptr[BT].svaf = SVAF; \
- cpi->fn_ptr[BT].sdx4df = SDX4DF;
-
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX8F) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].sdx8f = SDX8F;
+ // TODO(angiebird): make sdx8f available for every block size
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16,
vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16,
- vpx_sad32x16x4d)
+ vpx_sad32x16x4d, NULL)
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32,
vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32,
- vpx_sad16x32x4d)
+ vpx_sad16x32x4d, NULL)
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32,
vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32,
- vpx_sad64x32x4d)
+ vpx_sad64x32x4d, NULL)
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64,
vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64,
- vpx_sad32x64x4d)
+ vpx_sad32x64x4d, NULL)
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32,
vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32,
- vpx_sad32x32x4d)
+ vpx_sad32x32x4d, NULL)
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64,
vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64,
- vpx_sad64x64x4d)
+ vpx_sad64x64x4d, NULL)
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16,
vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16,
- vpx_sad16x16x4d)
+ vpx_sad16x16x4d, vpx_sad16x16x8)
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8,
vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8,
- vpx_sad16x8x4d)
+ vpx_sad16x8x4d, vpx_sad16x8x8)
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16,
vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16,
- vpx_sad8x16x4d)
+ vpx_sad8x16x4d, vpx_sad8x16x8)
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8,
- vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d)
+ vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d,
+ vpx_sad8x8x8)
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4,
- vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d)
+ vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d,
+ NULL)
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8,
- vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d)
+ vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d,
+ NULL)
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4,
- vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d)
+ vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d,
+ vpx_sad4x4x8)
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);
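For illustration, the BFP invocation for BLOCK_16X16 above expands to plain assignments on the encoder's function-pointer table (a sketch of the preprocessor expansion):

cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
cpi->fn_ptr[BLOCK_16X16].sdaf = vpx_sad16x16_avg;
cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svaf = vpx_sub_pixel_avg_variance16x16;
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;

Block sizes that pass NULL (and every size under HIGHBD_BFP) leave sdx8f unset, so callers must NULL-check it before use, as the vp9_mcomp.c change below does.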
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1979,18 +1979,16 @@
end_row = VPXMIN(center_mv->row + range, mv_limits->row_max);
end_col = VPXMIN(center_mv->col + range, mv_limits->col_max);
for (r = start_row; r <= end_row; r += 1) {
- for (c = start_col; c <= end_col; c += 4) {
- // 4 sads in a single call if we are checking every location
- if (c + 3 <= end_col) {
- unsigned int sads[4];
- const uint8_t *addrs[4];
- for (i = 0; i < 4; ++i) {
- const MV mv = { r, c + i };
- addrs[i] = get_buf_from_mv(pre, &mv);
- }
- fn_ptr->sdx4df(src->buf, src->stride, addrs, pre->stride, sads);
+ c = start_col;
+ // sdx8f may not be available for some block sizes
+ if (fn_ptr->sdx8f) {
+ while (c + 7 <= end_col) {
+ unsigned int sads[8];
+ const MV mv = { r, c };
+ const uint8_t *buf = get_buf_from_mv(pre, &mv);
+ fn_ptr->sdx8f(src->buf, src->stride, buf, pre->stride, sads);
- for (i = 0; i < 4; ++i) {
+ for (i = 0; i < 8; ++i) {
int64_t sad = (int64_t)sads[i] << LOG2_PRECISION;
if (sad < best_sad) {
const MV mv = { r, c + i };
@@ -2002,23 +2000,45 @@
}
}
}
- } else {
- for (i = 0; i <= end_col - c; ++i) {
+ c += 8;
+ }
+ }
+ while (c + 3 <= end_col) {
+ unsigned int sads[4];
+ const uint8_t *addrs[4];
+ for (i = 0; i < 4; ++i) {
+ const MV mv = { r, c + i };
+ addrs[i] = get_buf_from_mv(pre, &mv);
+ }
+ fn_ptr->sdx4df(src->buf, src->stride, addrs, pre->stride, sads);
+
+ for (i = 0; i < 4; ++i) {
+ int64_t sad = (int64_t)sads[i] << LOG2_PRECISION;
+ if (sad < best_sad) {
const MV mv = { r, c + i };
- int64_t sad =
- (int64_t)fn_ptr->sdf(src->buf, src->stride,
- get_buf_from_mv(pre, &mv), pre->stride)
- << LOG2_PRECISION;
+ sad +=
+ lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
if (sad < best_sad) {
- sad += lambda *
- vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
- if (sad < best_sad) {
- best_sad = sad;
- *best_mv = mv;
- }
+ best_sad = sad;
+ *best_mv = mv;
}
}
}
+ c += 4;
+ }
+ while (c <= end_col) {
+ const MV mv = { r, c };
+ int64_t sad = (int64_t)fn_ptr->sdf(src->buf, src->stride,
+ get_buf_from_mv(pre, &mv), pre->stride)
+ << LOG2_PRECISION;
+ if (sad < best_sad) {
+ sad += lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ c += 1;
}
}
return best_sad;
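The restructured loop above is a strip-mining pattern: consume eight candidate columns per call while an 8-wide kernel is present, then four per call, then a scalar tail. A standalone sketch of the same control flow (eval8/eval4/eval1 are hypothetical stand-ins for the sdx8f/sdx4df/sdf paths):

// Hypothetical scoring kernels standing in for sdx8f/sdx4df/sdf.
void eval8(int r, int c); /* scores candidates (r,c)..(r,c+7) */
void eval4(int r, int c); /* scores candidates (r,c)..(r,c+3) */
void eval1(int r, int c); /* scores candidate (r,c) */

static void scan_row(int r, int start_col, int end_col, int have_x8) {
  int c = start_col;
  if (have_x8) { // 8-wide main loop, only when an sdx8f kernel exists
    while (c + 7 <= end_col) {
      eval8(r, c);
      c += 8;
    }
  }
  while (c + 3 <= end_col) { // 4-wide fallback
    eval4(r, c);
    c += 4;
  }
  while (c <= end_col) { // scalar tail
    eval1(r, c);
    c += 1;
  }
}

Note the two-stage comparison carried over from the old code: the penalty lambda * vp9_nb_mvs_inconsistency() is non-negative, so a candidate whose raw SAD already loses to best_sad cannot win after the penalty is added, and the code skips computing the penalty in that case.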
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -76,6 +76,7 @@
vpx_subpixvariance_fn_t svf;
vpx_subp_avg_variance_fn_t svaf;
vpx_sad_multi_d_fn_t sdx4df;
+ vpx_sad_multi_fn_t sdx8f;
} vp9_variance_fn_ptr_t;
#endif // CONFIG_VP9
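The new sdx8f member reuses the vpx_sad_multi_fn_t pointer type already declared in this header: one source pointer, one reference pointer, and an output array of SADs. From memory it is declared roughly as follows (parameter names indicative, not verified against this tree):

typedef void (*vpx_sad_multi_fn_t)(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   uint32_t *sad_array);

Contrast with vpx_sad_multi_d_fn_t (sdx4df), which takes an array of reference pointers; the single-pointer form is what lets the mesh search issue one call per eight consecutive columns.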