ref: f64fdae55128ff1c2204f578ee26b6d577862b26
parent: 3d6479cee8170cbcc1b6c3cea7338e86b3594683
author: Henrik Gramner <gramner@twoorioles.com>
date: Fri May 17 21:36:35 EDT 2019
Optimize obmc blend

The last 1/4 of the mask is always zero, so we can skip some
calculations that don't change the output.
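For reference, a minimal standalone sketch of the idea (an illustration,
not dav1d's code: it uses plain uint8_t pixels instead of the
bitdepth-templated pixel type, and takes the mask directly rather than
indexing dav1d_obmc_masks). Since blend_px(a, b, 0) == a, blend_v can
stop each row after (w * 3) >> 2 columns and blend_h can stop after
(h * 3) >> 2 rows without changing the output:

    #include <stddef.h>
    #include <stdint.h>

    #define blend_px(a, b, m) (((a) * (64 - (m)) + (b) * (m) + 32) >> 6)

    /* Vertical overlap: the mask varies per column; its last quarter is
     * zero, so each row only needs its first w * 3/4 pixels blended. */
    static void blend_v_sketch(uint8_t *dst, const ptrdiff_t dst_stride,
                               const uint8_t *tmp, const int w, int h,
                               const uint8_t *mask)
    {
        do {
            for (int x = 0; x < (w * 3) >> 2; x++)
                dst[x] = blend_px(dst[x], tmp[x], mask[x]);
            dst += dst_stride;
            tmp += w;
        } while (--h);
    }

    /* Horizontal overlap: the mask varies per row; the last quarter of
     * the rows have a zero mask and can be skipped entirely. */
    static void blend_h_sketch(uint8_t *dst, const ptrdiff_t dst_stride,
                               const uint8_t *tmp, const int w, int h,
                               const uint8_t *mask)
    {
        h = (h * 3) >> 2;
        do {
            const int m = *mask++;
            for (int x = 0; x < w; x++)
                dst[x] = blend_px(dst[x], tmp[x], m);
            dst += dst_stride;
            tmp += w;
        } while (--h);
    }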
--- a/src/mc_tmpl.c
+++ b/src/mc_tmpl.c
@@ -635,10 +635,8 @@
}
#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
-static NOINLINE void
-blend_internal_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
- const int w, int h, const uint8_t *mask,
- const ptrdiff_t mask_stride)
+static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+ const int w, int h, const uint8_t *mask)
{
do {
for (int x = 0; x < w; x++) {
@@ -646,20 +644,21 @@
}
dst += PXSTRIDE(dst_stride);
tmp += w;
- mask += mask_stride;
+ mask += w;
} while (--h);
}
-static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
- const int w, const int h, const uint8_t *mask)
-{
- blend_internal_c(dst, dst_stride, tmp, w, h, mask, w);
-}
-
static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
- const int w, const int h)
+ const int w, int h)
{
- blend_internal_c(dst, dst_stride, tmp, w, h, &dav1d_obmc_masks[w], 0);
+ const uint8_t *const mask = &dav1d_obmc_masks[w];
+ do {
+ for (int x = 0; x < (w * 3) >> 2; x++) {
+ dst[x] = blend_px(dst[x], tmp[x], mask[x]);
+ }
+ dst += PXSTRIDE(dst_stride);
+ tmp += w;
+ } while (--h);
}
static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
@@ -666,6 +665,7 @@
const int w, int h)
{
const uint8_t *mask = &dav1d_obmc_masks[h];
+ h = (h * 3) >> 2;
do {
const int m = *mask++;
for (int x = 0; x < w; x++) {
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -644,7 +644,7 @@
if (a_r->ref[0] > 0) {
const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]);
const int oh4 = imin(b_dim[1], 16) >> 1;
- res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, oh4,
+ res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
t->bx + x, t->by, pl, a_r->mv[0],
&f->refp[a_r->ref[0] - 1], a_r->ref[0] - 1,
dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
--- a/src/tables.c
+++ b/src/tables.c
@@ -861,7 +861,7 @@
}
};
-const uint8_t ALIGN(dav1d_obmc_masks[64], 32) = {
+const uint8_t dav1d_obmc_masks[64] = {
/* Unused */
0, 0,
/* 2 */
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -3837,7 +3837,10 @@
movsxd wq, dword [r5+wq*4]
vpbroadcastd m5, [base+pw_512]
add wq, r5
- lea maskq, [base+obmc_masks+hq*4]
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
neg hq
jmp wq
.w2:
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -44,8 +44,8 @@
db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
- db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
@@ -53,7 +53,6 @@
subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
-blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
pb_64: times 16 db 64
pw_8: times 8 dw 8
@@ -3773,7 +3772,7 @@
jg .w32
RET
-cglobal blend_v, 3, 6, 8, dst, ds, tmp, w, h, mask
+cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_v_ssse3_table
LEA r5, blend_v_ssse3_table
tzcnt wd, wm
@@ -3833,8 +3832,7 @@
mova m2, [tmpq]; b
BLEND_64M m1, m2, m3, m3
movq [dstq+dsq*0], m0
- punpckhqdq m0, m0
- movq [dstq+dsq*1], m0
+ movhps [dstq+dsq*1], m0
add tmpq, 16
lea dstq, [dstq+dsq*2]
sub hd, 2
@@ -3855,24 +3853,31 @@
jg .w16_loop
RET
.w32:
- mova m3, [maskq+64 ] ; obmc_masks_32[0] (64-m[0])
- mova m4, [maskq+80 ] ; obmc_masks_32[1] (64-m[1])
- mova m6, [maskq+96 ] ; obmc_masks_32[2] (64-m[2])
- mova m7, [maskq+112] ; obmc_masks_32[3] (64-m[3])
+%if WIN64
+ mova [rsp+8], xmm6
+%endif
+ mova m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
+ mova m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
+ mova m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
; 16 mask blend is provided for 64 pixels
.w32_loop:
mova m1, [dstq+16*0] ; a
mova m2, [tmpq+16*0] ; b
BLEND_64M m1, m2, m3, m4
+ movq m1, [dstq+16*1] ; a
+ punpcklbw m1, [tmpq+16*1] ; b
+ pmaddubsw m1, m6
+ pmulhrsw m1, m5
+ packuswb m1, m1
mova [dstq+16*0], m0
- mova m1, [dstq+16*1] ; a
- mova m2, [tmpq+16*1] ; b
- BLEND_64M m1, m2, m6, m7
- mova [dstq+16*1], m0
+ movq [dstq+16*1], m1
add tmpq, 32
add dstq, dsq
dec hd
jg .w32_loop
+%if WIN64
+ mova xmm6, [rsp+8]
+%endif
RET
cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask
@@ -3890,7 +3895,10 @@
movsxd wq, dword [t0+wq*4]
mova m5, [base+pw_512]
add wq, t0
- lea maskq, [base+obmc_masks+hq*4]
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
neg hq
jmp wq
.w2: