shithub: dav1d

ref: f64fdae55128ff1c2204f578ee26b6d577862b26
parent: 3d6479cee8170cbcc1b6c3cea7338e86b3594683
author: Henrik Gramner <gramner@twoorioles.com>
date: Fri May 17 21:36:35 EDT 2019

Optimize obmc blend

The last 1/4 of the mask is always zero, so we can skip some
calculations that don't change the output.
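
A minimal sketch (not part of the patch, reusing the blend_px() macro from
src/mc_tmpl.c) of why a zero mask entry is a no-op, which is what lets the
loops stop at 3/4 of the width/height:

    #include <assert.h>
    #include <stdint.h>

    /* same macro as in src/mc_tmpl.c */
    #define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)

    int main(void) {
        const uint8_t a = 123, b = 45;
        /* m == 0 reduces to (a * 64 + 32) >> 6 == a: dst is left
           unchanged, so iterations over the zero tail of
           dav1d_obmc_masks can be skipped entirely */
        assert(blend_px(a, b, 0) == a);
        return 0;
    }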

--- a/src/mc_tmpl.c
+++ b/src/mc_tmpl.c
@@ -635,10 +635,8 @@
 }
 
 #define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
-static NOINLINE void
-blend_internal_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
-                 const int w, int h, const uint8_t *mask,
-                 const ptrdiff_t mask_stride)
+static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                    const int w, int h, const uint8_t *mask)
 {
     do {
         for (int x = 0; x < w; x++) {
@@ -646,20 +644,21 @@
         }
         dst += PXSTRIDE(dst_stride);
         tmp += w;
-        mask += mask_stride;
+        mask += w;
     } while (--h);
 }
 
-static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
-                    const int w, const int h, const uint8_t *mask)
-{
-    blend_internal_c(dst, dst_stride, tmp, w, h, mask, w);
-}
-
 static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
-                      const int w, const int h)
+                      const int w, int h)
 {
-    blend_internal_c(dst, dst_stride, tmp, w, h, &dav1d_obmc_masks[w], 0);
+    const uint8_t *const mask = &dav1d_obmc_masks[w];
+    do {
+        for (int x = 0; x < (w * 3) >> 2; x++) {
+            dst[x] = blend_px(dst[x], tmp[x], mask[x]);
+        }
+        dst += PXSTRIDE(dst_stride);
+        tmp += w;
+    } while (--h);
 }
 
 static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
@@ -666,6 +665,7 @@
                       const int w, int h)
 {
     const uint8_t *mask = &dav1d_obmc_masks[h];
+    h = (h * 3) >> 2;
     do {
         const int m = *mask++;
         for (int x = 0; x < w; x++) {
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -644,7 +644,7 @@
             if (a_r->ref[0] > 0) {
                 const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]);
                 const int oh4 = imin(b_dim[1], 16) >> 1;
-                res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, oh4,
+                res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
                          t->bx + x, t->by, pl, a_r->mv[0],
                          &f->refp[a_r->ref[0] - 1], a_r->ref[0] - 1,
                          dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
--- a/src/tables.c
+++ b/src/tables.c
@@ -861,7 +861,7 @@
     }
 };
 
-const uint8_t ALIGN(dav1d_obmc_masks[64], 32) = {
+const uint8_t dav1d_obmc_masks[64] = {
     /* Unused */
      0,  0,
     /* 2 */
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -3837,7 +3837,10 @@
     movsxd               wq, dword [r5+wq*4]
     vpbroadcastd         m5, [base+pw_512]
     add                  wq, r5
-    lea               maskq, [base+obmc_masks+hq*4]
+    lea               maskq, [base+obmc_masks+hq*2]
+    lea                  hd, [hq*3]
+    shr                  hd, 2 ; h * 3/4
+    lea               maskq, [maskq+hq*2]
     neg                  hq
     jmp                  wq
 .w2:
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -44,8 +44,8 @@
             db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
             db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
             db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
-            db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0
 
+blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
 subpel_h_shuf4: db 0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
                 db 2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
 subpel_h_shufA: db 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
@@ -53,7 +53,6 @@
 subpel_h_shufC: db 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 bilin_h_shuf4:  db 1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
 bilin_h_shuf8:  db 1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
-blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
 
 pb_64:   times 16 db 64
 pw_8:    times 8 dw 8
@@ -3773,7 +3772,7 @@
     jg .w32
     RET
 
-cglobal blend_v, 3, 6, 8, dst, ds, tmp, w, h, mask
+cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
 %define base r5-blend_v_ssse3_table
     LEA                  r5, blend_v_ssse3_table
     tzcnt                wd, wm
@@ -3833,8 +3832,7 @@
     mova                 m2, [tmpq]; b
     BLEND_64M            m1, m2, m3, m3
     movq       [dstq+dsq*0], m0
-    punpckhqdq           m0, m0
-    movq       [dstq+dsq*1], m0
+    movhps     [dstq+dsq*1], m0
     add                tmpq, 16
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
@@ -3855,24 +3853,31 @@
     jg .w16_loop
     RET
 .w32:
-    mova                 m3, [maskq+64 ] ; obmc_masks_32[0] (64-m[0])
-    mova                 m4, [maskq+80 ] ; obmc_masks_32[1] (64-m[1])
-    mova                 m6, [maskq+96 ] ; obmc_masks_32[2] (64-m[2])
-    mova                 m7, [maskq+112] ; obmc_masks_32[3] (64-m[3])
+%if WIN64
+    mova            [rsp+8], xmm6
+%endif
+    mova                 m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
+    mova                 m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
+    mova                 m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
     ; 16 mask blend is provided for 64 pixels
 .w32_loop:
     mova                 m1, [dstq+16*0] ; a
     mova                 m2, [tmpq+16*0] ; b
     BLEND_64M            m1, m2, m3, m4
+    movq                 m1, [dstq+16*1] ; a
+    punpcklbw            m1, [tmpq+16*1] ; b
+    pmaddubsw            m1, m6
+    pmulhrsw             m1, m5
+    packuswb             m1, m1
     mova        [dstq+16*0], m0
-    mova                 m1, [dstq+16*1] ; a
-    mova                 m2, [tmpq+16*1] ; b
-    BLEND_64M            m1, m2, m6, m7
-    mova        [dstq+16*1], m0
+    movq        [dstq+16*1], m1
     add                tmpq, 32
     add                dstq, dsq
     dec                  hd
     jg .w32_loop
+%if WIN64
+    mova               xmm6, [rsp+8]
+%endif
     RET
 
 cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask
@@ -3890,7 +3895,10 @@
     movsxd               wq, dword [t0+wq*4]
     mova                 m5, [base+pw_512]
     add                  wq, t0
-    lea               maskq, [base+obmc_masks+hq*4]
+    lea               maskq, [base+obmc_masks+hq*2]
+    lea                  hd, [hq*3]
+    shr                  hd, 2 ; h * 3/4
+    lea               maskq, [maskq+hq*2]
     neg                  hq
     jmp                  wq
 .w2: