shithub: libvpx

Download patch

ref: 4c7c15ee6996eea1c5911e2418554d5c64978c30
parent: 7bb2afa1da5327b66b054195fbb777edc0fe94fa
parent: e60478d46d9a692e2e7b90b35355660682bfe58b
author: Yunqing Wang <yunqingwang@google.com>
date: Fri Nov 16 07:23:06 EST 2012

Merge "Optimize 8x8 dequant and idct" into experimental

--- a/vp9/common/generic/systemdependent.c
+++ b/vp9/common/generic/systemdependent.c
@@ -29,10 +29,11 @@
   rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
   rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_c;
   rtcd->idct.idct8        = vp9_short_idct8x8_c;
+  rtcd->idct.idct10_8     = vp9_short_idct10_8x8_c;
   rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;
   rtcd->idct.ihaar2       = vp9_short_ihaar2x2_c;
   rtcd->idct.idct16x16    = vp9_short_idct16x16_c;
-  rtcd->idct.idct10_16x16    = vp9_short_idct10_16x16_c;
+  rtcd->idct.idct10_16x16 = vp9_short_idct10_16x16_c;
 
   rtcd->subpix.eighttap16x16       = vp9_eighttap_predict16x16_c;
   rtcd->subpix.eighttap8x8         = vp9_eighttap_predict8x8_c;
--- a/vp9/common/idct.h
+++ b/vp9/common/idct.h
@@ -60,6 +60,11 @@
 #endif
 extern prototype_idct(vp9_idct_idct8);
 
+#ifndef vp9_idct_idct10_8
+#define vp9_idct_idct10_8 vp9_short_idct10_8x8_c
+#endif
+extern prototype_idct(vp9_idct_idct10_8);
+
 #ifndef vp9_idct_idct8_1
 #define vp9_idct_idct8_1 vp9_short_idct8x8_1_c
 #endif
@@ -132,6 +137,7 @@
   vp9_second_order_fn_t iwalsh16;
 
   vp9_idct_fn_t            idct8;
+  vp9_idct_fn_t            idct10_8;
   vp9_idct_fn_t            idct8_1;
   vp9_idct_scalar_add_fn_t idct1_scalar_add_8x8;
   vp9_idct_fn_t ihaar2;
--- a/vp9/common/idctllm.c
+++ b/vp9/common/idctllm.c
@@ -967,6 +967,127 @@
   }
 }
 
+/* In-place row IDCT of blk[0..7] when only the first 4 coefficients are non-zero. */
+static void idctrow10(int *blk) {
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+  /* shortcut: blk[1..7] are all zero, so every output is just blk[0] << 3 */
+  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
+        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
+    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
+           = blk[5] = blk[6] = blk[7] = blk[0] << 3;
+    return;
+  }
+
+  x0 = (blk[0] << 11) + 128;    /* for proper rounding in the fourth stage */
+  /* first stage: uses only blk[1] (x4) and blk[3] (x7); the blk[5]/blk[7] loads are overwritten unused */
+  x5 = W7 * x4;
+  x4 = W1 * x4;
+  x6 = W3 * x7;
+  x7 = -W5 * x7;
+
+  /* second stage: uses only blk[2] (x3); the blk[4]/blk[6] loads (x1, x2) are overwritten unused */
+  x2 = W6 * x3;
+  x3 = W2 * x3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /* third stage: 181 / 256 approximates 1 / sqrt(2); + 128 rounds the >> 8 */
+  x7 = x0 + x3;
+  x8 = x0 - x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
+  /* fourth stage: combine the butterflies and scale each output down by >> 8 */
+  blk[0] = (x7 + x1) >> 8;
+  blk[1] = (x3 + x2) >> 8;
+  blk[2] = (x0 + x4) >> 8;
+  blk[3] = (x8 + x6) >> 8;
+  blk[4] = (x8 - x6) >> 8;
+  blk[5] = (x0 - x4) >> 8;
+  blk[6] = (x3 - x2) >> 8;
+  blk[7] = (x7 - x1) >> 8;
+}
+
+/* In-place column (vertical) IDCT, element stride 8, when only the first 4 coefficients are non-zero. */
+static void idctcol10(int *blk) {
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+  /* shortcut: rows 1..7 of this column are all zero, so every output is (blk[0] + 32) >> 6 */
+  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
+        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
+        (x7 = blk[8 * 3]))) {
+    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
+        = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
+        = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
+    return;
+  }
+
+  x0 = (blk[8 * 0] << 8) + 16384;
+
+  /* first stage: uses only rows 1 (x4) and 3 (x7); each product is rounded (+ 4) and scaled by >> 3 */
+  x5 = (W7 * x4 + 4) >> 3;
+  x4 = (W1 * x4 + 4) >> 3;
+  x6 = (W3 * x7 + 4) >> 3;
+  x7 = (-W5 * x7 + 4) >> 3;
+
+  /* second stage: uses only row 2 (x3); the row 4 and row 6 loads (x1, x2) are overwritten unused */
+  x2 = (W6 * x3 + 4) >> 3;
+  x3 = (W2 * x3 + 4) >> 3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /* third stage: 181 / 256 approximates 1 / sqrt(2); + 128 rounds the >> 8 */
+  x7 = x0 + x3;
+  x8 = x0 - x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
+  /* fourth stage: combine the butterflies and scale each output down by >> 14 */
+  blk[8 * 0] = (x7 + x1) >> 14;
+  blk[8 * 1] = (x3 + x2) >> 14;
+  blk[8 * 2] = (x0 + x4) >> 14;
+  blk[8 * 3] = (x8 + x6) >> 14;
+  blk[8 * 4] = (x8 - x6) >> 14;
+  blk[8 * 5] = (x0 - x4) >> 14;
+  blk[8 * 6] = (x3 - x2) >> 14;
+  blk[8 * 7] = (x7 - x1) >> 14;
+}
+/* 8x8 inverse DCT of coefs into block for inputs whose non-zero coefficients all lie in the upper-left 4x4 area; pitch is halved to index block in shorts. */
+void vp9_short_idct10_8x8_c(short *coefs, short *block, int pitch) {
+  int X[TX_DIM * TX_DIM];
+  int i, j;
+  int shortpitch = pitch >> 1;
+
+  for (i = 0; i < TX_DIM; i++) {
+    for (j = 0; j < TX_DIM; j++) {
+      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
+                                + (coefs[i * TX_DIM + j] < 0)) >> 2;
+    }
+  }
+
+  /* Run the row IDCT on the first 4 rows only: the non-zero DCT coefficients
+   *  are all in the upper-left 4x4 area, so rows 4..7 of X are left as copied. */
+  for (i = 0; i < 4; i++)
+    idctrow10(X + 8 * i);
+
+  for (i = 0; i < 8; i++)
+    idctcol10(X + i);
+
+  for (i = 0; i < TX_DIM; i++) {
+    for (j = 0; j < TX_DIM; j++) {
+      block[i * shortpitch + j]  = X[i * TX_DIM + j] >> 1;
+    }
+  }
+}
 
 void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
   int i;
--- a/vp9/common/rtcd_defs.sh
+++ b/vp9/common/rtcd_defs.sh
@@ -57,11 +57,8 @@
 prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, unsigned short eobs"
 specialize vp9_dequant_idct_add_16x16
 
-prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
+prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int dc, unsigned short eobs"
 specialize vp9_dequant_idct_add_8x8
-
-prototype void vp9_dequant_dc_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
-specialize vp9_dequant_dc_idct_add_8x8
 
 prototype void vp9_dequant_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
 specialize vp9_dequant_idct_add
--- a/vp9/decoder/decodframe.c
+++ b/vp9/decoder/decodframe.c
@@ -461,7 +461,8 @@
           vp9_ht_dequant_idct_add_8x8_c(tx_type,
                                         q, dq, pre, dst, 16, stride);
         } else {
-          vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+          vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0,
+                                     xd->eobs[idx]);
         }
         q += 64;
       } else {
--- a/vp9/decoder/dequantize.c
+++ b/vp9/decoder/dequantize.c
@@ -19,8 +19,8 @@
 extern int dec_debug;
 #endif
 
-static void recon(int16_t *diff, uint8_t *pred, int pitch, uint8_t *dest,
-                  int stride, int width, int height) {
+static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
+                         uint8_t *dest, int stride, int width, int height) {
   int r, c;
 
   for (r = 0; r < height; r++) {
@@ -41,12 +41,34 @@
   }
 }
 
+static void add_constant_residual(const int16_t diff, const uint8_t *pred,
+                                  int pitch, uint8_t *dest, int stride,
+                                  int width, int height) {
+  int r, c;
+  /* Write pred + diff, clamped to [0, 255], to every dest pixel of the width x height area. */
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      int a = diff + pred[c];
+
+      if (a < 0)
+        a = 0;
+      else if (a > 255)
+        a = 255;
+
+      dest[c] = (uint8_t) a;
+    }
+    /* Step to the next row: dest advances by stride, pred by pitch. */
+    dest += stride;
+    pred += pitch;
+  }
+}
+
 void vp9_dequantize_b_c(BLOCKD *d) {
 
   int i;
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
+  int16_t *DQ  = d->dqcoeff;
+  int16_t *Q   = d->qcoeff;
+  int16_t *DQC = d->dequant;
 
   for (i = 0; i < 16; i++) {
     DQ[i] = Q[i] * DQC[i];
@@ -54,11 +76,11 @@
 }
 
 
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
-                               unsigned char *pred, unsigned char *dest,
+void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, int16_t *dq,
+                               uint8_t *pred, uint8_t *dest,
                                int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
+  int16_t output[16];
+  int16_t *diff_ptr = output;
   int i;
 
   for (i = 0; i < 16; i++) {
@@ -69,18 +91,15 @@
 
   vpx_memset(input, 0, 32);
 
-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }
 
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
-                                   unsigned char *pred, unsigned char *dest,
+void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input, int16_t *dq,
+                                   uint8_t *pred, uint8_t *dest,
                                    int pitch, int stride) {
-  short output[64];
-  short *diff_ptr = output;
-  int b, r, c;
+  int16_t output[64];
+  int16_t *diff_ptr = output;
   int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;
 
   input[0] = dq[0] * input[0];
   for (i = 1; i < 64; i++) {
@@ -91,35 +110,13 @@
 
   vpx_memset(input, 0, 128);
 
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
-
-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
-    }
-    // shift buffer pointers to next 4x4 block in the submacroblock
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;
-  }
+  add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
 }
 
-void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
-                            unsigned char *dest, int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
+void vp9_dequant_idct_add_c(int16_t *input, int16_t *dq, uint8_t *pred,
+                            uint8_t *dest, int pitch, int stride) {
+  int16_t output[16];
+  int16_t *diff_ptr = output;
   int i;
 
   for (i = 0; i < 16; i++) {
@@ -131,17 +128,17 @@
 
   vpx_memset(input, 0, 32);
 
-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }
 
-void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
-                               unsigned char *dest, int pitch, int stride,
+void vp9_dequant_dc_idct_add_c(int16_t *input, int16_t *dq, uint8_t *pred,
+                               uint8_t *dest, int pitch, int stride,
                                int Dc) {
   int i;
-  short output[16];
-  short *diff_ptr = output;
+  int16_t output[16];
+  int16_t *diff_ptr = output;
 
-  input[0] = (short)Dc;
+  input[0] = (int16_t)Dc;
 
   for (i = 1; i < 16; i++) {
     input[i] = dq[i] * input[i];
@@ -152,15 +149,15 @@
 
   vpx_memset(input, 0, 32);
 
-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }
 
 #if CONFIG_LOSSLESS
-void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
+void vp9_dequant_idct_add_lossless_c(int16_t *input, int16_t *dq,
+                                     uint8_t *pred, uint8_t *dest,
                                      int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
+  int16_t output[16];
+  int16_t *diff_ptr = output;
   int i;
 
   for (i = 0; i < 16; i++) {
@@ -171,18 +168,18 @@
 
   vpx_memset(input, 0, 32);
 
-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }
 
-void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
-                                        unsigned char *pred,
-                                        unsigned char *dest,
+void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, int16_t *dq,
+                                        uint8_t *pred,
+                                        uint8_t *dest,
                                         int pitch, int stride, int dc) {
   int i;
-  short output[16];
-  short *diff_ptr = output;
+  int16_t output[16];
+  int16_t *diff_ptr = output;
 
-  input[0] = (short)dc;
+  input[0] = (int16_t)dc;
 
   for (i = 1; i < 16; i++) {
     input[i] = dq[i] * input[i];
@@ -191,18 +188,18 @@
   vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
   vpx_memset(input, 0, 32);
 
-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }
 #endif
 
 void vp9_dequantize_b_2x2_c(BLOCKD *d) {
   int i;
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
+  int16_t *DQ  = d->dqcoeff;
+  int16_t *Q   = d->qcoeff;
+  int16_t *DQC = d->dequant;
 
   for (i = 0; i < 16; i++) {
-    DQ[i] = (short)((Q[i] * DQC[i]));
+    DQ[i] = (int16_t)((Q[i] * DQC[i]));
   }
 #ifdef DEC_DEBUG
   if (dec_debug) {
@@ -216,14 +213,12 @@
 #endif
 }
 
-void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
-                                unsigned char *dest, int pitch, int stride) {
-  short output[64];
-  short *diff_ptr = output;
-  int r, c, b;
+void vp9_dequant_idct_add_8x8_c(int16_t *input, int16_t *dq, uint8_t *pred,
+                                uint8_t *dest, int pitch, int stride,
+                                int dc, uint16_t eobs) {
+  int16_t output[64];
+  int16_t *diff_ptr = output;
   int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;
 
 #ifdef DEC_DEBUG
   if (dec_debug) {
@@ -236,104 +231,60 @@
   }
 #endif
 
-  input[0] = input[0] * dq[0];
+  /* If dc is 1, input[0] already holds the reconstructed DC value and must
+   * not be dequantized again. In that case the DC counts toward eobs, so
+   * eobs >= 1. */
+  if (!dc)
+    input[0] *= dq[0];
 
-  // recover quantizer for 4 4x4 blocks
-  for (i = 1; i < 64; i++) {
-    input[i] = input[i] * dq[1];
-  }
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Input DQ 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
+  /* The calculation can be simplified if there are not many non-zero dct
+   * coefficients. Use eobs to decide what to do.
+   * TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+   * Combine that with code here.
+   */
+  if (eobs == 0) {
+    /* All 0 DCT coefficient */
+    vp9_copy_mem8x8(pred, pitch, dest, stride);
+  } else if (eobs == 1) {
+    /* DC only DCT coefficient. */
+    int16_t out;
 
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct8x8_c(input, output, 16);
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Output 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", output[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
+    /* Note: the idct1 will need to be modified accordingly whenever
+     * vp9_short_idct8x8_c() is modified. */
+    out = (input[0] + 1 + (input[0] < 0)) >> 2;
+    out = out << 3;
+    out = (out + 32) >> 7;
 
-  vpx_memset(input, 0, 128);// test what should i put here
+    input[0] = 0;
 
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
+    add_constant_residual(out, pred, pitch, dest, stride, 8, 8);
+  } else if (eobs <= 10) {
+    input[1] = input[1] * dq[1];
+    input[2] = input[2] * dq[1];
+    input[3] = input[3] * dq[1];
+    input[8] = input[8] * dq[1];
+    input[9] = input[9] * dq[1];
+    input[10] = input[10] * dq[1];
+    input[16] = input[16] * dq[1];
+    input[17] = input[17] * dq[1];
+    input[24] = input[24] * dq[1];
 
-        if (a < 0)
-          a = 0;
+    vp9_short_idct10_8x8_c(input, output, 16);
 
-        if (a > 255)
-          a = 255;
+    input[0] = input[1] = input[2] = input[3] = 0;
+    input[8] = input[9] = input[10] = 0;
+    input[16] = input[17] = 0;
+    input[24] = 0;
 
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
+    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
+  } else {
+    // recover quantizer for 4 4x4 blocks
+    for (i = 1; i < 64; i++) {
+      input[i] = input[i] * dq[1];
     }
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
-  }
 #ifdef DEC_DEBUG
   if (dec_debug) {
-    int k, j;
-    printf("Final 8x8\n");
-    for (j = 0; j < 8; j++) {
-      for (k = 0; k < 8; k++) {
-        printf("%d ", origdest[k]);
-      }
-      printf("\n");
-      origdest += stride;
-    }
-  }
-#endif
-}
-
-void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
-                                   unsigned char *dest, int pitch, int stride,
-                                   int Dc) { // Dc for 1st order T in some rear case
-  short output[64];
-  short *diff_ptr = output;
-  int r, c, b;
-  int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;
-
-  input[0] = (short)Dc;// Dc is the reconstructed value, do not need dequantization
-  // dc value is recovered after dequantization, since dc need not quantization
-#ifdef DEC_DEBUG
-  if (dec_debug) {
     int j;
-    printf("Input 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-  for (i = 1; i < 64; i++) {
-    input[i] = input[i] * dq[1];
-  }
-
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
     printf("Input DQ 8x8\n");
     for (j = 0; j < 64; j++) {
       printf("%d ", input[j]);
@@ -342,8 +293,8 @@
   }
 #endif
 
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct8x8_c(input, output, 16);
+    // the idct halves ( >> 1) the pitch
+    vp9_short_idct8x8_c(input, output, 16);
 #ifdef DEC_DEBUG
   if (dec_debug) {
     int j;
@@ -354,30 +305,11 @@
     }
   }
 #endif
-  vpx_memset(input, 0, 128);
 
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
+    vpx_memset(input, 0, 128);
 
-        if (a < 0)
-          a = 0;
+    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
 
-        if (a > 255)
-          a = 255;
-
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
-    }
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
-  }
 #ifdef DEC_DEBUG
   if (dec_debug) {
     int k, j;
@@ -391,13 +323,14 @@
     }
   }
 #endif
+  }
 }
 
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
+void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
+                                     int16_t *dq, uint8_t *pred, uint8_t *dest,
                                      int pitch, int stride) {
-  short output[256];
-  short *diff_ptr = output;
+  int16_t output[256];
+  int16_t *diff_ptr = output;
   int i;
 
   input[0]= input[0] * dq[0];
@@ -414,7 +347,7 @@
 
   vpx_memset(input, 0, 512);
 
-  recon(diff_ptr, pred, pitch, dest, stride, 16, 16);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
 }
 
 void vp9_dequant_idct_add_16x16_c(int16_t *input, int16_t *dq, uint8_t *pred,
@@ -422,7 +355,7 @@
                                   uint16_t eobs) {
   int16_t output[256];
   int16_t *diff_ptr = output;
-  int r, c, i;
+  int i;
 
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
@@ -433,6 +366,8 @@
     /* DC only DCT coefficient. */
     int16_t out;
 
+    /* Note: the idct1 will need to be modified accordingly whenever
+     * vp9_short_idct16x16_c() is modified. */
     out = (input[0] * dq[0] + 2) >> 2;
     out = (out + 2) >> 2;
     out = (out + 4) >> 3;
@@ -439,22 +374,7 @@
 
     input[0] = 0;
 
-    for (r = 0; r < 16; r++) {
-      for (c = 0; c < 16; c++) {
-        int a = out + pred[c];
-
-        if (a < 0)
-          a = 0;
-        else if (a > 255)
-          a = 255;
-
-        dest[c] = (uint8_t) a;
-      }
-
-      dest += stride;
-      pred += pitch;
-    }
-
+    add_constant_residual(out, pred, pitch, dest, stride, 16, 16);
   } else if (eobs <= 10) {
     input[0]= input[0] * dq[0];
     input[1] = input[1] * dq[1];
@@ -475,7 +395,7 @@
     input[32] = input[33] = 0;
     input[48] = 0;
 
-    recon(diff_ptr, pred, pitch, dest, stride, 16, 16);
+    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
   } else {
     input[0]= input[0] * dq[0];
 
@@ -488,6 +408,6 @@
 
     vpx_memset(input, 0, 512);
 
-    recon(diff_ptr, pred, pitch, dest, stride, 16, 16);
+    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
   }
 }
--- a/vp9/decoder/idct_blk.c
+++ b/vp9/decoder/idct_blk.c
@@ -177,12 +177,21 @@
                                            int stride, unsigned short *eobs,
                                            short *dc,
                                            MACROBLOCKD *xd) {
-  vp9_dequant_dc_idct_add_8x8_c(q, dq, pre, dst, 16, stride, dc[0]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, dc[1]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,
-                                dst + 8 * stride, 16, stride, dc[4]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,
-                                dst + 8 * stride + 8, 16, stride, dc[8]);
+  q[0] = dc[0];
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 1, xd->eobs[0]);
+
+  q[64] = dc[1];
+  vp9_dequant_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, 1,
+                             xd->eobs[4]);
+
+  q[128] = dc[4];
+  vp9_dequant_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,
+                                dst + 8 * stride, 16, stride, 1, xd->eobs[8]);
+
+  q[192] = dc[8];
+  vp9_dequant_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,
+                                dst + 8 * stride + 8, 16, stride, 1,
+                                xd->eobs[12]);
 }
 
 #if CONFIG_SUPERBLOCKS
@@ -191,13 +200,22 @@
                                                    int stride,
                                                    unsigned short *eobs,
                                                    short *dc, MACROBLOCKD *xd) {
-  vp9_dequant_dc_idct_add_8x8_c(q, dq, dst, dst, stride, stride, dc[0]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, dst + 8,
-                                dst + 8, stride, stride, dc[1]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
-                                dst + 8 * stride, stride, stride, dc[4]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
-                                dst + 8 * stride + 8, stride, stride, dc[8]);
+  q[0] = dc[0];
+  vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, 1, xd->eobs[0]);
+
+  q[64] = dc[1];
+  vp9_dequant_idct_add_8x8_c(&q[64], dq, dst + 8,
+                                dst + 8, stride, stride, 1, xd->eobs[4]);
+
+  q[128] = dc[4];
+  vp9_dequant_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
+                                dst + 8 * stride, stride, stride, 1,
+                                xd->eobs[8]);
+
+  q[192] = dc[8];
+  vp9_dequant_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
+                                dst + 8 * stride + 8, stride, stride, 1,
+                                xd->eobs[12]);
 }
 #endif
 
@@ -209,13 +227,14 @@
   unsigned char *origdest = dst;
   unsigned char *origpred = pre;
 
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0, xd->eobs[0]);
   vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
-                             origdest + 8, 16, stride);
+                             origdest + 8, 16, stride, 0, xd->eobs[4]);
   vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,
-                             origdest + 8 * stride, 16, stride);
+                             origdest + 8 * stride, 16, stride, 0, xd->eobs[8]);
   vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,
-                             origdest + 8 * stride + 8, 16, stride);
+                             origdest + 8 * stride + 8, 16, stride, 0,
+                             xd->eobs[12]);
 }
 
 void vp9_dequant_idct_add_uv_block_8x8_c(short *q, short *dq,
@@ -224,12 +243,12 @@
                                          unsigned char *dstv,
                                          int stride, unsigned short *eobs,
                                          MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride);
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, 0, xd->eobs[16]);
 
   q    += 64;
   pre  += 64;
 
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride);
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride, 0, xd->eobs[20]);
 }
 
 #if CONFIG_SUPERBLOCKS
@@ -239,11 +258,12 @@
                                                  int stride,
                                                  unsigned short *eobs,
                                                  MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride);
+  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride, 0,
+                             xd->eobs[16]);
 
-  q    += 64;
-
-  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride);
+  q += 64;
+  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride, 0,
+                             xd->eobs[20]);
 }
 #endif
 
--