ref: 65f118d72fa9045320d8a1e321f0da955a2d8e9a
parent: ac468dde468c4f6f0a48ec14d63341d4677a5d1f
	author: Dmitry Kovalev <dkovalev@google.com>
	date: Fri Oct 11 14:27:12 EDT 2013
	
Make the input pointer of every inverse transform const. Also rename dest_stride to stride in some places.

Change-Id: I75f602b623a5a7071d4922b747c45fa0b7d7a940
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -21,7 +21,7 @@
 extern "C" {#include "vp9/common/vp9_entropy.h"
#include "./vp9_rtcd.h"
-void vp9_idct16x16_256_add_c(int16_t *input, uint8_t *output, int pitch);
+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch);
}
#include "vpx/vpx_integer.h"
@@ -258,9 +258,10 @@
}
typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
-typedef void (*idct_t)(int16_t *in, uint8_t *dst, int stride);
+typedef void (*idct_t)(const int16_t *in, uint8_t *dst, int stride);
typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
-typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type);
+typedef void (*iht_t) (const int16_t *in, uint8_t *dst, int stride,
+ int tx_type);
 void fdct16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {vp9_short_fdct16x16_c(in, out, stride);
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -75,7 +75,7 @@
}
typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride);
-typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride);
+typedef void (*inv_txfm_t)(const int16_t *in, uint8_t *dst, int stride);
 class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {public:
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -21,7 +21,7 @@
 extern "C" {#include "vp9/common/vp9_entropy.h"
#include "./vp9_rtcd.h"
-void vp9_idct8x8_64_add_c(int16_t *input, uint8_t *output, int pitch);
+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *output, int pitch);
}
#include "vpx/vpx_integer.h"
@@ -29,9 +29,10 @@
 namespace {typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
-typedef void (*idct_t)(int16_t *in, uint8_t *dst, int stride);
+typedef void (*idct_t)(const int16_t *in, uint8_t *dst, int stride);
typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
-typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type);
+typedef void (*iht_t) (const int16_t *in, uint8_t *dst, int stride,
+ int tx_type);
 void fdct8x8_ref(int16_t *in, int16_t *out, int stride, int tx_type) {vp9_short_fdct8x8_c(in, out, stride);
--- a/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -11,31 +11,31 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
-extern void vp9_idct16x16_256_add_neon_pass1(int16_t *input,
- int16_t *output,
- int output_stride);
-extern void vp9_idct16x16_256_add_neon_pass2(int16_t *src,
- int16_t *output,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride);
-extern void vp9_idct16x16_10_add_neon_pass1(int16_t *input,
- int16_t *output,
- int output_stride);
-extern void vp9_idct16x16_10_add_neon_pass2(int16_t *src,
- int16_t *output,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride);
+void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
+ int16_t *output,
+ int output_stride);
+void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
+ int16_t *output,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride);
+void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
+ int16_t *output,
+ int output_stride);
+void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
+ int16_t *output,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride);
/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
extern void vp9_push_neon(int64_t *store);
extern void vp9_pop_neon(int64_t *store);
-void vp9_idct16x16_256_add_neon(int16_t *input,
-                                  uint8_t *dest, int dest_stride) {+void vp9_idct16x16_256_add_neon(const int16_t *input,
+                                uint8_t *dest, int dest_stride) {int64_t store_reg[8];
   int16_t pass1_output[16*16] = {0};   int16_t row_idct_output[16*16] = {0};@@ -109,8 +109,8 @@
return;
}
-void vp9_idct16x16_10_add_neon(int16_t *input,
-                                  uint8_t *dest, int dest_stride) {+void vp9_idct16x16_10_add_neon(const int16_t *input,
+                               uint8_t *dest, int dest_stride) {int64_t store_reg[8];
   int16_t pass1_output[16*16] = {0};   int16_t row_idct_output[16*16] = {0};--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -221,7 +221,7 @@
int lossless;
/* Inverse transform function pointers. */
- void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
+ void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
struct subpix_fn_table subpix;
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,13 +18,13 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
-void vp9_iwht4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
0.5 shifts per pixel. */
int i;
int16_t output[16];
int a1, b1, c1, d1, e1;
- int16_t *ip = input;
+ const int16_t *ip = input;
int16_t *op = output;
   for (i = 0; i < 4; i++) {@@ -60,10 +60,10 @@
c1 = e1 - c1;
a1 -= b1;
d1 += c1;
- dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
- dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1);
- dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1);
- dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1);
+ dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
+ dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
+ dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
+ dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
ip++;
dest++;
@@ -70,11 +70,11 @@
}
}
-void vp9_iwht4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {+void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {int i;
int a1, e1;
int16_t tmp[4];
- int16_t *ip = in;
+ const int16_t *ip = in;
int16_t *op = tmp;
a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -116,7 +116,7 @@
output[3] = step[0] - step[3];
}
-void vp9_idct4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {int16_t out[4 * 4];
int16_t *outptr = out;
int i, j;
@@ -135,12 +135,12 @@
temp_in[j] = out[j * 4 + i];
idct4_1d(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * stride + i]);
}
}
-void vp9_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {int i;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -201,7 +201,7 @@
output[7] = step1[0] - step1[7];
}
-void vp9_idct8x8_64_add_c(int16_t *input, uint8_t *dest, int dest_stride) {+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {int16_t out[8 * 8];
int16_t *outptr = out;
int i, j;
@@ -220,12 +220,12 @@
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * stride + i]);
}
}
-void vp9_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {+void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {int i, j;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -234,7 +234,7 @@
   for (j = 0; j < 8; ++j) {for (i = 0; i < 8; ++i)
dest[i] = clip_pixel(dest[i] + a1);
- dest += dest_stride;
+ dest += stride;
}
}
@@ -280,8 +280,8 @@
output[3] = dct_const_round_shift(s3);
}
-void vp9_iht4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
-                            int tx_type) {+void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
+                         int tx_type) {   const transform_2d IHT_4[] = {     { idct4_1d, idct4_1d  },  // DCT_DCT  = 0     { iadst4_1d, idct4_1d  },   // ADST_DCT = 1@@ -307,8 +307,8 @@
temp_in[j] = out[j * 4 + i];
IHT_4[tx_type].cols(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * stride + i]);
}
}
 static void iadst8_1d(const int16_t *input, int16_t *output) {@@ -395,8 +395,8 @@
   { iadst8_1d, iadst8_1d }   // ADST_ADST = 3};
-void vp9_iht8x8_64_add_c(int16_t *input, uint8_t *dest, int dest_stride,
-                            int tx_type) {+void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
+                         int tx_type) {int i, j;
int16_t out[8 * 8];
int16_t *outptr = out;
@@ -416,12 +416,12 @@
temp_in[j] = out[j * 8 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
- + dest[j * dest_stride + i]); }
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * stride + i]);
+ }
}
-void vp9_idct8x8_10_add_c(int16_t *input, uint8_t *dest,
-                                int dest_stride) {+void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {   int16_t out[8 * 8] = { 0 };int16_t *outptr = out;
int i, j;
@@ -441,8 +441,8 @@
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * stride + i]);
}
}
@@ -611,7 +611,7 @@
output[15] = step2[0] - step2[15];
}
-void vp9_idct16x16_256_add_c(int16_t *input, uint8_t *dest, int dest_stride) {+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {int16_t out[16 * 16];
int16_t *outptr = out;
int i, j;
@@ -630,8 +630,8 @@
temp_in[j] = out[j * 16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
}
}
@@ -813,8 +813,8 @@
   { iadst16_1d, iadst16_1d }   // ADST_ADST = 3};
-void vp9_iht16x16_256_add_c(int16_t *input, uint8_t *dest, int dest_stride,
-                              int tx_type) {+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
+                            int tx_type) {int i, j;
int16_t out[16 * 16];
int16_t *outptr = out;
@@ -834,12 +834,11 @@
temp_in[j] = out[j * 16 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]); }
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]); }
}
-void vp9_idct16x16_10_add_c(int16_t *input, uint8_t *dest,
-                                  int dest_stride) {+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {   int16_t out[16 * 16] = { 0 };int16_t *outptr = out;
int i, j;
@@ -859,13 +858,12 @@
temp_in[j] = out[j*16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
}
}
-void vp9_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
-                                 int dest_stride) {+void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {int i, j;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -874,7 +872,7 @@
   for (j = 0; j < 16; ++j) {for (i = 0; i < 16; ++i)
dest[i] = clip_pixel(dest[i] + a1);
- dest += dest_stride;
+ dest += stride;
}
}
@@ -1245,7 +1243,7 @@
output[31] = step1[0] - step1[31];
}
-void vp9_idct32x32_1024_add_c(int16_t *input, uint8_t *dest, int dest_stride) {+void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {int16_t out[32 * 32];
int16_t *outptr = out;
int i, j;
@@ -1277,13 +1275,12 @@
temp_in[j] = out[j * 32 + i];
idct32_1d(temp_in, temp_out);
for (j = 0; j < 32; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
}
}
-void vp9_idct32x32_1_add_c(int16_t *input, uint8_t *dest,
-                                 int dest_stride) {+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {int i, j;
int a1;
@@ -1294,12 +1291,12 @@
   for (j = 0; j < 32; ++j) {for (i = 0; i < 32; ++i)
dest[i] = clip_pixel(dest[i] + a1);
- dest += dest_stride;
+ dest += stride;
}
}
// idct
-void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) {+void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {if (eob > 1)
vp9_idct4x4_16_add(input, dest, stride);
else
@@ -1307,7 +1304,7 @@
}
-void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) {+void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {if (eob > 1)
vp9_iwht4x4_16_add(input, dest, stride);
else
@@ -1314,7 +1311,7 @@
vp9_iwht4x4_1_add(input, dest, stride);
}
-void vp9_idct8x8_add(int16_t *input, uint8_t *dest, int stride, int eob) {+void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {// If dc is 1, then input[0] is the reconstructed value, do not need
// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
@@ -1333,7 +1330,8 @@
}
}
-void vp9_idct16x16_add(int16_t *input, uint8_t *dest, int stride, int eob) {+void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+                       int eob) {/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */
   if (eob) {@@ -1347,7 +1345,8 @@
}
}
-void vp9_idct32x32_add(int16_t *input, uint8_t *dest, int stride, int eob) {+void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+                       int eob) {   if (eob) {if (eob == 1)
vp9_idct32x32_1_add(input, dest, stride);
@@ -1357,8 +1356,8 @@
}
// iht
-void vp9_iht4x4_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
-                   int eob) {+void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                    int stride, int eob) {if (tx_type == DCT_DCT)
vp9_idct4x4_add(input, dest, stride, eob);
else
@@ -1365,8 +1364,8 @@
vp9_iht4x4_16_add(input, dest, stride, tx_type);
}
-void vp9_iht8x8_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                       int stride, int eob) {+void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                    int stride, int eob) {   if (tx_type == DCT_DCT) {vp9_idct8x8_add(input, dest, stride, eob);
   } else {@@ -1376,8 +1375,8 @@
}
}
-void vp9_iht16x16_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                         int stride, int eob) {+void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                      int stride, int eob) {   if (tx_type == DCT_DCT) {vp9_idct16x16_add(input, dest, stride, eob);
   } else {--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -87,18 +87,20 @@
transform_1d cols, rows; // vertical and horizontal
} transform_2d;
-void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct8x8_add(int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct16x16_add(int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct32x32_add(int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, int
+ eob);
+void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+ int eob);
-void vp9_iht4x4_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
+void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
int stride, int eob);
-void vp9_iht8x8_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
+void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
int stride, int eob);
-void vp9_iht16x16_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
+void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
int stride, int eob);
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -267,51 +267,51 @@
#
# dct
#
-prototype void vp9_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_idct4x4_1_add sse2 neon
-prototype void vp9_idct4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_idct4x4_16_add sse2 neon
-prototype void vp9_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_idct8x8_1_add sse2 neon
-prototype void vp9_idct8x8_64_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_idct8x8_64_add sse2 neon
-prototype void vp9_idct8x8_10_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_idct8x8_10_add sse2 neon
-prototype void vp9_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_idct16x16_1_add sse2 neon
-prototype void vp9_idct16x16_256_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_idct16x16_256_add sse2 neon
-prototype void vp9_idct16x16_10_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_idct16x16_10_add sse2 neon
-prototype void vp9_idct32x32_1024_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_idct32x32_1024_add sse2 neon
-prototype void vp9_idct32x32_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_idct32x32_1_add sse2
-prototype void vp9_iht4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_iht4x4_16_add sse2 neon
-prototype void vp9_iht8x8_64_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_iht8x8_64_add sse2 neon
-prototype void vp9_iht16x16_256_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
+prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type"
specialize vp9_iht16x16_256_add sse2
# dct and add
-prototype void vp9_iwht4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_iwht4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_iwht4x4_1_add
-prototype void vp9_iwht4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_iwht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_iwht4x4_16_add
#
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -15,7 +15,7 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
-void vp9_idct4x4_16_add_sse2(int16_t *input, uint8_t *dest, int stride) {+void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -26,10 +26,10 @@
__m128i input0, input1, input2, input3;
// Rows
- input0 = _mm_loadl_epi64((__m128i *)input);
- input1 = _mm_loadl_epi64((__m128i *)(input + 4));
- input2 = _mm_loadl_epi64((__m128i *)(input + 8));
- input3 = _mm_loadl_epi64((__m128i *)(input + 12));
+ input0 = _mm_loadl_epi64((const __m128i *)input);
+ input1 = _mm_loadl_epi64((const __m128i *)(input + 4));
+ input2 = _mm_loadl_epi64((const __m128i *)(input + 8));
+ input3 = _mm_loadl_epi64((const __m128i *)(input + 12));
// Construct i3, i1, i3, i1, i2, i0, i2, i0
input0 = _mm_shufflelo_epi16(input0, 0xd8);
@@ -148,7 +148,7 @@
RECON_AND_STORE4X4(dest, input3);
}
-void vp9_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a;
@@ -264,16 +264,16 @@
in[3] = _mm_unpackhi_epi64(in[1], in[1]);
}
-void vp9_iht4x4_16_add_sse2(int16_t *input, uint8_t *dest, int stride,
-                               int tx_type) {+void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+                            int tx_type) {__m128i in[4];
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
- in[0] = _mm_loadl_epi64((__m128i *)input);
- in[1] = _mm_loadl_epi64((__m128i *)(input + 4));
- in[2] = _mm_loadl_epi64((__m128i *)(input + 8));
- in[3] = _mm_loadl_epi64((__m128i *)(input + 12));
+ in[0] = _mm_loadl_epi64((const __m128i *)input);
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));
   switch (tx_type) {case 0: // DCT_DCT
@@ -494,7 +494,7 @@
dest += stride; \
}
-void vp9_idct8x8_64_add_sse2(int16_t *input, uint8_t *dest, int stride) {+void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -514,14 +514,14 @@
int i;
// Load input data.
- in0 = _mm_load_si128((__m128i *)input);
- in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
- in4 = _mm_load_si128((__m128i *)(input + 8 * 4));
- in5 = _mm_load_si128((__m128i *)(input + 8 * 5));
- in6 = _mm_load_si128((__m128i *)(input + 8 * 6));
- in7 = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
// 2-D
   for (i = 0; i < 2; i++) {@@ -562,7 +562,7 @@
RECON_AND_STORE(dest, in7);
}
-void vp9_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {+void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a;
@@ -883,21 +883,21 @@
}
-void vp9_iht8x8_64_add_sse2(int16_t *input, uint8_t *dest, int stride,
-                               int tx_type) {+void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+                            int tx_type) {__m128i in[8];
const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1<<4);
// load input data
- in[0] = _mm_load_si128((__m128i *)input);
- in[1] = _mm_load_si128((__m128i *)(input + 8 * 1));
- in[2] = _mm_load_si128((__m128i *)(input + 8 * 2));
- in[3] = _mm_load_si128((__m128i *)(input + 8 * 3));
- in[4] = _mm_load_si128((__m128i *)(input + 8 * 4));
- in[5] = _mm_load_si128((__m128i *)(input + 8 * 5));
- in[6] = _mm_load_si128((__m128i *)(input + 8 * 6));
- in[7] = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
   switch (tx_type) {case 0: // DCT_DCT
@@ -950,7 +950,7 @@
RECON_AND_STORE(dest, in[7]);
}
-void vp9_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) {+void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -970,10 +970,10 @@
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// Rows. Load 4-row input data.
- in0 = _mm_load_si128((__m128i *)input);
- in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
// 8x4 Transpose
TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
@@ -1228,7 +1228,8 @@
stp2_10, stp2_13, stp2_11, stp2_12) \
}
-void vp9_idct16x16_256_add_sse2(int16_t *input, uint8_t *dest, int stride) {+void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
+                                int stride) {const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
@@ -1283,22 +1284,22 @@
if (i == 1) input += 128;
// Load input data.
- in0 = _mm_load_si128((__m128i *)input);
- in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
- in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
- in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
- in4 = _mm_load_si128((__m128i *)(input + 8 * 8));
- in12 = _mm_load_si128((__m128i *)(input + 8 * 9));
- in5 = _mm_load_si128((__m128i *)(input + 8 * 10));
- in13 = _mm_load_si128((__m128i *)(input + 8 * 11));
- in6 = _mm_load_si128((__m128i *)(input + 8 * 12));
- in14 = _mm_load_si128((__m128i *)(input + 8 * 13));
- in7 = _mm_load_si128((__m128i *)(input + 8 * 14));
- in15 = _mm_load_si128((__m128i *)(input + 8 * 15));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+ in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
+ in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
+ in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
+ in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
+ in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
+ in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
+ in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
+ in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in4, in5, in6, in7);
@@ -1435,7 +1436,7 @@
}
}
-void vp9_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {+void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a, i;
@@ -2310,24 +2311,24 @@
iadst16_1d_8col(in1);
}
-static INLINE void load_buffer_8x16(int16_t *input, __m128i *in) {- in[0] = _mm_load_si128((__m128i *)(input + 0 * 16));
- in[1] = _mm_load_si128((__m128i *)(input + 1 * 16));
- in[2] = _mm_load_si128((__m128i *)(input + 2 * 16));
- in[3] = _mm_load_si128((__m128i *)(input + 3 * 16));
- in[4] = _mm_load_si128((__m128i *)(input + 4 * 16));
- in[5] = _mm_load_si128((__m128i *)(input + 5 * 16));
- in[6] = _mm_load_si128((__m128i *)(input + 6 * 16));
- in[7] = _mm_load_si128((__m128i *)(input + 7 * 16));
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
- in[8] = _mm_load_si128((__m128i *)(input + 8 * 16));
- in[9] = _mm_load_si128((__m128i *)(input + 9 * 16));
- in[10] = _mm_load_si128((__m128i *)(input + 10 * 16));
- in[11] = _mm_load_si128((__m128i *)(input + 11 * 16));
- in[12] = _mm_load_si128((__m128i *)(input + 12 * 16));
- in[13] = _mm_load_si128((__m128i *)(input + 13 * 16));
- in[14] = _mm_load_si128((__m128i *)(input + 14 * 16));
- in[15] = _mm_load_si128((__m128i *)(input + 15 * 16));
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
+ in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
+ in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
+ in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
+ in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
+ in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
+ in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
+ in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
}
 static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {@@ -2386,8 +2387,8 @@
RECON_AND_STORE(dest, in[15]);
}
-void vp9_iht16x16_256_add_sse2(int16_t *input, uint8_t *dest, int stride,
-                                 int tx_type) {+void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+                               int tx_type) {__m128i in0[16], in1[16];
load_buffer_8x16(input, in0);
@@ -2421,8 +2422,8 @@
write_buffer_8x16(dest, in1, stride);
}
-void vp9_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest,
-                                     int stride) {+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
+                               int stride) {const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
@@ -2468,14 +2469,14 @@
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
// 1-D idct. Load input data.
- in0 = _mm_load_si128((__m128i *)input);
- in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
- in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
- in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
@@ -2780,11 +2781,12 @@
#define LOAD_DQCOEFF(reg, input) \
   {  \- reg = _mm_load_si128((__m128i *) input); \
+ reg = _mm_load_si128((const __m128i *) input); \
input += 8; \
} \
-void vp9_idct32x32_1024_add_sse2(int16_t *input, uint8_t *dest, int stride) {+void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
+                                 int stride) {const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
@@ -3515,7 +3517,7 @@
}
} //NOLINT
-void vp9_idct32x32_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a, i;
--
⑨