shithub: libvpx

--- a/vpx_dsp/arm/quantize_neon.c

+++ b/vpx_dsp/arm/quantize_neon.c

@@ -20,12 +20,12 @@

                          const int16_t *round_ptr, const int16_t *quant_ptr,

                          const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,

                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,

-                         uint16_t *eob_ptr, const int16_t *scan_ptr,

-                         const int16_t *iscan_ptr) {

+                         uint16_t *eob_ptr, const int16_t *scan,

+                         const int16_t *iscan) {

   const int16x8_t one = vdupq_n_s16(1);

   const int16x8_t neg_one = vdupq_n_s16(-1);

   uint16x8_t eob_max;

-  (void)scan_ptr;

+  (void)scan;

   (void)skip_block;

   assert(!skip_block);

@@ -38,8 +38,8 @@

     const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);

     const int16x8_t dequant = vld1q_s16(dequant_ptr);

     // Add one because the eob does not index from 0.

-    const uint16x8_t iscan =

-        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));

+    const uint16x8_t v_iscan =

+        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));

     const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);

     const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);

@@ -65,10 +65,10 @@

     qcoeff = vandq_s16(qcoeff, zbin_mask);

     // Set non-zero elements to -1 and use that to extract values for eob.

-    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);

+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);

     coeff_ptr += 8;

-    iscan_ptr += 8;

+    iscan += 8;

     store_s16q_to_tran_low(qcoeff_ptr, qcoeff);

     qcoeff_ptr += 8;

@@ -90,8 +90,8 @@

     do {

       // Add one because the eob is not its index.

-      const uint16x8_t iscan =

-          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));

+      const uint16x8_t v_iscan =

+          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));

       const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);

       const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);

@@ -118,10 +118,10 @@

       // Set non-zero elements to -1 and use that to extract values for eob.

       eob_max =

-          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));

+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));

       coeff_ptr += 8;

-      iscan_ptr += 8;

+      iscan += 8;

       store_s16q_to_tran_low(qcoeff_ptr, qcoeff);

       qcoeff_ptr += 8;

@@ -150,17 +150,19 @@

 // Main difference is that zbin values are halved before comparison and dqcoeff

 // values are divided by 2. zbin is rounded but dqcoeff is not.

-void vpx_quantize_b_32x32_neon(

-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,

-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,

-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,

-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,

-    const int16_t *scan_ptr, const int16_t *iscan_ptr) {

+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

+                               int skip_block, const int16_t *zbin_ptr,

+                               const int16_t *round_ptr,

+                               const int16_t *quant_ptr,

+                               const int16_t *quant_shift_ptr,

+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,

+                               const int16_t *scan, const int16_t *iscan) {

   const int16x8_t one = vdupq_n_s16(1);

   const int16x8_t neg_one = vdupq_n_s16(-1);

   uint16x8_t eob_max;

   int i;

-  (void)scan_ptr;

+  (void)scan;

   (void)n_coeffs;  // Because we will always calculate 32*32.

   (void)skip_block;

   assert(!skip_block);

@@ -174,8 +176,8 @@

     const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);

     const int16x8_t dequant = vld1q_s16(dequant_ptr);

     // Add one because the eob does not index from 0.

-    const uint16x8_t iscan =

-        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));

+    const uint16x8_t v_iscan =

+        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));

     const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);

     const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);

@@ -203,10 +205,10 @@

     qcoeff = vandq_s16(qcoeff, zbin_mask);

     // Set non-zero elements to -1 and use that to extract values for eob.

-    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);

+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);

     coeff_ptr += 8;

-    iscan_ptr += 8;

+    iscan += 8;

     store_s16q_to_tran_low(qcoeff_ptr, qcoeff);

     qcoeff_ptr += 8;

@@ -234,8 +236,8 @@

     for (i = 1; i < 32 * 32 / 8; ++i) {

       // Add one because the eob is not its index.

-      const uint16x8_t iscan =

-          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));

+      const uint16x8_t v_iscan =

+          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));

       const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);

       const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);

@@ -264,10 +266,10 @@

       // Set non-zero elements to -1 and use that to extract values for eob.

       eob_max =

-          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));

+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));

       coeff_ptr += 8;

-      iscan_ptr += 8;

+      iscan += 8;

       store_s16q_to_tran_low(qcoeff_ptr, qcoeff);

       qcoeff_ptr += 8;

--- a/vpx_dsp/arm/sad4d_neon.c

+++ b/vpx_dsp/arm/sad4d_neon.c

@@ -28,24 +28,25 @@

   return vreinterpret_u8_u32(aa);

-static INLINE void sad4x_4d(const uint8_t *const src, const int src_stride,

-                            const uint8_t *const ref[4], const int ref_stride,

-                            const int height, uint32_t *const res) {

+static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,

+                            const uint8_t *const ref_array[4],

+                            const int ref_stride, const int height,

+                            uint32_t *const res) {

   int i;

   uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };

   uint16x4_t a[2];

   uint32x4_t r;

-  assert(!((intptr_t)src % sizeof(uint32_t)));

+  assert(!((intptr_t)src_ptr % sizeof(uint32_t)));

   assert(!(src_stride % sizeof(uint32_t)));

   for (i = 0; i < height; ++i) {

     const uint8x8_t s = vreinterpret_u8_u32(

-        vld1_dup_u32((const uint32_t *)(src + i * src_stride)));

-    const uint8x8_t ref01 = load_unaligned_2_buffers(ref[0] + i * ref_stride,

-                                                     ref[1] + i * ref_stride);

-    const uint8x8_t ref23 = load_unaligned_2_buffers(ref[2] + i * ref_stride,

-                                                     ref[3] + i * ref_stride);

+        vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride)));

+    const uint8x8_t ref01 = load_unaligned_2_buffers(

+        ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride);

+    const uint8x8_t ref23 = load_unaligned_2_buffers(

+        ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride);

     abs[0] = vabal_u8(abs[0], s, ref01);

     abs[1] = vabal_u8(abs[1], s, ref23);

@@ -56,16 +57,16 @@

   vst1q_u32(res, r);

-void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride,

-                        const uint8_t *const ref[4], int ref_stride,

+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride,

+                        const uint8_t *const ref_array[4], int ref_stride,

                         uint32_t *res) {

-  sad4x_4d(src, src_stride, ref, ref_stride, 4, res);

+  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res);

-void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride,

-                        const uint8_t *const ref[4], int ref_stride,

+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride,

+                        const uint8_t *const ref_array[4], int ref_stride,

                         uint32_t *res) {

-  sad4x_4d(src, src_stride, ref, ref_stride, 8, res);

+  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res);

 ////////////////////////////////////////////////////////////////////////////////

@@ -137,17 +138,18 @@

   vst1q_u32(res, vcombine_u32(d0, d1));

-static INLINE void sad8x_4d(const uint8_t *src, int src_stride,

-                            const uint8_t *const ref[4], int ref_stride,

+static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,

+                            const uint8_t *const ref_array[4], int ref_stride,

                             uint32_t *res, const int height) {

   int i, j;

-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };

+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],

+                                 ref_array[3] };

   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),

                         vdupq_n_u16(0) };

   for (i = 0; i < height; ++i) {

-    const uint8x8_t s = vld1_u8(src);

-    src += src_stride;

+    const uint8x8_t s = vld1_u8(src_ptr);

+    src_ptr += src_stride;

     for (j = 0; j < 4; ++j) {

       const uint8x8_t b_u8 = vld1_u8(ref_loop[j]);

       ref_loop[j] += ref_stride;

@@ -158,44 +160,45 @@

   sad_512_pel_final_neon(sum, res);

-void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride,

-                        const uint8_t *const ref[4], int ref_stride,

+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride,

+                        const uint8_t *const ref_array[4], int ref_stride,

                         uint32_t *res) {

-  sad8x_4d(src, src_stride, ref, ref_stride, res, 4);

+  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4);

-void vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride,

-                        const uint8_t *const ref[4], int ref_stride,

+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride,

+                        const uint8_t *const ref_array[4], int ref_stride,

                         uint32_t *res) {

-  sad8x_4d(src, src_stride, ref, ref_stride, res, 8);

+  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);

-void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride,

-                         const uint8_t *const ref[4], int ref_stride,

+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,

+                         const uint8_t *const ref_array[4], int ref_stride,

                          uint32_t *res) {

-  sad8x_4d(src, src_stride, ref, ref_stride, res, 16);

+  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);

 ////////////////////////////////////////////////////////////////////////////////

-static INLINE void sad16_neon(const uint8_t *ref, const uint8x16_t src,

+static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,

                               uint16x8_t *const sum) {

-  const uint8x16_t r = vld1q_u8(ref);

-  *sum = vabal_u8(*sum, vget_low_u8(src), vget_low_u8(r));

-  *sum = vabal_u8(*sum, vget_high_u8(src), vget_high_u8(r));

+  const uint8x16_t r = vld1q_u8(ref_ptr);

+  *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r));

+  *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r));

-static INLINE void sad16x_4d(const uint8_t *src, int src_stride,

-                             const uint8_t *const ref[4], int ref_stride,

+static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,

+                             const uint8_t *const ref_array[4], int ref_stride,

                              uint32_t *res, const int height) {

   int i, j;

-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };

+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],

+                                 ref_array[3] };

   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),

                         vdupq_n_u16(0) };

   for (i = 0; i < height; ++i) {

-    const uint8x16_t s = vld1q_u8(src);

-    src += src_stride;

+    const uint8x16_t s = vld1q_u8(src_ptr);

+    src_ptr += src_stride;

     for (j = 0; j < 4; ++j) {

       sad16_neon(ref_loop[j], s, &sum[j]);

       ref_loop[j] += ref_stride;

@@ -205,31 +208,32 @@

   sad_512_pel_final_neon(sum, res);

-void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride,

-                         const uint8_t *const ref[4], int ref_stride,

+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,

+                         const uint8_t *const ref_array[4], int ref_stride,

                          uint32_t *res) {

-  sad16x_4d(src, src_stride, ref, ref_stride, res, 8);

+  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);

-void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,

-                          const uint8_t *const ref[4], int ref_stride,

+void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride,

+                          const uint8_t *const ref_array[4], int ref_stride,

                           uint32_t *res) {

-  sad16x_4d(src, src_stride, ref, ref_stride, res, 16);

+  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);

-void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride,

-                          const uint8_t *const ref[4], int ref_stride,

+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,

+                          const uint8_t *const ref_array[4], int ref_stride,

                           uint32_t *res) {

-  sad16x_4d(src, src_stride, ref, ref_stride, res, 32);

+  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32);

 ////////////////////////////////////////////////////////////////////////////////

-static INLINE void sad32x_4d(const uint8_t *src, int src_stride,

-                             const uint8_t *const ref[4], int ref_stride,

+static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,

+                             const uint8_t *const ref_array[4], int ref_stride,

                              const int height, uint16x8_t *const sum) {

   int i;

-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };

+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],

+                                 ref_array[3] };

   sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0);

@@ -236,19 +240,19 @@

   for (i = 0; i < height; ++i) {

     uint8x16_t s;

-    s = vld1q_u8(src + 0 * 16);

+    s = vld1q_u8(src_ptr + 0 * 16);

     sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);

     sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);

     sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);

     sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);

-    s = vld1q_u8(src + 1 * 16);

+    s = vld1q_u8(src_ptr + 1 * 16);

     sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);

     sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);

     sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);

     sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);

-    src += src_stride;

+    src_ptr += src_stride;

     ref_loop[0] += ref_stride;

     ref_loop[1] += ref_stride;

     ref_loop[2] += ref_stride;

@@ -256,37 +260,38 @@

-void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride,

-                          const uint8_t *const ref[4], int ref_stride,

+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,

+                          const uint8_t *const ref_array[4], int ref_stride,

                           uint32_t *res) {

   uint16x8_t sum[4];

-  sad32x_4d(src, src_stride, ref, ref_stride, 16, sum);

+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum);

   sad_512_pel_final_neon(sum, res);

-void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,

-                          const uint8_t *const ref[4], int ref_stride,

+void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,

+                          const uint8_t *const ref_array[4], int ref_stride,

                           uint32_t *res) {

   uint16x8_t sum[4];

-  sad32x_4d(src, src_stride, ref, ref_stride, 32, sum);

+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum);

   sad_1024_pel_final_neon(sum, res);

-void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride,

-                          const uint8_t *const ref[4], int ref_stride,

+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,

+                          const uint8_t *const ref_array[4], int ref_stride,

                           uint32_t *res) {

   uint16x8_t sum[4];

-  sad32x_4d(src, src_stride, ref, ref_stride, 64, sum);

+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum);

   sad_2048_pel_final_neon(sum, res);

 ////////////////////////////////////////////////////////////////////////////////

-void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride,

-                          const uint8_t *const ref[4], int ref_stride,

+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,

+                          const uint8_t *const ref_array[4], int ref_stride,

                           uint32_t *res) {

   int i;

-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };

+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],

+                                 ref_array[3] };

   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),

                         vdupq_n_u16(0) };

@@ -293,31 +298,31 @@

   for (i = 0; i < 32; ++i) {

     uint8x16_t s;

-    s = vld1q_u8(src + 0 * 16);

+    s = vld1q_u8(src_ptr + 0 * 16);

     sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);

     sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);

     sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);

     sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);

-    s = vld1q_u8(src + 1 * 16);

+    s = vld1q_u8(src_ptr + 1 * 16);

     sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);

     sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);

     sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);

     sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);

-    s = vld1q_u8(src + 2 * 16);

+    s = vld1q_u8(src_ptr + 2 * 16);

     sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);

     sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);

     sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);

     sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);

-    s = vld1q_u8(src + 3 * 16);

+    s = vld1q_u8(src_ptr + 3 * 16);

     sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);

     sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);

     sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);

     sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);

-    src += src_stride;

+    src_ptr += src_stride;

     ref_loop[0] += ref_stride;

     ref_loop[1] += ref_stride;

     ref_loop[2] += ref_stride;

@@ -327,11 +332,12 @@

   sad_2048_pel_final_neon(sum, res);

-void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,

-                          const uint8_t *const ref[4], int ref_stride,

+void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,

+                          const uint8_t *const ref_array[4], int ref_stride,

                           uint32_t *res) {

   int i;

-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };

+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],

+                                 ref_array[3] };

   uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),

                         vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),

                         vdupq_n_u16(0), vdupq_n_u16(0) };

@@ -339,31 +345,31 @@

   for (i = 0; i < 64; ++i) {

     uint8x16_t s;

-    s = vld1q_u8(src + 0 * 16);

+    s = vld1q_u8(src_ptr + 0 * 16);

     sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);

     sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);

     sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);

     sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);

-    s = vld1q_u8(src + 1 * 16);

+    s = vld1q_u8(src_ptr + 1 * 16);

     sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);

     sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);

     sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);

     sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);

-    s = vld1q_u8(src + 2 * 16);

+    s = vld1q_u8(src_ptr + 2 * 16);

     sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);

     sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);

     sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);

     sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);

-    s = vld1q_u8(src + 3 * 16);

+    s = vld1q_u8(src_ptr + 3 * 16);

     sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);

     sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);

     sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);

     sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);

-    src += src_stride;

+    src_ptr += src_stride;

     ref_loop[0] += ref_stride;

     ref_loop[1] += ref_stride;

     ref_loop[2] += ref_stride;

--- a/vpx_dsp/arm/sad_neon.c

+++ b/vpx_dsp/arm/sad_neon.c

@@ -73,53 +73,55 @@

   return vget_lane_u32(horizontal_add_uint16x8(abs), 0);

-static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b,

-                               int b_stride, const int height) {

+static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,

+                               const uint8_t *ref_ptr, int ref_stride,

+                               const int height) {

   int i;

   uint16x8_t abs = vdupq_n_u16(0);

   for (i = 0; i < height; ++i) {

-    const uint8x8_t a_u8 = vld1_u8(a);

-    const uint8x8_t b_u8 = vld1_u8(b);

-    a += a_stride;

-    b += b_stride;

+    const uint8x8_t a_u8 = vld1_u8(src_ptr);

+    const uint8x8_t b_u8 = vld1_u8(ref_ptr);

+    src_ptr += src_stride;

+    ref_ptr += ref_stride;

     abs = vabal_u8(abs, a_u8, b_u8);

   return abs;

-static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride,

-                                   const uint8_t *b, int b_stride,

-                                   const uint8_t *c, const int height) {

+static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,

+                                   const uint8_t *ref_ptr, int ref_stride,

+                                   const uint8_t *second_pred,

+                                   const int height) {

   int i;

   uint16x8_t abs = vdupq_n_u16(0);

   for (i = 0; i < height; ++i) {

-    const uint8x8_t a_u8 = vld1_u8(a);

-    const uint8x8_t b_u8 = vld1_u8(b);

-    const uint8x8_t c_u8 = vld1_u8(c);

+    const uint8x8_t a_u8 = vld1_u8(src_ptr);

+    const uint8x8_t b_u8 = vld1_u8(ref_ptr);

+    const uint8x8_t c_u8 = vld1_u8(second_pred);

     const uint8x8_t avg = vrhadd_u8(b_u8, c_u8);

-    a += a_stride;

-    b += b_stride;

-    c += 8;

+    src_ptr += src_stride;

+    ref_ptr += ref_stride;

+    second_pred += 8;

     abs = vabal_u8(abs, a_u8, avg);

   return abs;

-#define sad8xN(n)                                                      \

-  uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride,     \

-                               const uint8_t *ref, int ref_stride) {   \

-    const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \

-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);             \

-  }                                                                    \

-                                                                       \

-  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \

-                                   const uint8_t *ref, int ref_stride, \

-                                   const uint8_t *second_pred) {       \

-    const uint16x8_t abs =                                             \

-        sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \

-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);             \

+#define sad8xN(n)                                                              \

+  uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride,         \

+                               const uint8_t *ref_ptr, int ref_stride) {       \

+    const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \

+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \

+  }                                                                            \

+                                                                               \

+  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,     \

+                                   const uint8_t *ref_ptr, int ref_stride,     \

+                                   const uint8_t *second_pred) {               \

+    const uint16x8_t abs =                                                     \

+        sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n);   \

+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \

 sad8xN(4);

@@ -126,17 +128,17 @@

 sad8xN(8);

 sad8xN(16);

-static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride,

-                                const uint8_t *b, int b_stride,

+static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,

+                                const uint8_t *ref_ptr, int ref_stride,

                                 const int height) {

   int i;

   uint16x8_t abs = vdupq_n_u16(0);

   for (i = 0; i < height; ++i) {

-    const uint8x16_t a_u8 = vld1q_u8(a);

-    const uint8x16_t b_u8 = vld1q_u8(b);

-    a += a_stride;

-    b += b_stride;

+    const uint8x16_t a_u8 = vld1q_u8(src_ptr);

+    const uint8x16_t b_u8 = vld1q_u8(ref_ptr);

+    src_ptr += src_stride;

+    ref_ptr += ref_stride;

     abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8));

     abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8));

@@ -143,20 +145,21 @@

   return abs;

-static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride,

-                                    const uint8_t *b, int b_stride,

-                                    const uint8_t *c, const int height) {

+static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,

+                                    const uint8_t *ref_ptr, int ref_stride,

+                                    const uint8_t *second_pred,

+                                    const int height) {

   int i;

   uint16x8_t abs = vdupq_n_u16(0);

   for (i = 0; i < height; ++i) {

-    const uint8x16_t a_u8 = vld1q_u8(a);

-    const uint8x16_t b_u8 = vld1q_u8(b);

-    const uint8x16_t c_u8 = vld1q_u8(c);

+    const uint8x16_t a_u8 = vld1q_u8(src_ptr);

+    const uint8x16_t b_u8 = vld1q_u8(ref_ptr);

+    const uint8x16_t c_u8 = vld1q_u8(second_pred);

     const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8);

-    a += a_stride;

-    b += b_stride;

-    c += 16;

+    src_ptr += src_stride;

+    ref_ptr += ref_stride;

+    second_pred += 16;

     abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg));

     abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg));

@@ -163,19 +166,20 @@

   return abs;

-#define sad16xN(n)                                                      \

-  uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride,     \

-                                const uint8_t *ref, int ref_stride) {   \

-    const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \

-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \

-  }                                                                     \

-                                                                        \

-  uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \

-                                    const uint8_t *ref, int ref_stride, \

-                                    const uint8_t *second_pred) {       \

-    const uint16x8_t abs =                                              \

-        sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \

-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \

+#define sad16xN(n)                                                            \

+  uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride,       \

+                                const uint8_t *ref_ptr, int ref_stride) {     \

+    const uint16x8_t abs =                                                    \

+        sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \

+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \

+  }                                                                           \

+                                                                              \

+  uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \

+                                    const uint8_t *ref_ptr, int ref_stride,   \

+                                    const uint8_t *second_pred) {             \

+    const uint16x8_t abs =                                                    \

+        sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \

+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \

 sad16xN(8);

@@ -182,19 +186,19 @@

 sad16xN(16);

 sad16xN(32);

-static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride,

-                                const uint8_t *b, int b_stride,

+static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,

+                                const uint8_t *ref_ptr, int ref_stride,

                                 const int height) {

   int i;

   uint16x8_t abs = vdupq_n_u16(0);

   for (i = 0; i < height; ++i) {

-    const uint8x16_t a_lo = vld1q_u8(a);

-    const uint8x16_t a_hi = vld1q_u8(a + 16);

-    const uint8x16_t b_lo = vld1q_u8(b);

-    const uint8x16_t b_hi = vld1q_u8(b + 16);

-    a += a_stride;

-    b += b_stride;

+    const uint8x16_t a_lo = vld1q_u8(src_ptr);

+    const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);

+    const uint8x16_t b_lo = vld1q_u8(ref_ptr);

+    const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);

+    src_ptr += src_stride;

+    ref_ptr += ref_stride;

     abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo));

     abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo));

     abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi));

@@ -203,24 +207,25 @@

   return abs;

-static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride,

-                                    const uint8_t *b, int b_stride,

-                                    const uint8_t *c, const int height) {

+static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,

+                                    const uint8_t *ref_ptr, int ref_stride,

+                                    const uint8_t *second_pred,

+                                    const int height) {

   int i;

   uint16x8_t abs = vdupq_n_u16(0);

   for (i = 0; i < height; ++i) {

-    const uint8x16_t a_lo = vld1q_u8(a);

-    const uint8x16_t a_hi = vld1q_u8(a + 16);

-    const uint8x16_t b_lo = vld1q_u8(b);

-    const uint8x16_t b_hi = vld1q_u8(b + 16);

-    const uint8x16_t c_lo = vld1q_u8(c);

-    const uint8x16_t c_hi = vld1q_u8(c + 16);

+    const uint8x16_t a_lo = vld1q_u8(src_ptr);

+    const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);

+    const uint8x16_t b_lo = vld1q_u8(ref_ptr);

+    const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);

+    const uint8x16_t c_lo = vld1q_u8(second_pred);

+    const uint8x16_t c_hi = vld1q_u8(second_pred + 16);

     const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo);

     const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi);

-    a += a_stride;

-    b += b_stride;

-    c += 32;

+    src_ptr += src_stride;

+    ref_ptr += ref_stride;

+    second_pred += 32;

     abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo));

     abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo));

     abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi));

@@ -229,19 +234,20 @@

   return abs;

-#define sad32xN(n)                                                      \

-  uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride,     \

-                                const uint8_t *ref, int ref_stride) {   \

-    const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \

-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \

-  }                                                                     \

-                                                                        \

-  uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \

-                                    const uint8_t *ref, int ref_stride, \

-                                    const uint8_t *second_pred) {       \

-    const uint16x8_t abs =                                              \

-        sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \

-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \

+#define sad32xN(n)                                                            \

+  uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride,       \

+                                const uint8_t *ref_ptr, int ref_stride) {     \

+    const uint16x8_t abs =                                                    \

+        sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \

+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \

+  }                                                                           \

+                                                                              \

+  uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \

+                                    const uint8_t *ref_ptr, int ref_stride,   \

+                                    const uint8_t *second_pred) {             \

+    const uint16x8_t abs =                                                    \

+        sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \

+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \

 sad32xN(16);

@@ -248,8 +254,8 @@

 sad32xN(32);

 sad32xN(64);

-static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride,

-                                const uint8_t *b, int b_stride,

+static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,

+                                const uint8_t *ref_ptr, int ref_stride,

                                 const int height) {

   int i;

   uint16x8_t abs_0 = vdupq_n_u16(0);

@@ -256,16 +262,16 @@

   uint16x8_t abs_1 = vdupq_n_u16(0);

   for (i = 0; i < height; ++i) {

-    const uint8x16_t a_0 = vld1q_u8(a);

-    const uint8x16_t a_1 = vld1q_u8(a + 16);

-    const uint8x16_t a_2 = vld1q_u8(a + 32);

-    const uint8x16_t a_3 = vld1q_u8(a + 48);

-    const uint8x16_t b_0 = vld1q_u8(b);

-    const uint8x16_t b_1 = vld1q_u8(b + 16);

-    const uint8x16_t b_2 = vld1q_u8(b + 32);

-    const uint8x16_t b_3 = vld1q_u8(b + 48);

-    a += a_stride;

-    b += b_stride;

+    const uint8x16_t a_0 = vld1q_u8(src_ptr);

+    const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);

+    const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);

+    const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);

+    const uint8x16_t b_0 = vld1q_u8(ref_ptr);

+    const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);

+    const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);

+    const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);

+    src_ptr += src_stride;

+    ref_ptr += ref_stride;

     abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0));

     abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0));

     abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1));

@@ -282,33 +288,34 @@

-static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride,

-                                    const uint8_t *b, int b_stride,

-                                    const uint8_t *c, const int height) {

+static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,

+                                    const uint8_t *ref_ptr, int ref_stride,

+                                    const uint8_t *second_pred,

+                                    const int height) {

   int i;

   uint16x8_t abs_0 = vdupq_n_u16(0);

   uint16x8_t abs_1 = vdupq_n_u16(0);

   for (i = 0; i < height; ++i) {

-    const uint8x16_t a_0 = vld1q_u8(a);

-    const uint8x16_t a_1 = vld1q_u8(a + 16);

-    const uint8x16_t a_2 = vld1q_u8(a + 32);

-    const uint8x16_t a_3 = vld1q_u8(a + 48);

-    const uint8x16_t b_0 = vld1q_u8(b);

-    const uint8x16_t b_1 = vld1q_u8(b + 16);

-    const uint8x16_t b_2 = vld1q_u8(b + 32);

-    const uint8x16_t b_3 = vld1q_u8(b + 48);

-    const uint8x16_t c_0 = vld1q_u8(c);

-    const uint8x16_t c_1 = vld1q_u8(c + 16);

-    const uint8x16_t c_2 = vld1q_u8(c + 32);

-    const uint8x16_t c_3 = vld1q_u8(c + 48);

+    const uint8x16_t a_0 = vld1q_u8(src_ptr);

+    const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);

+    const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);

+    const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);

+    const uint8x16_t b_0 = vld1q_u8(ref_ptr);

+    const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);

+    const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);

+    const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);

+    const uint8x16_t c_0 = vld1q_u8(second_pred);

+    const uint8x16_t c_1 = vld1q_u8(second_pred + 16);

+    const uint8x16_t c_2 = vld1q_u8(second_pred + 32);

+    const uint8x16_t c_3 = vld1q_u8(second_pred + 48);

     const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0);

     const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1);

     const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2);

     const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3);

-    a += a_stride;

-    b += b_stride;

-    c += 64;

+    src_ptr += src_stride;

+    ref_ptr += ref_stride;

+    second_pred += 64;

     abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0));

     abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0));

     abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1));

@@ -325,19 +332,20 @@

-#define sad64xN(n)                                                      \

-  uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride,     \

-                                const uint8_t *ref, int ref_stride) {   \

-    const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \

-    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);              \

-  }                                                                     \

-                                                                        \

-  uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \

-                                    const uint8_t *ref, int ref_stride, \

-                                    const uint8_t *second_pred) {       \

-    const uint32x4_t abs =                                              \

-        sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \

-    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);              \

+#define sad64xN(n)                                                            \

+  uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride,       \

+                                const uint8_t *ref_ptr, int ref_stride) {     \

+    const uint32x4_t abs =                                                    \

+        sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \

+    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \

+  }                                                                           \

+                                                                              \

+  uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \

+                                    const uint8_t *ref_ptr, int ref_stride,   \

+                                    const uint8_t *second_pred) {             \

+    const uint32x4_t abs =                                                    \

+        sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \

+    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \

 sad64xN(32);

--- a/vpx_dsp/bitwriter.h

+++ b/vpx_dsp/bitwriter.h

@@ -27,8 +27,8 @@

   uint8_t *buffer;

 } vpx_writer;

-void vpx_start_encode(vpx_writer *bc, uint8_t *buffer);

-void vpx_stop_encode(vpx_writer *bc);

+void vpx_start_encode(vpx_writer *br, uint8_t *source);

+void vpx_stop_encode(vpx_writer *br);

 static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {

   unsigned int split;

--- a/vpx_dsp/deblock.c

+++ b/vpx_dsp/deblock.c

@@ -39,11 +39,10 @@

   9,  10, 13,

};

-void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,

-                                            unsigned char *dst_ptr,

-                                            int src_pixels_per_line,

-                                            int dst_pixels_per_line, int cols,

-                                            unsigned char *f, int size) {

+void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src,

+                                            unsigned char *dst, int src_pitch,

+                                            int dst_pitch, int cols,

+                                            unsigned char *flimits, int size) {

   unsigned char *p_src, *p_dst;

   int row;

   int col;

@@ -55,19 +54,21 @@

   for (row = 0; row < size; row++) {

     /* post_proc_down for one row */

-    p_src = src_ptr;

-    p_dst = dst_ptr;

+    p_src = src;

+    p_dst = dst;

     for (col = 0; col < cols; col++) {

-      unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];

-      unsigned char p_above1 = p_src[col - src_pixels_per_line];

-      unsigned char p_below1 = p_src[col + src_pixels_per_line];

-      unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];

+      unsigned char p_above2 = p_src[col - 2 * src_pitch];

+      unsigned char p_above1 = p_src[col - src_pitch];

+      unsigned char p_below1 = p_src[col + src_pitch];

+      unsigned char p_below2 = p_src[col + 2 * src_pitch];

       v = p_src[col];

-      if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) &&

-          (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {

+      if ((abs(v - p_above2) < flimits[col]) &&

+          (abs(v - p_above1) < flimits[col]) &&

+          (abs(v - p_below1) < flimits[col]) &&

+          (abs(v - p_below2) < flimits[col])) {

         unsigned char k1, k2, k3;

         k1 = (p_above2 + p_above1 + 1) >> 1;

         k2 = (p_below2 + p_below1 + 1) >> 1;

@@ -79,8 +80,8 @@

     /* now post_proc_across */

-    p_src = dst_ptr;

-    p_dst = dst_ptr;

+    p_src = dst;

+    p_dst = dst;

     p_src[-2] = p_src[-1] = p_src[0];

     p_src[cols] = p_src[cols + 1] = p_src[cols - 1];

@@ -88,10 +89,10 @@

     for (col = 0; col < cols; col++) {

       v = p_src[col];

-      if ((abs(v - p_src[col - 2]) < f[col]) &&

-          (abs(v - p_src[col - 1]) < f[col]) &&

-          (abs(v - p_src[col + 1]) < f[col]) &&

-          (abs(v - p_src[col + 2]) < f[col])) {

+      if ((abs(v - p_src[col - 2]) < flimits[col]) &&

+          (abs(v - p_src[col - 1]) < flimits[col]) &&

+          (abs(v - p_src[col + 1]) < flimits[col]) &&

+          (abs(v - p_src[col + 2]) < flimits[col])) {

         unsigned char k1, k2, k3;

         k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;

         k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;

@@ -109,8 +110,8 @@

     p_dst[col - 1] = d[(col - 1) & 3];

     /* next row */

-    src_ptr += src_pixels_per_line;

-    dst_ptr += dst_pixels_per_line;

+    src += src_pitch;

+    dst += dst_pitch;

--- a/vpx_dsp/fwd_txfm.c

+++ b/vpx_dsp/fwd_txfm.c

@@ -87,11 +87,11 @@

   output[0] = sum * 2;

-void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {

+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) {

   int i, j;

   tran_low_t intermediate[64];

   int pass;

-  tran_low_t *output = intermediate;

+  tran_low_t *out = intermediate;

   const tran_low_t *in = NULL;

   // Transform columns

@@ -133,10 +133,10 @@

       t1 = (x0 - x1) * cospi_16_64;

       t2 = x2 * cospi_24_64 + x3 * cospi_8_64;

       t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;

-      output[0] = (tran_low_t)fdct_round_shift(t0);

-      output[2] = (tran_low_t)fdct_round_shift(t2);

-      output[4] = (tran_low_t)fdct_round_shift(t1);

-      output[6] = (tran_low_t)fdct_round_shift(t3);

+      out[0] = (tran_low_t)fdct_round_shift(t0);

+      out[2] = (tran_low_t)fdct_round_shift(t2);

+      out[4] = (tran_low_t)fdct_round_shift(t1);

+      out[6] = (tran_low_t)fdct_round_shift(t3);

       // Stage 2

       t0 = (s6 - s5) * cospi_16_64;

@@ -155,19 +155,19 @@

       t1 = x1 * cospi_12_64 + x2 * cospi_20_64;

       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;

       t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;

-      output[1] = (tran_low_t)fdct_round_shift(t0);

-      output[3] = (tran_low_t)fdct_round_shift(t2);

-      output[5] = (tran_low_t)fdct_round_shift(t1);

-      output[7] = (tran_low_t)fdct_round_shift(t3);

-      output += 8;

+      out[1] = (tran_low_t)fdct_round_shift(t0);

+      out[3] = (tran_low_t)fdct_round_shift(t2);

+      out[5] = (tran_low_t)fdct_round_shift(t1);

+      out[7] = (tran_low_t)fdct_round_shift(t3);

+      out += 8;

     in = intermediate;

-    output = final_output;

+    out = output;

   // Rows

   for (i = 0; i < 8; ++i) {

-    for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;

+    for (j = 0; j < 8; ++j) output[j + i * 8] /= 2;

@@ -705,9 +705,9 @@

   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);

-void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {

+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride) {

   int i, j;

-  tran_high_t output[32 * 32];

+  tran_high_t out[32 * 32];

   // Columns

   for (i = 0; i < 32; ++i) {

@@ -715,16 +715,16 @@

     for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;

     vpx_fdct32(temp_in, temp_out, 0);

     for (j = 0; j < 32; ++j)

-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

+      out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

   // Rows

   for (i = 0; i < 32; ++i) {

     tran_high_t temp_in[32], temp_out[32];

-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];

+    for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];

     vpx_fdct32(temp_in, temp_out, 0);

     for (j = 0; j < 32; ++j)

-      out[j + i * 32] =

+      output[j + i * 32] =

           (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);

@@ -732,9 +732,9 @@

 // Note that although we use dct_32_round in dct32 computation flow,

 // this 2d fdct32x32 for rate-distortion optimization loop is operating

 // within 16 bits precision.

-void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {

+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) {

   int i, j;

-  tran_high_t output[32 * 32];

+  tran_high_t out[32 * 32];

   // Columns

   for (i = 0; i < 32; ++i) {

@@ -745,15 +745,15 @@

       // TODO(cd): see quality impact of only doing

       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;

       //           PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c

-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

+      out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

   // Rows

   for (i = 0; i < 32; ++i) {

     tran_high_t temp_in[32], temp_out[32];

-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];

+    for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];

     vpx_fdct32(temp_in, temp_out, 1);

-    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];

+    for (j = 0; j < 32; ++j) output[j + i * 32] = (tran_low_t)temp_out[j];

@@ -772,14 +772,14 @@

   vpx_fdct4x4_c(input, output, stride);

-void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,

+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output,

                           int stride) {

-  vpx_fdct8x8_c(input, final_output, stride);

+  vpx_fdct8x8_c(input, output, stride);

-void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,

+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output,

                             int stride) {

-  vpx_fdct8x8_1_c(input, final_output, stride);

+  vpx_fdct8x8_1_c(input, output, stride);

 void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,

@@ -792,17 +792,18 @@

   vpx_fdct16x16_1_c(input, output, stride);

-void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {

-  vpx_fdct32x32_c(input, out, stride);

+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output,

+                            int stride) {

+  vpx_fdct32x32_c(input, output, stride);

-void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,

+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output,

                                int stride) {

-  vpx_fdct32x32_rd_c(input, out, stride);

+  vpx_fdct32x32_rd_c(input, output, stride);

-void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,

+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output,

                               int stride) {

-  vpx_fdct32x32_1_c(input, out, stride);

+  vpx_fdct32x32_1_c(input, output, stride);

 #endif  // CONFIG_VP9_HIGHBITDEPTH

--- a/vpx_dsp/inv_txfm.c

+++ b/vpx_dsp/inv_txfm.c

@@ -67,11 +67,11 @@

-void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {

+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {

   int i;

   tran_high_t a1, e1;

   tran_low_t tmp[4];

-  const tran_low_t *ip = in;

+  const tran_low_t *ip = input;

   tran_low_t *op = tmp;

   a1 = ip[0] >> UNIT_QUANT_SHIFT;

@@ -1346,12 +1346,12 @@

-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,

+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest,

                                 int stride, int bd) {

   int i;

   tran_high_t a1, e1;

   tran_low_t tmp[4];

-  const tran_low_t *ip = in;

+  const tran_low_t *ip = input;

   tran_low_t *op = tmp;

   (void)bd;

--- a/vpx_dsp/loopfilter.c

+++ b/vpx_dsp/loopfilter.c

@@ -109,29 +109,30 @@

   *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;

-void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,

-                            const uint8_t *blimit, const uint8_t *limit,

-                            const uint8_t *thresh) {

+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit,

+                            const uint8_t *limit, const uint8_t *thresh) {

   int i;

   // loop filter designed to work using chars so that we can make maximum use

   // of 8 bit simd instructions.

   for (i = 0; i < 8; ++i) {

-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];

-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

+    const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],

+                  p0 = s[-pitch];

+    const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],

+                  q3 = s[3 * pitch];

     const int8_t mask =

         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);

-    filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);

+    filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch);

     ++s;

-void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,

+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,

                                  const uint8_t *limit0, const uint8_t *thresh0,

                                  const uint8_t *blimit1, const uint8_t *limit1,

                                  const uint8_t *thresh1) {

-  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);

-  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);

+  vpx_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0);

+  vpx_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1);

 void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,

@@ -178,7 +179,7 @@

-void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,

+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit,

                             const uint8_t *limit, const uint8_t *thresh) {

   int i;

@@ -185,24 +186,26 @@

   // loop filter designed to work using chars so that we can make maximum use

   // of 8 bit simd instructions.

   for (i = 0; i < 8; ++i) {

-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];

-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

+    const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],

+                  p0 = s[-pitch];

+    const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],

+                  q3 = s[3 * pitch];

     const int8_t mask =

         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);

     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

-    filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,

-            s + 1 * p, s + 2 * p, s + 3 * p);

+    filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, s - 2 * pitch,

+            s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, s + 3 * pitch);

     ++s;

-void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,

+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,

                                  const uint8_t *limit0, const uint8_t *thresh0,

                                  const uint8_t *blimit1, const uint8_t *limit1,

                                  const uint8_t *thresh1) {

-  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);

-  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);

+  vpx_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0);

+  vpx_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1);

 void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,

@@ -283,7 +286,8 @@

-static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,

+static void mb_lpf_horizontal_edge_w(uint8_t *s, int pitch,

+                                     const uint8_t *blimit,

                                      const uint8_t *limit,

                                      const uint8_t *thresh, int count) {

   int i;

@@ -291,34 +295,37 @@

   // loop filter designed to work using chars so that we can make maximum use

   // of 8 bit simd instructions.

   for (i = 0; i < 8 * count; ++i) {

-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];

-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

+    const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],

+                  p0 = s[-pitch];

+    const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],

+                  q3 = s[3 * pitch];

     const int8_t mask =

         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);

     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

-    const int8_t flat2 =

-        flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,

-                   s[4 * p], s[5 * p], s[6 * p], s[7 * p]);

+    const int8_t flat2 = flat_mask5(

+        1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0,

+        s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch]);

-    filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,

-             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,

-             s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,

-             s + 7 * p);

+    filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch,

+             s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch,

+             s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch,

+             s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, s + 6 * pitch,

+             s + 7 * pitch);

     ++s;

-void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,

+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit,

                              const uint8_t *limit, const uint8_t *thresh) {

-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);

+  mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1);

-void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,

+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,

                                   const uint8_t *limit, const uint8_t *thresh) {

-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);

+  mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2);

-static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,

+static void mb_lpf_vertical_edge_w(uint8_t *s, int pitch, const uint8_t *blimit,

                                    const uint8_t *limit, const uint8_t *thresh,

                                    int count) {

   int i;

@@ -335,18 +342,18 @@

     filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,

              s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,

              s + 7);

-    s += p;

+    s += pitch;

-void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,

+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit,

                            const uint8_t *limit, const uint8_t *thresh) {

-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);

+  mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8);

-void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,

+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,

                                 const uint8_t *limit, const uint8_t *thresh) {

-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);

+  mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16);

 #if CONFIG_VP9_HIGHBITDEPTH

@@ -440,7 +447,7 @@

   *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);

-void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,

+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch,

                                    const uint8_t *blimit, const uint8_t *limit,

                                    const uint8_t *thresh, int bd) {

   int i;

@@ -448,27 +455,28 @@

   // loop filter designed to work using chars so that we can make maximum use

   // of 8 bit simd instructions.

   for (i = 0; i < 8; ++i) {

-    const uint16_t p3 = s[-4 * p];

-    const uint16_t p2 = s[-3 * p];

-    const uint16_t p1 = s[-2 * p];

-    const uint16_t p0 = s[-p];

-    const uint16_t q0 = s[0 * p];

-    const uint16_t q1 = s[1 * p];

-    const uint16_t q2 = s[2 * p];

-    const uint16_t q3 = s[3 * p];

+    const uint16_t p3 = s[-4 * pitch];

+    const uint16_t p2 = s[-3 * pitch];

+    const uint16_t p1 = s[-2 * pitch];

+    const uint16_t p0 = s[-pitch];

+    const uint16_t q0 = s[0 * pitch];

+    const uint16_t q1 = s[1 * pitch];

+    const uint16_t q2 = s[2 * pitch];

+    const uint16_t q3 = s[3 * pitch];

     const int8_t mask =

         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);

-    highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);

+    highbd_filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s,

+                   s + 1 * pitch, bd);

     ++s;

 void vpx_highbd_lpf_horizontal_4_dual_c(

-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,

+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,

     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,

     const uint8_t *thresh1, int bd) {

-  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);

-  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);

+  vpx_highbd_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0, bd);

+  vpx_highbd_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1, bd);

 void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,

@@ -517,33 +525,36 @@

-void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,

-                                   const uint8_t *limit, const uint8_t *thresh,

-                                   int bd) {

+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch,

+                                   const uint8_t *blimit, const uint8_t *limit,

+                                   const uint8_t *thresh, int bd) {

   int i;

   // loop filter designed to work using chars so that we can make maximum use

   // of 8 bit simd instructions.

   for (i = 0; i < 8; ++i) {

-    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];

-    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

+    const uint16_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],

+                   p0 = s[-pitch];

+    const uint16_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],

+                   q3 = s[3 * pitch];

     const int8_t mask =

         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);

     const int8_t flat =

         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);

-    highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,

-                   s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);

+    highbd_filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch,

+                   s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch,

+                   s + 2 * pitch, s + 3 * pitch, bd);

     ++s;

 void vpx_highbd_lpf_horizontal_8_dual_c(

-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,

+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,

     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,

     const uint8_t *thresh1, int bd) {

-  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);

-  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);

+  vpx_highbd_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0, bd);

+  vpx_highbd_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1, bd);

 void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,

@@ -639,7 +650,7 @@

-static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,

+static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int pitch,

                                             const uint8_t *blimit,

                                             const uint8_t *limit,

                                             const uint8_t *thresh, int count,

@@ -649,44 +660,45 @@

   // loop filter designed to work using chars so that we can make maximum use

   // of 8 bit simd instructions.

   for (i = 0; i < 8 * count; ++i) {

-    const uint16_t p3 = s[-4 * p];

-    const uint16_t p2 = s[-3 * p];

-    const uint16_t p1 = s[-2 * p];

-    const uint16_t p0 = s[-p];

-    const uint16_t q0 = s[0 * p];

-    const uint16_t q1 = s[1 * p];

-    const uint16_t q2 = s[2 * p];

-    const uint16_t q3 = s[3 * p];

+    const uint16_t p3 = s[-4 * pitch];

+    const uint16_t p2 = s[-3 * pitch];

+    const uint16_t p1 = s[-2 * pitch];

+    const uint16_t p0 = s[-pitch];

+    const uint16_t q0 = s[0 * pitch];

+    const uint16_t q1 = s[1 * pitch];

+    const uint16_t q2 = s[2 * pitch];

+    const uint16_t q3 = s[3 * pitch];

     const int8_t mask =

         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);

     const int8_t flat =

         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);

-    const int8_t flat2 =

-        highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,

-                          s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);

+    const int8_t flat2 = highbd_flat_mask5(

+        1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0,

+        s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch], bd);

-    highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,

-                    s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,

-                    s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,

-                    s + 6 * p, s + 7 * p, bd);

+    highbd_filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch,

+                    s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch,

+                    s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch,

+                    s + 2 * pitch, s + 3 * pitch, s + 4 * pitch, s + 5 * pitch,

+                    s + 6 * pitch, s + 7 * pitch, bd);

     ++s;

-void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,

-                                    const uint8_t *limit, const uint8_t *thresh,

-                                    int bd) {

-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);

+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch,

+                                    const uint8_t *blimit, const uint8_t *limit,

+                                    const uint8_t *thresh, int bd) {

+  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd);

-void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p,

+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch,

                                          const uint8_t *blimit,

                                          const uint8_t *limit,

                                          const uint8_t *thresh, int bd) {

-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);

+  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2, bd);

-static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,

+static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int pitch,

                                           const uint8_t *blimit,

                                           const uint8_t *limit,

                                           const uint8_t *thresh, int count,

@@ -712,20 +724,20 @@

     highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,

                     s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,

                     s + 5, s + 6, s + 7, bd);

-    s += p;

+    s += pitch;

-void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,

+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit,

                                   const uint8_t *limit, const uint8_t *thresh,

                                   int bd) {

-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);

+  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8, bd);

-void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,

+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch,

                                        const uint8_t *blimit,

                                        const uint8_t *limit,

                                        const uint8_t *thresh, int bd) {

-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);

+  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16, bd);

 #endif  // CONFIG_VP9_HIGHBITDEPTH

--- a/vpx_dsp/mips/deblock_msa.c

+++ b/vpx_dsp/mips/deblock_msa.c

@@ -508,11 +508,11 @@

-void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,

-                                   int32_t rows, int32_t cols, int32_t flimit) {

+void vpx_mbpost_proc_across_ip_msa(uint8_t *src, int32_t pitch, int32_t rows,

+                                   int32_t cols, int32_t flimit) {

   int32_t row, col, cnt;

-  uint8_t *src_dup = src_ptr;

-  v16u8 src0, src, tmp_orig;

+  uint8_t *src_dup = src;

+  v16u8 src0, src1, tmp_orig;

   v16u8 tmp = { 0 };

   v16i8 zero = { 0 };

   v8u16 sum_h, src_r_h, src_l_h;

@@ -531,13 +531,13 @@

     src_dup[cols + 16] = src_dup[cols - 1];

     tmp_orig = (v16u8)__msa_ldi_b(0);

     tmp_orig[15] = tmp[15];

-    src = LD_UB(src_dup - 8);

-    src[15] = 0;

-    ILVRL_B2_UH(zero, src, src_r_h, src_l_h);

+    src1 = LD_UB(src_dup - 8);

+    src1[15] = 0;

+    ILVRL_B2_UH(zero, src1, src_r_h, src_l_h);

     src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);

     src_r_w += __msa_dotp_u_w(src_l_h, src_l_h);

     sum_sq = HADD_SW_S32(src_r_w) + 16;

-    sum_h = __msa_hadd_u_h(src, src);

+    sum_h = __msa_hadd_u_h(src1, src1);

     sum = HADD_UH_U32(sum_h);

       v16u8 src7, src8, src_r, src_l;

@@ -566,8 +566,8 @@

           sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];

         sum = sum_l[7];

-        src = LD_UB(src_dup + 16 * col);

-        ILVRL_B2_UH(zero, src, src_r_h, src_l_h);

+        src1 = LD_UB(src_dup + 16 * col);

+        ILVRL_B2_UH(zero, src1, src_r_h, src_l_h);

         src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);

         src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);

         tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);

@@ -613,7 +613,7 @@

         total3 = (total3 < flimit_vec);

         PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);

         mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);

-        tmp = __msa_bmz_v(tmp, src, (v16u8)mask);

+        tmp = __msa_bmz_v(tmp, src1, (v16u8)mask);

         if (col == 0) {

           uint64_t src_d;

--- a/vpx_dsp/quantize.c

+++ b/vpx_dsp/quantize.c

@@ -17,7 +17,7 @@

 void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,

                      const int16_t *round_ptr, const int16_t quant,

                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {

+                     const int16_t dequant, uint16_t *eob_ptr) {

   const int rc = 0;

   const int coeff = coeff_ptr[rc];

   const int coeff_sign = (coeff >> 31);

@@ -31,7 +31,7 @@

     tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);

     tmp = (tmp * quant) >> 16;

     qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;

-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;

+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;

     if (tmp) eob = 0;

   *eob_ptr = eob + 1;

@@ -41,7 +41,7 @@

 void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,

                             int skip_block, const int16_t *round_ptr,

                             const int16_t quant, tran_low_t *qcoeff_ptr,

-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,

+                            tran_low_t *dqcoeff_ptr, const int16_t dequant,

                             uint16_t *eob_ptr) {

   int eob = -1;

@@ -55,7 +55,7 @@

     const int64_t tmp = abs_coeff + round_ptr[0];

     const int abs_qcoeff = (int)((tmp * quant) >> 16);

     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);

-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;

+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;

     if (abs_qcoeff) eob = 0;

   *eob_ptr = eob + 1;

@@ -65,7 +65,7 @@

 void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,

                            const int16_t *round_ptr, const int16_t quant,

                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

-                           const int16_t dequant_ptr, uint16_t *eob_ptr) {

+                           const int16_t dequant, uint16_t *eob_ptr) {

   const int n_coeffs = 1024;

   const int rc = 0;

   const int coeff = coeff_ptr[rc];

@@ -81,7 +81,7 @@

                 INT16_MIN, INT16_MAX);

     tmp = (tmp * quant) >> 15;

     qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;

-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;

+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2;

     if (tmp) eob = 0;

   *eob_ptr = eob + 1;

@@ -92,8 +92,7 @@

                                   const int16_t *round_ptr, const int16_t quant,

                                   tran_low_t *qcoeff_ptr,

                                   tran_low_t *dqcoeff_ptr,

-                                  const int16_t dequant_ptr,

-                                  uint16_t *eob_ptr) {

+                                  const int16_t dequant, uint16_t *eob_ptr) {

   const int n_coeffs = 1024;

   int eob = -1;

@@ -107,7 +106,7 @@

     const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);

     const int abs_qcoeff = (int)((tmp * quant) >> 15);

     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);

-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;

+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2;

     if (abs_qcoeff) eob = 0;

   *eob_ptr = eob + 1;

--- a/vpx_dsp/quantize.h

+++ b/vpx_dsp/quantize.h

@@ -19,26 +19,25 @@

 #endif

 void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,

-                     const int16_t *round_ptr, const int16_t quant_ptr,

+                     const int16_t *round_ptr, const int16_t quant,

                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

-                     const int16_t dequant_ptr, uint16_t *eob_ptr);

+                     const int16_t dequant, uint16_t *eob_ptr);

 void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,

-                           const int16_t *round_ptr, const int16_t quant_ptr,

+                           const int16_t *round_ptr, const int16_t quant,

                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

-                           const int16_t dequant_ptr, uint16_t *eob_ptr);

+                           const int16_t dequant, uint16_t *eob_ptr);

 #if CONFIG_VP9_HIGHBITDEPTH

 void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,

                             int skip_block, const int16_t *round_ptr,

-                            const int16_t quant_ptr, tran_low_t *qcoeff_ptr,

-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,

+                            const int16_t quant, tran_low_t *qcoeff_ptr,

+                            tran_low_t *dqcoeff_ptr, const int16_t dequant,

                             uint16_t *eob_ptr);

 void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,

-                                  const int16_t *round_ptr,

-                                  const int16_t quant_ptr,

+                                  const int16_t *round_ptr, const int16_t quant,

                                   tran_low_t *qcoeff_ptr,

                                   tran_low_t *dqcoeff_ptr,

-                                  const int16_t dequant_ptr, uint16_t *eob_ptr);

+                                  const int16_t dequant, uint16_t *eob_ptr);

 #endif

 #ifdef __cplusplus

--- a/vpx_dsp/sad.c

+++ b/vpx_dsp/sad.c

@@ -17,54 +17,55 @@

 #include "vpx_ports/mem.h"

 /* Sum the difference between every corresponding element of the buffers. */

-static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,

-                               int b_stride, int width, int height) {

+static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride,

+                               const uint8_t *ref_ptr, int ref_stride,

+                               int width, int height) {

   int y, x;

   unsigned int sad = 0;

   for (y = 0; y < height; y++) {

-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);

+    for (x = 0; x < width; x++) sad += abs(src_ptr[x] - ref_ptr[x]);

-    a += a_stride;

-    b += b_stride;

+    src_ptr += src_stride;

+    ref_ptr += ref_stride;

   return sad;

-#define sadMxN(m, n)                                                        \

-  unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride,     \

-                                    const uint8_t *ref, int ref_stride) {   \

-    return sad(src, src_stride, ref, ref_stride, m, n);                     \

-  }                                                                         \

-  unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \

-                                        const uint8_t *ref, int ref_stride, \

-                                        const uint8_t *second_pred) {       \

-    DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]);                         \

-    vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride);     \

-    return sad(src, src_stride, comp_pred, m, m, n);                        \

+#define sadMxN(m, n)                                                          \

+  unsigned int vpx_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride,   \

+                                    const uint8_t *ref_ptr, int ref_stride) { \

+    return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n);               \

+  }                                                                           \

+  unsigned int vpx_sad##m##x##n##_avg_c(                                      \

+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \

+      int ref_stride, const uint8_t *second_pred) {                           \

+    DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]);                           \

+    vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride);   \

+    return sad(src_ptr, src_stride, comp_pred, m, m, n);                      \

 // depending on call sites, pass **ref_array to avoid & in subsequent call and

 // de-dup with 4D below.

-#define sadMxNxK(m, n, k)                                                   \

-  void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride,       \

-                                  const uint8_t *ref_array, int ref_stride, \

-                                  uint32_t *sad_array) {                    \

-    int i;                                                                  \

-    for (i = 0; i < k; ++i)                                                 \

-      sad_array[i] =                                                        \

-          vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \

+#define sadMxNxK(m, n, k)                                                     \

+  void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride,     \

+                                  const uint8_t *ref_ptr, int ref_stride,     \

+                                  uint32_t *sad_array) {                      \

+    int i;                                                                    \

+    for (i = 0; i < k; ++i)                                                   \

+      sad_array[i] =                                                          \

+          vpx_sad##m##x##n##_c(src_ptr, src_stride, &ref_ptr[i], ref_stride); \

 // This appears to be equivalent to the above when k == 4 and refs is const

-#define sadMxNx4D(m, n)                                                    \

-  void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,         \

-                               const uint8_t *const ref_array[],           \

-                               int ref_stride, uint32_t *sad_array) {      \

-    int i;                                                                 \

-    for (i = 0; i < 4; ++i)                                                \

-      sad_array[i] =                                                       \

-          vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \

+#define sadMxNx4D(m, n)                                                        \

+  void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride,         \

+                               const uint8_t *const ref_array[],               \

+                               int ref_stride, uint32_t *sad_array) {          \

+    int i;                                                                     \

+    for (i = 0; i < 4; ++i)                                                    \

+      sad_array[i] =                                                           \

+          vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \

 /* clang-format off */

@@ -133,60 +134,61 @@

 #if CONFIG_VP9_HIGHBITDEPTH

         static INLINE

-    unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8,

-                            int b_stride, int width, int height) {

+    unsigned int highbd_sad(const uint8_t *src8_ptr, int src_stride,

+                            const uint8_t *ref8_ptr, int ref_stride, int width,

+                            int height) {

   int y, x;

   unsigned int sad = 0;

-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);

-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);

+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);

+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr);

   for (y = 0; y < height; y++) {

-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);

+    for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]);

-    a += a_stride;

-    b += b_stride;

+    src += src_stride;

+    ref_ptr += ref_stride;

   return sad;

-static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,

-                                       const uint16_t *b, int b_stride,

+static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride,

+                                       const uint16_t *ref_ptr, int ref_stride,

                                        int width, int height) {

   int y, x;

   unsigned int sad = 0;

-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);

+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);

   for (y = 0; y < height; y++) {

-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);

+    for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]);

-    a += a_stride;

-    b += b_stride;

+    src += src_stride;

+    ref_ptr += ref_stride;

   return sad;

 #define highbd_sadMxN(m, n)                                                    \

-  unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \

-                                           const uint8_t *ref,                 \

-                                           int ref_stride) {                   \

-    return highbd_sad(src, src_stride, ref, ref_stride, m, n);                 \

+  unsigned int vpx_highbd_sad##m##x##n##_c(                                    \

+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \

+      int ref_stride) {                                                        \

+    return highbd_sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n);         \

   }                                                                            \

   unsigned int vpx_highbd_sad##m##x##n##_avg_c(                                \

-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \

-      const uint8_t *second_pred) {                                            \

+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \

+      int ref_stride, const uint8_t *second_pred) {                            \

     DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]);                           \

     vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \

-                               n, CONVERT_TO_SHORTPTR(ref), ref_stride);       \

-    return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \

+                               n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride);   \

+    return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n);               \

-#define highbd_sadMxNx4D(m, n)                                               \

-  void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,    \

-                                      const uint8_t *const ref_array[],      \

-                                      int ref_stride, uint32_t *sad_array) { \

-    int i;                                                                   \

-    for (i = 0; i < 4; ++i) {                                                \

-      sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride,            \

-                                                 ref_array[i], ref_stride);  \

-    }                                                                        \

+#define highbd_sadMxNx4D(m, n)                                                \

+  void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \

+                                      const uint8_t *const ref_array[],       \

+                                      int ref_stride, uint32_t *sad_array) {  \

+    int i;                                                                    \

+    for (i = 0; i < 4; ++i) {                                                 \

+      sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride,         \

+                                                 ref_array[i], ref_stride);   \

+    }                                                                         \

 /* clang-format off */

--- a/vpx_dsp/subtract.c

+++ b/vpx_dsp/subtract.c

@@ -16,37 +16,37 @@

 #include "vpx/vpx_integer.h"

 #include "vpx_ports/mem.h"

-void vpx_subtract_block_c(int rows, int cols, int16_t *diff,

-                          ptrdiff_t diff_stride, const uint8_t *src,

-                          ptrdiff_t src_stride, const uint8_t *pred,

+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr,

+                          ptrdiff_t diff_stride, const uint8_t *src_ptr,

+                          ptrdiff_t src_stride, const uint8_t *pred_ptr,

                           ptrdiff_t pred_stride) {

   int r, c;

   for (r = 0; r < rows; r++) {

-    for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];

+    for (c = 0; c < cols; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c];

-    diff += diff_stride;

-    pred += pred_stride;

-    src += src_stride;

+    diff_ptr += diff_stride;

+    pred_ptr += pred_stride;

+    src_ptr += src_stride;

 #if CONFIG_VP9_HIGHBITDEPTH

-void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff,

-                                 ptrdiff_t diff_stride, const uint8_t *src8,

-                                 ptrdiff_t src_stride, const uint8_t *pred8,

+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr,

+                                 ptrdiff_t diff_stride, const uint8_t *src8_ptr,

+                                 ptrdiff_t src_stride, const uint8_t *pred8_ptr,

                                  ptrdiff_t pred_stride, int bd) {

   int r, c;

-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);

+  uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);

+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);

   (void)bd;

   for (r = 0; r < rows; r++) {

     for (c = 0; c < cols; c++) {

-      diff[c] = src[c] - pred[c];

+      diff_ptr[c] = src[c] - pred[c];

-    diff += diff_stride;

+    diff_ptr += diff_stride;

     pred += pred_stride;

     src += src_stride;

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -37,322 +37,322 @@

 # Intra prediction

-add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d207_predictor_4x4 sse2/;

-add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d45_predictor_4x4 neon sse2/;

-add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

-add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d63_predictor_4x4 ssse3/;

-add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

-add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2 vsx/;

-add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

-add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

-add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d135_predictor_4x4 neon/;

-add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d153_predictor_4x4 ssse3/;

-add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_v_predictor_4x4 neon msa sse2/;

-add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

-add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2 vsx/;

-add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon sse2/;

-add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_top_predictor_4x4 msa neon sse2/;

-add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_left_predictor_4x4 msa neon sse2/;

-add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/;

-add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d207_predictor_8x8 ssse3/;

-add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d45_predictor_8x8 neon sse2 vsx/;

-add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d63_predictor_8x8 ssse3 vsx/;

-add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2 vsx/;

-add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

-add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d135_predictor_8x8 neon/;

-add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d153_predictor_8x8 ssse3/;

-add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_v_predictor_8x8 neon msa sse2/;

-add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2 vsx/;

-add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 vsx/;

-add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/;

-add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_left_predictor_8x8 neon msa sse2/;

-add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/;

-add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d207_predictor_16x16 ssse3/;

-add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/;

-add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/;

-add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/;

-add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

-add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d135_predictor_16x16 neon/;

-add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d153_predictor_16x16 ssse3/;

-add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/;

-add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/;

-add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx/;

-add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/;

-add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2 vsx/;

-add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/;

-add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d207_predictor_32x32 ssse3/;

-add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/;

-add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/;

-add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/;

-add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

-add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d135_predictor_32x32 neon/;

-add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_d153_predictor_32x32 ssse3/;

-add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/;

-add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_tm_predictor_32x32 neon msa sse2 vsx/;

-add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_predictor_32x32 msa neon sse2 vsx/;

-add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2 vsx/;

-add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2 vsx/;

-add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";

+add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";

 specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/;

 # High bitdepth functions

 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

-  add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d207_predictor_4x4 sse2/;

-  add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/;

-  add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d63_predictor_4x4 sse2/;

-  add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/;

-  add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d117_predictor_4x4 sse2/;

-  add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/;

-  add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d153_predictor_4x4 sse2/;

-  add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;

-  add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/;

-  add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/;

-  add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/;

-  add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/;

-  add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/;

-  add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/;

-  add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/;

-  add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/;

-  add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;

-  add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/;

-  add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/;

-  add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/;

-  add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/;

-  add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/;

-  add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d117_predictor_16x16 ssse3/;

-  add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/;

-  add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/;

-  add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;

-  add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/;

-  add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/;

-  add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/;

-  add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/;

-  add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/;

-  add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/;

-  add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/;

-  add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/;

-  add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;

-  add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/;

-  add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";

+  add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";

   specialize qw/vpx_highbd_dc_128_predictor_32x32 neon sse2/;

 }  # CONFIG_VP9_HIGHBITDEPTH

@@ -400,28 +400,28 @@

   # Sub Pixel Filters

-  add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";

   specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;

-  add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";

   specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;

-  add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";

   specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";

   specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";

   specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";

   specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";

   specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";

   specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";

 }  # CONFIG_VP9_HIGHBITDEPTH

@@ -897,43 +897,43 @@

 # Multi-block SAD, comparing a reference to N independent blocks

-add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/;

-add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/;

-add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/;

-add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/;

-add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/;

-add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/;

-add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/;

-add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/;

-add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/;

-add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/;

-add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/;

-add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;

-add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;

 add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";

@@ -954,7 +954,7 @@

   # Block subtraction

-  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";

+  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd";

   # Single block SAD

@@ -999,13 +999,13 @@

   # Avg

-  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";

+  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p";

   specialize qw/vpx_highbd_avg_8x8 sse2/;

-  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";

+  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p";

   specialize qw/vpx_highbd_avg_4x4 sse2/;

-  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";

+  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max";

   add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";

   specialize qw/vpx_highbd_sad64x64_avg sse2/;

@@ -1047,43 +1047,43 @@

   # Multi-block SAD, comparing a reference to N independent blocks

-  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";

+  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";

   specialize qw/vpx_highbd_sad64x64x4d sse2/;

-  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";

+  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";

   specialize qw/vpx_highbd_sad64x32x4d sse2/;

-  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";

+  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";

   specialize qw/vpx_highbd_sad32x64x4d sse2/;

-  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";

+  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";

   specialize qw/vpx_highbd_sad32x32x4d sse2/;

-  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";

+  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";

   specialize qw/vpx_highbd_sad32x16x4d sse2/;

-  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";

+  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";

   specialize qw/vpx_highbd_sad16x32x4d sse2/;

-  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";

+  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";

   specialize qw/vpx_highbd_sad16x16x4d sse2/;

-  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";

+  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";

   specialize qw/vpx_highbd_sad16x8x4d sse2/;

-  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";

+  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";

   specialize qw/vpx_highbd_sad8x16x4d sse2/;

-  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";

+  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";

   specialize qw/vpx_highbd_sad8x8x4d sse2/;

-  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";

+  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";

   specialize qw/vpx_highbd_sad8x4x4d sse2/;

-  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";

+  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";

   specialize qw/vpx_highbd_sad4x8x4d sse2/;

-  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";

+  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";

   specialize qw/vpx_highbd_sad4x4x4d sse2/;

@@ -1619,7 +1619,7 @@

     add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";

     specialize qw/vpx_mbpost_proc_down sse2 neon msa vsx/;

-    add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";

+    add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *src, int pitch, int rows, int cols,int flimit";

     specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa vsx/;

     add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";

--- a/vpx_dsp/x86/avg_pred_sse2.c

+++ b/vpx_dsp/x86/avg_pred_sse2.c

@@ -15,10 +15,10 @@

 #include "vpx/vpx_integer.h"

 #include "vpx_dsp/x86/mem_sse2.h"

-void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width,

+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,

                             int height, const uint8_t *ref, int ref_stride) {

-  /* comp and pred must be 16 byte aligned. */

-  assert(((intptr_t)comp & 0xf) == 0);

+  /* comp_pred and pred must be 16 byte aligned. */

+  assert(((intptr_t)comp_pred & 0xf) == 0);

   assert(((intptr_t)pred & 0xf) == 0);

   if (width > 8) {

     int x, y;

@@ -27,17 +27,17 @@

         const __m128i p = _mm_load_si128((const __m128i *)(pred + x));

         const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x));

         const __m128i avg = _mm_avg_epu8(p, r);

-        _mm_store_si128((__m128i *)(comp + x), avg);

+        _mm_store_si128((__m128i *)(comp_pred + x), avg);

-      comp += width;

+      comp_pred += width;

       pred += width;

       ref += ref_stride;

   } else {  // width must be 4 or 8.

     int i;

-    // Process 16 elements at a time. comp and pred have width == stride and

-    // therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are all

-    // divisible by 16 so just ref needs to be massaged when loading.

+    // Process 16 elements at a time. comp_pred and pred have width == stride

+    // and therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are

+    // all divisible by 16 so just ref needs to be massaged when loading.

     for (i = 0; i < width * height; i += 16) {

       const __m128i p = _mm_load_si128((const __m128i *)pred);

       __m128i r;

@@ -60,10 +60,10 @@

         ref += 2 * ref_stride;

       avg = _mm_avg_epu8(p, r);

-      _mm_store_si128((__m128i *)comp, avg);

+      _mm_store_si128((__m128i *)comp_pred, avg);

       pred += 16;

-      comp += 16;

+      comp_pred += 16;

--- a/vpx_dsp/x86/convolve.h

+++ b/vpx_dsp/x86/convolve.h

@@ -23,19 +23,19 @@

 #define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)         \

   void vpx_convolve8_##name##_##opt(                                         \

       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \

-      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,    \

+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,           \

       int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \

-    const int16_t *filter = filter_kernel[offset];                           \

+    const int16_t *filter_row = filter[offset];                              \

     (void)x0_q4;                                                             \

     (void)x_step_q4;                                                         \

     (void)y0_q4;                                                             \

     (void)y_step_q4;                                                         \

-    assert(filter[3] != 128);                                                \

+    assert(filter_row[3] != 128);                                            \

     assert(step_q4 == 16);                                                   \

-    if (filter[0] | filter[1] | filter[6] | filter[7]) {                     \

+    if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {     \

       while (w >= 16) {                                                      \

         vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \

-                                                 dst_stride, h, filter);     \

+                                                 dst_stride, h, filter_row); \

         src += 16;                                                           \

         dst += 16;                                                           \

         w -= 16;                                                             \

@@ -42,15 +42,15 @@

       }                                                                      \

       if (w == 8) {                                                          \

         vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \

-                                                dst_stride, h, filter);      \

+                                                dst_stride, h, filter_row);  \

       } else if (w == 4) {                                                   \

         vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \

-                                                dst_stride, h, filter);      \

+                                                dst_stride, h, filter_row);  \

       }                                                                      \

-    } else if (filter[2] | filter[5]) {                                      \

+    } else if (filter_row[2] | filter_row[5]) {                              \

       while (w >= 16) {                                                      \

         vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \

-                                                 dst_stride, h, filter);     \

+                                                 dst_stride, h, filter_row); \

         src += 16;                                                           \

         dst += 16;                                                           \

         w -= 16;                                                             \

@@ -57,15 +57,15 @@

       }                                                                      \

       if (w == 8) {                                                          \

         vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \

-                                                dst_stride, h, filter);      \

+                                                dst_stride, h, filter_row);  \

       } else if (w == 4) {                                                   \

         vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \

-                                                dst_stride, h, filter);      \

+                                                dst_stride, h, filter_row);  \

       }                                                                      \

     } else {                                                                 \

       while (w >= 16) {                                                      \

         vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst,       \

-                                                 dst_stride, h, filter);     \

+                                                 dst_stride, h, filter_row); \

         src += 16;                                                           \

         dst += 16;                                                           \

         w -= 16;                                                             \

@@ -72,10 +72,10 @@

       }                                                                      \

       if (w == 8) {                                                          \

         vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst,        \

-                                                dst_stride, h, filter);      \

+                                                dst_stride, h, filter_row);  \

       } else if (w == 4) {                                                   \

         vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst,        \

-                                                dst_stride, h, filter);      \

+                                                dst_stride, h, filter_row);  \

       }                                                                      \

     }                                                                        \

@@ -121,86 +121,86 @@

                                        unsigned int output_height,

                                        const int16_t *filter, int bd);

-#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)     \

-  void vpx_highbd_convolve8_##name##_##opt(                                   \

-      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,               \

-      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,     \

-      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \

-    const int16_t *filter = filter_kernel[offset];                            \

-    if (step_q4 == 16 && filter[3] != 128) {                                  \

-      if (filter[0] | filter[1] | filter[6] | filter[7]) {                    \

-        while (w >= 16) {                                                     \

-          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \

-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \

-          src += 16;                                                          \

-          dst += 16;                                                          \

-          w -= 16;                                                            \

-        }                                                                     \

-        while (w >= 8) {                                                      \

-          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                     \

-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \

-          src += 8;                                                           \

-          dst += 8;                                                           \

-          w -= 8;                                                             \

-        }                                                                     \

-        while (w >= 4) {                                                      \

-          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                     \

-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \

-          src += 4;                                                           \

-          dst += 4;                                                           \

-          w -= 4;                                                             \

-        }                                                                     \

-      } else if (filter[2] | filter[5]) {                                     \

-        while (w >= 16) {                                                     \

-          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                    \

-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \

-          src += 16;                                                          \

-          dst += 16;                                                          \

-          w -= 16;                                                            \

-        }                                                                     \

-        while (w >= 8) {                                                      \

-          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                     \

-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \

-          src += 8;                                                           \

-          dst += 8;                                                           \

-          w -= 8;                                                             \

-        }                                                                     \

-        while (w >= 4) {                                                      \

-          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                     \

-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \

-          src += 4;                                                           \

-          dst += 4;                                                           \

-          w -= 4;                                                             \

-        }                                                                     \

-      } else {                                                                \

-        while (w >= 16) {                                                     \

-          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \

-              src, src_stride, dst, dst_stride, h, filter, bd);               \

-          src += 16;                                                          \

-          dst += 16;                                                          \

-          w -= 16;                                                            \

-        }                                                                     \

-        while (w >= 8) {                                                      \

-          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                     \

-              src, src_stride, dst, dst_stride, h, filter, bd);               \

-          src += 8;                                                           \

-          dst += 8;                                                           \

-          w -= 8;                                                             \

-        }                                                                     \

-        while (w >= 4) {                                                      \

-          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                     \

-              src, src_stride, dst, dst_stride, h, filter, bd);               \

-          src += 4;                                                           \

-          dst += 4;                                                           \

-          w -= 4;                                                             \

-        }                                                                     \

-      }                                                                       \

-    }                                                                         \

-    if (w) {                                                                  \

-      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,       \

-                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \

-                                      y_step_q4, w, h, bd);                   \

-    }                                                                         \

+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)  \

+  void vpx_highbd_convolve8_##name##_##opt(                                \

+      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,            \

+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,         \

+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {     \

+    const int16_t *filter_row = filter[offset];                            \

+    if (step_q4 == 16 && filter_row[3] != 128) {                           \

+      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \

+        while (w >= 16) {                                                  \

+          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                 \

+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \

+          src += 16;                                                       \

+          dst += 16;                                                       \

+          w -= 16;                                                         \

+        }                                                                  \

+        while (w >= 8) {                                                   \

+          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                  \

+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \

+          src += 8;                                                        \

+          dst += 8;                                                        \

+          w -= 8;                                                          \

+        }                                                                  \

+        while (w >= 4) {                                                   \

+          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                  \

+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \

+          src += 4;                                                        \

+          dst += 4;                                                        \

+          w -= 4;                                                          \

+        }                                                                  \

+      } else if (filter_row[2] | filter_row[5]) {                          \

+        while (w >= 16) {                                                  \

+          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                 \

+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \

+          src += 16;                                                       \

+          dst += 16;                                                       \

+          w -= 16;                                                         \

+        }                                                                  \

+        while (w >= 8) {                                                   \

+          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                  \

+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \

+          src += 8;                                                        \

+          dst += 8;                                                        \

+          w -= 8;                                                          \

+        }                                                                  \

+        while (w >= 4) {                                                   \

+          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                  \

+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \

+          src += 4;                                                        \

+          dst += 4;                                                        \

+          w -= 4;                                                          \

+        }                                                                  \

+      } else {                                                             \

+        while (w >= 16) {                                                  \

+          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                 \

+              src, src_stride, dst, dst_stride, h, filter_row, bd);        \

+          src += 16;                                                       \

+          dst += 16;                                                       \

+          w -= 16;                                                         \

+        }                                                                  \

+        while (w >= 8) {                                                   \

+          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                  \

+              src, src_stride, dst, dst_stride, h, filter_row, bd);        \

+          src += 8;                                                        \

+          dst += 8;                                                        \

+          w -= 8;                                                          \

+        }                                                                  \

+        while (w >= 4) {                                                   \

+          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                  \

+              src, src_stride, dst, dst_stride, h, filter_row, bd);        \

+          src += 4;                                                        \

+          dst += 4;                                                        \

+          w -= 4;                                                          \

+        }                                                                  \

+      }                                                                    \

+    }                                                                      \

+    if (w) {                                                               \

+      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,    \

+                                      filter, x0_q4, x_step_q4, y0_q4,     \

+                                      y_step_q4, w, h, bd);                \

+    }                                                                      \

 #define HIGH_FUN_CONV_2D(avg, opt)                                             \

--- a/vpx_dsp/x86/highbd_convolve_avx2.c

+++ b/vpx_dsp/x86/highbd_convolve_avx2.c

@@ -20,7 +20,7 @@

                                    uint16_t *dst, ptrdiff_t dst_stride,

                                    const InterpKernel *filter, int x0_q4,

                                    int x_step_q4, int y0_q4, int y_step_q4,

-                                   int width, int h, int bd) {

+                                   int w, int h, int bd) {

   (void)filter;

   (void)x0_q4;

   (void)x_step_q4;

@@ -28,8 +28,8 @@

   (void)y_step_q4;

   (void)bd;

-  assert(width % 4 == 0);

-  if (width > 32) {  // width = 64

+  assert(w % 4 == 0);

+  if (w > 32) {  // w = 64

     do {

       const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);

       const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));

@@ -43,7 +43,7 @@

       dst += dst_stride;

       h--;

     } while (h > 0);

-  } else if (width > 16) {  // width = 32

+  } else if (w > 16) {  // w = 32

     do {

       const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);

       const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));

@@ -53,7 +53,7 @@

       dst += dst_stride;

       h--;

     } while (h > 0);

-  } else if (width > 8) {  // width = 16

+  } else if (w > 8) {  // w = 16

     __m256i p0, p1;

     do {

       p0 = _mm256_loadu_si256((const __m256i *)src);

@@ -67,7 +67,7 @@

       dst += dst_stride;

       h -= 2;

     } while (h > 0);

-  } else if (width > 4) {  // width = 8

+  } else if (w > 4) {  // w = 8

     __m128i p0, p1;

     do {

       p0 = _mm_loadu_si128((const __m128i *)src);

@@ -81,7 +81,7 @@

       dst += dst_stride;

       h -= 2;

     } while (h > 0);

-  } else {  // width = 4

+  } else {  // w = 4

     __m128i p0, p1;

     do {

       p0 = _mm_loadl_epi64((const __m128i *)src);

@@ -102,7 +102,7 @@

                                   uint16_t *dst, ptrdiff_t dst_stride,

                                   const InterpKernel *filter, int x0_q4,

                                   int x_step_q4, int y0_q4, int y_step_q4,

-                                  int width, int h, int bd) {

+                                  int w, int h, int bd) {

   (void)filter;

   (void)x0_q4;

   (void)x_step_q4;

@@ -110,8 +110,8 @@

   (void)y_step_q4;

   (void)bd;

-  assert(width % 4 == 0);

-  if (width > 32) {  // width = 64

+  assert(w % 4 == 0);

+  if (w > 32) {  // w = 64

     __m256i p0, p1, p2, p3, u0, u1, u2, u3;

     do {

       p0 = _mm256_loadu_si256((const __m256i *)src);

@@ -130,7 +130,7 @@

       dst += dst_stride;

       h--;

     } while (h > 0);

-  } else if (width > 16) {  // width = 32

+  } else if (w > 16) {  // w = 32

     __m256i p0, p1, u0, u1;

     do {

       p0 = _mm256_loadu_si256((const __m256i *)src);

@@ -143,7 +143,7 @@

       dst += dst_stride;

       h--;

     } while (h > 0);

-  } else if (width > 8) {  // width = 16

+  } else if (w > 8) {  // w = 16

     __m256i p0, p1, u0, u1;

     do {

       p0 = _mm256_loadu_si256((const __m256i *)src);

@@ -158,7 +158,7 @@

       dst += dst_stride << 1;

       h -= 2;

     } while (h > 0);

-  } else if (width > 4) {  // width = 8

+  } else if (w > 4) {  // w = 8

     __m128i p0, p1, u0, u1;

     do {

       p0 = _mm_loadu_si128((const __m128i *)src);

@@ -172,7 +172,7 @@

       dst += dst_stride << 1;

       h -= 2;

     } while (h > 0);

-  } else {  // width = 4

+  } else {  // w = 4

     __m128i p0, p1, u0, u1;

     do {

       p0 = _mm_loadl_epi64((const __m128i *)src);

--- a/vpx_dsp/x86/highbd_intrapred_sse2.asm

+++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm

@@ -256,7 +256,7 @@

   REP_RET

 INIT_XMM sse2

-cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps

+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd

   movd                  m1, [aboveq-2]

   movq                  m0, [aboveq]

   pshuflw               m1, m1, 0x0

@@ -264,7 +264,7 @@

   movlhps               m1, m1         ; tl tl tl tl tl tl tl tl

   ; Get the values to compute the maximum value at this bit depth

   pcmpeqw               m3, m3

-  movd                  m4, bpsd

+  movd                  m4, bdd

   psubw                 m0, m1         ; t1-tl t2-tl t3-tl t4-tl

   psllw                 m3, m4

   pcmpeqw               m2, m2

@@ -295,7 +295,7 @@

RET

 INIT_XMM sse2

-cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one

+cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one

   movd                  m1, [aboveq-2]

   mova                  m0, [aboveq]

   pshuflw               m1, m1, 0x0

@@ -304,7 +304,7 @@

   pxor                  m3, m3

   pxor                  m4, m4

   pinsrw                m3, oned, 0

-  pinsrw                m4, bpsd, 0

+  pinsrw                m4, bdd, 0

   pshuflw               m3, m3, 0x0

   DEFINE_ARGS dst, stride, line, left

   punpcklqdq            m3, m3

@@ -339,7 +339,7 @@

   REP_RET

 INIT_XMM sse2

-cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps

+cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd

   movd                  m2, [aboveq-2]

   mova                  m0, [aboveq]

   mova                  m1, [aboveq+16]

@@ -346,7 +346,7 @@

   pshuflw               m2, m2, 0x0

   ; Get the values to compute the maximum value at this bit depth

   pcmpeqw               m3, m3

-  movd                  m4, bpsd

+  movd                  m4, bdd

   punpcklqdq            m2, m2

   psllw                 m3, m4

   pcmpeqw               m5, m5

@@ -386,7 +386,7 @@

   REP_RET

 INIT_XMM sse2

-cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps

+cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd

   movd                  m0, [aboveq-2]

   mova                  m1, [aboveq]

   mova                  m2, [aboveq+16]

@@ -395,7 +395,7 @@

   pshuflw               m0, m0, 0x0

   ; Get the values to compute the maximum value at this bit depth

   pcmpeqw               m5, m5

-  movd                  m6, bpsd

+  movd                  m6, bdd

   psllw                 m5, m6

   pcmpeqw               m7, m7

   pxor                  m6, m6         ; min possible value

--- a/vpx_dsp/x86/highbd_loopfilter_sse2.c

+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c

@@ -47,13 +47,13 @@

 // TODO(debargha, peter): Break up large functions into smaller ones

 // in this file.

-void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,

-                                       const uint8_t *_blimit,

-                                       const uint8_t *_limit,

-                                       const uint8_t *_thresh, int bd) {

+void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch,

+                                       const uint8_t *blimit,

+                                       const uint8_t *limit,

+                                       const uint8_t *thresh, int bd) {

   const __m128i zero = _mm_set1_epi16(0);

   const __m128i one = _mm_set1_epi16(1);

-  __m128i blimit, limit, thresh;

+  __m128i blimit_v, limit_v, thresh_v;

   __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;

   __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;

   __m128i ps1, qs1, ps0, qs0;

@@ -70,35 +70,35 @@

   __m128i eight, four;

   if (bd == 8) {

-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);

-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);

-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);

+    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);

+    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);

+    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);

   } else if (bd == 10) {

-    blimit = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);

-    limit = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);

-    thresh = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);

+    blimit_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);

+    limit_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);

+    thresh_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);

   } else {  // bd == 12

-    blimit = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);

-    limit = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);

-    thresh = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);

+    blimit_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);

+    limit_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);

+    thresh_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);

-  q4 = _mm_load_si128((__m128i *)(s + 4 * p));

-  p4 = _mm_load_si128((__m128i *)(s - 5 * p));

-  q3 = _mm_load_si128((__m128i *)(s + 3 * p));

-  p3 = _mm_load_si128((__m128i *)(s - 4 * p));

-  q2 = _mm_load_si128((__m128i *)(s + 2 * p));

-  p2 = _mm_load_si128((__m128i *)(s - 3 * p));

-  q1 = _mm_load_si128((__m128i *)(s + 1 * p));

-  p1 = _mm_load_si128((__m128i *)(s - 2 * p));

-  q0 = _mm_load_si128((__m128i *)(s + 0 * p));

-  p0 = _mm_load_si128((__m128i *)(s - 1 * p));

+  q4 = _mm_load_si128((__m128i *)(s + 4 * pitch));

+  p4 = _mm_load_si128((__m128i *)(s - 5 * pitch));

+  q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));

+  p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));

+  q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));

+  p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));

+  q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));

+  p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));

+  q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));

+  p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));

   //  highbd_filter_mask

   abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));

@@ -111,14 +111,14 @@

   //  highbd_hev_mask (in C code this is actually called from highbd_filter4)

   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);

-  hev = _mm_subs_epu16(flat, thresh);

+  hev = _mm_subs_epu16(flat, thresh_v);

   hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2

   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);         // abs(p1 - q1) / 2

-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);

+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);

   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);

-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));

+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));

   work = _mm_max_epi16(

       _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),

       _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));

@@ -132,7 +132,7 @@

       _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));

   mask = _mm_max_epi16(work, mask);

-  mask = _mm_subs_epu16(mask, limit);

+  mask = _mm_subs_epu16(mask, limit_v);

   mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask

   // lp filter

@@ -207,12 +207,12 @@

   // (because, in both vars, each block of 16 either all 1s or all 0s)

   flat = _mm_and_si128(flat, mask);

-  p5 = _mm_load_si128((__m128i *)(s - 6 * p));

-  q5 = _mm_load_si128((__m128i *)(s + 5 * p));

-  p6 = _mm_load_si128((__m128i *)(s - 7 * p));

-  q6 = _mm_load_si128((__m128i *)(s + 6 * p));

-  p7 = _mm_load_si128((__m128i *)(s - 8 * p));

-  q7 = _mm_load_si128((__m128i *)(s + 7 * p));

+  p5 = _mm_load_si128((__m128i *)(s - 6 * pitch));

+  q5 = _mm_load_si128((__m128i *)(s + 5 * pitch));

+  p6 = _mm_load_si128((__m128i *)(s - 7 * pitch));

+  q6 = _mm_load_si128((__m128i *)(s + 6 * pitch));

+  p7 = _mm_load_si128((__m128i *)(s - 8 * pitch));

+  q7 = _mm_load_si128((__m128i *)(s + 7 * pitch));

   // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7

   // but referred to as p0-p4 & q0-q4 in fn)

@@ -389,8 +389,8 @@

   flat2_q6 = _mm_and_si128(flat2, flat2_q6);

   //  get values for when (flat2 && flat && mask)

   q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values

-  _mm_store_si128((__m128i *)(s - 7 * p), p6);

-  _mm_store_si128((__m128i *)(s + 6 * p), q6);

+  _mm_store_si128((__m128i *)(s - 7 * pitch), p6);

+  _mm_store_si128((__m128i *)(s + 6 * pitch), q6);

   p5 = _mm_andnot_si128(flat2, p5);

   //  p5 remains unchanged if !(flat2 && flat && mask)

@@ -404,8 +404,8 @@

   //  get values for when (flat2 && flat && mask)

   q5 = _mm_or_si128(q5, flat2_q5);

   //  full list of q5 values

-  _mm_store_si128((__m128i *)(s - 6 * p), p5);

-  _mm_store_si128((__m128i *)(s + 5 * p), q5);

+  _mm_store_si128((__m128i *)(s - 6 * pitch), p5);

+  _mm_store_si128((__m128i *)(s + 5 * pitch), q5);

   p4 = _mm_andnot_si128(flat2, p4);

   //  p4 remains unchanged if !(flat2 && flat && mask)

@@ -417,8 +417,8 @@

   flat2_q4 = _mm_and_si128(flat2, flat2_q4);

   //  get values for when (flat2 && flat && mask)

   q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values

-  _mm_store_si128((__m128i *)(s - 5 * p), p4);

-  _mm_store_si128((__m128i *)(s + 4 * p), q4);

+  _mm_store_si128((__m128i *)(s - 5 * pitch), p4);

+  _mm_store_si128((__m128i *)(s + 4 * pitch), q4);

   p3 = _mm_andnot_si128(flat2, p3);

   //  p3 takes value from highbd_filter8 if !(flat2 && flat && mask)

@@ -430,8 +430,8 @@

   flat2_q3 = _mm_and_si128(flat2, flat2_q3);

   //  get values for when (flat2 && flat && mask)

   q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values

-  _mm_store_si128((__m128i *)(s - 4 * p), p3);

-  _mm_store_si128((__m128i *)(s + 3 * p), q3);

+  _mm_store_si128((__m128i *)(s - 4 * pitch), p3);

+  _mm_store_si128((__m128i *)(s + 3 * pitch), q3);

   p2 = _mm_andnot_si128(flat2, p2);

   //  p2 takes value from highbd_filter8 if !(flat2 && flat && mask)

@@ -444,8 +444,8 @@

   flat2_q2 = _mm_and_si128(flat2, flat2_q2);

   //  get values for when (flat2 && flat && mask)

   q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values

-  _mm_store_si128((__m128i *)(s - 3 * p), p2);

-  _mm_store_si128((__m128i *)(s + 2 * p), q2);

+  _mm_store_si128((__m128i *)(s - 3 * pitch), p2);

+  _mm_store_si128((__m128i *)(s + 2 * pitch), q2);

   p1 = _mm_andnot_si128(flat2, p1);

   //  p1 takes value from highbd_filter8 if !(flat2 && flat && mask)

@@ -457,8 +457,8 @@

   flat2_q1 = _mm_and_si128(flat2, flat2_q1);

   //  get values for when (flat2 && flat && mask)

   q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values

-  _mm_store_si128((__m128i *)(s - 2 * p), p1);

-  _mm_store_si128((__m128i *)(s + 1 * p), q1);

+  _mm_store_si128((__m128i *)(s - 2 * pitch), p1);

+  _mm_store_si128((__m128i *)(s + 1 * pitch), q1);

   p0 = _mm_andnot_si128(flat2, p0);

   //  p0 takes value from highbd_filter8 if !(flat2 && flat && mask)

@@ -470,22 +470,22 @@

   flat2_q0 = _mm_and_si128(flat2, flat2_q0);

   //  get values for when (flat2 && flat && mask)

   q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values

-  _mm_store_si128((__m128i *)(s - 1 * p), p0);

-  _mm_store_si128((__m128i *)(s - 0 * p), q0);

+  _mm_store_si128((__m128i *)(s - 1 * pitch), p0);

+  _mm_store_si128((__m128i *)(s - 0 * pitch), q0);

-void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p,

-                                            const uint8_t *_blimit,

-                                            const uint8_t *_limit,

-                                            const uint8_t *_thresh, int bd) {

-  vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd);

-  vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd);

+void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch,

+                                            const uint8_t *blimit,

+                                            const uint8_t *limit,

+                                            const uint8_t *thresh, int bd) {

+  vpx_highbd_lpf_horizontal_16_sse2(s, pitch, blimit, limit, thresh, bd);

+  vpx_highbd_lpf_horizontal_16_sse2(s + 8, pitch, blimit, limit, thresh, bd);

-void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,

-                                      const uint8_t *_blimit,

-                                      const uint8_t *_limit,

-                                      const uint8_t *_thresh, int bd) {

+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch,

+                                      const uint8_t *blimit,

+                                      const uint8_t *limit,

+                                      const uint8_t *thresh, int bd) {

   DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);

   DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);

   DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);

@@ -493,16 +493,16 @@

   DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);

   DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);

   const __m128i zero = _mm_set1_epi16(0);

-  __m128i blimit, limit, thresh;

+  __m128i blimit_v, limit_v, thresh_v;

   __m128i mask, hev, flat;

-  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));

-  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));

-  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));

-  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));

-  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));

-  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));

-  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));

-  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));

+  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));

+  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));

+  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));

+  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));

+  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));

+  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));

+  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));

+  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));

   const __m128i one = _mm_set1_epi16(1);

   const __m128i ffff = _mm_cmpeq_epi16(one, one);

   __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;

@@ -519,25 +519,25 @@

   __m128i filter1, filter2;

   if (bd == 8) {

-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);

-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);

-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);

+    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);

+    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);

+    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);

     t80 = _mm_set1_epi16(0x80);

   } else if (bd == 10) {

-    blimit = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);

-    limit = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);

-    thresh = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);

+    blimit_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);

+    limit_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);

+    thresh_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);

     t80 = _mm_set1_epi16(0x200);

   } else {  // bd == 12

-    blimit = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);

-    limit = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);

-    thresh = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);

+    blimit_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);

+    limit_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);

+    thresh_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);

     t80 = _mm_set1_epi16(0x800);

@@ -553,16 +553,16 @@

   abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));

   abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));

   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);

-  hev = _mm_subs_epu16(flat, thresh);

+  hev = _mm_subs_epu16(flat, thresh_v);

   hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);

   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);

-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);

+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);

   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);

   // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;

   // So taking maximums continues to work:

-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));

+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));

   mask = _mm_max_epi16(abs_p1p0, mask);

   // mask |= (abs(p1 - p0) > limit) * -1;

   mask = _mm_max_epi16(abs_q1q0, mask);

@@ -576,7 +576,7 @@

       _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),

       _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));

   mask = _mm_max_epi16(work, mask);

-  mask = _mm_subs_epu16(mask, limit);

+  mask = _mm_subs_epu16(mask, limit_v);

   mask = _mm_cmpeq_epi16(mask, zero);

   // flat_mask4

@@ -674,7 +674,7 @@

   q1 = _mm_and_si128(flat, q1);

   q1 = _mm_or_si128(work_a, q1);

-  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));

+  work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));

   q2 = _mm_load_si128((__m128i *)flat_oq2);

   work_a = _mm_andnot_si128(flat, work_a);

   q2 = _mm_and_si128(flat, q2);

@@ -694,43 +694,43 @@

   p1 = _mm_and_si128(flat, p1);

   p1 = _mm_or_si128(work_a, p1);

-  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));

+  work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));

   p2 = _mm_load_si128((__m128i *)flat_op2);

   work_a = _mm_andnot_si128(flat, work_a);

   p2 = _mm_and_si128(flat, p2);

   p2 = _mm_or_si128(work_a, p2);

-  _mm_store_si128((__m128i *)(s - 3 * p), p2);

-  _mm_store_si128((__m128i *)(s - 2 * p), p1);

-  _mm_store_si128((__m128i *)(s - 1 * p), p0);

-  _mm_store_si128((__m128i *)(s + 0 * p), q0);

-  _mm_store_si128((__m128i *)(s + 1 * p), q1);

-  _mm_store_si128((__m128i *)(s + 2 * p), q2);

+  _mm_store_si128((__m128i *)(s - 3 * pitch), p2);

+  _mm_store_si128((__m128i *)(s - 2 * pitch), p1);

+  _mm_store_si128((__m128i *)(s - 1 * pitch), p0);

+  _mm_store_si128((__m128i *)(s + 0 * pitch), q0);

+  _mm_store_si128((__m128i *)(s + 1 * pitch), q1);

+  _mm_store_si128((__m128i *)(s + 2 * pitch), q2);

 void vpx_highbd_lpf_horizontal_8_dual_sse2(

-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,

-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,

-    const uint8_t *_thresh1, int bd) {

-  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);

-  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);

+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,

+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,

+    const uint8_t *thresh1, int bd) {

+  vpx_highbd_lpf_horizontal_8_sse2(s, pitch, blimit0, limit0, thresh0, bd);

+  vpx_highbd_lpf_horizontal_8_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);

-void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,

-                                      const uint8_t *_blimit,

-                                      const uint8_t *_limit,

-                                      const uint8_t *_thresh, int bd) {

+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch,

+                                      const uint8_t *blimit,

+                                      const uint8_t *limit,

+                                      const uint8_t *thresh, int bd) {

   const __m128i zero = _mm_set1_epi16(0);

-  __m128i blimit, limit, thresh;

+  __m128i blimit_v, limit_v, thresh_v;

   __m128i mask, hev, flat;

-  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));

-  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));

-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));

-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));

-  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));

-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));

-  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));

-  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));

+  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));

+  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));

+  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));

+  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));

+  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));

+  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));

+  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));

+  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));

   const __m128i abs_p1p0 =

       _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));

   const __m128i abs_q1q0 =

@@ -760,9 +760,9 @@

   __m128i filter1, filter2;

   if (bd == 8) {

-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);

-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);

-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);

+    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);

+    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);

+    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);

     t80 = _mm_set1_epi16(0x80);

     tff80 = _mm_set1_epi16(0xff80);

     tffe0 = _mm_set1_epi16(0xffe0);

@@ -769,12 +769,12 @@

     t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);

     t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);

   } else if (bd == 10) {

-    blimit = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);

-    limit = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);

-    thresh = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);

+    blimit_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);

+    limit_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);

+    thresh_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);

     t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);

     tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);

     tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);

@@ -781,12 +781,12 @@

     t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);

     t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);

   } else {  // bd == 12

-    blimit = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);

-    limit = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);

-    thresh = _mm_slli_epi16(

-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);

+    blimit_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);

+    limit_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);

+    thresh_v = _mm_slli_epi16(

+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);

     t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);

     tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);

     tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);

@@ -794,23 +794,23 @@

     t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);

-  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);

-  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);

-  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);

-  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);

+  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);

+  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);

+  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);

+  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);

   // filter_mask and hev_mask

   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);

-  hev = _mm_subs_epu16(flat, thresh);

+  hev = _mm_subs_epu16(flat, thresh_v);

   hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);

   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);

-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);

+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);

   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);

   // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;

   // So taking maximums continues to work:

-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));

+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));

   mask = _mm_max_epi16(flat, mask);

   // mask |= (abs(p1 - p0) > limit) * -1;

   // mask |= (abs(q1 - q0) > limit) * -1;

@@ -822,7 +822,7 @@

       _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),

       _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));

   mask = _mm_max_epi16(work, mask);

-  mask = _mm_subs_epu16(mask, limit);

+  mask = _mm_subs_epu16(mask, limit_v);

   mask = _mm_cmpeq_epi16(mask, zero);

   // filter4

@@ -872,18 +872,18 @@

   p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),

                       t80);

-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

-  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);

-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

+  _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);

+  _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);

+  _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);

+  _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);

 void vpx_highbd_lpf_horizontal_4_dual_sse2(

-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,

-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,

-    const uint8_t *_thresh1, int bd) {

-  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);

-  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);

+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,

+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,

+    const uint8_t *thresh1, int bd) {

+  vpx_highbd_lpf_horizontal_4_sse2(s, pitch, blimit0, limit0, thresh0, bd);

+  vpx_highbd_lpf_horizontal_4_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);

 static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],

@@ -998,9 +998,9 @@

   highbd_transpose(src1, in_p, dest1, out_p, 1);

-void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,

-                                    const uint8_t *limit, const uint8_t *thresh,

-                                    int bd) {

+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch,

+                                    const uint8_t *blimit, const uint8_t *limit,

+                                    const uint8_t *thresh, int bd) {

   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);

   uint16_t *src[1];

   uint16_t *dst[1];

@@ -1009,7 +1009,7 @@

   src[0] = s - 4;

   dst[0] = t_dst;

-  highbd_transpose(src, p, dst, 8, 1);

+  highbd_transpose(src, pitch, dst, 8, 1);

   // Loop filtering

   vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

@@ -1018,11 +1018,11 @@

   dst[0] = s - 4;

   // Transpose back

-  highbd_transpose(src, 8, dst, p, 1);

+  highbd_transpose(src, 8, dst, pitch, 1);

 void vpx_highbd_lpf_vertical_4_dual_sse2(

-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,

+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,

     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,

     const uint8_t *thresh1, int bd) {

   DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);

@@ -1030,7 +1030,7 @@

   uint16_t *dst[2];

   // Transpose 8x16

-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

+  highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);

   // Loop filtering

   vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,

@@ -1038,15 +1038,15 @@

   src[0] = t_dst;

   src[1] = t_dst + 8;

   dst[0] = s - 4;

-  dst[1] = s - 4 + p * 8;

+  dst[1] = s - 4 + pitch * 8;

   // Transpose back

-  highbd_transpose(src, 16, dst, p, 2);

+  highbd_transpose(src, 16, dst, pitch, 2);

-void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,

-                                    const uint8_t *limit, const uint8_t *thresh,

-                                    int bd) {

+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch,

+                                    const uint8_t *blimit, const uint8_t *limit,

+                                    const uint8_t *thresh, int bd) {

   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);

   uint16_t *src[1];

   uint16_t *dst[1];

@@ -1055,7 +1055,7 @@

   src[0] = s - 4;

   dst[0] = t_dst;

-  highbd_transpose(src, p, dst, 8, 1);

+  highbd_transpose(src, pitch, dst, 8, 1);

   // Loop filtering

   vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

@@ -1064,11 +1064,11 @@

   dst[0] = s - 4;

   // Transpose back

-  highbd_transpose(src, 8, dst, p, 1);

+  highbd_transpose(src, 8, dst, pitch, 1);

 void vpx_highbd_lpf_vertical_8_dual_sse2(

-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,

+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,

     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,

     const uint8_t *thresh1, int bd) {

   DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);

@@ -1076,7 +1076,7 @@

   uint16_t *dst[2];

   // Transpose 8x16

-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

+  highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);

   // Loop filtering

   vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,

@@ -1085,13 +1085,14 @@

   src[1] = t_dst + 8;

   dst[0] = s - 4;

-  dst[1] = s - 4 + p * 8;

+  dst[1] = s - 4 + pitch * 8;

   // Transpose back

-  highbd_transpose(src, 16, dst, p, 2);

+  highbd_transpose(src, 16, dst, pitch, 2);

-void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,

+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch,

+                                     const uint8_t *blimit,

                                      const uint8_t *limit,

                                      const uint8_t *thresh, int bd) {

   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);

@@ -1104,7 +1105,7 @@

   dst[1] = t_dst + 8 * 8;

   // Transpose 16x8

-  highbd_transpose(src, p, dst, 8, 2);

+  highbd_transpose(src, pitch, dst, 8, 2);

   // Loop filtering

   vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,

@@ -1115,10 +1116,10 @@

   dst[1] = s;

   // Transpose back

-  highbd_transpose(src, 8, dst, p, 2);

+  highbd_transpose(src, 8, dst, pitch, 2);

-void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,

+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch,

                                           const uint8_t *blimit,

                                           const uint8_t *limit,

                                           const uint8_t *thresh, int bd) {

@@ -1125,8 +1126,8 @@

   DECLARE_ALIGNED(16, uint16_t, t_dst[256]);

   //  Transpose 16x16

-  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);

-  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

+  highbd_transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);

+  highbd_transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);

   //  Loop filtering

   vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit,

@@ -1133,6 +1134,7 @@

                                          thresh, bd);

   //  Transpose back

-  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);

-  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);

+  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);

+  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch,

+                       pitch);

--- a/vpx_dsp/x86/loopfilter_avx2.c

+++ b/vpx_dsp/x86/loopfilter_avx2.c

@@ -13,10 +13,10 @@

 #include "./vpx_dsp_rtcd.h"

 #include "vpx_ports/mem.h"

-void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,

-                                const unsigned char *_blimit,

-                                const unsigned char *_limit,

-                                const unsigned char *_thresh) {

+void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch,

+                                const unsigned char *blimit,

+                                const unsigned char *limit,

+                                const unsigned char *thresh) {

   __m128i mask, hev, flat, flat2;

   const __m128i zero = _mm_set1_epi16(0);

   const __m128i one = _mm_set1_epi8(1);

@@ -23,28 +23,28 @@

   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;

   __m128i abs_p1p0;

-  const __m128i thresh =

-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));

-  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));

-  const __m128i blimit =

-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

+  const __m128i thresh_v =

+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));

+  const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));

+  const __m128i blimit_v =

+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));

-  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));

+  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));

   q4p4 = _mm_castps_si128(

-      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));

-  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));

+      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));

+  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));

   q3p3 = _mm_castps_si128(

-      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));

-  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));

+      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));

+  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));

   q2p2 = _mm_castps_si128(

-      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));

-  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));

+      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));

+  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));

   q1p1 = _mm_castps_si128(

-      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));

+      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));

   p1q1 = _mm_shuffle_epi32(q1p1, 78);

-  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));

+  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));

   q0p0 = _mm_castps_si128(

-      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));

+      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));

   p0q0 = _mm_shuffle_epi32(q0p0, 78);

@@ -59,12 +59,12 @@

     abs_p1q1 =

         _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));

     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);

-    hev = _mm_subs_epu8(flat, thresh);

+    hev = _mm_subs_epu8(flat, thresh_v);

     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);

     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);

-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);

+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);

     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);

     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;

     mask = _mm_max_epu8(abs_p1p0, mask);

@@ -76,7 +76,7 @@

         _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));

     mask = _mm_max_epu8(work, mask);

     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));

-    mask = _mm_subs_epu8(mask, limit);

+    mask = _mm_subs_epu8(mask, limit_v);

     mask = _mm_cmpeq_epi8(mask, zero);

@@ -136,21 +136,21 @@

       flat = _mm_cmpeq_epi8(flat, zero);

       flat = _mm_and_si128(flat, mask);

-      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));

+      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));

       q5p5 = _mm_castps_si128(

-          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

+          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));

-      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));

+      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));

       q6p6 = _mm_castps_si128(

-          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));

+          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));

       flat2 = _mm_max_epu8(

           _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),

           _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));

-      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));

+      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));

       q7p7 = _mm_castps_si128(

-          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));

+          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));

       work = _mm_max_epu8(

           _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),

@@ -321,44 +321,44 @@

     q6p6 = _mm_andnot_si128(flat2, q6p6);

     flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);

     q6p6 = _mm_or_si128(q6p6, flat2_q6p6);

-    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);

-    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

+    _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);

+    _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));

     q5p5 = _mm_andnot_si128(flat2, q5p5);

     flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);

     q5p5 = _mm_or_si128(q5p5, flat2_q5p5);

-    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);

-    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

+    _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);

+    _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));

     q4p4 = _mm_andnot_si128(flat2, q4p4);

     flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);

     q4p4 = _mm_or_si128(q4p4, flat2_q4p4);

-    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);

-    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

+    _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);

+    _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));

     q3p3 = _mm_andnot_si128(flat2, q3p3);

     flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);

     q3p3 = _mm_or_si128(q3p3, flat2_q3p3);

-    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);

-    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

+    _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);

+    _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));

     q2p2 = _mm_andnot_si128(flat2, q2p2);

     flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);

     q2p2 = _mm_or_si128(q2p2, flat2_q2p2);

-    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);

-    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

+    _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);

+    _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));

     q1p1 = _mm_andnot_si128(flat2, q1p1);

     flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);

     q1p1 = _mm_or_si128(q1p1, flat2_q1p1);

-    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);

-    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

+    _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);

+    _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));

     q0p0 = _mm_andnot_si128(flat2, q0p0);

     flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);

     q0p0 = _mm_or_si128(q0p0, flat2_q0p0);

-    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);

-    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));

+    _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);

+    _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));

@@ -367,10 +367,10 @@

   8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128

};

-void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,

-                                     const unsigned char *_blimit,

-                                     const unsigned char *_limit,

-                                     const unsigned char *_thresh) {

+void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch,

+                                     const unsigned char *blimit,

+                                     const unsigned char *limit,

+                                     const unsigned char *thresh) {

   __m128i mask, hev, flat, flat2;

   const __m128i zero = _mm_set1_epi16(0);

   const __m128i one = _mm_set1_epi8(1);

@@ -380,32 +380,32 @@

   __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,

       p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;

-  const __m128i thresh =

-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));

-  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));

-  const __m128i blimit =

-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

+  const __m128i thresh_v =

+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));

+  const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));

+  const __m128i blimit_v =

+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));

-  p256_4 =

-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));

-  p256_3 =

-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));

-  p256_2 =

-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));

-  p256_1 =

-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));

-  p256_0 =

-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));

-  q256_0 =

-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));

-  q256_1 =

-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));

-  q256_2 =

-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));

-  q256_3 =

-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));

-  q256_4 =

-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));

+  p256_4 = _mm256_castpd_si256(

+      _mm256_broadcast_pd((__m128d const *)(s - 5 * pitch)));

+  p256_3 = _mm256_castpd_si256(

+      _mm256_broadcast_pd((__m128d const *)(s - 4 * pitch)));

+  p256_2 = _mm256_castpd_si256(

+      _mm256_broadcast_pd((__m128d const *)(s - 3 * pitch)));

+  p256_1 = _mm256_castpd_si256(

+      _mm256_broadcast_pd((__m128d const *)(s - 2 * pitch)));

+  p256_0 = _mm256_castpd_si256(

+      _mm256_broadcast_pd((__m128d const *)(s - 1 * pitch)));

+  q256_0 = _mm256_castpd_si256(

+      _mm256_broadcast_pd((__m128d const *)(s - 0 * pitch)));

+  q256_1 = _mm256_castpd_si256(

+      _mm256_broadcast_pd((__m128d const *)(s + 1 * pitch)));

+  q256_2 = _mm256_castpd_si256(

+      _mm256_broadcast_pd((__m128d const *)(s + 2 * pitch)));

+  q256_3 = _mm256_castpd_si256(

+      _mm256_broadcast_pd((__m128d const *)(s + 3 * pitch)));

+  q256_4 = _mm256_castpd_si256(

+      _mm256_broadcast_pd((__m128d const *)(s + 4 * pitch)));

   p4 = _mm256_castsi256_si128(p256_4);

   p3 = _mm256_castsi256_si128(p256_3);

@@ -431,12 +431,12 @@

         _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));

     __m128i work;

     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);

-    hev = _mm_subs_epu8(flat, thresh);

+    hev = _mm_subs_epu8(flat, thresh_v);

     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);

     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);

-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);

+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);

     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);

     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;

     mask = _mm_max_epu8(flat, mask);

@@ -450,7 +450,7 @@

         _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),

         _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));

     mask = _mm_max_epu8(work, mask);

-    mask = _mm_subs_epu8(mask, limit);

+    mask = _mm_subs_epu8(mask, limit_v);

     mask = _mm_cmpeq_epi8(mask, zero);

@@ -532,9 +532,9 @@

       flat = _mm_and_si128(flat, mask);

       p256_5 = _mm256_castpd_si256(

-          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));

+          _mm256_broadcast_pd((__m128d const *)(s - 6 * pitch)));

       q256_5 = _mm256_castpd_si256(

-          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));

+          _mm256_broadcast_pd((__m128d const *)(s + 5 * pitch)));

       p5 = _mm256_castsi256_si128(p256_5);

       q5 = _mm256_castsi256_si128(q256_5);

       flat2 = _mm_max_epu8(

@@ -543,9 +543,9 @@

       flat2 = _mm_max_epu8(work, flat2);

       p256_6 = _mm256_castpd_si256(

-          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));

+          _mm256_broadcast_pd((__m128d const *)(s - 7 * pitch)));

       q256_6 = _mm256_castpd_si256(

-          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));

+          _mm256_broadcast_pd((__m128d const *)(s + 6 * pitch)));

       p6 = _mm256_castsi256_si128(p256_6);

       q6 = _mm256_castsi256_si128(q256_6);

       work = _mm_max_epu8(

@@ -555,9 +555,9 @@

       flat2 = _mm_max_epu8(work, flat2);

       p256_7 = _mm256_castpd_si256(

-          _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));

+          _mm256_broadcast_pd((__m128d const *)(s - 8 * pitch)));

       q256_7 = _mm256_castpd_si256(

-          _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));

+          _mm256_broadcast_pd((__m128d const *)(s + 7 * pitch)));

       p7 = _mm256_castsi256_si128(p256_7);

       q7 = _mm256_castsi256_si128(q256_7);

       work = _mm_max_epu8(

@@ -843,71 +843,71 @@

     p6 = _mm_andnot_si128(flat2, p6);

     flat2_p6 = _mm_and_si128(flat2, flat2_p6);

     p6 = _mm_or_si128(flat2_p6, p6);

-    _mm_storeu_si128((__m128i *)(s - 7 * p), p6);

+    _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);

     p5 = _mm_andnot_si128(flat2, p5);

     flat2_p5 = _mm_and_si128(flat2, flat2_p5);

     p5 = _mm_or_si128(flat2_p5, p5);

-    _mm_storeu_si128((__m128i *)(s - 6 * p), p5);

+    _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);

     p4 = _mm_andnot_si128(flat2, p4);

     flat2_p4 = _mm_and_si128(flat2, flat2_p4);

     p4 = _mm_or_si128(flat2_p4, p4);

-    _mm_storeu_si128((__m128i *)(s - 5 * p), p4);

+    _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);

     p3 = _mm_andnot_si128(flat2, p3);

     flat2_p3 = _mm_and_si128(flat2, flat2_p3);

     p3 = _mm_or_si128(flat2_p3, p3);

-    _mm_storeu_si128((__m128i *)(s - 4 * p), p3);

+    _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);

     p2 = _mm_andnot_si128(flat2, p2);

     flat2_p2 = _mm_and_si128(flat2, flat2_p2);

     p2 = _mm_or_si128(flat2_p2, p2);

-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

+    _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);

     p1 = _mm_andnot_si128(flat2, p1);

     flat2_p1 = _mm_and_si128(flat2, flat2_p1);

     p1 = _mm_or_si128(flat2_p1, p1);

-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

+    _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);

     p0 = _mm_andnot_si128(flat2, p0);

     flat2_p0 = _mm_and_si128(flat2, flat2_p0);

     p0 = _mm_or_si128(flat2_p0, p0);

-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

+    _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);

     q0 = _mm_andnot_si128(flat2, q0);

     flat2_q0 = _mm_and_si128(flat2, flat2_q0);

     q0 = _mm_or_si128(flat2_q0, q0);

-    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

+    _mm_storeu_si128((__m128i *)(s - 0 * pitch), q0);

     q1 = _mm_andnot_si128(flat2, q1);

     flat2_q1 = _mm_and_si128(flat2, flat2_q1);

     q1 = _mm_or_si128(flat2_q1, q1);

-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

+    _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);

     q2 = _mm_andnot_si128(flat2, q2);

     flat2_q2 = _mm_and_si128(flat2, flat2_q2);

     q2 = _mm_or_si128(flat2_q2, q2);

-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

+    _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);

     q3 = _mm_andnot_si128(flat2, q3);

     flat2_q3 = _mm_and_si128(flat2, flat2_q3);

     q3 = _mm_or_si128(flat2_q3, q3);

-    _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

+    _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);

     q4 = _mm_andnot_si128(flat2, q4);

     flat2_q4 = _mm_and_si128(flat2, flat2_q4);

     q4 = _mm_or_si128(flat2_q4, q4);

-    _mm_storeu_si128((__m128i *)(s + 4 * p), q4);

+    _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);

     q5 = _mm_andnot_si128(flat2, q5);

     flat2_q5 = _mm_and_si128(flat2, flat2_q5);

     q5 = _mm_or_si128(flat2_q5, q5);

-    _mm_storeu_si128((__m128i *)(s + 5 * p), q5);

+    _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);

     q6 = _mm_andnot_si128(flat2, q6);

     flat2_q6 = _mm_and_si128(flat2, flat2_q6);

     q6 = _mm_or_si128(flat2_q6, q6);

-    _mm_storeu_si128((__m128i *)(s + 6 * p), q6);

+    _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);

--- a/vpx_dsp/x86/loopfilter_sse2.c

+++ b/vpx_dsp/x86/loopfilter_sse2.c

@@ -31,7 +31,7 @@

     /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \

     hev =                                                                     \

         _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \

-    hev = _mm_cmpgt_epi16(hev, thresh);                                       \

+    hev = _mm_cmpgt_epi16(hev, thresh_v);                                     \

     hev = _mm_packs_epi16(hev, hev);                                          \

     /* const int8_t mask = filter_mask(*limit, *blimit, */                    \

@@ -52,7 +52,7 @@

     flat = _mm_max_epu8(work, flat);                                          \

     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \

     mask = _mm_unpacklo_epi64(mask, flat);                                    \

-    mask = _mm_subs_epu8(mask, limit);                                        \

+    mask = _mm_subs_epu8(mask, limit_v);                                      \

     mask = _mm_cmpeq_epi8(mask, zero);                                        \

     mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \

   } while (0)

@@ -104,27 +104,26 @@

     ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                       \

   } while (0)

-void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,

-                               const uint8_t *_blimit, const uint8_t *_limit,

-                               const uint8_t *_thresh) {

+void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,

+                               const uint8_t *limit, const uint8_t *thresh) {

   const __m128i zero = _mm_set1_epi16(0);

-  const __m128i limit =

-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),

-                         _mm_loadl_epi64((const __m128i *)_limit));

-  const __m128i thresh =

-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);

+  const __m128i limit_v =

+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),

+                         _mm_loadl_epi64((const __m128i *)limit));

+  const __m128i thresh_v =

+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);

   const __m128i ff = _mm_cmpeq_epi8(zero, zero);

   __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;

   __m128i mask, hev;

-  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),

-                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));

-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),

-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));

-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),

-                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));

-  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),

-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));

+  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),

+                            _mm_loadl_epi64((__m128i *)(s - 4 * pitch)));

+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),

+                            _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));

+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),

+                            _mm_loadl_epi64((__m128i *)(s + 0 * pitch)));

+  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)),

+                            _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));

   p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);

   p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);

   q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);

@@ -133,21 +132,20 @@

   FILTER_HEV_MASK;

   FILTER4;

-  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1

-  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0

-  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0

-  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1

+  _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0));  // *op1

+  _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0);               // *op0

+  _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0);               // *oq0

+  _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0));  // *oq1

-void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,

-                             const uint8_t *_blimit, const uint8_t *_limit,

-                             const uint8_t *_thresh) {

+void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,

+                             const uint8_t *limit, const uint8_t *thresh) {

   const __m128i zero = _mm_set1_epi16(0);

-  const __m128i limit =

-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),

-                         _mm_loadl_epi64((const __m128i *)_limit));

-  const __m128i thresh =

-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);

+  const __m128i limit_v =

+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),

+                         _mm_loadl_epi64((const __m128i *)limit));

+  const __m128i thresh_v =

+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);

   const __m128i ff = _mm_cmpeq_epi8(zero, zero);

   __m128i x0, x1, x2, x3;

   __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;

@@ -154,20 +152,20 @@

   __m128i mask, hev;

   // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17

-  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),

-                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));

+  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)),

+                           _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4)));

   // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37

-  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),

-                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));

+  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)),

+                         _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4)));

   // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57

-  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),

-                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));

+  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)),

+                         _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4)));

   // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77

-  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),

-                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));

+  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)),

+                         _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4)));

   // Transpose 8x8

   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33

@@ -213,52 +211,52 @@

   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33

   ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);

-  storeu_uint32(s + 0 * p - 2, _mm_cvtsi128_si32(ps1ps0));

+  storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));

   ps1ps0 = _mm_srli_si128(ps1ps0, 4);

-  storeu_uint32(s + 1 * p - 2, _mm_cvtsi128_si32(ps1ps0));

+  storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));

   ps1ps0 = _mm_srli_si128(ps1ps0, 4);

-  storeu_uint32(s + 2 * p - 2, _mm_cvtsi128_si32(ps1ps0));

+  storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));

   ps1ps0 = _mm_srli_si128(ps1ps0, 4);

-  storeu_uint32(s + 3 * p - 2, _mm_cvtsi128_si32(ps1ps0));

+  storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));

-  storeu_uint32(s + 4 * p - 2, _mm_cvtsi128_si32(qs1qs0));

+  storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));

   qs1qs0 = _mm_srli_si128(qs1qs0, 4);

-  storeu_uint32(s + 5 * p - 2, _mm_cvtsi128_si32(qs1qs0));

+  storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));

   qs1qs0 = _mm_srli_si128(qs1qs0, 4);

-  storeu_uint32(s + 6 * p - 2, _mm_cvtsi128_si32(qs1qs0));

+  storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));

   qs1qs0 = _mm_srli_si128(qs1qs0, 4);

-  storeu_uint32(s + 7 * p - 2, _mm_cvtsi128_si32(qs1qs0));

+  storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));

-void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,

-                                const unsigned char *_blimit,

-                                const unsigned char *_limit,

-                                const unsigned char *_thresh) {

+void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,

+                                const unsigned char *blimit,

+                                const unsigned char *limit,

+                                const unsigned char *thresh) {

   const __m128i zero = _mm_set1_epi16(0);

   const __m128i one = _mm_set1_epi8(1);

-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);

-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);

-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);

+  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);

+  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);

+  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);

   __m128i mask, hev, flat, flat2;

   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;

   __m128i abs_p1p0;

-  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));

+  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));

   q4p4 = _mm_castps_si128(

-      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));

-  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));

+      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));

+  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));

   q3p3 = _mm_castps_si128(

-      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));

-  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));

+      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));

+  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));

   q2p2 = _mm_castps_si128(

-      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));

-  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));

+      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));

+  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));

   q1p1 = _mm_castps_si128(

-      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));

+      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));

   p1q1 = _mm_shuffle_epi32(q1p1, 78);

-  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));

+  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));

   q0p0 = _mm_castps_si128(

-      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));

+      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));

   p0q0 = _mm_shuffle_epi32(q0p0, 78);

@@ -270,12 +268,12 @@

     abs_p0q0 = abs_diff(q0p0, p0q0);

     abs_p1q1 = abs_diff(q1p1, p1q1);

     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);

-    hev = _mm_subs_epu8(flat, thresh);

+    hev = _mm_subs_epu8(flat, thresh_v);

     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);

     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);

-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);

+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);

     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);

     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;

     mask = _mm_max_epu8(abs_p1p0, mask);

@@ -285,7 +283,7 @@

     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));

     mask = _mm_max_epu8(work, mask);

     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));

-    mask = _mm_subs_epu8(mask, limit);

+    mask = _mm_subs_epu8(mask, limit_v);

     mask = _mm_cmpeq_epi8(mask, zero);

@@ -343,18 +341,18 @@

       flat = _mm_cmpeq_epi8(flat, zero);

       flat = _mm_and_si128(flat, mask);

-      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));

+      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));

       q5p5 = _mm_castps_si128(

-          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

+          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));

-      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));

+      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));

       q6p6 = _mm_castps_si128(

-          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));

+          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));

       flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));

-      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));

+      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));

       q7p7 = _mm_castps_si128(

-          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));

+          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));

       work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));

       flat2 = _mm_max_epu8(work, flat2);

       flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));

@@ -521,44 +519,44 @@

     q6p6 = _mm_andnot_si128(flat2, q6p6);

     flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);

     q6p6 = _mm_or_si128(q6p6, flat2_q6p6);

-    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);

-    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

+    _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);

+    _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));

     q5p5 = _mm_andnot_si128(flat2, q5p5);

     flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);

     q5p5 = _mm_or_si128(q5p5, flat2_q5p5);

-    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);

-    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

+    _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);

+    _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));

     q4p4 = _mm_andnot_si128(flat2, q4p4);

     flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);

     q4p4 = _mm_or_si128(q4p4, flat2_q4p4);

-    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);

-    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

+    _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);

+    _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));

     q3p3 = _mm_andnot_si128(flat2, q3p3);

     flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);

     q3p3 = _mm_or_si128(q3p3, flat2_q3p3);

-    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);

-    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

+    _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);

+    _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));

     q2p2 = _mm_andnot_si128(flat2, q2p2);

     flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);

     q2p2 = _mm_or_si128(q2p2, flat2_q2p2);

-    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);

-    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

+    _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);

+    _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));

     q1p1 = _mm_andnot_si128(flat2, q1p1);

     flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);

     q1p1 = _mm_or_si128(q1p1, flat2_q1p1);

-    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);

-    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

+    _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);

+    _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));

     q0p0 = _mm_andnot_si128(flat2, q0p0);

     flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);

     q0p0 = _mm_or_si128(q0p0, flat2_q0p0);

-    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);

-    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));

+    _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);

+    _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));

@@ -592,15 +590,15 @@

   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);

-void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,

-                                     const unsigned char *_blimit,

-                                     const unsigned char *_limit,

-                                     const unsigned char *_thresh) {

+void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch,

+                                     const unsigned char *blimit,

+                                     const unsigned char *limit,

+                                     const unsigned char *thresh) {

   const __m128i zero = _mm_set1_epi16(0);

   const __m128i one = _mm_set1_epi8(1);

-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);

-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);

-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);

+  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);

+  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);

+  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);

   __m128i mask, hev, flat, flat2;

   __m128i p7, p6, p5;

   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;

@@ -610,22 +608,22 @@

   __m128i max_abs_p1p0q1q0;

-  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));

-  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));

-  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));

-  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));

-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));

-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));

-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));

-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));

-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));

-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));

-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));

-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));

-  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));

-  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));

-  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));

-  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));

+  p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch));

+  p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch));

+  p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch));

+  p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch));

+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));

+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));

+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));

+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));

+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));

+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));

+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));

+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));

+  q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));

+  q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));

+  q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));

+  q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));

     const __m128i abs_p1p0 = abs_diff(p1, p0);

@@ -639,7 +637,7 @@

     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);

     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);

-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);

+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);

     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);

     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;

     mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);

@@ -649,7 +647,7 @@

     mask = _mm_max_epu8(work, mask);

     work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));

     mask = _mm_max_epu8(work, mask);

-    mask = _mm_subs_epu8(mask, limit);

+    mask = _mm_subs_epu8(mask, limit_v);

     mask = _mm_cmpeq_epi8(mask, zero);

@@ -695,7 +693,7 @@

     oq0 = _mm_xor_si128(q0, t80);

     oq1 = _mm_xor_si128(q1, t80);

-    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);

+    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);

     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

     filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);

@@ -852,72 +850,72 @@

       f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);

       p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);

+      _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);

       f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);

       f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);

       p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);

+      _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);

       f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);

       f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);

       p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);

+      _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);

       f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);

       f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);

       p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);

+      _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);

       f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);

       f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);

       op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);

+      _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2);

       f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);

       f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);

       op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);

+      _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1);

       f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);

       f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);

       op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);

+      _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0);

       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);

       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);

       oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);

+      _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0);

       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);

       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);

       oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);

+      _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1);

       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);

       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);

       oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);

+      _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2);

       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);

       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);

       q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

+      _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);

       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);

       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);

       q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);

+      _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);

       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);

       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);

       q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);

+      _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);

       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);

       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);

       q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);

-      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);

+      _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);

     // wide flat

     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -924,10 +922,10 @@

-void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,

-                               const unsigned char *_blimit,

-                               const unsigned char *_limit,

-                               const unsigned char *_thresh) {

+void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch,

+                               const unsigned char *blimit,

+                               const unsigned char *limit,

+                               const unsigned char *thresh) {

   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);

   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);

   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);

@@ -935,21 +933,21 @@

   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);

   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);

   const __m128i zero = _mm_set1_epi16(0);

-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);

-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);

-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);

+  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);

+  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);

+  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);

   __m128i mask, hev, flat;

   __m128i p3, p2, p1, p0, q0, q1, q2, q3;

   __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;

-  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),

-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));

-  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),

-                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));

-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),

-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));

-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),

-                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));

+  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)),

+                            _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));

+  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),

+                            _mm_loadl_epi64((__m128i *)(s + 2 * pitch)));

+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),

+                            _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));

+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),

+                            _mm_loadl_epi64((__m128i *)(s - 0 * pitch)));

   p1q1 = _mm_shuffle_epi32(q1p1, 78);

   p0q0 = _mm_shuffle_epi32(q0p0, 78);

@@ -965,12 +963,12 @@

     abs_p0q0 = abs_diff(q0p0, p0q0);

     abs_p1q1 = abs_diff(q1p1, p1q1);

     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);

-    hev = _mm_subs_epu8(flat, thresh);

+    hev = _mm_subs_epu8(flat, thresh_v);

     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);

     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);

-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);

+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);

     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);

     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;

     mask = _mm_max_epu8(abs_p1p0, mask);

@@ -980,7 +978,7 @@

     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));

     mask = _mm_max_epu8(work, mask);

     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));

-    mask = _mm_subs_epu8(mask, limit);

+    mask = _mm_subs_epu8(mask, limit_v);

     mask = _mm_cmpeq_epi8(mask, zero);

     // flat_mask4

@@ -998,14 +996,22 @@

     unsigned char *src = s;

       __m128i workp_a, workp_b, workp_shft;

-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);

-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);

-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);

-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);

-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);

-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);

-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);

-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),

+                             zero);

+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),

+                             zero);

+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),

+                             zero);

+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),

+                             zero);

+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),

+                             zero);

+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),

+                             zero);

+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),

+                             zero);

+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),

+                             zero);

       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));

       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);

@@ -1051,13 +1057,13 @@

     const __m128i t80 = _mm_set1_epi8(0x80);

     const __m128i t1 = _mm_set1_epi8(0x1);

     const __m128i ps1 =

-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);

+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), t80);

     const __m128i ps0 =

-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);

+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80);

     const __m128i qs0 =

-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);

+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80);

     const __m128i qs1 =

-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);

+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80);

     __m128i filt;

     __m128i work_a;

     __m128i filter1, filter2;

@@ -1103,7 +1109,7 @@

     q1 = _mm_and_si128(flat, q1);

     q1 = _mm_or_si128(work_a, q1);

-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));

+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));

     q2 = _mm_loadl_epi64((__m128i *)flat_oq2);

     work_a = _mm_andnot_si128(flat, work_a);

     q2 = _mm_and_si128(flat, q2);

@@ -1121,27 +1127,25 @@

     p1 = _mm_and_si128(flat, p1);

     p1 = _mm_or_si128(work_a, p1);

-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));

+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));

     p2 = _mm_loadl_epi64((__m128i *)flat_op2);

     work_a = _mm_andnot_si128(flat, work_a);

     p2 = _mm_and_si128(flat, p2);

     p2 = _mm_or_si128(work_a, p2);

-    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);

-    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);

-    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);

-    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);

-    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);

-    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);

+    _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2);

+    _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1);

+    _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0);

+    _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0);

+    _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1);

+    _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2);

-void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,

-                                    const uint8_t *_limit0,

-                                    const uint8_t *_thresh0,

-                                    const uint8_t *_blimit1,

-                                    const uint8_t *_limit1,

-                                    const uint8_t *_thresh1) {

+void vpx_lpf_horizontal_8_dual_sse2(

+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,

+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,

+    const uint8_t *thresh1) {

   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);

   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);

   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);

@@ -1150,26 +1154,26 @@

   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);

   const __m128i zero = _mm_set1_epi16(0);

   const __m128i blimit =

-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),

-                         _mm_load_si128((const __m128i *)_blimit1));

+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),

+                         _mm_load_si128((const __m128i *)blimit1));

   const __m128i limit =

-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),

-                         _mm_load_si128((const __m128i *)_limit1));

+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),

+                         _mm_load_si128((const __m128i *)limit1));

   const __m128i thresh =

-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),

-                         _mm_load_si128((const __m128i *)_thresh1));

+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),

+                         _mm_load_si128((const __m128i *)thresh1));

   __m128i mask, hev, flat;

   __m128i p3, p2, p1, p0, q0, q1, q2, q3;

-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));

-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));

-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));

-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));

-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));

-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));

-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));

-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));

+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));

+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));

+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));

+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));

+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));

+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));

+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));

+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));

     const __m128i abs_p1p0 =

         _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));

@@ -1228,14 +1232,22 @@

     do {

       __m128i workp_a, workp_b, workp_shft;

-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);

-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);

-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);

-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);

-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);

-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);

-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);

-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),

+                             zero);

+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),

+                             zero);

+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),

+                             zero);

+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),

+                             zero);

+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),

+                             zero);

+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),

+                             zero);

+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),

+                             zero);

+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),

+                             zero);

       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));

       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);

@@ -1287,13 +1299,13 @@

     const __m128i t7f = _mm_set1_epi8(0x7f);

     const __m128i ps1 =

-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);

+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);

     const __m128i ps0 =

-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);

+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);

     const __m128i qs0 =

-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);

+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);

     const __m128i qs1 =

-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);

+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);

     __m128i filt;

     __m128i work_a;

     __m128i filter1, filter2;

@@ -1345,7 +1357,7 @@

     q1 = _mm_and_si128(flat, q1);

     q1 = _mm_or_si128(work_a, q1);

-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));

+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));

     q2 = _mm_load_si128((__m128i *)flat_oq2);

     work_a = _mm_andnot_si128(flat, work_a);

     q2 = _mm_and_si128(flat, q2);

@@ -1363,49 +1375,49 @@

     p1 = _mm_and_si128(flat, p1);

     p1 = _mm_or_si128(work_a, p1);

-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));

+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));

     p2 = _mm_load_si128((__m128i *)flat_op2);

     work_a = _mm_andnot_si128(flat, work_a);

     p2 = _mm_and_si128(flat, p2);

     p2 = _mm_or_si128(work_a, p2);

-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);

-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

+    _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);

+    _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);

+    _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);

+    _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);

+    _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);

+    _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);

-void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,

-                                    const unsigned char *_blimit0,

-                                    const unsigned char *_limit0,

-                                    const unsigned char *_thresh0,

-                                    const unsigned char *_blimit1,

-                                    const unsigned char *_limit1,

-                                    const unsigned char *_thresh1) {

+void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch,

+                                    const unsigned char *blimit0,

+                                    const unsigned char *limit0,

+                                    const unsigned char *thresh0,

+                                    const unsigned char *blimit1,

+                                    const unsigned char *limit1,

+                                    const unsigned char *thresh1) {

   const __m128i blimit =

-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),

-                         _mm_load_si128((const __m128i *)_blimit1));

+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),

+                         _mm_load_si128((const __m128i *)blimit1));

   const __m128i limit =

-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),

-                         _mm_load_si128((const __m128i *)_limit1));

+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),

+                         _mm_load_si128((const __m128i *)limit1));

   const __m128i thresh =

-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),

-                         _mm_load_si128((const __m128i *)_thresh1));

+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),

+                         _mm_load_si128((const __m128i *)thresh1));

   const __m128i zero = _mm_set1_epi16(0);

   __m128i p3, p2, p1, p0, q0, q1, q2, q3;

   __m128i mask, hev, flat;

-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));

-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));

-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));

-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));

-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));

-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));

-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));

-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));

+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));

+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));

+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));

+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));

+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));

+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));

+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));

+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));

   // filter_mask and hev_mask

@@ -1456,13 +1468,13 @@

     const __m128i t7f = _mm_set1_epi8(0x7f);

     const __m128i ps1 =

-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);

+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);

     const __m128i ps0 =

-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);

+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);

     const __m128i qs0 =

-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);

+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);

     const __m128i qs1 =

-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);

+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);

     __m128i filt;

     __m128i work_a;

     __m128i filter1, filter2;

@@ -1507,10 +1519,10 @@

     p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

     p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);

-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);

-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

+    _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);

+    _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);

+    _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);

+    _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);

@@ -1650,7 +1662,7 @@

   } while (++idx8x8 < num_8x8_to_transpose);

-void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,

+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,

                                   const uint8_t *limit0, const uint8_t *thresh0,

                                   const uint8_t *blimit1, const uint8_t *limit1,

                                   const uint8_t *thresh1) {

@@ -1659,7 +1671,7 @@

   unsigned char *dst[2];

   // Transpose 8x16

-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

+  transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);

   // Loop filtering

   vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,

@@ -1667,13 +1679,13 @@

   src[0] = t_dst;

   src[1] = t_dst + 8;

   dst[0] = s - 4;

-  dst[1] = s - 4 + p * 8;

+  dst[1] = s - 4 + pitch * 8;

   // Transpose back

-  transpose(src, 16, dst, p, 2);

+  transpose(src, 16, dst, pitch, 2);

-void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,

+void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch,

                              const unsigned char *blimit,

                              const unsigned char *limit,

                              const unsigned char *thresh) {

@@ -1685,7 +1697,7 @@

   src[0] = s - 4;

   dst[0] = t_dst;

-  transpose(src, p, dst, 8, 1);

+  transpose(src, pitch, dst, 8, 1);

   // Loop filtering

   vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);

@@ -1694,10 +1706,10 @@

   dst[0] = s - 4;

   // Transpose back

-  transpose(src, 8, dst, p, 1);

+  transpose(src, 8, dst, pitch, 1);

-void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,

+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,

                                   const uint8_t *limit0, const uint8_t *thresh0,

                                   const uint8_t *blimit1, const uint8_t *limit1,

                                   const uint8_t *thresh1) {

@@ -1706,7 +1718,7 @@

   unsigned char *dst[2];

   // Transpose 8x16

-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

+  transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);

   // Loop filtering

   vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,

@@ -1715,13 +1727,13 @@

   src[1] = t_dst + 8;

   dst[0] = s - 4;

-  dst[1] = s - 4 + p * 8;

+  dst[1] = s - 4 + pitch * 8;

   // Transpose back

-  transpose(src, 16, dst, p, 2);

+  transpose(src, 16, dst, pitch, 2);

-void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,

+void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch,

                               const unsigned char *blimit,

                               const unsigned char *limit,

                               const unsigned char *thresh) {

@@ -1735,7 +1747,7 @@

   dst[1] = t_dst + 8 * 8;

   // Transpose 16x8

-  transpose(src, p, dst, 8, 2);

+  transpose(src, pitch, dst, 8, 2);

   // Loop filtering

   vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);

@@ -1746,22 +1758,22 @@

   dst[1] = s;

   // Transpose back

-  transpose(src, 8, dst, p, 2);

+  transpose(src, 8, dst, pitch, 2);

-void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,

+void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch,

                                    const uint8_t *blimit, const uint8_t *limit,

                                    const uint8_t *thresh) {

   DECLARE_ALIGNED(16, unsigned char, t_dst[256]);

   // Transpose 16x16

-  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);

-  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

+  transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);

+  transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);

   // Loop filtering

   vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);

   // Transpose back

-  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);

-  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);

+  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);

+  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);

--- a/vpx_dsp/x86/quantize_avx.c

+++ b/vpx_dsp/x86/quantize_avx.c

@@ -24,8 +24,8 @@

                         const int16_t *round_ptr, const int16_t *quant_ptr,

                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,

                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,

-                        uint16_t *eob_ptr, const int16_t *scan_ptr,

-                        const int16_t *iscan_ptr) {

+                        uint16_t *eob_ptr, const int16_t *scan,

+                        const int16_t *iscan) {

   const __m128i zero = _mm_setzero_si128();

   const __m256i big_zero = _mm256_setzero_si256();

   int index;

@@ -37,7 +37,7 @@

   __m128i all_zero;

   __m128i eob = zero, eob0;

-  (void)scan_ptr;

+  (void)scan;

   (void)skip_block;

   assert(!skip_block);

@@ -97,8 +97,7 @@

     store_tran_low(coeff0, dqcoeff_ptr);

     store_tran_low(coeff1, dqcoeff_ptr + 8);

-    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,

-                       zero);

+    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);

   // AC only loop.

@@ -141,8 +140,8 @@

     store_tran_low(coeff0, dqcoeff_ptr + index);

     store_tran_low(coeff1, dqcoeff_ptr + index + 8);

-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,

-                        index, zero);

+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,

+                        zero);

     eob = _mm_max_epi16(eob, eob0);

@@ -149,12 +148,14 @@

   *eob_ptr = accumulate_eob(eob);

-void vpx_quantize_b_32x32_avx(

-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,

-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,

-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,

-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,

-    const int16_t *scan_ptr, const int16_t *iscan_ptr) {

+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

+                              int skip_block, const int16_t *zbin_ptr,

+                              const int16_t *round_ptr,

+                              const int16_t *quant_ptr,

+                              const int16_t *quant_shift_ptr,

+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,

+                              const int16_t *scan, const int16_t *iscan) {

   const __m128i zero = _mm_setzero_si128();

   const __m128i one = _mm_set1_epi16(1);

   const __m256i big_zero = _mm256_setzero_si256();

@@ -167,7 +168,7 @@

   __m128i all_zero;

   __m128i eob = zero, eob0;

-  (void)scan_ptr;

+  (void)scan;

   (void)n_coeffs;

   (void)skip_block;

   assert(!skip_block);

@@ -253,8 +254,7 @@

     store_tran_low(coeff0, dqcoeff_ptr);

     store_tran_low(coeff1, dqcoeff_ptr + 8);

-    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,

-                       zero);

+    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);

   // AC only loop.

@@ -306,8 +306,8 @@

     store_tran_low(coeff0, dqcoeff_ptr + index);

     store_tran_low(coeff1, dqcoeff_ptr + index + 8);

-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,

-                        index, zero);

+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,

+                        zero);

     eob = _mm_max_epi16(eob, eob0);

--- a/vpx_dsp/x86/quantize_sse2.c

+++ b/vpx_dsp/x86/quantize_sse2.c

@@ -22,8 +22,8 @@

                          const int16_t *round_ptr, const int16_t *quant_ptr,

                          const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,

                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,

-                         uint16_t *eob_ptr, const int16_t *scan_ptr,

-                         const int16_t *iscan_ptr) {

+                         uint16_t *eob_ptr, const int16_t *scan,

+                         const int16_t *iscan) {

   const __m128i zero = _mm_setzero_si128();

   int index = 16;

@@ -33,7 +33,7 @@

   __m128i cmp_mask0, cmp_mask1;

   __m128i eob, eob0;

-  (void)scan_ptr;

+  (void)scan;

   (void)skip_block;

   assert(!skip_block);

@@ -81,8 +81,7 @@

   store_tran_low(coeff0, dqcoeff_ptr);

   store_tran_low(coeff1, dqcoeff_ptr + 8);

-  eob =

-      scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);

+  eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);

   // AC only loop.

   while (index < n_coeffs) {

@@ -115,8 +114,8 @@

     store_tran_low(coeff0, dqcoeff_ptr + index);

     store_tran_low(coeff1, dqcoeff_ptr + index + 8);

-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,

-                        index, zero);

+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,

+                        zero);

     eob = _mm_max_epi16(eob, eob0);

     index += 16;

--- a/vpx_dsp/x86/quantize_ssse3.c

+++ b/vpx_dsp/x86/quantize_ssse3.c

@@ -22,7 +22,7 @@

                           const int16_t *quant_shift_ptr,

                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

                           const int16_t *dequant_ptr, uint16_t *eob_ptr,

-                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {

+                          const int16_t *scan, const int16_t *iscan) {

   const __m128i zero = _mm_setzero_si128();

   int index = 16;

@@ -32,7 +32,7 @@

   __m128i cmp_mask0, cmp_mask1;

   __m128i eob, eob0;

-  (void)scan_ptr;

+  (void)scan;

   (void)skip_block;

   assert(!skip_block);

@@ -74,8 +74,7 @@

   store_tran_low(coeff0, dqcoeff_ptr);

   store_tran_low(coeff1, dqcoeff_ptr + 8);

-  eob =

-      scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);

+  eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);

   // AC only loop.

   while (index < n_coeffs) {

@@ -106,8 +105,8 @@

     store_tran_low(coeff0, dqcoeff_ptr + index);

     store_tran_low(coeff1, dqcoeff_ptr + index + 8);

-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,

-                        index, zero);

+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,

+                        zero);

     eob = _mm_max_epi16(eob, eob0);

     index += 16;

@@ -116,12 +115,14 @@

   *eob_ptr = accumulate_eob(eob);

-void vpx_quantize_b_32x32_ssse3(

-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,

-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,

-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,

-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,

-    const int16_t *scan_ptr, const int16_t *iscan_ptr) {

+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

+                                int skip_block, const int16_t *zbin_ptr,

+                                const int16_t *round_ptr,

+                                const int16_t *quant_ptr,

+                                const int16_t *quant_shift_ptr,

+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,

+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,

+                                const int16_t *scan, const int16_t *iscan) {

   const __m128i zero = _mm_setzero_si128();

   const __m128i one = _mm_set1_epi16(1);

   int index;

@@ -133,7 +134,7 @@

   __m128i all_zero;

   __m128i eob = zero, eob0;

-  (void)scan_ptr;

+  (void)scan;

   (void)n_coeffs;

   (void)skip_block;

   assert(!skip_block);

@@ -226,8 +227,7 @@

     store_tran_low(coeff0, dqcoeff_ptr);

     store_tran_low(coeff1, dqcoeff_ptr + 8);

-    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,

-                       zero);

+    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);

   // AC only loop.

@@ -283,8 +283,8 @@

     store_tran_low(coeff0, dqcoeff_ptr + index);

     store_tran_low(coeff1, dqcoeff_ptr + index + 8);

-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,

-                        index, zero);

+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,

+                        zero);

     eob = _mm_max_epi16(eob, eob0);

--- a/vpx_dsp/x86/quantize_x86.h

+++ b/vpx_dsp/x86/quantize_x86.h

@@ -48,17 +48,17 @@

   return _mm_mullo_epi16(qcoeff, dequant);

-// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing

-// to zbin to add 1 to the index in 'scan'.

+// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to

+// zbin to add 1 to the index in 'scan'.

 static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,

                                    const __m128i zbin_mask0,

                                    const __m128i zbin_mask1,

-                                   const int16_t *scan_ptr, const int index,

+                                   const int16_t *scan, const int index,

                                    const __m128i zero) {

   const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);

   const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);

-  __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));

-  __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));

+  __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index));

+  __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8));

   __m128i eob0, eob1;

   // Add one to convert from indices to counts

   scan0 = _mm_sub_epi16(scan0, zbin_mask0);

--- a/vpx_dsp/x86/sad4d_avx2.c

+++ b/vpx_dsp/x86/sad4d_avx2.c

@@ -12,26 +12,26 @@

 #include "vpx/vpx_integer.h"

 static INLINE void calc_final(const __m256i *const sums /*[4]*/,

-                              uint32_t res[4]) {

+                              uint32_t sad_array[4]) {

   const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);

   const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);

   const __m256i t2 = _mm256_hadd_epi32(t0, t1);

   const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),

                                     _mm256_extractf128_si256(t2, 1));

-  _mm_storeu_si128((__m128i *)res, sum);

+  _mm_storeu_si128((__m128i *)sad_array, sum);

-void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride,

-                          const uint8_t *const ref[4], int ref_stride,

-                          uint32_t res[4]) {

+void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,

+                          const uint8_t *const ref_array[4], int ref_stride,

+                          uint32_t sad_array[4]) {

   int i;

   const uint8_t *refs[4];

   __m256i sums[4];

-  refs[0] = ref[0];

-  refs[1] = ref[1];

-  refs[2] = ref[2];

-  refs[3] = ref[3];

+  refs[0] = ref_array[0];

+  refs[1] = ref_array[1];

+  refs[2] = ref_array[2];

+  refs[3] = ref_array[3];

   sums[0] = _mm256_setzero_si256();

   sums[1] = _mm256_setzero_si256();

   sums[2] = _mm256_setzero_si256();

@@ -40,26 +40,26 @@

   for (i = 0; i < 32; i++) {

     __m256i r[4];

-    // load src and all refs

-    const __m256i s = _mm256_load_si256((const __m256i *)src);

+    // load src and all ref[]

+    const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);

     r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);

     r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);

     r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);

     r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);

-    // sum of the absolute differences between every ref-i to src

+    // sum of the absolute differences between every ref[] to src

     r[0] = _mm256_sad_epu8(r[0], s);

     r[1] = _mm256_sad_epu8(r[1], s);

     r[2] = _mm256_sad_epu8(r[2], s);

     r[3] = _mm256_sad_epu8(r[3], s);

-    // sum every ref-i

+    // sum every ref[]

     sums[0] = _mm256_add_epi32(sums[0], r[0]);

     sums[1] = _mm256_add_epi32(sums[1], r[1]);

     sums[2] = _mm256_add_epi32(sums[2], r[2]);

     sums[3] = _mm256_add_epi32(sums[3], r[3]);

-    src += src_stride;

+    src_ptr += src_stride;

     refs[0] += ref_stride;

     refs[1] += ref_stride;

     refs[2] += ref_stride;

@@ -66,20 +66,20 @@

     refs[3] += ref_stride;

-  calc_final(sums, res);

+  calc_final(sums, sad_array);

-void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,

-                          const uint8_t *const ref[4], int ref_stride,

-                          uint32_t res[4]) {

+void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,

+                          const uint8_t *const ref_array[4], int ref_stride,

+                          uint32_t sad_array[4]) {

   __m256i sums[4];

   int i;

   const uint8_t *refs[4];

-  refs[0] = ref[0];

-  refs[1] = ref[1];

-  refs[2] = ref[2];

-  refs[3] = ref[3];

+  refs[0] = ref_array[0];

+  refs[1] = ref_array[1];

+  refs[2] = ref_array[2];

+  refs[3] = ref_array[3];

   sums[0] = _mm256_setzero_si256();

   sums[1] = _mm256_setzero_si256();

   sums[2] = _mm256_setzero_si256();

@@ -87,9 +87,9 @@

   for (i = 0; i < 64; i++) {

     __m256i r_lo[4], r_hi[4];

-    // load 64 bytes from src and all refs

-    const __m256i s_lo = _mm256_load_si256((const __m256i *)src);

-    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src + 32));

+    // load 64 bytes from src and all ref[]

+    const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);

+    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));

     r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);

     r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));

     r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);

@@ -99,7 +99,7 @@

     r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);

     r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));

-    // sum of the absolute differences between every ref-i to src

+    // sum of the absolute differences between every ref[] to src

     r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);

     r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);

     r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);

@@ -109,7 +109,7 @@

     r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);

     r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);

-    // sum every ref-i

+    // sum every ref[]

     sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);

     sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);

     sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);

@@ -119,7 +119,7 @@

     sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);

     sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);

-    src += src_stride;

+    src_ptr += src_stride;

     refs[0] += ref_stride;

     refs[1] += ref_stride;

     refs[2] += ref_stride;

@@ -126,5 +126,5 @@

     refs[3] += ref_stride;

-  calc_final(sums, res);

+  calc_final(sums, sad_array);

--- a/vpx_dsp/x86/sad4d_avx512.c

+++ b/vpx_dsp/x86/sad4d_avx512.c

@@ -11,8 +11,8 @@

 #include "./vpx_dsp_rtcd.h"

 #include "vpx/vpx_integer.h"

-void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride,

-                            const uint8_t *const ref[4], int ref_stride,

+void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,

+                            const uint8_t *const ref_array[4], int ref_stride,

                             uint32_t res[4]) {

   __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;

   __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;

@@ -20,33 +20,33 @@

   int i;

   const uint8_t *ref0, *ref1, *ref2, *ref3;

-  ref0 = ref[0];

-  ref1 = ref[1];

-  ref2 = ref[2];

-  ref3 = ref[3];

+  ref0 = ref_array[0];

+  ref1 = ref_array[1];

+  ref2 = ref_array[2];

+  ref3 = ref_array[3];

   sum_ref0 = _mm512_set1_epi16(0);

   sum_ref1 = _mm512_set1_epi16(0);

   sum_ref2 = _mm512_set1_epi16(0);

   sum_ref3 = _mm512_set1_epi16(0);

   for (i = 0; i < 64; i++) {

-    // load src and all refs

-    src_reg = _mm512_loadu_si512((const __m512i *)src);

+    // load src and all ref[]

+    src_reg = _mm512_loadu_si512((const __m512i *)src_ptr);

     ref0_reg = _mm512_loadu_si512((const __m512i *)ref0);

     ref1_reg = _mm512_loadu_si512((const __m512i *)ref1);

     ref2_reg = _mm512_loadu_si512((const __m512i *)ref2);

     ref3_reg = _mm512_loadu_si512((const __m512i *)ref3);

-    // sum of the absolute differences between every ref-i to src

+    // sum of the absolute differences between every ref[] to src

     ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg);

     ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg);

     ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg);

     ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg);

-    // sum every ref-i

+    // sum every ref[]

     sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg);

     sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg);

     sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg);

     sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg);

-    src += src_stride;

+    src_ptr += src_stride;

     ref0 += ref_stride;

     ref1 += ref_stride;

     ref2 += ref_stride;

@@ -55,7 +55,7 @@

     __m256i sum256;

     __m128i sum128;

-    // in sum_ref-i the result is saved in the first 4 bytes

+    // in sum_ref[] the result is saved in the first 4 bytes

     // the other 4 bytes are zeroed.

     // sum_ref1 and sum_ref3 are shifted left by 4 bytes

     sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4);

@@ -65,7 +65,7 @@

     sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1);

     sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3);

-    // merge every 64 bit from each sum_ref-i

+    // merge every 64 bit from each sum_ref[]

     sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2);

     sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2);

--- a/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm

+++ b/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm

@@ -45,7 +45,7 @@

     ;Compute max and min values of a pixel

     mov         rdx, 0x00010001

-    movsxd      rcx, DWORD PTR arg(6)      ;bps

+    movsxd      rcx, DWORD PTR arg(6)      ;bd

     movq        xmm0, rdx

     movq        xmm1, rcx

     pshufd      xmm0, xmm0, 0b

@@ -121,7 +121,7 @@

     ;Compute max and min values of a pixel

     mov         rdx, 0x00010001

-    movsxd      rcx, DWORD PTR arg(6)       ;bps

+    movsxd      rcx, DWORD PTR arg(6)       ;bd

     movq        xmm0, rdx

     movq        xmm1, rcx

     pshufd      xmm0, xmm0, 0b

--- a/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm

+++ b/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm

@@ -26,7 +26,7 @@

     pshufd      xmm3, xmm3, 0

     mov         rdx, 0x00010001

-    movsxd      rcx, DWORD PTR arg(6)       ;bps

+    movsxd      rcx, DWORD PTR arg(6)       ;bd

     movq        xmm5, rdx

     movq        xmm2, rcx

     pshufd      xmm5, xmm5, 0b

@@ -82,7 +82,7 @@

     pshufd      xmm4, xmm4, 0

     mov         rdx, 0x00010001

-    movsxd      rcx, DWORD PTR arg(6)       ;bps

+    movsxd      rcx, DWORD PTR arg(6)       ;bd

     movq        xmm8, rdx

     movq        xmm5, rcx

     pshufd      xmm8, xmm8, 0b

--

⑨