shithub: libvpx

--- a/vpx_dsp/vpx_convolve.c

+++ b/vpx_dsp/vpx_convolve.c

@@ -31,8 +31,7 @@

     for (x = 0; x < w; ++x) {

       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];

       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];

-      int k, sum = 0;

-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];

+      int sum = ((src_x[0]*x_filter[0] + src_x[1]*x_filter[1]) + (src_x[2]*x_filter[2] + src_x[3]*x_filter[3])) + ((src_x[4]*x_filter[4] + src_x[5]*x_filter[5]) + (src_x[6]*x_filter[6] + src_x[7]*x_filter[7]));

       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));

       x_q4 += x_step_q4;

@@ -53,8 +52,7 @@

     for (x = 0; x < w; ++x) {

       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];

       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];

-      int k, sum = 0;

-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];

+      int sum = ((src_x[0]*x_filter[0] + src_x[1]*x_filter[1]) + (src_x[2]*x_filter[2] + src_x[3]*x_filter[3])) + ((src_x[4]*x_filter[4] + src_x[5]*x_filter[5]) + (src_x[6]*x_filter[6] + src_x[7]*x_filter[7]));

       dst[x] = ROUND_POWER_OF_TWO(

           dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);

       x_q4 += x_step_q4;

@@ -76,9 +74,7 @@

     for (y = 0; y < h; ++y) {

       const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];

       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

-      int k, sum = 0;

-      for (k = 0; k < SUBPEL_TAPS; ++k)

-        sum += src_y[k * src_stride] * y_filter[k];

+      int sum = ((src_y[src_stride*0]*y_filter[0] + src_y[src_stride*1]*y_filter[1]) + (src_y[src_stride*2]*y_filter[2] + src_y[src_stride*3]*y_filter[3])) + ((src_y[src_stride*4]*y_filter[4] + src_y[src_stride*5]*y_filter[5]) + (src_y[src_stride*6]*y_filter[6] + src_y[src_stride*7]*y_filter[7]));

       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));

       y_q4 += y_step_q4;

@@ -99,9 +95,7 @@

     for (y = 0; y < h; ++y) {

       const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];

       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

-      int k, sum = 0;

-      for (k = 0; k < SUBPEL_TAPS; ++k)

-        sum += src_y[k * src_stride] * y_filter[k];

+      int sum = ((src_y[src_stride*0]*y_filter[0] + src_y[src_stride*1]*y_filter[1]) + (src_y[src_stride*2]*y_filter[2] + src_y[src_stride*3]*y_filter[3])) + ((src_y[src_stride*4]*y_filter[4] + src_y[src_stride*5]*y_filter[5]) + (src_y[src_stride*6]*y_filter[6] + src_y[src_stride*7]*y_filter[7]));

       dst[y * dst_stride] = ROUND_POWER_OF_TWO(

           dst[y * dst_stride] +

               clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),

--- a/vpx_dsp/vpx_dsp_common.h

+++ b/vpx_dsp/vpx_dsp_common.h

@@ -45,33 +45,15 @@

 typedef int16_t tran_coef_t;

-static INLINE uint8_t clip_pixel(int val) {

-  return (val > 255) ? 255 : (val < 0) ? 0 : val;

-}

+#define clip_pixel(val) (uint8_t)(((int)(val) > 255) ? 255 : (((int)(val) < 0) ? 0 : (val)))

+#define clamp(value, low, high) (int)((int)(value) < (int)(low) ? (low) : ((int)(value) > (int)(high) ? (high) : (value)))

+#define fclamp(value, low, high) (double)((double)(value) < (double)(low) ? (low) : ((double)(value) > (double)(high) ? (high) : (value)))

+#define lclamp(value, low, high) (int64_t)((int64_t)(value) < (int64_t)(low) ? (low) : ((int64_t)(value) > (int64_t)(high) ? (high) : (value)))

+#define clip_pixel_highbd(val, bd) (uint16_t)((bd) == 12 ? clamp((val), 0, 4095) : ((bd) == 10 ? clamp((val), 0, 1023) : clamp((val), 0, 255)))

-static INLINE int clamp(int value, int low, int high) {

-  return value < low ? low : (value > high ? high : value);

-}

-static INLINE double fclamp(double value, double low, double high) {

-  return value < low ? low : (value > high ? high : value);

-}

-static INLINE int64_t lclamp(int64_t value, int64_t low, int64_t high) {

-  return value < low ? low : (value > high ? high : value);

-}

-static INLINE uint16_t clip_pixel_highbd(int val, int bd) {

-  switch (bd) {

-    case 8:

-    default: return (uint16_t)clamp(val, 0, 255);

-    case 10: return (uint16_t)clamp(val, 0, 1023);

-    case 12: return (uint16_t)clamp(val, 0, 4095);

-  }

-}

 #ifdef __cplusplus

 }  // extern "C"

 #endif

 #endif  // VPX_VPX_DSP_VPX_DSP_COMMON_H_

--

⑨