ref: 7c880906a0b4c7586ac167f13eb721548989061f
parent: df0d3a415216340a44953c6ed936bc2a4d7a1175
author: Sigrid Solveig Haflínudóttir <sigrid@ftrv.se>
date: Wed Dec 31 19:04:49 EST 1969
convolutions: make things a bit faster by inlining and unrolling loops
--- a/vpx_dsp/vpx_convolve.c
+++ b/vpx_dsp/vpx_convolve.c
@@ -31,8 +31,7 @@
for (x = 0; x < w; ++x) {
const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ int sum = ((src_x[0]*x_filter[0] + src_x[1]*x_filter[1]) + (src_x[2]*x_filter[2] + src_x[3]*x_filter[3])) + ((src_x[4]*x_filter[4] + src_x[5]*x_filter[5]) + (src_x[6]*x_filter[6] + src_x[7]*x_filter[7])); /* unrolled 8-tap sum; kept signed because x_filter is int16_t and the sum may be negative */
dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
x_q4 += x_step_q4;
}
@@ -53,8 +52,7 @@
for (x = 0; x < w; ++x) {
const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ int sum = ((src_x[0]*x_filter[0] + src_x[1]*x_filter[1]) + (src_x[2]*x_filter[2] + src_x[3]*x_filter[3])) + ((src_x[4]*x_filter[4] + src_x[5]*x_filter[5]) + (src_x[6]*x_filter[6] + src_x[7]*x_filter[7])); /* unrolled 8-tap sum; kept signed because x_filter is int16_t and the sum may be negative */
dst[x] = ROUND_POWER_OF_TWO(
dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
x_q4 += x_step_q4;
@@ -76,9 +74,7 @@
for (y = 0; y < h; ++y) {
const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_y[k * src_stride] * y_filter[k];
+ int sum = ((src_y[src_stride*0]*y_filter[0] + src_y[src_stride*1]*y_filter[1]) + (src_y[src_stride*2]*y_filter[2] + src_y[src_stride*3]*y_filter[3])) + ((src_y[src_stride*4]*y_filter[4] + src_y[src_stride*5]*y_filter[5]) + (src_y[src_stride*6]*y_filter[6] + src_y[src_stride*7]*y_filter[7])); /* unrolled 8-tap sum; kept signed because y_filter is int16_t and the sum may be negative */
dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
y_q4 += y_step_q4;
}
@@ -99,9 +95,7 @@
for (y = 0; y < h; ++y) {
const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_y[k * src_stride] * y_filter[k];
+ int sum = ((src_y[src_stride*0]*y_filter[0] + src_y[src_stride*1]*y_filter[1]) + (src_y[src_stride*2]*y_filter[2] + src_y[src_stride*3]*y_filter[3])) + ((src_y[src_stride*4]*y_filter[4] + src_y[src_stride*5]*y_filter[5]) + (src_y[src_stride*6]*y_filter[6] + src_y[src_stride*7]*y_filter[7])); /* unrolled 8-tap sum; kept signed because y_filter is int16_t and the sum may be negative */
dst[y * dst_stride] = ROUND_POWER_OF_TWO(
dst[y * dst_stride] +
clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -45,33 +45,15 @@
typedef int16_t tran_coef_t;
-static INLINE uint8_t clip_pixel(int val) {
- return (val > 255) ? 255 : (val < 0) ? 0 : val;
-}
+#define clip_pixel(val) ((uint8_t)(((int)(val) > 255) ? 255 : (((int)(val) < 0) ? 0 : (val)))) /* saturate to 0..255; val is evaluated more than once, so pass side-effect-free expressions */
+#define clamp(value, low, high) ((int)((int)(value) < (int)(low) ? (low) : ((int)(value) > (int)(high) ? (high) : (value)))) /* int clamp to [low, high]; arguments are evaluated more than once */
+#define fclamp(value, low, high) ((double)((double)(value) < (double)(low) ? (low) : ((double)(value) > (double)(high) ? (high) : (value)))) /* double clamp; keeps the name of the fclamp() function it replaces */
+#define lclamp(value, low, high) ((int64_t)((int64_t)(value) < (int64_t)(low) ? (low) : ((int64_t)(value) > (int64_t)(high) ? (high) : (value)))) /* int64_t clamp; arguments are evaluated more than once */
+#define clip_pixel_highbd(val, bd) ((uint16_t)((bd) == 12 ? clamp((val), 0, 4095) : ((bd) == 10 ? clamp((val), 0, 1023) : clamp((val), 0, 255)))) /* bd 12 -> 0..4095, bd 10 -> 0..1023, any other bd -> 0..255 */
-static INLINE int clamp(int value, int low, int high) {
- return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE double fclamp(double value, double low, double high) {
- return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE int64_t lclamp(int64_t value, int64_t low, int64_t high) {
- return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
- switch (bd) {
- case 8:
- default: return (uint16_t)clamp(val, 0, 255);
- case 10: return (uint16_t)clamp(val, 0, 1023);
- case 12: return (uint16_t)clamp(val, 0, 4095);
- }
-}
-
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VPX_VPX_DSP_VPX_DSP_COMMON_H_
+