ref: 7544e766e495771b17cceba07f0df8c47d546b6e
parent: be380f200503e133423f81b55cfa28a2544eb704
 parent: a2a13cbe5f330d32ea702c034819cb213c8ca1ee
	author: James Zern <jzern@google.com>
	date: Fri May 29 21:51:35 EDT 2015
	
Merge changes from topic 'vp9-intra-pred' * changes: vp9_reconintra_neon: add DC 16x16 predictors vp9_reconintra_neon: add DC 8x8 predictors
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -248,9 +248,11 @@
#endif // HAVE_DSPR2
#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred8, NULL, NULL, NULL, NULL,
- vp9_v_predictor_8x8_neon, vp9_h_predictor_8x8_neon, NULL, NULL,
- NULL, NULL, NULL, NULL, vp9_tm_predictor_8x8_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred8, vp9_dc_predictor_8x8_neon,
+ vp9_dc_left_predictor_8x8_neon, vp9_dc_top_predictor_8x8_neon,
+ vp9_dc_128_predictor_8x8_neon, vp9_v_predictor_8x8_neon,
+ vp9_h_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+ vp9_tm_predictor_8x8_neon)
#endif // HAVE_NEON
@@ -289,9 +291,12 @@
#endif // HAVE_DSPR2
#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred16, NULL, NULL, NULL, NULL,
- vp9_v_predictor_16x16_neon, vp9_h_predictor_16x16_neon, NULL,
- NULL, NULL, NULL, NULL, NULL, vp9_tm_predictor_16x16_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred16, vp9_dc_predictor_16x16_neon,
+ vp9_dc_left_predictor_16x16_neon,
+ vp9_dc_top_predictor_16x16_neon,
+ vp9_dc_128_predictor_16x16_neon, vp9_v_predictor_16x16_neon,
+ vp9_h_predictor_16x16_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+ vp9_tm_predictor_16x16_neon)
#endif // HAVE_NEON
// -----------------------------------------------------------------------------
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -8,9 +8,161 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <stddef.h>
#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+                          int do_above, int do_left) {+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+  if (do_above) {+ const uint8x8_t A = vld1_u8(above); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ sum_top = vcombine_u16(p2, p2);
+ }
+
+  if (do_left) {+ const uint8x8_t L = vld1_u8(left); // left border
+ const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ sum_left = vcombine_u16(p2, p2);
+ }
+
+  if (do_above && do_left) {+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 4);
+  } else if (do_above) {+ dc0 = vrshrn_n_u16(sum_top, 3);
+  } else if (do_left) {+ dc0 = vrshrn_n_u16(sum_left, 3);
+  } else {+ dc0 = vdup_n_u8(0x80);
+ }
+
+  {+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+    for (i = 0; i < 8; ++i) {+ vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
+ }
+ }
+}
+
+void vp9_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {+ dc_8x8(dst, stride, above, left, 1, 1);
+}
+
+void vp9_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {+ (void)above;
+ dc_8x8(dst, stride, NULL, left, 0, 1);
+}
+
+void vp9_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {+ (void)left;
+ dc_8x8(dst, stride, above, NULL, 1, 0);
+}
+
+void vp9_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {+ (void)above;
+ (void)left;
+ dc_8x8(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+                            int do_above, int do_left) {+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+  if (do_above) {+ const uint8x16_t A = vld1q_u8(above); // top row
+ const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ const uint16x4_t p3 = vpadd_u16(p2, p2);
+ sum_top = vcombine_u16(p3, p3);
+ }
+
+  if (do_left) {+ const uint8x16_t L = vld1q_u8(left); // left row
+ const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left
+ const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ const uint16x4_t p3 = vpadd_u16(p2, p2);
+ sum_left = vcombine_u16(p3, p3);
+ }
+
+  if (do_above && do_left) {+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 5);
+  } else if (do_above) {+ dc0 = vrshrn_n_u16(sum_top, 4);
+  } else if (do_left) {+ dc0 = vrshrn_n_u16(sum_left, 4);
+  } else {+ dc0 = vdup_n_u8(0x80);
+ }
+
+  {+ const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+ int i;
+    for (i = 0; i < 16; ++i) {+ vst1q_u8(dst + i * stride, dc);
+ }
+ }
+}
+
+void vp9_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {+ dc_16x16(dst, stride, above, left, 1, 1);
+}
+
+void vp9_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+                                      const uint8_t *left) {+ (void)above;
+ dc_16x16(dst, stride, NULL, left, 0, 1);
+}
+
+void vp9_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+                                     const uint8_t *left) {+ (void)left;
+ dc_16x16(dst, stride, above, NULL, 1, 0);
+}
+
+void vp9_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+                                     const uint8_t *left) {+ (void)above;
+ (void)left;
+ dc_16x16(dst, stride, NULL, NULL, 0, 0);
+}
+
+#if !HAVE_NEON_ASM
+
void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {int i;
@@ -423,3 +575,4 @@
}
}
}
+#endif // !HAVE_NEON_ASM
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -123,16 +123,16 @@
specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc";
add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc";
+specialize qw/vp9_dc_predictor_8x8 dspr2 neon/, "$sse_x86inc";
add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_8x8/, "$sse_x86inc";
+specialize qw/vp9_dc_top_predictor_8x8 neon/, "$sse_x86inc";
add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_8x8/, "$sse_x86inc";
+specialize qw/vp9_dc_left_predictor_8x8 neon/, "$sse_x86inc";
add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_8x8/, "$sse_x86inc";
+specialize qw/vp9_dc_128_predictor_8x8 neon/, "$sse_x86inc";
add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc";
@@ -162,16 +162,16 @@
specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc";
add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc";
+specialize qw/vp9_dc_predictor_16x16 dspr2 neon/, "$sse2_x86inc";
add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_16x16/, "$sse2_x86inc";
+specialize qw/vp9_dc_top_predictor_16x16 neon/, "$sse2_x86inc";
add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_16x16/, "$sse2_x86inc";
+specialize qw/vp9_dc_left_predictor_16x16 neon/, "$sse2_x86inc";
add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_16x16/, "$sse2_x86inc";
+specialize qw/vp9_dc_128_predictor_16x16 neon/, "$sse2_x86inc";
add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc";
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -199,8 +199,9 @@
# TODO(johannkoenig): re-enable when chromium build is fixed
# # https://code.google.com/p/chromium/issues/detail?id=443839
#VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
+
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_reconintra_neon.c
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
--
⑨