ref: e69b5258fd604e95d39b6eff6ea68e7e6e941d5d
parent: 13dbf1fb173753cadba8c66a6bf08b2757ab07e6
author: Jim Bankoski <jimbankoski@google.com>
date: Thu Nov 29 01:53:08 EST 2012
fix vp9_vp8 files renamed Change-Id: I20c426e91ee49666db42e20eb074095ab6b8ec5d
--- /dev/null
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -1,0 +1,602 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_subpixel.h"
+
+extern const short vp9_six_tap_mmx[16][6 * 8];
+
+extern const short vp9_bilinear_filters_8x_mmx[16][2 * 8];
+
+extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1d8_h6_sse2(unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1d16_h6_sse2(unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int dst_ptich,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int dst_ptich,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp9_filter);
+
+extern void vp9_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width);
+
+extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int output_height,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
+ unsigned int src_pixels_per_lin,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int output_height,
+ const short *vp9_filter);
+
+extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int output_height,
+ const short *vp9_filter);
+
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
+
+#if HAVE_MMX
+void vp9_sixtap_predict4x4_mmx(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict4x4_mmx\n");
+#endif
+ /* Temp data bufffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
+ const short *hfilter, *vfilter;
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
+ src_pixels_per_line, 1, 9, 8, hfilter);
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
+ 8, 4, 4, 4, vfilter);
+}
+
+void vp9_sixtap_predict16x16_mmx(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict16x16_mmx\n");
+#endif
+ /* Temp data bufffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
+ const short *hfilter, *vfilter;
+
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+ fdata2, src_pixels_per_line, 1, 21, 32,
+ hfilter);
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+ fdata2 + 4, src_pixels_per_line, 1, 21, 32,
+ hfilter);
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
+ fdata2 + 8, src_pixels_per_line, 1, 21, 32,
+ hfilter);
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
+ fdata2 + 12, src_pixels_per_line, 1, 21, 32,
+ hfilter);
+
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr, dst_pitch,
+ 32, 16, 16, 16, vfilter);
+ vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4, dst_pitch,
+ 32, 16, 16, 16, vfilter);
+ vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8, dst_pitch,
+ 32, 16, 16, 16, vfilter);
+ vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
+ 32, 16, 16, 16, vfilter);
+}
+
+void vp9_sixtap_predict8x8_mmx(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict8x8_mmx\n");
+#endif
+ /* Temp data bufffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+ const short *hfilter, *vfilter;
+
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+ fdata2, src_pixels_per_line, 1, 13, 16,
+ hfilter);
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+ fdata2 + 4, src_pixels_per_line, 1, 13, 16,
+ hfilter);
+
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
+ 16, 8, 8, 8, vfilter);
+ vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
+ 16, 8, 8, 8, vfilter);
+}
+
+void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict8x4_mmx\n");
+#endif
+ /* Temp data bufffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+ const short *hfilter, *vfilter;
+
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+ fdata2, src_pixels_per_line, 1, 9, 16, hfilter);
+ vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+ fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
+
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
+ 16, 8, 4, 8, vfilter);
+ vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
+ 16, 8, 4, 8, vfilter);
+}
+
+void vp9_bilinear_predict16x16_mmx(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ vp9_bilinear_predict8x8_mmx(src_ptr,
+ src_pixels_per_line, xoffset, yoffset,
+ dst_ptr, dst_pitch);
+ vp9_bilinear_predict8x8_mmx(src_ptr + 8,
+ src_pixels_per_line, xoffset, yoffset,
+ dst_ptr + 8, dst_pitch);
+ vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line,
+ src_pixels_per_line, xoffset, yoffset,
+ dst_ptr + dst_pitch * 8, dst_pitch);
+ vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8,
+ src_pixels_per_line, xoffset, yoffset,
+ dst_ptr + dst_pitch * 8 + 8, dst_pitch);
+}
+#endif
+
+#if HAVE_SSE2
+void vp9_sixtap_predict16x16_sse2(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ /* Temp data bufffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
+ const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict16x16_sse2\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+ src_pixels_per_line, 1, 21, 32, hfilter);
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
+ 32, 16, 16, dst_pitch, vfilter);
+ } else {
+ /* First-pass only */
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, hfilter);
+ }
+ } else {
+ /* Second-pass only */
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+ src_pixels_per_line, 21, 32);
+ vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
+ 32, 16, 16, dst_pitch, vfilter);
+ }
+}
+
+void vp9_sixtap_predict8x8_sse2(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ /* Temp data bufffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+ const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict8x8_sse2\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+ src_pixels_per_line, 1, 13, 16, hfilter);
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
+ 16, 8, 8, dst_pitch, vfilter);
+ } else {
+ /* First-pass only */
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, hfilter);
+ }
+ } else {
+ /* Second-pass only */
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, vfilter);
+ }
+}
+
+void vp9_sixtap_predict8x4_sse2(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ /* Temp data bufffer used in filtering */
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+ const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict8x4_sse2\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+ src_pixels_per_line, 1, 9, 16, hfilter);
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
+ 16, 8, 4, dst_pitch, vfilter);
+ } else {
+ /* First-pass only */
+ hfilter = vp9_six_tap_mmx[xoffset];
+ vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, hfilter);
+ }
+ } else {
+ /* Second-pass only */
+ vfilter = vp9_six_tap_mmx[yoffset];
+ vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, vfilter);
+ }
+}
+#endif
+
+#if HAVE_SSSE3
+extern void vp9_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp9_filter_index);
+
+extern void vp9_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp9_filter_index);
+
+extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp9_filter_index);
+
+extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp9_filter_index);
+
+extern void vp9_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp9_filter_index);
+
+extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp9_filter_index);
+
+void vp9_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict16x16_ssse3\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
+ vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ fdata2, 16, 21, xoffset);
+ vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,
+ 16, yoffset);
+ } else {
+ /* First-pass only */
+ vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, xoffset);
+ }
+ } else {
+ /* Second-pass only */
+ vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, yoffset);
+ }
+}
+
+void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict8x8_ssse3\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
+ vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, fdata2, 8, 13, xoffset);
+ vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);
+ } else {
+ vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, xoffset);
+ }
+ } else {
+ /* Second-pass only */
+ vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, yoffset);
+ }
+}
+
+void vp9_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict8x4_ssse3\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
+ vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, fdata2, 8, 9, xoffset);
+ vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);
+ } else {
+ /* First-pass only */
+ vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, xoffset);
+ }
+ } else {
+ /* Second-pass only */
+ vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, yoffset);
+ }
+}
+
+void vp9_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);
+#ifdef ANNOUNCE_FUNCTION
+ printf("vp9_sixtap_predict4x4_ssse3\n");
+#endif
+
+ if (xoffset) {
+ if (yoffset) {
+ vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, fdata2, 4, 9, xoffset);
+ vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);
+ } else {
+ vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, xoffset);
+ }
+ } else {
+ vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, yoffset);
+ }
+}
+
+void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_stride,
+ const short *hfilter_aligned16,
+ const short *vfilter_aligned16,
+ unsigned char *dst_ptr,
+ unsigned int dst_stride) {
+ if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+ vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+ fdata2, 16, 23, hfilter_aligned16);
+ vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,
+ vfilter_aligned16);
+ } else {
+ if (hfilter_aligned16[3] != 128) {
+ vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,
+ 16, hfilter_aligned16);
+ } else {
+ vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+ dst_ptr, dst_stride, 16, vfilter_aligned16);
+ }
+ }
+}
+
+void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_stride,
+ const short *hfilter_aligned16,
+ const short *vfilter_aligned16,
+ unsigned char *dst_ptr,
+ unsigned int dst_stride) {
+ if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+ vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+ fdata2, 16, 15, hfilter_aligned16);
+ vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
+ vfilter_aligned16);
+ } else {
+ if (hfilter_aligned16[3] != 128) {
+ vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
+ hfilter_aligned16);
+ } else {
+ vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+ dst_ptr, dst_stride, 8, vfilter_aligned16);
+ }
+ }
+}
+
+void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_stride,
+ const short *hfilter_aligned16,
+ const short *vfilter_aligned16,
+ unsigned char *dst_ptr,
+ unsigned int dst_stride) {
+ if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+ vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+ fdata2, 16, 11, hfilter_aligned16);
+ vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
+ vfilter_aligned16);
+ } else {
+ if (hfilter_aligned16[3] != 128) {
+ vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
+ hfilter_aligned16);
+ } else {
+ vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+ dst_ptr, dst_stride, 4, vfilter_aligned16);
+ }
+ }
+}
+#endif
--- a/vp9/common/x86/vp9_vp8_asm_stubs.c
+++ /dev/null
@@ -1,602 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/mem.h"
-#include "vp9/common/vp9_subpixel.h"
-
-extern const short vp9_six_tap_mmx[16][6 * 8];
-
-extern const short vp9_bilinear_filters_8x_mmx[16][2 * 8];
-
-extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
- unsigned char *output_ptr,
- int output_pitch,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d8_h6_sse2(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_sse2(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
- unsigned char *output_ptr,
- int dst_ptich,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
- unsigned char *output_ptr,
- int dst_ptich,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int output_height,
- unsigned int output_width);
-
-extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int output_height,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
- unsigned int src_pixels_per_lin,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int output_height,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int output_height,
- const short *vp9_filter);
-
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
-
-#if HAVE_MMX
-void vp9_sixtap_predict4x4_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict4x4_mmx\n");
-#endif
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
- const short *hfilter, *vfilter;
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 9, 8, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
- 8, 4, 4, 4, vfilter);
-}
-
-void vp9_sixtap_predict16x16_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict16x16_mmx\n");
-#endif
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
- const short *hfilter, *vfilter;
-
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
- fdata2, src_pixels_per_line, 1, 21, 32,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
- fdata2 + 4, src_pixels_per_line, 1, 21, 32,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
- fdata2 + 8, src_pixels_per_line, 1, 21, 32,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
- fdata2 + 12, src_pixels_per_line, 1, 21, 32,
- hfilter);
-
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr, dst_pitch,
- 32, 16, 16, 16, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4, dst_pitch,
- 32, 16, 16, 16, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8, dst_pitch,
- 32, 16, 16, 16, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
- 32, 16, 16, 16, vfilter);
-}
-
-void vp9_sixtap_predict8x8_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x8_mmx\n");
-#endif
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
- fdata2, src_pixels_per_line, 1, 13, 16,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
- fdata2 + 4, src_pixels_per_line, 1, 13, 16,
- hfilter);
-
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 8, 8, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
- 16, 8, 8, 8, vfilter);
-}
-
-void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x4_mmx\n");
-#endif
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
- fdata2, src_pixels_per_line, 1, 9, 16, hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
- fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
-
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 4, 8, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
- 16, 8, 4, 8, vfilter);
-}
-
-void vp9_bilinear_predict16x16_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- vp9_bilinear_predict8x8_mmx(src_ptr,
- src_pixels_per_line, xoffset, yoffset,
- dst_ptr, dst_pitch);
- vp9_bilinear_predict8x8_mmx(src_ptr + 8,
- src_pixels_per_line, xoffset, yoffset,
- dst_ptr + 8, dst_pitch);
- vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line,
- src_pixels_per_line, xoffset, yoffset,
- dst_ptr + dst_pitch * 8, dst_pitch);
- vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8,
- src_pixels_per_line, xoffset, yoffset,
- dst_ptr + dst_pitch * 8 + 8, dst_pitch);
-}
-#endif
-
-#if HAVE_SSE2
-void vp9_sixtap_predict16x16_sse2(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
- const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict16x16_sse2\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 21, 32, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
- 32, 16, 16, dst_pitch, vfilter);
- } else {
- /* First-pass only */
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 16, hfilter);
- }
- } else {
- /* Second-pass only */
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 21, 32);
- vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
- 32, 16, 16, dst_pitch, vfilter);
- }
-}
-
-void vp9_sixtap_predict8x8_sse2(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x8_sse2\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 13, 16, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 8, dst_pitch, vfilter);
- } else {
- /* First-pass only */
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 8, hfilter);
- }
- } else {
- /* Second-pass only */
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 8, vfilter);
- }
-}
-
-void vp9_sixtap_predict8x4_sse2(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x4_sse2\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 9, 16, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 4, dst_pitch, vfilter);
- } else {
- /* First-pass only */
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, hfilter);
- }
- } else {
- /* Second-pass only */
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, vfilter);
- }
-}
-#endif
-
-#if HAVE_SSSE3
-extern void vp9_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-void vp9_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict16x16_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- fdata2, 16, 21, xoffset);
- vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,
- 16, yoffset);
- } else {
- /* First-pass only */
- vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 16, xoffset);
- }
- } else {
- /* Second-pass only */
- vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 16, yoffset);
- }
-}
-
-void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x8_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, fdata2, 8, 13, xoffset);
- vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);
- } else {
- vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 8, xoffset);
- }
- } else {
- /* Second-pass only */
- vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 8, yoffset);
- }
-}
-
-void vp9_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x4_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, fdata2, 8, 9, xoffset);
- vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);
- } else {
- /* First-pass only */
- vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, xoffset);
- }
- } else {
- /* Second-pass only */
- vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, yoffset);
- }
-}
-
-void vp9_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict4x4_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, fdata2, 4, 9, xoffset);
- vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);
- } else {
- vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, xoffset);
- }
- } else {
- vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, yoffset);
- }
-}
-
-void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *hfilter_aligned16,
- const short *vfilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
- vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
- fdata2, 16, 23, hfilter_aligned16);
- vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,
- vfilter_aligned16);
- } else {
- if (hfilter_aligned16[3] != 128) {
- vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,
- 16, hfilter_aligned16);
- } else {
- vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
- dst_ptr, dst_stride, 16, vfilter_aligned16);
- }
- }
-}
-
-void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *hfilter_aligned16,
- const short *vfilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
- vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
- fdata2, 16, 15, hfilter_aligned16);
- vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
- vfilter_aligned16);
- } else {
- if (hfilter_aligned16[3] != 128) {
- vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
- hfilter_aligned16);
- } else {
- vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
- dst_ptr, dst_stride, 8, vfilter_aligned16);
- }
- }
-}
-
-void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *hfilter_aligned16,
- const short *vfilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
- vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
- fdata2, 16, 11, hfilter_aligned16);
- vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
- vfilter_aligned16);
- } else {
- if (hfilter_aligned16[3] != 128) {
- vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
- hfilter_aligned16);
- } else {
- vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
- dst_ptr, dst_stride, 4, vfilter_aligned16);
- }
- }
-}
-#endif
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/vp9_packtokens_armv5.asm
@@ -1,0 +1,291 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_tokens_armv5|
+
+ INCLUDE vp9_asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+; r0 vp9_writer *w
+; r1 const TOKENEXTRA *p
+; r2 int xcount
+; r3 vp8_coef_encodings
+; s0 vp8_extra_bits
+; s1 vp8_coef_tree
+|vp8cx_pack_tokens_armv5| PROC
+ push {r4-r11, lr}
+
+ ; Add size of xcount * sizeof (TOKENEXTRA) to get stop
+ ; sizeof (TOKENEXTRA) is 8
+ sub sp, sp, #12
+ add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA)
+ str r2, [sp, #0]
+ str r3, [sp, #8] ; save vp8_coef_encodings
+ ldr r2, [r0, #vp9_writer_lowvalue]
+ ldr r5, [r0, #vp9_writer_range]
+ ldr r3, [r0, #vp9_writer_count]
+ b check_p_lt_stop
+
+while_p_lt_stop
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #8] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldrb r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp9_token_value] ; v
+ ldr r8, [r4, #vp9_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #52] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsl r12, r6, r4 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+ ; This value is referred to as "bb"
+ lsls r12, r12, #1 ; bb = v >> n
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+ ; r10 holds vp8_coef_tree for the token loop, but it is also
+ ; used as a temp variable above. So after that use, reload
+ ; vp8_coef_tree from its stack slot into r10
+ ldr r10, [sp, #52] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #48] ; vp8_extra_bits
+ ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
+ ; element. Here vp9_extra_bit_struct == 16
+ add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp9_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
+ ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp9_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp9_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rsb r4, r8, #32
+ lsl r12, r7, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsls r12, r12, #1 ; v >> n
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset= shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #tokenextra_extra] ; e = p->Extra (only bit 0 is tested below)
+ add r4, r5, #1 ; range + 1
+ tst lr, #1 ; bb = e & 1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; if (bb) lowvalue += split
+ subne r4, r5, r4 ; if (bb) range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; bit 31 set: propagate the carry into bytes already written
+ mov r7, #0
+ sub r4, r4, #1 ; x = w->pos - 1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4] ; w->buffer[x] = (unsigned char)0
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0 ; while (x >= 0 && w->buffer[x] == 0xff)
+ ldrge r6, [r0, #vp9_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp9_writer_buffer]
+ ldrb r7, [r6, r4] ; w->buffer[x]
+ add r7, r7, #1
+ strb r7, [r6, r4] ; w->buffer[x] += 1
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero ; flush a byte only when count reaches 0
+
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ mvn r3, #7 ; count = -8
+ ldr r7, [r0, #vp9_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #vp9_writer_pos] ; store the incremented w->pos
+ strb r6, [r7, r4] ; w->buffer[old pos] = lowvalue >> 24
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+ str r2, [r0, #vp9_writer_lowvalue]
+ str r5, [r0, #vp9_writer_range]
+ str r3, [r0, #vp9_writer_count]
+ add sp, sp, #12
+ pop {r4-r11, pc}
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/vp9_packtokens_mbrow_armv5.asm
@@ -1,0 +1,327 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_mb_row_tokens_armv5|
+
+ INCLUDE vp9_asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 vp9_writer *w
+; r2 vp8_coef_encodings
+; r3 vp8_extra_bits
+; s0 vp8_coef_tree
+
+|vp8cx_pack_mb_row_tokens_armv5| PROC
+ push {r4-r11, lr}
+ sub sp, sp, #24
+
+ ; Compute address of cpi->common.mb_rows
+ ldr r4, _VP8_COMP_common_
+ ldr r6, _VP8_COMMON_MBrows_
+ add r4, r0, r4
+
+ ldr r5, [r4, r6] ; load up mb_rows
+
+ str r2, [sp, #20] ; save vp8_coef_encodings
+ str r5, [sp, #12] ; save mb_rows
+ str r3, [sp, #8] ; save vp8_extra_bits
+
+ ldr r4, _VP8_COMP_tplist_
+ add r4, r0, r4
+ ldr r7, [r4, #0] ; dereference cpi->tp_list
+
+ mov r0, r1 ; keep same as other loops
+
+ ldr r2, [r0, #vp9_writer_lowvalue]
+ ldr r5, [r0, #vp9_writer_range]
+ ldr r3, [r0, #vp9_writer_count]
+
+mb_row_loop
+
+ ldr r1, [r7, #tokenlist_start]
+ ldr r9, [r7, #tokenlist_stop]
+ str r9, [sp, #0] ; save stop for later comparison
+ str r7, [sp, #16] ; tokenlist address for next time
+
+ b check_p_lt_stop
+
+ ; actual work gets done here!
+
+while_p_lt_stop
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #20] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldrb r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp9_token_value] ; v
+ ldr r8, [r4, #vp9_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #60] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsl r12, r6, r4 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+ ; This value is referred to as "bb"
+ lsls r12, r12, #1 ; bb = v >> n
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+ ; r10 holds vp8_coef_tree for the token loop, but it is also
+ ; used as a temp variable above. So after that use, reload
+ ; vp8_coef_tree from its stack slot into r10
+ ldr r10, [sp, #60] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #8] ; vp8_extra_bits
+ ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
+ ; element. Here vp9_extra_bit_struct == 16
+ add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp9_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
+ ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp9_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp9_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rsb r4, r8, #32
+ lsl r12, r7, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsls r12, r12, #1 ; v >> n
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset= shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #tokenextra_extra] ; e = p->Extra (only bit 0 is tested below)
+ add r4, r5, #1 ; range + 1
+ tst lr, #1 ; bb = e & 1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; if (bb) lowvalue += split
+ subne r4, r5, r4 ; if (bb) range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; bit 31 set: propagate the carry into bytes already written
+ mov r7, #0
+ sub r4, r4, #1 ; x = w->pos - 1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4] ; w->buffer[x] = (unsigned char)0
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0 ; while (x >= 0 && w->buffer[x] == 0xff)
+ ldrge r6, [r0, #vp9_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp9_writer_buffer]
+ ldrb r7, [r6, r4] ; w->buffer[x]
+ add r7, r7, #1
+ strb r7, [r6, r4] ; w->buffer[x] += 1
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero ; flush a byte only when count reaches 0
+
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ mvn r3, #7 ; count = -8
+ ldr r7, [r0, #vp9_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #vp9_writer_pos] ; store the incremented w->pos
+ strb r6, [r7, r4] ; w->buffer[old pos] = lowvalue >> 24
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+ ldr r6, [sp, #12] ; mb_rows
+ ldr r7, [sp, #16] ; tokenlist address
+ subs r6, r6, #1
+ add r7, r7, #TOKENLIST_SZ ; next element in the array
+ str r6, [sp, #12]
+ bne mb_row_loop
+
+ str r2, [r0, #vp9_writer_lowvalue]
+ str r5, [r0, #vp9_writer_range]
+ str r3, [r0, #vp9_writer_count]
+ add sp, sp, #24
+ pop {r4-r11, pc}
+ ENDP
+
+_VP8_COMP_common_
+ DCD vp8_comp_common
+_VP8_COMMON_MBrows_
+ DCD vp8_common_mb_rows
+_VP8_COMP_tplist_
+ DCD vp8_comp_tplist
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/vp9_packtokens_partitions_armv5.asm
@@ -1,0 +1,465 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
+
+ INCLUDE vp9_asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 unsigned char *cx_data
+; r2 int num_part
+; r3 *size
+; s0 vp8_coef_encodings
+; s1 vp8_extra_bits,
+; s2 const vp9_tree_index *,
+
+|vp8cx_pack_tokens_into_partitions_armv5| PROC
+ push {r4-r11, lr}
+ sub sp, sp, #44
+
+ ; Compute address of cpi->common.mb_rows
+ ldr r4, _VP8_COMP_common_
+ ldr r6, _VP8_COMMON_MBrows_
+ add r4, r0, r4
+
+ ldr r5, [r4, r6] ; load up mb_rows
+
+ str r5, [sp, #36] ; save mb_rows
+ str r1, [sp, #24] ; save cx_data
+ str r2, [sp, #20] ; save num_part
+ str r3, [sp, #8] ; save *size
+
+ ; *size = 3*(num_part -1 );
+ sub r2, r2, #1 ; num_part - 1
+ add r2, r2, r2, lsl #1 ; 3*(num_part - 1)
+ str r2, [r3]
+
+ add r2, r2, r1 ; cx_data + *size
+ str r2, [sp, #40] ; ptr
+
+ ldr r4, _VP8_COMP_tplist_
+ add r4, r0, r4
+ ldr r7, [r4, #0] ; dereference cpi->tp_list
+ str r7, [sp, #32] ; store start of cpi->tp_list
+
+ ldr r11, _VP8_COMP_bc2_ ; load up vp9_writer out of cpi
+ add r0, r0, r11
+
+ mov r11, #0
+ str r11, [sp, #28] ; i
+
+numparts_loop
+ ldr r10, [sp, #40] ; ptr
+ ldr r5, [sp, #36] ; move mb_rows to the counting section
+ sub r5, r5, r11 ; move start point with each partition
+ ; mb_rows starts at i
+ str r5, [sp, #12]
+
+ ; Reset all of the VP8 Writer data for each partition that
+ ; is processed.
+ ; start_encode
+ mov r2, #0 ; vp9_writer_lowvalue
+ mov r5, #255 ; vp9_writer_range
+ mvn r3, #23 ; vp9_writer_count
+
+ str r2, [r0, #vp9_writer_value]
+ str r2, [r0, #vp9_writer_pos]
+ str r10, [r0, #vp9_writer_buffer]
+
+mb_row_loop
+
+ ldr r1, [r7, #tokenlist_start]
+ ldr r9, [r7, #tokenlist_stop]
+ str r9, [sp, #0] ; save stop for later comparison
+ str r7, [sp, #16] ; tokenlist address for next time
+
+ b check_p_lt_stop
+
+ ; actual work gets done here!
+
+while_p_lt_stop
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #80] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldrb r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp9_token_value] ; v
+ ldr r8, [r4, #vp9_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #88] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsl r12, r6, r4 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+ ; This value is referred to as "bb"
+ lsls r12, r12, #1 ; bb = v >> n
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+ ; r10 holds vp8_coef_tree for the token loop, but it is also
+ ; used as a temp variable above. So after that use, reload
+ ; vp8_coef_tree from its stack slot into r10
+ ldr r10, [sp, #88] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #84] ; vp8_extra_bits
+ ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
+ ; element. Here vp9_extra_bit_struct == 16
+ add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp9_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
+ ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp9_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp9_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rsb r4, r8, #32
+ lsl r12, r7, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsls r12, r12, #1 ; v >> n
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset= shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #tokenextra_extra] ; e = p->Extra (only bit 0 is tested below)
+ add r4, r5, #1 ; range + 1
+ tst lr, #1 ; bb = e & 1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; if (bb) lowvalue += split
+ subne r4, r5, r4 ; if (bb) range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp9_writer_pos] ; bit 31 set: propagate the carry into bytes already written
+ mov r7, #0
+ sub r4, r4, #1 ; x = w->pos - 1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4] ; w->buffer[x] = (unsigned char)0
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0 ; while (x >= 0 && w->buffer[x] == 0xff)
+ ldrge r6, [r0, #vp9_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp9_writer_buffer]
+ ldrb r7, [r6, r4] ; w->buffer[x]
+ add r7, r7, #1
+ strb r7, [r6, r4] ; w->buffer[x] += 1
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero ; flush a byte only when count reaches 0
+
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ mvn r3, #7 ; count = -8
+ ldr r7, [r0, #vp9_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #vp9_writer_pos] ; store the incremented w->pos
+ strb r6, [r7, r4] ; w->buffer[old pos] = lowvalue >> 24
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+ ldr r10, [sp, #20] ; num_parts
+ mov r1, #TOKENLIST_SZ
+ mul r1, r10, r1
+
+ ldr r6, [sp, #12] ; mb_rows
+ ldr r7, [sp, #16] ; tokenlist address
+ subs r6, r6, r10
+ add r7, r7, r1 ; next element in the array
+ str r6, [sp, #12]
+ bgt mb_row_loop
+
+ mov r12, #32
+
+stop_encode_loop
+ sub r7, r5, #1 ; range-1
+
+ mov r4, r7, lsl #7 ; ((range-1) * 128)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero_se ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set_se
+
+ ldr r4, [r0, #vp9_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start_se
+token_zero_while_loop_se
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start_se
+ cmp r4, #0
+ ldrge r7, [r0, #vp9_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop_se
+
+ ldr r7, [r0, #vp9_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set_se
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp9_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp9_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r12, r12, #1
+ bne stop_encode_loop
+
+ ldr r10, [sp, #8] ; *size
+ ldr r11, [r10]
+ ldr r4, [r0, #vp9_writer_pos] ; w->pos
+ add r11, r11, r4 ; *size += w->pos
+ str r11, [r10]
+
+ ldr r9, [sp, #20] ; num_parts
+ sub r9, r9, #1
+ ldr r10, [sp, #28] ; i
+ cmp r10, r9 ; if(i<(num_part - 1))
+ bge skip_write_partition
+
+ ldr r12, [sp, #40] ; ptr
+ add r12, r12, r4 ; ptr += w->pos
+ str r12, [sp, #40]
+
+ ldr r9, [sp, #24] ; cx_data
+ mov r8, r4, asr #8
+ strb r4, [r9, #0]
+ strb r8, [r9, #1]
+ mov r4, r4, asr #16
+ strb r4, [r9, #2]
+
+ add r9, r9, #3 ; cx_data += 3
+ str r9, [sp, #24]
+
+skip_write_partition
+
+ ldr r11, [sp, #28] ; i
+ ldr r10, [sp, #20] ; num_parts
+
+ add r11, r11, #1 ; i++
+ str r11, [sp, #28]
+
+ ldr r7, [sp, #32] ; cpi->tp_list[i]
+ mov r1, #TOKENLIST_SZ
+ add r7, r7, r1 ; next element in cpi->tp_list
+ str r7, [sp, #32] ; cpi->tp_list[i+1]
+
+ cmp r10, r11
+ bgt numparts_loop
+
+
+ add sp, sp, #44
+ pop {r4-r11, pc}
+ ENDP
+
+_VP8_COMP_common_
+ DCD vp8_comp_common
+_VP8_COMMON_MBrows_
+ DCD vp8_common_mb_rows
+_VP8_COMP_tplist_
+ DCD vp8_comp_tplist
+_VP8_COMP_bc2_
+ DCD vp8_comp_bc2
+
+ END
--- a/vp9/encoder/arm/armv5te/vp9_vp8_packtokens_armv5.asm
+++ /dev/null
@@ -1,291 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8cx_pack_tokens_armv5|
-
- INCLUDE vp9_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
-; r0 vp9_writer *w
-; r1 const TOKENEXTRA *p
-; r2 int xcount
-; r3 vp8_coef_encodings
-; s0 vp8_extra_bits
-; s1 vp8_coef_tree
-|vp8cx_pack_tokens_armv5| PROC
- push {r4-r11, lr}
-
- ; Add size of xcount * sizeof (TOKENEXTRA) to get stop
- ; sizeof (TOKENEXTRA) is 8
- sub sp, sp, #12
- add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA)
- str r2, [sp, #0]
- str r3, [sp, #8] ; save vp8_coef_encodings
- ldr r2, [r0, #vp9_writer_lowvalue]
- ldr r5, [r0, #vp9_writer_range]
- ldr r3, [r0, #vp9_writer_count]
- b check_p_lt_stop
-
-while_p_lt_stop
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r4, [sp, #8] ; vp8_coef_encodings
- mov lr, #0
- add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
- ldr r9, [r1, #tokenextra_context_tree] ; pp
-
- ldrb r7, [r1, #tokenextra_skip_eob_node]
-
- ldr r6, [r4, #vp9_token_value] ; v
- ldr r8, [r4, #vp9_token_len] ; n
-
- ; vp8 specific skip_eob_node
- cmp r7, #0
- movne lr, #2 ; i = 2
- subne r8, r8, #1 ; --n
-
- rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #52] ; vp8_coef_tree
-
- ; v is kept in r12 during the token pack loop
- lsl r12, r6, r4 ; r12 = v << 32 - n
-
-; loop start
-token_loop
- ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
- ; This value is refered to as "bb"
- lsls r12, r12, #1 ; bb = v >> n
- mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
-
- ; bb can only be 0 or 1. So only execute this statement
- ; if bb == 1, otherwise it will act like i + 0
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
- ; r10 is used earlier in the loop, but r10 is used as
- ; temp variable here. So after r10 is used, reload
- ; vp8_coef_tree_dcd into r10
- ldr r10, [sp, #52] ; vp8_coef_tree
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r8, r8, #1 ; --n
- bne token_loop
-
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r7, [sp, #48] ; vp8_extra_bits
- ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
- ; element. Here vp9_extra_bit_struct == 16
- add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
-
- ldr r4, [r12, #vp9_extra_bit_struct_base_val]
- cmp r4, #0
- beq skip_extra_bits
-
-; if( b->base_val)
- ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
- ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
- cmp r8, #0 ; if( L)
- beq no_extra_bits
-
- ldr r9, [r12, #vp9_extra_bit_struct_prob]
- asr r7, lr, #1 ; v=e>>1
-
- ldr r10, [r12, #vp9_extra_bit_struct_tree]
- str r10, [sp, #4] ; b->tree
-
- rsb r4, r8, #32
- lsl r12, r7, r4
-
- mov lr, #0 ; i = 0
-
-extra_bits_loop
- ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
- sub r7, r5, #1 ; range-1
- lsls r12, r12, #1 ; v >> n
- mul r6, r4, r7 ; (range-1) * pp[i>>1]
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- clz r6, r4
- sub r6, r6, #24
-
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi extra_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset= shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl extra_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos - 1
- b extra_zero_while_start
-extra_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-extra_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq extra_zero_while_loop
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4]
- add r10, r10, #1
- strb r10, [r7, r4]
-extra_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos]
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
- ldr r10, [sp, #4] ; b->tree
-extra_count_lt_zero
- lsl r2, r2, r6
-
- subs r8, r8, #1 ; --n
- bne extra_bits_loop ; while (n)
-
-no_extra_bits
- ldr lr, [r1, #4] ; e = p->Extra
- add r4, r5, #1 ; range + 1
- tst lr, #1
- lsr r4, r4, #1 ; split = (range + 1) >> 1
- addne r2, r2, r4 ; lowvalue += split
- subne r4, r5, r4 ; range = range-split
- tst r2, #0x80000000 ; lowvalue & 0x80000000
- lsl r5, r4, #1 ; range <<= 1
- beq end_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos]
- mov r7, #0
- sub r4, r4, #1
- b end_zero_while_start
-end_zero_while_loop
- strb r7, [r6, r4]
- sub r4, r4, #1 ; x--
-end_zero_while_start
- cmp r4, #0
- ldrge r6, [r0, #vp9_writer_buffer]
- ldrb r12, [r6, r4]
- cmpge r12, #0xff
- beq end_zero_while_loop
-
- ldr r6, [r0, #vp9_writer_buffer]
- ldrb r7, [r6, r4]
- add r7, r7, #1
- strb r7, [r6, r4]
-end_high_bit_not_set
- adds r3, r3, #1 ; ++count
- lsl r2, r2, #1 ; lowvalue <<= 1
- bne end_count_zero
-
- ldr r4, [r0, #vp9_writer_pos]
- mvn r3, #7
- ldr r7, [r0, #vp9_writer_buffer]
- lsr r6, r2, #24 ; lowvalue >> 24
- add r12, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #0x10]
- strb r6, [r7, r4]
-end_count_zero
-skip_extra_bits
- add r1, r1, #TOKENEXTRA_SZ ; ++p
-check_p_lt_stop
- ldr r4, [sp, #0] ; stop
- cmp r1, r4 ; while( p < stop)
- bcc while_p_lt_stop
-
- str r2, [r0, #vp9_writer_lowvalue]
- str r5, [r0, #vp9_writer_range]
- str r3, [r0, #vp9_writer_count]
- add sp, sp, #12
- pop {r4-r11, pc}
- ENDP
-
- END
--- a/vp9/encoder/arm/armv5te/vp9_vp8_packtokens_mbrow_armv5.asm
+++ /dev/null
@@ -1,327 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8cx_pack_mb_row_tokens_armv5|
-
- INCLUDE vp9_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
-; r0 VP8_COMP *cpi
-; r1 vp9_writer *w
-; r2 vp8_coef_encodings
-; r3 vp8_extra_bits
-; s0 vp8_coef_tree
-
-|vp8cx_pack_mb_row_tokens_armv5| PROC
- push {r4-r11, lr}
- sub sp, sp, #24
-
- ; Compute address of cpi->common.mb_rows
- ldr r4, _VP8_COMP_common_
- ldr r6, _VP8_COMMON_MBrows_
- add r4, r0, r4
-
- ldr r5, [r4, r6] ; load up mb_rows
-
- str r2, [sp, #20] ; save vp8_coef_encodings
- str r5, [sp, #12] ; save mb_rows
- str r3, [sp, #8] ; save vp8_extra_bits
-
- ldr r4, _VP8_COMP_tplist_
- add r4, r0, r4
- ldr r7, [r4, #0] ; dereference cpi->tp_list
-
- mov r0, r1 ; keep same as other loops
-
- ldr r2, [r0, #vp9_writer_lowvalue]
- ldr r5, [r0, #vp9_writer_range]
- ldr r3, [r0, #vp9_writer_count]
-
-mb_row_loop
-
- ldr r1, [r7, #tokenlist_start]
- ldr r9, [r7, #tokenlist_stop]
- str r9, [sp, #0] ; save stop for later comparison
- str r7, [sp, #16] ; tokenlist address for next time
-
- b check_p_lt_stop
-
- ; actuall work gets done here!
-
-while_p_lt_stop
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r4, [sp, #20] ; vp8_coef_encodings
- mov lr, #0
- add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
- ldr r9, [r1, #tokenextra_context_tree] ; pp
-
- ldrb r7, [r1, #tokenextra_skip_eob_node]
-
- ldr r6, [r4, #vp9_token_value] ; v
- ldr r8, [r4, #vp9_token_len] ; n
-
- ; vp8 specific skip_eob_node
- cmp r7, #0
- movne lr, #2 ; i = 2
- subne r8, r8, #1 ; --n
-
- rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #60] ; vp8_coef_tree
-
- ; v is kept in r12 during the token pack loop
- lsl r12, r6, r4 ; r12 = v << 32 - n
-
-; loop start
-token_loop
- ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
- ; This value is refered to as "bb"
- lsls r12, r12, #1 ; bb = v >> n
- mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
-
- ; bb can only be 0 or 1. So only execute this statement
- ; if bb == 1, otherwise it will act like i + 0
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
- ; r10 is used earlier in the loop, but r10 is used as
- ; temp variable here. So after r10 is used, reload
- ; vp8_coef_tree_dcd into r10
- ldr r10, [sp, #60] ; vp8_coef_tree
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r8, r8, #1 ; --n
- bne token_loop
-
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r7, [sp, #8] ; vp8_extra_bits
- ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
- ; element. Here vp9_extra_bit_struct == 16
- add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
-
- ldr r4, [r12, #vp9_extra_bit_struct_base_val]
- cmp r4, #0
- beq skip_extra_bits
-
-; if( b->base_val)
- ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
- ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
- cmp r8, #0 ; if( L)
- beq no_extra_bits
-
- ldr r9, [r12, #vp9_extra_bit_struct_prob]
- asr r7, lr, #1 ; v=e>>1
-
- ldr r10, [r12, #vp9_extra_bit_struct_tree]
- str r10, [sp, #4] ; b->tree
-
- rsb r4, r8, #32
- lsl r12, r7, r4
-
- mov lr, #0 ; i = 0
-
-extra_bits_loop
- ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
- sub r7, r5, #1 ; range-1
- lsls r12, r12, #1 ; v >> n
- mul r6, r4, r7 ; (range-1) * pp[i>>1]
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- clz r6, r4
- sub r6, r6, #24
-
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi extra_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset= shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl extra_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos - 1
- b extra_zero_while_start
-extra_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-extra_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq extra_zero_while_loop
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4]
- add r10, r10, #1
- strb r10, [r7, r4]
-extra_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos]
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
- ldr r10, [sp, #4] ; b->tree
-extra_count_lt_zero
- lsl r2, r2, r6
-
- subs r8, r8, #1 ; --n
- bne extra_bits_loop ; while (n)
-
-no_extra_bits
- ldr lr, [r1, #4] ; e = p->Extra
- add r4, r5, #1 ; range + 1
- tst lr, #1
- lsr r4, r4, #1 ; split = (range + 1) >> 1
- addne r2, r2, r4 ; lowvalue += split
- subne r4, r5, r4 ; range = range-split
- tst r2, #0x80000000 ; lowvalue & 0x80000000
- lsl r5, r4, #1 ; range <<= 1
- beq end_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos]
- mov r7, #0
- sub r4, r4, #1
- b end_zero_while_start
-end_zero_while_loop
- strb r7, [r6, r4]
- sub r4, r4, #1 ; x--
-end_zero_while_start
- cmp r4, #0
- ldrge r6, [r0, #vp9_writer_buffer]
- ldrb r12, [r6, r4]
- cmpge r12, #0xff
- beq end_zero_while_loop
-
- ldr r6, [r0, #vp9_writer_buffer]
- ldrb r7, [r6, r4]
- add r7, r7, #1
- strb r7, [r6, r4]
-end_high_bit_not_set
- adds r3, r3, #1 ; ++count
- lsl r2, r2, #1 ; lowvalue <<= 1
- bne end_count_zero
-
- ldr r4, [r0, #vp9_writer_pos]
- mvn r3, #7
- ldr r7, [r0, #vp9_writer_buffer]
- lsr r6, r2, #24 ; lowvalue >> 24
- add r12, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #0x10]
- strb r6, [r7, r4]
-end_count_zero
-skip_extra_bits
- add r1, r1, #TOKENEXTRA_SZ ; ++p
-check_p_lt_stop
- ldr r4, [sp, #0] ; stop
- cmp r1, r4 ; while( p < stop)
- bcc while_p_lt_stop
-
- ldr r6, [sp, #12] ; mb_rows
- ldr r7, [sp, #16] ; tokenlist address
- subs r6, r6, #1
- add r7, r7, #TOKENLIST_SZ ; next element in the array
- str r6, [sp, #12]
- bne mb_row_loop
-
- str r2, [r0, #vp9_writer_lowvalue]
- str r5, [r0, #vp9_writer_range]
- str r3, [r0, #vp9_writer_count]
- add sp, sp, #24
- pop {r4-r11, pc}
- ENDP
-
-_VP8_COMP_common_
- DCD vp8_comp_common
-_VP8_COMMON_MBrows_
- DCD vp8_common_mb_rows
-_VP8_COMP_tplist_
- DCD vp8_comp_tplist
-
- END
--- a/vp9/encoder/arm/armv5te/vp9_vp8_packtokens_partitions_armv5.asm
+++ /dev/null
@@ -1,465 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
-
- INCLUDE vp9_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
-; r0 VP8_COMP *cpi
-; r1 unsigned char *cx_data
-; r2 int num_part
-; r3 *size
-; s0 vp8_coef_encodings
-; s1 vp8_extra_bits,
-; s2 const vp9_tree_index *,
-
-|vp8cx_pack_tokens_into_partitions_armv5| PROC
- push {r4-r11, lr}
- sub sp, sp, #44
-
- ; Compute address of cpi->common.mb_rows
- ldr r4, _VP8_COMP_common_
- ldr r6, _VP8_COMMON_MBrows_
- add r4, r0, r4
-
- ldr r5, [r4, r6] ; load up mb_rows
-
- str r5, [sp, #36] ; save mb_rows
- str r1, [sp, #24] ; save cx_data
- str r2, [sp, #20] ; save num_part
- str r3, [sp, #8] ; save *size
-
- ; *size = 3*(num_part -1 );
- sub r2, r2, #1 ; num_part - 1
- add r2, r2, r2, lsl #1 ; 3*(num_part - 1)
- str r2, [r3]
-
- add r2, r2, r1 ; cx_data + *size
- str r2, [sp, #40] ; ptr
-
- ldr r4, _VP8_COMP_tplist_
- add r4, r0, r4
- ldr r7, [r4, #0] ; dereference cpi->tp_list
- str r7, [sp, #32] ; store start of cpi->tp_list
-
- ldr r11, _VP8_COMP_bc2_ ; load up vp9_writer out of cpi
- add r0, r0, r11
-
- mov r11, #0
- str r11, [sp, #28] ; i
-
-numparts_loop
- ldr r10, [sp, #40] ; ptr
- ldr r5, [sp, #36] ; move mb_rows to the counting section
- sub r5, r5, r11 ; move start point with each partition
- ; mb_rows starts at i
- str r5, [sp, #12]
-
- ; Reset all of the VP8 Writer data for each partition that
- ; is processed.
- ; start_encode
- mov r2, #0 ; vp9_writer_lowvalue
- mov r5, #255 ; vp9_writer_range
- mvn r3, #23 ; vp9_writer_count
-
- str r2, [r0, #vp9_writer_value]
- str r2, [r0, #vp9_writer_pos]
- str r10, [r0, #vp9_writer_buffer]
-
-mb_row_loop
-
- ldr r1, [r7, #tokenlist_start]
- ldr r9, [r7, #tokenlist_stop]
- str r9, [sp, #0] ; save stop for later comparison
- str r7, [sp, #16] ; tokenlist address for next time
-
- b check_p_lt_stop
-
- ; actual work gets done here!
-
-while_p_lt_stop
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r4, [sp, #80] ; vp8_coef_encodings
- mov lr, #0
- add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
- ldr r9, [r1, #tokenextra_context_tree] ; pp
-
- ldrb r7, [r1, #tokenextra_skip_eob_node]
-
- ldr r6, [r4, #vp9_token_value] ; v
- ldr r8, [r4, #vp9_token_len] ; n
-
- ; vp8 specific skip_eob_node
- cmp r7, #0
- movne lr, #2 ; i = 2
- subne r8, r8, #1 ; --n
-
- rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #88] ; vp8_coef_tree
-
- ; v is kept in r12 during the token pack loop
- lsl r12, r6, r4 ; r12 = v << 32 - n
-
-; loop start
-token_loop
- ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
- ; This value is refered to as "bb"
- lsls r12, r12, #1 ; bb = v >> n
- mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
-
- ; bb can only be 0 or 1. So only execute this statement
- ; if bb == 1, otherwise it will act like i + 0
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
- ; r10 is used earlier in the loop, but r10 is used as
- ; temp variable here. So after r10 is used, reload
- ; vp8_coef_tree_dcd into r10
- ldr r10, [sp, #88] ; vp8_coef_tree
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r8, r8, #1 ; --n
- bne token_loop
-
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r7, [sp, #84] ; vp8_extra_bits
- ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
- ; element. Here vp9_extra_bit_struct == 16
- add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
-
- ldr r4, [r12, #vp9_extra_bit_struct_base_val]
- cmp r4, #0
- beq skip_extra_bits
-
-; if( b->base_val)
- ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
- ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
- cmp r8, #0 ; if( L)
- beq no_extra_bits
-
- ldr r9, [r12, #vp9_extra_bit_struct_prob]
- asr r7, lr, #1 ; v=e>>1
-
- ldr r10, [r12, #vp9_extra_bit_struct_tree]
- str r10, [sp, #4] ; b->tree
-
- rsb r4, r8, #32
- lsl r12, r7, r4
-
- mov lr, #0 ; i = 0
-
-extra_bits_loop
- ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
- sub r7, r5, #1 ; range-1
- lsls r12, r12, #1 ; v >> n
- mul r6, r4, r7 ; (range-1) * pp[i>>1]
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- clz r6, r4
- sub r6, r6, #24
-
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi extra_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset= shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl extra_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos - 1
- b extra_zero_while_start
-extra_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-extra_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq extra_zero_while_loop
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4]
- add r10, r10, #1
- strb r10, [r7, r4]
-extra_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos]
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
- ldr r10, [sp, #4] ; b->tree
-extra_count_lt_zero
- lsl r2, r2, r6
-
- subs r8, r8, #1 ; --n
- bne extra_bits_loop ; while (n)
-
-no_extra_bits
- ldr lr, [r1, #4] ; e = p->Extra
- add r4, r5, #1 ; range + 1
- tst lr, #1
- lsr r4, r4, #1 ; split = (range + 1) >> 1
- addne r2, r2, r4 ; lowvalue += split
- subne r4, r5, r4 ; range = range-split
- tst r2, #0x80000000 ; lowvalue & 0x80000000
- lsl r5, r4, #1 ; range <<= 1
- beq end_high_bit_not_set
-
- ldr r4, [r0, #vp9_writer_pos]
- mov r7, #0
- sub r4, r4, #1
- b end_zero_while_start
-end_zero_while_loop
- strb r7, [r6, r4]
- sub r4, r4, #1 ; x--
-end_zero_while_start
- cmp r4, #0
- ldrge r6, [r0, #vp9_writer_buffer]
- ldrb r12, [r6, r4]
- cmpge r12, #0xff
- beq end_zero_while_loop
-
- ldr r6, [r0, #vp9_writer_buffer]
- ldrb r7, [r6, r4]
- add r7, r7, #1
- strb r7, [r6, r4]
-end_high_bit_not_set
- adds r3, r3, #1 ; ++count
- lsl r2, r2, #1 ; lowvalue <<= 1
- bne end_count_zero
-
- ldr r4, [r0, #vp9_writer_pos]
- mvn r3, #7
- ldr r7, [r0, #vp9_writer_buffer]
- lsr r6, r2, #24 ; lowvalue >> 24
- add r12, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #0x10]
- strb r6, [r7, r4]
-end_count_zero
-skip_extra_bits
- add r1, r1, #TOKENEXTRA_SZ ; ++p
-check_p_lt_stop
- ldr r4, [sp, #0] ; stop
- cmp r1, r4 ; while( p < stop)
- bcc while_p_lt_stop
-
- ldr r10, [sp, #20] ; num_parts
- mov r1, #TOKENLIST_SZ
- mul r1, r10, r1
-
- ldr r6, [sp, #12] ; mb_rows
- ldr r7, [sp, #16] ; tokenlist address
- subs r6, r6, r10
- add r7, r7, r1 ; next element in the array
- str r6, [sp, #12]
- bgt mb_row_loop
-
- mov r12, #32
-
-stop_encode_loop
- sub r7, r5, #1 ; range-1
-
- mov r4, r7, lsl #7 ; ((range-1) * 128)
-
- mov r7, #1
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero_se ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set_se
-
- ldr r4, [r0, #vp9_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start_se
-token_zero_while_loop_se
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start_se
- cmp r4, #0
- ldrge r7, [r0, #vp9_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop_se
-
- ldr r7, [r0, #vp9_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set_se
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp9_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp9_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp9_writer_pos]
- sub r3, r3, #8 ; count -= 8
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r12, r12, #1
- bne stop_encode_loop
-
- ldr r10, [sp, #8] ; *size
- ldr r11, [r10]
- ldr r4, [r0, #vp9_writer_pos] ; w->pos
- add r11, r11, r4 ; *size += w->pos
- str r11, [r10]
-
- ldr r9, [sp, #20] ; num_parts
- sub r9, r9, #1
- ldr r10, [sp, #28] ; i
- cmp r10, r9 ; if(i<(num_part - 1))
- bge skip_write_partition
-
- ldr r12, [sp, #40] ; ptr
- add r12, r12, r4 ; ptr += w->pos
- str r12, [sp, #40]
-
- ldr r9, [sp, #24] ; cx_data
- mov r8, r4, asr #8
- strb r4, [r9, #0]
- strb r8, [r9, #1]
- mov r4, r4, asr #16
- strb r4, [r9, #2]
-
- add r9, r9, #3 ; cx_data += 3
- str r9, [sp, #24]
-
-skip_write_partition
-
- ldr r11, [sp, #28] ; i
- ldr r10, [sp, #20] ; num_parts
-
- add r11, r11, #1 ; i++
- str r11, [sp, #28]
-
- ldr r7, [sp, #32] ; cpi->tp_list[i]
- mov r1, #TOKENLIST_SZ
- add r7, r7, r1 ; next element in cpi->tp_list
- str r7, [sp, #32] ; cpi->tp_list[i+1]
-
- cmp r10, r11
- bgt numparts_loop
-
-
- add sp, sp, #44
- pop {r4-r11, pc}
- ENDP
-
-_VP8_COMP_common_
- DCD vp8_comp_common
-_VP8_COMMON_MBrows_
- DCD vp8_common_mb_rows
-_VP8_COMP_tplist_
- DCD vp8_comp_tplist
-_VP8_COMP_bc2_
- DCD vp8_comp_bc2
-
- END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp9_fast_quantize_b_armv6.asm
@@ -1,0 +1,223 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_fast_quantize_b_armv6|
+
+ INCLUDE vp9_asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 BLOCK *b
+; r1 BLOCKD *d
+|vp8_fast_quantize_b_armv6| PROC
+ stmfd sp!, {r1, r4-r11, lr} ; r1 (BLOCKD *d) is saved too: it ends up at [sp, #0]
+; Quantizes and dequantizes 16 coefficients (4 passes of 4), then stores eob.
+ ldr r3, [r0, #vp8_block_coeff] ; coeff
+ ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast
+ ldr r5, [r0, #vp8_block_round] ; round
+ ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff
+ ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff
+ ldr r8, [r1, #vp8_blockd_dequant] ; dequant
+
+ ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction
+ ; is used to update the counter so that
+ ; it can be used to mark nonzero
+ ; quantized coefficient pairs.
+
+ mov r1, #0 ; flags for quantized coeffs: bit k <-> qcoeff pair (2k, 2k+1)
+
+ ; PART 1: quantization and dequantization loop
+loop
+ ldr r9, [r3], #4 ; [z1 | z0]
+ ldr r10, [r5], #4 ; [r1 | r0] (round values, not the registers)
+ ldr r11, [r4], #4 ; [q1 | q0]
+
+ ssat16 lr, #1, r9 ; [sz1 | sz0]: per-halfword sign mask (-1 if z < 0, else 0)
+ eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0]
+ ssub16 r9, r9, lr ; x = (z ^ sz) - sz, i.e. |z|
+ sadd16 r9, r9, r10 ; [x1+r1 | x0+r0]
+
+ ldr r12, [r3], #4 ; [z3 | z2]
+
+ smulbb r0, r9, r11 ; [(x0+r0)*q0] (overwrites r0 = b; b is not read again)
+ smultt r9, r9, r11 ; [(x1+r1)*q1]
+
+ ldr r10, [r5], #4 ; [r3 | r2]
+
+ ssat16 r11, #1, r12 ; [sz3 | sz2]
+ eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2]
+ pkhtb r0, r9, r0, asr #16 ; [y1 | y0], y = high 16 bits of each product
+ ldr r9, [r4], #4 ; [q3 | q2]
+ ssub16 r12, r12, r11 ; x = (z ^ sz) - sz
+
+ sadd16 r12, r12, r10 ; [x3+r3 | x2+r2]
+
+ eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)]
+
+ smulbb r10, r12, r9 ; [(x2+r2)*q2]
+ smultt r12, r12, r9 ; [(x3+r3)*q3]
+
+ ssub16 r0, r0, lr ; x = (y ^ sz) - sz (sign of z put back on y)
+
+ cmp r0, #0 ; check if both coeffs of the pair are zero
+ orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs (bits 0, 2, 4, 6 over the passes)
+
+ str r0, [r6], #4 ; *qcoeff++ = x (two halfword coeffs per store)
+ ldr r9, [r8], #4 ; [dq1 | dq0]
+
+ pkhtb r10, r12, r10, asr #16 ; [y3 | y2]
+ eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)]
+ ssub16 r10, r10, r11 ; x = (y ^ sz) - sz
+
+ cmp r10, #0 ; check if both coeffs of the pair are zero
+ orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs (bits 1, 3, 5, 7 over the passes)
+
+ str r10, [r6], #4 ; *qcoeff++ = x
+ ldr r11, [r8], #4 ; [dq3 | dq2]
+
+ smulbb r12, r0, r9 ; [x0*dq0]
+ smultt r0, r0, r9 ; [x1*dq1]
+
+ smulbb r9, r10, r11 ; [x2*dq2]
+ smultt r10, r10, r11 ; [x3*dq3]
+
+ lsls r2, r2, #2 ; update loop counter; Z is set once the bit is shifted out (4th pass)
+ strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0]
+ strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1]
+ strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2]
+ strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3]
+ add r7, r7, #8 ; dqcoeff += 8 bytes (4 halfword coeffs)
+ bne loop ; tests Z from the 'lsls' above (strh/add leave flags alone)
+
+ ; PART 2: check position for eob...
+ mov lr, #0 ; init eob
+ cmp r1, #0 ; coeffs after quantization?
+ ldr r11, [sp, #0] ; restore BLOCKD pointer (pushed as r1 in the prologue)
+ beq end ; skip eob calculations if all zero
+
+ ldr r0, [r11, #vp8_blockd_qcoeff] ; r0 = qcoeff base again (r6 was post-incremented in PART 1)
+
+ ; check shortcut for nonzero qcoeffs
+ tst r1, #0x80 ; pair rc 14/15
+ bne quant_coeff_15_14
+ tst r1, #0x20 ; pair rc 10/11
+ bne quant_coeff_13_11
+ tst r1, #0x8 ; pair rc 6/7
+ bne quant_coeff_12_7
+ tst r1, #0x40 ; pair rc 12/13
+ bne quant_coeff_10_9
+ tst r1, #0x10 ; pair rc 8/9
+ bne quant_coeff_8_3
+ tst r1, #0x2 ; pair rc 2/3
+ bne quant_coeff_6_5
+ tst r1, #0x4 ; pair rc 4/5
+ bne quant_coeff_4_2
+ b quant_coeff_1_0 ; only pair rc 0/1 is left
+
+quant_coeff_15_14
+ ldrh r2, [r0, #30] ; rc=15, i=15 (byte offset = 2*rc into qcoeff)
+ mov lr, #16 ; candidate eob = i + 1
+ cmp r2, #0
+ bne end
+
+ ldrh r3, [r0, #28] ; rc=14, i=14
+ mov lr, #15
+ cmp r3, #0
+ bne end ; otherwise fall through to the next lower scan position
+
+quant_coeff_13_11
+ ldrh r2, [r0, #22] ; rc=11, i=13
+ mov lr, #14
+ cmp r2, #0
+ bne end
+
+quant_coeff_12_7
+ ldrh r3, [r0, #14] ; rc=7, i=12
+ mov lr, #13
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #20] ; rc=10, i=11
+ mov lr, #12
+ cmp r2, #0
+ bne end
+
+quant_coeff_10_9
+ ldrh r3, [r0, #26] ; rc=13, i=10
+ mov lr, #11
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #24] ; rc=12, i=9
+ mov lr, #10
+ cmp r2, #0
+ bne end
+
+quant_coeff_8_3
+ ldrh r3, [r0, #18] ; rc=9, i=8
+ mov lr, #9
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #12] ; rc=6, i=7
+ mov lr, #8
+ cmp r2, #0
+ bne end
+
+quant_coeff_6_5
+ ldrh r3, [r0, #6] ; rc=3, i=6
+ mov lr, #7
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #4] ; rc=2, i=5
+ mov lr, #6
+ cmp r2, #0
+ bne end
+
+quant_coeff_4_2
+ ldrh r3, [r0, #10] ; rc=5, i=4
+ mov lr, #5
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #16] ; rc=8, i=3
+ mov lr, #4
+ cmp r2, #0
+ bne end
+
+ ldrh r3, [r0, #8] ; rc=4, i=2
+ mov lr, #3
+ cmp r3, #0
+ bne end
+
+quant_coeff_1_0
+ ldrh r2, [r0, #2] ; rc=1, i=1
+ mov lr, #2
+ cmp r2, #0
+ bne end
+
+ mov lr, #1 ; rc=0, i=0: not tested, flags != 0 and all other candidates are ruled out
+
+end
+ str lr, [r11, #vp8_blockd_eob] ; write eob into the BLOCKD
+ ldmfd sp!, {r1, r4-r11, pc} ; restore registers and return (pc <- saved lr)
+
+ ENDP
+
+loop_count
+ DCD 0x1000000 ; bit 24 set: 'lsls #2' per pass shifts it out (result 0) on the 4th pass
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp9_mse16x16_armv6.asm
@@ -1,0 +1,138 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mse16x16_armv6|
+
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+; returns: r0 = sum of squared differences, also stored through the sse pointer
+;note: Based on vp9_variance16x16_armv6. In this function, sum is never used,
+; so the usad8 partial sums kept below are dead (overwritten before any read).
+
+|vp8_mse16x16_armv6| PROC
+ ; Sum of squared differences over 16 rows of 16 pixels, 4 pixels per step.
+ push {r4-r9, lr} ; 7 registers = 28 bytes, so the stack argument is at sp+28
+
+ pld [r0, r1, lsl #0] ; preload src_ptr + source_stride
+ pld [r2, r3, lsl #0] ; preload ref_ptr + recon_stride
+
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov r4, #0 ; initialize sse = 0
+
+loop
+ ; 1st 4 pixels
+ ldr r5, [r0, #0x0] ; load 4 src pixels
+ ldr r6, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r8, r5, r6 ; calculate difference
+ pld [r0, r1, lsl #1] ; preload src_ptr + 2 * source_stride
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1] ; preload ref_ptr + 2 * recon_stride
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums (dead here: r5 and r6 are overwritten before use)
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; absolute differences of all 4 pixels
+
+ ldr r5, [r0, #0x4] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r6, [r2, #0x4] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums (dead here: r5 and r6 are overwritten before use)
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; absolute differences of all 4 pixels
+ ldr r5, [r0, #0x8] ; load 4 src pixels
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r6, [r2, #0x8] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums (dead here: r5 and r6 are overwritten before use)
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; absolute differences of all 4 pixels
+
+ ldr r5, [r0, #0xc] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r6, [r2, #0xc] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set ref_ptr to next row
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums (dead here: r5 and r6 are overwritten before use)
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; absolute differences of all 4 pixels
+
+ subs r12, r12, #1 ; next row (flags are consumed by the bne below)
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r1, [sp, #28] ; get address of sse
+ mov r0, r4 ; return sse
+ str r4, [r1] ; store sse
+
+ pop {r4-r9, pc}
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp9_sad16x16_armv6.asm
@@ -1,0 +1,95 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sad16x16_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 const unsigned char *src_ptr
+; r1 int src_stride
+; r2 const unsigned char *ref_ptr
+; r3 int ref_stride
+; stack max_sad (not used)
+|vp8_sad16x16_armv6| PROC
+ stmfd sp!, {r4-r12, lr}
+ ; Sum of absolute differences over a 16x16 block; the total is returned in r0.
+ pld [r0, r1, lsl #0] ; preload src_ptr + src_stride
+ pld [r2, r3, lsl #0] ; preload ref_ptr + ref_stride
+ pld [r0, r1, lsl #1] ; preload src_ptr + 2 * src_stride
+ pld [r2, r3, lsl #1] ; preload ref_ptr + 2 * ref_stride
+
+ mov r4, #0 ; sad = 0;
+ mov r5, #8 ; loop count (each iteration processes two rows)
+
+loop
+ ; 1st row
+ ldr r6, [r0, #0x0] ; load 4 src pixels (1A)
+ ldr r8, [r2, #0x0] ; load 4 ref pixels (1A)
+ ldr r7, [r0, #0x4] ; load 4 src pixels (1A)
+ ldr r9, [r2, #0x4] ; load 4 ref pixels (1A)
+ ldr r10, [r0, #0x8] ; load 4 src pixels (1B)
+ ldr r11, [r0, #0xC] ; load 4 src pixels (1B)
+
+ usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels, accumulate into r4
+ usad8 r8, r7, r9 ; calculate sad for 4 pixels, second partial sum in r8
+
+ ldr r12, [r2, #0x8] ; load 4 ref pixels (1B)
+ ldr lr, [r2, #0xC] ; load 4 ref pixels (1B)
+
+ add r0, r0, r1 ; set src pointer to next row
+ add r2, r2, r3 ; set ref pointer to next row
+
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
+ usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
+
+ ldr r6, [r0, #0x0] ; load 4 src pixels (2A)
+ ldr r7, [r0, #0x4] ; load 4 src pixels (2A)
+ add r4, r4, r8 ; add partial sad values
+
+ ; 2nd row
+ ldr r8, [r2, #0x0] ; load 4 ref pixels (2A)
+ ldr r9, [r2, #0x4] ; load 4 ref pixels (2A)
+ ldr r10, [r0, #0x8] ; load 4 src pixels (2B)
+ ldr r11, [r0, #0xC] ; load 4 src pixels (2B)
+
+ usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels, accumulate into r4
+ usad8 r8, r7, r9 ; calculate sad for 4 pixels, second partial sum in r8
+
+ ldr r12, [r2, #0x8] ; load 4 ref pixels (2B)
+ ldr lr, [r2, #0xC] ; load 4 ref pixels (2B)
+
+ add r0, r0, r1 ; set src pointer to next row
+ add r2, r2, r3 ; set ref pointer to next row
+
+ usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
+ usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
+
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ subs r5, r5, #1 ; decrement loop counter
+ add r4, r4, r8 ; add partial sad values (add leaves the flags from subs intact)
+
+ bne loop
+
+ mov r0, r4 ; return sad
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp9_short_fdct4x4_armv6.asm
@@ -1,0 +1,262 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp8_short_fdct4x4_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+; C equivalent: void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct4x4_armv6| PROC
+ ; r0 = input, r1 = output, r2 = pitch (added to r0 as a byte offset per row)
+ stmfd sp!, {r4 - r12, lr}
+
+ ; PART 1: row pass, each input row of 4 coefficients -> 4 intermediate values
+
+ ; coeffs 0-3
+ ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2]
+
+ ldr r10, c7500
+ ldr r11, c14500
+ ldr r12, c0x22a453a0 ; [2217*4 | 5352*4]
+ ldr lr, c0x00080008 ; [8 | 8], multiplier for the smuad/smusd below
+ ror r5, r5, #16 ; [i2 | i3]
+
+ qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift
+ qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift
+
+ add r0, r0, r2 ; update input pointer
+
+ qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8
+ smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8
+
+ smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6]
+
+ pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2
+ pkhbt r6, r5, r7, lsl #4 ; [o3 | o2]
+
+ str r6, [r1, #4] ; park [o3 | o2] in the output buffer until PART 2
+
+ ; coeffs 4-7
+ ror r9, r9, #16 ; [i6 | i7]
+
+ qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift
+ qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift
+
+ add r0, r0, r2 ; update input pointer
+
+ qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8
+ smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8
+
+ smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10]
+
+ pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2
+ pkhbt r6, r8, r7, lsl #4 ; [o7 | o6]
+
+ str r6, [r1, #12] ; park [o7 | o6] in the output buffer until PART 2
+
+ ; coeffs 8-11
+ ror r5, r5, #16 ; [i10 | i11]
+
+ qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift
+ qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift
+
+ add r0, r0, r2 ; update input pointer
+
+ qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8
+ smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8
+
+ smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14]
+
+ pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2
+ pkhbt r6, r8, r7, lsl #4 ; [o11 | o10]
+
+ str r6, [r1, #20] ; park [o11 | o10] in the output buffer until PART 2
+
+ ; coeffs 12-15
+ ror r5, r5, #16 ; [i14 | i15]
+
+ qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift
+ qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift
+
+ qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8
+ smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8
+
+ smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2
+ pkhbt r6, r5, r7, lsl #4 ; [o15 | o14]
+
+ str r6, [r1, #28] ; park [o15 | o14] in the output buffer until PART 2
+
+ ; PART 2 takes the PART 1 outputs as its inputs (named i0..i15 again below)
+ ; PART 2 -------------------------------------------------
+ ldr r11, c12000
+ ldr r10, c51000
+ ldr lr, c0x00070007 ; [7 | 7], rounding term added to a1 in both halfwords
+
+ qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12]
+ qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8]
+ qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8]
+ qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12]
+
+ qadd16 r4, r4, lr ; a1 + 7
+
+ add r0, r11, #0x10000 ; 12000 + (1 << 16): becomes +1 after the later asr #16, used when d1 != 0
+
+ qadd16 r2, r4, r5 ; a1 + b1 + 7
+ qsub16 r3, r4, r5 ; a1 - b1 + 7
+
+ ldr r12, c0x08a914e8 ; [2217 | 5352]
+
+ lsl r8, r2, #16 ; prepare bottom halfword for scaling
+ asr r2, r2, #4 ; scale top halfword
+ lsl r9, r3, #16 ; prepare bottom halfword for scaling
+ asr r3, r3, #4 ; scale top halfword
+ pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
+ pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+ smulbt r2, r6, r12 ; [ ------ | c1*2217]
+ str r4, [r1, #0] ; [ o1 | o0]
+ smultt r3, r6, r12 ; [c1*2217 | ------ ]
+ str r5, [r1, #16] ; [ o9 | o8]
+
+ smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
+ smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
+
+ smulbb r2, r6, r12 ; [ ------ | c1*5352]
+ smultb r3, r6, r12 ; [c1*5352 | ------ ]
+
+ lsls r6, r7, #16 ; d1 != 0 ? (bottom halfword)
+ addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
+ addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+ asrs r6, r7, #16 ; d1 != 0 ? (top halfword)
+ addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
+ addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+ smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
+ smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
+
+ pkhtb r9, r9, r8, asr #16
+
+ sub r4, r4, r2
+ sub r5, r5, r3
+
+ ldr r3, [r1, #4] ; [i3 | i2] (reload of the value parked in PART 1)
+
+ pkhtb r5, r5, r4, asr #16 ; [o13|o12]
+
+ str r9, [r1, #8] ; [o5 | o4]
+
+ ldr r9, [r1, #12] ; [i7 | i6]
+ ldr r8, [r1, #28] ; [i15|i14]
+ ldr r2, [r1, #20] ; [i11|i10]
+ str r5, [r1, #24] ; [o13|o12]
+
+ qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14]
+ qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10]
+
+ qadd16 r4, r4, lr ; a1 + 7
+
+ qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10]
+ qadd16 r2, r4, r5 ; a1 + b1 + 7
+ qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14]
+ qsub16 r3, r4, r5 ; a1 - b1 + 7
+
+ lsl r8, r2, #16 ; prepare bottom halfword for scaling
+ asr r2, r2, #4 ; scale top halfword
+ lsl r9, r3, #16 ; prepare bottom halfword for scaling
+ asr r3, r3, #4 ; scale top halfword
+ pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
+ pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+ smulbt r2, r6, r12 ; [ ------ | c1*2217]
+ str r4, [r1, #4] ; [ o3 | o2]
+ smultt r3, r6, r12 ; [c1*2217 | ------ ]
+ str r5, [r1, #20] ; [ o11 | o10]
+
+ smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
+ smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
+
+ smulbb r2, r6, r12 ; [ ------ | c1*5352]
+ smultb r3, r6, r12 ; [c1*5352 | ------ ]
+
+ lsls r6, r7, #16 ; d1 != 0 ? (bottom halfword)
+ addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
+ addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+
+ asrs r6, r7, #16 ; d1 != 0 ? (top halfword)
+ addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
+ addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+ smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
+ smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
+
+ pkhtb r9, r9, r8, asr #16
+
+ sub r4, r4, r2
+ sub r5, r5, r3
+
+ str r9, [r1, #12] ; [o7 | o6]
+ pkhtb r5, r5, r4, asr #16 ; [o15|o14]
+
+ str r5, [r1, #28] ; [o15|o14]
+
+ ldmfd sp!, {r4 - r12, pc}
+
+ ENDP
+
+; Used constants (loaded above through label-relative ldr)
+c7500
+ DCD 7500
+c14500
+ DCD 14500
+c0x22a453a0
+ DCD 0x22a453a0 ; [2217*4 | 5352*4]
+c0x00080008
+ DCD 0x00080008 ; [8 | 8]
+c12000
+ DCD 12000
+c51000
+ DCD 51000
+c0x00070007
+ DCD 0x00070007 ; [7 | 7]
+c0x08a914e8
+ DCD 0x08a914e8 ; [2217 | 5352]
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp9_subtract_armv6.asm
@@ -1,0 +1,264 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_subtract_mby_armv6|
+ EXPORT |vp8_subtract_mbuv_armv6|
+ EXPORT |vp8_subtract_b_armv6|
+
+ INCLUDE vp9_asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 BLOCK *be
+; r1 BLOCKD *bd
+; r2 int pitch
+|vp8_subtract_b_armv6| PROC
+ ; Writes src - pred for a 4x4 block as 16-bit differences into be->src_diff.
+ stmfd sp!, {r4-r9}
+
+ ldr r4, [r0, #vp8_block_base_src]
+ ldr r5, [r0, #vp8_block_src]
+ ldr r6, [r0, #vp8_block_src_diff]
+
+ ldr r3, [r4] ; r3 = *base_src
+ ldr r7, [r0, #vp8_block_src_stride]
+ add r3, r3, r5 ; src = *base_src + src
+ ldr r8, [r1, #vp8_blockd_predictor]
+
+ mov r9, #4 ; loop count
+
+loop_block
+ ; one row of 4 pixels per iteration; r0 and r1 are reused as scratch from here
+ ldr r0, [r3], r7 ; src (post-incremented by src_stride)
+ ldr r1, [r8], r2 ; pred (post-incremented by pitch)
+
+ uxtb16 r4, r0 ; [s2 | s0]
+ uxtb16 r5, r1 ; [p2 | p0]
+ uxtb16 r0, r0, ror #8 ; [s3 | s1]
+ uxtb16 r1, r1, ror #8 ; [p3 | p1]
+
+ usub16 r4, r4, r5 ; [d2 | d0]
+ usub16 r5, r0, r1 ; [d3 | d1]
+
+ subs r9, r9, #1 ; decrement loop counter
+
+ pkhbt r0, r4, r5, lsl #16 ; [d1 | d0]
+ pkhtb r1, r5, r4, asr #16 ; [d3 | d2]
+
+ str r0, [r6, #0] ; diff
+ str r1, [r6, #4] ; diff
+
+ add r6, r6, r2, lsl #1 ; update diff pointer (pitch entries of 2 bytes each)
+ bne loop_block
+
+ ldmfd sp!, {r4-r9}
+ mov pc, lr
+
+ ENDP
+
+
+; r0 short *diff
+; r1 unsigned char *usrc
+; r2 unsigned char *vsrc
+; r3 unsigned char *pred
+; stack int stride
+|vp8_subtract_mbuv_armv6| PROC
+ ; Writes src - pred as 16-bit differences for the 8x8 U block, then the 8x8 V block.
+ stmfd sp!, {r4-r12, lr}
+
+ add r0, r0, #512 ; set *diff point to Cb (skips 512 bytes = 256 shorts)
+ add r3, r3, #256 ; set *pred point to Cb (skips 256 bytes)
+
+ mov r4, #8 ; loop count
+ ldr r5, [sp, #40] ; stride (10 registers pushed = 40 bytes)
+
+ ; Subtract U block
+loop_u
+ ldr r6, [r1] ; src (A)
+ ldr r7, [r3], #4 ; pred (A)
+
+ uxtb16 r8, r6 ; [s2 | s0] (A)
+ uxtb16 r9, r7 ; [p2 | p0] (A)
+ uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
+ uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (A)
+ usub16 r7, r10, r11 ; [d3 | d1] (A)
+
+ ldr r10, [r1, #4] ; src (B)
+ ldr r11, [r3], #4 ; pred (B)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+ str r8, [r0], #4 ; diff (A)
+ uxtb16 r8, r10 ; [s2 | s0] (B)
+ str r9, [r0], #4 ; diff (A)
+
+ uxtb16 r9, r11 ; [p2 | p0] (B)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (B)
+ usub16 r7, r10, r11 ; [d3 | d1] (B)
+
+ add r1, r1, r5 ; update usrc pointer
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+ str r8, [r0], #4 ; diff (B)
+ subs r4, r4, #1 ; update loop counter
+ str r9, [r0], #4 ; diff (B)
+
+ bne loop_u
+
+ mov r4, #8 ; loop count
+
+ ; Subtract V block (diff and pred pointers simply continue from the U block)
+loop_v
+ ldr r6, [r2] ; src (A)
+ ldr r7, [r3], #4 ; pred (A)
+
+ uxtb16 r8, r6 ; [s2 | s0] (A)
+ uxtb16 r9, r7 ; [p2 | p0] (A)
+ uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
+ uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (A)
+ usub16 r7, r10, r11 ; [d3 | d1] (A)
+
+ ldr r10, [r2, #4] ; src (B)
+ ldr r11, [r3], #4 ; pred (B)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+ str r8, [r0], #4 ; diff (A)
+ uxtb16 r8, r10 ; [s2 | s0] (B)
+ str r9, [r0], #4 ; diff (A)
+
+ uxtb16 r9, r11 ; [p2 | p0] (B)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (B)
+ usub16 r7, r10, r11 ; [d3 | d1] (B)
+
+ add r2, r2, r5 ; update vsrc pointer
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+ str r8, [r0], #4 ; diff (B)
+ subs r4, r4, #1 ; update loop counter
+ str r9, [r0], #4 ; diff (B)
+
+ bne loop_v
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+
+; r0 short *diff
+; r1 unsigned char *src
+; r2 unsigned char *pred
+; r3 int stride
+|vp8_subtract_mby_armv6| PROC
+ ; Writes src - pred as 16-bit differences for a 16x16 block, one row per iteration.
+ stmfd sp!, {r4-r11}
+
+ mov r4, #16 ; loop count (16 rows)
+loop
+ ldr r6, [r1] ; src (A)
+ ldr r7, [r2], #4 ; pred (A)
+
+ uxtb16 r8, r6 ; [s2 | s0] (A)
+ uxtb16 r9, r7 ; [p2 | p0] (A)
+ uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
+ uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (A)
+ usub16 r7, r10, r11 ; [d3 | d1] (A)
+
+ ldr r10, [r1, #4] ; src (B)
+ ldr r11, [r2], #4 ; pred (B)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+ str r8, [r0], #4 ; diff (A)
+ uxtb16 r8, r10 ; [s2 | s0] (B)
+ str r9, [r0], #4 ; diff (A)
+
+ uxtb16 r9, r11 ; [p2 | p0] (B)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (B)
+ usub16 r7, r10, r11 ; [d3 | d1] (B)
+
+ ldr r10, [r1, #8] ; src (C)
+ ldr r11, [r2], #4 ; pred (C)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+ str r8, [r0], #4 ; diff (B)
+ uxtb16 r8, r10 ; [s2 | s0] (C)
+ str r9, [r0], #4 ; diff (B)
+
+ uxtb16 r9, r11 ; [p2 | p0] (C)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (C)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (C)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (C)
+ usub16 r7, r10, r11 ; [d3 | d1] (C)
+
+ ldr r10, [r1, #12] ; src (D)
+ ldr r11, [r2], #4 ; pred (D)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
+
+ str r8, [r0], #4 ; diff (C)
+ uxtb16 r8, r10 ; [s2 | s0] (D)
+ str r9, [r0], #4 ; diff (C)
+
+ uxtb16 r9, r11 ; [p2 | p0] (D)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (D)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (D)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (D)
+ usub16 r7, r10, r11 ; [d3 | d1] (D)
+
+ add r1, r1, r3 ; update src pointer (pred and diff advance contiguously instead)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D)
+
+ str r8, [r0], #4 ; diff (D)
+ subs r4, r4, #1 ; update loop counter
+ str r9, [r0], #4 ; diff (D)
+
+ bne loop
+
+ ldmfd sp!, {r4-r11}
+ mov pc, lr
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp9_variance16x16_armv6.asm
@@ -1,0 +1,153 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_variance16x16_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp9_variance16x16_armv6| PROC
+ ; Returns sse - ((sum * sum) >> 8) for a 16x16 block and stores sse through the sse pointer.
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0] ; preload src_ptr + source_stride
+ pld [r2, r3, lsl #0] ; preload ref_ptr + recon_stride
+
+ mov r8, #0 ; initialize sum = 0
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+
+loop
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; absolute differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; absolute differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; absolute differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set ref_ptr to next row
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; absolute differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff: |sum| can reach 16*16*255 = 65280, so sum * sum may exceed 2^31
+ ldr r6, [sp, #40] ; get address of sse (10 registers pushed = 40 bytes)
+ mul r0, r8, r8 ; sum * sum (fits in 32 bits only when read as unsigned)
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)); lsr so bit 31 is not sign-extended
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp9_variance8x8_armv6.asm
@@ -1,0 +1,101 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_variance8x8_armv6|
+
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp9_variance8x8_armv6| PROC
+ ; Returns sse - ((sum * sum) >> 6) for an 8x8 block and stores sse through the sse pointer.
+ push {r4-r10, lr}
+
+ pld [r0, r1, lsl #0] ; preload src_ptr + source_stride
+ pld [r2, r3, lsl #0] ; preload ref_ptr + recon_stride
+
+ mov r12, #8 ; set loop counter to 8 (=block height)
+ mov r4, #0 ; initialize sum = 0
+ mov r5, #0 ; initialize sse = 0
+
+loop
+ ; 1st 4 pixels
+ ldr r6, [r0, #0x0] ; load 4 src pixels
+ ldr r7, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r8, r6, r7 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r10, r8, lr ; select bytes with positive difference
+ usub8 r9, r7, r6 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r6, r10, lr ; calculate sum of positive differences
+ usad8 r7, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r10 ; absolute differences of all 4 pixels
+ ; calculate total sum
+ add r4, r4, r6 ; add positive differences to sum
+ sub r4, r4, r7 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r7, r8 ; byte (two pixels) to halfwords
+ uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
+ smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r6, [r0, #0x4] ; load 4 src pixels
+ ldr r7, [r2, #0x4] ; load 4 ref pixels
+ smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r6, r7 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r10, r8, lr ; select bytes with positive difference
+ usub8 r9, r7, r6 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set ref_ptr to next row
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r6, r10, lr ; calculate sum of positive differences
+ usad8 r7, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r10 ; absolute differences of all 4 pixels
+
+ ; calculate total sum
+ add r4, r4, r6 ; add positive differences to sum
+ sub r4, r4, r7 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r7, r8 ; byte (two pixels) to halfwords
+ uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
+ smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1 ; next row
+ smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff (|sum| <= 8*8*255 = 16320, so sum * sum stays below 2^31)
+ ldr r8, [sp, #32] ; get address of sse (8 registers pushed = 32 bytes)
+ mul r1, r4, r4 ; sum * sum
+ str r5, [r8] ; store sse
+ sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
+
+ pop {r4-r10, pc}
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_h_armv6.asm
@@ -1,0 +1,181 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_variance_halfpixvar16x16_h_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp9_variance_halfpixvar16x16_h_armv6| PROC
+ ; 16x16 variance of (src averaged with its right-hand neighbour) against ref; stores sse.
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0] ; preload src_ptr + source_stride
+ pld [r2, r3, lsl #0] ; preload ref_ptr + recon_stride
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation: per byte (a + b + 1) >> 1, via (a - ~b) / 2 then xor 0x80
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; absolute differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; absolute differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; absolute differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset (reads one byte past column 15)
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set ref_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; absolute differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff: |sum| can reach 16*16*255 = 65280, so sum * sum may exceed 2^31
+ ldr r6, [sp, #40] ; get address of sse (10 registers pushed = 40 bytes)
+ mul r0, r8, r8 ; sum * sum (fits in 32 bits only when read as unsigned)
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)); lsr so bit 31 is not sign-extended
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080 ; per-byte 0x80, xor-ed in after uhsub8 to finish the rounding average
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_hv_armv6.asm
@@ -1,0 +1,222 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_variance_halfpixvar16x16_hv_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp9_variance_halfpixvar16x16_hv_armv6| PROC
+ ; 16x16 variance of ref vs. src averaged horizontally and vertically (x, y, z below); sse is also stored via the stack argument
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0] ; preload src_ptr + source_stride
+ pld [r2, r3, lsl #0] ; preload ref_ptr + recon_stride
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080 ; 0x80 in every byte, used by the averaging steps
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; pointer to pixels on the next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load source pixels a, row N
+ ldr r6, [r0, #1] ; load source pixels b, row N
+ ldr r5, [r9, #0] ; load source pixels c, row N+1
+ ldr r7, [r9, #1] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6 ; ~b = 255 - b in each byte
+ uhsub8 r4, r4, r6 ; (a - (255 - b)) >> 1 in each byte
+ eor r4, r4, r10 ; toggle bit 7 of each byte -> (a + b + 1) >> 1
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #0] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1] ; preload src_ptr + 2 * source_stride
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1] ; preload ref_ptr + 2 * recon_stride
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load source pixels a, row N
+ ldr r6, [r0, #5] ; load source pixels b, row N
+ ldr r5, [r9, #4] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #5] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load source pixels a, row N
+ ldr r6, [r0, #9] ; load source pixels b, row N
+ ldr r5, [r9, #8] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #9] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load source pixels a, row N
+ ldr r6, [r0, #13] ; load source pixels b, row N
+ ldr r5, [r9, #12] ; load source pixels c, row N+1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+ ldr r7, [r9, #13] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set ref_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1 ; decrement row counter; flags consumed by bne below
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse (5th argument, above the 10 saved registers)
+ mul r0, r8, r8 ; sum * sum (non-negative, may use bit 31)
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)), logical shift: sum * sum is unsigned
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_v_armv6.asm
@@ -1,0 +1,183 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_variance_halfpixvar16x16_v_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp9_variance_halfpixvar16x16_v_armv6| PROC
+ ; 16x16 variance of ref vs. src averaged with the row below it; sse is also stored via the stack argument
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0] ; preload src_ptr + source_stride
+ pld [r2, r3, lsl #0] ; preload ref_ptr + recon_stride
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080 ; 0x80 in every byte, used by the averaging step
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; r9 = pointer to the same pixels on the next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r9, #0] ; load 4 src pixels from next row
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation: (cur + next + 1) >> 1 in each byte
+ mvn r6, r6 ; ~next = 255 - next in each byte
+ uhsub8 r4, r4, r6 ; (cur - (255 - next)) >> 1 in each byte
+ eor r4, r4, r10 ; toggle bit 7 of each byte to finish the rounded average
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1] ; preload src_ptr + 2 * source_stride
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1] ; preload ref_ptr + 2 * recon_stride
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r9, #4] ; load 4 src pixels from next row
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r9, #8] ; load 4 src pixels from next row
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r9, #12] ; load 4 src pixels from next row
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set ref_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1 ; decrement row counter; flags consumed by bne below
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse (5th argument, above the 10 saved registers)
+ mul r0, r8, r8 ; sum * sum (non-negative, may use bit 31)
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)), logical shift: sum * sum is unsigned
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
--- a/vp9/encoder/arm/armv6/vp9_vp8_fast_quantize_b_armv6.asm
+++ /dev/null
@@ -1,224 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_fast_quantize_b_armv6|
-
- INCLUDE vp9_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 BLOCK *b
-; r1 BLOCKD *d
-|vp8_fast_quantize_b_armv6| PROC
- stmfd sp!, {r1, r4-r11, lr}
-
- ldr r3, [r0, #vp8_block_coeff] ; coeff
- ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast
- ldr r5, [r0, #vp8_block_round] ; round
- ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff
- ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff
- ldr r8, [r1, #vp8_blockd_dequant] ; dequant
-
- ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction
- ; is used to update the counter so that
- ; it can be used to mark nonzero
- ; quantized coefficient pairs.
-
- mov r1, #0 ; flags for quantized coeffs
-
- ; PART 1: quantization and dequantization loop
-loop
- ldr r9, [r3], #4 ; [z1 | z0]
- ldr r10, [r5], #4 ; [r1 | r0]
- ldr r11, [r4], #4 ; [q1 | q0]
-
- ssat16 lr, #1, r9 ; [sz1 | sz0]
- eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0]
- ssub16 r9, r9, lr ; x = (z ^ sz) - sz
- sadd16 r9, r9, r10 ; [x1+r1 | x0+r0]
-
- ldr r12, [r3], #4 ; [z3 | z2]
-
- smulbb r0, r9, r11 ; [(x0+r0)*q0]
- smultt r9, r9, r11 ; [(x1+r1)*q1]
-
- ldr r10, [r5], #4 ; [r3 | r2]
-
- ssat16 r11, #1, r12 ; [sz3 | sz2]
- eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2]
- pkhtb r0, r9, r0, asr #16 ; [y1 | y0]
- ldr r9, [r4], #4 ; [q3 | q2]
- ssub16 r12, r12, r11 ; x = (z ^ sz) - sz
-
- sadd16 r12, r12, r10 ; [x3+r3 | x2+r2]
-
- eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)]
-
- smulbb r10, r12, r9 ; [(x2+r2)*q2]
- smultt r12, r12, r9 ; [(x3+r3)*q3]
-
- ssub16 r0, r0, lr ; x = (y ^ sz) - sz
-
- cmp r0, #0 ; check if zero
- orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs
-
- str r0, [r6], #4 ; *qcoeff++ = x
- ldr r9, [r8], #4 ; [dq1 | dq0]
-
- pkhtb r10, r12, r10, asr #16 ; [y3 | y2]
- eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)]
- ssub16 r10, r10, r11 ; x = (y ^ sz) - sz
-
- cmp r10, #0 ; check if zero
- orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs
-
- str r10, [r6], #4 ; *qcoeff++ = x
- ldr r11, [r8], #4 ; [dq3 | dq2]
-
- smulbb r12, r0, r9 ; [x0*dq0]
- smultt r0, r0, r9 ; [x1*dq1]
-
- smulbb r9, r10, r11 ; [x2*dq2]
- smultt r10, r10, r11 ; [x3*dq3]
-
- lsls r2, r2, #2 ; update loop counter
- strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0]
- strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1]
- strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2]
- strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3]
- add r7, r7, #8 ; dqcoeff += 8
- bne loop
-
- ; PART 2: check position for eob...
- mov lr, #0 ; init eob
- cmp r1, #0 ; coeffs after quantization?
- ldr r11, [sp, #0] ; restore BLOCKD pointer
- beq end ; skip eob calculations if all zero
-
- ldr r0, [r11, #vp8_blockd_qcoeff]
-
- ; check shortcut for nonzero qcoeffs
- tst r1, #0x80
- bne quant_coeff_15_14
- tst r1, #0x20
- bne quant_coeff_13_11
- tst r1, #0x8
- bne quant_coeff_12_7
- tst r1, #0x40
- bne quant_coeff_10_9
- tst r1, #0x10
- bne quant_coeff_8_3
- tst r1, #0x2
- bne quant_coeff_6_5
- tst r1, #0x4
- bne quant_coeff_4_2
- b quant_coeff_1_0
-
-quant_coeff_15_14
- ldrh r2, [r0, #30] ; rc=15, i=15
- mov lr, #16
- cmp r2, #0
- bne end
-
- ldrh r3, [r0, #28] ; rc=14, i=14
- mov lr, #15
- cmp r3, #0
- bne end
-
-quant_coeff_13_11
- ldrh r2, [r0, #22] ; rc=11, i=13
- mov lr, #14
- cmp r2, #0
- bne end
-
-quant_coeff_12_7
- ldrh r3, [r0, #14] ; rc=7, i=12
- mov lr, #13
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #20] ; rc=10, i=11
- mov lr, #12
- cmp r2, #0
- bne end
-
-quant_coeff_10_9
- ldrh r3, [r0, #26] ; rc=13, i=10
- mov lr, #11
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #24] ; rc=12, i=9
- mov lr, #10
- cmp r2, #0
- bne end
-
-quant_coeff_8_3
- ldrh r3, [r0, #18] ; rc=9, i=8
- mov lr, #9
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #12] ; rc=6, i=7
- mov lr, #8
- cmp r2, #0
- bne end
-
-quant_coeff_6_5
- ldrh r3, [r0, #6] ; rc=3, i=6
- mov lr, #7
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #4] ; rc=2, i=5
- mov lr, #6
- cmp r2, #0
- bne end
-
-quant_coeff_4_2
- ldrh r3, [r0, #10] ; rc=5, i=4
- mov lr, #5
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #16] ; rc=8, i=3
- mov lr, #4
- cmp r2, #0
- bne end
-
- ldrh r3, [r0, #8] ; rc=4, i=2
- mov lr, #3
- cmp r3, #0
- bne end
-
-quant_coeff_1_0
- ldrh r2, [r0, #2] ; rc=1, i=1
- mov lr, #2
- cmp r2, #0
- bne end
-
- mov lr, #1 ; rc=0, i=0
-
-end
- str lr, [r11, #vp8_blockd_eob]
- ldmfd sp!, {r1, r4-r11, pc}
-
- ENDP
-
-loop_count
- DCD 0x1000000
-
- END
-
--- a/vp9/encoder/arm/armv6/vp9_vp8_mse16x16_armv6.asm
+++ /dev/null
@@ -1,138 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_mse16x16_armv6|
-
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-;
-;note: Based on vp9_variance16x16_armv6. In this function, sum is never used.
-; So, we can remove this part of calculation.
-
-|vp8_mse16x16_armv6| PROC
-
- push {r4-r9, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov r4, #0 ; initialize sse = 0
-
-loop
- ; 1st 4 pixels
- ldr r5, [r0, #0x0] ; load 4 src pixels
- ldr r6, [r2, #0x0] ; load 4 ref pixels
-
- mov lr, #0 ; constant zero
-
- usub8 r8, r5, r6 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r8, lr ; select bytes with positive difference
- usub8 r9, r6, r5 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r5, r7, lr ; calculate sum of positive differences
- usad8 r6, r8, lr ; calculate sum of negative differences
- orr r8, r8, r7 ; differences of all 4 pixels
-
- ldr r5, [r0, #0x4] ; load 4 src pixels
-
- ; calculate sse
- uxtb16 r6, r8 ; byte (two pixels) to halfwords
- uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
- smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r6, [r2, #0x4] ; load 4 ref pixels
- smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
-
- usub8 r8, r5, r6 ; calculate difference
- sel r7, r8, lr ; select bytes with positive difference
- usub8 r9, r6, r5 ; calculate difference with reversed operands
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r5, r7, lr ; calculate sum of positive differences
- usad8 r6, r8, lr ; calculate sum of negative differences
- orr r8, r8, r7 ; differences of all 4 pixels
- ldr r5, [r0, #0x8] ; load 4 src pixels
- ; calculate sse
- uxtb16 r6, r8 ; byte (two pixels) to halfwords
- uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
- smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r6, [r2, #0x8] ; load 4 ref pixels
- smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
-
- usub8 r8, r5, r6 ; calculate difference
- sel r7, r8, lr ; select bytes with positive difference
- usub8 r9, r6, r5 ; calculate difference with reversed operands
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r5, r7, lr ; calculate sum of positive differences
- usad8 r6, r8, lr ; calculate sum of negative differences
- orr r8, r8, r7 ; differences of all 4 pixels
-
- ldr r5, [r0, #0xc] ; load 4 src pixels
-
- ; calculate sse
- uxtb16 r6, r8 ; byte (two pixels) to halfwords
- uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
- smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r6, [r2, #0xc] ; load 4 ref pixels
- smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
-
- usub8 r8, r5, r6 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r8, lr ; select bytes with positive difference
- usub8 r9, r6, r5 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r5, r7, lr ; calculate sum of positive differences
- usad8 r6, r8, lr ; calculate sum of negative differences
- orr r8, r8, r7 ; differences of all 4 pixels
-
- subs r12, r12, #1 ; next row
-
- ; calculate sse
- uxtb16 r6, r8 ; byte (two pixels) to halfwords
- uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
- smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
- smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
-
- bne loop
-
- ; return stuff
- ldr r1, [sp, #28] ; get address of sse
- mov r0, r4 ; return sse
- str r4, [r1] ; store sse
-
- pop {r4-r9, pc}
-
- ENDP
-
- END
--- a/vp9/encoder/arm/armv6/vp9_vp8_sad16x16_armv6.asm
+++ /dev/null
@@ -1,96 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sad16x16_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 const unsigned char *src_ptr
-; r1 int src_stride
-; r2 const unsigned char *ref_ptr
-; r3 int ref_stride
-; stack max_sad (not used)
-|vp8_sad16x16_armv6| PROC
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
- pld [r0, r1, lsl #1]
- pld [r2, r3, lsl #1]
-
- mov r4, #0 ; sad = 0;
- mov r5, #8 ; loop count
-
-loop
- ; 1st row
- ldr r6, [r0, #0x0] ; load 4 src pixels (1A)
- ldr r8, [r2, #0x0] ; load 4 ref pixels (1A)
- ldr r7, [r0, #0x4] ; load 4 src pixels (1A)
- ldr r9, [r2, #0x4] ; load 4 ref pixels (1A)
- ldr r10, [r0, #0x8] ; load 4 src pixels (1B)
- ldr r11, [r0, #0xC] ; load 4 src pixels (1B)
-
- usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels
- usad8 r8, r7, r9 ; calculate sad for 4 pixels
-
- ldr r12, [r2, #0x8] ; load 4 ref pixels (1B)
- ldr lr, [r2, #0xC] ; load 4 ref pixels (1B)
-
- add r0, r0, r1 ; set src pointer to next row
- add r2, r2, r3 ; set dst pointer to next row
-
- pld [r0, r1, lsl #1]
- pld [r2, r3, lsl #1]
-
- usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
- usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
-
- ldr r6, [r0, #0x0] ; load 4 src pixels (2A)
- ldr r7, [r0, #0x4] ; load 4 src pixels (2A)
- add r4, r4, r8 ; add partial sad values
-
- ; 2nd row
- ldr r8, [r2, #0x0] ; load 4 ref pixels (2A)
- ldr r9, [r2, #0x4] ; load 4 ref pixels (2A)
- ldr r10, [r0, #0x8] ; load 4 src pixels (2B)
- ldr r11, [r0, #0xC] ; load 4 src pixels (2B)
-
- usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels
- usad8 r8, r7, r9 ; calculate sad for 4 pixels
-
- ldr r12, [r2, #0x8] ; load 4 ref pixels (2B)
- ldr lr, [r2, #0xC] ; load 4 ref pixels (2B)
-
- add r0, r0, r1 ; set src pointer to next row
- add r2, r2, r3 ; set dst pointer to next row
-
- usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
- usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
-
- pld [r0, r1, lsl #1]
- pld [r2, r3, lsl #1]
-
- subs r5, r5, #1 ; decrement loop counter
- add r4, r4, r8 ; add partial sad values
-
- bne loop
-
- mov r0, r4 ; return sad
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
- END
-
--- a/vp9/encoder/arm/armv6/vp9_vp8_short_fdct4x4_armv6.asm
+++ /dev/null
@@ -1,262 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp8_short_fdct4x4_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct4x4_armv6| PROC
-
- stmfd sp!, {r4 - r12, lr}
-
- ; PART 1
-
- ; coeffs 0-3
- ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2]
-
- ldr r10, c7500
- ldr r11, c14500
- ldr r12, c0x22a453a0 ; [2217*4 | 5352*4]
- ldr lr, c0x00080008
- ror r5, r5, #16 ; [i2 | i3]
-
- qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift
- qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift
-
- add r0, r0, r2 ; update input pointer
-
- qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd
- ; with 2217*4 and 5352*4 without losing the
- ; sign bit (overflow)
-
- smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8
- smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8
-
- smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500)
- smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500)
-
- ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6]
-
- pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2
- pkhbt r6, r5, r7, lsl #4 ; [o3 | o2]
-
- str r6, [r1, #4]
-
- ; coeffs 4-7
- ror r9, r9, #16 ; [i6 | i7]
-
- qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift
- qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift
-
- add r0, r0, r2 ; update input pointer
-
- qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
- ; with 2217*4 and 5352*4 without losing the
- ; sign bit (overflow)
-
- smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8
- smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8
-
- smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500)
- smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500)
-
- ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10]
-
- pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2
- pkhbt r6, r8, r7, lsl #4 ; [o7 | o6]
-
- str r6, [r1, #12]
-
- ; coeffs 8-11
- ror r5, r5, #16 ; [i10 | i11]
-
- qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift
- qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift
-
- add r0, r0, r2 ; update input pointer
-
- qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
- ; with 2217*4 and 5352*4 without losing the
- ; sign bit (overflow)
-
- smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8
- smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8
-
- smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500)
- smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500)
-
- ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14]
-
- pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2
- pkhbt r6, r8, r7, lsl #4 ; [o11 | o10]
-
- str r6, [r1, #20]
-
- ; coeffs 12-15
- ror r5, r5, #16 ; [i14 | i15]
-
- qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift
- qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift
-
- qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
- ; with 2217*4 and 5352*4 without losing the
- ; sign bit (overflow)
-
- smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8
- smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8
-
- smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500)
- smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500)
-
- pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2
- pkhbt r6, r5, r7, lsl #4 ; [o15 | o14]
-
- str r6, [r1, #28]
-
-
- ; PART 2 -------------------------------------------------
- ldr r11, c12000
- ldr r10, c51000
- ldr lr, c0x00070007
-
- qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12]
- qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8]
- qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8]
- qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12]
-
- qadd16 r4, r4, lr ; a1 + 7
-
- add r0, r11, #0x10000 ; add (d!=0)
-
- qadd16 r2, r4, r5 ; a1 + b1 + 7
- qsub16 r3, r4, r5 ; a1 - b1 + 7
-
- ldr r12, c0x08a914e8 ; [2217 | 5352]
-
- lsl r8, r2, #16 ; prepare bottom halfword for scaling
- asr r2, r2, #4 ; scale top halfword
- lsl r9, r3, #16 ; prepare bottom halfword for scaling
- asr r3, r3, #4 ; scale top halfword
- pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
- pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
-
- smulbt r2, r6, r12 ; [ ------ | c1*2217]
- str r4, [r1, #0] ; [ o1 | o0]
- smultt r3, r6, r12 ; [c1*2217 | ------ ]
- str r5, [r1, #16] ; [ o9 | o8]
-
- smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
- smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
-
- smulbb r2, r6, r12 ; [ ------ | c1*5352]
- smultb r3, r6, r12 ; [c1*5352 | ------ ]
-
- lsls r6, r7, #16 ; d1 != 0 ?
- addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
- addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
- asrs r6, r7, #16
- addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
- addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
-
- smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
- smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
-
- pkhtb r9, r9, r8, asr #16
-
- sub r4, r4, r2
- sub r5, r5, r3
-
- ldr r3, [r1, #4] ; [i3 | i2]
-
- pkhtb r5, r5, r4, asr #16 ; [o13|o12]
-
- str r9, [r1, #8] ; [o5 | 04]
-
- ldr r9, [r1, #12] ; [i7 | i6]
- ldr r8, [r1, #28] ; [i15|i14]
- ldr r2, [r1, #20] ; [i11|i10]
- str r5, [r1, #24] ; [o13|o12]
-
- qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14]
- qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10]
-
- qadd16 r4, r4, lr ; a1 + 7
-
- qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10]
- qadd16 r2, r4, r5 ; a1 + b1 + 7
- qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14]
- qsub16 r3, r4, r5 ; a1 - b1 + 7
-
- lsl r8, r2, #16 ; prepare bottom halfword for scaling
- asr r2, r2, #4 ; scale top halfword
- lsl r9, r3, #16 ; prepare bottom halfword for scaling
- asr r3, r3, #4 ; scale top halfword
- pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
- pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
-
- smulbt r2, r6, r12 ; [ ------ | c1*2217]
- str r4, [r1, #4] ; [ o3 | o2]
- smultt r3, r6, r12 ; [c1*2217 | ------ ]
- str r5, [r1, #20] ; [ o11 | o10]
-
- smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
- smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
-
- smulbb r2, r6, r12 ; [ ------ | c1*5352]
- smultb r3, r6, r12 ; [c1*5352 | ------ ]
-
- lsls r6, r7, #16 ; d1 != 0 ?
- addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
- addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
-
- asrs r6, r7, #16
- addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
- addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
-
- smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
- smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
-
- pkhtb r9, r9, r8, asr #16
-
- sub r4, r4, r2
- sub r5, r5, r3
-
- str r9, [r1, #12] ; [o7 | o6]
- pkhtb r5, r5, r4, asr #16 ; [o15|o14]
-
- str r5, [r1, #28] ; [o15|o14]
-
- ldmfd sp!, {r4 - r12, pc}
-
- ENDP
-
-; Used constants
-c7500
- DCD 7500
-c14500
- DCD 14500
-c0x22a453a0
- DCD 0x22a453a0
-c0x00080008
- DCD 0x00080008
-c12000
- DCD 12000
-c51000
- DCD 51000
-c0x00070007
- DCD 0x00070007
-c0x08a914e8
- DCD 0x08a914e8
-
- END
--- a/vp9/encoder/arm/armv6/vp9_vp8_subtract_armv6.asm
+++ /dev/null
@@ -1,265 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_subtract_mby_armv6|
- EXPORT |vp8_subtract_mbuv_armv6|
- EXPORT |vp8_subtract_b_armv6|
-
- INCLUDE vp9_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 BLOCK *be
-; r1 BLOCKD *bd
-; r2 int pitch
-|vp8_subtract_b_armv6| PROC
-
- stmfd sp!, {r4-r9}
-
- ldr r4, [r0, #vp8_block_base_src]
- ldr r5, [r0, #vp8_block_src]
- ldr r6, [r0, #vp8_block_src_diff]
-
- ldr r3, [r4]
- ldr r7, [r0, #vp8_block_src_stride]
- add r3, r3, r5 ; src = *base_src + src
- ldr r8, [r1, #vp8_blockd_predictor]
-
- mov r9, #4 ; loop count
-
-loop_block
-
- ldr r0, [r3], r7 ; src
- ldr r1, [r8], r2 ; pred
-
- uxtb16 r4, r0 ; [s2 | s0]
- uxtb16 r5, r1 ; [p2 | p0]
- uxtb16 r0, r0, ror #8 ; [s3 | s1]
- uxtb16 r1, r1, ror #8 ; [p3 | p1]
-
- usub16 r4, r4, r5 ; [d2 | d0]
- usub16 r5, r0, r1 ; [d3 | d1]
-
- subs r9, r9, #1 ; decrement loop counter
-
- pkhbt r0, r4, r5, lsl #16 ; [d1 | d0]
- pkhtb r1, r5, r4, asr #16 ; [d3 | d2]
-
- str r0, [r6, #0] ; diff
- str r1, [r6, #4] ; diff
-
- add r6, r6, r2, lsl #1 ; update diff pointer
- bne loop_block
-
- ldmfd sp!, {r4-r9}
- mov pc, lr
-
- ENDP
-
-
-; r0 short *diff
-; r1 unsigned char *usrc
-; r2 unsigned char *vsrc
-; r3 unsigned char *pred
-; stack int stride
-|vp8_subtract_mbuv_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- add r0, r0, #512 ; set *diff point to Cb
- add r3, r3, #256 ; set *pred point to Cb
-
- mov r4, #8 ; loop count
- ldr r5, [sp, #40] ; stride
-
- ; Subtract U block
-loop_u
- ldr r6, [r1] ; src (A)
- ldr r7, [r3], #4 ; pred (A)
-
- uxtb16 r8, r6 ; [s2 | s0] (A)
- uxtb16 r9, r7 ; [p2 | p0] (A)
- uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
- uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
-
- usub16 r6, r8, r9 ; [d2 | d0] (A)
- usub16 r7, r10, r11 ; [d3 | d1] (A)
-
- ldr r10, [r1, #4] ; src (B)
- ldr r11, [r3], #4 ; pred (B)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
- str r8, [r0], #4 ; diff (A)
- uxtb16 r8, r10 ; [s2 | s0] (B)
- str r9, [r0], #4 ; diff (A)
-
- uxtb16 r9, r11 ; [p2 | p0] (B)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
-
- usub16 r6, r8, r9 ; [d2 | d0] (B)
- usub16 r7, r10, r11 ; [d3 | d1] (B)
-
- add r1, r1, r5 ; update usrc pointer
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
- str r8, [r0], #4 ; diff (B)
- subs r4, r4, #1 ; update loop counter
- str r9, [r0], #4 ; diff (B)
-
- bne loop_u
-
- mov r4, #8 ; loop count
-
- ; Subtract V block
-loop_v
- ldr r6, [r2] ; src (A)
- ldr r7, [r3], #4 ; pred (A)
-
- uxtb16 r8, r6 ; [s2 | s0] (A)
- uxtb16 r9, r7 ; [p2 | p0] (A)
- uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
- uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
-
- usub16 r6, r8, r9 ; [d2 | d0] (A)
- usub16 r7, r10, r11 ; [d3 | d1] (A)
-
- ldr r10, [r2, #4] ; src (B)
- ldr r11, [r3], #4 ; pred (B)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
- str r8, [r0], #4 ; diff (A)
- uxtb16 r8, r10 ; [s2 | s0] (B)
- str r9, [r0], #4 ; diff (A)
-
- uxtb16 r9, r11 ; [p2 | p0] (B)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
-
- usub16 r6, r8, r9 ; [d2 | d0] (B)
- usub16 r7, r10, r11 ; [d3 | d1] (B)
-
- add r2, r2, r5 ; update vsrc pointer
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
- str r8, [r0], #4 ; diff (B)
- subs r4, r4, #1 ; update loop counter
- str r9, [r0], #4 ; diff (B)
-
- bne loop_v
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-
-; r0 short *diff
-; r1 unsigned char *src
-; r2 unsigned char *pred
-; r3 int stride
-|vp8_subtract_mby_armv6| PROC
-
- stmfd sp!, {r4-r11}
-
- mov r4, #16
-loop
- ldr r6, [r1] ; src (A)
- ldr r7, [r2], #4 ; pred (A)
-
- uxtb16 r8, r6 ; [s2 | s0] (A)
- uxtb16 r9, r7 ; [p2 | p0] (A)
- uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
- uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
-
- usub16 r6, r8, r9 ; [d2 | d0] (A)
- usub16 r7, r10, r11 ; [d3 | d1] (A)
-
- ldr r10, [r1, #4] ; src (B)
- ldr r11, [r2], #4 ; pred (B)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
- str r8, [r0], #4 ; diff (A)
- uxtb16 r8, r10 ; [s2 | s0] (B)
- str r9, [r0], #4 ; diff (A)
-
- uxtb16 r9, r11 ; [p2 | p0] (B)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
-
- usub16 r6, r8, r9 ; [d2 | d0] (B)
- usub16 r7, r10, r11 ; [d3 | d1] (B)
-
- ldr r10, [r1, #8] ; src (C)
- ldr r11, [r2], #4 ; pred (C)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
- str r8, [r0], #4 ; diff (B)
- uxtb16 r8, r10 ; [s2 | s0] (C)
- str r9, [r0], #4 ; diff (B)
-
- uxtb16 r9, r11 ; [p2 | p0] (C)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (C)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (C)
-
- usub16 r6, r8, r9 ; [d2 | d0] (C)
- usub16 r7, r10, r11 ; [d3 | d1] (C)
-
- ldr r10, [r1, #12] ; src (D)
- ldr r11, [r2], #4 ; pred (D)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
-
- str r8, [r0], #4 ; diff (C)
- uxtb16 r8, r10 ; [s2 | s0] (D)
- str r9, [r0], #4 ; diff (C)
-
- uxtb16 r9, r11 ; [p2 | p0] (D)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (D)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (D)
-
- usub16 r6, r8, r9 ; [d2 | d0] (D)
- usub16 r7, r10, r11 ; [d3 | d1] (D)
-
- add r1, r1, r3 ; update src pointer
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D)
-
- str r8, [r0], #4 ; diff (D)
- subs r4, r4, #1 ; update loop counter
- str r9, [r0], #4 ; diff (D)
-
- bne loop
-
- ldmfd sp!, {r4-r11}
- mov pc, lr
-
- ENDP
-
- END
-
--- a/vp9/encoder/arm/armv6/vp9_vp8_variance16x16_armv6.asm
+++ /dev/null
@@ -1,154 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_variance16x16_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp9_variance16x16_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
-
-loop
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load 4 src pixels
- ldr r5, [r2, #0] ; load 4 ref pixels
-
- mov lr, #0 ; constant zero
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r9, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load 4 src pixels
- ldr r5, [r2, #4] ; load 4 ref pixels
- smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r9, r5, r4 ; calculate difference with reversed operands
- sel r6, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load 4 src pixels
- ldr r5, [r2, #8] ; load 4 ref pixels
- smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r9, r5, r4 ; calculate difference with reversed operands
- sel r6, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load 4 src pixels
- ldr r5, [r2, #12] ; load 4 ref pixels
- smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r9, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
-
-
- subs r12, r12, #1
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
- END
-
--- a/vp9/encoder/arm/armv6/vp9_vp8_variance8x8_armv6.asm
+++ /dev/null
@@ -1,101 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_variance8x8_armv6|
-
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp9_variance8x8_armv6| PROC
-
- push {r4-r10, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r12, #8 ; set loop counter to 8 (=block height)
- mov r4, #0 ; initialize sum = 0
- mov r5, #0 ; initialize sse = 0
-
-loop
- ; 1st 4 pixels
- ldr r6, [r0, #0x0] ; load 4 src pixels
- ldr r7, [r2, #0x0] ; load 4 ref pixels
-
- mov lr, #0 ; constant zero
-
- usub8 r8, r6, r7 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r10, r8, lr ; select bytes with positive difference
- usub8 r9, r7, r6 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r6, r10, lr ; calculate sum of positive differences
- usad8 r7, r8, lr ; calculate sum of negative differences
- orr r8, r8, r10 ; differences of all 4 pixels
- ; calculate total sum
- add r4, r4, r6 ; add positive differences to sum
- sub r4, r4, r7 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r7, r8 ; byte (two pixels) to halfwords
- uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
- smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r6, [r0, #0x4] ; load 4 src pixels
- ldr r7, [r2, #0x4] ; load 4 ref pixels
- smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
-
- usub8 r8, r6, r7 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r10, r8, lr ; select bytes with positive difference
- usub8 r9, r7, r6 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r8, r9, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r6, r10, lr ; calculate sum of positive differences
- usad8 r7, r8, lr ; calculate sum of negative differences
- orr r8, r8, r10 ; differences of all 4 pixels
-
- ; calculate total sum
- add r4, r4, r6 ; add positive differences to sum
- sub r4, r4, r7 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r7, r8 ; byte (two pixels) to halfwords
- uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
- smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
- subs r12, r12, #1 ; next row
- smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
-
- bne loop
-
- ; return stuff
- ldr r8, [sp, #32] ; get address of sse
- mul r1, r4, r4 ; sum * sum
- str r5, [r8] ; store sse
- sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
-
- pop {r4-r10, pc}
-
- ENDP
-
- END
--- a/vp9/encoder/arm/armv6/vp9_vp8_variance_halfpixvar16x16_h_armv6.asm
+++ /dev/null
@@ -1,182 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_variance_halfpixvar16x16_h_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp9_variance_halfpixvar16x16_h_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- ldr r10, c80808080
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov lr, #0 ; constant zero
-loop
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load 4 src pixels
- ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #0] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load 4 src pixels
- ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #4] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load 4 src pixels
- ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #8] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load 4 src pixels
- ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #12] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- subs r12, r12, #1
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-c80808080
- DCD 0x80808080
-
- END
-
--- a/vp9/encoder/arm/armv6/vp9_vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ /dev/null
@@ -1,222 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_variance_halfpixvar16x16_hv_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp9_variance_halfpixvar16x16_hv_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- ldr r10, c80808080
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov lr, #0 ; constant zero
-loop
- add r9, r0, r1 ; pointer to pixels on the next row
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load source pixels a, row N
- ldr r6, [r0, #1] ; load source pixels b, row N
- ldr r5, [r9, #0] ; load source pixels c, row N+1
- ldr r7, [r9, #1] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #0] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load source pixels a, row N
- ldr r6, [r0, #5] ; load source pixels b, row N
- ldr r5, [r9, #4] ; load source pixels c, row N+1
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- ldr r7, [r9, #5] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #4] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load source pixels a, row N
- ldr r6, [r0, #9] ; load source pixels b, row N
- ldr r5, [r9, #8] ; load source pixels c, row N+1
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- ldr r7, [r9, #9] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #8] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load source pixels a, row N
- ldr r6, [r0, #13] ; load source pixels b, row N
- ldr r5, [r9, #12] ; load source pixels c, row N+1
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
- ldr r7, [r9, #13] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #12] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- subs r12, r12, #1
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-c80808080
- DCD 0x80808080
-
- END
--- a/vp9/encoder/arm/armv6/vp9_vp8_variance_halfpixvar16x16_v_armv6.asm
+++ /dev/null
@@ -1,184 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_variance_halfpixvar16x16_v_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp9_variance_halfpixvar16x16_v_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- ldr r10, c80808080
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov lr, #0 ; constant zero
-loop
- add r9, r0, r1 ; set src pointer to next row
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load 4 src pixels
- ldr r6, [r9, #0] ; load 4 src pixels from next row
- ldr r5, [r2, #0] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load 4 src pixels
- ldr r6, [r9, #4] ; load 4 src pixels from next row
- ldr r5, [r2, #4] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load 4 src pixels
- ldr r6, [r9, #8] ; load 4 src pixels from next row
- ldr r5, [r2, #8] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load 4 src pixels
- ldr r6, [r9, #12] ; load 4 src pixels from next row
- ldr r5, [r2, #12] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; substract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
-
- subs r12, r12, #1
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-c80808080
- DCD 0x80808080
-
- END
-
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_memcpy_neon.asm
@@ -1,0 +1,68 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_memcpy_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;=========================================
+;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
+|vp8_memcpy_neon| PROC
+ ;Copy sz (r2) bytes from src_ptr (r1) to dst_ptr (r0): whole 256-byte
+ ;chunks first, then the remaining (sz & 0xff) bytes 16 at a time.
+ ;sz is expected to be a multiple of 16; a partial final block is still
+ ;copied as a full 16 bytes.
+ ;Only q0-q3 and q8-q15 are used, because q4-q7 (d8-d15) are callee-saved
+ ;under the AAPCS.
+
+ movs r12, r2, lsr #8 ;r12 = number of whole 256-byte chunks
+ beq memcpy_neon_extra ;sz < 256: skip the main loop entirely
+
+memcpy_neon_loop
+ vld1.8 {q0, q1}, [r1]! ;load src data
+ subs r12, r12, #1
+ vld1.8 {q2, q3}, [r1]!
+ vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr
+ vld1.8 {q8, q9}, [r1]!
+ vst1.8 {q2, q3}, [r0]!
+ vld1.8 {q10, q11}, [r1]!
+ vst1.8 {q8, q9}, [r0]!
+ vld1.8 {q12, q13}, [r1]!
+ vst1.8 {q10, q11}, [r0]!
+ vld1.8 {q14, q15}, [r1]!
+ vst1.8 {q12, q13}, [r0]!
+ vld1.8 {q0, q1}, [r1]! ;q0-q3 are free again: their data was stored above
+ vst1.8 {q14, q15}, [r0]!
+ vld1.8 {q2, q3}, [r1]!
+ vst1.8 {q0, q1}, [r0]!
+ vst1.8 {q2, q3}, [r0]!
+
+ ;NEON loads/stores leave the flags alone, so Z still reflects the subs above
+ bne memcpy_neon_loop
+
+memcpy_neon_extra
+ ands r3, r2, #0xff ;extra copy: bytes left after the 256-byte chunks
+ beq done_copy_neon_loop
+
+extra_copy_neon_loop
+ vld1.8 {q0}, [r1]! ;load src data
+ subs r3, r3, #16
+ vst1.8 {q0}, [r0]!
+ bgt extra_copy_neon_loop ;bgt (not bne) so a remainder that is not a multiple of 16 still terminates
+
+done_copy_neon_loop
+ bx lr
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_mse16x16_neon.asm
@@ -1,0 +1,116 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mse16x16_neon|
+ EXPORT |vp8_get4x4sse_cs_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;============================
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+;note: in this function, sum is never used. So, we can remove this part of calculation
+;from vp9_variance().
+
+|vp8_mse16x16_neon| PROC
+ ;Sum of squared differences over a 16x16 block; stored to *sse and returned in r0.
+ vmov.i8 q15, #0 ;q15, q8, q9, q10 - sse (q15, not q7: q4-q7 are callee-saved under the AAPCS)
+ vmov.i8 q8, #0
+ vmov.i8 q9, #0
+ vmov.i8 q10, #0
+ mov r12, #8 ;loop counter: 8 iterations x 2 rows each
+
+mse16x16_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;src - ref, widened to 16 bits
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vmlal.s16 q15, d22, d22 ;accumulate squared differences, 4 lanes at a time
+ vmlal.s16 q8, d23, d23
+
+ subs r12, r12, #1
+
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vmlal.s16 q15, d26, d26
+ vmlal.s16 q8, d27, d27
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne mse16x16_neon_loop
+
+ vadd.u32 q15, q15, q8 ;fold the four accumulators together
+ vadd.u32 q9, q9, q10
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vadd.u32 q10, q15, q9
+ vpaddl.u32 q1, q10
+ vadd.u64 d0, d2, d3 ;d0 = total sse
+
+ vst1.32 {d0[0]}, [r12] ;store sse
+ vmov.32 r0, d0[0] ;return sse
+
+ bx lr
+
+ ENDP
+
+
+;=============================
+; r0 unsigned char *src_ptr,
+; r1 int source_stride,
+; r2 unsigned char *ref_ptr,
+; r3 int recon_stride
+|vp8_get4x4sse_cs_neon| PROC
+ vld1.8 {d0}, [r0], r1 ;Load up source and reference (8 bytes per row, 4 rows each)
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d5}, [r2], r3
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d3}, [r0], r1
+ vld1.8 {d7}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;src - ref, widened to 16 bits
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+ ;Square only the low four lanes of each row: the block is 4 pixels wide.
+ vmull.s16 q15, d22, d22 ;q15, not q7: q4-q7 (d8-d15) are callee-saved under the AAPCS
+ vmull.s16 q8, d24, d24
+ vmull.s16 q9, d26, d26
+ vmull.s16 q10, d28, d28
+
+ vadd.u32 q15, q15, q8
+ vadd.u32 q9, q9, q10
+ vadd.u32 q9, q15, q9
+
+ vpaddl.u32 q1, q9
+ vadd.u64 d0, d2, d3 ;d0 = sum of all 16 squared differences
+
+ vmov.32 r0, d0[0] ;return sse
+ bx lr
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_shortwalsh4x4_neon.asm
@@ -1,0 +1,103 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_walsh4x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
+; r0 short *input,
+; r1 short *output,
+; r2 int pitch
+|vp8_short_walsh4x4_neon| PROC
+ ;4x4 Walsh transform of input into output: one butterfly pass over rows, then one over columns.
+ vld1.16 {d0}, [r0@64], r2 ; load input: four rows of 4 shorts, r2 = row stride in bytes
+ vld1.16 {d1}, [r0@64], r2
+ vld1.16 {d2}, [r0@64], r2
+ vld1.16 {d3}, [r0@64]
+
+ ;First for-loop
+ ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+
+ vmov.s32 q15, #3 ; rounding constant for the second pass: (x + 3) >> 3
+
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vadd.s16 d4, d0, d2 ; ip[0] + ip[2]
+ vadd.s16 d5, d1, d3 ; ip[1] + ip[3]
+ vsub.s16 d6, d1, d3 ; ip[1] - ip[3]
+ vsub.s16 d7, d0, d2 ; ip[0] - ip[2]
+
+ vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2
+ vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2
+ vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2
+ vceq.s16 d16, d4, #0 ; a1 == 0 (lane is all ones where true)
+ vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2
+
+ vadd.s16 d0, d4, d5 ; a1 + d1
+ vmvn d16, d16 ; a1 != 0 (lane is -1 where true)
+ vsub.s16 d3, d4, d5 ; op[3] = a1 - d1
+ vadd.s16 d1, d7, d6 ; op[1] = b1 + c1
+ vsub.s16 d2, d7, d6 ; op[2] = b1 - c1
+ vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0); subtracting -1 adds 1
+
+ ;Second for-loop
+ ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+ vtrn.32 d1, d3
+ vtrn.32 d0, d2
+ vtrn.16 d2, d3
+ vtrn.16 d0, d1
+
+ vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8]
+ vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12]
+ vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12]
+ vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8]
+
+ vadd.s32 q0, q8, q9 ; a2 = a1 + d1
+ vadd.s32 q1, q11, q10 ; b2 = b1 + c1
+ vsub.s32 q2, q11, q10 ; c2 = b1 - c1
+ vsub.s32 q3, q8, q9 ; d2 = a1 - d1
+
+ vclt.s32 q8, q0, #0 ; lane is -1 where the value is negative, else 0
+ vclt.s32 q9, q1, #0
+ vclt.s32 q10, q2, #0
+ vclt.s32 q11, q3, #0
+
+ ; subtract -1 (or 0)
+ vsub.s32 q0, q0, q8 ; a2 += a2 < 0
+ vsub.s32 q1, q1, q9 ; b2 += b2 < 0
+ vsub.s32 q2, q2, q10 ; c2 += c2 < 0
+ vsub.s32 q3, q3, q11 ; d2 += d2 < 0
+
+ vadd.s32 q8, q0, q15 ; a2 + 3
+ vadd.s32 q9, q1, q15 ; b2 + 3
+ vadd.s32 q10, q2, q15 ; c2 + 3
+ vadd.s32 q11, q3, q15 ; d2 + 3
+
+ ; vrshrn #3 would round by adding 1 << (3-1) = 4 rather than 3, so 3 is added above and vshrn truncates
+ vshrn.s32 d0, q8, #3
+ vshrn.s32 d1, q9, #3
+ vshrn.s32 d2, q10, #3
+ vshrn.s32 d3, q11, #3
+
+ vst1.16 {q0, q1}, [r1@128] ; store all 16 outputs; @128 asserts a 16-byte aligned output
+
+ bx lr
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_subpixelvariance16x16_neon.asm
@@ -1,0 +1,425 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_sub_pixel_variance16x16_neon_func|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict16x16_neon and vp9_variance16x16_neon.
+
+|vp9_sub_pixel_variance16x16_neon_func| PROC
+ push {r4-r6, lr}
+ ;Bilinear-filter a 16x16 block by (xoffset, yoffset), then return sse - ((sum * sum) >> 8) against dst_ptr; sse is also stored to *sse.
+ ldr r12, _BilinearTaps_coeff_
+ ldr r4, [sp, #16] ;load *dst_ptr from stack
+ ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
+ ldr r6, [sp, #24] ;load *sse from stack
+ ;NOTE(review): q4-q7 (d8-d15) are overwritten below without being saved; the AAPCS treats them as callee-saved - confirm.
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16_only
+
+ add r2, r12, r2, lsl #3 ;calculate filter location (8 bytes per tap pair)
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {d31}, [r2] ;load first_pass filter
+
+ beq firstpass_bfilter16x16_only
+
+ sub sp, sp, #272 ;reserve space on stack for temporary storage (17 rows x 16 bytes)
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ mov lr, sp
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ mov r2, #3 ;loop counter: 3 iterations x 4 rows, the last 5 rows are done after the loop
+ vld1.u8 {d8, d9, d10}, [r0], r1
+
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ vdup.8 d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16_loop_neon
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vqrshrn.u16 d21, q14, #7
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vst1.u8 {d18, d19, d20, d21}, [lr]!
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ bne vp8e_filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for rest 5 lines
+ vld1.u8 {d14, d15, d16}, [r0], r1
+
+ vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q10, d3, d0
+ vmull.u8 q11, d5, d0
+ vmull.u8 q12, d6, d0
+ vmull.u8 q13, d8, d0
+ vmull.u8 q14, d9, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+
+ vmlal.u8 q9, d2, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q11, d5, d1
+ vmlal.u8 q13, d8, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+
+ vmlal.u8 q10, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q12, d6, d1
+ vmlal.u8 q14, d9, d1
+
+ vmull.u8 q1, d11, d0
+ vmull.u8 q2, d12, d0
+ vmull.u8 q3, d14, d0
+ vmull.u8 q4, d15, d0
+
+ vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
+ vext.8 d14, d14, d15, #1
+
+ vmlal.u8 q1, d11, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q3, d14, d1
+
+ vext.8 d12, d12, d13, #1
+ vext.8 d15, d15, d16, #1
+
+ vmlal.u8 q2, d12, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q4, d15, d1
+
+ vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d11, q10, #7
+ vqrshrn.u16 d12, q11, #7
+ vqrshrn.u16 d13, q12, #7
+ vqrshrn.u16 d14, q13, #7
+ vqrshrn.u16 d15, q14, #7
+ vqrshrn.u16 d16, q1, #7
+ vqrshrn.u16 d17, q2, #7
+ vqrshrn.u16 d18, q3, #7
+ vqrshrn.u16 d19, q4, #7
+
+ vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
+ vst1.u8 {d14, d15, d16, d17}, [lr]!
+ vst1.u8 {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ sub lr, lr, #272 ;rewind lr to the start of the first-pass output
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ sub sp, sp, #256 ;second-pass output buffer; 272 + 256 = 528 bytes are released at exit
+ mov r3, sp
+
+ vld1.u8 {d22, d23}, [lr]! ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r12, #4 ;loop counter
+
+vp8e_filt_blk2d_sp16x16_loop_neon
+ vld1.u8 {d24, d25}, [lr]!
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vld1.u8 {d26, d27}, [lr]!
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [lr]!
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [lr]!
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ subs r12, r12, #1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r3]! ;store result
+ vst1.u8 {d4, d5}, [r3]!
+ vst1.u8 {d6, d7}, [r3]!
+ vmov q11, q15 ;last row loaded becomes the first row of the next iteration
+ vst1.u8 {d8, d9}, [r3]!
+
+ bne vp8e_filt_blk2d_sp16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;--------------------
+firstpass_bfilter16x16_only
+ mov r2, #4 ;loop counter
+ sub sp, sp, #528 ;reserve space on stack for temporary storage (same size as the two-pass path)
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16_loop_neon
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vld1.u8 {d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+ vst1.u8 {d14, d15}, [r3]! ;store result
+ vqrshrn.u16 d21, q14, #7
+
+ vst1.u8 {d16, d17}, [r3]!
+ vst1.u8 {d18, d19}, [r3]!
+ vst1.u8 {d20, d21}, [r3]!
+
+ bne vp8e_filt_blk2d_fpo16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+ sub sp, sp, #528 ;reserve space on stack for temporary storage (same size as the two-pass path)
+ add r3, r12, r3, lsl #3
+ mov r12, #4 ;loop counter
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ vld1.u8 {d22, d23}, [r0], r1 ;load src data
+ mov r3, sp
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+vp8e_filt_blk2d_spo16x16_loop_neon
+ vld1.u8 {d24, d25}, [r0], r1
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vld1.u8 {d26, d27}, [r0], r1
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [r0], r1
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [r0], r1
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r3]! ;store result
+ subs r12, r12, #1
+ vst1.u8 {d4, d5}, [r3]!
+ vmov q11, q15 ;last row loaded becomes the first row of the next iteration
+ vst1.u8 {d6, d7}, [r3]!
+ vst1.u8 {d8, d9}, [r3]!
+
+ bne vp8e_filt_blk2d_spo16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ sub r3, r3, #256 ;rewind r3 to the start of the filtered 16x16 block
+ mov r12, #8
+
+sub_pixel_variance16x16_neon_loop
+ vld1.8 {q0}, [r3]! ;Load up source and reference
+ vld1.8 {q2}, [r4], r5
+ vld1.8 {q1}, [r3]!
+ vld1.8 {q3}, [r4], r5
+
+ vsubl.u8 q11, d0, d4 ;diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne sub_pixel_variance16x16_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0 ;sum * sum
+ vst1.32 {d1[0]}, [r6] ;store sse
+ vshr.u32 d10, d10, #8 ;unsigned shift: sum * sum can reach 65280^2 >= 2^31, which a signed shift would sign-extend
+ vsub.u32 d0, d1, d10 ;sse - ((sum * sum) >> 8)
+
+ add sp, sp, #528
+ vmov.32 r0, d0[0] ;return
+
+ pop {r4-r6,pc}
+
+ ENDP
+
+;-----------------
+
+_BilinearTaps_coeff_
+ DCD bilinear_taps_coeff ;address of the tap table; the function above loads it with ldr r12, _BilinearTaps_coeff_
+bilinear_taps_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 ;8 tap pairs of two words (8 bytes) each, indexed by offset << 3; every pair sums to 128
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_subpixelvariance16x16s_neon.asm
@@ -1,0 +1,572 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_variance_halfpixvar16x16_h_neon|
+ EXPORT |vp9_variance_halfpixvar16x16_v_neon|
+ EXPORT |vp9_variance_halfpixvar16x16_hv_neon|
+ EXPORT |vp9_sub_pixel_variance16x16s_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;================================================
+;unsigned int vp9_variance_halfpixvar16x16_h_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp9_variance_halfpixvar16x16_h_neon| PROC
+ push {lr}
+ ;Variance of a 16x16 block after horizontal half-pel averaging: returns sse - ((sum * sum) >> 8), sse also stored to *sse.
+ mov r12, #4 ;loop counter: 4 iterations x 4 rows
+ ldr lr, [sp, #4] ;load *sse from stack
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8_filt_fpo16x16s_4_0_loop_neon
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ vld1.8 {q11}, [r2], r3
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.8 {q12}, [r2], r3
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.8 {q13}, [r2], r3
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+
+ ;Each source row is loaded as 32 bytes so the pixel to the right of column 15 is available.
+ ;NOTE(review): q4-q7 (d8-d15) are overwritten without being saved; the AAPCS
+ ;treats them as callee-saved - confirm callers tolerate this.
+
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ vext.8 q3, q2, q3, #1
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vld1.8 {q14}, [r2], r3
+ vrhadd.u8 q1, q2, q3
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+
+ vsubl.u8 q4, d0, d22 ;diff
+ vsubl.u8 q5, d1, d23
+ vsubl.u8 q6, d2, d24
+ vsubl.u8 q7, d3, d25
+ vsubl.u8 q0, d4, d26
+ vsubl.u8 q1, d5, d27
+ vsubl.u8 q2, d6, d28
+ vsubl.u8 q3, d7, d29
+
+ vpadal.s16 q8, q4 ;sum
+ vmlal.s16 q9, d8, d8 ;sse
+ vmlal.s16 q10, d9, d9
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q5
+ vmlal.s16 q9, d10, d10
+ vmlal.s16 q10, d11, d11
+ vpadal.s16 q8, q6
+ vmlal.s16 q9, d12, d12
+ vmlal.s16 q10, d13, d13
+ vpadal.s16 q8, q7
+ vmlal.s16 q9, d14, d14
+ vmlal.s16 q10, d15, d15
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne vp8_filt_fpo16x16s_4_0_loop_neon
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0 ;sum * sum
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #8 ;unsigned shift: sum * sum can reach 65280^2 >= 2^31, which a signed shift would sign-extend
+ vsub.u32 d0, d1, d10 ;sse - ((sum * sum) >> 8)
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;================================================
+;unsigned int vp9_variance_halfpixvar16x16_v_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp9_variance_halfpixvar16x16_v_neon| PROC
+ push {lr}
+ ;Variance of a 16x16 block after vertical half-pel averaging: returns sse - ((sum * sum) >> 8), sse also stored to *sse.
+ mov r12, #4 ;loop counter: 4 iterations x 4 rows
+
+ vld1.u8 {q0}, [r0], r1 ;load src data
+ ldr lr, [sp, #4] ;load *sse from stack
+ ;NOTE(review): q4-q7 (d8-d15) are overwritten without being saved; the AAPCS treats them as callee-saved - confirm.
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+vp8_filt_spo16x16s_0_4_loop_neon
+ vld1.u8 {q2}, [r0], r1
+ vld1.8 {q1}, [r2], r3
+ vld1.u8 {q4}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+ vld1.u8 {q6}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+ vld1.u8 {q15}, [r0], r1
+
+ vrhadd.u8 q0, q0, q2 ;average each row with the row below it
+ vld1.8 {q7}, [r2], r3
+ vrhadd.u8 q2, q2, q4
+ vrhadd.u8 q4, q4, q6
+ vrhadd.u8 q6, q6, q15
+
+ vsubl.u8 q11, d0, d2 ;diff
+ vsubl.u8 q12, d1, d3
+ vsubl.u8 q13, d4, d6
+ vsubl.u8 q14, d5, d7
+ vsubl.u8 q0, d8, d10
+ vsubl.u8 q1, d9, d11
+ vsubl.u8 q2, d12, d14
+ vsubl.u8 q3, d13, d15
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+
+ vmov q0, q15 ;last row loaded becomes the first row of the next iteration
+
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne vp8_filt_spo16x16s_0_4_loop_neon
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0 ;sum * sum
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #8 ;unsigned shift: sum * sum can reach 65280^2 >= 2^31, which a signed shift would sign-extend
+ vsub.u32 d0, d1, d10 ;sse - ((sum * sum) >> 8)
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;================================================
+;unsigned int vp9_variance_halfpixvar16x16_hv_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp9_variance_halfpixvar16x16_hv_neon| PROC
+ push {lr}
+ ;Variance of a 16x16 block after horizontal then vertical half-pel averaging: returns sse - ((sum * sum) >> 8).
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+
+ ldr lr, [sp, #4] ;load *sse from stack
+ vmov.i8 q13, #0 ;q13 - sum
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+
+ vmov.i8 q14, #0 ;q14, q15 - sse
+ vmov.i8 q15, #0
+
+ mov r12, #4 ;loop counter
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8_filt16x16s_4_4_loop_neon
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+
+ ;q0 holds the horizontally filtered row carried over from the previous iteration.
+ ;NOTE(review): q4-q7 (d8-d15) are overwritten without being saved; the AAPCS
+ ;treats them as callee-saved - confirm callers tolerate this.
+
+ vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+ vext.8 q9, q8, q9, #1
+
+ vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+
+ vld1.8 {q5}, [r2], r3
+ vrhadd.u8 q0, q0, q1 ;vertical average of adjacent filtered rows
+ vld1.8 {q6}, [r2], r3
+ vrhadd.u8 q1, q1, q2
+ vld1.8 {q7}, [r2], r3
+ vrhadd.u8 q2, q2, q3
+ vld1.8 {q8}, [r2], r3
+ vrhadd.u8 q3, q3, q4
+
+ vsubl.u8 q9, d0, d10 ;diff
+ vsubl.u8 q10, d1, d11
+ vsubl.u8 q11, d2, d12
+ vsubl.u8 q12, d3, d13
+
+ vsubl.u8 q0, d4, d14 ;diff
+ vsubl.u8 q1, d5, d15
+ vsubl.u8 q5, d6, d16
+ vsubl.u8 q6, d7, d17
+
+ vpadal.s16 q13, q9 ;sum
+ vmlal.s16 q14, d18, d18 ;sse
+ vmlal.s16 q15, d19, d19
+
+ vpadal.s16 q13, q10 ;sum
+ vmlal.s16 q14, d20, d20 ;sse
+ vmlal.s16 q15, d21, d21
+
+ vpadal.s16 q13, q11 ;sum
+ vmlal.s16 q14, d22, d22 ;sse
+ vmlal.s16 q15, d23, d23
+
+ vpadal.s16 q13, q12 ;sum
+ vmlal.s16 q14, d24, d24 ;sse
+ vmlal.s16 q15, d25, d25
+
+ subs r12, r12, #1
+
+ vpadal.s16 q13, q0 ;sum
+ vmlal.s16 q14, d0, d0 ;sse
+ vmlal.s16 q15, d1, d1
+
+ vpadal.s16 q13, q1 ;sum
+ vmlal.s16 q14, d2, d2 ;sse
+ vmlal.s16 q15, d3, d3
+
+ vpadal.s16 q13, q5 ;sum
+ vmlal.s16 q14, d10, d10 ;sse
+ vmlal.s16 q15, d11, d11
+
+ vmov q0, q4 ;carry the last filtered row into the next iteration
+
+ vpadal.s16 q13, q6 ;sum
+ vmlal.s16 q14, d12, d12 ;sse
+ vmlal.s16 q15, d13, d13
+
+ bne vp8_filt16x16s_4_4_loop_neon
+
+ vadd.u32 q15, q14, q15 ;accumulate sse
+ vpaddl.s32 q0, q13 ;accumulate sum
+
+ vpaddl.u32 q1, q15
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0 ;sum * sum
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #8 ;unsigned shift: sum * sum can reach 65280^2 >= 2^31, which a signed shift would sign-extend
+ vsub.u32 d0, d1, d10 ;sse - ((sum * sum) >> 8)
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;==============================
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pixels_per_line,
+; stack unsigned int *sse
+;note: in vp8_find_best_half_pixel_step() (called when 8<Speed<15) and in the first call of vp8_find_best_sub_pixel_step()
+;(called when speed<=8), xoffset/yoffset can only be 4 or 0, which means each filter is either bypassed
+;or has coefficients {64, 64}. This simplified routine only works in that situation.
+;note: xoffset and yoffset can both be zero. That case can be handled in C code later.
+
+|vp9_sub_pixel_variance16x16s_neon| PROC
+ push {r4, lr}
+ ;Half-pel-only 16x16 sub-pixel variance: filter into a 256-byte stack buffer, then return sse - ((sum * sum) >> 8).
+ ldr r4, [sp, #8] ;load *dst_ptr from stack
+ ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
+ ldr lr, [sp, #16] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16s_only
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq firstpass_bfilter16x16s_only
+
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ mov r3, sp
+ mov r2, #4 ;loop counter
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16s_loop_neon
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+
+ ;q0 holds the horizontally filtered row carried over from the previous iteration.
+ ;NOTE(review): q4-q7 (d8-d15) are overwritten without being saved; the AAPCS
+ ;treats them as callee-saved - confirm callers tolerate this.
+
+ vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+ vext.8 q9, q8, q9, #1
+
+ vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+
+ vrhadd.u8 q0, q0, q1 ;vertical average of adjacent filtered rows
+ vrhadd.u8 q1, q1, q2
+ vrhadd.u8 q2, q2, q3
+ vrhadd.u8 q3, q3, q4
+
+ subs r2, r2, #1
+ vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result
+ vmov q0, q4 ;carry the last filtered row into the next iteration
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+
+ bne vp8e_filt_blk2d_fp16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;--------------------
+firstpass_bfilter16x16s_only
+ mov r2, #2 ;loop counter: 2 iterations x 8 rows
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+ mov r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16s_loop_neon
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+
+ ;Each source row is loaded as 32 bytes so the pixel to the right of
+ ;column 15 is available; vext then builds the row shifted by one pixel
+ ;and vrhadd averages the two.
+
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+ vext.8 q3, q2, q3, #1
+ vld1.u8 {d20, d21, d22, d23}, [r0], r1
+ vext.8 q5, q4, q5, #1
+ vld1.u8 {d24, d25, d26, d27}, [r0], r1
+ vext.8 q7, q6, q7, #1
+ vld1.u8 {d28, d29, d30, d31}, [r0], r1
+ vext.8 q9, q8, q9, #1
+ vext.8 q11, q10, q11, #1
+ vext.8 q13, q12, q13, #1
+ vext.8 q15, q14, q15, #1
+
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q1, q2, q3
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+ vrhadd.u8 q5, q10, q11
+ vrhadd.u8 q6, q12, q13
+ vrhadd.u8 q7, q14, q15
+
+ subs r2, r2, #1
+
+ vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+ vst1.u8 {d8, d9, d10, d11}, [r3]!
+ vst1.u8 {d12, d13, d14, d15}, [r3]!
+
+ bne vp8e_filt_blk2d_fpo16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;---------------------
+secondpass_bfilter16x16s_only
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+
+ mov r2, #2 ;loop counter: 2 iterations x 8 rows
+ vld1.u8 {d0, d1}, [r0], r1 ;load src data
+ mov r3, sp
+
+vp8e_filt_blk2d_spo16x16s_loop_neon
+ vld1.u8 {d2, d3}, [r0], r1
+ vld1.u8 {d4, d5}, [r0], r1
+ vld1.u8 {d6, d7}, [r0], r1
+ vld1.u8 {d8, d9}, [r0], r1
+
+ vrhadd.u8 q0, q0, q1 ;average each row with the row below it
+ vld1.u8 {d10, d11}, [r0], r1
+ vrhadd.u8 q1, q1, q2
+ vld1.u8 {d12, d13}, [r0], r1
+ vrhadd.u8 q2, q2, q3
+ vld1.u8 {d14, d15}, [r0], r1
+ vrhadd.u8 q3, q3, q4
+ vld1.u8 {d16, d17}, [r0], r1
+ vrhadd.u8 q4, q4, q5
+ vrhadd.u8 q5, q5, q6
+ vrhadd.u8 q6, q6, q7
+ vrhadd.u8 q7, q7, q8
+
+ subs r2, r2, #1
+
+ vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
+ vmov q0, q8 ;last row loaded becomes the first row of the next iteration
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+ vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
+ vst1.u8 {d12, d13, d14, d15}, [r3]!
+
+ bne vp8e_filt_blk2d_spo16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16s_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ sub r3, r3, #256 ;rewind r3 to the start of the filtered 16x16 block
+ mov r2, #4
+
+sub_pixel_variance16x16s_neon_loop
+ vld1.8 {q0}, [r3]! ;Load up source and reference
+ vld1.8 {q1}, [r4], r12
+ vld1.8 {q2}, [r3]!
+ vld1.8 {q3}, [r4], r12
+ vld1.8 {q4}, [r3]!
+ vld1.8 {q5}, [r4], r12
+ vld1.8 {q6}, [r3]!
+ vld1.8 {q7}, [r4], r12
+
+ vsubl.u8 q11, d0, d2 ;diff
+ vsubl.u8 q12, d1, d3
+ vsubl.u8 q13, d4, d6
+ vsubl.u8 q14, d5, d7
+ vsubl.u8 q0, d8, d10
+ vsubl.u8 q1, d9, d11
+ vsubl.u8 q2, d12, d14
+ vsubl.u8 q3, d13, d15
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r2, r2, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne sub_pixel_variance16x16s_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0 ;sum * sum
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #8 ;unsigned shift: sum * sum can reach 65280^2 >= 2^31, which a signed shift would sign-extend
+ vsub.u32 d0, d1, d10 ;sse - ((sum * sum) >> 8)
+
+ add sp, sp, #256
+ vmov.32 r0, d0[0] ;return
+
+ pop {r4, pc}
+ ENDP
+
+ END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_subpixelvariance8x8_neon.asm
@@ -1,0 +1,224 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_sub_pixel_variance8x8_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(lr) unsigned int *sse
+;note: most of the code is copied from bilinear_predict8x8_neon and vp9_variance8x8_neon.
+;returns sse - ((sum * sum) >> 6) in r0 and stores sse through the sse pointer; d8-d15 are preserved.
+|vp9_sub_pixel_variance8x8_neon| PROC
+ push {r4-r5, lr}
+ vpush {d8-d15} ;q4-q7 are callee-saved under the AAPCS and are clobbered below
+ ldr r12, _BilinearTaps_coeff_ ;r12 = address of bilinear_taps_coeff
+ ldr r4, [sp, #76] ;load *dst_ptr from stack (past 12 bytes of core regs + 64 bytes of d8-d15)
+ ldr r5, [sp, #80] ;load dst_pixels_per_line from stack
+ ldr lr, [sp, #84] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+ add r2, r12, r2, lsl #3 ;calculate filter location (8 bytes per tap pair)
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vld1.u32 {d31}, [r2] ;load first_pass filter
+ vld1.u8 {q2}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {q3}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {q4}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
+ vld1.u8 {q2}, [r0], r1
+ vqrshrn.u16 d23, q7, #7
+ vld1.u8 {q3}, [r0], r1
+ vqrshrn.u16 d24, q8, #7
+ vld1.u8 {q4}, [r0], r1
+ vqrshrn.u16 d25, q9, #7
+
+ ;first_pass filtering on the remaining 5 lines of data
+ vld1.u8 {q5}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+ vext.8 d11, d10, d11, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+ vmlal.u8 q10, d11, d1
+
+ vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d27, q7, #7
+ vqrshrn.u16 d28, q8, #7
+ vqrshrn.u16 d29, q9, #7
+ vqrshrn.u16 d30, q10, #7
+
+;Second pass: 8x8 (input rows are d22-d30)
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ ;skip_secondpass_filter
+ beq sub_pixel_variance8x8_neon
+
+ add r3, r12, r3, lsl #3 ;calculate filter location (8 bytes per tap pair)
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q2, d23, d0
+ vmull.u8 q3, d24, d0
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d24, d1
+ vmlal.u8 q3, d25, d1
+ vmlal.u8 q4, d26, d1
+ vmlal.u8 q5, d27, d1
+ vmlal.u8 q6, d28, d1
+ vmlal.u8 q7, d29, d1
+ vmlal.u8 q8, d30, d1
+
+ vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d23, q2, #7
+ vqrshrn.u16 d24, q3, #7
+ vqrshrn.u16 d25, q4, #7
+ vqrshrn.u16 d26, q5, #7
+ vqrshrn.u16 d27, q6, #7
+ vqrshrn.u16 d28, q7, #7
+ vqrshrn.u16 d29, q8, #7
+
+ b sub_pixel_variance8x8_neon
+
+;--------------------
+skip_firstpass_filter
+ vld1.u8 {d22}, [r0], r1 ;load src data (9 rows of 8 pixels, unfiltered)
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+ vld1.u8 {d27}, [r0], r1
+ vld1.u8 {d28}, [r0], r1
+ vld1.u8 {d29}, [r0], r1
+ vld1.u8 {d30}, [r0], r1
+
+ b secondpass_filter
+
+;----------------------
+;vp9_variance8x8_neon: compare the 8 predicted rows in d22-d29 with dst
+sub_pixel_variance8x8_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #2 ;loop counter: 2 iterations x 4 rows
+
+sub_pixel_variance8x8_neon_loop
+ vld1.8 {d0}, [r4], r5 ;load dst data
+ subs r12, r12, #1
+ vld1.8 {d1}, [r4], r5
+ vld1.8 {d2}, [r4], r5
+ vsubl.u8 q4, d22, d0 ;calculate diff
+ vld1.8 {d3}, [r4], r5
+
+ vsubl.u8 q5, d23, d1
+ vsubl.u8 q6, d24, d2
+
+ vpadal.s16 q8, q4 ;sum
+ vmlal.s16 q9, d8, d8 ;sse
+ vmlal.s16 q10, d9, d9
+
+ vsubl.u8 q7, d25, d3
+
+ vpadal.s16 q8, q5
+ vmlal.s16 q9, d10, d10
+ vmlal.s16 q10, d11, d11
+
+ vmov q11, q13 ;move rows 4-5 into place for the second iteration
+
+ vpadal.s16 q8, q6
+ vmlal.s16 q9, d12, d12
+ vmlal.s16 q10, d13, d13
+
+ vmov q12, q14 ;move rows 6-7 into place for the second iteration
+
+ vpadal.s16 q8, q7
+ vmlal.s16 q9, d14, d14
+ vmlal.s16 q10, d15, d15
+
+ bne sub_pixel_variance8x8_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1 ;d0 = total sum
+ vadd.u64 d1, d2, d3 ;d1 = total sse
+
+ vmull.s32 q5, d0, d0 ;low word of d10 = sum * sum (never negative)
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #6 ;sum*sum is non-negative, so shift it as unsigned (64 pixels)
+ vsub.u32 d0, d1, d10 ;variance = sse - ((sum * sum) >> 6)
+ vpop {d8-d15} ;restore callee-saved NEON registers (result stays in d0)
+ vmov.32 r0, d0[0] ;return
+ pop {r4-r5, pc}
+
+ ENDP
+
+;-----------------
+
+_BilinearTaps_coeff_
+ DCD bilinear_taps_coeff ;literal word holding the address of the tap table below
+bilinear_taps_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 ;eight (Filter[0], Filter[1]) word pairs, each pair summing to 128
+
+ END
--- a/vp9/encoder/arm/neon/vp9_vp8_memcpy_neon.asm
+++ /dev/null
@@ -1,68 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_memcpy_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;=========================================
-;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
-|vp8_memcpy_neon| PROC
- ;pld [r1] ;preload pred data
- ;pld [r1, #128]
- ;pld [r1, #256]
- ;pld [r1, #384]
-
- mov r12, r2, lsr #8 ;copy 256 bytes data at one time
-
-memcpy_neon_loop
- vld1.8 {q0, q1}, [r1]! ;load src data
- subs r12, r12, #1
- vld1.8 {q2, q3}, [r1]!
- vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr
- vld1.8 {q4, q5}, [r1]!
- vst1.8 {q2, q3}, [r0]!
- vld1.8 {q6, q7}, [r1]!
- vst1.8 {q4, q5}, [r0]!
- vld1.8 {q8, q9}, [r1]!
- vst1.8 {q6, q7}, [r0]!
- vld1.8 {q10, q11}, [r1]!
- vst1.8 {q8, q9}, [r0]!
- vld1.8 {q12, q13}, [r1]!
- vst1.8 {q10, q11}, [r0]!
- vld1.8 {q14, q15}, [r1]!
- vst1.8 {q12, q13}, [r0]!
- vst1.8 {q14, q15}, [r0]!
-
- ;pld [r1] ;preload pred data -- need to adjust for real device
- ;pld [r1, #128]
- ;pld [r1, #256]
- ;pld [r1, #384]
-
- bne memcpy_neon_loop
-
- ands r3, r2, #0xff ;extra copy
- beq done_copy_neon_loop
-
-extra_copy_neon_loop
- vld1.8 {q0}, [r1]! ;load src data
- subs r3, r3, #16
- vst1.8 {q0}, [r0]!
- bne extra_copy_neon_loop
-
-done_copy_neon_loop
- bx lr
- ENDP
-
- END
--- a/vp9/encoder/arm/neon/vp9_vp8_mse16x16_neon.asm
+++ /dev/null
@@ -1,116 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_mse16x16_neon|
- EXPORT |vp8_get4x4sse_cs_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;============================
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-;note: in this function, sum is never used. So, we can remove this part of calculation
-;from vp9_variance().
-
-|vp8_mse16x16_neon| PROC
- vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
- vmov.i8 q8, #0
- vmov.i8 q9, #0
- vmov.i8 q10, #0
-
- mov r12, #8
-
-mse16x16_neon_loop
- vld1.8 {q0}, [r0], r1 ;Load up source and reference
- vld1.8 {q2}, [r2], r3
- vld1.8 {q1}, [r0], r1
- vld1.8 {q3}, [r2], r3
-
- vsubl.u8 q11, d0, d4
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- vmlal.s16 q7, d22, d22
- vmlal.s16 q8, d23, d23
-
- subs r12, r12, #1
-
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vmlal.s16 q7, d26, d26
- vmlal.s16 q8, d27, d27
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- bne mse16x16_neon_loop
-
- vadd.u32 q7, q7, q8
- vadd.u32 q9, q9, q10
-
- ldr r12, [sp] ;load *sse from stack
-
- vadd.u32 q10, q7, q9
- vpaddl.u32 q1, q10
- vadd.u64 d0, d2, d3
-
- vst1.32 {d0[0]}, [r12]
- vmov.32 r0, d0[0]
-
- bx lr
-
- ENDP
-
-
-;=============================
-; r0 unsigned char *src_ptr,
-; r1 int source_stride,
-; r2 unsigned char *ref_ptr,
-; r3 int recon_stride
-|vp8_get4x4sse_cs_neon| PROC
- vld1.8 {d0}, [r0], r1 ;Load up source and reference
- vld1.8 {d4}, [r2], r3
- vld1.8 {d1}, [r0], r1
- vld1.8 {d5}, [r2], r3
- vld1.8 {d2}, [r0], r1
- vld1.8 {d6}, [r2], r3
- vld1.8 {d3}, [r0], r1
- vld1.8 {d7}, [r2], r3
-
- vsubl.u8 q11, d0, d4
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- vmull.s16 q7, d22, d22
- vmull.s16 q8, d24, d24
- vmull.s16 q9, d26, d26
- vmull.s16 q10, d28, d28
-
- vadd.u32 q7, q7, q8
- vadd.u32 q9, q9, q10
- vadd.u32 q9, q7, q9
-
- vpaddl.u32 q1, q9
- vadd.u64 d0, d2, d3
-
- vmov.32 r0, d0[0]
- bx lr
-
- ENDP
-
- END
--- a/vp9/encoder/arm/neon/vp9_vp8_shortwalsh4x4_neon.asm
+++ /dev/null
@@ -1,103 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_short_walsh4x4_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
-; r0 short *input,
-; r1 short *output,
-; r2 int pitch
-|vp8_short_walsh4x4_neon| PROC
-
- vld1.16 {d0}, [r0@64], r2 ; load input
- vld1.16 {d1}, [r0@64], r2
- vld1.16 {d2}, [r0@64], r2
- vld1.16 {d3}, [r0@64]
-
- ;First for-loop
- ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
- vtrn.32 d0, d2
- vtrn.32 d1, d3
-
- vmov.s32 q15, #3 ; add 3 to all values
-
- vtrn.16 d0, d1
- vtrn.16 d2, d3
-
- vadd.s16 d4, d0, d2 ; ip[0] + ip[2]
- vadd.s16 d5, d1, d3 ; ip[1] + ip[3]
- vsub.s16 d6, d1, d3 ; ip[1] - ip[3]
- vsub.s16 d7, d0, d2 ; ip[0] - ip[2]
-
- vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2
- vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2
- vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2
- vceq.s16 d16, d4, #0 ; a1 == 0
- vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2
-
- vadd.s16 d0, d4, d5 ; a1 + d1
- vmvn d16, d16 ; a1 != 0
- vsub.s16 d3, d4, d5 ; op[3] = a1 - d1
- vadd.s16 d1, d7, d6 ; op[1] = b1 + c1
- vsub.s16 d2, d7, d6 ; op[2] = b1 - c1
- vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0)
-
- ;Second for-loop
- ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
- vtrn.32 d1, d3
- vtrn.32 d0, d2
- vtrn.16 d2, d3
- vtrn.16 d0, d1
-
- vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8]
- vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12]
- vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12]
- vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8]
-
- vadd.s32 q0, q8, q9 ; a2 = a1 + d1
- vadd.s32 q1, q11, q10 ; b2 = b1 + c1
- vsub.s32 q2, q11, q10 ; c2 = b1 - c1
- vsub.s32 q3, q8, q9 ; d2 = a1 - d1
-
- vclt.s32 q8, q0, #0
- vclt.s32 q9, q1, #0
- vclt.s32 q10, q2, #0
- vclt.s32 q11, q3, #0
-
- ; subtract -1 (or 0)
- vsub.s32 q0, q0, q8 ; a2 += a2 < 0
- vsub.s32 q1, q1, q9 ; b2 += b2 < 0
- vsub.s32 q2, q2, q10 ; c2 += c2 < 0
- vsub.s32 q3, q3, q11 ; d2 += d2 < 0
-
- vadd.s32 q8, q0, q15 ; a2 + 3
- vadd.s32 q9, q1, q15 ; b2 + 3
- vadd.s32 q10, q2, q15 ; c2 + 3
- vadd.s32 q11, q3, q15 ; d2 + 3
-
- ; vrshrn? would add 1 << 3-1 = 2
- vshrn.s32 d0, q8, #3
- vshrn.s32 d1, q9, #3
- vshrn.s32 d2, q10, #3
- vshrn.s32 d3, q11, #3
-
- vst1.16 {q0, q1}, [r1@128]
-
- bx lr
-
- ENDP
-
- END
--- a/vp9/encoder/arm/neon/vp9_vp8_subpixelvariance16x16_neon.asm
+++ /dev/null
@@ -1,425 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_sub_pixel_variance16x16_neon_func|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict16x16_neon and vp9_variance16x16_neon.
-
-|vp9_sub_pixel_variance16x16_neon_func| PROC
- push {r4-r6, lr}
-
- ldr r12, _BilinearTaps_coeff_
- ldr r4, [sp, #16] ;load *dst_ptr from stack
- ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
- ldr r6, [sp, #24] ;load *sse from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_bfilter16x16_only
-
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
-
- vld1.s32 {d31}, [r2] ;load first_pass filter
-
- beq firstpass_bfilter16x16_only
-
- sub sp, sp, #272 ;reserve space on stack for temporary storage
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- mov lr, sp
- vld1.u8 {d5, d6, d7}, [r0], r1
-
- mov r2, #3 ;loop counter
- vld1.u8 {d8, d9, d10}, [r0], r1
-
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- vdup.8 d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16_loop_neon
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d5, d0
- vmull.u8 q10, d6, d0
- vmull.u8 q11, d8, d0
- vmull.u8 q12, d9, d0
- vmull.u8 q13, d11, d0
- vmull.u8 q14, d12, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
- vext.8 d11, d11, d12, #1
-
- vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q9, d5, d1
- vmlal.u8 q11, d8, d1
- vmlal.u8 q13, d11, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
- vext.8 d12, d12, d13, #1
-
- vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q10, d6, d1
- vmlal.u8 q12, d9, d1
- vmlal.u8 q14, d12, d1
-
- subs r2, r2, #1
-
- vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d15, q8, #7
- vqrshrn.u16 d16, q9, #7
- vqrshrn.u16 d17, q10, #7
- vqrshrn.u16 d18, q11, #7
- vqrshrn.u16 d19, q12, #7
- vqrshrn.u16 d20, q13, #7
-
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- vqrshrn.u16 d21, q14, #7
- vld1.u8 {d5, d6, d7}, [r0], r1
-
- vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
- vld1.u8 {d8, d9, d10}, [r0], r1
- vst1.u8 {d18, d19, d20, d21}, [lr]!
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- bne vp8e_filt_blk2d_fp16x16_loop_neon
-
-;First-pass filtering for rest 5 lines
- vld1.u8 {d14, d15, d16}, [r0], r1
-
- vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q10, d3, d0
- vmull.u8 q11, d5, d0
- vmull.u8 q12, d6, d0
- vmull.u8 q13, d8, d0
- vmull.u8 q14, d9, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
-
- vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q11, d5, d1
- vmlal.u8 q13, d8, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
-
- vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q12, d6, d1
- vmlal.u8 q14, d9, d1
-
- vmull.u8 q1, d11, d0
- vmull.u8 q2, d12, d0
- vmull.u8 q3, d14, d0
- vmull.u8 q4, d15, d0
-
- vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
- vext.8 d14, d14, d15, #1
-
- vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q3, d14, d1
-
- vext.8 d12, d12, d13, #1
- vext.8 d15, d15, d16, #1
-
- vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q4, d15, d1
-
- vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d11, q10, #7
- vqrshrn.u16 d12, q11, #7
- vqrshrn.u16 d13, q12, #7
- vqrshrn.u16 d14, q13, #7
- vqrshrn.u16 d15, q14, #7
- vqrshrn.u16 d16, q1, #7
- vqrshrn.u16 d17, q2, #7
- vqrshrn.u16 d18, q3, #7
- vqrshrn.u16 d19, q4, #7
-
- vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
- vst1.u8 {d14, d15, d16, d17}, [lr]!
- vst1.u8 {d18, d19}, [lr]!
-
-;Second pass: 16x16
-;secondpass_filter
- add r3, r12, r3, lsl #3
- sub lr, lr, #272
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
-
- sub sp, sp, #256
- mov r3, sp
-
- vld1.u8 {d22, d23}, [lr]! ;load src data
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
- mov r12, #4 ;loop counter
-
-vp8e_filt_blk2d_sp16x16_loop_neon
- vld1.u8 {d24, d25}, [lr]!
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
- vld1.u8 {d26, d27}, [lr]!
- vmull.u8 q2, d23, d0
- vld1.u8 {d28, d29}, [lr]!
- vmull.u8 q3, d24, d0
- vld1.u8 {d30, d31}, [lr]!
-
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
- vmlal.u8 q2, d25, d1
- vmlal.u8 q3, d26, d1
- vmlal.u8 q4, d27, d1
- vmlal.u8 q5, d28, d1
- vmlal.u8 q6, d29, d1
- vmlal.u8 q7, d30, d1
- vmlal.u8 q8, d31, d1
-
- subs r12, r12, #1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2, d3}, [r3]! ;store result
- vst1.u8 {d4, d5}, [r3]!
- vst1.u8 {d6, d7}, [r3]!
- vmov q11, q15
- vst1.u8 {d8, d9}, [r3]!
-
- bne vp8e_filt_blk2d_sp16x16_loop_neon
-
- b sub_pixel_variance16x16_neon
-
-;--------------------
-firstpass_bfilter16x16_only
- mov r2, #4 ;loop counter
- sub sp, sp, #528 ;reserve space on stack for temporary storage
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vdup.8 d1, d31[4]
- mov r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16_loop_neon
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- vld1.u8 {d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10}, [r0], r1
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d5, d0
- vmull.u8 q10, d6, d0
- vmull.u8 q11, d8, d0
- vmull.u8 q12, d9, d0
- vmull.u8 q13, d11, d0
- vmull.u8 q14, d12, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
- vext.8 d11, d11, d12, #1
-
- vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q9, d5, d1
- vmlal.u8 q11, d8, d1
- vmlal.u8 q13, d11, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
- vext.8 d12, d12, d13, #1
-
- vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q10, d6, d1
- vmlal.u8 q12, d9, d1
- vmlal.u8 q14, d12, d1
-
- subs r2, r2, #1
-
- vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d15, q8, #7
- vqrshrn.u16 d16, q9, #7
- vqrshrn.u16 d17, q10, #7
- vqrshrn.u16 d18, q11, #7
- vqrshrn.u16 d19, q12, #7
- vqrshrn.u16 d20, q13, #7
- vst1.u8 {d14, d15}, [r3]! ;store result
- vqrshrn.u16 d21, q14, #7
-
- vst1.u8 {d16, d17}, [r3]!
- vst1.u8 {d18, d19}, [r3]!
- vst1.u8 {d20, d21}, [r3]!
-
- bne vp8e_filt_blk2d_fpo16x16_loop_neon
-
- b sub_pixel_variance16x16_neon
-
-;---------------------
-secondpass_bfilter16x16_only
-;Second pass: 16x16
-;secondpass_filter
- sub sp, sp, #528 ;reserve space on stack for temporary storage
- add r3, r12, r3, lsl #3
- mov r12, #4 ;loop counter
- vld1.u32 {d31}, [r3] ;load second_pass filter
- vld1.u8 {d22, d23}, [r0], r1 ;load src data
- mov r3, sp
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
-vp8e_filt_blk2d_spo16x16_loop_neon
- vld1.u8 {d24, d25}, [r0], r1
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
- vld1.u8 {d26, d27}, [r0], r1
- vmull.u8 q2, d23, d0
- vld1.u8 {d28, d29}, [r0], r1
- vmull.u8 q3, d24, d0
- vld1.u8 {d30, d31}, [r0], r1
-
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
- vmlal.u8 q2, d25, d1
- vmlal.u8 q3, d26, d1
- vmlal.u8 q4, d27, d1
- vmlal.u8 q5, d28, d1
- vmlal.u8 q6, d29, d1
- vmlal.u8 q7, d30, d1
- vmlal.u8 q8, d31, d1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2, d3}, [r3]! ;store result
- subs r12, r12, #1
- vst1.u8 {d4, d5}, [r3]!
- vmov q11, q15
- vst1.u8 {d6, d7}, [r3]!
- vst1.u8 {d8, d9}, [r3]!
-
- bne vp8e_filt_blk2d_spo16x16_loop_neon
-
- b sub_pixel_variance16x16_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16_neon
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- sub r3, r3, #256
- mov r12, #8
-
-sub_pixel_variance16x16_neon_loop
- vld1.8 {q0}, [r3]! ;Load up source and reference
- vld1.8 {q2}, [r4], r5
- vld1.8 {q1}, [r3]!
- vld1.8 {q3}, [r4], r5
-
- vsubl.u8 q11, d0, d4 ;diff
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- vpadal.s16 q8, q11 ;sum
- vmlal.s16 q9, d22, d22 ;sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- bne sub_pixel_variance16x16_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [r6] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
-
- add sp, sp, #528
- vmov.32 r0, d0[0] ;return
-
- pop {r4-r6,pc}
-
- ENDP
-
-;-----------------
-
-_BilinearTaps_coeff_
- DCD bilinear_taps_coeff
-bilinear_taps_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
--- a/vp9/encoder/arm/neon/vp9_vp8_subpixelvariance16x16s_neon.asm
+++ /dev/null
@@ -1,572 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_variance_halfpixvar16x16_h_neon|
- EXPORT |vp9_variance_halfpixvar16x16_v_neon|
- EXPORT |vp9_variance_halfpixvar16x16_hv_neon|
- EXPORT |vp9_sub_pixel_variance16x16s_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;================================================
-;unsigned int vp9_variance_halfpixvar16x16_h_neon
-;(
-; unsigned char *src_ptr, r0
-; int src_pixels_per_line, r1
-; unsigned char *dst_ptr, r2
-; int dst_pixels_per_line, r3
-; unsigned int *sse
-;);
-;================================================
-|vp9_variance_halfpixvar16x16_h_neon| PROC
- push {lr}
-
- mov r12, #4 ;loop counter
- ldr lr, [sp, #4] ;load *sse from stack
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8_filt_fpo16x16s_4_0_loop_neon
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
- vld1.8 {q11}, [r2], r3
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.8 {q12}, [r2], r3
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.8 {q13}, [r2], r3
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
- vext.8 q3, q2, q3, #1
- vext.8 q5, q4, q5, #1
- vext.8 q7, q6, q7, #1
-
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vld1.8 {q14}, [r2], r3
- vrhadd.u8 q1, q2, q3
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
-
- vsubl.u8 q4, d0, d22 ;diff
- vsubl.u8 q5, d1, d23
- vsubl.u8 q6, d2, d24
- vsubl.u8 q7, d3, d25
- vsubl.u8 q0, d4, d26
- vsubl.u8 q1, d5, d27
- vsubl.u8 q2, d6, d28
- vsubl.u8 q3, d7, d29
-
- vpadal.s16 q8, q4 ;sum
- vmlal.s16 q9, d8, d8 ;sse
- vmlal.s16 q10, d9, d9
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q5
- vmlal.s16 q9, d10, d10
- vmlal.s16 q10, d11, d11
- vpadal.s16 q8, q6
- vmlal.s16 q9, d12, d12
- vmlal.s16 q10, d13, d13
- vpadal.s16 q8, q7
- vmlal.s16 q9, d14, d14
- vmlal.s16 q10, d15, d15
-
- vpadal.s16 q8, q0 ;sum
- vmlal.s16 q9, d0, d0 ;sse
- vmlal.s16 q10, d1, d1
- vpadal.s16 q8, q1
- vmlal.s16 q9, d2, d2
- vmlal.s16 q10, d3, d3
- vpadal.s16 q8, q2
- vmlal.s16 q9, d4, d4
- vmlal.s16 q10, d5, d5
- vpadal.s16 q8, q3
- vmlal.s16 q9, d6, d6
- vmlal.s16 q10, d7, d7
-
- bne vp8_filt_fpo16x16s_4_0_loop_neon
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- pop {pc}
- ENDP
-
-;================================================
-;unsigned int vp9_variance_halfpixvar16x16_v_neon
-;(
-; unsigned char *src_ptr, r0
-; int src_pixels_per_line, r1
-; unsigned char *dst_ptr, r2
-; int dst_pixels_per_line, r3
-; unsigned int *sse
-;);
-;================================================
-|vp9_variance_halfpixvar16x16_v_neon| PROC
- push {lr}
-
- mov r12, #4 ;loop counter
-
- vld1.u8 {q0}, [r0], r1 ;load src data
- ldr lr, [sp, #4] ;load *sse from stack
-
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
-vp8_filt_spo16x16s_0_4_loop_neon
- vld1.u8 {q2}, [r0], r1
- vld1.8 {q1}, [r2], r3
- vld1.u8 {q4}, [r0], r1
- vld1.8 {q3}, [r2], r3
- vld1.u8 {q6}, [r0], r1
- vld1.8 {q5}, [r2], r3
- vld1.u8 {q15}, [r0], r1
-
- vrhadd.u8 q0, q0, q2
- vld1.8 {q7}, [r2], r3
- vrhadd.u8 q2, q2, q4
- vrhadd.u8 q4, q4, q6
- vrhadd.u8 q6, q6, q15
-
- vsubl.u8 q11, d0, d2 ;diff
- vsubl.u8 q12, d1, d3
- vsubl.u8 q13, d4, d6
- vsubl.u8 q14, d5, d7
- vsubl.u8 q0, d8, d10
- vsubl.u8 q1, d9, d11
- vsubl.u8 q2, d12, d14
- vsubl.u8 q3, d13, d15
-
- vpadal.s16 q8, q11 ;sum
- vmlal.s16 q9, d22, d22 ;sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- vpadal.s16 q8, q0 ;sum
- vmlal.s16 q9, d0, d0 ;sse
- vmlal.s16 q10, d1, d1
- vpadal.s16 q8, q1
- vmlal.s16 q9, d2, d2
- vmlal.s16 q10, d3, d3
- vpadal.s16 q8, q2
- vmlal.s16 q9, d4, d4
- vmlal.s16 q10, d5, d5
-
- vmov q0, q15
-
- vpadal.s16 q8, q3
- vmlal.s16 q9, d6, d6
- vmlal.s16 q10, d7, d7
-
- bne vp8_filt_spo16x16s_0_4_loop_neon
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- pop {pc}
- ENDP
-
-;================================================
-;unsigned int vp9_variance_halfpixvar16x16_hv_neon
-;(
-; unsigned char *src_ptr, r0
-; int src_pixels_per_line, r1
-; unsigned char *dst_ptr, r2
-; int dst_pixels_per_line, r3
-; unsigned int *sse
-;);
-;================================================
-|vp9_variance_halfpixvar16x16_hv_neon| PROC
- push {lr}
-
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
-
- ldr lr, [sp, #4] ;load *sse from stack
- vmov.i8 q13, #0 ;q8 - sum
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
-
- vmov.i8 q14, #0 ;q9, q10 - sse
- vmov.i8 q15, #0
-
- mov r12, #4 ;loop counter
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8_filt16x16s_4_4_loop_neon
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
- vld1.u8 {d16, d17, d18, d19}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
- vext.8 q5, q4, q5, #1
- vext.8 q7, q6, q7, #1
- vext.8 q9, q8, q9, #1
-
- vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
- vrhadd.u8 q4, q8, q9
-
- vld1.8 {q5}, [r2], r3
- vrhadd.u8 q0, q0, q1
- vld1.8 {q6}, [r2], r3
- vrhadd.u8 q1, q1, q2
- vld1.8 {q7}, [r2], r3
- vrhadd.u8 q2, q2, q3
- vld1.8 {q8}, [r2], r3
- vrhadd.u8 q3, q3, q4
-
- vsubl.u8 q9, d0, d10 ;diff
- vsubl.u8 q10, d1, d11
- vsubl.u8 q11, d2, d12
- vsubl.u8 q12, d3, d13
-
- vsubl.u8 q0, d4, d14 ;diff
- vsubl.u8 q1, d5, d15
- vsubl.u8 q5, d6, d16
- vsubl.u8 q6, d7, d17
-
- vpadal.s16 q13, q9 ;sum
- vmlal.s16 q14, d18, d18 ;sse
- vmlal.s16 q15, d19, d19
-
- vpadal.s16 q13, q10 ;sum
- vmlal.s16 q14, d20, d20 ;sse
- vmlal.s16 q15, d21, d21
-
- vpadal.s16 q13, q11 ;sum
- vmlal.s16 q14, d22, d22 ;sse
- vmlal.s16 q15, d23, d23
-
- vpadal.s16 q13, q12 ;sum
- vmlal.s16 q14, d24, d24 ;sse
- vmlal.s16 q15, d25, d25
-
- subs r12, r12, #1
-
- vpadal.s16 q13, q0 ;sum
- vmlal.s16 q14, d0, d0 ;sse
- vmlal.s16 q15, d1, d1
-
- vpadal.s16 q13, q1 ;sum
- vmlal.s16 q14, d2, d2 ;sse
- vmlal.s16 q15, d3, d3
-
- vpadal.s16 q13, q5 ;sum
- vmlal.s16 q14, d10, d10 ;sse
- vmlal.s16 q15, d11, d11
-
- vmov q0, q4
-
- vpadal.s16 q13, q6 ;sum
- vmlal.s16 q14, d12, d12 ;sse
- vmlal.s16 q15, d13, d13
-
- bne vp8_filt16x16s_4_4_loop_neon
-
- vadd.u32 q15, q14, q15 ;accumulate sse
- vpaddl.s32 q0, q13 ;accumulate sum
-
- vpaddl.u32 q1, q15
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- pop {pc}
- ENDP
-
-;==============================
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack unsigned char *dst_ptr,
-; stack int dst_pixels_per_line,
-; stack unsigned int *sse
-;note: in vp8_find_best_half_pixel_step()(called when 8<Speed<15), and first call of vp8_find_best_sub_pixel_step()
-;(called when speed<=8). xoffset/yoffset can only be 4 or 0, which means either by pass the filter,
-;or filter coeff is {64, 64}. This simplified program only works in this situation.
-;note: It happens that both xoffset and yoffset are zero. This can be handled in c code later.
-
-|vp9_sub_pixel_variance16x16s_neon| PROC
- push {r4, lr}
-
- ldr r4, [sp, #8] ;load *dst_ptr from stack
- ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
- ldr lr, [sp, #16] ;load *sse from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_bfilter16x16s_only
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- beq firstpass_bfilter16x16s_only
-
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
- sub sp, sp, #256 ;reserve space on stack for temporary storage
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
- mov r3, sp
- mov r2, #4 ;loop counter
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16s_loop_neon
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
- vld1.u8 {d16, d17, d18, d19}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
- vext.8 q5, q4, q5, #1
- vext.8 q7, q6, q7, #1
- vext.8 q9, q8, q9, #1
-
- vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
- vrhadd.u8 q4, q8, q9
-
- vrhadd.u8 q0, q0, q1
- vrhadd.u8 q1, q1, q2
- vrhadd.u8 q2, q2, q3
- vrhadd.u8 q3, q3, q4
-
- subs r2, r2, #1
- vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result
- vmov q0, q4
- vst1.u8 {d4, d5, d6, d7}, [r3]!
-
- bne vp8e_filt_blk2d_fp16x16s_loop_neon
-
- b sub_pixel_variance16x16s_neon
-
-;--------------------
-firstpass_bfilter16x16s_only
- mov r2, #2 ;loop counter
- sub sp, sp, #256 ;reserve space on stack for temporary storage
- mov r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16s_loop_neon
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
- vld1.u8 {d16, d17, d18, d19}, [r0], r1
- vext.8 q3, q2, q3, #1
- vld1.u8 {d20, d21, d22, d23}, [r0], r1
- vext.8 q5, q4, q5, #1
- vld1.u8 {d24, d25, d26, d27}, [r0], r1
- vext.8 q7, q6, q7, #1
- vld1.u8 {d28, d29, d30, d31}, [r0], r1
- vext.8 q9, q8, q9, #1
- vext.8 q11, q10, q11, #1
- vext.8 q13, q12, q13, #1
- vext.8 q15, q14, q15, #1
-
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vrhadd.u8 q1, q2, q3
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
- vrhadd.u8 q4, q8, q9
- vrhadd.u8 q5, q10, q11
- vrhadd.u8 q6, q12, q13
- vrhadd.u8 q7, q14, q15
-
- subs r2, r2, #1
-
- vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
- vst1.u8 {d4, d5, d6, d7}, [r3]!
- vst1.u8 {d8, d9, d10, d11}, [r3]!
- vst1.u8 {d12, d13, d14, d15}, [r3]!
-
- bne vp8e_filt_blk2d_fpo16x16s_loop_neon
-
- b sub_pixel_variance16x16s_neon
-
-;---------------------
-secondpass_bfilter16x16s_only
- sub sp, sp, #256 ;reserve space on stack for temporary storage
-
- mov r2, #2 ;loop counter
- vld1.u8 {d0, d1}, [r0], r1 ;load src data
- mov r3, sp
-
-vp8e_filt_blk2d_spo16x16s_loop_neon
- vld1.u8 {d2, d3}, [r0], r1
- vld1.u8 {d4, d5}, [r0], r1
- vld1.u8 {d6, d7}, [r0], r1
- vld1.u8 {d8, d9}, [r0], r1
-
- vrhadd.u8 q0, q0, q1
- vld1.u8 {d10, d11}, [r0], r1
- vrhadd.u8 q1, q1, q2
- vld1.u8 {d12, d13}, [r0], r1
- vrhadd.u8 q2, q2, q3
- vld1.u8 {d14, d15}, [r0], r1
- vrhadd.u8 q3, q3, q4
- vld1.u8 {d16, d17}, [r0], r1
- vrhadd.u8 q4, q4, q5
- vrhadd.u8 q5, q5, q6
- vrhadd.u8 q6, q6, q7
- vrhadd.u8 q7, q7, q8
-
- subs r2, r2, #1
-
- vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
- vmov q0, q8
- vst1.u8 {d4, d5, d6, d7}, [r3]!
- vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
- vst1.u8 {d12, d13, d14, d15}, [r3]!
-
- bne vp8e_filt_blk2d_spo16x16s_loop_neon
-
- b sub_pixel_variance16x16s_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16s_neon
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- sub r3, r3, #256
- mov r2, #4
-
-sub_pixel_variance16x16s_neon_loop
- vld1.8 {q0}, [r3]! ;Load up source and reference
- vld1.8 {q1}, [r4], r12
- vld1.8 {q2}, [r3]!
- vld1.8 {q3}, [r4], r12
- vld1.8 {q4}, [r3]!
- vld1.8 {q5}, [r4], r12
- vld1.8 {q6}, [r3]!
- vld1.8 {q7}, [r4], r12
-
- vsubl.u8 q11, d0, d2 ;diff
- vsubl.u8 q12, d1, d3
- vsubl.u8 q13, d4, d6
- vsubl.u8 q14, d5, d7
- vsubl.u8 q0, d8, d10
- vsubl.u8 q1, d9, d11
- vsubl.u8 q2, d12, d14
- vsubl.u8 q3, d13, d15
-
- vpadal.s16 q8, q11 ;sum
- vmlal.s16 q9, d22, d22 ;sse
- vmlal.s16 q10, d23, d23
-
- subs r2, r2, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- vpadal.s16 q8, q0 ;sum
- vmlal.s16 q9, d0, d0 ;sse
- vmlal.s16 q10, d1, d1
- vpadal.s16 q8, q1
- vmlal.s16 q9, d2, d2
- vmlal.s16 q10, d3, d3
- vpadal.s16 q8, q2
- vmlal.s16 q9, d4, d4
- vmlal.s16 q10, d5, d5
- vpadal.s16 q8, q3
- vmlal.s16 q9, d6, d6
- vmlal.s16 q10, d7, d7
-
- bne sub_pixel_variance16x16s_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
-
- add sp, sp, #256
- vmov.32 r0, d0[0] ;return
-
- pop {r4, pc}
- ENDP
-
- END
--- a/vp9/encoder/arm/neon/vp9_vp8_subpixelvariance8x8_neon.asm
+++ /dev/null
@@ -1,224 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp9_sub_pixel_variance8x8_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict8x8_neon and vp9_variance8x8_neon.
-
-|vp9_sub_pixel_variance8x8_neon| PROC
- push {r4-r5, lr}
-
- ldr r12, _BilinearTaps_coeff_
- ldr r4, [sp, #12] ;load *dst_ptr from stack
- ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
- ldr lr, [sp, #20] ;load *sse from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (9x8)
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vld1.u32 {d31}, [r2] ;load first_pass filter
- vld1.u8 {q2}, [r0], r1
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {q3}, [r0], r1
- vdup.8 d1, d31[4]
- vld1.u8 {q4}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
-
- vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
- vld1.u8 {q2}, [r0], r1
- vqrshrn.u16 d23, q7, #7
- vld1.u8 {q3}, [r0], r1
- vqrshrn.u16 d24, q8, #7
- vld1.u8 {q4}, [r0], r1
- vqrshrn.u16 d25, q9, #7
-
- ;first_pass filtering on the rest 5-line data
- vld1.u8 {q5}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
- vmull.u8 q10, d10, d0
-
- vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
- vext.8 d11, d10, d11, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
- vmlal.u8 q10, d11, d1
-
- vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d27, q7, #7
- vqrshrn.u16 d28, q8, #7
- vqrshrn.u16 d29, q9, #7
- vqrshrn.u16 d30, q10, #7
-
-;Second pass: 8x8
-secondpass_filter
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- ;skip_secondpass_filter
- beq sub_pixel_variance8x8_neon
-
- add r3, r12, r3, lsl #3
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q2, d23, d0
- vmull.u8 q3, d24, d0
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
- vmlal.u8 q2, d24, d1
- vmlal.u8 q3, d25, d1
- vmlal.u8 q4, d26, d1
- vmlal.u8 q5, d27, d1
- vmlal.u8 q6, d28, d1
- vmlal.u8 q7, d29, d1
- vmlal.u8 q8, d30, d1
-
- vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d23, q2, #7
- vqrshrn.u16 d24, q3, #7
- vqrshrn.u16 d25, q4, #7
- vqrshrn.u16 d26, q5, #7
- vqrshrn.u16 d27, q6, #7
- vqrshrn.u16 d28, q7, #7
- vqrshrn.u16 d29, q8, #7
-
- b sub_pixel_variance8x8_neon
-
-;--------------------
-skip_firstpass_filter
- vld1.u8 {d22}, [r0], r1 ;load src data
- vld1.u8 {d23}, [r0], r1
- vld1.u8 {d24}, [r0], r1
- vld1.u8 {d25}, [r0], r1
- vld1.u8 {d26}, [r0], r1
- vld1.u8 {d27}, [r0], r1
- vld1.u8 {d28}, [r0], r1
- vld1.u8 {d29}, [r0], r1
- vld1.u8 {d30}, [r0], r1
-
- b secondpass_filter
-
-;----------------------
-;vp9_variance8x8_neon
-sub_pixel_variance8x8_neon
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- mov r12, #2
-
-sub_pixel_variance8x8_neon_loop
- vld1.8 {d0}, [r4], r5 ;load dst data
- subs r12, r12, #1
- vld1.8 {d1}, [r4], r5
- vld1.8 {d2}, [r4], r5
- vsubl.u8 q4, d22, d0 ;calculate diff
- vld1.8 {d3}, [r4], r5
-
- vsubl.u8 q5, d23, d1
- vsubl.u8 q6, d24, d2
-
- vpadal.s16 q8, q4 ;sum
- vmlal.s16 q9, d8, d8 ;sse
- vmlal.s16 q10, d9, d9
-
- vsubl.u8 q7, d25, d3
-
- vpadal.s16 q8, q5
- vmlal.s16 q9, d10, d10
- vmlal.s16 q10, d11, d11
-
- vmov q11, q13
-
- vpadal.s16 q8, q6
- vmlal.s16 q9, d12, d12
- vmlal.s16 q10, d13, d13
-
- vmov q12, q14
-
- vpadal.s16 q8, q7
- vmlal.s16 q9, d14, d14
- vmlal.s16 q10, d15, d15
-
- bne sub_pixel_variance8x8_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.s32 d10, d10, #6
- vsub.s32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- pop {r4-r5, pc}
-
- ENDP
-
-;-----------------
-
-_BilinearTaps_coeff_
- DCD bilinear_taps_coeff
-bilinear_taps_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -83,7 +83,7 @@
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_subpixel_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_vp8_asm_stubs.c
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.c
VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
--
⑨