ref: be6c031fb3b0ee940c852a6a2db63af4747ff022
parent: 70deaf00eb4a035247fd102337f949bff7b1f232
parent: 3ee6db6c8110680c051fe7a4dca97bb27474ca00
author: Jingning Han <jingning@google.com>
date: Tue Aug 11 21:57:15 EDT 2015
Merge "Fork VP9 and VP10 codebase"
--- a/configure
+++ b/configure
@@ -37,6 +37,7 @@
${toggle_vp9_highbitdepth} use VP9 high bit depth (10/12) profiles
${toggle_vp8} VP8 codec support
${toggle_vp9} VP9 codec support
+ ${toggle_vp10} VP10 codec support
${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders)
${toggle_postproc} postprocessing
${toggle_vp9_postproc} vp9 specific postprocessing
@@ -191,6 +192,7 @@
# disable codecs when their source directory does not exist
[ -d "${source_path}/vp8" ] || disable_feature vp8
[ -d "${source_path}/vp9" ] || disable_feature vp9
+[ -d "${source_path}/vp10" ] || disable_feature vp10
# install everything except the sources, by default. sources will have
# to be enabled when doing dist builds, since that's no longer a common
@@ -212,10 +214,13 @@
vp8_decoder
vp9_encoder
vp9_decoder
+ vp10_encoder
+ vp10_decoder
"
CODEC_FAMILIES="
vp8
vp9
+ vp10
"
ARCH_LIST="
--- a/libs.mk
+++ b/libs.mk
@@ -109,6 +109,40 @@
VP9_PREFIX=vp9/
$(BUILD_PFX)$(VP9_PREFIX)%.c.o: CFLAGS += -Wextra
+# VP10 make file: includes the common, encoder (cx) and decoder (dx) source lists.
+ifneq ($(CONFIG_VP10_ENCODER)$(CONFIG_VP10_DECODER),)
+ VP10_PREFIX=vp10/
+ include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10_common.mk
+endif # CONFIG_VP10_ENCODER or CONFIG_VP10_DECODER is non-empty
+
+ifeq ($(CONFIG_VP10_ENCODER),yes)
+ VP10_PREFIX=vp10/
+ include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10cx.mk
+ CODEC_SRCS-yes += $(addprefix $(VP10_PREFIX),$(call enabled,VP10_CX_SRCS))
+ CODEC_EXPORTS-yes += $(addprefix $(VP10_PREFIX),$(VP10_CX_EXPORTS))
+ CODEC_SRCS-yes += $(VP10_PREFIX)vp10cx.mk vpx/vp8.h vpx/vp8cx.h
+ INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
+ INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/vpx/svc_context.h
+ INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP10_PREFIX)/%
+ CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
+ CODEC_DOC_SECTIONS += vp9 vp9_encoder
+endif # CONFIG_VP10_ENCODER; NOTE(review): adds the vp9 doc sections above -- confirm intended
+
+ifeq ($(CONFIG_VP10_DECODER),yes)
+ VP10_PREFIX=vp10/
+ include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10dx.mk
+ CODEC_SRCS-yes += $(addprefix $(VP10_PREFIX),$(call enabled,VP10_DX_SRCS))
+ CODEC_EXPORTS-yes += $(addprefix $(VP10_PREFIX),$(VP10_DX_EXPORTS))
+ CODEC_SRCS-yes += $(VP10_PREFIX)vp10dx.mk vpx/vp8.h vpx/vp8dx.h
+ INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
+ INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP10_PREFIX)/%
+ CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
+ CODEC_DOC_SECTIONS += vp9 vp9_decoder
+endif # CONFIG_VP10_DECODER; NOTE(review): adds the vp9 doc sections above -- confirm intended
+
+VP10_PREFIX=vp10/
+$(BUILD_PFX)$(VP10_PREFIX)%.c.o: CFLAGS += -Wextra
+
ifeq ($(CONFIG_ENCODERS),yes)
CODEC_DOC_SECTIONS += encoder
endif
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -13,10 +13,10 @@
#include "./vpx_config.h"
#include "vpx/vpx_decoder.h"
#include "vpx/vpx_encoder.h"
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
#include "vpx/vp8cx.h"
#endif
-#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
#include "vpx/vp8dx.h"
#endif
@@ -233,6 +233,8 @@
int usage) const {
#if CONFIG_VP9_ENCODER
return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage);
+#elif CONFIG_VP10_ENCODER
+ return vpx_codec_enc_config_default(&vpx_codec_vp10_cx_algo, cfg, usage);
#else
return VPX_CODEC_INCAPABLE;
#endif
@@ -251,7 +253,96 @@
#define VP9_INSTANTIATE_TEST_CASE(test, ...)
#endif // CONFIG_VP9
+/*
+ * VP10 Codec Definitions
+ */
+#if CONFIG_VP10
+class VP10Decoder : public Decoder {  // Test decoder bound to the VP10 decoder interface.
+ public:
+ VP10Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
+ : Decoder(cfg, deadline) {}
-} // namespace libvpx_test
+ VP10Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+ unsigned long deadline) // NOLINT
+ : Decoder(cfg, flag, deadline) {}
+ protected:
+ virtual vpx_codec_iface_t* CodecInterface() const {  // Interface handed to the Decoder base.
+#if CONFIG_VP10_DECODER
+ return &vpx_codec_vp10_dx_algo;
+#else  // VP10 decoder not configured
+ return NULL;
+#endif
+ }
+};
+
+class VP10Encoder : public Encoder {  // Test encoder bound to the VP10 encoder interface.
+ public:
+ VP10Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+ const unsigned long init_flags, TwopassStatsStore *stats)
+ : Encoder(cfg, deadline, init_flags, stats) {}  // all arguments are forwarded unchanged
+
+ protected:
+ virtual vpx_codec_iface_t* CodecInterface() const {  // Interface handed to the Encoder base.
+#if CONFIG_VP10_ENCODER
+ return &vpx_codec_vp10_cx_algo;
+#else  // VP10 encoder not configured
+ return NULL;
+#endif
+ }
+};
+
+class VP10CodecFactory : public CodecFactory {  // Creates VP10 test encoders/decoders.
+ public:
+ VP10CodecFactory() : CodecFactory() {}
+
+ virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+ unsigned long deadline) const {
+ return CreateDecoder(cfg, 0, deadline);  // same as the three-argument overload with flags == 0
+ }
+
+ virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+ const vpx_codec_flags_t flags,
+ unsigned long deadline) const { // NOLINT
+#if CONFIG_VP10_DECODER
+ return new VP10Decoder(cfg, flags, deadline);  // heap-allocated with new; not freed here
+#else  // VP10 decoder not configured
+ return NULL;
+#endif
+ }
+
+ virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
+ unsigned long deadline,
+ const unsigned long init_flags,
+ TwopassStatsStore *stats) const {
+#if CONFIG_VP10_ENCODER
+ return new VP10Encoder(cfg, deadline, init_flags, stats);  // heap-allocated with new; not freed here
+#else  // VP10 encoder not configured
+ return NULL;
+#endif
+ }
+
+ virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+ int usage) const {
+#if CONFIG_VP10_ENCODER
+ return vpx_codec_enc_config_default(&vpx_codec_vp10_cx_algo, cfg, usage);
+#else  // VP10 encoder not configured
+ return VPX_CODEC_INCAPABLE;
+#endif
+ }
+};
+
+const libvpx_test::VP10CodecFactory kVP10;  // factory instance whose address VP10_INSTANTIATE_TEST_CASE passes as the first test parameter
+
+#define VP10_INSTANTIATE_TEST_CASE(test, ...)\
+ INSTANTIATE_TEST_CASE_P(VP10, test, \
+ ::testing::Combine( \
+ ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
+ &libvpx_test::kVP10)), \
+ __VA_ARGS__))
+#else
+#define VP10_INSTANTIATE_TEST_CASE(test, ...)
+#endif // CONFIG_VP10
+
+} // namespace libvpx_test
#endif // TEST_CODEC_FACTORY_H_
--- a/test/decode_api_test.cc
+++ b/test/decode_api_test.cc
@@ -27,6 +27,9 @@
#if CONFIG_VP9_DECODER
&vpx_codec_vp9_dx_algo,
#endif
+#if CONFIG_VP10_DECODER
+ &vpx_codec_vp10_dx_algo,
+#endif
};
uint8_t buf[1] = {0};
vpx_codec_ctx_t dec;
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -16,7 +16,7 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
#include "vpx/vp8cx.h"
#endif
#include "vpx/vpx_encoder.h"
@@ -138,7 +138,7 @@
const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
}
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
void Control(int ctrl_id, vpx_active_map_t *arg) {
const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
--- a/tools_common.c
+++ b/tools_common.c
@@ -16,11 +16,11 @@
#include "./tools_common.h"
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
#include "vpx/vp8cx.h"
#endif
-#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
#include "vpx/vp8dx.h"
#endif
@@ -140,6 +140,10 @@
#if CONFIG_VP9_ENCODER
{"vp9", VP9_FOURCC, &vpx_codec_vp9_cx},
#endif
+
+#if CONFIG_VP10_ENCODER
+ {"vp10", VP10_FOURCC, &vpx_codec_vp10_cx},
+#endif
};
int get_vpx_encoder_count(void) {
@@ -173,6 +177,10 @@
#if CONFIG_VP9_DECODER
{"vp9", VP9_FOURCC, &vpx_codec_vp9_dx},
+#endif
+
+#if CONFIG_VP10_DECODER
+ {"vp10", VP10_FOURCC, &vpx_codec_vp10_dx},
#endif
};
--- a/tools_common.h
+++ b/tools_common.h
@@ -62,6 +62,7 @@
#define VP8_FOURCC 0x30385056
#define VP9_FOURCC 0x30395056
+#define VP10_FOURCC 0x303a5056
enum VideoFileType {
FILE_TYPE_RAW,
--- /dev/null
+++ b/vp10/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -1,0 +1,248 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "vp10/common/vp9_common.h"
+
+static int16_t sinpi_1_9 = 0x14a3;  // 5283; multipliers here are used with a rounding >> 14 below
+static int16_t sinpi_2_9 = 0x26c9;  // 9929
+static int16_t sinpi_3_9 = 0x3441;  // 13377
+static int16_t sinpi_4_9 = 0x3b6c;  // 15212
+static int16_t cospi_8_64 = 0x3b21;  // 15137
+static int16_t cospi_16_64 = 0x2d41;  // 11585
+static int16_t cospi_24_64 = 0x187e;  // 6270; NOTE(review): these are only read in this file -- could be const
+
+static INLINE void TRANSPOSE4X4(  // In-place transpose of a 4x4 block of int16 values.
+ int16x8_t *q8s16,  // in/out: rows 0 (low half) and 1 (high half)
+ int16x8_t *q9s16) {  // in/out: rows 2 (low half) and 3 (high half)
+ int32x4_t q8s32, q9s32;
+ int16x4x2_t d0x2s16, d1x2s16;
+ int32x4x2_t q0x2s32;
+
+ d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));  // interleave 16-bit lanes of rows 0 and 1
+ d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));  // interleave 16-bit lanes of rows 2 and 3
+
+ q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
+ q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
+ q0x2s32 = vtrnq_s32(q8s32, q9s32);  // exchange 32-bit lane pairs between the two vectors to finish the transpose
+
+ *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
+ *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
+ return;
+}
+
+static INLINE void GENERATE_COSINE_CONSTANTS(  // Broadcasts the three cospi_* constants into 4-lane vectors.
+ int16x4_t *d0s16,  // out: every lane = cospi_8_64
+ int16x4_t *d1s16,  // out: every lane = cospi_16_64
+ int16x4_t *d2s16) {  // out: every lane = cospi_24_64
+ *d0s16 = vdup_n_s16(cospi_8_64);
+ *d1s16 = vdup_n_s16(cospi_16_64);
+ *d2s16 = vdup_n_s16(cospi_24_64);
+ return;
+}
+
+static INLINE void GENERATE_SINE_CONSTANTS(  // Broadcasts the four sinpi_* constants into vectors.
+ int16x4_t *d3s16,  // out: every lane = sinpi_1_9
+ int16x4_t *d4s16,  // out: every lane = sinpi_2_9
+ int16x4_t *d5s16,  // out: every lane = sinpi_4_9
+ int16x8_t *q3s16) {  // out: every lane = sinpi_3_9 (8-lane vector, unlike the others)
+ *d3s16 = vdup_n_s16(sinpi_1_9);
+ *d4s16 = vdup_n_s16(sinpi_2_9);
+ *q3s16 = vdupq_n_s16(sinpi_3_9);
+ *d5s16 = vdup_n_s16(sinpi_4_9);
+ return;
+}
+
+static INLINE void IDCT4x4_1D(  // One 1-D 4-point inverse DCT pass over four lanes, in place.
+ int16x4_t *d0s16,  // multiplier; the caller in this file passes the cospi_8_64 broadcast
+ int16x4_t *d1s16,  // multiplier; the caller in this file passes the cospi_16_64 broadcast
+ int16x4_t *d2s16,  // multiplier; the caller in this file passes the cospi_24_64 broadcast
+ int16x8_t *q8s16,  // in/out: low half = x0, high half = x1
+ int16x8_t *q9s16) {  // in/out: low half = x2, high half = x3
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
+ int16x4_t d26s16, d27s16, d28s16, d29s16;
+ int32x4_t q10s32, q13s32, q14s32, q15s32;
+ int16x8_t q13s16, q14s16;
+
+ d16s16 = vget_low_s16(*q8s16);  // x0
+ d17s16 = vget_high_s16(*q8s16);  // x1
+ d18s16 = vget_low_s16(*q9s16);  // x2
+ d19s16 = vget_high_s16(*q9s16);  // x3
+
+ d23s16 = vadd_s16(d16s16, d18s16);  // x0 + x2
+ d24s16 = vsub_s16(d16s16, d18s16);  // x0 - x2
+
+ q15s32 = vmull_s16(d17s16, *d2s16);  // x1 * d2 (widened to 32 bits) ...
+ q10s32 = vmull_s16(d17s16, *d0s16);  // x1 * d0 ...
+ q13s32 = vmull_s16(d23s16, *d1s16);  // (x0 + x2) * d1
+ q14s32 = vmull_s16(d24s16, *d1s16);  // (x0 - x2) * d1
+ q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);  // ... - x3 * d0
+ q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);  // ... + x3 * d2
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);  // rounding >> 14 with saturating narrow to int16
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q10s32, 14);
+
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+ *q8s16 = vaddq_s16(q13s16, q14s16);  // outputs 0 (low half) and 1 (high half)
+ *q9s16 = vsubq_s16(q13s16, q14s16);  // differences; halves are exchanged on the next line
+ *q9s16 = vcombine_s16(vget_high_s16(*q9s16),
+ vget_low_s16(*q9s16)); // vswp
+ return;
+}
+
+static INLINE void IADST4x4_1D(  // One 1-D 4-point pass over four lanes, in place; used as the inverse ADST stage.
+ int16x4_t *d3s16,  // multiplier; the caller in this file passes the sinpi_1_9 broadcast
+ int16x4_t *d4s16,  // multiplier; the caller in this file passes the sinpi_2_9 broadcast
+ int16x4_t *d5s16,  // multiplier; the caller in this file passes the sinpi_4_9 broadcast
+ int16x8_t *q3s16,  // multiplier; the caller in this file passes the sinpi_3_9 broadcast (only the low half is read)
+ int16x8_t *q8s16,  // in/out: low half = x0, high half = x1
+ int16x8_t *q9s16) {  // in/out: low half = x2, high half = x3
+ int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
+ int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+ d6s16 = vget_low_s16(*q3s16);
+
+ d16s16 = vget_low_s16(*q8s16);  // x0
+ d17s16 = vget_high_s16(*q8s16);  // x1
+ d18s16 = vget_low_s16(*q9s16);  // x2
+ d19s16 = vget_high_s16(*q9s16);  // x3
+
+ q10s32 = vmull_s16(*d3s16, d16s16);  // d3 * x0
+ q11s32 = vmull_s16(*d4s16, d16s16);  // d4 * x0
+ q12s32 = vmull_s16(d6s16, d17s16);  // d6 * x1
+ q13s32 = vmull_s16(*d5s16, d18s16);  // d5 * x2
+ q14s32 = vmull_s16(*d3s16, d18s16);  // d3 * x2
+ q15s32 = vmovl_s16(d16s16);  // x0 widened to 32 bits
+ q15s32 = vaddw_s16(q15s32, d19s16);  // x0 + x3
+ q8s32 = vmull_s16(*d4s16, d19s16);  // d4 * x3
+ q15s32 = vsubw_s16(q15s32, d18s16);  // x0 + x3 - x2
+ q9s32 = vmull_s16(*d5s16, d19s16);  // d5 * x3
+
+ q10s32 = vaddq_s32(q10s32, q13s32);
+ q10s32 = vaddq_s32(q10s32, q8s32);  // d3 * x0 + d5 * x2 + d4 * x3
+ q11s32 = vsubq_s32(q11s32, q14s32);
+ q8s32 = vdupq_n_s32(sinpi_3_9);  // reads the file-level constant directly, not *q3s16
+ q11s32 = vsubq_s32(q11s32, q9s32);  // d4 * x0 - d3 * x2 - d5 * x3
+ q15s32 = vmulq_s32(q15s32, q8s32);  // (x0 + x3 - x2) * sinpi_3_9
+
+ q13s32 = vaddq_s32(q10s32, q12s32);  // output 0 before rounding
+ q10s32 = vaddq_s32(q10s32, q11s32);
+ q14s32 = vaddq_s32(q11s32, q12s32);  // output 1 before rounding
+ q10s32 = vsubq_s32(q10s32, q12s32);  // output 3 before rounding
+
+ d16s16 = vqrshrn_n_s32(q13s32, 14);  // rounding >> 14 with saturating narrow to int16
+ d17s16 = vqrshrn_n_s32(q14s32, 14);
+ d18s16 = vqrshrn_n_s32(q15s32, 14);  // output 2
+ d19s16 = vqrshrn_n_s32(q10s32, 14);
+
+ *q8s16 = vcombine_s16(d16s16, d17s16);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ return;
+}
+
+void vp10_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,  // 4x4 inverse hybrid transform of input; the result is added into dest.
+ int dest_stride, int tx_type) {  // dest_stride: bytes between dest rows; tx_type: 0..3 selects the row/column transforms
+ uint8x8_t d26u8, d27u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
+ uint32x2_t d26u32, d27u32;
+ int16x8_t q3s16, q8s16, q9s16;
+ uint16x8_t q8u16, q9u16;
+
+ d26u32 = d27u32 = vdup_n_u32(0);
+
+ q8s16 = vld1q_s16(input);  // NOTE(review): assumes tran_low_t is int16_t -- confirm for high-bitdepth builds
+ q9s16 = vld1q_s16(input + 8);
+
+ TRANSPOSE4X4(&q8s16, &q9s16);  // done before the tx_type check, so it also runs on the C fallback path
+
+ switch (tx_type) {
+ case 0: // idct_idct is not supported. Fall back to C
+ vp10_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
+ return;
+ break;  // unreachable after the return above
+ case 1: // iadst_idct
+ // generate constants
+ GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ break;
+ case 2: // idct_iadst
+ // generate constants
+ GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+ break;
+ case 3: // iadst_iadst
+ // generate constants
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ break;
+ default: // unexpected tx_type
+ assert(0);  // if assert is compiled out (NDEBUG), control continues to the add/store below
+ break;
+ }
+
+ q8s16 = vrshrq_n_s16(q8s16, 4);  // rounding >> 4 of the transform output
+ q9s16 = vrshrq_n_s16(q9s16, 4);
+
+ d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);  // load 4 bytes from each of four dest rows
+ dest += dest_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
+ dest += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
+ dest += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));  // residual + widened dest pixels
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+
+ d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));  // saturate the sums to 0..255
+ d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);  // dest is at the fourth row here, so rows are stored last to first
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+ return;
+}
--- /dev/null
+++ b/vp10/common/arm/neon/vp9_iht8x8_add_neon.c
@@ -1,0 +1,624 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "vp10/common/vp9_common.h"
+
+static int16_t cospi_2_64 = 16305;  // table values equal round(16384 * cos(k * pi / 64)) for cospi_k_64
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_30_64 = 1606;  // NOTE(review): the table is only read in this file -- could be const
+
+static INLINE void TRANSPOSE8X8(  // In-place transpose of an 8x8 block of int16 values, one row per vector.
+ int16x8_t *q8s16,  // in/out: row 0
+ int16x8_t *q9s16,  // in/out: row 1
+ int16x8_t *q10s16,  // in/out: row 2
+ int16x8_t *q11s16,  // in/out: row 3
+ int16x8_t *q12s16,  // in/out: row 4
+ int16x8_t *q13s16,  // in/out: row 5
+ int16x8_t *q14s16,  // in/out: row 6
+ int16x8_t *q15s16) {  // in/out: row 7
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);  // split every row into its low and high four lanes
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);  // high halves of rows 0 and 4
+ *q13s16 = vcombine_s16(d19s16, d27s16);  // high halves of rows 1 and 5
+ *q14s16 = vcombine_s16(d21s16, d29s16);  // high halves of rows 2 and 6
+ *q15s16 = vcombine_s16(d23s16, d31s16);  // high halves of rows 3 and 7
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),  // exchange 32-bit lane pairs between vectors two apart
+ vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+ vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+ vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+ vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];  // each output vector now holds one column of the input block
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
+static INLINE void IDCT8x8_1D(  // One 1-D 8-point inverse DCT pass over eight int16x8 vectors, in place.
+ int16x8_t *q8s16,  // in/out: element 0 (each vector carries eight independent lanes)
+ int16x8_t *q9s16,  // in/out: element 1
+ int16x8_t *q10s16,  // in/out: element 2
+ int16x8_t *q11s16,  // in/out: element 3
+ int16x8_t *q12s16,  // in/out: element 4
+ int16x8_t *q13s16,  // in/out: element 5
+ int16x8_t *q14s16,  // in/out: element 6
+ int16x8_t *q15s16) {  // in/out: element 7
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d16s16 = vget_low_s16(*q8s16);  // every input is processed as separate low/high 4-lane halves
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ q2s32 = vmull_s16(d18s16, d0s16);  // in1 * cospi_28 ...
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d26s16, d2s16);  // in5 * cospi_12 ...
+ q6s32 = vmull_s16(d27s16, d2s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);  // ... - in7 * cospi_4
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);  // ... - in3 * cospi_20
+ q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);  // rounding >> 14 with saturating narrow to int16
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q2s32 = vmull_s16(d18s16, d1s16);  // in1 * cospi_4 ...
+ q3s32 = vmull_s16(d19s16, d1s16);
+ q9s32 = vmull_s16(d26s16, d3s16);  // in5 * cospi_20 ...
+ q13s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlal_s16(q2s32, d30s16, d0s16);  // ... + in7 * cospi_28
+ q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);  // ... + in3 * cospi_12
+ q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+ d14s16 = vqrshrn_n_s32(q2s32, 14);
+ d15s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q13s32, 14);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d0s16 = vdup_n_s16(cospi_16_64);  // d0 is reused for a different constant from here on
+
+ q2s32 = vmull_s16(d16s16, d0s16);  // in0 * cospi_16 ...
+ q3s32 = vmull_s16(d17s16, d0s16);
+ q13s32 = vmull_s16(d16s16, d0s16);
+ q15s32 = vmull_s16(d17s16, d0s16);
+
+ q2s32 = vmlal_s16(q2s32, d24s16, d0s16);  // ... + in4 * cospi_16
+ q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);  // ... - in4 * cospi_16
+ q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+ d0s16 = vdup_n_s16(cospi_24_64);  // d0/d1 are reused again for the in2/in6 rotation
+ d1s16 = vdup_n_s16(cospi_8_64);
+
+ d18s16 = vqrshrn_n_s32(q2s32, 14);
+ d19s16 = vqrshrn_n_s32(q3s32, 14);
+ d22s16 = vqrshrn_n_s32(q13s32, 14);
+ d23s16 = vqrshrn_n_s32(q15s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q2s32 = vmull_s16(d20s16, d0s16);  // in2 * cospi_24 ...
+ q3s32 = vmull_s16(d21s16, d0s16);
+ q8s32 = vmull_s16(d20s16, d1s16);  // in2 * cospi_8 ...
+ q12s32 = vmull_s16(d21s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);  // ... - in6 * cospi_8
+ q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+ q8s32 = vmlal_s16(q8s32, d28s16, d0s16);  // ... + in6 * cospi_24
+ q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+ d26s16 = vqrshrn_n_s32(q2s32, 14);
+ d27s16 = vqrshrn_n_s32(q3s32, 14);
+ d30s16 = vqrshrn_n_s32(q8s32, 14);
+ d31s16 = vqrshrn_n_s32(q12s32, 14);
+ *q13s16 = vcombine_s16(d26s16, d27s16);
+ *q15s16 = vcombine_s16(d30s16, d31s16);
+
+ q0s16 = vaddq_s16(*q9s16, *q15s16);  // butterflies on the terms derived from inputs 0, 2, 4 and 6
+ q1s16 = vaddq_s16(*q11s16, *q13s16);
+ q2s16 = vsubq_s16(*q11s16, *q13s16);
+ q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+ *q13s16 = vsubq_s16(q4s16, q5s16);  // butterflies on the terms derived from inputs 1, 3, 5 and 7
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ *q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);  // d16 no longer holds input data; it is a constant from here on
+
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);  // (*q14s16 - *q13s16) * cospi_16
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);  // (*q14s16 + *q13s16) * cospi_16
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ *q8s16 = vaddq_s16(q0s16, q7s16);  // output butterflies: sums go to *q8..*q11, the matching differences to *q15..*q12
+ *q9s16 = vaddq_s16(q1s16, q6s16);
+ *q10s16 = vaddq_s16(q2s16, q5s16);
+ *q11s16 = vaddq_s16(q3s16, q4s16);
+ *q12s16 = vsubq_s16(q3s16, q4s16);
+ *q13s16 = vsubq_s16(q2s16, q5s16);
+ *q14s16 = vsubq_s16(q1s16, q6s16);
+ *q15s16 = vsubq_s16(q0s16, q7s16);
+ return;
+}
+
+static INLINE void IADST8X8_1D(  // One 1-D 8-point pass over eight int16x8 vectors, in place; used as the inverse ADST stage.
+ int16x8_t *q8s16,  // in/out: element 0 (each vector carries eight independent lanes)
+ int16x8_t *q9s16,  // in/out: element 1
+ int16x8_t *q10s16,  // in/out: element 2
+ int16x8_t *q11s16,  // in/out: element 3
+ int16x8_t *q12s16,  // in/out: element 4
+ int16x8_t *q13s16,  // in/out: element 5
+ int16x8_t *q14s16,  // in/out: element 6
+ int16x8_t *q15s16) {  // in/out: element 7
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q2s16, q4s16, q5s16, q6s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
+ int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+ d16s16 = vget_low_s16(*q8s16);  // every input is processed as separate low/high 4-lane halves
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ d14s16 = vdup_n_s16(cospi_2_64);
+ d15s16 = vdup_n_s16(cospi_30_64);
+
+ q1s32 = vmull_s16(d30s16, d14s16);  // in7 * cospi_2 ...
+ q2s32 = vmull_s16(d31s16, d14s16);
+ q3s32 = vmull_s16(d30s16, d15s16);  // in7 * cospi_30 ...
+ q4s32 = vmull_s16(d31s16, d15s16);
+
+ d30s16 = vdup_n_s16(cospi_18_64);  // d30/d31 become constants; in7 was fully consumed above
+ d31s16 = vdup_n_s16(cospi_14_64);
+
+ q1s32 = vmlal_s16(q1s32, d16s16, d15s16);  // ... + in0 * cospi_30
+ q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
+ q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);  // ... - in0 * cospi_2
+ q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
+
+ q5s32 = vmull_s16(d22s16, d30s16);  // in3 * cospi_18 ...
+ q6s32 = vmull_s16(d23s16, d30s16);
+ q7s32 = vmull_s16(d22s16, d31s16);  // in3 * cospi_14 ...
+ q8s32 = vmull_s16(d23s16, d31s16);
+
+ q5s32 = vmlal_s16(q5s32, d24s16, d31s16);  // ... + in4 * cospi_14
+ q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
+ q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);  // ... - in4 * cospi_18
+ q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
+
+ q11s32 = vaddq_s32(q1s32, q5s32);  // sums and differences are formed at 32-bit precision before rounding
+ q12s32 = vaddq_s32(q2s32, q6s32);
+ q1s32 = vsubq_s32(q1s32, q5s32);
+ q2s32 = vsubq_s32(q2s32, q6s32);
+
+ d22s16 = vqrshrn_n_s32(q11s32, 14);  // rounding >> 14 with saturating narrow to int16
+ d23s16 = vqrshrn_n_s32(q12s32, 14);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q12s32 = vaddq_s32(q3s32, q7s32);
+ q15s32 = vaddq_s32(q4s32, q8s32);
+ q3s32 = vsubq_s32(q3s32, q7s32);
+ q4s32 = vsubq_s32(q4s32, q8s32);
+
+ d2s16 = vqrshrn_n_s32(q1s32, 14);
+ d3s16 = vqrshrn_n_s32(q2s32, 14);
+ d24s16 = vqrshrn_n_s32(q12s32, 14);
+ d25s16 = vqrshrn_n_s32(q15s32, 14);
+ d6s16 = vqrshrn_n_s32(q3s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ *q12s16 = vcombine_s16(d24s16, d25s16);
+
+ d0s16 = vdup_n_s16(cospi_10_64);
+ d1s16 = vdup_n_s16(cospi_22_64);
+ q4s32 = vmull_s16(d26s16, d0s16);  // in5 * cospi_10 ...
+ q5s32 = vmull_s16(d27s16, d0s16);
+ q2s32 = vmull_s16(d26s16, d1s16);  // in5 * cospi_22 ...
+ q6s32 = vmull_s16(d27s16, d1s16);
+
+ d30s16 = vdup_n_s16(cospi_26_64);
+ d31s16 = vdup_n_s16(cospi_6_64);
+
+ q4s32 = vmlal_s16(q4s32, d20s16, d1s16);  // ... + in2 * cospi_22
+ q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
+ q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);  // ... - in2 * cospi_10
+ q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
+
+ q0s32 = vmull_s16(d18s16, d30s16);  // in1 * cospi_26 ...
+ q13s32 = vmull_s16(d19s16, d30s16);
+
+ q0s32 = vmlal_s16(q0s32, d28s16, d31s16);  // ... + in6 * cospi_6
+ q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
+
+ q10s32 = vmull_s16(d18s16, d31s16);  // in1 * cospi_6 ...
+ q9s32 = vmull_s16(d19s16, d31s16);
+
+ q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);  // ... - in6 * cospi_26
+ q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
+
+ q14s32 = vaddq_s32(q2s32, q10s32);
+ q15s32 = vaddq_s32(q6s32, q9s32);
+ q2s32 = vsubq_s32(q2s32, q10s32);
+ q6s32 = vsubq_s32(q6s32, q9s32);
+
+ d28s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d4s16 = vqrshrn_n_s32(q2s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ *q14s16 = vcombine_s16(d28s16, d29s16);
+
+ q9s32 = vaddq_s32(q4s32, q0s32);
+ q10s32 = vaddq_s32(q5s32, q13s32);
+ q4s32 = vsubq_s32(q4s32, q0s32);
+ q5s32 = vsubq_s32(q5s32, q13s32);
+
+ d30s16 = vdup_n_s16(cospi_8_64);  // constants for the next group of multiplies
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ d18s16 = vqrshrn_n_s32(q9s32, 14);
+ d19s16 = vqrshrn_n_s32(q10s32, 14);
+ d8s16 = vqrshrn_n_s32(q4s32, 14);
+ d9s16 = vqrshrn_n_s32(q5s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q5s32 = vmull_s16(d2s16, d30s16);
+ q6s32 = vmull_s16(d3s16, d30s16);
+ q7s32 = vmull_s16(d2s16, d31s16);
+ q0s32 = vmull_s16(d3s16, d31s16);
+
+ q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
+ q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
+ q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
+ q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
+
+ q1s32 = vmull_s16(d4s16, d30s16);
+ q3s32 = vmull_s16(d5s16, d30s16);
+ q10s32 = vmull_s16(d4s16, d31s16);
+ q2s32 = vmull_s16(d5s16, d31s16);
+
+ q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
+ q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
+ q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
+ q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
+
+ *q8s16 = vaddq_s16(*q11s16, *q9s16);  // output 0 is final here
+ *q11s16 = vsubq_s16(*q11s16, *q9s16);
+ q4s16 = vaddq_s16(*q12s16, *q14s16);  // negated into *q15s16 at the end
+ *q12s16 = vsubq_s16(*q12s16, *q14s16);
+
+ q14s32 = vaddq_s32(q5s32, q1s32);
+ q15s32 = vaddq_s32(q6s32, q3s32);
+ q5s32 = vsubq_s32(q5s32, q1s32);
+ q6s32 = vsubq_s32(q6s32, q3s32);
+
+ d18s16 = vqrshrn_n_s32(q14s32, 14);
+ d19s16 = vqrshrn_n_s32(q15s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);  // negated in place at the end
+
+ q1s32 = vaddq_s32(q7s32, q10s32);
+ q3s32 = vaddq_s32(q0s32, q2s32);
+ q7s32 = vsubq_s32(q7s32, q10s32);
+ q0s32 = vsubq_s32(q0s32, q2s32);
+
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ d29s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q7s32, 14);
+ d15s16 = vqrshrn_n_s32(q0s32, 14);
+ *q14s16 = vcombine_s16(d28s16, d29s16);  // output 6 is final here
+
+ d30s16 = vdup_n_s16(cospi_16_64);  // the remaining multiplies all use cospi_16_64
+
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ q2s32 = vmull_s16(d22s16, d30s16);
+ q3s32 = vmull_s16(d23s16, d30s16);
+ q13s32 = vmull_s16(d22s16, d30s16);
+ q1s32 = vmull_s16(d23s16, d30s16);
+
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ q2s32 = vmlal_s16(q2s32, d24s16, d30s16);  // (*q11s16 + *q12s16) * cospi_16
+ q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);  // (*q11s16 - *q12s16) * cospi_16
+ q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
+
+ d4s16 = vqrshrn_n_s32(q2s32, 14);
+ d5s16 = vqrshrn_n_s32(q3s32, 14);
+ d24s16 = vqrshrn_n_s32(q13s32, 14);
+ d25s16 = vqrshrn_n_s32(q1s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ *q12s16 = vcombine_s16(d24s16, d25s16);  // output 4 is final here
+
+ q13s32 = vmull_s16(d10s16, d30s16);
+ q1s32 = vmull_s16(d11s16, d30s16);
+ q11s32 = vmull_s16(d10s16, d30s16);
+ q0s32 = vmull_s16(d11s16, d30s16);
+
+ q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
+ q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
+ q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
+ q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
+
+ d20s16 = vqrshrn_n_s32(q13s32, 14);
+ d21s16 = vqrshrn_n_s32(q1s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q0s32, 14);
+ *q10s16 = vcombine_s16(d20s16, d21s16);  // output 2 is final here
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ q5s16 = vdupq_n_s16(0);  // zero vector used to negate the odd-numbered outputs
+
+ *q9s16 = vsubq_s16(q5s16, *q9s16);  // output 1 = -(*q9s16)
+ *q11s16 = vsubq_s16(q5s16, q2s16);  // output 3 = -q2s16
+ *q13s16 = vsubq_s16(q5s16, q6s16);  // output 5 = -q6s16
+ *q15s16 = vsubq_s16(q5s16, q4s16);  // output 7 = -q4s16
+ return;
+}
+
+void vp10_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,  // 8x8 inverse hybrid transform of input; the result is added into dest.
+ int dest_stride, int tx_type) {  // dest_stride: bytes between dest rows; tx_type: 0..3 selects the row/column transforms
+ int i;
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ q8s16 = vld1q_s16(input);  // NOTE(review): assumes tran_low_t is int16_t -- confirm for high-bitdepth builds
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 8 * 2);
+ q11s16 = vld1q_s16(input + 8 * 3);
+ q12s16 = vld1q_s16(input + 8 * 4);
+ q13s16 = vld1q_s16(input + 8 * 5);
+ q14s16 = vld1q_s16(input + 8 * 6);
+ q15s16 = vld1q_s16(input + 8 * 7);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,  // done before the tx_type check, so it also runs on the C fallback path
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ switch (tx_type) {
+ case 0: // idct_idct is not supported. Fall back to C
+ vp10_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
+ return;
+ break;  // unreachable after the return above
+ case 1: // iadst_idct
+ // generate IDCT constants
+ // GENERATE_IDCT_CONSTANTS
+
+ // first transform rows
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // then transform columns
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ case 2: // idct_iadst
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // first transform rows
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // generate IDCT constants
+ // GENERATE_IDCT_CONSTANTS
+
+ // then transform columns
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ case 3: // iadst_iadst
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // first transform rows
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // then transform columns
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ default: // unexpected tx_type
+ assert(0);  // if assert is compiled out (NDEBUG), control continues to the add/store below
+ break;
+ }
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);  // rounding >> 5 of the transform output
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ for (d1 = d2 = dest, i = 0; i < 2; i++) {  // two passes of four rows each; d1 is the read cursor, d2 the write cursor
+ if (i != 0) {  // second pass: move rows 4-7 into the working vectors
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+ }
+
+ d0u64 = vld1_u64((uint64_t *)d1);  // load 8 bytes from each of four dest rows
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),  // residual + widened dest pixels
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));  // saturate the sums to 0..255
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));  // write the four reconstructed rows back
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ }
+ return;
+}
--- /dev/null
+++ b/vp10/common/mips/dspr2/vp9_itrans16_dspr2.c
@@ -1,0 +1,108 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp10_rtcd.h"
+#include "vp10/common/vp9_common.h"
+#include "vp10/common/vp9_blockd.h"
+#include "vp10/common/vp9_idct.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+/*
+ * 16x16 inverse hybrid transform with reconstruction, MIPS DSPr2 version.
+ *
+ * Runs a 1-D inverse transform over the 16 rows of `input`, a second 1-D
+ * inverse transform over the intermediate block, and adds the result to the
+ * 8-bit pixels already stored in `dest`.
+ *
+ *   input   - 16x16 coefficient block, 16 values per row.
+ *   dest    - destination pixels; read, updated and written back.
+ *   pitch   - distance between consecutive rows of dest.
+ *   tx_type - DCT_DCT, ADST_DCT, DCT_ADST or ADST_ADST. Any other value only
+ *             prints a diagnostic and leaves dest untouched.
+ */
+void vp10_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+                                 int pitch, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+  int16_t *outptr = out;
+  int16_t temp_out[16];
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp %[pos], 1 \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:  // DCT in both horizontal and vertical
+      idct16_rows_dspr2(input, outptr, 16);
+      idct16_cols_add_blk_dspr2(out, dest, pitch);
+      break;
+    case ADST_DCT:  // ADST in vertical, DCT in horizontal
+      idct16_rows_dspr2(input, outptr, 16);
+
+      outptr = out;
+
+      /* NOTE(review): each run of 16 intermediate values is consumed as one
+       * destination column here, so the row helper presumably stores its
+       * output transposed - confirm against idct16_rows_dspr2(). */
+      for (i = 0; i < 16; ++i) {
+        iadst16_dspr2(outptr, temp_out);
+
+        /* round the residual by 2^6, add to the prediction and clip */
+        for (j = 0; j < 16; ++j)
+          dest[j * pitch + i] =
+              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                         + dest[j * pitch + i]);
+        outptr += 16;
+      }
+      break;
+    case DCT_ADST:  // DCT in vertical, ADST in horizontal
+      {
+        int16_t temp_in[16 * 16];
+
+        for (i = 0; i < 16; ++i) {
+          /* prefetch row */
+          prefetch_load((const uint8_t *)(input + 16));
+
+          iadst16_dspr2(input, outptr);
+          input += 16;
+          outptr += 16;
+        }
+
+        /* transpose the row results before handing them to the column pass */
+        for (i = 0; i < 16; ++i)
+          for (j = 0; j < 16; ++j)
+            temp_in[j * 16 + i] = out[i * 16 + j];
+
+        idct16_cols_add_blk_dspr2(temp_in, dest, pitch);
+      }
+      break;
+    case ADST_ADST:  // ADST in both directions
+      {
+        int16_t temp_in[16];
+
+        for (i = 0; i < 16; ++i) {
+          /* prefetch row */
+          prefetch_load((const uint8_t *)(input + 16));
+
+          iadst16_dspr2(input, outptr);
+          input += 16;
+          outptr += 16;
+        }
+
+        for (i = 0; i < 16; ++i) {
+          /* gather column i of the row-transformed block */
+          for (j = 0; j < 16; ++j)
+            temp_in[j] = out[j * 16 + i];
+          iadst16_dspr2(temp_in, temp_out);
+          for (j = 0; j < 16; ++j)
+            dest[j * pitch + i] =
+                clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                           + dest[j * pitch + i]);
+        }
+      }
+      break;
+    default:
+      /* report the real function name so the message can be grepped */
+      printf("vp10_iht16x16_256_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+#endif // #if HAVE_DSPR2
--- /dev/null
+++ b/vp10/common/mips/dspr2/vp9_itrans4_dspr2.c
@@ -1,0 +1,97 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp10_rtcd.h"
+#include "vp10/common/vp9_common.h"
+#include "vp10/common/vp9_blockd.h"
+#include "vp10/common/vp9_idct.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+/*
+ * 4x4 inverse hybrid transform with reconstruction, MIPS DSPr2 version.
+ *
+ * Runs a 1-D inverse transform over the 4 rows of `input`, a second 1-D
+ * inverse transform over the intermediate block, and adds the result to the
+ * 8-bit pixels already stored in `dest`.
+ *
+ *   input       - 4x4 coefficient block, 4 values per row.
+ *   dest        - destination pixels; read, updated and written back.
+ *   dest_stride - distance between consecutive rows of dest.
+ *   tx_type     - DCT_DCT, ADST_DCT, DCT_ADST or ADST_ADST. Any other value
+ *                 only prints a diagnostic and leaves dest untouched.
+ */
+void vp10_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+  int16_t *outptr = out;
+  int16_t temp_in[4 * 4], temp_out[4];
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp %[pos], 1 \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:  // DCT in both horizontal and vertical
+      vpx_idct4_rows_dspr2(input, outptr);
+      vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      break;
+    case ADST_DCT:  // ADST in vertical, DCT in horizontal
+      vpx_idct4_rows_dspr2(input, outptr);
+
+      outptr = out;
+
+      for (i = 0; i < 4; ++i) {
+        iadst4_dspr2(outptr, temp_out);
+
+        /* round the residual by 2^4, add to the prediction and clip */
+        for (j = 0; j < 4; ++j)
+          dest[j * dest_stride + i] =
+              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                         + dest[j * dest_stride + i]);
+
+        outptr += 4;
+      }
+      break;
+    case DCT_ADST:  // DCT in vertical, ADST in horizontal
+      for (i = 0; i < 4; ++i) {
+        iadst4_dspr2(input, outptr);
+        input += 4;
+        outptr += 4;
+      }
+
+      /* transpose the row results before handing them to the column pass */
+      for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 4; ++j) {
+          temp_in[i * 4 + j] = out[j * 4 + i];
+        }
+      }
+      vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      break;
+    case ADST_ADST:  // ADST in both directions
+      for (i = 0; i < 4; ++i) {
+        iadst4_dspr2(input, outptr);
+        input += 4;
+        outptr += 4;
+      }
+
+      for (i = 0; i < 4; ++i) {
+        /* gather column i of the row-transformed block */
+        for (j = 0; j < 4; ++j)
+          temp_in[j] = out[j * 4 + i];
+        iadst4_dspr2(temp_in, temp_out);
+
+        for (j = 0; j < 4; ++j)
+          dest[j * dest_stride + i] =
+              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                         + dest[j * dest_stride + i]);
+      }
+      break;
+    default:
+      /* report the real function name so the message can be grepped */
+      printf("vp10_iht4x4_16_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+#endif // #if HAVE_DSPR2
--- /dev/null
+++ b/vp10/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -1,0 +1,93 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp10_rtcd.h"
+#include "vp10/common/vp9_common.h"
+#include "vp10/common/vp9_blockd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+/*
+ * 8x8 inverse hybrid transform with reconstruction, MIPS DSPr2 version.
+ *
+ * Runs a 1-D inverse transform over the 8 rows of `input`, a second 1-D
+ * inverse transform over the intermediate block, and adds the result to the
+ * 8-bit pixels already stored in `dest`.
+ *
+ *   input       - 8x8 coefficient block, 8 values per row.
+ *   dest        - destination pixels; read, updated and written back.
+ *   dest_stride - distance between consecutive rows of dest.
+ *   tx_type     - DCT_DCT, ADST_DCT, DCT_ADST or ADST_ADST. Any other value
+ *                 only prints a diagnostic and leaves dest untouched.
+ */
+void vp10_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  int16_t *outptr = out;
+  int16_t temp_in[8 * 8], temp_out[8];
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp %[pos], 1 \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:  // DCT in both horizontal and vertical
+      idct8_rows_dspr2(input, outptr, 8);
+      idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      break;
+    case ADST_DCT:  // ADST in vertical, DCT in horizontal
+      idct8_rows_dspr2(input, outptr, 8);
+
+      for (i = 0; i < 8; ++i) {
+        iadst8_dspr2(&out[i * 8], temp_out);
+
+        /* round the residual by 2^5, add to the prediction and clip */
+        for (j = 0; j < 8; ++j)
+          dest[j * dest_stride + i] =
+              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                         + dest[j * dest_stride + i]);
+      }
+      break;
+    case DCT_ADST:  // DCT in vertical, ADST in horizontal
+      for (i = 0; i < 8; ++i) {
+        iadst8_dspr2(input, outptr);
+        input += 8;
+        outptr += 8;
+      }
+
+      /* transpose the row results before handing them to the column pass */
+      for (i = 0; i < 8; ++i) {
+        for (j = 0; j < 8; ++j) {
+          temp_in[i * 8 + j] = out[j * 8 + i];
+        }
+      }
+      idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      break;
+    case ADST_ADST:  // ADST in both directions
+      for (i = 0; i < 8; ++i) {
+        iadst8_dspr2(input, outptr);
+        input += 8;
+        outptr += 8;
+      }
+
+      for (i = 0; i < 8; ++i) {
+        /* gather column i of the row-transformed block */
+        for (j = 0; j < 8; ++j)
+          temp_in[j] = out[j * 8 + i];
+
+        iadst8_dspr2(temp_in, temp_out);
+
+        for (j = 0; j < 8; ++j)
+          dest[j * dest_stride + i] =
+              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                         + dest[j * dest_stride + i]);
+      }
+      break;
+    default:
+      /* report the real function name so the message can be grepped */
+      printf("vp10_iht8x8_64_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+#endif // #if HAVE_DSPR2
--- /dev/null
+++ b/vp10/common/mips/msa/vp9_idct16x16_msa.c
@@ -1,0 +1,81 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/vp9_enums.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+/*
+ * 16x16 inverse hybrid transform with reconstruction, MIPS MSA version.
+ *
+ * tx_type selects, independently for the row pass and the column pass,
+ * whether the inverse DCT or the inverse ADST helper is used. The column
+ * helpers add their output into dst. An unknown tx_type trips assert(0) and
+ * otherwise does nothing.
+ */
+void vp10_iht16x16_256_add_msa(const int16_t *input, uint8_t *dst,
+                               int32_t dst_stride, int32_t tx_type) {
+  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+  int32_t adst_rows, adst_cols;
+  int32_t half;
+
+  /* decode tx_type into one flag per pass */
+  switch (tx_type) {
+    case DCT_DCT:
+      adst_rows = 0;
+      adst_cols = 0;
+      break;
+    case ADST_DCT:
+      adst_rows = 0;
+      adst_cols = 1;
+      break;
+    case DCT_ADST:
+      adst_rows = 1;
+      adst_cols = 0;
+      break;
+    case ADST_ADST:
+      adst_rows = 1;
+      adst_cols = 1;
+      break;
+    default:
+      assert(0);
+      return;
+  }
+
+  /* transform rows: two calls, each covering a 16 * 8 block (128 values) */
+  for (half = 0; half < 2; ++half) {
+    const int16_t *row_in = input + half * 128;
+    int16_t *row_out = &out[0] + half * 128;
+
+    if (adst_rows) {
+      vpx_iadst16_1d_rows_msa(row_in, row_out);
+    } else {
+      vpx_idct16_1d_rows_msa(row_in, row_out);
+    }
+  }
+
+  /* transform columns: two calls, each covering an 8 * 16 block */
+  for (half = 0; half < 2; ++half) {
+    int16_t *col_in = &out[0] + half * 8;
+    uint8_t *col_dst = dst + half * 8;
+
+    if (adst_cols) {
+      vpx_iadst16_1d_columns_addblk_msa(col_in, col_dst, dst_stride);
+    } else {
+      vpx_idct16_1d_columns_addblk_msa(col_in, col_dst, dst_stride);
+    }
+  }
+}
--- /dev/null
+++ b/vp10/common/mips/msa/vp9_idct4x4_msa.c
@@ -1,0 +1,62 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/vp9_enums.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+/*
+ * 4x4 inverse hybrid transform with reconstruction, MIPS MSA version.
+ *
+ * Loads the 4x4 coefficient block from `input`, applies the pair of 1-D
+ * inverse transforms selected by tx_type (with a transpose between the two
+ * passes), rounds the result by 2^4 and adds it into the block at `dst`.
+ * An unknown tx_type trips assert(0); the rounding and store below the
+ * switch still execute when asserts are compiled out.
+ */
+void vp10_iht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride, int32_t tx_type) {
+ v8i16 in0, in1, in2, in3;
+
+ /* load vector elements of 4x4 block */
+ LD4x4_SH(input, in0, in1, in2, in3);
+ /* transpose once up front; every case transposes again between passes */
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ /* DCT in horizontal */
+ VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ /* DCT in vertical */
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case ADST_DCT:
+ /* DCT in horizontal */
+ VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ /* ADST in vertical */
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case DCT_ADST:
+ /* ADST in horizontal */
+ VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ /* DCT in vertical */
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case ADST_ADST:
+ /* ADST in horizontal */
+ VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ /* ADST in vertical */
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ default:
+ /* unsupported transform type: programming error */
+ assert(0);
+ break;
+ }
+
+ /* final rounding (add 2^3, divide by 2^4) and shift */
+ SRARI_H4_SH(in0, in1, in2, in3, 4);
+ /* add block and store 4x4 */
+ ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
--- /dev/null
+++ b/vp10/common/mips/msa/vp9_idct8x8_msa.c
@@ -1,0 +1,80 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/vp9_enums.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+/*
+ * 8x8 inverse hybrid transform with reconstruction, MIPS MSA version.
+ *
+ * Loads eight vectors from `input`, applies the pair of 1-D inverse
+ * transforms selected by tx_type (with a transpose between the two passes),
+ * rounds the result by 2^5 and adds it into the block at `dst`, four rows at
+ * a time. An unknown tx_type trips assert(0); the rounding and store below
+ * the switch still execute when asserts are compiled out.
+ */
+void vp10_iht8x8_64_add_msa(const int16_t *input, uint8_t *dst,
+ int32_t dst_stride, int32_t tx_type) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+ /* load vector elements of 8x8 block */
+ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+
+ /* transpose once up front; every case transposes again between passes */
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ /* DCT in horizontal */
+ VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ /* DCT in vertical */
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ break;
+ case ADST_DCT:
+ /* DCT in horizontal */
+ VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ /* ADST in vertical */
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ break;
+ case DCT_ADST:
+ /* ADST in horizontal */
+ VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ /* DCT in vertical */
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ break;
+ case ADST_ADST:
+ /* ADST in horizontal */
+ VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ /* ADST in vertical */
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ break;
+ default:
+ /* unsupported transform type: programming error */
+ assert(0);
+ break;
+ }
+
+ /* final rounding (add 2^4, divide by 2^5) and shift */
+ SRARI_H4_SH(in0, in1, in2, in3, 5);
+ SRARI_H4_SH(in4, in5, in6, in7, 5);
+
+ /* add block and store 8x8 */
+ VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+ /* advance to the lower four rows of the block */
+ dst += (4 * dst_stride);
+ VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
--- /dev/null
+++ b/vp10/common/mips/msa/vp9_mfqe_msa.c
@@ -1,0 +1,137 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/vp9_onyxc_int.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+/*
+ * Blends an 8x8 block of src into dst in place using MSA vectors:
+ * each output pixel is (src * src_weight + dst * dst_weight) shifted right
+ * by MFQE_PRECISION via SRARI_H2_SH, where
+ * dst_weight = (1 << MFQE_PRECISION) - src_weight.
+ * The loop runs twice and handles four rows per iteration.
+ */
+static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+ uint8_t *dst_ptr, int32_t dst_stride,
+ int32_t src_weight) {
+ /* the two weights always sum to 1 << MFQE_PRECISION */
+ int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+ int32_t row;
+ uint64_t src0_d, src1_d, dst0_d, dst1_d;
+ v16i8 src0 = { 0 };
+ v16i8 src1 = { 0 };
+ v16i8 dst0 = { 0 };
+ v16i8 dst1 = { 0 };
+ v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+
+ /* broadcast each weight to every 16-bit lane */
+ src_wt = __msa_fill_h(src_weight);
+ dst_wt = __msa_fill_h(dst_weight);
+
+ for (row = 2; row--;) {
+ /* rows 0-1 of this group: two 64-bit loads packed into one vector each */
+ LD2(src_ptr, src_stride, src0_d, src1_d);
+ src_ptr += (2 * src_stride);
+ LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
+ INSERT_D2_SB(src0_d, src1_d, src0);
+ INSERT_D2_SB(dst0_d, dst1_d, dst0);
+
+ /* rows 2-3 of this group; dst_ptr itself is advanced only after stores */
+ LD2(src_ptr, src_stride, src0_d, src1_d);
+ src_ptr += (2 * src_stride);
+ LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
+ INSERT_D2_SB(src0_d, src1_d, src1);
+ INSERT_D2_SB(dst0_d, dst1_d, dst1);
+
+ /* widen to 16 bits, apply the weights, round, repack and store rows 0-1 */
+ UNPCK_UB_SH(src0, src_r, src_l);
+ UNPCK_UB_SH(dst0, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+ ST8x2_UB(dst0, dst_ptr, dst_stride);
+ dst_ptr += (2 * dst_stride);
+
+ /* same blend for rows 2-3 */
+ UNPCK_UB_SH(src1, src_r, src_l);
+ UNPCK_UB_SH(dst1, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+ ST8x2_UB(dst1, dst_ptr, dst_stride);
+ dst_ptr += (2 * dst_stride);
+ }
+}
+
+/*
+ * Blends a 16x16 block of src into dst in place using MSA vectors:
+ * each output pixel is (src * src_weight + dst * dst_weight) shifted right
+ * by MFQE_PRECISION via SRARI_H2_SH, where
+ * dst_weight = (1 << MFQE_PRECISION) - src_weight.
+ * The loop runs four times and handles four full rows per iteration.
+ */
+static void filter_by_weight16x16_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ int32_t src_weight) {
+ /* the two weights always sum to 1 << MFQE_PRECISION */
+ int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+ int32_t row;
+ v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+
+ /* broadcast each weight to every 16-bit lane */
+ src_wt = __msa_fill_h(src_weight);
+ dst_wt = __msa_fill_h(dst_weight);
+
+ for (row = 4; row--;) {
+ /* load four rows of source and destination up front */
+ LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
+
+ /* row 0: widen to 16 bits, apply the weights, round, repack and store */
+ UNPCK_UB_SH(src0, src_r, src_l);
+ UNPCK_UB_SH(dst0, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+ dst_ptr += dst_stride;
+
+ /* row 1 */
+ UNPCK_UB_SH(src1, src_r, src_l);
+ UNPCK_UB_SH(dst1, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+ dst_ptr += dst_stride;
+
+ /* row 2 */
+ UNPCK_UB_SH(src2, src_r, src_l);
+ UNPCK_UB_SH(dst2, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+ dst_ptr += dst_stride;
+
+ /* row 3 */
+ UNPCK_UB_SH(src3, src_r, src_l);
+ UNPCK_UB_SH(dst3, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+ dst_ptr += dst_stride;
+ }
+}
+
+/*
+ * Externally visible MSA entry point for the 8x8 weighted blend; forwards
+ * every argument unchanged to the file-local filter_by_weight8x8_msa().
+ */
+void vp10_filter_by_weight8x8_msa(const uint8_t *src,
+                                  int src_stride,
+                                  uint8_t *dst,
+                                  int dst_stride,
+                                  int src_weight) {
+  filter_by_weight8x8_msa(src, src_stride,
+                          dst, dst_stride,
+                          src_weight);
+}
+
+/*
+ * Externally visible MSA entry point for the 16x16 weighted blend; forwards
+ * every argument unchanged to the file-local filter_by_weight16x16_msa().
+ */
+void vp10_filter_by_weight16x16_msa(const uint8_t *src,
+                                    int src_stride,
+                                    uint8_t *dst,
+                                    int dst_stride,
+                                    int src_weight) {
+  filter_by_weight16x16_msa(src, src_stride,
+                            dst, dst_stride,
+                            src_weight);
+}
--- /dev/null
+++ b/vp10/common/vp10_rtcd.c
@@ -1,0 +1,19 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vp10_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+/*
+ * Runs setup_rtcd_internal through once(). Declared with an explicit (void)
+ * parameter list so the definition is a real C prototype.
+ */
+void vp10_rtcd(void) {
+  // TODO(JBB): Remove this once, by ensuring that both the encoder and
+  // decoder setup functions are protected by once();
+  once(setup_rtcd_internal);
+}
--- /dev/null
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -1,0 +1,362 @@
+# Prints the C preamble for the generated vp10 RTCD header: the includes and
+# the forward declarations of the struct/union types named in the prototypes
+# registered below. The banner says VP10 so the generated header is not
+# mistaken for the vp9 one.
+sub vp10_common_forward_decls() {
+print <<EOF
+/*
+ * VP10
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp10/common/vp9_common.h"
+#include "vp10/common/vp9_enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+# register the sub by name with the RTCD generator
+forward_decls qw/vp10_common_forward_decls/;
+
+# x86inc.asm has specific constraints, so the instruction-set names that
+# depend on it are kept in their own variables; this makes them easy to
+# disable as a group.
+# Every variable starts out as the empty string to avoid tricky else
+# conditions; the sections below interpolate these variables into their
+# specialize lists.
+$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =
+ $avx2_x86inc = '';
+$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =
+ $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';
+# Filled in only when the build is configured with CONFIG_USE_X86INC.
+if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
+ $mmx_x86inc = 'mmx';
+ $sse_x86inc = 'sse';
+ $sse2_x86inc = 'sse2';
+ $ssse3_x86inc = 'ssse3';
+ $avx_x86inc = 'avx';
+ $avx2_x86inc = 'avx2';
+ # variants that need both x86inc and a 64-bit x86 target
+ if ($opts{arch} eq "x86_64") {
+ $mmx_x86_64_x86inc = 'mmx';
+ $sse_x86_64_x86inc = 'sse';
+ $sse2_x86_64_x86inc = 'sse2';
+ $ssse3_x86_64_x86inc = 'ssse3';
+ $avx_x86_64_x86inc = 'avx';
+ $avx2_x86_64_x86inc = 'avx2';
+ }
+}
+
+# functions that are 64 bit only.
+# These do not depend on CONFIG_USE_X86INC; they are empty unless the target
+# architecture is x86_64.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+ $mmx_x86_64 = 'mmx';
+ $sse2_x86_64 = 'sse2';
+ $ssse3_x86_64 = 'ssse3';
+ $avx_x86_64 = 'avx';
+ $avx2_x86_64 = 'avx2';
+}
+
+#
+# post proc
+#
+# Registered only when CONFIG_VP9_POSTPROC is enabled. Each add_proto gives
+# the return type, function name and C parameter list; the following
+# specialize line lists the optimized variants of that function.
+if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+add_proto qw/void vp10_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit";
+specialize qw/vp10_mbpost_proc_down sse2/;
+# The sse2 variant is bound to a symbol with an _xmm suffix rather than _sse2
+# (presumably the generator reads this variable as the symbol name - confirm).
+$vp10_mbpost_proc_down_sse2=vp10_mbpost_proc_down_xmm;
+
+add_proto qw/void vp10_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit";
+specialize qw/vp10_mbpost_proc_across_ip sse2/;
+# sse2 variant bound to the _xmm symbol, as above
+$vp10_mbpost_proc_across_ip_sse2=vp10_mbpost_proc_across_ip_xmm;
+
+add_proto qw/void vp10_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
+specialize qw/vp10_post_proc_down_and_across sse2/;
+# sse2 variant bound to the _xmm symbol, as above
+$vp10_post_proc_down_and_across_sse2=vp10_post_proc_down_and_across_xmm;
+
+add_proto qw/void vp10_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+specialize qw/vp10_plane_add_noise sse2/;
+# sse2 variant bound to a symbol with a _wmt suffix
+$vp10_plane_add_noise_sse2=vp10_plane_add_noise_wmt;
+
+# weighted-blend filters: sse2 and msa variants are listed
+add_proto qw/void vp10_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp10_filter_by_weight16x16 sse2 msa/;
+
+add_proto qw/void vp10_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp10_filter_by_weight8x8 sse2 msa/;
+}
+
+#
+# dct
+#
+# The inverse hybrid transforms are registered with the same prototypes in
+# every configuration; only the list of optimized variants changes.
+# Note as optimized versions of these functions are added for high bitdepth
+# we need to add a check to ensure that when CONFIG_EMULATE_HARDWARE is on,
+# it defaults to the C versions only.
+# C versions are forced when CONFIG_VP9_HIGHBITDEPTH is enabled or, failing
+# that, when CONFIG_EMULATE_HARDWARE is enabled.
+my $iht_c_only = vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes" ||
+                 vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes";
+my @iht_small_opts = $iht_c_only ? () : qw/sse2 neon dspr2 msa/;
+my @iht_16x16_opts = $iht_c_only ? () : qw/sse2 dspr2 msa/;
+
+add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+specialize qw/vp10_iht4x4_16_add/, @iht_small_opts;
+
+add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+specialize qw/vp10_iht8x8_64_add/, @iht_small_opts;
+
+add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+specialize qw/vp10_iht16x16_256_add/, @iht_16x16_opts;
+
+# High bitdepth functions
+# Everything in this block is registered only when CONFIG_VP9_HIGHBITDEPTH is
+# enabled. A specialize line with no variant names lists no optimized
+# versions for that function.
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ #
+ # Sub Pixel Filters
+ #
+ # All convolve prototypes share one parameter list, ending in the extra
+ # "int bps" argument. The convolve8 family lists "$sse2_x86_64", which is
+ # 'sse2' only on x86_64 targets and the empty string otherwise.
+ add_proto qw/void vp10_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vp10_highbd_convolve_copy/;
+
+ add_proto qw/void vp10_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vp10_highbd_convolve_avg/;
+
+ add_proto qw/void vp10_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vp10_highbd_convolve8/, "$sse2_x86_64";
+
+ add_proto qw/void vp10_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vp10_highbd_convolve8_horiz/, "$sse2_x86_64";
+
+ add_proto qw/void vp10_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vp10_highbd_convolve8_vert/, "$sse2_x86_64";
+
+ add_proto qw/void vp10_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vp10_highbd_convolve8_avg/, "$sse2_x86_64";
+
+ add_proto qw/void vp10_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vp10_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
+
+ add_proto qw/void vp10_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vp10_highbd_convolve8_avg_vert/, "$sse2_x86_64";
+
+ #
+ # post proc
+ #
+ # High-bitdepth post-processing, additionally gated on CONFIG_VP9_POSTPROC;
+ # no optimized variants are listed.
+ if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+ add_proto qw/void vp10_highbd_mbpost_proc_down/, "uint16_t *dst, int pitch, int rows, int cols, int flimit";
+ specialize qw/vp10_highbd_mbpost_proc_down/;
+
+ add_proto qw/void vp10_highbd_mbpost_proc_across_ip/, "uint16_t *src, int pitch, int rows, int cols, int flimit";
+ specialize qw/vp10_highbd_mbpost_proc_across_ip/;
+
+ add_proto qw/void vp10_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
+ specialize qw/vp10_highbd_post_proc_down_and_across/;
+
+ # NOTE(review): unlike the three prototypes above, this one takes
+ # "uint8_t *Start" rather than a uint16_t pointer - confirm it is intended.
+ add_proto qw/void vp10_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+ specialize qw/vp10_highbd_plane_add_noise/;
+ }
+
+ #
+ # dct
+ #
+ # Note as optimized versions of these functions are added we need to add a check to ensure
+ # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
+ # The high-bitdepth inverse transforms take an extra "int bd" argument.
+ add_proto qw/void vp10_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ specialize qw/vp10_highbd_iht4x4_16_add/;
+
+ add_proto qw/void vp10_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ specialize qw/vp10_highbd_iht8x8_64_add/;
+
+ add_proto qw/void vp10_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+ specialize qw/vp10_highbd_iht16x16_256_add/;
+}
+
+#
+# Encoder functions below this point.
+#
+if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
+
+add_proto qw/unsigned int vp10_avg_8x8/, "const uint8_t *, int p";
+specialize qw/vp10_avg_8x8 sse2 neon msa/;
+
+add_proto qw/unsigned int vp10_avg_4x4/, "const uint8_t *, int p";
+specialize qw/vp10_avg_4x4 sse2 msa/;
+
+add_proto qw/void vp10_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+specialize qw/vp10_minmax_8x8 sse2/;
+
+add_proto qw/void vp10_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp10_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
+
+add_proto qw/void vp10_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp10_hadamard_16x16 sse2/;
+
+add_proto qw/int16_t vp10_satd/, "const int16_t *coeff, int length";
+specialize qw/vp10_satd sse2/;
+
+add_proto qw/void vp10_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
+specialize qw/vp10_int_pro_row sse2 neon/;
+
+add_proto qw/int16_t vp10_int_pro_col/, "uint8_t const *ref, const int width";
+specialize qw/vp10_int_pro_col sse2 neon/;
+
+add_proto qw/int vp10_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
+specialize qw/vp10_vector_var neon sse2/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/unsigned int vp10_highbd_avg_8x8/, "const uint8_t *, int p";
+ specialize qw/vp10_highbd_avg_8x8/;
+ add_proto qw/unsigned int vp10_highbd_avg_4x4/, "const uint8_t *, int p";
+ specialize qw/vp10_highbd_avg_4x4/;
+ add_proto qw/void vp10_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize qw/vp10_highbd_minmax_8x8/;
+}
+
+# ENCODEMB INVOKE
+
+#
+# Denoiser
+#
+if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
+ add_proto qw/int vp10_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude";
+ specialize qw/vp10_denoiser_filter sse2/;
+}
+
+# Block-error and quantizer prototypes. The same function names are declared in
+# both branches: the high-bitdepth branch lists no architecture variants, while
+# the else branch lists them. vp10_block_error_fp exists only in the else
+# branch.
+# NOTE(review): $sse2_x86inc and $ssse3_x86_64_x86inc are defined outside this
+# section; presumably they expand to a variant name or to nothing - confirm.
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+# the transform coefficients are held in 32-bit
+# values, so the assembler code for vp10_block_error can no longer be used.
+ add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+ specialize qw/vp10_block_error/;
+
+ add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp10_quantize_fp/;
+
+ add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp10_quantize_fp_32x32/;
+
+ add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp10_fdct8x8_quant/;
+} else {
+ # Without high bitdepth the architecture variants below are listed.
+ add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+ specialize qw/vp10_block_error avx2 msa/, "$sse2_x86inc";
+
+ add_proto qw/int64_t vp10_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+ specialize qw/vp10_block_error_fp neon/, "$sse2_x86inc";
+
+ add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp10_quantize_fp neon sse2/, "$ssse3_x86_64_x86inc";
+
+ add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp10_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";
+
+ add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp10_fdct8x8_quant sse2 ssse3 neon/;
+}
+
+#
+# Structured Similarity (SSIM)
+#
+# The 8x8 and 16x16 SSIM parameter helpers are declared only when
+# CONFIG_INTERNAL_STATS is "yes".
+# NOTE(review): $sse2_x86_64 is defined outside this section; presumably it
+# selects an sse2 variant on x86-64 builds only - confirm.
+if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+ add_proto qw/void vp10_ssim_parms_8x8/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
+ specialize qw/vp10_ssim_parms_8x8/, "$sse2_x86_64";
+
+ add_proto qw/void vp10_ssim_parms_16x16/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
+ specialize qw/vp10_ssim_parms_16x16/, "$sse2_x86_64";
+}
+
+# fdct functions
+
+# Forward transform prototypes (fht 4x4/8x8/16x16 and fwht 4x4). Both branches
+# declare identical prototypes; the only difference is that the else branch
+# additionally lists an msa variant for every function.
+# NOTE(review): $mmx_x86inc is defined outside this section - confirm what it
+# expands to.
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp10_fht4x4 sse2/;
+
+ add_proto qw/void vp10_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp10_fht8x8 sse2/;
+
+ add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp10_fht16x16 sse2/;
+
+ add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fwht4x4/, "$mmx_x86inc";
+} else {
+ add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp10_fht4x4 sse2 msa/;
+
+ add_proto qw/void vp10_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp10_fht8x8 sse2 msa/;
+
+ add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp10_fht16x16 sse2 msa/;
+
+ add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
+}
+
+#
+# Motion search
+#
+# Full-search SAD with sse3 and sse4_1 variants listed.
+# NOTE(review): the two assignments below presumably tell the generator to use
+# vp10_full_search_sadx3 / vp10_full_search_sadx8 as the symbol names of the
+# sse3 / sse4_1 variants instead of the default names - confirm against the
+# script that consumes this file. The right-hand sides are Perl barewords.
+add_proto qw/int vp10_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv";
+specialize qw/vp10_full_search_sad sse3 sse4_1/;
+$vp10_full_search_sad_sse3=vp10_full_search_sadx3;
+$vp10_full_search_sad_sse4_1=vp10_full_search_sadx8;
+
+# Diamond search and full-range search share the same argument list; neither
+# lists any architecture variant.
+add_proto qw/int vp10_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
+specialize qw/vp10_diamond_search_sad/;
+
+add_proto qw/int vp10_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
+specialize qw/vp10_full_range_search/;
+
+# Temporal filter: sse2 and msa variants are listed.
+add_proto qw/void vp10_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+specialize qw/vp10_temporal_filter_apply sse2 msa/;
+
+# vp10_highbd_* encoder functions, declared only when CONFIG_VP9_HIGHBITDEPTH
+# is "yes". Only vp10_highbd_block_error lists a variant (sse2); every other
+# specialize call in this block names the function alone.
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+
+ # ENCODEMB INVOKE
+
+ add_proto qw/int64_t vp10_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+ specialize qw/vp10_highbd_block_error sse2/;
+
+ add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp10_highbd_quantize_fp/;
+
+ add_proto qw/void vp10_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp10_highbd_quantize_fp_32x32/;
+
+ #
+ # Structured Similarity (SSIM)
+ #
+ # Nested check: the high-bitdepth SSIM helper additionally requires
+ # CONFIG_INTERNAL_STATS to be "yes".
+ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+ add_proto qw/void vp10_highbd_ssim_parms_8x8/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/vp10_highbd_ssim_parms_8x8/;
+ }
+
+ # fdct functions
+ add_proto qw/void vp10_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp10_highbd_fht4x4/;
+
+ add_proto qw/void vp10_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp10_highbd_fht8x8/;
+
+ add_proto qw/void vp10_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp10_highbd_fht16x16/;
+
+ add_proto qw/void vp10_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fwht4x4/;
+
+ add_proto qw/void vp10_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+ specialize qw/vp10_highbd_temporal_filter_apply/;
+
+}
+# End vp10_high encoder functions
+
+}
+# end encoder functions
+1;
--- /dev/null
+++ b/vp10/common/vp9_alloccommon.c
@@ -1,0 +1,165 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp10/common/vp9_alloccommon.h"
+#include "vp10/common/vp9_blockd.h"
+#include "vp10/common/vp9_entropymode.h"
+#include "vp10/common/vp9_entropymv.h"
+#include "vp10/common/vp9_onyxc_int.h"
+#include "vp10/common/vp9_systemdependent.h"
+
+// Derives the mode-info (mi) and macroblock (mb) grid dimensions stored in
+// |cm| from a frame size given as |width| x |height|.
+void vp10_set_mb_mi(VP9_COMMON *cm, int width, int height) {
+  // Align each dimension with ALIGN_POWER_OF_TWO(.., MI_SIZE_LOG2), then shift
+  // it down by MI_SIZE_LOG2 to express it in mi units.
+  cm->mi_cols = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+  cm->mi_rows = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+  cm->mi_stride = calc_mi_size(cm->mi_cols);
+
+  // Each macroblock dimension is half the mi dimension, rounded up.
+  cm->mb_rows = (cm->mi_rows + 1) >> 1;
+  cm->mb_cols = (cm->mi_cols + 1) >> 1;
+  cm->MBs = cm->mb_rows * cm->mb_cols;
+}
+
+// Allocates NUM_PING_PONG_BUFFERS zero-filled segmentation maps of
+// |seg_map_size| bytes each and points the current/last map pointers at them.
+// Returns 0 on success and 1 as soon as one allocation fails; maps allocated
+// before the failure stay in cm->seg_map_array for the caller to release.
+static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
+  int buf_idx = 0;
+
+  while (buf_idx < NUM_PING_PONG_BUFFERS) {
+    uint8_t *const map = (uint8_t *)vpx_calloc(seg_map_size, 1);
+    cm->seg_map_array[buf_idx] = map;
+    if (!map) return 1;
+    ++buf_idx;
+  }
+  cm->seg_map_alloc_size = seg_map_size;
+
+  // Start with buffer 0 as the current map and buffer 1 as the previous one.
+  cm->seg_map_idx = 0;
+  cm->prev_seg_map_idx = 1;
+  cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+
+  // The last-frame map pointer is only set up when frame-parallel decode is
+  // off.
+  if (!cm->frame_parallel_decode) {
+    cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
+  }
+
+  return 0;
+}
+
+// Releases every ping-pong segmentation map owned by |cm| and clears the
+// pointers that alias them.
+static void free_seg_map(VP9_COMMON *cm) {
+  int i;
+
+  for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+    vpx_free(cm->seg_map_array[i]);
+    cm->seg_map_array[i] = NULL;
+  }
+
+  // Nothing is allocated any more. Without this reset,
+  // vp10_alloc_context_buffers() would compare the requested size against the
+  // stale value, skip alloc_seg_map() and leave the map pointers NULL.
+  cm->seg_map_alloc_size = 0;
+
+  cm->current_frame_seg_map = NULL;
+
+  // last_frame_seg_map is only cleared when frame-parallel decode is off, the
+  // same condition under which alloc_seg_map() assigns it.
+  if (!cm->frame_parallel_decode) {
+    cm->last_frame_seg_map = NULL;
+  }
+}
+
+// Walks all FRAME_BUFFERS entries of |pool|. An entry whose ref_count is
+// positive and whose raw_frame_buffer.data is non-NULL is handed to
+// pool->release_fb_cb and gets its ref_count zeroed; afterwards every entry
+// has its mvs array and its frame buffer freed.
+void vp10_free_ref_frame_buffers(BufferPool *pool) {
+  int idx = 0;
+
+  while (idx < FRAME_BUFFERS) {
+    if (pool->frame_bufs[idx].ref_count > 0) {
+      if (pool->frame_bufs[idx].raw_frame_buffer.data != NULL) {
+        pool->release_fb_cb(pool->cb_priv,
+                            &pool->frame_bufs[idx].raw_frame_buffer);
+        pool->frame_bufs[idx].ref_count = 0;
+      }
+    }
+
+    vpx_free(pool->frame_bufs[idx].mvs);
+    pool->frame_bufs[idx].mvs = NULL;
+    vp9_free_frame_buffer(&pool->frame_bufs[idx].buf);
+    ++idx;
+  }
+}
+
+// Frees the two post-processing frame buffers of |cm|. When
+// CONFIG_VP9_POSTPROC is disabled the function does nothing.
+void vp10_free_postproc_buffers(VP9_COMMON *cm) {
+#if !CONFIG_VP9_POSTPROC
+  // Nothing to free; reference the parameter so it is not reported as unused.
+  (void)cm;
+#else
+  vp9_free_frame_buffer(&cm->post_proc_buffer);
+  vp9_free_frame_buffer(&cm->post_proc_buffer_int);
+#endif
+}
+
+// Frees the context storage of |cm|: whatever the cm->free_mi callback
+// releases, the segmentation maps, and the above_context and
+// above_seg_context arrays.
+void vp10_free_context_buffers(VP9_COMMON *cm) {
+  cm->free_mi(cm);
+  free_seg_map(cm);
+  vpx_free(cm->above_context);
+  cm->above_context = NULL;
+  vpx_free(cm->above_seg_context);
+  cm->above_seg_context = NULL;
+  // Both above-context arrays are gone; zero the cached column count so the
+  // next vp10_alloc_context_buffers() call reallocates them instead of
+  // assuming the previous allocation is still large enough.
+  cm->above_context_alloc_cols = 0;
+}
+
+// (Re)allocates the context buffers of |cm| for a frame of |width| x |height|.
+// Each group of buffers is reallocated only when its cached allocation size is
+// smaller than what the new dimensions require.
+// Returns 0 on success. On any allocation failure all context buffers are
+// freed, the cached sizes are cleared, and 1 is returned.
+int vp10_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
+  int new_mi_size;
+
+  vp10_set_mb_mi(cm, width, height);
+  new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+  if (cm->mi_alloc_size < new_mi_size) {
+    cm->free_mi(cm);
+    if (cm->alloc_mi(cm, new_mi_size))
+      goto fail;
+  }
+
+  if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) {
+    // Create the segmentation map structure and set to 0.
+    free_seg_map(cm);
+    if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols))
+      goto fail;
+  }
+
+  if (cm->above_context_alloc_cols < cm->mi_cols) {
+    vpx_free(cm->above_context);
+    cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
+        2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
+        sizeof(*cm->above_context));
+    if (!cm->above_context) goto fail;
+
+    vpx_free(cm->above_seg_context);
+    cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
+        mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context));
+    if (!cm->above_seg_context) goto fail;
+    // Record the new capacity only once both arrays exist.
+    cm->above_context_alloc_cols = cm->mi_cols;
+  }
+
+  return 0;
+
+ fail:
+  vp10_free_context_buffers(cm);
+  // Everything was just released, so the cached sizes no longer describe live
+  // allocations. Clear them; otherwise a later call with smaller or equal
+  // dimensions would skip reallocation and return success with NULL buffers.
+  // NOTE(review): mi_alloc_size is left to the cm->free_mi callback - confirm
+  // that the callback resets it.
+  cm->seg_map_alloc_size = 0;
+  cm->above_context_alloc_cols = 0;
+  return 1;
+}
+
+// Tears down |cm|: frees its context buffers, then the fc and frame_contexts
+// allocations, leaving both pointers NULL.
+void vp10_remove_common(VP9_COMMON *cm) {
+  vp10_free_context_buffers(cm);
+
+  vpx_free(cm->fc);
+  vpx_free(cm->frame_contexts);
+
+  // Clear the pointers so they do not refer to freed memory.
+  cm->fc = NULL;
+  cm->frame_contexts = NULL;
+}
+
+// Runs the cm->setup_mi callback, then zeroes mi_rows * mi_cols bytes of the
+// last-frame segmentation map when that map exists and frame-parallel decode
+// is off.
+void vp10_init_context_buffers(VP9_COMMON *cm) {
+  cm->setup_mi(cm);
+
+  // Skip the clear if there is no map or frame-parallel decode is enabled.
+  if (cm->frame_parallel_decode || cm->last_frame_seg_map == NULL) return;
+
+  memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
+}
+
+void vp10_swap_current_and_last_seg_map(VP9_COMMON *cm) {
+ // Swap indices.
+ const int tmp = cm->seg_map_idx;
+ cm->seg_map_idx = cm->prev_seg_map_idx;
+ cm->prev_seg_map_idx = tmp;
+
+ cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+ cm->last_frame_seg_map = cm->seg_map_ar