ref: 641fda79bb357599e5b38f750196bce66ec5df6b
parent: a16ca80b09cb4d698606d706486c2d22577af124
author: Johann <johannkoenig@google.com>
date: Thu Feb 2 09:17:26 EST 2017
highbd x86: consolidate tran_low_t conversions Create new helper files specifically for converting tran_low_t types. Change-Id: I7c4c458ef910f3b3d10a3cfbf9df4de7682fd905
--- a/libs.mk
+++ b/libs.mk
@@ -149,6 +149,7 @@
INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm
endif
CODEC_EXPORTS-yes += vpx/exports_com
CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
@@ -204,6 +205,7 @@
third_party/x86inc/x86inc.asm \
vpx_config.asm \
vpx_ports/x86_abi_support.asm \
+ vpx_dsp/x86/bitdepth_conversion_sse2.asm \
vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
@echo " [CREATE] $@"
--- a/vp9/encoder/x86/vp9_dct_sse2.asm
+++ b/vp9/encoder/x86/vp9_dct_sse2.asm
@@ -11,6 +11,7 @@
%define private_prefix vp9
%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
@@ -62,25 +63,7 @@
psllw m0, 2
psllw m1, 2
-%if CONFIG_VP9_HIGHBITDEPTH
- ; sign extension
- mova m2, m0
- mova m3, m1
- punpcklwd m0, m0
- punpcklwd m1, m1
- punpckhwd m2, m2
- punpckhwd m3, m3
- psrad m0, 16
- psrad m1, 16
- psrad m2, 16
- psrad m3, 16
- mova [outputq], m0
- mova [outputq + 16], m2
- mova [outputq + 32], m1
- mova [outputq + 48], m3
-%else
- mova [outputq], m0
- mova [outputq + 16], m1
-%endif
+ STORE_TRAN_LOW 0, outputq, 0, 2, 3
+ STORE_TRAN_LOW 1, outputq, 1, 2, 3
RET
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -14,7 +14,7 @@
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -13,6 +13,12 @@
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
+DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2.h
+# This file is included in libs.mk. Including it here would cause it to be
+# compiled into an object. Even as an empty file, the resulting object would
+# be marked as requiring an executable stack.
+#DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2$(ASM)
+
# bit reader
DSP_SRCS-yes += prob.h
DSP_SRCS-yes += prob.c
@@ -245,7 +251,6 @@
DSP_SRCS-yes += quantize.c
DSP_SRCS-yes += quantize.h
-DSP_SRCS-$(HAVE_SSE2) += x86/fdct.h
DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -12,7 +12,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_ports/mem.h"
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
--- a/vpx_dsp/x86/avg_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -9,6 +9,7 @@
;
%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
@@ -94,20 +95,6 @@
SWAP 7, 9
%endmacro
-%if CONFIG_VP9_HIGHBITDEPTH
-; store %1 to outputq + %2
-; uses m8-m10 as scratch registers
-%macro STORE_TRAN_LOW 2
- pxor m8, m8
- mova m9, m%1
- mova m10, m%1
- pcmpgtw m8, m%1
- punpcklwd m9, m8
- punpckhwd m10, m8
- mova [outputq + %2], m9
- mova [outputq + %2 + 16], m10
-%endmacro
-%endif
INIT_XMM ssse3
cglobal hadamard_8x8, 3, 5, 11, input, stride, output
@@ -130,25 +117,14 @@
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
HMD8_1D
-%if CONFIG_VP9_HIGHBITDEPTH
- STORE_TRAN_LOW 0, 0
- STORE_TRAN_LOW 1, 32
- STORE_TRAN_LOW 2, 64
- STORE_TRAN_LOW 3, 96
- STORE_TRAN_LOW 4, 128
- STORE_TRAN_LOW 5, 160
- STORE_TRAN_LOW 6, 192
- STORE_TRAN_LOW 7, 224
-%else
- mova [outputq + 0], m0
- mova [outputq + 16], m1
- mova [outputq + 32], m2
- mova [outputq + 48], m3
- mova [outputq + 64], m4
- mova [outputq + 80], m5
- mova [outputq + 96], m6
- mova [outputq + 112], m7
-%endif
+ STORE_TRAN_LOW 0, outputq, 0, 8, 9
+ STORE_TRAN_LOW 1, outputq, 1, 8, 9
+ STORE_TRAN_LOW 2, outputq, 2, 8, 9
+ STORE_TRAN_LOW 3, outputq, 3, 8, 9
+ STORE_TRAN_LOW 4, outputq, 4, 8, 9
+ STORE_TRAN_LOW 5, outputq, 5, 8, 9
+ STORE_TRAN_LOW 6, outputq, 6, 8, 9
+ STORE_TRAN_LOW 7, outputq, 7, 8, 9
RET
%endif
--- /dev/null
+++ b/vpx_dsp/x86/bitdepth_conversion_sse2.asm
@@ -1,0 +1,66 @@
+;
+; Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm.
+; vpx_config.asm is not guarded so can not be included twice. Because this will
+; be used in conjunction with x86_abi_support.asm or x86inc.asm, it must be
+; included after those files.
+
+; Advance pointer register %1 by 8 tran_low_t values: 8 * sizeof(tran_low_t).
+%macro INCREMENT_TRAN_LOW 1
+%if CONFIG_VP9_HIGHBITDEPTH
+  add %1, 8 * 4  ; tran_low_t is 32 bits
+%else
+  add %1, 8 * 2  ; tran_low_t is 16 bits
+%endif
+%endmacro
+
+; Advance %1 by %2 elements, i.e. by sizeof(tran_low_t) * %2 bytes.
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea %1, [%1 + (%2) * 4]  ; parenthesized so expression arguments scale whole
+%else
+  lea %1, [%1 + (%2) * 2]  ; parenthesized so expression arguments scale whole
+%endif
+%endmacro
+
+; Load 8 coefficients from %2 + %3 into m%1 as 16 bit values.
+; %3 is an offset in groups of 8 elements (one loaded register), not bytes.
+; If tran_low_t is 16 bits (low bit depth configuration) then load the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
+; the values down to 16 bits with signed saturation (packssdw).
+%macro LOAD_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova m%1, [%2 + (%3) * 32]
+  packssdw m%1, [%2 + (%3) * 32 + 16]
+%else
+  mova m%1, [%2 + (%3) * 16]
+%endif
+%endmacro
+
+; Store the 8 16 bit values in m%1 to %2 + %3.
+; %3 is an offset in groups of 8 elements (one stored register), not bytes.
+; If tran_low_t is 16 bits (low bit depth configuration) then store the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign
+; extend the values first.
+; High bit depth uses m%4 and m%5 as scratch registers and overwrites m%1.
+%macro STORE_TRAN_LOW 5
+%if CONFIG_VP9_HIGHBITDEPTH
+  pxor m%4, m%4
+  mova m%5, m%1
+  pcmpgtw m%4, m%1  ; sign mask: all ones in every word that is negative
+  punpcklwd m%5, m%4
+  punpckhwd m%1, m%4
+  mova [%2 + (%3) * 32 + 0], m%5
+  mova [%2 + (%3) * 32 + 16], m%1
+%else
+  mova [%2 + (%3) * 16], m%1
+%endif
+%endmacro
--- /dev/null
+++ b/vpx_dsp/x86/bitdepth_conversion_sse2.h
@@ -1,0 +1,57 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
+#define VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Load 8 16 bit values. If the source is 32 bits then cast down.
+// This does not saturate values. It only truncates.
+static INLINE __m128i load_tran_low(const tran_low_t *a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2],
+                        (int16_t)a[3], (int16_t)a[4], (int16_t)a[5],
+                        (int16_t)a[6], (int16_t)a[7]);
+#else
+  return _mm_load_si128((const __m128i *)a);  // a must be 16 byte aligned.
+#endif
+}
+
+// Store 8 16 bit values to b, which must be 16 byte aligned. If the
+// destination is 32 bits then sign extend the values by multiplying by 1.
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i a_hi = _mm_mulhi_epi16(a, one);  // Upper half: sign bits.
+  const __m128i a_lo = _mm_mullo_epi16(a, one);  // Lower half: a unchanged.
+  const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
+  const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
+  _mm_store_si128((__m128i *)(b), a_1);
+  _mm_store_si128((__m128i *)(b + 4), a_2);
+#else
+  _mm_store_si128((__m128i *)(b), a);
+#endif
+}
+
+// Zero fill 8 positions in the output buffer. a must be 16 byte aligned.
+static INLINE void store_zero_tran_low(tran_low_t *a) {
+  const __m128i zero = _mm_setzero_si128();
+  // The first 16 bytes are cleared in both configurations; 32 bit
+  // coefficients need a second store to cover all 8 positions.
+  _mm_store_si128((__m128i *)(a), zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+  _mm_store_si128((__m128i *)(a + 4), zero);
+#endif
+}
+#endif  // VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
--- a/vpx_dsp/x86/fdct.h
+++ /dev/null
@@ -1,57 +1,0 @@
-/*
- * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#ifndef VPX_DSP_X86_FDCT_H_
-#define VPX_DSP_X86_FDCT_H_
-
-#include <xmmintrin.h>
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-
-// Load 8 16 bit values. If the source is 32 bits then cast down.
-// This does not saturate values. It only truncates.
-static INLINE __m128i load_tran_low(const tran_low_t *a) {
-#if CONFIG_VP9_HIGHBITDEPTH
- return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2],
- (int16_t)a[3], (int16_t)a[4], (int16_t)a[5],
- (int16_t)a[6], (int16_t)a[7]);
-#else
- return _mm_load_si128((const __m128i *)a);
-#endif
-}
-
-// Store 8 16 bit values. If the destination is 32 bits then sign extend the
-// values by multiplying by 1.
-static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const __m128i one = _mm_set1_epi16(1);
- const __m128i a_hi = _mm_mulhi_epi16(a, one);
- const __m128i a_lo = _mm_mullo_epi16(a, one);
- const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
- const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
- _mm_store_si128((__m128i *)(b), a_1);
- _mm_store_si128((__m128i *)(b + 4), a_2);
-#else
- _mm_store_si128((__m128i *)(b), a);
-#endif
-}
-
-// Zero fill 8 positions in the output buffer.
-static INLINE void store_zero_tran_low(tran_low_t *a) {
- const __m128i zero = _mm_setzero_si128();
-#if CONFIG_VP9_HIGHBITDEPTH
- _mm_store_si128((__m128i *)(a), zero);
- _mm_store_si128((__m128i *)(a + 4), zero);
-#else
- _mm_store_si128((__m128i *)(a), zero);
-#endif
-}
-#endif // VPX_DSP_X86_FDCT_H_
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -9,6 +9,7 @@
;
%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION_RODATA
@@ -230,21 +231,10 @@
lea r3, [2 * strideq]
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
- mova m2, [inputq + 64]
- packssdw m2, [inputq + 80]
- mova m3, [inputq + 96]
- packssdw m3, [inputq + 112]
-%else
- mova m0, [inputq + 0]
- mova m1, [inputq + 16]
- mova m2, [inputq + 32]
- mova m3, [inputq + 48]
-%endif
+ LOAD_TRAN_LOW 0, inputq, 0
+ LOAD_TRAN_LOW 1, inputq, 1
+ LOAD_TRAN_LOW 2, inputq, 2
+ LOAD_TRAN_LOW 3, inputq, 3
punpcklwd m0, m1
punpcklwd m2, m3
@@ -752,33 +742,14 @@
lea r4, [rsp + transposed_in]
idct32x32_34_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
+ LOAD_TRAN_LOW 0, r3, 0
+ LOAD_TRAN_LOW 1, r3, 4
+ LOAD_TRAN_LOW 2, r3, 8
+ LOAD_TRAN_LOW 3, r3, 12
+ LOAD_TRAN_LOW 4, r3, 16
+ LOAD_TRAN_LOW 5, r3, 20
+ LOAD_TRAN_LOW 6, r3, 24
+ LOAD_TRAN_LOW 7, r3, 28
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
@@ -1182,33 +1153,15 @@
mov r7, 2
idct32x32_135_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
+ LOAD_TRAN_LOW 0, r3, 0
+ LOAD_TRAN_LOW 1, r3, 4
+ LOAD_TRAN_LOW 2, r3, 8
+ LOAD_TRAN_LOW 3, r3, 12
+ LOAD_TRAN_LOW 4, r3, 16
+ LOAD_TRAN_LOW 5, r3, 20
+ LOAD_TRAN_LOW 6, r3, 24
+ LOAD_TRAN_LOW 7, r3, 28
+
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
mova [r4 + 0], m0
@@ -1220,11 +1173,7 @@
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
-%if CONFIG_VP9_HIGHBITDEPTH
- add r3, 32
-%else
- add r3, 16
-%endif
+ INCREMENT_TRAN_LOW r3
add r4, 16 * 8
dec r7
jne idct32x32_135_transpose
@@ -1231,11 +1180,7 @@
IDCT32X32_135 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
- lea inputq, [inputq + 32 * 32]
-%else
- lea inputq, [inputq + 16 * 32]
-%endif
+ INCREMENT_ELEMENTS_TRAN_LOW inputq, 8*32
dec r6
jnz idct32x32_135
@@ -1646,33 +1591,14 @@
mov r7, 4
idct32x32_1024_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
+ LOAD_TRAN_LOW 0, r3, 0
+ LOAD_TRAN_LOW 1, r3, 4
+ LOAD_TRAN_LOW 2, r3, 8
+ LOAD_TRAN_LOW 3, r3, 12
+ LOAD_TRAN_LOW 4, r3, 16
+ LOAD_TRAN_LOW 5, r3, 20
+ LOAD_TRAN_LOW 6, r3, 24
+ LOAD_TRAN_LOW 7, r3, 28
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
@@ -1684,11 +1610,7 @@
mova [r4 + 16 * 5], m5
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
-%if CONFIG_VP9_HIGHBITDEPTH
- add r3, 32
-%else
- add r3, 16
-%endif
+ INCREMENT_TRAN_LOW r3
add r4, 16 * 8
dec r7
jne idct32x32_1024_transpose
@@ -1696,11 +1618,7 @@
IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
- lea inputq, [inputq + 32 * 32]
-%else
- lea inputq, [inputq + 16 * 32]
-%endif
+ INCREMENT_ELEMENTS_TRAN_LOW inputq, 8*32
dec r6
jnz idct32x32_1024
--- a/vpx_dsp/x86/inv_wht_sse2.asm
+++ b/vpx_dsp/x86/inv_wht_sse2.asm
@@ -9,6 +9,7 @@
;
%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
@@ -82,15 +83,8 @@
INIT_XMM sse2
cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
-%else
- mova m0, [inputq + 0]
- mova m1, [inputq + 16]
-%endif
+ LOAD_TRAN_LOW 0, inputq, 0
+ LOAD_TRAN_LOW 1, inputq, 1
psraw m0, 2
psraw m1, 2
--- a/vpx_dsp/x86/quantize_sse2.c
+++ b/vpx_dsp/x86/quantize_sse2.c
@@ -13,7 +13,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
--
⑨