ref: d4494e6ed7d564cbee3cfdfe41d0c0989c4345ff
parent: edffe56b309bb5e50338adbe40d29476d2f19d80
author: Sandor Zsombor Vegh <sandorzsombor.vegh@arm.com>
date: Wed Sep 11 10:00:32 EDT 2024
Arm: Speed up FLOAT2INT16 conversion with Neon. Using Neon for float to int conversion, and introducing a platform-specific function for converting an array of float values to int16. Also adding an appropriate unit test. Signed-off-by: Jean-Marc Valin <jeanmarcv@google.com>
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -1,5 +1,6 @@
/* Copyright (c) 2010 Xiph.Org Foundation
- * Copyright (c) 2013 Parrot */
+ * Copyright (c) 2013 Parrot
+ * Copyright (c) 2024 Arm Limited */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -29,11 +30,24 @@
#include "config.h"
#endif
-#include "pitch.h"
#include "kiss_fft.h"
+#include "mathops.h"
#include "mdct.h"
+#include "pitch.h"
#if defined(OPUS_HAVE_RTCD)
+/* Run-time dispatch table for celt_float2int16(): one implementation per
+ * entry, selected by the celt_float2int16() macro in mathops_arm.h with
+ * (arch & OPUS_ARCHMASK) as the index. The first three entries use the C
+ * routine; the NEON and DOTPROD entries use the Neon routine. */
+# if !defined(DISABLE_FLOAT_API)
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) = {
+ celt_float2int16_c, /* ARMv4 */
+ celt_float2int16_c, /* EDSP */
+ celt_float2int16_c, /* Media */
+ celt_float2int16_neon,/* NEON */
+ celt_float2int16_neon /* DOTPROD */
+};
+# endif
+# endif
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@@ -1,4 +1,5 @@
/* Copyright (c) 2014-2015 Xiph.Org Foundation
+ Copyright (c) 2024 Arm Limited
Written by Viswanath Puttagunta */
/**
@file celt_neon_intr.c
@@ -35,7 +36,57 @@
#endif
#include <arm_neon.h>
+#include "../float_cast.h"
+#include "../mathops.h"
#include "../pitch.h"
+#if defined(OPUS_CHECK_ASM)
+#include <stdlib.h>
+#endif
+
+#if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+
+/* Convert cnt float samples from in[] to 16-bit integers in out[].
+ *
+ * When __ARM_NEON is defined, samples are processed 16 at a time: each group
+ * of four floats is multiplied by CELT_SIG_SCALE, rounded to 32-bit integers
+ * by vroundf() and narrowed to int16 with vqmovn_s32 (saturating narrow).
+ * The remaining cnt % 16 samples (or all of them when __ARM_NEON is not
+ * defined) go through the scalar FLOAT2INT16().
+ *
+ * With OPUS_CHECK_ASM, every vectorized sample is asserted to be within 1 of
+ * the scalar FLOAT2INT16() result. */
+void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
+{
+   int i = 0;
+
+#if defined(__ARM_NEON)
+   const int BLOCK_SIZE = 16;
+   const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;
+
+   for (; i < blockedSize; i += BLOCK_SIZE)
+   {
+      float32x4_t orig_a = vld1q_f32(&in[i +  0]);
+      float32x4_t orig_b = vld1q_f32(&in[i +  4]);
+      float32x4_t orig_c = vld1q_f32(&in[i +  8]);
+      float32x4_t orig_d = vld1q_f32(&in[i + 12]);
+
+      int16x4_t asShort_a = vqmovn_s32(vroundf(vmulq_n_f32(orig_a, CELT_SIG_SCALE)));
+      int16x4_t asShort_b = vqmovn_s32(vroundf(vmulq_n_f32(orig_b, CELT_SIG_SCALE)));
+      int16x4_t asShort_c = vqmovn_s32(vroundf(vmulq_n_f32(orig_c, CELT_SIG_SCALE)));
+      int16x4_t asShort_d = vqmovn_s32(vroundf(vmulq_n_f32(orig_d, CELT_SIG_SCALE)));
+
+      vst1_s16(&out[i +  0], asShort_a);
+      vst1_s16(&out[i +  4], asShort_b);
+      vst1_s16(&out[i +  8], asShort_c);
+      vst1_s16(&out[i + 12], asShort_d);
+#  if defined(OPUS_CHECK_ASM)
+      /* The check lives in its own scope so that its declaration stays at the
+       * top of a block (C89), and it compares against the scalar result
+       * directly instead of going through an array sized by a `const int`,
+       * which would be a variable-length array in C. */
+      {
+         int j;
+         for (j = 0; j < BLOCK_SIZE; j++)
+         {
+            celt_assert(abs(FLOAT2INT16(in[i + j]) - out[i + j]) <= 1);
+         }
+      }
+#  endif
+   }
+#endif
+
+   /* Scalar tail: whatever the vector loop did not cover. */
+   for (; i < cnt; i++)
+   {
+      out[i] = FLOAT2INT16(in[i]);
+   }
+}
+#endif
#if defined(FIXED_POINT)
#include <string.h>
--- /dev/null
+++ b/celt/arm/mathops_arm.h
@@ -1,0 +1,65 @@
+/* Copyright (c) 2024 Arm Limited */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(MATHOPS_ARM_H)
+# define MATHOPS_ARM_H
+
+#include "armcpu.h"
+#include "cpu_support.h"
+#include "opus_defines.h"
+
+# if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+
+#include <arm_neon.h>
+
+/* Round each of the four float lanes of x to a 32-bit integer.
+ *
+ * On AArch64 / Arm architecture version 8 and later this is a single
+ * vcvtaq_s32_f32. Otherwise it adds 0.5 carrying the sign of each lane
+ * (0x3F000000 is the bit pattern of 0.5f, 0x80000000 the sign bit) and then
+ * converts with vcvtq_s32_f32.
+ *
+ * NOTE(review): both paths appear to round exact .5 ties away from zero,
+ * which may differ by one from the scalar float2int() on such inputs -
+ * confirm that callers tolerate this.
+ *
+ * Declared with OPUS_INLINE (from opus_defines.h) rather than the bare
+ * `inline` keyword, matching the rest of the code base and keeping the header
+ * usable with compilers or modes that do not accept `inline`. */
+static OPUS_INLINE int32x4_t vroundf(float32x4_t x)
+{
+#  if defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
+    return vcvtaq_s32_f32(x);
+#  else
+    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), vdupq_n_u32(0x80000000));
+    uint32x4_t bias = vdupq_n_u32(0x3F000000);
+    return vcvtq_s32_f32(vaddq_f32(x, vreinterpretq_f32_u32(vorrq_u32(bias, sign))));
+#  endif
+}
+/* Neon conversion of cnt floats in in[] to 16-bit integers in out[], followed
+ * by the selection of what celt_float2int16() expands to on Arm. */
+void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
+# if defined(OPUS_HAVE_RTCD) && \
+ (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern void
+(*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
+/* Run-time detection: call through the table, indexed by the masked arch. */
+# define OVERRIDE_FLOAT2INT16 (1)
+# define celt_float2int16(in, out, cnt, arch) \
+ ((*CELT_FLOAT2INT16_IMPL[(arch)&OPUS_ARCHMASK])(in, out, cnt))
+
+# elif defined(OPUS_ARM_PRESUME_NEON_INTR) /* Neon presumed: call it directly; arch is evaluated and discarded. */
+# define OVERRIDE_FLOAT2INT16 (1)
+# define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt))
+# endif
+# endif
+
+#endif /* MATHOPS_ARM_H */
--- a/celt/float_cast.h
+++ b/celt/float_cast.h
@@ -98,6 +98,13 @@
return intgr ;
}
+#elif defined(__aarch64__)
+
+ #include <arm_neon.h>
+ /* AArch64 variant: convert one float to a 32-bit integer with the scalar
+  * vcvtns_s32_f32 intrinsic (round to nearest, ties to even). */
+ static OPUS_INLINE opus_int32 float2int(float flt)
+ {
+    const opus_int32 rounded = vcvtns_s32_f32(flt);
+    return rounded;
+ }
#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
--- a/celt/mathops.c
+++ b/celt/mathops.c
@@ -1,6 +1,7 @@
/* Copyright (c) 2002-2008 Jean-Marc Valin
Copyright (c) 2007-2008 CSIRO
Copyright (c) 2007-2009 Xiph.Org Foundation
+ Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin */
/**
@file mathops.h
@@ -35,6 +36,7 @@
#include "config.h"
#endif
+#include "float_cast.h"
#include "mathops.h"
/*Compute floor(sqrt(_val)) with exact arithmetic.
@@ -215,3 +217,16 @@
}
#endif
+
+#ifndef DISABLE_FLOAT_API
+
+/* Portable conversion: stores FLOAT2INT16(in[k]) into out[k] for every k in
+ * [0, cnt). Does nothing when cnt is zero or negative. */
+void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
+{
+   int idx = 0;
+   while (idx < cnt)
+   {
+      out[idx] = FLOAT2INT16(in[idx]);
+      idx++;
+   }
+}
+
+#endif /* DISABLE_FLOAT_API */
--- a/celt/mathops.h
+++ b/celt/mathops.h
@@ -1,6 +1,7 @@
/* Copyright (c) 2002-2008 Jean-Marc Valin
Copyright (c) 2007-2008 CSIRO
Copyright (c) 2007-2009 Xiph.Org Foundation
+ Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin, and Yunho Huh */
/**
@file mathops.h
@@ -38,6 +39,10 @@
#include "entcode.h"
#include "os_support.h"
+#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#include "arm/mathops_arm.h"
+#endif
+
#define PI 3.141592653f
/* Multiplies two 16-bit fractional values. Bit-exactness of this macro is important */
@@ -476,4 +481,15 @@
}
#endif /* FIXED_POINT */
+
+#ifndef DISABLE_FLOAT_API
+/* Portable C conversion of cnt floats in in[] to 16-bit integers in out[]. */
+void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
+/* Default celt_float2int16(), used unless OVERRIDE_FLOAT2INT16 is already
+ * defined: evaluates and discards arch, then calls the C version. */
+#ifndef OVERRIDE_FLOAT2INT16
+#define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt))
+#endif
+
+#endif /* DISABLE_FLOAT_API */
+
#endif /* MATHOPS_H */
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -1,5 +1,6 @@
/* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation,
Gregory Maxwell
+ Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin, Gregory Maxwell, Timothy B. Terriberry,
and Yunho Huh */
/*
@@ -37,8 +38,10 @@
#include <stdio.h>
#include <math.h>
-#include "mathops.h"
#include "bands.h"
+#include "cpu_support.h"
+#include "float_cast.h"
+#include "mathops.h"
#ifdef FIXED_POINT
#define WORD "%d"
@@ -351,8 +354,94 @@
}
#endif
+
+#ifndef DISABLE_FLOAT_API
+
+/* Unit test for the float -> int16 array conversion.
+ *
+ * Fills buffer_size floats with test data, converts them with either
+ * celt_float2int16_c() (use_ref_impl != 0) or celt_float2int16() for the
+ * architecture returned by opus_select_arch(), and compares every output with
+ * FLOAT2INT16() of the same input. The output buffer is pre-filled with the
+ * sentinel 42 so that writes past the converted samples are detected.
+ * On any mismatch or overflow a message is printed to stderr and the
+ * file-level ret is set to 1. */
+void testcelt_float2int16(int use_ref_impl, int buffer_size)
+{
+
+#define MAX_BUFFER_SIZE 2080
+   int i, cnt;
+   float floatsToConvert[MAX_BUFFER_SIZE];
+   short results[MAX_BUFFER_SIZE];
+   float scaleInt16RangeTo01;
+
+   celt_assert(buffer_size <= MAX_BUFFER_SIZE);
+
+   scaleInt16RangeTo01 = 1.f / 32768.f;
+   cnt = 0;
+
+   /* Up to the first half of the buffer: groups of 15 values around and
+    * beyond the int16 range limits and around zero. A group is only added
+    * while at least one more slot remains after it. */
+   while (cnt + 15 < buffer_size && cnt < buffer_size / 2)
+   {
+      floatsToConvert[cnt++] = 77777.0f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = 33000.0f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = 32768.0f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = 32767.4f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = 32766.6f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = .501f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = .499f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = .0f;
+      floatsToConvert[cnt++] = -.499f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = -.501f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = -32767.6f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = -32768.4f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = -32769.0f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = -33000.0f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = -77777.0f * scaleInt16RangeTo01;
+
+      celt_assert(cnt < buffer_size);
+    }
+
+   /* Rest of the buffer: values of the form 7*cnt + 0.5 +/- 0.1 with an
+    * alternating sign, i.e. just above or just below a rounding boundary. */
+   while (cnt < buffer_size)
+   {
+      float inInt16Range = cnt * 7 + .5f;
+      inInt16Range += (cnt & 0x01) ? .1f : -.1f;
+      inInt16Range *= (cnt & 0x02) ? 1 : -1;
+      floatsToConvert[cnt++] = inInt16Range * scaleInt16RangeTo01;
+   }
+
+   /* Sentinel fill; anything not overwritten by the conversion must still be
+    * 42 afterwards. */
+   for (i = 0; i < MAX_BUFFER_SIZE; ++i)
+   {
+      results[i] = 42;
+   }
+
+   if (use_ref_impl)
+   {
+      celt_float2int16_c(floatsToConvert, results, cnt);
+   } else {
+      celt_float2int16(floatsToConvert, results, cnt, opus_select_arch());
+   }
+
+   for (i = 0; i < cnt; ++i)
+   {
+      /* Expected value kept as an integer type: it is compared with, and
+       * printed as, a 16-bit integer. */
+      const short expected = FLOAT2INT16(floatsToConvert[i]);
+      if (results[i] != expected)
+      {
+         fprintf (stderr, "testcelt_float2int16 failed: celt_float2int16 converted %f (index: %d) to %d (x*32768=%f, expected: %d, cnt: %d, ref: %d)\n",
+               floatsToConvert[i], i, (int)results[i], floatsToConvert[i] * 32768.0f, (int)expected, buffer_size, use_ref_impl);
+         ret = 1;
+      }
+   }
+
+   for (i = cnt; i < MAX_BUFFER_SIZE; ++i)
+   {
+      if (results[i] != 42)
+      {
+         fprintf (stderr, "testcelt_float2int16 failed: buffer overflow (cnt: %d, ref: %d)\n", buffer_size, use_ref_impl);
+         ret = 1;
+         break;
+      }
+   }
+#undef MAX_BUFFER_SIZE
+}
+
+#endif
+
int main(void)
{
+ int i;
+ int use_ref_impl[2] = { 0, 1 };
+
testbitexactcos();
testbitexactlog2tan();
testdiv();
@@ -364,6 +453,15 @@
testilog2();
testlog2_db();
testexp2_db();
+#endif
+#ifndef DISABLE_FLOAT_API
+ for (i = 0; i <= 1; ++i)
+ {
+ testcelt_float2int16(use_ref_impl[i], 1);
+ testcelt_float2int16(use_ref_impl[i], 32);
+ testcelt_float2int16(use_ref_impl[i], 127);
+ testcelt_float2int16(use_ref_impl[i], 1031);
+ }
#endif
return ret;
}
--- a/celt_headers.mk
+++ b/celt_headers.mk
@@ -39,6 +39,7 @@
celt/arm/fixed_arm64.h \
celt/arm/kiss_fft_armv4.h \
celt/arm/kiss_fft_armv5e.h \
+celt/arm/mathops_arm.h \
celt/arm/pitch_arm.h \
celt/arm/fft_arm.h \
celt/arm/mdct_arm.h \
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ -1,4 +1,5 @@
/* Copyright (c) 2010 Xiph.Org Foundation, Skype Limited
+ Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin and Koen Vos */
/*
Redistribution and use in source and binary forms, with or without
@@ -835,7 +836,7 @@
opus_int32 len, opus_int16 *pcm, int frame_size, int decode_fec)
{
VARDECL(opus_res, out);
- int ret, i;
+ int ret;
int nb_samples;
ALLOC_STACK;
@@ -858,8 +859,13 @@
ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, OPTIONAL_CLIP, NULL, 0);
if (ret > 0)
{
+# if defined(FIXED_POINT)
+ int i;
for (i=0;i<ret*st->channels;i++)
pcm[i] = RES2INT16(out[i]);
+# else
+ celt_float2int16(out, pcm, ret*st->channels, st->arch);
+# endif
}
RESTORE_STACK;
return ret;
--
⑨