ref: a41a344a2e30455ce4d1f6662b85332a70dc4b52
parent: 62ea8ea120e3cee6e43eeec3a980b367b374e339
author: Sandor Vegh <sandorzsombor.vegh@arm.com>
date: Thu Oct 3 12:08:21 EDT 2024
Arm: Speed up -1..1 soft clipping with Neon. If the signal exceeds -1..1 then, as error handling, the soft_clip function forces the signal back into -1..1. This is slow because of the search loop that finds the next sample exceeding -1..1. When it is cheap on the current platform, the -2..2 hard-clipping pass can also detect that the signal never exceeds -1..1, which avoids the need for a second search loop. Signed-off-by: Jean-Marc Valin <jeanmarcv@google.com>
--- a/celt/arch.h
+++ b/celt/arch.h
@@ -103,6 +103,8 @@
#define MAX32(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 32-bit value. */
#define IMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum int value. */
#define IMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum int value. */
+#define FMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum of two float values; each argument may be evaluated twice. */
+#define FMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum of two float values; each argument may be evaluated twice. */
#define UADD32(a,b) ((a)+(b))
#define USUB32(a,b) ((a)-(b))
#define MAXG(a,b) MAX32(a, b)
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -46,6 +46,14 @@
celt_float2int16_neon,/* NEON */
celt_float2int16_neon /* DOTPROD */
};
+
+/* Implementations of opus_limit2_checkwithin1(), one entry per architecture
+   level and indexed by (arch & OPUS_ARCHMASK). Levels below NEON use the
+   generic C version. */
+int (*const OPUS_LIMIT2_CHECKWITHIN1_IMPL[OPUS_ARCHMASK+1])(float * samples, int cnt) = {
+ opus_limit2_checkwithin1_c, /* ARMv4 */
+ opus_limit2_checkwithin1_c, /* EDSP */
+ opus_limit2_checkwithin1_c, /* Media */
+ opus_limit2_checkwithin1_neon,/* NEON */
+ opus_limit2_checkwithin1_neon /* DOTPROD */
+};
# endif
# endif
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@@ -86,7 +86,84 @@
out[i] = FLOAT2INT16(in[i]);
}
}
+
+/* Hard-clips samples[0..cnt-1] in place to [-2, +2] and reports whether the
+   input stayed within [-1, +1].
+
+   samples: buffer holding cnt floats; values outside [-2, +2] are overwritten.
+   cnt:     number of samples; for cnt <= 0 no sample is read or written.
+   Returns 1 if no sample was found outside [-1, +1], 0 otherwise.
+
+   When __ARM_NEON is defined, whole blocks of 16 samples are processed with
+   NEON and only the remainder goes through the scalar loop; otherwise the
+   scalar loop handles everything.
+
+   NOTE(review): how NaN samples influence the vector min/max tracking is not
+   established by this code - confirm if NaN input must be handled. */
+int opus_limit2_checkwithin1_neon(float *samples, int cnt)
+{
+    const float hardclipMin = -2.0f;
+    const float hardclipMax = 2.0f;
+
+    int i = 0;
+    int exceeding1 = 0;
+    int nextIndex = 0; /* first sample left for the scalar loop */
+
+#if defined(__ARM_NEON)
+    const int BLOCK_SIZE = 16;
+    const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;
+
+    /* Running extrema start at 0, which lies inside [-1, +1] and therefore
+       never trips either threshold by itself. Two accumulators are kept per
+       direction, one for each half of a block. */
+    float32x4_t min_all_0 = vdupq_n_f32(0.0f);
+    float32x4_t min_all_1 = vdupq_n_f32(0.0f);
+    float32x4_t max_all_0 = vdupq_n_f32(0.0f);
+    float32x4_t max_all_1 = vdupq_n_f32(0.0f);
+
+    float max, min;
+
+    /* Pass 1: read-only scan that collects the extrema of all full blocks. */
+    for (i = 0; i < blockedSize; i += BLOCK_SIZE)
+    {
+        const float32x4_t orig_a = vld1q_f32(&samples[i + 0]);
+        const float32x4_t orig_b = vld1q_f32(&samples[i + 4]);
+        const float32x4_t orig_c = vld1q_f32(&samples[i + 8]);
+        const float32x4_t orig_d = vld1q_f32(&samples[i + 12]);
+        max_all_0 = vmaxq_f32(max_all_0, vmaxq_f32(orig_a, orig_b));
+        max_all_1 = vmaxq_f32(max_all_1, vmaxq_f32(orig_c, orig_d));
+        min_all_0 = vminq_f32(min_all_0, vminq_f32(orig_a, orig_b));
+        min_all_1 = vminq_f32(min_all_1, vminq_f32(orig_c, orig_d));
+    }
+
+    /* Reduce the vector accumulators to one scalar maximum and minimum. */
+    max = vmaxvf(vmaxq_f32(max_all_0, max_all_1));
+    min = vminvf(vminq_f32(min_all_0, min_all_1));
+
+    /* Pass 2: rewrite the blocks only when something exceeded [-2, +2]. */
+    if (min < hardclipMin || max > hardclipMax)
+    {
+        const float32x4_t hardclipMinReg = vdupq_n_f32(hardclipMin);
+        const float32x4_t hardclipMaxReg = vdupq_n_f32(hardclipMax);
+        for (i = 0; i < blockedSize; i += BLOCK_SIZE)
+        {
+            const float32x4_t orig_a = vld1q_f32(&samples[i + 0]);
+            const float32x4_t orig_b = vld1q_f32(&samples[i + 4]);
+            const float32x4_t orig_c = vld1q_f32(&samples[i + 8]);
+            const float32x4_t orig_d = vld1q_f32(&samples[i + 12]);
+            const float32x4_t clipped_a = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_a, hardclipMinReg));
+            const float32x4_t clipped_b = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_b, hardclipMinReg));
+            const float32x4_t clipped_c = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_c, hardclipMinReg));
+            const float32x4_t clipped_d = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_d, hardclipMinReg));
+            vst1q_f32(&samples[i + 0], clipped_a);
+            vst1q_f32(&samples[i + 4], clipped_b);
+            vst1q_f32(&samples[i + 8], clipped_c);
+            vst1q_f32(&samples[i + 12], clipped_d);
+        }
+    }
+
+    nextIndex = blockedSize;
+    /* The extrema were taken before clamping, so they also answer the
+       [-1, +1] question for the blocked part. */
+    exceeding1 |= max > 1.0f || min < -1.0f;
+
#endif
+
+    /* Scalar path: the remainder after the full blocks, or every sample when
+       NEON is not available at compile time. */
+    for (i = nextIndex; i < cnt; i++)
+    {
+        const float origVal = samples[i];
+        /* Clamp with the float helpers, matching opus_limit2_checkwithin1_c(). */
+        samples[i] = FMIN(hardclipMax, FMAX(hardclipMin, origVal));
+
+        exceeding1 |= origVal > 1.0f || origVal < -1.0f;
+    }
+
+    return !exceeding1;
+}
+
+#endif
+
#if defined(FIXED_POINT)
#include <string.h>
--- a/celt/arm/mathops_arm.h
+++ b/celt/arm/mathops_arm.h
@@ -46,6 +46,30 @@
# endif
}
+/* Horizontal minimum: returns the smallest of the four lanes of a. */
+static inline float vminvf(float32x4_t a)
+{
+#if defined(__aarch64__)
+    /* Reduce all four lanes with a single intrinsic. */
+    return vminvq_f32(a);
+#else
+    /* Fold the high half onto the low half, then compare the two survivors. */
+    const float32x2_t folded = vmin_f32(vget_low_f32(a), vget_high_f32(a));
+    const float lane0 = vget_lane_f32(folded, 0);
+    const float lane1 = vget_lane_f32(folded, 1);
+    if (lane0 < lane1)
+    {
+        return lane0;
+    }
+    return lane1;
+#endif
+}
+
+/* Horizontal maximum: returns the largest of the four lanes of a. */
+static inline float vmaxvf(float32x4_t a)
+{
+#if defined(__aarch64__)
+    /* Reduce all four lanes with a single intrinsic. */
+    return vmaxvq_f32(a);
+#else
+    /* Fold the high half onto the low half, then compare the two survivors. */
+    const float32x2_t folded = vmax_f32(vget_low_f32(a), vget_high_f32(a));
+    const float lane0 = vget_lane_f32(folded, 0);
+    const float lane1 = vget_lane_f32(folded, 1);
+    if (lane0 > lane1)
+    {
+        return lane0;
+    }
+    return lane1;
+#endif
+}
+
void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
# if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
@@ -59,6 +83,20 @@
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_FLOAT2INT16 (1)
# define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt))
+# endif
+
+/* NEON version of opus_limit2_checkwithin1(): clamps samples in place to
+   [-2, +2] and returns 1 if no sample was found outside [-1, +1], else 0. */
+int opus_limit2_checkwithin1_neon(float * samples, int cnt);
+# if defined(OPUS_HAVE_RTCD) && \
+ (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+/* NEON may or may not be present: pick the implementation from the table
+   using the masked arch value. */
+extern int (*const OPUS_LIMIT2_CHECKWITHIN1_IMPL[OPUS_ARCHMASK+1])(float * samples, int cnt);
+
+# define OVERRIDE_LIMIT2_CHECKWITHIN1 (1)
+# define opus_limit2_checkwithin1(samples, cnt, arch) \
+ ((*OPUS_LIMIT2_CHECKWITHIN1_IMPL[(arch)&OPUS_ARCHMASK])(samples, cnt))
+
+# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+/* NEON is presumed: call the NEON version directly; arch is evaluated and
+   discarded. */
+# define OVERRIDE_LIMIT2_CHECKWITHIN1 (1)
+# define opus_limit2_checkwithin1(samples, cnt, arch) ((void)(arch), opus_limit2_checkwithin1_neon(samples, cnt))
# endif
# endif
--- a/celt/mathops.c
+++ b/celt/mathops.c
@@ -229,4 +229,24 @@
}
}
+/* Generic version: clamps samples[0..cnt-1] in place to [-2, +2].
+
+   Returns 1 only for an empty input (cnt <= 0). For any non-empty buffer it
+   returns 0, because no check against [-1, +1] is performed here and the
+   caller must assume samples may exceed that range. */
+int opus_limit2_checkwithin1_c(float * samples, int cnt)
+{
+    float *cur;
+    float *end;
+
+    if (cnt <= 0)
+    {
+        return 1;
+    }
+
+    end = samples + cnt;
+    for (cur = samples; cur != end; cur++)
+    {
+        /* Lower bound first, then upper bound. */
+        *cur = FMIN(2.0f, FMAX(-2.0f, *cur));
+    }
+
+    /* No quick hint available from the C implementation. */
+    return 0;
+}
+
#endif /* DISABLE_FLOAT_API */
--- a/celt/mathops.h
+++ b/celt/mathops.h
@@ -490,6 +490,12 @@
#define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt))
#endif
+/* Clamps samples[0..cnt-1] in place to [-2, +2]. The return value is a hint:
+   1 means all samples are known to be within [-1, +1]. The C version returns
+   1 only for cnt <= 0 and 0 otherwise. */
+int opus_limit2_checkwithin1_c(float *samples, int cnt);
+
+/* Fallback mapping used when no platform header has defined
+   OVERRIDE_LIMIT2_CHECKWITHIN1; arch is evaluated and discarded. */
+#ifndef OVERRIDE_LIMIT2_CHECKWITHIN1
+#define opus_limit2_checkwithin1(samples, cnt, arch) ((void)(arch), opus_limit2_checkwithin1_c(samples, cnt))
+#endif
+
#endif /* DISABLE_FLOAT_API */
#endif /* MATHOPS_H */
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -435,6 +435,77 @@
#undef MAX_BUFFER_SIZE
}
+/* Exercises opus_limit2_checkwithin1() on a 37-sample buffer of alternating
+   +1/-1 values, replacing one sample at a time with an out-of-range value.
+
+   use_ref_impl: non-zero calls opus_limit2_checkwithin1_c() directly, zero
+                 goes through the opus_limit2_checkwithin1() macro with the
+                 arch reported by opus_select_arch().
+   Failures are printed to stderr and recorded by setting `ret` to 1. */
+void testopus_limit2_checkwithin1(int use_ref_impl)
+{
+#define BUFFER_SIZE 37 /* strange float count to trigger residue loop of SIMD implementation */
+#define BYTE_COUNT (BUFFER_SIZE * sizeof(float))
+    int i, within1;
+    const int arch = opus_select_arch();
+
+    float pattern[BUFFER_SIZE], buffer[BUFFER_SIZE];
+
+    /* Reference signal sitting exactly on the +/-1 boundary. */
+    for (i = 0; i < BUFFER_SIZE; ++i)
+    {
+        pattern[i] = i % 2 ? -1.f : 1.f;
+    }
+
+    /* All values within -1..1:
+       Nothing changed. Return value is implementation-dependent (not expected to recognise nothing exceeds -1..1) */
+    memcpy(buffer, pattern, BYTE_COUNT);
+    within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch);
+    if (memcmp(buffer, pattern, BYTE_COUNT) != 0)
+    {
+        fprintf (stderr, "opus_limit2_checkwithin1() modified values not exceeding -1..1 (ref=%d)\n", use_ref_impl);
+        ret = 1;
+    }
+
+    /* One value exceeds -1..1, within -2..2:
+       Values unchanged. Return value says not all values are within -1..1 */
+    for (i = 0; i < BUFFER_SIZE; ++i)
+    {
+        const float replace_value = pattern[i] * 1.001f;
+
+        memcpy(buffer, pattern, BYTE_COUNT);
+        buffer[i] = replace_value;
+        within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch);
+        if (within1 || buffer[i] != replace_value)
+        {
+            fprintf (stderr, "opus_limit2_checkwithin1() handled value exceeding -1..1 erroneously (ref=%d, i=%d)\n", use_ref_impl, i);
+            ret = 1;
+        }
+        /* Restore the probed sample so the whole buffer can be compared. */
+        buffer[i] = pattern[i];
+        if (memcmp(buffer, pattern, BYTE_COUNT) != 0)
+        {
+            fprintf (stderr, "opus_limit2_checkwithin1() modified value within -2..2 (ref=%d, i=%d)\n", use_ref_impl, i);
+            ret = 1;
+        }
+    }
+
+    /* One value exceeds -2..2:
+       One value is hardclipped, others are unchanged. Return value says not all values are within -1..1 */
+    for (i = 0; i < BUFFER_SIZE; ++i)
+    {
+        /* Float literal: keeps the product in single precision, like the 1.001f case. */
+        const float replace_value = pattern[i] * 2.1f;
+
+        memcpy(buffer, pattern, BYTE_COUNT);
+        buffer[i] = replace_value;
+        within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch);
+        if (within1 || buffer[i] != (replace_value > 0.f ? 2.f : -2.f))
+        {
+            fprintf (stderr, "opus_limit2_checkwithin1() handled value exceeding -2..2 erroneously (ref=%d, i=%d)\n", use_ref_impl, i);
+            ret = 1;
+        }
+        /* Restore the probed sample so the whole buffer can be compared. */
+        buffer[i] = pattern[i];
+        if (memcmp(buffer, pattern, BYTE_COUNT) != 0)
+        {
+            fprintf (stderr, "opus_limit2_checkwithin1() modified value within -2..2 (ref=%d, i=%d)\n", use_ref_impl, i);
+            ret = 1;
+        }
+    }
+#undef BUFFER_SIZE
+#undef BYTE_COUNT
+}
+
#endif
int main(void)
@@ -461,6 +532,7 @@
testcelt_float2int16(use_ref_impl[i], 32);
testcelt_float2int16(use_ref_impl[i], 127);
testcelt_float2int16(use_ref_impl[i], 1031);
+ testopus_limit2_checkwithin1(use_ref_impl[i]);
}
#endif
return ret;
--- a/src/opus.c
+++ b/src/opus.c
@@ -1,4 +1,5 @@
/* Copyright (c) 2011 Xiph.Org Foundation, Skype Limited
+ Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin and Koen Vos */
/*
Redistribution and use in source and binary forms, with or without
@@ -30,23 +31,40 @@
#endif
#include "opus.h"
+#include "celt/mathops.h"
#include "opus_private.h"
#ifndef DISABLE_FLOAT_API
-OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem)
+
+void opus_pcm_soft_clip_impl(float *_x, int N, int C, float *declip_mem, int arch)
{
int c;
int i;
float *x;
+ int all_within_neg1pos1;
if (C<1 || N<1 || !_x || !declip_mem) return;
- /* First thing: saturate everything to +/- 2 which is the highest level our
- non-linearity can handle. At the point where the signal reaches +/-2,
- the derivative will be zero anyway, so this doesn't introduce any
- discontinuity in the derivative. */
- for (i=0;i<N*C;i++)
- _x[i] = MAX16(-2.f, MIN16(2.f, _x[i]));
+ /* Clamp everything within the range [-2, +2] which is the domain of the soft
+ clipping non-linearity. Outside the defined range the derivative will be zero,
+ therefore there is no discontinuity introduced here. The implementation
+ might provide a hint if all input samples are within the [-1, +1] range.
+
+ `opus_limit2_checkwithin1()`:
+ - Clamps all samples within the valid range [-2, +2].
+ - Generic C implementation:
+ * Does not attempt early detection whether samples are within hinted range.
+ * Always returns 0.
+ - Architecture specific implementation:
+ * Uses SIMD instructions to efficiently detect if all samples are
+ within the hinted range [-1, +1].
+ * Returns 1 if no samples exceed the hinted range, 0 otherwise.
+
+ `all_within_neg1pos1`:
+ - Optimization hint to skip per-sample out-of-bound checks.
+ If true, the check can be skipped. */
+ all_within_neg1pos1 = opus_limit2_checkwithin1(_x, N*C, arch);
+
for (c=0;c<C;c++)
{
float a;
@@ -72,10 +90,16 @@
float maxval;
int special=0;
int peak_pos;
- for (i=curr;i<N;i++)
+ /* Detection for early exit can be skipped if hinted by `all_within_neg1pos1` */
+ if (all_within_neg1pos1)
{
- if (x[i*C]>1 || x[i*C]<-1)
- break;
+ i = N;
+ } else {
+ for (i=curr;i<N;i++)
+ {
+ if (x[i*C]>1 || x[i*C]<-1)
+ break;
+ }
}
if (i==N)
{
@@ -135,6 +159,12 @@
declip_mem[c] = a;
}
}
+
+/* Exported entry point: forwards unchanged to opus_pcm_soft_clip_impl().
+   This signature carries no arch value, so 0 is passed. */
+OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem)
+{
+ opus_pcm_soft_clip_impl(_x, N, C, declip_mem, 0);
+}
+
#endif
int encode_size(int size, unsigned char *data)
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ -814,7 +814,7 @@
OPUS_PRINT_INT(nb_samples);
#ifndef FIXED_POINT
if (soft_clip)
- opus_pcm_soft_clip(pcm, nb_samples, st->channels, st->softclip_mem);
+ opus_pcm_soft_clip_impl(pcm, nb_samples, st->channels, st->softclip_mem, st->arch);
else
st->softclip_mem[0]=st->softclip_mem[1]=0;
#endif
--- a/src/opus_private.h
+++ b/src/opus_private.h
@@ -177,6 +177,8 @@
void downmix_int24(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);
int is_digital_silence(const opus_res* pcm, int frame_size, int channels, int lsb_depth);
+/* Variant of opus_pcm_soft_clip() with an extra `arch` argument, which it
+   passes to opus_limit2_checkwithin1(). */
+void opus_pcm_soft_clip_impl(float *_x, int N, int C, float *declip_mem, int arch);
+
int encode_size(int size, unsigned char *data);
opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs);
--
⑨