shithub: opus

--- a/celt/arch.h

+++ b/celt/arch.h

@@ -103,6 +103,8 @@

 #define MAX32(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum 32-bit value.   */

 #define IMIN(a,b) ((a) < (b) ? (a) : (b))   /**< Minimum int value.   */

 #define IMAX(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum int value.   */

+#define FMIN(a,b) ((a) < (b) ? (a) : (b))   /**< Minimum of two float values; an argument may be evaluated twice.   */

+#define FMAX(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum of two float values; an argument may be evaluated twice.   */

 #define UADD32(a,b) ((a)+(b))

 #define USUB32(a,b) ((a)-(b))

 #define MAXG(a,b) MAX32(a, b)

--- a/celt/arm/arm_celt_map.c

+++ b/celt/arm/arm_celt_map.c

@@ -46,6 +46,14 @@

   celt_float2int16_neon,/* NEON */

   celt_float2int16_neon /* DOTPROD */

};

+int (*const OPUS_LIMIT2_CHECKWITHIN1_IMPL[OPUS_ARCHMASK+1])(float * samples, int cnt) = { /* run-time dispatch table; the opus_limit2_checkwithin1() macro indexes it with (arch)&OPUS_ARCHMASK */

+  opus_limit2_checkwithin1_c,   /* ARMv4: portable C version */

+  opus_limit2_checkwithin1_c,   /* EDSP: portable C version */

+  opus_limit2_checkwithin1_c,   /* Media: portable C version */

+  opus_limit2_checkwithin1_neon,/* NEON */

+  opus_limit2_checkwithin1_neon /* DOTPROD: reuses the NEON version */

+};

 #  endif

 # endif

--- a/celt/arm/celt_neon_intr.c

+++ b/celt/arm/celt_neon_intr.c

@@ -86,7 +86,84 @@

       out[i] = FLOAT2INT16(in[i]);

+int opus_limit2_checkwithin1_neon(float *samples, int cnt) /* Clamps samples[0..cnt) to [-2, 2] in place; returns 1 when no input sample was found outside [-1, 1], else 0. */

+{

+   const float hardclipMin = -2.0f;

+   const float hardclipMax = 2.0f;

+   int i = 0;

+   int exceeding1 = 0; /* becomes non-zero once an original sample outside [-1, 1] is seen */

+   int nextIndex = 0; /* first index the scalar loop at the end has to process */

+#if defined(__ARM_NEON)

+   const int BLOCK_SIZE = 16; /* floats consumed per iteration: four 4-lane vectors */

+   const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE; /* cnt truncated to a multiple of BLOCK_SIZE */

+   float32x4_t min_all_0 = vdupq_n_f32(0.0f); /* running extrema start at 0, which lies inside both tested ranges */

+   float32x4_t min_all_1 = vdupq_n_f32(0.0f);

+   float32x4_t max_all_0 = vdupq_n_f32(0.0f);

+   float32x4_t max_all_1 = vdupq_n_f32(0.0f);

+   float max, min;

+   for (i = 0; i < blockedSize; i += BLOCK_SIZE) /* pass 1: read-only scan collecting lane-wise min/max of the blocked part */

+   {

+      const float32x4_t orig_a = vld1q_f32(&samples[i +  0]);

+      const float32x4_t orig_b = vld1q_f32(&samples[i +  4]);

+      const float32x4_t orig_c = vld1q_f32(&samples[i +  8]);

+      const float32x4_t orig_d = vld1q_f32(&samples[i + 12]);

+      max_all_0 = vmaxq_f32(max_all_0, vmaxq_f32(orig_a, orig_b)); /* two independent accumulators per extreme */

+      max_all_1 = vmaxq_f32(max_all_1, vmaxq_f32(orig_c, orig_d));

+      min_all_0 = vminq_f32(min_all_0, vminq_f32(orig_a, orig_b));

+      min_all_1 = vminq_f32(min_all_1, vminq_f32(orig_c, orig_d));

+   }

+   max = vmaxvf(vmaxq_f32(max_all_0, max_all_1)); /* merge the accumulators, then reduce the four lanes to one scalar */

+   min = vminvf(vminq_f32(min_all_0, min_all_1));

+   if (min < hardclipMin || max > hardclipMax) /* pass 2 runs only when some blocked sample lies outside [-2, 2] */

+   {

+      const float32x4_t hardclipMinReg = vdupq_n_f32(hardclipMin);

+      const float32x4_t hardclipMaxReg = vdupq_n_f32(hardclipMax);

+      for (i = 0; i < blockedSize; i += BLOCK_SIZE)

+      {

+         const float32x4_t orig_a = vld1q_f32(&samples[i +  0]);

+         const float32x4_t orig_b = vld1q_f32(&samples[i +  4]);

+         const float32x4_t orig_c = vld1q_f32(&samples[i +  8]);

+         const float32x4_t orig_d = vld1q_f32(&samples[i + 12]);

+         const float32x4_t clipped_a = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_a, hardclipMinReg)); /* raise to hardclipMin first, then cap at hardclipMax */

+         const float32x4_t clipped_b = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_b, hardclipMinReg));

+         const float32x4_t clipped_c = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_c, hardclipMinReg));

+         const float32x4_t clipped_d = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_d, hardclipMinReg));

+         vst1q_f32(&samples[i + 0], clipped_a);

+         vst1q_f32(&samples[i + 4], clipped_b);

+         vst1q_f32(&samples[i + 8], clipped_c);

+         vst1q_f32(&samples[i + 12], clipped_d);

+      }

+   }

+   nextIndex = blockedSize; /* the scalar loop below continues where the blocked passes stopped */

+   exceeding1 |= max > 1.0f || min < -1.0f; /* max/min were taken in pass 1, i.e. from the values before clamping */

 #endif

+   for (i = nextIndex; i < cnt; i++) /* scalar path: samples not covered above (all of them when __ARM_NEON is undefined) */

+   {

+      const float origVal = samples[i];

+      float clippedVal = origVal;

+      clippedVal = MAX16(hardclipMin, clippedVal);

+      clippedVal = MIN16(hardclipMax, clippedVal);

+      samples[i] = clippedVal;

+      exceeding1 |= origVal > 1.0f || origVal < -1.0f; /* test the value read before clamping */

+   }

+   return !exceeding1; /* 1 = every sample was within [-1, 1] */

+}

+#endif

 #if defined(FIXED_POINT)

 #include <string.h>

--- a/celt/arm/mathops_arm.h

+++ b/celt/arm/mathops_arm.h

@@ -46,6 +46,30 @@

 #  endif

+static inline float vminvf(float32x4_t a) /* Returns the smallest of the four float lanes of a. */

+{

+#if defined(__aarch64__)

+   return vminvq_f32(a); /* single across-vector intrinsic on AArch64 */

+#else

+    float32x2_t xy = vmin_f32(vget_low_f32(a), vget_high_f32(a)); /* lane-wise min of the low and high halves */

+    float x = vget_lane_f32(xy, 0);

+    float y = vget_lane_f32(xy, 1);

+    return x < y ? x : y; /* final step of the reduction done in scalar code */

+#endif

+}

+static inline float vmaxvf(float32x4_t a) /* Returns the largest of the four float lanes of a. */

+{

+#if defined(__aarch64__)

+   return vmaxvq_f32(a); /* single across-vector intrinsic on AArch64 */

+#else

+    float32x2_t xy = vmax_f32(vget_low_f32(a), vget_high_f32(a)); /* lane-wise max of the low and high halves */

+    float x = vget_lane_f32(xy, 0);

+    float y = vget_lane_f32(xy, 1);

+    return x > y ? x : y; /* final step of the reduction done in scalar code */

+#endif

+}

 void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);

 #  if defined(OPUS_HAVE_RTCD) && \

     (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))

@@ -59,6 +83,20 @@

 #  elif defined(OPUS_ARM_PRESUME_NEON_INTR)

 #   define OVERRIDE_FLOAT2INT16 (1)

 #   define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt))

+#  endif

+int opus_limit2_checkwithin1_neon(float * samples, int cnt); /* NEON variant, defined in celt/arm/celt_neon_intr.c */

+#  if defined(OPUS_HAVE_RTCD) && \

+      (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) /* NEON possible but not presumed: pick the implementation per call */

+extern int (*const OPUS_LIMIT2_CHECKWITHIN1_IMPL[OPUS_ARCHMASK+1])(float * samples, int cnt); /* table defined in celt/arm/arm_celt_map.c */

+#   define OVERRIDE_LIMIT2_CHECKWITHIN1 (1) /* keeps celt/mathops.h from defining its C-only macro */

+#   define opus_limit2_checkwithin1(samples, cnt, arch) \

+   ((*OPUS_LIMIT2_CHECKWITHIN1_IMPL[(arch)&OPUS_ARCHMASK])(samples, cnt))

+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR) /* NEON presumed at build time: call it directly */

+#   define OVERRIDE_LIMIT2_CHECKWITHIN1 (1) /* keeps celt/mathops.h from defining its C-only macro */

+#   define opus_limit2_checkwithin1(samples, cnt, arch) ((void)(arch), opus_limit2_checkwithin1_neon(samples, cnt))

 #  endif

 # endif

--- a/celt/mathops.c

+++ b/celt/mathops.c

@@ -229,4 +229,24 @@

+int opus_limit2_checkwithin1_c(float * samples, int cnt) /* Portable version: clamps samples[0..cnt) to [-2, 2] in place; returns 1 only when cnt <= 0, otherwise 0. */

+{

+   int i;

+   if (cnt <= 0)

+   {

+      return 1; /* no samples, so none can lie outside [-1, 1] */

+   }

+   for (i = 0; i < cnt; i++)

+   {

+      float clippedVal = samples[i];

+      clippedVal = FMAX(-2.0f, clippedVal); /* raise to the lower bound */

+      clippedVal = FMIN(2.0f, clippedVal); /* cap at the upper bound */

+      samples[i] = clippedVal;

+   }

+   /* C implementation can't provide quick hint. Assume it might exceed -1/+1. */

+   return 0;

+}

 #endif /* DISABLE_FLOAT_API */

--- a/celt/mathops.h

+++ b/celt/mathops.h

@@ -490,6 +490,12 @@

 #define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt))

 #endif

+int opus_limit2_checkwithin1_c(float *samples, int cnt); /* portable version, defined in celt/mathops.c */

+#ifndef OVERRIDE_LIMIT2_CHECKWITHIN1 /* no architecture header supplied its own opus_limit2_checkwithin1() */

+#define opus_limit2_checkwithin1(samples, cnt, arch) ((void)(arch), opus_limit2_checkwithin1_c(samples, cnt)) /* arch is evaluated and discarded */

+#endif

 #endif /* DISABLE_FLOAT_API */

 #endif /* MATHOPS_H */

--- a/celt/tests/test_unit_mathops.c

+++ b/celt/tests/test_unit_mathops.c

@@ -435,6 +435,77 @@

 #undef MAX_BUFFER_SIZE

+void testopus_limit2_checkwithin1(int use_ref_impl) /* Checks clamping and the within-[-1, 1] hint; use_ref_impl != 0 calls the C version directly instead of the arch-dispatched macro. Sets the file-level `ret` to 1 on failure. */

+{

+#define BUFFER_SIZE 37 /* strange float count to trigger residue loop of SIMD implementation */

+#define BYTE_COUNT (BUFFER_SIZE * sizeof(float))

+   int i, within1;

+   const int arch = opus_select_arch();

+   float pattern[BUFFER_SIZE], buffer[BUFFER_SIZE];

+   for (i = 0; i < BUFFER_SIZE; ++i)

+   {

+      pattern[i] = i % 2 ? -1.f : 1.f; /* alternate between the two edges of the [-1, 1] range */

+   }

+   /* All values within -1..1:

+   Nothing changed. Return value is implementation-dependent (not expected to recognise nothing exceeds -1..1) */

+   memcpy(buffer, pattern, BYTE_COUNT);

+   within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch);

+   if (memcmp(buffer, pattern, BYTE_COUNT) != 0)

+   {

+      fprintf (stderr, "opus_limit2_checkwithin1() modified values not exceeding -1..1 (ref=%d)\n", use_ref_impl);

+      ret = 1;

+   }

+   /* One value exceeds -1..1, within -2..2:

+   Values unchanged. Return value says not all values are within -1..1 */

+   for (i = 0; i < BUFFER_SIZE; ++i) /* move the outlier through every position, including the SIMD residue */

+   {

+      const float replace_value = pattern[i] * 1.001f;

+      memcpy(buffer, pattern, BYTE_COUNT);

+      buffer[i] = replace_value;

+      within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch);

+      if (within1 || buffer[i] != replace_value)

+      {

+         fprintf (stderr, "opus_limit2_checkwithin1() handled value exceeding -1..1 erroneously (ref=%d, i=%d)\n", use_ref_impl, i);

+         ret = 1;

+      }

+      buffer[i] = pattern[i]; /* restore the outlier so the memcmp below covers only the other samples */

+      if (memcmp(buffer, pattern, BYTE_COUNT) != 0)

+      {

+         fprintf (stderr, "opus_limit2_checkwithin1() modified value within -2..2  (ref=%d, i=%d)\n", use_ref_impl, i);

+         ret = 1;

+      }

+   }

+   /* One value exceeds -2..2:

+   One value is hardclipped, others are unchanged. Return value says not all values are within -1..1 */

+   for (i = 0; i < BUFFER_SIZE; ++i)

+   {

+      const float replace_value = pattern[i] * 2.1f; /* float literal: avoids promoting to double and narrowing back */

+      memcpy(buffer, pattern, BYTE_COUNT);

+      buffer[i] = replace_value;

+      within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch);

+      if (within1 || buffer[i] != (replace_value > 0.f ? 2.f : -2.f))

+      {

+         fprintf (stderr, "opus_limit2_checkwithin1() handled value exceeding -2..2 erroneously (ref=%d, i=%d)\n", use_ref_impl, i);

+         ret = 1;

+      }

+      buffer[i] = pattern[i]; /* restore the outlier so the memcmp below covers only the other samples */

+      if (memcmp(buffer, pattern, BYTE_COUNT) != 0)

+      {

+         fprintf (stderr, "opus_limit2_checkwithin1() modified value within -2..2  (ref=%d, i=%d)\n", use_ref_impl, i);

+         ret = 1;

+      }

+   }

+#undef BUFFER_SIZE

+#undef BYTE_COUNT

+}

 #endif

 int main(void)

@@ -461,6 +532,7 @@

       testcelt_float2int16(use_ref_impl[i], 32);

       testcelt_float2int16(use_ref_impl[i], 127);

       testcelt_float2int16(use_ref_impl[i], 1031);

+      testopus_limit2_checkwithin1(use_ref_impl[i]);

 #endif

    return ret;

--- a/src/opus.c

+++ b/src/opus.c

@@ -1,4 +1,5 @@

 /* Copyright (c) 2011 Xiph.Org Foundation, Skype Limited

+   Copyright (c) 2024 Arm Limited

    Written by Jean-Marc Valin and Koen Vos */

/*

    Redistribution and use in source and binary forms, with or without

@@ -30,23 +31,40 @@

 #endif

 #include "opus.h"

+#include "celt/mathops.h"

 #include "opus_private.h"

 #ifndef DISABLE_FLOAT_API

-OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem)

+void opus_pcm_soft_clip_impl(float *_x, int N, int C, float *declip_mem, int arch)

    int c;

    int i;

    float *x;

+   int all_within_neg1pos1;

    if (C<1 || N<1 || !_x || !declip_mem) return;

-   /* First thing: saturate everything to +/- 2 which is the highest level our

-      non-linearity can handle. At the point where the signal reaches +/-2,

-      the derivative will be zero anyway, so this doesn't introduce any

-      discontinuity in the derivative. */

-   for (i=0;i<N*C;i++)

-      _x[i] = MAX16(-2.f, MIN16(2.f, _x[i]));

+   /* Clamp everything within the range [-2, +2] which is the domain of the soft

+      clipping non-linearity. Outside the defined range the derivative will be zero,

+      therefore there is no discontinuity introduced here. The implementation

+      might provide a hint if all input samples are within the [-1, +1] range.

+   `opus_limit2_checkwithin1()`:

+      - Clamps all samples within the valid range [-2, +2].

+      - Generic C implementation:

+         * Does not attempt early detection whether samples are within hinted range.

+         * Always returns 0.

+      - Architecture specific implementation:

+         * Uses SIMD instructions to efficiently detect if all samples are

+           within the hinted range [-1, +1].

+         * Returns 1 if no samples exceed the hinted range, 0 otherwise.

+   `all_within_neg1pos1`:

+      - Optimization hint to skip per-sample out-of-bound checks.

+        If true, the check can be skipped. */

+   all_within_neg1pos1 = opus_limit2_checkwithin1(_x, N*C, arch);

    for (c=0;c<C;c++)

       float a;

@@ -72,10 +90,16 @@

          float maxval;

          int special=0;

          int peak_pos;

-         for (i=curr;i<N;i++)

+         /* Detection for early exit can be skipped if hinted by `all_within_neg1pos1` */

+         if (all_within_neg1pos1)

-            if (x[i*C]>1 || x[i*C]<-1)

-               break;

+            i = N;

+         } else {

+            for (i=curr;i<N;i++)

+            {

+               if (x[i*C]>1 || x[i*C]<-1)

+                  break;

+            }

          if (i==N)

@@ -135,6 +159,12 @@

       declip_mem[c] = a;

+OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem) /* Exported entry point; forwards to opus_pcm_soft_clip_impl(). */

+{

+   opus_pcm_soft_clip_impl(_x, N, C, declip_mem, 0); /* this signature has no arch argument, so arch is fixed to 0 */

+}

 #endif

 int encode_size(int size, unsigned char *data)

--- a/src/opus_decoder.c

+++ b/src/opus_decoder.c

@@ -814,7 +814,7 @@

       OPUS_PRINT_INT(nb_samples);

 #ifndef FIXED_POINT

    if (soft_clip)

-      opus_pcm_soft_clip(pcm, nb_samples, st->channels, st->softclip_mem);

+      opus_pcm_soft_clip_impl(pcm, nb_samples, st->channels, st->softclip_mem, st->arch);

    else

       st->softclip_mem[0]=st->softclip_mem[1]=0;

 #endif

--- a/src/opus_private.h

+++ b/src/opus_private.h

@@ -177,6 +177,8 @@

 void downmix_int24(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);

 int is_digital_silence(const opus_res* pcm, int frame_size, int channels, int lsb_depth);

+void opus_pcm_soft_clip_impl(float *_x, int N, int C, float *declip_mem, int arch); /* soft clip taking an explicit arch, which it forwards to opus_limit2_checkwithin1() */

 int encode_size(int size, unsigned char *data);

 opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs);

--

⑨