ref: 02037bc1e50947fa252ffde7318bb48565dc62e4
parent: d3951eafd209640101e9cca431860ca4ca32f993
author: Krzysztof Nikiel <knik@users.sourceforge.net>
date: Sat Oct 7 17:19:38 EDT 2017
cleaner and more compatible SSE code
--- a/libfaac/quantize.c
+++ b/libfaac/quantize.c
@@ -23,6 +23,18 @@
#include "util.h"
#include "quantize.h"
+#ifdef __SSE2__
+# ifdef __GNUC__
+# include <cpuid.h>
+# endif
+#endif
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define __SSE2__
+# define bit_SSE2 (1 << 26)
+#endif
+
#define MAGIC_NUMBER 0.4054
enum {NULL_SF = 0};
@@ -88,7 +100,21 @@
int start, end;
// 1.5dB step
static const double sfstep = 20.0 / 1.5 / M_LN10;
+#ifdef __SSE2__
+ int cpuid[4];
+ int sse2 = 0;
+ cpuid[3] = 0;
+# ifdef __GNUC__
+ __cpuid(1, cpuid[0], cpuid[1], cpuid[2], cpuid[3]);
+# endif
+# ifdef _MSC_VER
+ __cpuid(cpuid, 1);
+# endif
+ if (cpuid[3] & bit_SSE2)
+ sse2 = 1;
+#endif
+
for (sb = 0; sb < coderInfo->sfbn; sb++)
{
double sfacfix;
@@ -124,39 +150,21 @@
sfacfix = exp(sfac / sfstep);
coderInfo->scale_factor[coderInfo->sfcnt++] = sfac;
-#if defined(__GNUC__) && defined(__SSE2__)
-typedef float v4sf __attribute__ ((vector_size (16)));
-typedef int v4si __attribute__ ((vector_size (16)));
-#ifdef __APPLE__
- if (1)
-#else
- if (__builtin_cpu_supports("sse2"))
-#endif
+#ifdef __SSE2__
+ if (sse2)
{
- static const v4sf zero = {0, 0, 0, 0};
- static const v4sf magic = {MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER};
-
for (cnt = start; cnt < end; cnt += 4)
{
- float fin[4];
- fin[0] = xr[cnt];
- fin[1] = xr[cnt+1];
- fin[2] = xr[cnt+2];
- fin[3] = xr[cnt+3];
+ __m128 x = {xr[cnt], xr[cnt + 1], xr[cnt + 2], xr[cnt + 3]};
- v4sf x = _mm_loadu_ps(fin);
- x = _mm_max_ps(x, _mm_sub_ps(zero, x));
+ x = _mm_max_ps(x, -x);
+ x *= (__m128){sfacfix, sfacfix, sfacfix, sfacfix};
+ x *= _mm_sqrt_ps(x);
+ x = _mm_sqrt_ps(x);
+ x += (__m128){MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER};
- v4sf fix = {sfacfix, sfacfix, sfacfix, sfacfix};
- x = _mm_mul_ps(x, fix);
- x = _mm_mul_ps(x , __builtin_ia32_sqrtps(x));
- x = __builtin_ia32_sqrtps(x);
-
- x = _mm_add_ps(x, magic);
- v4si vi = __builtin_ia32_cvttps2dq(x);
- memcpy(xi+cnt,&vi,16);
+ *(__m128i*)(xi + cnt) = _mm_cvttps_epi32(x);
}
-
continue;
}
#endif