shithub: aacenc

Download patch

ref: 01ab3c253d2cf436b5907a3e29f85cc7a219f7df
parent: 51c7787a6371a29e3ea7ecbc54ea4457616aba35
author: Krzysztof Nikiel <knik@users.sourceforge.net>
date: Wed Oct 4 14:04:19 EDT 2017

added SSE quantizer

--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,4 @@
+	* SSE quantizer
 	* modified functioning of ABR mode (-b option)
 	* improved autotools support
 	* allowed even higher bitrates, including ADTS
--- a/libfaac/Makefile.am
+++ b/libfaac/Makefile.am
@@ -1,7 +1,7 @@
 common_SOURCES = aacquant.c bitstream.c fft.c frame.c midside.c blockswitch.c util.c channels.c filtbank.c huffman.c tns.c quantize.c
 common_INCLUDES = aacquant.h channels.h filtbank.h hufftab.h blockswitch.h coder.h frame.h midside.h tns.h bitstream.h fft.h huffman.h util.h quantize.h version.h
 common_LIBADD = -lm
-common_CFLAGS = -fvisibility=hidden
+common_CFLAGS = -fvisibility=hidden -msse2
 
 if USE_DRM
 lib_LTLIBRARIES = libfaac_drm.la
--- a/libfaac/quantize.c
+++ b/libfaac/quantize.c
@@ -121,6 +121,41 @@
 
       sfac = lrint(log(bandqual[sb] / rmsx) * sfstep);
       sfacfix = exp(sfac / sfstep);
+      coderInfo->scale_factor[coderInfo->sfcnt++] = sfac;
+
+#ifdef __GNUC__
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef int v4si __attribute__ ((vector_size (16)));
+      if (__builtin_cpu_supports("sse2"))
+      {
+          static const v4sf zero = {0, 0, 0, 0};
+          static const v4sf magic = {MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER};
+
+          for (cnt = start; cnt < end; cnt += 4)
+          {
+              float fin[4];
+              fin[0] = xr[cnt];
+              fin[1] = xr[cnt+1];
+              fin[2] = xr[cnt+2];
+              fin[3] = xr[cnt+3];
+
+              v4sf x = __builtin_ia32_loadups(fin);
+              x = __builtin_ia32_maxps(x, __builtin_ia32_subps(zero, x));
+
+              v4sf fix = {sfacfix, sfacfix, sfacfix, sfacfix};
+              x = __builtin_ia32_mulps(x, fix);
+              x = __builtin_ia32_mulps(x , __builtin_ia32_sqrtps(x));
+              x = __builtin_ia32_sqrtps(x);
+
+              x = __builtin_ia32_addps(x, magic);
+              v4si vi = __builtin_ia32_cvttps2dq(x);
+              memcpy(xi+cnt,&vi,16);
+          }
+
+          continue;
+      }
+#endif
+
       for (cnt = start; cnt < end; cnt++)
       {
           double tmp = fabs(xr[cnt]);
@@ -130,8 +165,6 @@
 
           xi[cnt] = (int)(tmp + MAGIC_NUMBER);
       }
-
-      coderInfo->scale_factor[coderInfo->sfcnt++] = sfac;
     }
 }