shithub: opus-tools

Download patch

ref: 4cc06a7f0fe7cd93196e3078bcf1b7c1b0e0c934
parent: 665f3e69b0477861c48257b1d2d691518ddfbbf0
author: Chris Moeller <kode54@gmail.com>
date: Sat Jul 21 15:26:35 EDT 2012

Added SSE optimized resampling functions which result in a 15 to 30 percent increase in encoding speed

--- a/Makefile.am
+++ b/Makefile.am
@@ -18,6 +18,7 @@
                  src/opus_header.h \
                  src/opusinfo.h \
                  src/os_support.h \
+                 src/resample_sse.h \
                  src/speex_resampler.h \
                  src/stack_alloc.h \
                  src/wave_out.h \
--- a/src/opusdec.vcxproj
+++ b/src/opusdec.vcxproj
@@ -31,6 +31,7 @@
     <ClCompile Include="wav_io.c" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="..\win32\config.h" />
     <ClInclude Include="arch.h" />
     <ClInclude Include="diag_range.h" />
     <ClInclude Include="info_opus.h" />
@@ -39,6 +40,7 @@
     <ClInclude Include="opusinfo.h" />
     <ClInclude Include="opus_header.h" />
     <ClInclude Include="os_support.h" />
+    <ClInclude Include="resample_sse.h" />
     <ClInclude Include="speex_resampler.h" />
     <ClInclude Include="stack_alloc.h" />
     <ClInclude Include="wave_out.h" />
--- a/src/opusdec.vcxproj.filters
+++ b/src/opusdec.vcxproj.filters
@@ -79,5 +79,11 @@
     <ClInclude Include="info_opus.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="resample_sse.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\win32\config.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
\ No newline at end of file
--- a/src/opusenc.vcxproj
+++ b/src/opusenc.vcxproj
@@ -31,6 +31,7 @@
     <ClCompile Include="wav_io.c" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="..\win32\config.h" />
     <ClInclude Include="arch.h" />
     <ClInclude Include="diag_range.h" />
     <ClInclude Include="info_opus.h" />
@@ -39,6 +40,7 @@
     <ClInclude Include="opusinfo.h" />
     <ClInclude Include="opus_header.h" />
     <ClInclude Include="os_support.h" />
+    <ClInclude Include="resample_sse.h" />
     <ClInclude Include="speex_resampler.h" />
     <ClInclude Include="stack_alloc.h" />
     <ClInclude Include="wave_out.h" />
@@ -168,6 +170,7 @@
       <AdditionalIncludeDirectories>..\..\libogg\include;..\..\opus\include;..\win32;..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <FloatingPointModel>Fast</FloatingPointModel>
     </ClCompile>
     <Link>
       <TargetMachine>MachineX86</TargetMachine>
@@ -199,6 +202,7 @@
       <AdditionalIncludeDirectories>..\..\libogg\include;..\..\opus\include;..\win32;..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <FloatingPointModel>Fast</FloatingPointModel>
     </ClCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
--- a/src/opusenc.vcxproj.filters
+++ b/src/opusenc.vcxproj.filters
@@ -79,5 +79,11 @@
     <ClInclude Include="diag_range.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="resample_sse.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\win32\config.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
\ No newline at end of file
--- a/src/opusinfo.vcxproj
+++ b/src/opusinfo.vcxproj
@@ -32,6 +32,7 @@
     <ClCompile Include="wav_io.c" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="..\win32\config.h" />
     <ClInclude Include="arch.h" />
     <ClInclude Include="diag_range.h" />
     <ClInclude Include="info_opus.h" />
@@ -40,6 +41,7 @@
     <ClInclude Include="opusinfo.h" />
     <ClInclude Include="opus_header.h" />
     <ClInclude Include="os_support.h" />
+    <ClInclude Include="resample_sse.h" />
     <ClInclude Include="speex_resampler.h" />
     <ClInclude Include="stack_alloc.h" />
     <ClInclude Include="wave_out.h" />
@@ -169,6 +171,7 @@
       <AdditionalIncludeDirectories>..\..\libogg\include;..\..\opus\include;..\win32;..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <FloatingPointModel>Fast</FloatingPointModel>
     </ClCompile>
     <Link>
       <TargetMachine>MachineX86</TargetMachine>
@@ -200,6 +203,7 @@
       <AdditionalIncludeDirectories>..\..\libogg\include;..\..\opus\include;..\win32;..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <FloatingPointModel>Fast</FloatingPointModel>
     </ClCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
--- a/src/opusinfo.vcxproj.filters
+++ b/src/opusinfo.vcxproj.filters
@@ -82,5 +82,11 @@
     <ClInclude Include="lpc.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="resample_sse.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\win32\config.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
\ No newline at end of file
--- a/src/resample.c
+++ b/src/resample.c
@@ -95,7 +95,7 @@
 #define NULL 0
 #endif
 
-#ifdef _USE_SSE
+#if defined(FLOATING_POINT) && (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__amd64__))
 #include "resample_sse.h"
 #endif
 
--- /dev/null
+++ b/src/resample_sse.h
@@ -1,0 +1,172 @@
+/* Copyright (C) 2007-2008 Jean-Marc Valin
+ * Copyright (C) 2008 Thorvald Natvig
+ */
+/**
+   @file resample_sse.h
+   @brief Resampler functions (SSE version)
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+   
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+   
+   - Neither the name of the Xiph.org Foundation nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <xmmintrin.h>
+
+/* x86-64 targets are treated as always having SSE, so no run-time probe is
+ * compiled there; 32-bit x86 builds ask CPUID once and cache the answer. */
+#if defined(_M_X64) || defined(__amd64__)
+#define query_cpu_support_sse() 1
+#else
+#if defined(_MSC_VER)
+#include <intrin.h>   /* MSVC: void __cpuid(int regs[4], int leaf) */
+#else
+#include <cpuid.h>    /* GCC/Clang: __get_cpuid(leaf, &eax, &ebx, &ecx, &edx) */
+#endif
+
+/** Report whether the running CPU supports SSE.
+ *
+ *  The answer is computed on the first call and kept in function-local
+ *  statics, so later calls only read the cached value.
+ *
+ *  @return 1 if CPUID leaf 1 reports SSE (EDX bit 25), 0 otherwise.
+ */
+static inline int query_cpu_support_sse(void)
+{
+   static int initialized = 0;
+   static int return_value;
+   if (!initialized)
+   {
+#if defined(_MSC_VER)
+      int buffer[4];   /* EAX, EBX, ECX, EDX in that order */
+      __cpuid(buffer, 1);
+      return_value = (buffer[3] & (1<<25)) != 0;
+#else
+      unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
+      /* __get_cpuid() returns 0 when the requested leaf is unavailable. */
+      return_value = __get_cpuid(1, &eax, &ebx, &ecx, &edx) && (edx & (1u<<25)) != 0;
+#endif
+      initialized = 1;
+   }
+   return return_value;
+}
+#endif
+
+#define OVERRIDE_INNER_PRODUCT_SINGLE
+/** Single-precision dot product of a[0..len-1] and b[0..len-1].
+ *
+ *  When query_cpu_support_sse() is nonzero, whole groups of eight elements
+ *  are accumulated with unaligned SSE loads. Whatever is left over (and the
+ *  entire input when SSE is unavailable) is summed by a scalar loop, so len
+ *  need not be a multiple of 8 and neither array is read past index len-1.
+ *
+ *  @param a   first input vector (no alignment required)
+ *  @param b   second input vector (no alignment required)
+ *  @param len number of elements in each vector; 0 yields 0
+ *  @return the sum of a[i]*b[i] for i in [0, len)
+ */
+static inline float inner_product_single(const float *a, const float *b, unsigned int len)
+{
+   unsigned int i = 0;
+   float ret = 0;
+   if (query_cpu_support_sse())
+   {
+      __m128 sum = _mm_setzero_ps();
+      /* "len-i >= 8" cannot wrap around, unlike "i+8 <= len". */
+      for (;len-i>=8;i+=8)
+      {
+         sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
+         sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
+      }
+      /* Horizontal add: fold lanes 2,3 onto lanes 0,1, then lane 1 onto lane 0. */
+      sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+      sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+      _mm_store_ss(&ret, sum);
+   }
+   /* Scalar loop: the up-to-7 trailing elements after the vector loop, or
+      every element when the SSE branch was skipped (i is still 0 then). */
+   for (;i<len;i++) ret += a[i] * b[i];
+   return ret;
+}
+
+#define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+/** Interpolated single-precision product.
+ *
+ *  Computes  sum_{j=0..3} frac[j] * ( sum_{i=0..len-1} a[i] * b[i*oversample + j] ).
+ *  For every i < len four consecutive floats starting at b + i*oversample are
+ *  read, and four floats are read from frac. The SSE branch processes two
+ *  values of i per iteration and then handles a possible odd element on its
+ *  own, so len need not be even and a[len] is never touched.
+ *
+ *  @param a          input samples, len elements
+ *  @param b          coefficient table, indexed with a stride of oversample
+ *  @param len        number of elements of a to use
+ *  @param oversample stride (in floats) between the coefficient groups in b
+ *  @param frac       four interpolation weights; only read
+ *  @return the weighted sum described above
+ */
+static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+  unsigned int i = 0;
+  float ret;
+  if (query_cpu_support_sse())
+  {
+    __m128 sum = _mm_setzero_ps();
+    __m128 f = _mm_loadu_ps(frac);
+    /* "len-i >= 2" cannot wrap around, unlike "i+2 <= len". */
+    for(;len-i>=2;i+=2)
+    {
+      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
+      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
+    }
+    /* Odd len: accumulate the one remaining element. */
+    if (i<len)
+      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
+    /* Apply the four weights, then add the four lanes together. */
+    sum = _mm_mul_ps(f, sum);
+    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+    _mm_store_ss(&ret, sum);
+  }
+  else
+  {
+    /* Scalar fallback: one accumulator per interpolation phase. */
+    float accum[4] = {0,0,0,0};
+    for(i=0;i<len;i++)
+    {
+      const float curr_in=a[i];
+      accum[0] += curr_in * b[i * oversample + 0];
+      accum[1] += curr_in * b[i * oversample + 1];
+      accum[2] += curr_in * b[i * oversample + 2];
+      accum[3] += curr_in * b[i * oversample + 3];
+    }
+    ret = accum[0] * frac[0] + accum[1] * frac[1] + accum[2] * frac[2] + accum[3] * frac[3];
+  }
+  return ret;
+}
+
+#ifdef _USE_SSE2
+#include <emmintrin.h>
+#define OVERRIDE_INNER_PRODUCT_DOUBLE
+
+/** Dot product of two float vectors, accumulated in double precision (SSE2).
+ *
+ *  Each group of four float products is formed in single precision, widened
+ *  to double and added to a two-lane double accumulator. Whole groups of
+ *  eight elements go through the vector loop; the up-to-7 trailing elements
+ *  are added by a scalar loop, so len need not be a multiple of 8 and neither
+ *  array is read past index len-1. No run-time CPU check is performed here.
+ *
+ *  @param a   first input vector (no alignment required)
+ *  @param b   second input vector (no alignment required)
+ *  @param len number of elements in each vector; 0 yields 0
+ *  @return the sum of a[i]*b[i] for i in [0, len)
+ */
+static inline double inner_product_double(const float *a, const float *b, unsigned int len)
+{
+   unsigned int i = 0;
+   double ret;
+   __m128d sum = _mm_setzero_pd();
+   __m128 t;
+   /* "len-i >= 8" cannot wrap around, unlike "i+8 <= len". */
+   for (;len-i>=8;i+=8)
+   {
+      t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(t));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+
+      t = _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(t));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+   }
+   /* Add the high lane onto the low lane. _mm_unpackhi_pd is used instead of
+      casting between __m128d and __m128: such casts are a GNU extension and
+      are rejected by MSVC. */
+   sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
+   _mm_store_sd(&ret, sum);
+   /* Trailing elements: float product widened to double, as in the loop. */
+   for (;i<len;i++) ret += (double)(a[i] * b[i]);
+   return ret;
+}
+
+#define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+/** Interpolated product accumulated in double precision (SSE2).
+ *
+ *  Computes  sum_{j=0..3} frac[j] * ( sum_{i=0..len-1} a[i] * b[i*oversample + j] ).
+ *  Each product is formed in single precision and widened to double before
+ *  being accumulated: sum1 holds phases j=0,1 and sum2 holds phases j=2,3.
+ *  For every i < len four consecutive floats starting at b + i*oversample are
+ *  read, and four floats are read from frac. Two values of i are handled per
+ *  loop iteration and a possible odd element is handled separately, so len
+ *  need not be even. No run-time CPU check is performed here.
+ *
+ *  @param a          input samples, len elements
+ *  @param b          coefficient table, indexed with a stride of oversample
+ *  @param len        number of elements of a to use
+ *  @param oversample stride (in floats) between the coefficient groups in b
+ *  @param frac       four interpolation weights; only read
+ *  @return the weighted sum described above
+ */
+static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+  unsigned int i = 0;
+  double ret;
+  __m128d sum;
+  __m128d sum1 = _mm_setzero_pd();
+  __m128d sum2 = _mm_setzero_pd();
+  __m128 f = _mm_loadu_ps(frac);
+  __m128d f1 = _mm_cvtps_pd(f);
+  __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));
+  __m128 t;
+  /* "len-i >= 2" cannot wrap around, unlike "i+2 <= len". */
+  for(;len-i>=2;i+=2)
+  {
+    t = _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample));
+    sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
+    sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+
+    t = _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample));
+    sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
+    sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+  }
+  /* Odd len: accumulate the one remaining element. */
+  if (i<len)
+  {
+    t = _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample));
+    sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
+    sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+  }
+  sum1 = _mm_mul_pd(f1, sum1);
+  sum2 = _mm_mul_pd(f2, sum2);
+  sum = _mm_add_pd(sum1, sum2);
+  /* Add the high lane onto the low lane. _mm_unpackhi_pd is used instead of
+     casting between __m128d and __m128: such casts are a GNU extension and
+     are rejected by MSVC. */
+  sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
+  _mm_store_sd(&ret, sum);
+  return ret;
+}
+
+#endif