ref: 4cc06a7f0fe7cd93196e3078bcf1b7c1b0e0c934
parent: 665f3e69b0477861c48257b1d2d691518ddfbbf0
author: Chris Moeller <kode54@gmail.com>
date: Sat Jul 21 15:26:35 EDT 2012
Added SSE-optimized resampling functions, which result in a 15 to 30 percent increase in encoding speed
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,6 +18,7 @@
src/opus_header.h \
src/opusinfo.h \
src/os_support.h \
+ src/resample_sse.h \
src/speex_resampler.h \
src/stack_alloc.h \
src/wave_out.h \
--- a/src/opusdec.vcxproj
+++ b/src/opusdec.vcxproj
@@ -31,6 +31,7 @@
<ClCompile Include="wav_io.c" />
</ItemGroup>
<ItemGroup>
+ <ClInclude Include="..\win32\config.h" />
<ClInclude Include="arch.h" />
<ClInclude Include="diag_range.h" />
<ClInclude Include="info_opus.h" />
@@ -39,6 +40,7 @@
<ClInclude Include="opusinfo.h" />
<ClInclude Include="opus_header.h" />
<ClInclude Include="os_support.h" />
+ <ClInclude Include="resample_sse.h" />
<ClInclude Include="speex_resampler.h" />
<ClInclude Include="stack_alloc.h" />
<ClInclude Include="wave_out.h" />
--- a/src/opusdec.vcxproj.filters
+++ b/src/opusdec.vcxproj.filters
@@ -79,5 +79,11 @@
<ClInclude Include="info_opus.h">
<Filter>Header Files</Filter>
</ClInclude>
+ <ClInclude Include="resample_sse.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\win32\config.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
</ItemGroup>
</Project>
\ No newline at end of file
--- a/src/opusenc.vcxproj
+++ b/src/opusenc.vcxproj
@@ -31,6 +31,7 @@
<ClCompile Include="wav_io.c" />
</ItemGroup>
<ItemGroup>
+ <ClInclude Include="..\win32\config.h" />
<ClInclude Include="arch.h" />
<ClInclude Include="diag_range.h" />
<ClInclude Include="info_opus.h" />
@@ -39,6 +40,7 @@
<ClInclude Include="opusinfo.h" />
<ClInclude Include="opus_header.h" />
<ClInclude Include="os_support.h" />
+ <ClInclude Include="resample_sse.h" />
<ClInclude Include="speex_resampler.h" />
<ClInclude Include="stack_alloc.h" />
<ClInclude Include="wave_out.h" />
@@ -168,6 +170,7 @@
<AdditionalIncludeDirectories>..\..\libogg\include;..\..\opus\include;..\win32;..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<TargetMachine>MachineX86</TargetMachine>
@@ -199,6 +202,7 @@
<AdditionalIncludeDirectories>..\..\libogg\include;..\..\opus\include;..\win32;..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
--- a/src/opusenc.vcxproj.filters
+++ b/src/opusenc.vcxproj.filters
@@ -79,5 +79,11 @@
<ClInclude Include="diag_range.h">
<Filter>Header Files</Filter>
</ClInclude>
+ <ClInclude Include="resample_sse.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\win32\config.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
</ItemGroup>
</Project>
\ No newline at end of file
--- a/src/opusinfo.vcxproj
+++ b/src/opusinfo.vcxproj
@@ -32,6 +32,7 @@
<ClCompile Include="wav_io.c" />
</ItemGroup>
<ItemGroup>
+ <ClInclude Include="..\win32\config.h" />
<ClInclude Include="arch.h" />
<ClInclude Include="diag_range.h" />
<ClInclude Include="info_opus.h" />
@@ -40,6 +41,7 @@
<ClInclude Include="opusinfo.h" />
<ClInclude Include="opus_header.h" />
<ClInclude Include="os_support.h" />
+ <ClInclude Include="resample_sse.h" />
<ClInclude Include="speex_resampler.h" />
<ClInclude Include="stack_alloc.h" />
<ClInclude Include="wave_out.h" />
@@ -169,6 +171,7 @@
<AdditionalIncludeDirectories>..\..\libogg\include;..\..\opus\include;..\win32;..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<TargetMachine>MachineX86</TargetMachine>
@@ -200,6 +203,7 @@
<AdditionalIncludeDirectories>..\..\libogg\include;..\..\opus\include;..\win32;..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
--- a/src/opusinfo.vcxproj.filters
+++ b/src/opusinfo.vcxproj.filters
@@ -82,5 +82,11 @@
<ClInclude Include="lpc.h">
<Filter>Header Files</Filter>
</ClInclude>
+ <ClInclude Include="resample_sse.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\win32\config.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
</ItemGroup>
</Project>
\ No newline at end of file
--- a/src/resample.c
+++ b/src/resample.c
@@ -95,7 +95,7 @@
#define NULL 0
#endif
-#ifdef _USE_SSE
+#if defined(FLOATING_POINT) && (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__amd64__))
#include "resample_sse.h"
#endif
--- /dev/null
+++ b/src/resample_sse.h
@@ -1,0 +1,172 @@
+/* Copyright (C) 2007-2008 Jean-Marc Valin
+ * Copyright (C) 2008 Thorvald Natvig
+ */
+/**
+ @file resample_sse.h
+ @brief Resampler functions (SSE version)
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of the Xiph.org Foundation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <xmmintrin.h>
+
+/* Runtime SSE detection.
+ *
+ * query_cpu_support_sse() evaluates to non-zero when the SSE code paths in
+ * this header may be used.  64-bit x86 builds fold the check to the constant
+ * 1 (SSE is part of the x86-64 baseline); every other build asks CPUID once
+ * and caches the answer.
+ *
+ * The CPUID intrinsic is compiler specific: MSVC declares
+ * __cpuid(int regs[4], int leaf) in <intrin.h>, while GCC and Clang provide
+ * __get_cpuid() in <cpuid.h>.
+ */
+#if defined(_M_X64) || defined(__amd64__)
+#define query_cpu_support_sse() 1
+#else
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <cpuid.h>
+#endif
+
+static inline int query_cpu_support_sse(void)
+{
+   /* The lazy initialisation is not synchronised: concurrent first calls
+      may each execute CPUID, but they all store the same result. */
+   static int initialized = 0;
+   static int return_value;
+   if (!initialized)
+   {
+#if defined(_MSC_VER)
+      int buffer[4];   /* EAX, EBX, ECX, EDX of leaf 1 */
+      __cpuid(buffer, 1);
+      return_value = (buffer[3] & (1<<25)) != 0;   /* EDX bit 25: SSE */
+#else
+      unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
+      /* __get_cpuid() returns 0 when the requested leaf is unavailable. */
+      return_value = __get_cpuid(1, &eax, &ebx, &ecx, &edx) && (edx & (1u<<25)) != 0;
+#endif
+      initialized = 1;
+   }
+   return return_value;
+}
+#endif
+
+#define OVERRIDE_INNER_PRODUCT_SINGLE
+/* Single-precision dot product of a[0..len-1] and b[0..len-1].
+ *
+ * Uses unaligned SSE loads when query_cpu_support_sse() is non-zero and a
+ * plain scalar loop otherwise.  The vector loop consumes eight elements per
+ * iteration; the remaining len % 8 elements are added one at a time so that
+ * nothing beyond a[len-1] / b[len-1] is read.  For lengths that are a
+ * multiple of 8 the tail loop does nothing.
+ *
+ * a, b: input vectors, no alignment requirement.
+ * len:  number of elements in each vector (0 yields 0).
+ */
+static inline float inner_product_single(const float *a, const float *b, unsigned int len)
+{
+   unsigned int i;   /* unsigned so the comparisons against len are not mixed-sign */
+   float ret;
+   if (query_cpu_support_sse())
+   {
+      const unsigned int vec_len = len & ~7u;   /* largest multiple of 8 <= len */
+      __m128 sum = _mm_setzero_ps();
+      for (i=0;i<vec_len;i+=8)
+      {
+         sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
+         sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
+      }
+      /* Horizontal add: fold lanes 2,3 onto lanes 0,1, then lane 1 onto lane 0. */
+      sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+      sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+      _mm_store_ss(&ret, sum);
+      /* Scalar tail for lengths that are not a multiple of 8. */
+      for (;i<len;i++) ret += a[i] * b[i];
+   }
+   else
+   {
+      ret = 0;
+      for (i=0;i<len;i++) ret += a[i] * b[i];
+   }
+   return ret;
+}
+
+#define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+/* Interpolated single-precision filter product.
+ *
+ * For k = 0..3 this accumulates acc[k] = sum over i of a[i] * b[i*oversample + k]
+ * (i = 0..len-1) and returns acc[0]*frac[0] + acc[1]*frac[1] + acc[2]*frac[2]
+ * + acc[3]*frac[3].
+ *
+ * a:          len input samples.
+ * b:          coefficient table; elements up to b[(len-1)*oversample + 3] are read.
+ * len:        number of taps.
+ * oversample: stride, in floats, between consecutive taps in b.
+ * frac:       four interpolation weights (read only).
+ *
+ * The SSE path handles two taps per iteration; when len is odd the last tap
+ * is folded in separately so that neither a[len] nor the row of b that would
+ * follow the table is read.
+ */
+static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+   unsigned int i;   /* unsigned so the comparisons against len are not mixed-sign */
+   float ret;
+   if (query_cpu_support_sse())
+   {
+      const unsigned int pair_len = len & ~1u;   /* largest even count <= len */
+      __m128 sum = _mm_setzero_ps();
+      __m128 f = _mm_loadu_ps(frac);
+      for(i=0;i<pair_len;i+=2)
+      {
+         sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
+         sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
+      }
+      /* Odd len: one tap is left over after the pair loop. */
+      if (i<len)
+         sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
+      /* Weight the four accumulators, then add them horizontally. */
+      sum = _mm_mul_ps(f, sum);
+      sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+      sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+      _mm_store_ss(&ret, sum);
+   }
+   else
+   {
+      float accum[4] = {0,0,0,0};
+      for(i=0;i<len;i++)
+      {
+         const float curr_in=a[i];
+         accum[0] += curr_in * b[i * oversample + 0];
+         accum[1] += curr_in * b[i * oversample + 1];
+         accum[2] += curr_in * b[i * oversample + 2];
+         accum[3] += curr_in * b[i * oversample + 3];
+      }
+      ret = accum[0] * frac[0] + accum[1] * frac[1] + accum[2] * frac[2] + accum[3] * frac[3];
+   }
+   return ret;
+}
+
+#ifdef _USE_SSE2
+#include <emmintrin.h>
+#define OVERRIDE_INNER_PRODUCT_DOUBLE
+
+/* Dot product of a[0..len-1] and b[0..len-1] with a double-precision
+ * accumulator (SSE2).
+ *
+ * Products are formed four at a time in single precision (_mm_mul_ps) and
+ * widened to double before they are summed.  The vector loop consumes eight
+ * elements per iteration; the remaining len % 8 elements are added one at a
+ * time so that nothing beyond a[len-1] / b[len-1] is read.
+ *
+ * a, b: input vectors, no alignment requirement.
+ * len:  number of elements in each vector (0 yields 0).
+ */
+static inline double inner_product_double(const float *a, const float *b, unsigned int len)
+{
+   unsigned int i;   /* unsigned so the comparisons against len are not mixed-sign */
+   const unsigned int vec_len = len & ~7u;   /* largest multiple of 8 <= len */
+   double ret;
+   __m128d sum = _mm_setzero_pd();
+   __m128 t;
+   for (i=0;i<vec_len;i+=8)
+   {
+      t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(t));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+
+      t = _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(t));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+   }
+   /* Add the upper double lane onto the lower one.  _mm_unpackhi_pd yields
+      the same lanes as reinterpreting the register for _mm_movehl_ps, without
+      relying on GCC-style vector casts that other compilers reject. */
+   sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
+   _mm_store_sd(&ret, sum);
+   /* Scalar tail: float product widened before accumulation, as above. */
+   for (;i<len;i++) ret += (double)(a[i] * b[i]);
+   return ret;
+}
+
+#define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+/* Interpolated filter product with double-precision accumulation (SSE2).
+ *
+ * For k = 0..3 this accumulates acc[k] = sum over i of a[i] * b[i*oversample + k]
+ * (i = 0..len-1), each product formed in single precision and widened to
+ * double, and returns acc[0]*frac[0] + acc[1]*frac[1] + acc[2]*frac[2]
+ * + acc[3]*frac[3] computed in double.
+ *
+ * a:          len input samples.
+ * b:          coefficient table; elements up to b[(len-1)*oversample + 3] are read.
+ * len:        number of taps.
+ * oversample: stride, in floats, between consecutive taps in b.
+ * frac:       four interpolation weights (read only).
+ *
+ * Two taps are handled per iteration; when len is odd the last tap is folded
+ * in separately so that neither a[len] nor the row of b that would follow
+ * the table is read.
+ */
+static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+   unsigned int i;   /* unsigned so the comparisons against len are not mixed-sign */
+   const unsigned int pair_len = len & ~1u;   /* largest even count <= len */
+   double ret;
+   __m128d sum;
+   __m128d sum1 = _mm_setzero_pd();   /* accumulators for k = 0,1 */
+   __m128d sum2 = _mm_setzero_pd();   /* accumulators for k = 2,3 */
+   __m128 f = _mm_loadu_ps(frac);
+   __m128d f1 = _mm_cvtps_pd(f);
+   __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));
+   __m128 t;
+   for(i=0;i<pair_len;i+=2)
+   {
+      t = _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample));
+      sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
+      sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+
+      t = _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample));
+      sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
+      sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+   }
+   /* Odd len: one tap is left over after the pair loop. */
+   if (i<len)
+   {
+      t = _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample));
+      sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
+      sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+   }
+   sum1 = _mm_mul_pd(f1, sum1);
+   sum2 = _mm_mul_pd(f2, sum2);
+   sum = _mm_add_pd(sum1, sum2);
+   /* Add the upper double lane onto the lower one.  _mm_unpackhi_pd yields
+      the same lanes as reinterpreting the register for _mm_movehl_ps, without
+      relying on GCC-style vector casts that other compilers reject. */
+   sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
+   _mm_store_sd(&ret, sum);
+   return ret;
+}
+
+#endif