ref: a2f7fccb2b2cd6b504990b09f784d74b3bb9e8bd
parent: f7e4d06555e314a0b831d8a44a802043d16df69e
author: Chris Moeller <kode54@gmail.com>
date: Fri Mar 21 11:55:46 EDT 2014
Optimized SSE with optional SSE convolving loop, and doubled the kernel size
--- a/dumb/include/internal/lanczos_resampler.h
+++ b/dumb/include/internal/lanczos_resampler.h
@@ -1,11 +1,31 @@
#ifndef _LANCZOS_RESAMPLER_H_
#define _LANCZOS_RESAMPLER_H_
-void lanczos_init();
+// Ugly: when LANCZOS_DECORATE is defined, prefix every public symbol with it
+#ifdef LANCZOS_DECORATE
+#define PASTE(a,b) a ## b
+#define EVALUATE(a,b) PASTE(a,b)
+#define lanczos_init EVALUATE(LANCZOS_DECORATE,_lanczos_init)
+#define lanczos_resampler_create EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_create)
+#define lanczos_resampler_delete EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_delete)
+#define lanczos_resampler_dup EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_dup)
+#define lanczos_resampler_dup_inplace EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_dup_inplace)
+#define lanczos_resampler_get_free_count EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_get_free_count)
+#define lanczos_resampler_write_sample EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_write_sample)
+#define lanczos_resampler_set_rate EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_set_rate)
+#define lanczos_resampler_ready EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_ready)
+#define lanczos_resampler_clear EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_clear)
+#define lanczos_resampler_get_sample_count EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_get_sample_count)
+#define lanczos_resampler_get_sample EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_get_sample)
+#define lanczos_resampler_remove_sample EVALUATE(LANCZOS_DECORATE,_lanczos_resampler_remove_sample)
+#endif
-void * lanczos_resampler_create();
+void lanczos_init(void);
+
+void * lanczos_resampler_create(void);
void lanczos_resampler_delete(void *);
-void * lanczos_resampler_dup(void *);
+void * lanczos_resampler_dup(const void *);
+void lanczos_resampler_dup_inplace(void *, const void *);
int lanczos_resampler_get_free_count(void *);
void lanczos_resampler_write_sample(void *, short sample);
--- a/dumb/src/helpers/lanczos_resampler.c
+++ b/dumb/src/helpers/lanczos_resampler.c
@@ -2,6 +2,10 @@
#include <string.h>
#define _USE_MATH_DEFINES
#include <math.h>
+#if (defined(_M_IX86) || defined(__i386__) || defined(_M_X64) || defined(__amd64__))
+#include <xmmintrin.h>
+#define LANCZOS_SSE
+#endif
#ifndef M_PI
#define M_PI 3.14159265358979323846
@@ -10,29 +14,60 @@
#include "internal/lanczos_resampler.h"
enum { LANCZOS_RESOLUTION = 8192 };
-enum { LANCZOS_WIDTH = 8 };
+enum { LANCZOS_WIDTH = 16 }; /* kernel half-width: the run loops build LANCZOS_WIDTH * 2 taps per output sample */
enum { LANCZOS_SAMPLES = LANCZOS_RESOLUTION * LANCZOS_WIDTH };
-static double lanczos_lut[LANCZOS_SAMPLES + 1];
+static float lanczos_lut[LANCZOS_SAMPLES + 1]; /* filled by lanczos_init(): sinc(x) * sinc(x / LANCZOS_WIDTH) sampled every 1 / LANCZOS_RESOLUTION for x in [0, LANCZOS_WIDTH] */
enum { lanczos_buffer_size = LANCZOS_WIDTH * 4 };
-int fEqual(const double b, const double a)
+static int fEqual(const float b, const float a) /* nonzero when a and b differ by less than 1.0e-6 */
{
return fabs(a - b) < 1.0e-6;
}
-static double sinc(double x)
+static float sinc(float x) /* sin(pi * x) / (pi * x); returns 1.0 when fEqual() treats x as zero */
{
return fEqual(x, 0.0) ? 1.0 : sin(x * M_PI) / (x * M_PI);
}
-void lanczos_init()
+#ifdef LANCZOS_SSE
+#ifdef _MSC_VER
+#include <intrin.h>
+#elif defined(__clang__) || defined(__GNUC__)
+static inline void /* GCC/Clang stand-in used when <intrin.h> (the _MSC_VER branch) is not included */
+__cpuid(int *data, int selector) /* executes CPUID with EAX = selector; data[0..3] receive EAX, EBX, ECX, EDX */
{
+ asm("cpuid"
+ : "=a" (data[0]),
+ "=b" (data[1]),
+ "=c" (data[2]),
+ "=d" (data[3])
+ : "a"(selector)); /* EAX is the only input operand; ECX is not loaded before the instruction */
+}
+#else
+#define __cpuid(a,b) memset((a), 0, sizeof(int) * 4)
+#endif
+
+static int query_cpu_feature_sse(void) { /* returns 1 when CPUID leaf 1 reports SSE, 0 otherwise */
+ int buffer[4]; /* EAX, EBX, ECX, EDX, in that order */
+ __cpuid(buffer,1); /* leaf 1: feature flags; the memset fallback macro leaves all four words zero */
+ if ((buffer[3]&(1<<25)) == 0) return 0; /* EDX bit 25 is the SSE feature flag */
+ return 1;
+}
+
+static int lanczos_has_sse = 0; /* set by lanczos_init(); lanczos_resampler_fill() uses it to pick lanczos_resampler_run_sse() */
+#endif
+
+void lanczos_init(void) /* fills lanczos_lut and, when LANCZOS_SSE is defined, records whether the CPU reports SSE */
+{
unsigned i;
- double dx = (double)(LANCZOS_WIDTH) / LANCZOS_SAMPLES, x = 0.0;
+ float dx = (float)(LANCZOS_WIDTH) / LANCZOS_SAMPLES, x = 0.0; /* dx equals 1 / LANCZOS_RESOLUTION */
for (i = 0; i < LANCZOS_SAMPLES + 1; ++i, x += dx)
- lanczos_lut[i] = abs(x) < LANCZOS_WIDTH ? sinc(x) * sinc(x / LANCZOS_WIDTH) : 0.0;
+ lanczos_lut[i] = fabs(x) < LANCZOS_WIDTH ? sinc(x) * sinc(x / LANCZOS_WIDTH) : 0.0; /* fabs() replaces abs(), which takes an int argument; the last entry (x == LANCZOS_WIDTH) is stored as 0 */
+#ifdef LANCZOS_SSE
+ lanczos_has_sse = query_cpu_feature_sse(); /* runtime CPUID probe */
+#endif
}
typedef struct lanczos_resampler
@@ -45,7 +80,7 @@
int buffer_out[lanczos_buffer_size];
} lanczos_resampler;
-void * lanczos_resampler_create()
+void * lanczos_resampler_create(void)
{
lanczos_resampler * r = ( lanczos_resampler * ) malloc( sizeof(lanczos_resampler) );
if ( !r ) return 0;
@@ -67,9 +102,9 @@
free( _r );
}
-void * lanczos_resampler_dup(void * _r)
+void * lanczos_resampler_dup(const void * _r)
{
- lanczos_resampler * r_in = ( lanczos_resampler * ) _r;
+ const lanczos_resampler * r_in = ( const lanczos_resampler * ) _r;
lanczos_resampler * r_out = ( lanczos_resampler * ) malloc( sizeof(lanczos_resampler) );
if ( !r_out ) return 0;
@@ -85,6 +120,21 @@
return r_out;
}
+void lanczos_resampler_dup_inplace(void *_d, const void *_s) /* copies the state of resampler _s into the already-existing resampler _d; no allocation is performed */
+{
+ const lanczos_resampler * r_in = ( const lanczos_resampler * ) _s; /* source */
+ lanczos_resampler * r_out = ( lanczos_resampler * ) _d; /* destination */
+
+ r_out->write_pos = r_in->write_pos;
+ r_out->write_filled = r_in->write_filled;
+ r_out->read_pos = r_in->read_pos;
+ r_out->read_filled = r_in->read_filled;
+ r_out->phase = r_in->phase;
+ r_out->phase_inc = r_in->phase_inc;
+ memcpy( r_out->buffer_in, r_in->buffer_in, sizeof(r_in->buffer_in) ); /* memcpy requires non-overlapping objects, so _d and _s must be distinct */
+ memcpy( r_out->buffer_out, r_in->buffer_out, sizeof(r_in->buffer_out) );
+}
+
int lanczos_resampler_get_free_count(void *_r)
{
lanczos_resampler * r = ( lanczos_resampler * ) _r;
@@ -149,10 +199,10 @@
do
{
// accumulate in extended precision
- double kernel[LANCZOS_WIDTH * 2], kernel_sum = 0.0;
+ float kernel[LANCZOS_WIDTH * 2], kernel_sum = 0.0;
int i = LANCZOS_WIDTH;
int phase_adj = phase * step / LANCZOS_RESOLUTION;
- double sample;
+ float sample;
if ( out >= out_end )
break;
@@ -164,7 +214,7 @@
}
for (sample = 0, i = 0; i < LANCZOS_WIDTH * 2; ++i)
sample += in[i] * kernel[i];
- *out++ = (int) (sample / kernel_sum * 256.0);
+ *out++ = (int)(sample / kernel_sum * 256.0);
phase += phase_inc;
@@ -174,10 +224,10 @@
}
while ( in < in_end );
- r->phase = phase;
+ r->phase = (unsigned short) phase;
*out_ = out;
- used = in - in_;
+ used = (int)(in - in_);
r->write_filled -= used;
}
@@ -185,6 +235,79 @@
return used;
}
+#ifdef LANCZOS_SSE
+static int lanczos_resampler_run_sse(lanczos_resampler * r, int ** out_, int * out_end) /* SSE convolution path: writes samples from *out_ up to out_end, advances *out_, returns the number of input samples consumed */
+{
+ int in_size = r->write_filled;
+ float const* in_ = r->buffer_in + lanczos_buffer_size + r->write_pos - r->write_filled; /* NOTE(review): the lanczos_buffer_size offset suggests buffer_in holds a mirrored second copy - confirm against the struct definition */
+ int used = 0;
+ in_size -= LANCZOS_WIDTH * 2; /* every output sample reads LANCZOS_WIDTH * 2 consecutive inputs */
+ if ( in_size > 0 )
+ {
+ int* out = *out_;
+ float const* in = in_;
+ float const* const in_end = in + in_size;
+ int phase = r->phase;
+ int phase_inc = r->phase_inc;
+
+ int step = phase_inc > LANCZOS_RESOLUTION ? LANCZOS_RESOLUTION * LANCZOS_RESOLUTION / phase_inc : LANCZOS_RESOLUTION; /* LUT stride per tap; shrinks in proportion once phase_inc exceeds LANCZOS_RESOLUTION */
+
+ do
+ {
+ // accumulate in single precision: four partial sums held in one SSE register
+ float kernel_sum = 0.0;
+ __m128 kernel[LANCZOS_WIDTH / 2]; /* LANCZOS_WIDTH * 2 floats in total */
+ __m128 temp1, temp2;
+ __m128 samplex = _mm_setzero_ps();
+ float *kernelf = (float*)(&kernel); /* scalar view of the same storage */
+ int i = LANCZOS_WIDTH;
+ int phase_adj = phase * step / LANCZOS_RESOLUTION;
+
+ if ( out >= out_end )
+ break;
+
+ for (; i >= -LANCZOS_WIDTH + 1; --i) /* fills kernelf[LANCZOS_WIDTH * 2 - 1] down to kernelf[0] */
+ {
+ int pos = i * step;
+ kernel_sum += kernelf[i + LANCZOS_WIDTH - 1] = lanczos_lut[abs(phase_adj - pos)]; /* the LUT is indexed by absolute distance; the weights are summed for normalization */
+ }
+ for (i = 0; i < LANCZOS_WIDTH / 2; ++i) /* four taps per iteration */
+ {
+ temp1 = _mm_loadu_ps( (const float *)( in + i * 4 ) ); /* unaligned load of the input samples */
+ temp2 = _mm_load_ps( (const float *)( kernel + i ) ); /* aligned load: kernel is an array of __m128 */
+ temp1 = _mm_mul_ps( temp1, temp2 );
+ samplex = _mm_add_ps( samplex, temp1 );
+ }
+ kernel_sum = 1.0 / kernel_sum * 256.0; /* reciprocal of the weight sum, times 256.0 */
+ temp1 = _mm_movehl_ps( temp1, samplex ); /* lanes 0,1 of temp1 = lanes 2,3 of samplex */
+ samplex = _mm_add_ps( samplex, temp1 ); /* lanes 0,1 now hold s0+s2 and s1+s3 */
+ temp1 = samplex;
+ temp1 = _mm_shuffle_ps( temp1, samplex, _MM_SHUFFLE(0, 0, 0, 1) ); /* lane 0 of temp1 = lane 1 of samplex */
+ samplex = _mm_add_ps( samplex, temp1 ); /* lane 0 = s0+s1+s2+s3 */
+ temp1 = _mm_set_ss( kernel_sum ); /* lane 0 = kernel_sum, upper lanes zero */
+ samplex = _mm_mul_ps( samplex, temp1 );
+ *out++ = _mm_cvtss_si32( samplex ); /* converts lane 0 using the current SSE rounding mode, whereas the scalar path uses an (int) cast */
+
+ phase += phase_inc;
+
+ in += phase >> 13; /* 13 = log2(LANCZOS_RESOLUTION): advance by the whole-sample part of phase */
+
+ phase &= 8191; /* LANCZOS_RESOLUTION - 1: keep the fractional part */
+ }
+ while ( in < in_end );
+
+ r->phase = (unsigned short) phase;
+ *out_ = out;
+
+ used = (int)(in - in_);
+
+ r->write_filled -= used; /* consumed inputs no longer count as buffered */
+ }
+
+ return used;
+}
+#endif
+
static void lanczos_resampler_fill(lanczos_resampler * r)
{
while ( r->write_filled > (LANCZOS_WIDTH * 2) &&
@@ -195,7 +318,12 @@
int * out = r->buffer_out + write_pos;
if ( write_size > ( lanczos_buffer_size - r->read_filled ) )
write_size = lanczos_buffer_size - r->read_filled;
- lanczos_resampler_run( r, &out, out + write_size );
+#ifdef LANCZOS_SSE
+ if ( lanczos_has_sse )
+ lanczos_resampler_run_sse( r, &out, out + write_size );
+ else
+#endif
+ lanczos_resampler_run( r, &out, out + write_size );
r->read_filled += out - r->buffer_out - write_pos;
}
}