ref: 783ad76766e1f6b6aaca5d6eb415ac8a8269e1f2
parent: c9ba55208c842a1681d82e7d7ff44fafedd2a853
author: Linfeng Zhang <linfengz@google.com>
date: Thu Jun 16 12:21:02 EDT 2016
Revise celt_fir_c() to not pass in argument "mem". The "mem" in celt_fir_c() is either already contained in the head of input "x" in reverse order, or can easily be attached to the head of "x" before calling the function. Removing argument "mem" eliminates the redundant buffer copies inside the function. Update celt_fir_sse4_1() accordingly.
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -556,10 +556,11 @@
} else {
/* Pitch-based PLC */
const opus_val16 *window;
+ opus_val16 *exc;
opus_val16 fade = Q15ONE;
int pitch_index;
VARDECL(opus_val32, etmp);
- VARDECL(opus_val16, exc);
+ VARDECL(opus_val16, _exc);
if (loss_count == 0)
{
@@ -570,7 +571,8 @@
}
ALLOC(etmp, overlap, opus_val32);
- ALLOC(exc, MAX_PERIOD, opus_val16);
+ ALLOC(_exc, MAX_PERIOD+LPC_ORDER, opus_val16);
+ exc = _exc+LPC_ORDER;
window = mode->window;
c=0; do {
opus_val16 decay;
@@ -635,15 +637,14 @@
/* Initialize the LPC history with the samples just before the start
of the region for which we're computing the excitation. */
{
- opus_val16 lpc_mem[LPC_ORDER];
for (i=0;i<LPC_ORDER;i++)
{
- lpc_mem[i] =
- ROUND16(buf[DECODE_BUFFER_SIZE-exc_length-1-i], SIG_SHIFT);
+ exc[MAX_PERIOD-exc_length-LPC_ORDER+i] =
+ ROUND16(buf[DECODE_BUFFER_SIZE-exc_length-LPC_ORDER+i], SIG_SHIFT);
}
/* Compute the excitation for exc_length samples before the loss. */
celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
- exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem, st->arch);
+ exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, st->arch);
}
/* Check if the waveform is decaying, and if so how fast.
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -89,56 +89,47 @@
void celt_fir_c(
- const opus_val16 *_x,
+ const opus_val16 *x,
const opus_val16 *num,
- opus_val16 *_y,
+ opus_val16 *y,
int N,
int ord,
- opus_val16 *mem,
int arch)
{
int i,j;
VARDECL(opus_val16, rnum);
- VARDECL(opus_val16, x);
SAVE_STACK;
ALLOC(rnum, ord, opus_val16);
- ALLOC(x, N+ord, opus_val16);
for(i=0;i<ord;i++)
rnum[i] = num[ord-i-1];
- for(i=0;i<ord;i++)
- x[i] = mem[ord-i-1];
- for (i=0;i<N;i++)
- x[i+ord]=_x[i];
- for(i=0;i<ord;i++)
- mem[i] = _x[N-i-1];
#ifdef SMALL_FOOTPRINT
(void)arch;
for (i=0;i<N;i++)
{
- opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
+ opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
for (j=0;j<ord;j++)
{
- sum = MAC16_16(sum,rnum[j],x[i+j]);
+ sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
}
- _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
+ y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
}
#else
for (i=0;i<N-3;i+=4)
{
opus_val32 sum[4]={0,0,0,0};
- xcorr_kernel(rnum, x+i, sum, ord, arch);
- _y[i ] = SATURATE16(ADD32(EXTEND32(_x[i ]), PSHR32(sum[0], SIG_SHIFT)));
- _y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
- _y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
- _y[i+3] = SATURATE16(ADD32(EXTEND32(_x[i+3]), PSHR32(sum[3], SIG_SHIFT)));
+ xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
+ y[i ] = SATURATE16(ADD32(EXTEND32(x[i ]), PSHR32(sum[0], SIG_SHIFT)));
+ y[i+1] = SATURATE16(ADD32(EXTEND32(x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
+ y[i+2] = SATURATE16(ADD32(EXTEND32(x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
+ y[i+3] = SATURATE16(ADD32(EXTEND32(x[i+3]), PSHR32(sum[3], SIG_SHIFT)));
}
for (;i<N;i++)
{
opus_val32 sum = 0;
for (j=0;j<ord;j++)
- sum = MAC16_16(sum,rnum[j],x[i+j]);
- _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+ sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
+ y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT)));
}
#endif
RESTORE_STACK;
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -45,12 +45,11 @@
opus_val16 *y,
int N,
int ord,
- opus_val16 *mem,
int arch);
#if !defined(OVERRIDE_CELT_FIR)
-#define celt_fir(x, num, y, N, ord, mem, arch) \
- (celt_fir_c(x, num, y, N, ord, mem, arch))
+#define celt_fir(x, num, y, N, ord, arch) \
+ (celt_fir_c(x, num, y, N, ord, arch))
#endif
void celt_iir(const opus_val32 *x,
--- a/celt/x86/celt_lpc_sse.c
+++ b/celt/x86/celt_lpc_sse.c
@@ -40,17 +40,15 @@
#if defined(FIXED_POINT)
-void celt_fir_sse4_1(const opus_val16 *_x,
+void celt_fir_sse4_1(const opus_val16 *x,
const opus_val16 *num,
- opus_val16 *_y,
+ opus_val16 *y,
int N,
int ord,
- opus_val16 *mem,
int arch)
{
int i,j;
VARDECL(opus_val16, rnum);
- VARDECL(opus_val16, x);
__m128i vecNoA;
opus_int32 noA ;
@@ -57,46 +55,17 @@
SAVE_STACK;
ALLOC(rnum, ord, opus_val16);
- ALLOC(x, N+ord, opus_val16);
for(i=0;i<ord;i++)
rnum[i] = num[ord-i-1];
- for(i=0;i<ord;i++)
- x[i] = mem[ord-i-1];
-
- for (i=0;i<N-7;i+=8)
- {
- x[i+ord ]=_x[i ];
- x[i+ord+1]=_x[i+1];
- x[i+ord+2]=_x[i+2];
- x[i+ord+3]=_x[i+3];
- x[i+ord+4]=_x[i+4];
- x[i+ord+5]=_x[i+5];
- x[i+ord+6]=_x[i+6];
- x[i+ord+7]=_x[i+7];
- }
-
- for (;i<N-3;i+=4)
- {
- x[i+ord ]=_x[i ];
- x[i+ord+1]=_x[i+1];
- x[i+ord+2]=_x[i+2];
- x[i+ord+3]=_x[i+3];
- }
-
- for (;i<N;i++)
- x[i+ord]=_x[i];
-
- for(i=0;i<ord;i++)
- mem[i] = _x[N-i-1];
#ifdef SMALL_FOOTPRINT
for (i=0;i<N;i++)
{
- opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
+ opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
for (j=0;j<ord;j++)
{
- sum = MAC16_16(sum,rnum[j],x[i+j]);
+ sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
}
- _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
+ y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
}
#else
noA = EXTEND32(1) << SIG_SHIFT >> 1;
@@ -107,22 +76,22 @@
opus_val32 sums[4] = {0};
__m128i vecSum, vecX;
- xcorr_kernel(rnum, x+i, sums, ord, arch);
+ xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
vecSum = _mm_loadu_si128((__m128i *)sums);
vecSum = _mm_add_epi32(vecSum, vecNoA);
vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
- vecX = OP_CVTEPI16_EPI32_M64(_x + i);
+ vecX = OP_CVTEPI16_EPI32_M64(x + i);
vecSum = _mm_add_epi32(vecSum, vecX);
vecSum = _mm_packs_epi32(vecSum, vecSum);
- _mm_storel_epi64((__m128i *)(_y + i), vecSum);
+ _mm_storel_epi64((__m128i *)(y + i), vecSum);
}
for (;i<N;i++)
{
opus_val32 sum = 0;
for (j=0;j<ord;j++)
- sum = MAC16_16(sum, rnum[j], x[i + j]);
- _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+ sum = MAC16_16(sum, rnum[j], x[i+j-ord]);
+ y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT)));
}
#endif
--- a/celt/x86/celt_lpc_sse.h
+++ b/celt/x86/celt_lpc_sse.h
@@ -41,12 +41,11 @@
opus_val16 *y,
int N,
int ord,
- opus_val16 *mem,
int arch);
#if defined(OPUS_X86_PRESUME_SSE4_1)
-#define celt_fir(x, num, y, N, ord, mem, arch) \
- ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch))
+#define celt_fir(x, num, y, N, ord, arch) \
+ ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch))
#else
@@ -56,11 +55,10 @@
opus_val16 *y,
int N,
int ord,
- opus_val16 *mem,
int arch);
-# define celt_fir(x, num, y, N, ord, mem, arch) \
- ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch))
+# define celt_fir(x, num, y, N, ord, arch) \
+ ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch))
#endif
#endif
--- a/celt/x86/x86_celt_map.c
+++ b/celt/x86/x86_celt_map.c
@@ -47,7 +47,6 @@
opus_val16 *y,
int N,
int ord,
- opus_val16 *mem,
int arch
) = {
celt_fir_c, /* non-sse */
--- a/silk/LPC_analysis_filter.c
+++ b/silk/LPC_analysis_filter.c
@@ -57,7 +57,6 @@
{
opus_int j;
#if USE_CELT_FIR
- opus_int16 mem[SILK_MAX_ORDER_LPC];
opus_int16 num[SILK_MAX_ORDER_LPC];
#else
int ix;
@@ -74,10 +73,7 @@
for ( j = 0; j < d; j++ ) {
num[ j ] = -B[ j ];
}
- for (j=0;j<d;j++) {
- mem[ j ] = in[ d - j - 1 ];
- }
- celt_fir( in + d, num, out + d, len - d, d, mem, arch );
+ celt_fir( in + d, num, out + d, len - d, d, arch );
for ( j = 0; j < d; j++ ) {
out[ j ] = 0;
}