shithub: sox

Download patch

ref: a2ead81a9627893296fd099618b853a881535914
parent: ca0ab44167b35c0d825a61a842c5ba2b56dda0ef
author: Rob Sykes <robs@users.sourceforge.net>
date: Mon Apr 2 15:43:22 EDT 2012

more rate speed-ups

--- a/src/effects_i_dsp.c
+++ b/src/effects_i_dsp.c
@@ -280,7 +280,7 @@
     double n160 = (.0425* att - 1.4) / tr_bw;   /* Half order for att = 160 */
     n = n160 * (16.556 / (att - 39.6) + .8625) + .5;  /* For att [80,160) */
   }
-  return k? 2 * n : 2 * (n + (n & 1)) + 1; /* =1 %4 (0 phase 1/2 band) */
+  return k? 2 * n * k - 1 : 2 * (n + (n & 1)) + 1; /* =1 %4 (0 phase 1/2 band) */
 }
 
 double * lsx_design_lpf(
@@ -300,14 +300,12 @@
   Fp /= Fn, Fc /= Fn;        /* Normalise to Fn = 1 */
   tr_bw = LSX_TO_6dB * (Fc-Fp); /* Transition band-width: 6dB to stop points */
 
-  if (!*num_taps)
-    *num_taps = lsx_lpf_num_taps(att, tr_bw, k);
   if (beta < 0)
     beta = lsx_kaiser_beta(att);
-  if (k)
-    *num_taps = *num_taps * k - 1;
-  else k = 1;
-  lsx_debug("%g %g %g", Fp, tr_bw, Fc);
+  if (!*num_taps)
+    *num_taps = lsx_lpf_num_taps(att, tr_bw, k);
+  k = max(k, 1);
+  lsx_debug("design_lpf Fp=%g tr_bw=%g Fc=%g", Fp, tr_bw, Fc);
   return lsx_make_lpf(*num_taps, (Fc - tr_bw) / k, beta, (double)k, sox_true);
 }
 
--- a/src/rate.c
+++ b/src/rate.c
@@ -69,7 +69,7 @@
 
 typedef struct {    /* Data that are shared between channels and stages */
   sample_t   * poly_fir_coefs;
-  dft_filter_t dft_filter[2];   /* Div or mul by n */
+  dft_filter_t dft_filter[2];
 } rate_shared_t;
 
 struct stage;
@@ -92,15 +92,15 @@
     int64_t all;
     #define MULT32 (65536. * 65536.)
   } at, step;
-  int        divisor;          /* For step: > 1 for rational; 1 otherwise */
+  int        L, remL, remM;
+
   double     out_in_ratio;
-  int        rem, tuple;
 } stage_t;
 
 #define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post)
 #define stage_read_p(s) ((sample_t *)fifo_read_ptr(&(s)->fifo) + (s)->pre)
 
-static void cubic_spline(stage_t * p, fifo_t * output_fifo)
+static void cubic_spline_fn(stage_t * p, fifo_t * output_fifo)
 {
   int i, num_in = stage_occupancy(p), max_num_out = 1 + num_in*p->out_in_ratio;
   sample_t const * input = stage_read_p(p);
@@ -119,25 +119,48 @@
   p->at.parts.integer = 0;
 }
 
-static void decimate(stage_t * p, fifo_t * output_fifo)
+static void dft_stage_fn(stage_t * p, fifo_t * output_fifo)
 {
-  sample_t * output, tmp = 0;
+  sample_t * output, tmp;
   int i, j, num_in = max(0, fifo_occupancy(&p->fifo));
   rate_shared_t const * s = p->shared;
   dft_filter_t const * f = &s->dft_filter[p->which];
   int const overlap = f->num_taps - 1;
 
-  while (num_in >= f->dft_length) {
+  while (p->remL + p->L * num_in >= f->dft_length) {
+    div_t divd = div(f->dft_length - overlap - p->remL + p->L - 1, p->L);
     sample_t const * input = fifo_read_ptr(&p->fifo);
-    fifo_read(&p->fifo, f->dft_length - overlap, NULL);
-    num_in -= f->dft_length - overlap;
+    fifo_read(&p->fifo, divd.quot, NULL);
+    num_in -= divd.quot;
 
     output = fifo_reserve(output_fifo, f->dft_length);
-    memcpy(output, input, f->dft_length * sizeof(*output));
-
-    lsx_safe_rdft(f->dft_length, 1, output);
+    if (p->L == 2 || p->L == 4) { /* F-domain */
+      int portion = f->dft_length / p->L;
+      memcpy(output, input, (unsigned)portion * sizeof(*output));
+      lsx_safe_rdft(portion, 1, output);
+      for (i = portion + 2; i < (portion << 1); i += 2)
+        output[i] = output[(portion << 1) - i],
+        output[i+1] = -output[(portion << 1) - i + 1];
+      output[portion] = output[1];
+      output[portion + 1] = 0;
+      output[1] = output[0];
+      for (portion <<= 1; i < f->dft_length; i += portion, portion <<= 1) {
+        memcpy(output + i, output, portion * sizeof(*output));
+        output[i + 1] = 0;
+      }
+    } else {
+      if (p->L == 1)
+        memcpy(output, input, f->dft_length * sizeof(*output));
+      else {
+        memset(output, 0, f->dft_length * sizeof(*output));
+        for (j = 0, i = p->remL; i < f->dft_length; ++j, i += p->L)
+          output[i] = input[j];
+        p->remL = p->L - 1 - divd.rem;
+      }
+      lsx_safe_rdft(f->dft_length, 1, output);
+    }
     output[0] *= f->coefs[0];
-    if (p->tuple) {
+    if (p->step.parts.integer) {
       output[1] *= f->coefs[1];
       for (i = 2; i < f->dft_length; i += 2) {
         tmp = output[i];
@@ -145,10 +168,13 @@
         output[i+1] = f->coefs[i+1] * tmp + f->coefs[i  ] * output[i+1];
       }
       lsx_safe_rdft(f->dft_length, -1, output);
-      for (j = 0, i = p->rem; i < f->dft_length - overlap; ++j, i += p->tuple)
-        output[j] = output[i];
-      p->rem = i - (f->dft_length - overlap);
-      fifo_trim_by(output_fifo, f->dft_length - j);
+      if (p->step.parts.integer != 1) {
+        for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j, i += p->step.parts.integer)
+          output[j] = output[i];
+        p->remM = i - (f->dft_length - overlap);
+        fifo_trim_by(output_fifo, f->dft_length - j);
+      }
+      else fifo_trim_by(output_fifo, overlap);
     }
     else { /* F-domain */
       for (i = 2; i < (f->dft_length >> 1); i += 2) {
@@ -156,7 +182,7 @@
         output[i  ] = f->coefs[i  ] * tmp - f->coefs[i+1] * output[i+1];
         output[i+1] = f->coefs[i+1] * tmp + f->coefs[i  ] * output[i+1];
       }
-      output[1] = f->coefs[i  ] * tmp - f->coefs[i+1] * output[i+1];
+      output[1] = f->coefs[i] * output[i] - f->coefs[i+1] * output[i+1];
       lsx_safe_rdft(f->dft_length >> 1, -1, output);
       fifo_trim_by(output_fifo, (f->dft_length + overlap) >> 1);
     }
@@ -163,52 +189,14 @@
   }
 }
 
-static void interpolate(stage_t * p, fifo_t * output_fifo)
+static void setup_dft_stage(rate_shared_t * shared, int which, stage_t * stage, int L, int M)
 {
-  sample_t * output;
-  int i, j, num_in = max(0, fifo_occupancy(&p->fifo));
-  rate_shared_t const * s = p->shared;
-  dft_filter_t const * f = &s->dft_filter[p->which];
-  int const overlap = f->num_taps - 1;
-
-  while (p->rem + p->tuple * num_in >= f->dft_length) {
-    div_t divd = div(f->dft_length - overlap - p->rem + p->tuple - 1, p->tuple);
-    sample_t const * input = fifo_read_ptr(&p->fifo);
-    fifo_read(&p->fifo, divd.quot, NULL);
-    num_in -= divd.quot;
-
-    output = fifo_reserve(output_fifo, f->dft_length);
-    fifo_trim_by(output_fifo, overlap);
-    if (p->tuple == 2 || p->tuple == 4) { /* F-domain */
-      int portion = f->dft_length / p->tuple;
-      memcpy(output, input, (unsigned)portion * sizeof(*output));
-      lsx_safe_rdft(portion, 1, output);
-      for (i = portion + 2; i < (portion << 1); i += 2)
-        output[i] = output[(portion << 1) - i],
-        output[i+1] = -output[(portion << 1) - i + 1];
-      output[portion] = output[1];
-      output[portion + 1] = 0;
-      output[1] = output[0];
-      for (portion <<= 1; i < f->dft_length; i += portion, portion <<= 1) {
-        memcpy(output + i, output, portion * sizeof(*output));
-        output[i + 1] = 0;
-      }
-    } else {
-      memset(output, 0, f->dft_length * sizeof(*output));
-      for (j = 0, i = p->rem; i < f->dft_length; ++j, i += p->tuple)
-        output[i] = input[j];
-      p->rem = p->tuple - 1 - divd.rem;
-      lsx_safe_rdft(f->dft_length, 1, output);
-    }
-    output[0] *= f->coefs[0];
-    output[1] *= f->coefs[1];
-    for (i = 2; i < f->dft_length; i += 2) {
-      sample_t tmp = output[i];
-      output[i  ] = f->coefs[i  ] * tmp - f->coefs[i+1] * output[i+1];
-      output[i+1] = f->coefs[i+1] * tmp + f->coefs[i  ] * output[i+1];
-    }
-    lsx_safe_rdft(f->dft_length, -1, output);
-  }
+  stage->fn = dft_stage_fn;
+  stage->preload = shared->dft_filter[which].post_peak / L;
+  stage->remL    = shared->dft_filter[which].post_peak % L;
+  stage->L = L;
+  stage->step.parts.integer = M;
+  stage->which = which;
 }
 
 static void init_dft_filter(rate_shared_t * p, unsigned which, int num_taps,
@@ -264,193 +252,175 @@
 typedef struct {
   double     factor;
   uint64_t   samples_in, samples_out;
-  int        level, input_stage_num, output_stage_num;
-  sox_bool   upsample;
+  int        input_stage_num, output_stage_num;
   stage_t    * stages;
 } rate_t;
 
 #define pre_stage p->stages[-1]
-#define last_stage p->stages[p->level]
-#define post_stage p->stages[p->level + 1]
+#define frac_stage p->stages[level]
+#define post_stage p->stages[level + have_frac_stage]
+#define have_frac_stage (realM * fracL != 1)
 
 typedef enum {Default = -1, Quick, Low, Medium, High, Very} quality_t;
 
 static void rate_init(rate_t * p, rate_shared_t * shared, double factor,
     quality_t quality, int interp_order, double phase, double bandwidth,
-    sox_bool allow_aliasing, sox_bool old_behaviour)
+    sox_bool allow_aliasing)
 {
-  int i, tuple, mult, divisor = 1, two_or_three = 2;
-  sox_bool two_factors = sox_false;
+  int i, preL = 1, preM = 1, level = 0, fracL = 1, postL = 1, postM = 1;
+  sox_bool upsample = sox_false;
+  double realM = factor;
 
   assert(factor > 0);
   p->factor = factor;
+
   if (quality < Quick || quality > Very)
     quality = High;
-  if (quality != Quick) {
+
+  if (quality != Quick) while (sox_true) {
     const int max_divisor = 2048;      /* Keep coef table size ~< 500kb */
     double epsilon;
-    p->upsample = factor < 1;
-    for (i = factor, p->level = 0; i >>= 1; ++p->level); /* log base 2 */
-    factor /= 1 << (p->level + !p->upsample);
-    epsilon = fabs((uint32_t)(factor * MULT32 + .5) / (factor * MULT32) - 1);
-    for (i = 2; i <= max_divisor && divisor == 1; ++i) {
-      double try_d = factor * i;
+    upsample = realM < 1;
+    for (i = realM, level = 0; i >>= 1; ++level); /* log base 2 */
+    realM /= 1 << (level + !upsample);
+    epsilon = fabs((uint32_t)(realM * MULT32 + .5) / (realM * MULT32) - 1);
+    for (i = 2; i <= max_divisor && fracL == 1; ++i) {
+      double try_d = realM * i;
       int try = try_d + .5;
       if (fabs(try / try_d - 1) <= epsilon) { /* N.B. beware of long doubles */
-        if (try == i) {
-          factor = 1, divisor = 2;
-          if (p->upsample)
-            p->upsample = sox_false;
-          else ++p->level;
+        if (try == i)
+          realM = 1, fracL = 2, level += !upsample, upsample = sox_false;
+        else realM = try, fracL = i;
+      }
+    }
+    if (upsample) {
+      if (postL == 1 && (realM != 1 || fracL > 5) && fracL / realM > 4) {
+        realM = realM * (postL = min((fracL / realM), 4)) / fracL, fracL = 1;
+        continue;
+      }
+      else if ((realM == 2 && fracL == 3) || (realM == 3 && fracL == 4))
+        preL = fracL, preM = realM, fracL = realM = 1;
+      else if (fracL < 6 && realM == 1)
+        preL = fracL, fracL = 1;
+      else if (quality > Low) {
+        preL = 2;
+        if (fracL % preL)
+          realM *= preL;
+        else fracL /= preL;
+      }
+    }
+    else {
+      if (fracL > 2) {
+        int L = fracL, M = realM;
+        for (i = level + 1; i && !(L & 1); L >>= 1, --i);
+        if (L < 3 && (M <<= i) < 7) {
+          preL = L, preM = M, realM = fracL = 1, level = 0, upsample = sox_true;
+          break;
         }
-        else factor = try, divisor = i;
       }
+      postM = 2;
+      if (fracL == 2)
+        --fracL, postM -= !level, level -= !!level;
     }
-    if (!old_behaviour && quality != Low && factor == 3 && divisor == 4 && p->level == 1)
-      two_or_three = 3;
+    break;
   }
-  p->stages = (stage_t *)calloc((size_t)p->level + 4, sizeof(*p->stages)) + 1;
-  for (i = -1; i <= p->level + 1; ++i) p->stages[i].shared = shared;
-  last_stage.step.all = factor * MULT32 + .5;
-  last_stage.out_in_ratio = MULT32 * divisor / last_stage.step.all;
 
-  if (divisor != 1)
-    assert(!last_stage.step.parts.fraction);
-  else if (quality != Quick)
-    assert(!last_stage.step.parts.integer);
+  p->stages = (stage_t *)calloc((size_t)level + 4, sizeof(*p->stages)) + 1;
+  for (i = -1; i <= level + 1; ++i)
+    p->stages[i].shared = shared;
 
-  tuple = 1 + p->upsample;
-  if (!old_behaviour && p->upsample) {
-    if (factor == 2 && divisor == 3)
-      two_factors = sox_true, tuple = divisor;
-    else if (factor == 1) {
-      if (divisor < 6)
-        two_factors = sox_true, tuple = divisor;
-      else for (i = 2; !two_factors && i < 20; ++i)
-        if (!(divisor % i))
-          two_factors = sox_true, tuple = i;
-    }
-  }
-  mult = tuple;
-  lsx_debug("i/o=%.16g; %.12g:%i @ level %i tuple=%i last-stage:%x.%x", p->factor, last_stage.step.all / MULT32,
-      divisor, p->level, tuple, last_stage.step.parts.integer, last_stage.step.parts.fraction);
+  p->output_stage_num = level;
 
-  p->input_stage_num = -p->upsample;
-  p->output_stage_num = p->level;
+  frac_stage.step.all = realM * MULT32 + .5;
+  frac_stage.out_in_ratio = MULT32 * fracL / frac_stage.step.all;
+
   if (quality == Quick) {
+    frac_stage.fn = cubic_spline_fn;
+    frac_stage.pre_post = max(3, frac_stage.step.parts.integer);
+    frac_stage.preload = frac_stage.pre = 1;
     ++p->output_stage_num;
-    last_stage.fn = cubic_spline;
-    last_stage.pre_post = max(3, last_stage.step.parts.integer);
-    last_stage.preload = last_stage.pre = 1;
   }
-  else if ((two_or_three != 3 && last_stage.out_in_ratio != 2 && !two_factors) || (p->upsample && quality == Low)) {
-    poly_fir_t const * f;
+  else if (have_frac_stage) {
+    int n = (4 - (quality == Low)) * upsample + range_limit(quality, Medium, Very) - Medium;
+    poly_fir_t const * f = &poly_firs[n];
     poly_fir1_t const * f1;
-    int n = 4 * p->upsample + range_limit(quality, Medium, Very) - Medium;
+
+    if (f->num_coefs & 1) {
+      if (fracL != 1 && (fracL & 1))
+        fracL <<= 1, realM *= 2, frac_stage.step.all <<= 1;
+      frac_stage.at.all = fracL * .5 * MULT32 + .5;
+    }
+    frac_stage.L = fracL;
+
     if (interp_order < 0)
       interp_order = quality > High;
-    interp_order = divisor == 1? 1 + interp_order : 0;
-    last_stage.divisor = divisor;
-    p->output_stage_num += 2;
-    if (p->upsample && quality == Low)
-      mult = 1, ++p->input_stage_num, --p->output_stage_num, --n;
-    f = &poly_firs[n];
+    interp_order = fracL == 1? 1 + interp_order : 0;
     f1 = &f->interp[interp_order];
-    if (!last_stage.shared->poly_fir_coefs) {
-      int num_taps = 0, phases = divisor == 1? (1 << f1->phase_bits) : divisor;
+
+    if (!frac_stage.shared->poly_fir_coefs) {
+      int phases = fracL == 1? (1 << f1->phase_bits) : fracL;
+      int num_taps = f->num_coefs * phases - 1;
       raw_coef_t * coefs = lsx_design_lpf(
           f->pass, f->stop, 1., sox_false, f->att, &num_taps, phases, -1.);
       assert(num_taps == f->num_coefs * phases - 1);
-      last_stage.shared->poly_fir_coefs =
-          prepare_coefs(coefs, f->num_coefs, phases, interp_order, mult);
-      lsx_debug("fir_len=%i phases=%i coef_interp=%i mult=%i size=%s",
-          f->num_coefs, phases, interp_order, mult,
+      frac_stage.shared->poly_fir_coefs =
+          prepare_coefs(coefs, f->num_coefs, phases, interp_order, 1);
+      lsx_debug("fir_len=%i phases=%i coef_interp=%i size=%s",
+          f->num_coefs, phases, interp_order,
           lsx_sigfigs3((num_taps +1.) * (interp_order + 1) * sizeof(sample_t)));
       free(coefs);
     }
-    last_stage.fn = f1->fn;
-    last_stage.pre_post = f->num_coefs - 1;
-    last_stage.pre = 0;
-    last_stage.preload = last_stage.pre_post >> 1;
-    mult = 1;
+    frac_stage.fn = f1->fn;
+    frac_stage.pre_post = f->num_coefs - 1;
+    frac_stage.pre = 0;
+    frac_stage.preload = frac_stage.pre_post >> 1;
+    ++p->output_stage_num;
   }
-  if (quality > Low) {
-    typedef struct {int len; sample_t const * h; double bw, a;} filter_t;
+  if (quality == Low && !upsample) {  /* dft is slower here, so */
+    post_stage.fn = half_sample_low;       /* use normal convolution */
+    post_stage.pre_post = 2 * (array_length(half_fir_coefs_low) - 1);
+    post_stage.preload = post_stage.pre = post_stage.pre_post >> 1;
+    ++p->output_stage_num;
+  }
+  else if (quality != Quick) {
+    typedef struct {double bw, a;} filter_t;
     static filter_t const filters[] = {
-      {2 * array_length(half_fir_coefs_low) - 1, half_fir_coefs_low, 0,0},
-      {0, NULL, .931, 110}, {0, NULL, .931, 125}, {0, NULL, .931, 170}};
+      {.724, 100}, {.931, 110}, {.931, 125}, {.931, 170}};
     filter_t const * f = &filters[quality - Low];
     double att = allow_aliasing? (34./33)* f->a : f->a; /* negate att degrade */
     double bw = bandwidth? 1 - (1 - bandwidth / 100) / LSX_TO_3dB : f->bw;
     double min = 1 - (allow_aliasing? LSX_MAX_TBW0A : LSX_MAX_TBW0) / 100;
+    double pass = bw * fracL / realM / 2;
     assert((size_t)(quality - Low) < array_length(filters));
-    init_dft_filter(shared, p->upsample, f->len, f->h, bw, 1., (double)max(tuple, two_or_three), att, mult, phase, allow_aliasing);
-    if (p->upsample) {
-      pre_stage.fn = interpolate; /* Finish off setting up pre-stage */
-      pre_stage.preload = shared->dft_filter[1].post_peak / tuple;
-      pre_stage.rem     = shared->dft_filter[1].post_peak % tuple;
-      pre_stage.tuple = tuple;
-      pre_stage.which = 1;
-      if (two_factors && divisor != tuple) {
-        int other = divisor / tuple;
-        ++p->output_stage_num;
-        init_dft_filter(shared, 0, 0, NULL, 1., (double)tuple, (double)divisor, att, other, phase, allow_aliasing);
-        last_stage.fn = interpolate;
-        last_stage.preload = shared->dft_filter[0].post_peak / other;
-        last_stage.rem     = shared->dft_filter[0].post_peak % other;
-        last_stage.tuple = other;
-        last_stage.which = 0;
-      }
-      else {
-        /* Start setting up post-stage; TODO don't use dft for short filters */
-        if ((1 - p->factor) / (1 - bw) > 2 || tuple != 2)
-          init_dft_filter(shared, 0, 0, NULL, max(p->factor, min), 1., 2., att, 1, phase, allow_aliasing);
-        else shared->dft_filter[0] = shared->dft_filter[1];
-        if (two_factors && factor == 2) {
-          ++p->output_stage_num;
-          last_stage.fn = decimate;
-          last_stage.preload = shared->dft_filter[0].post_peak;
-          last_stage.tuple = (old_behaviour | allow_aliasing) << 1;
-        } else {
-          post_stage.fn = decimate;
-          post_stage.preload = shared->dft_filter[0].post_peak;
-          post_stage.tuple = (old_behaviour | allow_aliasing) << 1;
-        }
-      }
+
+    if (preL * preM != 1) {
+      init_dft_filter(shared, 0, 0, 0, bw, 1., (double)max(preL, preM), att, preL, phase, allow_aliasing);
+      setup_dft_stage(shared, 0, &pre_stage, preL, preM == 2 && !allow_aliasing? 0 : preM);
+      --p->input_stage_num;
     }
-    else {
-      if (p->level > 0 && p->output_stage_num > p->level) {
-        double pass = bw * divisor / factor / 2;
-        if ((1 - pass) / (1 - bw) > 2)
-          init_dft_filter(shared, 1, 0, NULL, max(pass, min), 1., 2., att, 1, phase, allow_aliasing);
-      }
-      post_stage.fn = decimate;
-      post_stage.preload = shared->dft_filter[0].post_peak;
-      post_stage.tuple = two_or_three == 2 && !(old_behaviour | allow_aliasing)? 0 : two_or_three;
+    else if (level && have_frac_stage && (1 - pass) / (1 - bw) > 2)
+      init_dft_filter(shared, 0, 0, NULL, max(pass, min), 1., 2., att, 1, phase, allow_aliasing);
+
+    if (postL * postM != 1) {
+      init_dft_filter(shared, 1, 0, 0,
+          bw * (upsample? factor * postL / postM : 1),
+          1., (double)(upsample? postL : postM), att, postL, phase, allow_aliasing);
+      setup_dft_stage(shared, 1, &post_stage, postL, postM == 2 && !allow_aliasing? 0 : postM);
+      ++p->output_stage_num;
     }
   }
-  else if (quality == Low && !p->upsample) {    /* dft is slower here, so */
-    post_stage.fn = half_sample_low;            /* use normal convolution */
-    post_stage.pre_post = 2 * (array_length(half_fir_coefs_low) - 1);
-    post_stage.preload = post_stage.pre = post_stage.pre_post >> 1;
-  }
-  if (p->level > 0) {
-    stage_t * s = & p->stages[p->level - 1];
-    if (shared->dft_filter[1].num_taps) {
-      s->fn = decimate;
-      s->preload = shared->dft_filter[1].post_peak;
-      s->tuple = (old_behaviour | allow_aliasing) << 1;
-      s->which = 1;
-    }
-    else *s = post_stage;
-  }
   for (i = p->input_stage_num; i <= p->output_stage_num; ++i) {
     stage_t * s = &p->stages[i];
-    if (i >= 0 && i < p->level - 1) {
+    if (i >= 0 && i < level - have_frac_stage) {
       s->fn = half_sample_25;
       s->pre_post = 2 * (array_length(half_fir_coefs_25) - 1);
       s->preload = s->pre = s->pre_post >> 1;
     }
+    else if (level && i == level - 1) {
+      if (shared->dft_filter[0].num_taps)
+        setup_dft_stage(shared, 0, s, 1, (int)allow_aliasing << 1);
+      else *s = post_stage;
+    }
     fifo_create(&s->fifo, (int)sizeof(sample_t));
     memset(fifo_reserve(&s->fifo, s->preload), 0, sizeof(sample_t)*s->preload);
     if (i < p->output_stage_num)
@@ -520,7 +490,7 @@
   sox_rate_t      out_rate;
   int             quality;
   double          coef_interp, phase, bandwidth;
-  sox_bool        allow_aliasing, old_behaviour;
+  sox_bool        allow_aliasing;
   rate_t          rate;
   rate_shared_t   shared, * shared_ptr;
 } priv_t;
@@ -545,7 +515,6 @@
     case 'I': p->phase = 25; break;
     case 'L': p->phase = 50; break;
     case 'a': p->allow_aliasing = sox_true; break;
-    case 'o': p->old_behaviour = sox_true; break;
     case 's': p->bandwidth = 99; break;
     default: if ((found_at = strchr(qopts, c))) p->quality = found_at - qopts;
       else {lsx_fail("unknown option `-%c'", optstate.opt); return lsx_usage(effp);}
@@ -586,7 +555,7 @@
   effp->out_signal.rate = out_rate;
   rate_init(&p->rate, p->shared_ptr, effp->in_signal.rate / out_rate,
       p->quality, (int)p->coef_interp - 1, p->phase, p->bandwidth,
-      p->allow_aliasing, p->old_behaviour);
+      p->allow_aliasing);
   return SOX_SUCCESS;
 }
 
--- a/src/rate_filters.h
+++ b/src/rate_filters.h
@@ -162,8 +162,8 @@
 #define CONVOLVE poly_fir_convolve_U100
 #include "rate_poly_fir.h"
 #define U100_3_b 6
-#define u100_l 10
-#define poly_fir_convolve_u100 _ _ _ _ _ _ _ _ _ _
+#define u100_l 11
+#define poly_fir_convolve_u100 _ _ _ _ _ _ _ _ _ _ _
 #define FUNCTION u100_0
 #define FIR_LENGTH u100_l
 #define CONVOLVE poly_fir_convolve_u100
@@ -189,8 +189,8 @@
 #define CONVOLVE poly_fir_convolve_u100
 #include "rate_poly_fir.h"
 #define u100_3_b 6
-#define u120_l 14
-#define poly_fir_convolve_u120 _ _ _ _ _ _ _ _ _ _ _ _ _ _
+#define u120_l 15
+#define poly_fir_convolve_u120 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
 #define FUNCTION u120_0
 #define FIR_LENGTH u120_l
 #define CONVOLVE poly_fir_convolve_u120
@@ -216,8 +216,8 @@
 #define CONVOLVE poly_fir_convolve_u120
 #include "rate_poly_fir.h"
 #define u120_3_b 6
-#define u150_l 20
-#define poly_fir_convolve_u150 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
+#define u150_l 21
+#define poly_fir_convolve_u150 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
 #define FUNCTION u150_0
 #define FIR_LENGTH u150_l
 #define CONVOLVE poly_fir_convolve_u150
--- a/src/rate_poly_fir0.h
+++ b/src/rate_poly_fir0.h
@@ -28,8 +28,8 @@
   sample_t * output = fifo_reserve(output_fifo, max_num_out);
   div_t divided2;
 
-  for (i = 0; p->at.parts.integer < num_in * p->divisor; ++i, p->at.parts.integer += p->step.parts.integer) {
-    div_t divided = div(p->at.parts.integer, p->divisor);
+  for (i = 0; p->at.parts.integer < num_in * p->L; ++i, p->at.parts.integer += p->step.parts.integer) {
+    div_t divided = div(p->at.parts.integer, p->L);
     sample_t const * at = input + divided.quot;
     sample_t sum = 0;
     int j = 0;
@@ -39,9 +39,9 @@
   }
   assert(max_num_out - i >= 0);
   fifo_trim_by(output_fifo, max_num_out - i);
-  divided2 = div(p->at.parts.integer, p->divisor);
+  divided2 = div(p->at.parts.integer, p->L);
   fifo_read(&p->fifo, divided2.quot, NULL);
-  p->at.parts.integer -= divided2.quot * p->divisor;
+  p->at.parts.integer -= divided2.quot * p->L;
 }
 
 #undef _