shithub: sox

Download patch

ref: eb205813c2efd425674e14462706aa1cd9a62190
parent: bea27204158fb70c8643450de60c5177ebcce1a0
author: Rob Sykes <robs@users.sourceforge.net>
date: Mon Mar 5 13:55:38 EST 2012

speed-ups for particular rational factors

--- a/src/rate.c
+++ b/src/rate.c
@@ -67,9 +67,9 @@
   return result;
 }
 
-typedef struct {    /* Data that are shared between channels and filters */
+typedef struct {    /* Data that are shared between channels and stages */
   sample_t   * poly_fir_coefs;
-  dft_filter_t half_band[2];    /* [0]: halve; [1]: down/up: halve/double */
+  dft_filter_t half_band[2];   /* Div or mul by n */
 } rate_shared_t;
 
 struct stage;
@@ -80,7 +80,7 @@
   int        pre;              /* Number of past samples to store */
   int        pre_post;         /* pre + number of future samples to store */
   int        preload;          /* Number of zero samples to pre-load the fifo */
-  int        which;            /* Which of the 2 half-band filters to use */
+  int        which;            /* Which, if any, of the 2 dft filters to use */
   stage_fn_t fn;
                                /* For poly_fir & spline: */
   union {                      /* 32bit.32bit fixed point arithmetic */
@@ -94,6 +94,7 @@
   } at, step;
   int        divisor;          /* For step: > 1 for rational; 1 otherwise */
   double     out_in_ratio;
+  int        rem, tuple;
 } stage_t;
 
 #define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post)
@@ -132,7 +133,6 @@
     num_in -= f->dft_length - overlap;
 
     output = fifo_reserve(output_fifo, f->dft_length);
-    fifo_trim_by(output_fifo, (f->dft_length + overlap) >> 1);
     memcpy(output, input, f->dft_length * sizeof(*output));
 
     lsx_safe_rdft(f->dft_length, 1, output);
@@ -145,8 +145,10 @@
     }
     lsx_safe_rdft(f->dft_length, -1, output);
 
-    for (j = 1, i = 2; i < f->dft_length - overlap; ++j, i += 2)
+    for (j = 0, i = p->rem; i < f->dft_length - overlap; ++j, i += p->tuple)
       output[j] = output[i];
+    p->rem = i - (f->dft_length - overlap);
+    fifo_trim_by(output_fifo, f->dft_length - j);
   }
 }
 
@@ -155,18 +157,21 @@
   sample_t * output;
   int i, j, num_in = max(0, fifo_occupancy(&p->fifo));
   rate_shared_t const * s = p->shared;
-  dft_filter_t const * f = &s->half_band[1];
+  dft_filter_t const * f = &s->half_band[p->which];
   int const overlap = f->num_taps - 1;
 
-  while (num_in > f->dft_length >> 1) {
+  while (p->rem + p->tuple * num_in >= f->dft_length) {
+    div_t divd = div(f->dft_length - overlap - p->rem + p->tuple - 1, p->tuple);
     sample_t const * input = fifo_read_ptr(&p->fifo);
-    fifo_read(&p->fifo, (f->dft_length - overlap) >> 1, NULL);
-    num_in -= (f->dft_length - overlap) >> 1;
+    fifo_read(&p->fifo, divd.quot, NULL);
+    num_in -= divd.quot;
 
     output = fifo_reserve(output_fifo, f->dft_length);
     fifo_trim_by(output_fifo, overlap);
-    for (j = i = 0; i < f->dft_length; ++j, i += 2)
-      output[i] = input[j], output[i+1] = 0;
+    memset(output, 0, f->dft_length * sizeof(*output));
+    for (j = 0, i = p->rem; i < f->dft_length; ++j, i += p->tuple)
+      output[i] = input[j];
+    p->rem = p->tuple - 1 - divd.rem;
 
     lsx_safe_rdft(f->dft_length, 1, output);
     output[0] *= f->coefs[0];
@@ -180,9 +185,9 @@
   }
 }
 
-static void half_band_filter_init(rate_shared_t * p, unsigned which,
-    int num_taps, sample_t const h[], double Fp, double att, int multiplier,
-    double phase, sox_bool allow_aliasing)
+static void init_dft_filter(rate_shared_t * p, unsigned which, int num_taps,
+    sample_t const h[], double Fp, double Fc, double Fn, double att,
+    int multiplier, double phase, sox_bool allow_aliasing)
 {
   dft_filter_t * f = &p->half_band[which];
   int dft_length, i;
@@ -198,7 +203,7 @@
     f->post_peak = num_taps / 2;
   }
   else {
-    double * h2 = lsx_design_lpf(Fp, 1., 2., allow_aliasing, att, &num_taps, 0);
+    double * h2 = lsx_design_lpf(Fp, Fc, Fn, allow_aliasing, att, &num_taps, 0, -1.);
 
     if (phase != 50)
       lsx_fir_to_phase(&h2, &num_taps, &f->post_peak, phase);
@@ -214,8 +219,8 @@
   assert(num_taps & 1);
   f->num_taps = num_taps;
   f->dft_length = dft_length;
-  lsx_debug("fir_len=%i dft_length=%i Fp=%g att=%g mult=%i",
-      num_taps, dft_length, Fp, att, multiplier);
+  lsx_debug("fir_len=%i dft_length=%i Fp=%g Fc=%g Fn=%g att=%g mult=%i",
+      num_taps, dft_length, Fp, Fc, Fn, att, multiplier);
   lsx_safe_rdft(dft_length, 1, f->coefs);
 }
 
@@ -223,7 +228,7 @@
 
 typedef struct {
   double     factor;
-  size_t     samples_in, samples_out;
+  uint64_t   samples_in, samples_out;
   int        level, input_stage_num, output_stage_num;
   sox_bool   upsample;
   stage_t    * stages;
@@ -237,9 +242,10 @@
 
 static void rate_init(rate_t * p, rate_shared_t * shared, double factor,
     quality_t quality, int interp_order, double phase, double bandwidth,
-    sox_bool allow_aliasing)
+    sox_bool allow_aliasing, sox_bool old_behaviour)
 {
-  int i, mult, divisor = 1;
+  int i, tuple, mult, divisor = 1, two_or_three = 2;
+  sox_bool two_factors = sox_false;
 
   assert(factor > 0);
   p->factor = factor;
@@ -247,19 +253,26 @@
     quality = High;
   if (quality != Quick) {
     const int max_divisor = 2048;      /* Keep coef table size ~< 500kb */
-    const double epsilon = 4 / MULT32; /* Scaled to half this at max_divisor */
-    p->upsample = p->factor < 1;
+    double epsilon;
+    p->upsample = factor < 1;
     for (i = factor, p->level = 0; i >>= 1; ++p->level); /* log base 2 */
     factor /= 1 << (p->level + !p->upsample);
+    epsilon = fabs((uint32_t)(factor * MULT32 + .5) / (factor * MULT32) - 1);
     for (i = 2; i <= max_divisor && divisor == 1; ++i) {
       double try_d = factor * i;
       int try = try_d + .5;
-      if (fabs(try - try_d) < try * epsilon * (1 - (.5 / max_divisor) * i)) {
-        if (try == i)  /* Rounded to 1:1? */
-          factor = 1, divisor = 2, p->upsample = sox_false;
+      if (fabs(try / try_d - 1) <= epsilon) { /* N.B. beware of long doubles */
+        if (try == i) {
+          factor = 1, divisor = 2;
+          if (p->upsample)
+            p->upsample = sox_false;
+          else ++p->level;
+        }
         else factor = try, divisor = i;
       }
     }
+    if (!old_behaviour && quality != Low && factor == 3 && divisor == 4 && p->level == 1)
+      two_or_three = 3;
   }
   p->stages = (stage_t *)calloc((size_t)p->level + 4, sizeof(*p->stages)) + 1;
   for (i = -1; i <= p->level + 1; ++i) p->stages[i].shared = shared;
@@ -270,9 +283,23 @@
     assert(!last_stage.step.parts.fraction);
   else if (quality != Quick)
     assert(!last_stage.step.parts.integer);
-  lsx_debug("i/o=%g; %.9g:%i @ level %i", p->factor, factor, divisor, p->level);
 
-  mult = 1 + p->upsample; /* Compensate for zero-stuffing in double_sample */
+  tuple = 1 + p->upsample;
+  if (!old_behaviour && p->upsample) {
+    if (factor == 2 && divisor == 3)
+      two_factors = sox_true, tuple = divisor;
+    else if (factor == 1) {
+      if (divisor < 6)
+        two_factors = sox_true, tuple = divisor;
+      else for (i = 2; !two_factors && i < 20; ++i)
+        if (!(divisor % i))
+          two_factors = sox_true, tuple = i;
+    }
+  }
+  mult = tuple;
+  lsx_debug("i/o=%.16g; %.12g:%i @ level %i tuple=%i", p->factor, last_stage.step.all / MULT32, divisor, p->level, tuple);
+  lsx_debug("%x.%x", last_stage.step.parts.integer, last_stage.step.parts.fraction);
+
   p->input_stage_num = -p->upsample;
   p->output_stage_num = p->level;
   if (quality == Quick) {
@@ -281,7 +308,7 @@
     last_stage.pre_post = max(3, last_stage.step.parts.integer);
     last_stage.preload = last_stage.pre = 1;
   }
-  else if (last_stage.out_in_ratio != 2 || (p->upsample && quality == Low)) {
+  else if ((two_or_three != 3 && last_stage.out_in_ratio != 2 && !two_factors) || (p->upsample && quality == Low)) {
     poly_fir_t const * f;
     poly_fir1_t const * f1;
     int n = 4 * p->upsample + range_limit(quality, Medium, Very) - Medium;
@@ -297,7 +324,7 @@
     if (!last_stage.shared->poly_fir_coefs) {
       int num_taps = 0, phases = divisor == 1? (1 << f1->phase_bits) : divisor;
       raw_coef_t * coefs = lsx_design_lpf(
-          f->pass, f->stop, 1., sox_false, f->att, &num_taps, phases);
+          f->pass, f->stop, 1., sox_false, f->att, &num_taps, phases, -1.);
       assert(num_taps == f->num_coefs * phases - 1);
       last_stage.shared->poly_fir_coefs =
           prepare_coefs(coefs, f->num_coefs, phases, interp_order, mult);
@@ -322,22 +349,50 @@
     double bw = bandwidth? 1 - (1 - bandwidth / 100) / LSX_TO_3dB : f->bw;
     double min = 1 - (allow_aliasing? LSX_MAX_TBW0A : LSX_MAX_TBW0) / 100;
     assert((size_t)(quality - Low) < array_length(filters));
-    half_band_filter_init(shared, p->upsample, f->len, f->h, bw, att, mult, phase, allow_aliasing);
+    init_dft_filter(shared, p->upsample, f->len, f->h, bw, 1., (double)max(tuple, two_or_three), att, mult, phase, allow_aliasing);
     if (p->upsample) {
       pre_stage.fn = double_sample; /* Finish off setting up pre-stage */
-      pre_stage.preload = shared->half_band[1].post_peak >> 1;
-       /* Start setting up post-stage; TODO don't use dft for short filters */
-      if ((1 - p->factor) / (1 - bw) > 2)
-        half_band_filter_init(shared, 0, 0, NULL, max(p->factor, min), att, 1, phase, allow_aliasing);
-      else shared->half_band[0] = shared->half_band[1];
+      pre_stage.preload = shared->half_band[1].post_peak / tuple;
+      pre_stage.rem     = shared->half_band[1].post_peak % tuple;
+      pre_stage.tuple = tuple;
+      pre_stage.which = 1;
+      if (two_factors && divisor != tuple) {
+        int other = divisor / tuple;
+        ++p->output_stage_num;
+        init_dft_filter(shared, 0, 0, NULL, 1., (double)tuple, (double)divisor, att, other, phase, allow_aliasing);
+        last_stage.fn = double_sample;
+        last_stage.preload = shared->half_band[0].post_peak / other;
+        last_stage.rem     = shared->half_band[0].post_peak % other;
+        last_stage.tuple = other;
+        last_stage.which = 0;
+      }
+      else {
+        /* Start setting up post-stage; TODO don't use dft for short filters */
+        if ((1 - p->factor) / (1 - bw) > 2)
+          init_dft_filter(shared, 0, 0, NULL, max(p->factor, min), 1., 2., att, 1, phase, allow_aliasing);
+        else shared->half_band[0] = shared->half_band[1];
+        if (two_factors && factor == 2) {
+          ++p->output_stage_num;
+          last_stage.fn = half_sample;
+          last_stage.preload = shared->half_band[0].post_peak;
+          last_stage.tuple = 2;
+        } else {
+          post_stage.fn = half_sample;
+          post_stage.preload = shared->half_band[0].post_peak;
+          post_stage.tuple = 2;
+        }
+      }
     }
-    else if (p->level > 0 && p->output_stage_num > p->level) {
-      double pass = bw * divisor / factor / 2;
-      if ((1 - pass) / (1 - bw) > 2)
-        half_band_filter_init(shared, 1, 0, NULL, max(pass, min), att, 1, phase, allow_aliasing);
+    else {
+      if (p->level > 0 && p->output_stage_num > p->level) {
+        double pass = bw * divisor / factor / 2;
+        if ((1 - pass) / (1 - bw) > 2)
+          init_dft_filter(shared, 1, 0, NULL, max(pass, min), 1., 2., att, 1, phase, allow_aliasing);
+      }
+      post_stage.fn = half_sample;
+      post_stage.preload = shared->half_band[0].post_peak;
+      post_stage.tuple = two_or_three;
     }
-    post_stage.fn = half_sample;
-    post_stage.preload = shared->half_band[0].post_peak;
   }
   else if (quality == Low && !p->upsample) {    /* dft is slower here, so */
     post_stage.fn = half_sample_low;            /* use normal convolution */
@@ -349,6 +404,7 @@
     if (shared->half_band[1].num_taps) {
       s->fn = half_sample;
       s->preload = shared->half_band[1].post_peak;
+      s->tuple = 2;
       s->which = 1;
     }
     else *s = post_stage;
@@ -393,7 +449,7 @@
 static void rate_flush(rate_t * p)
 {
   fifo_t * fifo = &p->stages[p->output_stage_num].fifo;
-  size_t samples_out = p->samples_in / p->factor + .5;
+  uint64_t samples_out = p->samples_in / p->factor + .5;
   size_t remaining = samples_out - p->samples_out;
   sample_t * buff = calloc(1024, sizeof(*buff));
 
@@ -429,7 +485,7 @@
   sox_rate_t      out_rate;
   int             quality;
   double          coef_interp, phase, bandwidth;
-  sox_bool        allow_aliasing;
+  sox_bool        allow_aliasing, old_behaviour;
   rate_t          rate;
   rate_shared_t   shared, * shared_ptr;
 } priv_t;
@@ -438,7 +494,7 @@
 {
   priv_t * p = (priv_t *) effp->priv;
   int c;
-  char * dummy_p, * found_at, * opts = "+i:b:p:MILasqlmhv", * qopts = opts +12;
+  char * dummy_p, * found_at, * opts = "+i:b:p:MILaosqlmhv", * qopts = opts +13;
   lsx_getopt_t optstate;
   lsx_getopt_init(argc, argv, opts, NULL, lsx_getopt_flag_none, 1, &optstate);
 
@@ -453,8 +509,9 @@
     case 'M': p->phase =  0; break;
     case 'I': p->phase = 25; break;
     case 'L': p->phase = 50; break;
-    case 's': p->bandwidth = 99; break;
     case 'a': p->allow_aliasing = sox_true; break;
+    case 'o': p->old_behaviour = sox_true; break;
+    case 's': p->bandwidth = 99; break;
     default: if ((found_at = strchr(qopts, c))) p->quality = found_at - qopts;
       else {lsx_fail("unknown option `-%c'", optstate.opt); return lsx_usage(effp);}
   }
@@ -493,7 +550,8 @@
   effp->out_signal.channels = effp->in_signal.channels;
   effp->out_signal.rate = out_rate;
   rate_init(&p->rate, p->shared_ptr, effp->in_signal.rate / out_rate,
-      p->quality, (int)p->coef_interp - 1, p->phase, p->bandwidth, p->allow_aliasing);
+      p->quality, (int)p->coef_interp - 1, p->phase, p->bandwidth,
+      p->allow_aliasing, p->old_behaviour);
   return SOX_SUCCESS;
 }