Performance optimisations

Use local arrays/variables to avoid multiple dereferences.
Merge outputs with delays to allow compiler to optimise loops.
lv2bench shows a nearly 5x CPU performance improvement.
master
Peter Nelson 2013-02-06 00:46:45 +00:00
parent a0e5770e28
commit 14441af2c4
1 changed file with 37 additions and 57 deletions

View File

@ -10,9 +10,7 @@
#define BUFFER_SECONDS 10
struct Tap {
float *t_gain[TAPS];
float *l_gain;
float *r_gain;
float *t_gain[TAPS + CHANNELS];
float *gain;
float *delay;
};
@ -24,12 +22,9 @@ struct PTap
double sample_rate;
size_t buffer_max;
Tap tap[TAPS];
Tap l_out;
Tap r_out;
Tap tap[TAPS + CHANNELS];
float *buffers[TAPS]; ///< Tap audio buffers
float *rp[TAPS]; ///< Read pointers
float *wp[TAPS]; ///< Write pointers
float *in_l;
@ -62,7 +57,6 @@ static LV2_Handle ptap_instantiate(
return NULL;
}
ptap->rp[i] = ptap->buffers[i];
ptap->wp[i] = ptap->buffers[i];
}
@ -90,24 +84,16 @@ static void ptap_connect_port(LV2_Handle lv2instance, uint32_t port, void *data)
int tap_index = port / CONTROLS_PER_TAP;
if (tap_index < TAPS) {
if (tap_index < TAPS + CHANNELS) {
tap = &ptap->tap[tap_index];
} else if (tap_index - TAPS == 0) {
tap = &ptap->l_out;
} else if (tap_index - TAPS == 1) {
tap = &ptap->r_out;
} else {
return;
}
int tap_port = port % CONTROLS_PER_TAP;
if (tap_port < TAPS) {
if (tap_port < TAPS + CHANNELS) {
tap->t_gain[tap_port] = fdata;
} else if (tap_port - TAPS == 0) {
tap->l_gain = fdata;
} else if (tap_port - TAPS == 1) {
tap->r_gain = fdata;
} else if (tap_port - TAPS == 2) {
tap->gain = fdata;
} else if (tap_port - TAPS == 3) {
@ -120,71 +106,65 @@ static void ptap_connect_port(LV2_Handle lv2instance, uint32_t port, void *data)
static void ptap_run(LV2_Handle lv2instance, uint32_t sample_count)
{
PTap *ptap = (PTap *)lv2instance;
Tap *tap;
float *wp;
const float *readp[TAPS + CHANNELS];
float gain[TAPS + CHANNELS];
/* Position read pointers behind write pointers */
for (int i = 0; i < TAPS; i++) {
int delay = *ptap->tap[i].delay * ptap->sample_rate;
ptap->rp[i] = ptap->wp[i] - delay;
if (ptap->rp[i] < ptap->buffers[i]) {
ptap->rp[i] += ptap->buffer_max;
/* Zero delay results in processing order dependencies. Tapiir doesn't permit it either... */
if (delay < 1) delay = 1;
readp[i] = ptap->wp[i] - delay;
if (readp[i] < ptap->buffers[i]) {
readp[i] += ptap->buffer_max;
}
gain[i] = *ptap->tap[i].gain;
}
float *in_l = ptap->in_l;
float *in_r = ptap->in_r;
float *out_l = ptap->out_l;
float *out_r = ptap->out_r;
readp[TAPS ] = ptap->in_l;
readp[TAPS + 1] = ptap->in_r;
gain[TAPS ] = *ptap->tap[TAPS ].gain;
gain[TAPS + 1] = *ptap->tap[TAPS + 1].gain;
while (sample_count--) {
for (int i = 0; i < TAPS; i++) {
wp = ptap->wp[i];
tap = &ptap->tap[i];
float rp[TAPS + CHANNELS];
float wp[TAPS + CHANNELS];
*wp = *in_l * *tap->l_gain;
*wp += *in_r * *tap->r_gain;
for (int j = 0; j < TAPS; j++) {
*wp += *ptap->rp[j] * *tap->t_gain[j];
for (int i = 0; i < TAPS + CHANNELS; i++) {
rp[i] = *readp[i]++;
}
for (int i = 0; i < TAPS + CHANNELS; i++) {
const Tap *tap = &ptap->tap[i];
float sample = 0;
for (int j = 0; j < TAPS + CHANNELS; j++) {
sample += rp[j] * *tap->t_gain[j];
}
*wp *= *tap->gain;
wp[i] = sample * gain[i];
}
/* Write to left output */
tap = &ptap->l_out;
*out_l = *in_l * *tap->l_gain;
*out_l += *in_r * *tap->r_gain;
for (int j = 0; j < TAPS; j++) {
*out_l += *ptap->rp[j] * *tap->t_gain[j];
for (int i = 0; i < TAPS; i++) {
*ptap->wp[i]++ = wp[i];
}
*out_l *= *tap->gain;
/* Write to right output */
tap = &ptap->r_out;
*out_r = *in_l * *tap->l_gain;
*out_r += *in_r * *tap->r_gain;
for (int j = 0; j < TAPS; j++) {
*out_r += *ptap->rp[j] * *tap->t_gain[j];
}
*out_r *= *tap->gain;
*out_l++ = wp[TAPS];
*out_r++ = wp[TAPS + 1];
/* Progress read pointers */
for (int i = 0; i < TAPS; i++) {
ptap->wp[i]++;
if (ptap->wp[i] >= ptap->buffers[i] + ptap->buffer_max) {
ptap->wp[i] = ptap->buffers[i];
}
ptap->rp[i]++;
if (ptap->rp[i] >= ptap->buffers[i] + ptap->buffer_max) {
ptap->rp[i] = ptap->buffers[i];
if (readp[i] >= ptap->buffers[i] + ptap->buffer_max) {
readp[i] = ptap->buffers[i];
}
}
in_l++;
in_r++;
out_l++;
out_r++;
}
}