audiobook-creation-exchange 0.1.0

ACX-compliant audio post-processing: normalisation, limiting, gating, LUFS measurement, and spectral analysis for AI-generated speech audio.
Documentation
//! Frequency-selective de-esser for TTS sibilance reduction.
//!
//! Applies gain reduction in the 5–8 kHz band when the sibilance energy ratio
//! exceeds a threshold. Uses 50 ms Hann-windowed frames with 50 % overlap-add
//! (OLA) for smooth, artifact-free processing.
//!
//! With periodic Hann at 50 % overlap the OLA sum = 1.0 everywhere (steady state),
//! so an unmodified frame passes through unchanged.

use rustfft::{FftPlanner, num_complex::Complex};

/// Default sibilance ratio above which reduction begins.
///
/// The ratio is the share of a frame's spectral energy that falls inside the
/// de-ess band (`DEESS_LO_HZ`..`DEESS_HI_HZ`), so it lies between 0 and 1.
pub const DEFAULT_THRESHOLD_RATIO: f32 = 0.45;
/// Default maximum dB reduction in the sibilance band.
///
/// Expressed as a positive number of decibels of attenuation.
pub const DEFAULT_MAX_REDUCTION_DB: f32 = 6.0;

/// Lower edge of the band that is measured and attenuated, in Hz.
const DEESS_LO_HZ: f32 = 5_000.0;
/// Upper edge of the band that is measured and attenuated, in Hz.
const DEESS_HI_HZ: f32 = 8_000.0;
/// How far the sibilance ratio must exceed the threshold before the full
/// reduction is applied; below that the reduction in dB scales linearly.
const KNEE_WIDTH: f32 = 0.15; // ratio units for soft-knee transition
/// Analysis frame length in milliseconds (converted to samples per call).
const WINDOW_MS: usize = 50;

/// De-ess `samples` in place using the default parameters.
///
/// Convenience wrapper around [`deess_with_params`] that passes
/// [`DEFAULT_THRESHOLD_RATIO`] (0.45) as the sibilance-ratio threshold and
/// [`DEFAULT_MAX_REDUCTION_DB`] (6 dB) as the maximum attenuation of the
/// sibilance band. The reduction is a relative gain change in dB, not an
/// absolute dBFS level.
///
/// `sample_rate` is the sample rate of `samples` in Hz.
pub fn deess(samples: &mut [i16], sample_rate: u32) {
    deess_with_params(
        samples,
        sample_rate,
        DEFAULT_THRESHOLD_RATIO,
        DEFAULT_MAX_REDUCTION_DB,
    );
}

/// De-ess `samples` with explicit `threshold_ratio` and `max_reduction_db`.
pub fn deess_with_params(
    samples: &mut [i16],
    sample_rate: u32,
    threshold_ratio: f32,
    max_reduction_db: f32,
) {
    let window_size = (sample_rate as usize * WINDOW_MS) / 1000;
    if window_size < 4 || samples.is_empty() {
        return;
    }
    let hop = window_size / 2;
    let half = window_size / 2;

    let freq_res = sample_rate as f32 / window_size as f32;
    let lo_bin = (DEESS_LO_HZ / freq_res) as usize;
    let hi_bin = ((DEESS_HI_HZ / freq_res) as usize).min(half);

    if lo_bin >= hi_bin {
        return;
    }

    // Periodic Hann: OLA sum = 1.0 at 50 % overlap.
    let hann = hann_periodic(window_size);

    let mut planner = FftPlanner::<f32>::new();
    let fft = planner.plan_fft_forward(window_size);
    let ifft = planner.plan_fft_inverse(window_size);

    let n_frames = samples.len().div_ceil(hop);
    let mut output = vec![0f32; samples.len()];
    let mut norm = vec![0f32; samples.len()];

    for frame_idx in 0..n_frames {
        let start = frame_idx * hop;
        if start >= samples.len() {
            break;
        }

        let mut buffer: Vec<Complex<f32>> = (0..window_size)
            .map(|i| {
                let s = if start + i < samples.len() {
                    samples[start + i] as f32
                } else {
                    0.0
                };
                Complex {
                    re: s * hann[i],
                    im: 0.0,
                }
            })
            .collect();

        fft.process(&mut buffer);

        // Compute power spectrum and sibilance energy ratio.
        let power: Vec<f32> = buffer[..half].iter().map(|c| c.norm_sqr()).collect();
        let total: f32 = power.iter().sum();

        let gain = if total > f32::EPSILON {
            let sib_energy: f32 = power[lo_bin..hi_bin].iter().sum();
            let ratio = sib_energy / total;
            if ratio > threshold_ratio {
                let excess = ((ratio - threshold_ratio) / KNEE_WIDTH).min(1.0);
                10f32.powf(-excess * max_reduction_db / 20.0)
            } else {
                1.0
            }
        } else {
            1.0
        };

        if gain < 1.0 {
            // Apply to sibilance bins in positive frequency half.
            for bin in &mut buffer[lo_bin..hi_bin] {
                bin.re *= gain;
                bin.im *= gain;
            }
            // Apply to conjugate-mirror in negative frequency half.
            for i in lo_bin..hi_bin {
                let mirror = window_size - i;
                if mirror > half && mirror < window_size {
                    buffer[mirror].re *= gain;
                    buffer[mirror].im *= gain;
                }
            }
        }

        ifft.process(&mut buffer);

        // Accumulate OLA output (divide by N for unnormalized IFFT).
        let scale = 1.0 / window_size as f32;
        for i in 0..window_size {
            let out_idx = start + i;
            if out_idx < output.len() {
                output[out_idx] += buffer[i].re * scale;
                norm[out_idx] += hann[i];
            }
        }
    }

    // Normalize by accumulated Hann weights; keep original where norm ≈ 0 (boundaries).
    for (i, s) in samples.iter_mut().enumerate() {
        let n = norm[i];
        if n > f32::EPSILON {
            *s = (output[i] / n)
                .round()
                .clamp(i16::MIN as f32, i16::MAX as f32) as i16;
        }
    }
}

/// Build a periodic Hann window of length `n`.
///
/// Element `k` is `0.5 * (1 - cos(2πk / n))`, so the window starts at zero and
/// the denominator is `n` (not `n - 1`). An `n` of zero yields an empty vector.
fn hann_periodic(n: usize) -> Vec<f32> {
    let len = n as f32;
    let mut window = Vec::with_capacity(n);
    for k in 0..n {
        let phase = 2.0 * std::f32::consts::PI * k as f32 / len;
        window.push(0.5 * (1.0 - phase.cos()));
    }
    window
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Sample rate used by the tone tests; its Nyquist (12 kHz) is above the de-ess band.
    const SR: u32 = 24_000;

    /// Generate `secs` seconds of a sine at `freq_hz` with the given peak `amplitude`.
    fn pure_tone(freq_hz: f32, amplitude: f32, secs: f32) -> Vec<i16> {
        let n = (SR as f32 * secs) as usize;
        (0..n)
            .map(|i| {
                let v =
                    amplitude * (2.0 * std::f32::consts::PI * freq_hz * i as f32 / SR as f32).sin();
                v.clamp(i16::MIN as f32, i16::MAX as f32) as i16
            })
            .collect()
    }

    /// Root-mean-square level of `samples` in sample units (0.0 for an empty slice).
    fn rms(samples: &[i16]) -> f32 {
        if samples.is_empty() {
            return 0.0;
        }
        let sq: f32 = samples.iter().map(|&s| (s as f32).powi(2)).sum();
        (sq / samples.len() as f32).sqrt()
    }

    #[test]
    fn passthrough_preserves_non_sibilant_audio() {
        let original = pure_tone(1000.0, 8_000.0, 0.2);
        let mut processed = original.clone();
        deess(&mut processed, SR);
        // RMS should stay within 1 dB of the original for a non-sibilant tone.
        let original_rms = rms(&original);
        let processed_rms = rms(&processed);
        // `.max(1.0)` keeps the ratio finite should the reference ever be silent.
        let diff_db = 20.0 * (processed_rms / original_rms.max(1.0)).log10();
        assert!(
            diff_db.abs() < 1.0,
            "Non-sibilant tone altered by {:.2} dB",
            diff_db
        );
    }

    #[test]
    fn sibilant_tone_is_attenuated() {
        let mut sibilant = pure_tone(7_000.0, 8_000.0, 0.5);
        let rms_before = rms(&sibilant);
        deess(&mut sibilant, SR);
        let rms_after = rms(&sibilant);
        assert!(
            rms_after < rms_before,
            "Sibilant tone was not attenuated (before={:.0}, after={:.0})",
            rms_before,
            rms_after
        );
    }

    #[test]
    fn band_above_nyquist_is_a_no_op() {
        // At 8 kHz the 5 kHz band edge lies above Nyquist, so the input must be untouched.
        let original: Vec<i16> = (0..800i32)
            .map(|i| ((i % 64) * 100 - 3_200) as i16)
            .collect();
        let mut processed = original.clone();
        deess(&mut processed, 8_000);
        assert_eq!(processed, original);
    }

    #[test]
    fn empty_input_is_a_no_op() {
        let mut samples: Vec<i16> = Vec::new();
        deess(&mut samples, SR); // must not panic
    }
}