audiobook-creation-exchange 0.1.0

ACX-compliant audio post-processing: normalisation, limiting, gating, LUFS measurement, and spectral analysis for AI-generated speech audio.
Documentation
//! Breath removal: detect and suppress audible breath sounds in speech audio.
//!
//! Breath sounds occupy a narrow RMS range (audible but much quieter than
//! speech) and concentrate their energy in the 200–1 000 Hz nasal/tracheal
//! resonance band.  Detected windows are replaced with room tone so the
//! silence floor remains consistent with the rest of the track.
//!
//! The detector is intentionally conservative: it only fires when the energy
//! ratio in the breath band is high AND the RMS is in the audible-but-quiet
//! region, minimising false positives on soft speech passages.

use crate::analyse::rms_db;
use crate::room_tone;
use rustfft::{FftPlanner, num_complex::Complex};

/// Length of one analysis window in milliseconds; `remove_breaths` converts it
/// to a sample count using the caller's sample rate.
const WINDOW_MS: usize = 25; // short enough to catch 50 ms breaths
/// Lower edge of the breath band in Hz, mapped to an FFT bin index at run time.
const BREATH_LO_HZ: f32 = 200.0;
/// Upper edge of the breath band in Hz, mapped to an FFT bin index at run time.
const BREATH_HI_HZ: f32 = 1_000.0;
/// A window counts as a breath only when the fraction of its spectral energy
/// inside the breath band is strictly greater than this value.
const BREATH_RATIO_THRESHOLD: f32 = 0.62; // 62 % energy in breath band
/// Lowest window RMS (inclusive) that is still considered a breath candidate.
const BREATH_RMS_MIN_DB: f32 = -50.0; // above room-tone floor
/// Highest window RMS (inclusive) that is still considered a breath candidate.
const BREATH_RMS_MAX_DB: f32 = -30.0; // below typical speech (-16 dBFS)

/// Remove breath sounds from `samples`, replacing them with room tone at `room_tone_db`.
///
/// A breath window is any 25 ms segment where the RMS is between −50 and −30 dBFS
/// AND more than 62 % of its spectral energy falls in the 200–1 000 Hz band.
pub fn remove_breaths(samples: &mut [i16], sample_rate: u32, room_tone_db: f32) {
    let window_size = (sample_rate as usize * WINDOW_MS) / 1000;
    if window_size < 4 || samples.is_empty() {
        return;
    }
    let half = window_size / 2;

    let freq_res = sample_rate as f32 / window_size as f32;
    let lo_bin = (BREATH_LO_HZ / freq_res).max(1.0) as usize;
    let hi_bin = ((BREATH_HI_HZ / freq_res) as usize).min(half);

    if lo_bin >= hi_bin {
        return;
    }

    let hann: Vec<f32> = (0..window_size)
        .map(|i| {
            0.5 * (1.0 - (2.0 * std::f32::consts::PI * i as f32 / (window_size as f32 - 1.0)).cos())
        })
        .collect();

    let mut planner = FftPlanner::<f32>::new();
    let fft = planner.plan_fft_forward(window_size);

    let tone = room_tone::generate_room_tone(window_size * 8, room_tone_db);
    let mut tone_offset = 0usize;

    for chunk in samples.chunks_mut(window_size) {
        if chunk.len() < window_size {
            break;
        }

        let rms = rms_db(chunk);
        if !(BREATH_RMS_MIN_DB..=BREATH_RMS_MAX_DB).contains(&rms) {
            continue;
        }

        let mut buffer: Vec<Complex<f32>> = chunk
            .iter()
            .zip(hann.iter())
            .map(|(&s, &w)| Complex {
                re: s as f32 * w / i16::MAX as f32,
                im: 0.0,
            })
            .collect();

        fft.process(&mut buffer);

        let power: Vec<f32> = buffer[..half].iter().map(|c| c.norm_sqr()).collect();
        let total: f32 = power.iter().sum();

        if total < f32::EPSILON {
            continue;
        }

        let breath_energy: f32 = power[lo_bin..hi_bin].iter().sum();
        if breath_energy / total > BREATH_RATIO_THRESHOLD {
            for s in chunk.iter_mut() {
                *s = tone[tone_offset % tone.len()];
                tone_offset += 1;
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::analyse::rms_db;

    /// Sample rate, in Hz, shared by every synthetic test signal.
    const SR: u32 = 24_000;

    /// Synthesise `secs` seconds of a sine wave at `freq_hz` whose peak is
    /// `amplitude` (expressed in raw i16 units), saturating at the i16 range.
    fn pure_tone(freq_hz: f32, amplitude: f32, secs: f32) -> Vec<i16> {
        let sample_count = (SR as f32 * secs) as usize;
        let mut wave = Vec::with_capacity(sample_count);
        for idx in 0..sample_count {
            let phase = 2.0 * std::f32::consts::PI * freq_hz * idx as f32 / SR as f32;
            let value = amplitude * phase.sin();
            wave.push(value.clamp(i16::MIN as f32, i16::MAX as f32) as i16);
        }
        wave
    }

    #[test]
    fn loud_speech_is_not_removed() {
        // A 500 Hz sine with peak 5 800 is far louder than BREATH_RMS_MAX_DB
        // (assuming `rms_db` reports dBFS relative to i16 full scale), so the
        // RMS gate should reject every window even though the tone sits inside
        // the breath band.
        let mut signal = pure_tone(500.0, 5_800.0, 0.5);
        let level_before = rms_db(&signal);
        remove_breaths(&mut signal, SR, -52.0);
        let level_after = rms_db(&signal);

        // Tolerate up to 0.5 dB of drift (boundary windows only).
        let drift = (level_before - level_after).abs();
        assert!(
            drift < 0.5,
            "Loud speech altered: before={:.1}, after={:.1}",
            level_before,
            level_after
        );
    }

    #[test]
    fn digital_silence_is_not_altered() {
        // One second of pure zeros: expected to fall below BREATH_RMS_MIN_DB,
        // so no window may be replaced with room tone.
        let mut silence = vec![0i16; SR as usize];
        remove_breaths(&mut silence, SR, -52.0);
        assert!(!silence.iter().any(|&s| s != 0));
    }

    #[test]
    fn empty_input_is_a_no_op() {
        // The only requirement here is that the call returns without panicking.
        let mut nothing: Vec<i16> = Vec::new();
        remove_breaths(&mut nothing, SR, -52.0);
    }
}