audiobook-creation-exchange 0.1.0

ACX-compliant audio post-processing: normalisation, limiting, gating, LUFS measurement, and spectral analysis for AI-generated speech audio.
Documentation
use crate::analyse::rms_db;
use crate::temporal::{HEAD_DURATION, TAIL_DURATION};

/// Length, in milliseconds, of each analysis window used by `gate_to_room_tone`.
const WINDOW_MS: u32 = 50;

/// Swap every quiet window of `samples` for room tone.
///
/// The buffer is walked in consecutive windows of `WINDOW_MS` milliseconds
/// (the final window may be shorter).  A window whose `rms_db` level is below
/// `threshold_db` is overwritten in place with samples taken from `room_tone`;
/// every other window is left untouched.
///
/// The point is to avoid ACX-style "digital silence" rejection: a gap that
/// falls to zero is perceived by listeners as a dropout, whereas room tone
/// keeps the acoustic floor consistent across the episode.
///
/// `room_tone` is read cyclically, and the read position carries over from one
/// replaced window to the next, so its length need not match the window size.
///
/// Does nothing when `room_tone` is empty or when `sample_rate` is low enough
/// that a window would contain zero samples.
pub fn gate_to_room_tone(
    samples: &mut [i16],
    sample_rate: u32,
    threshold_db: f32,
    room_tone: &[i16],
) {
    let window_len = (sample_rate as usize * WINDOW_MS as usize) / 1000;
    if room_tone.is_empty() || window_len == 0 {
        return;
    }

    // Endless stream of room-tone samples; it only advances when a sample is
    // actually written, so consecutive gated windows continue where the
    // previous one stopped.
    let mut tone = room_tone.iter().copied().cycle();

    for window in samples.chunks_mut(window_len) {
        if rms_db(window) < threshold_db {
            for (slot, fill) in window.iter_mut().zip(tone.by_ref()) {
                *slot = fill;
            }
        }
    }
}

/// Overwrite the opening and closing stretches of `samples` with room tone,
/// whatever they currently contain.
///
/// The head covers `HEAD_DURATION` from the start (nominally the first 1 s) and
/// the tail covers `TAIL_DURATION` up to the end (nominally the last 3 s); both
/// are clamped to the buffer length.  ACX requires these bookends to be at or
/// below room-tone energy, and TTS engines usually start speaking immediately,
/// so without this padding the head would fail `check_bookends`.
///
/// To avoid clicks where speech meets room tone, a 10 ms linear crossfade is
/// applied on the speech-facing edge of each bookend:
///
/// * head: the final 10 ms ramps from room tone into the original signal
///   (skipped when the head region is not longer than the ramp);
/// * tail: the first 10 ms ramps from the original signal into room tone
///   (skipped when the tail region starts at sample 0).
///
/// Room tone is indexed by absolute sample position modulo `room_tone.len()`.
/// Does nothing when either `samples` or `room_tone` is empty.
pub fn pad_bookends(samples: &mut [i16], sample_rate: u32, room_tone: &[i16]) {
    if samples.is_empty() || room_tone.is_empty() {
        return;
    }

    let rate = sample_rate as usize;
    let total = samples.len();
    let tone_len = room_tone.len();

    let head_len = (rate * HEAD_DURATION.whole_milliseconds() as usize) / 1000;
    let tail_len = (rate * TAIL_DURATION.whole_milliseconds() as usize) / 1000;
    // Length of the 10 ms crossfade at each speech boundary.
    let ramp_len = (rate * 10) / 1000;

    // ---- head: [0, ramp_start) is pure tone, [ramp_start, head_end) fades tone -> original
    let head_end = head_len.min(total);
    let ramp_start = if head_end > ramp_len {
        head_end - ramp_len
    } else {
        // Region too short to hold a ramp: fill all of it with tone.
        head_end
    };
    let (flat, ramp) = samples[..head_end].split_at_mut(ramp_start);
    for (pos, s) in flat.iter_mut().enumerate() {
        *s = room_tone[pos % tone_len];
    }
    for (k, s) in ramp.iter_mut().enumerate() {
        let tone = room_tone[(ramp_start + k) % tone_len];
        let t = k as f32 / ramp_len as f32;
        *s = (tone as f32 * (1.0 - t) + *s as f32 * t).round() as i16;
    }

    // ---- tail: [tail_start, tail_start + fade_in_len) fades original -> tone, the rest is pure tone
    let tail_start = total.saturating_sub(tail_len);
    let fade_in_len = if tail_start > 0 {
        ramp_len.min(total - tail_start)
    } else {
        // The tail spans the whole buffer, so there is no speech edge to fade from.
        0
    };
    let (ramp, flat) = samples[tail_start..].split_at_mut(fade_in_len);
    for (k, s) in ramp.iter_mut().enumerate() {
        let tone = room_tone[(tail_start + k) % tone_len];
        let t = k as f32 / ramp_len as f32;
        *s = (*s as f32 * (1.0 - t) + tone as f32 * t).round() as i16;
    }
    let flat_start = tail_start + fade_in_len;
    for (k, s) in flat.iter_mut().enumerate() {
        *s = room_tone[(flat_start + k) % tone_len];
    }
}