audiobook-creation-exchange 0.1.0

ACX-compliant audio post-processing: normalisation, limiting, gating, LUFS measurement, and spectral analysis for AI-generated speech audio.
Documentation
//! Inter-sentence pause normaliser.
//!
//! TTS engines produce unnaturally uniform pauses — every sentence boundary the
//! same length. This module classifies each sub-threshold run by duration and
//! caps it to a natural target, returning a new buffer (length may shrink).
//!
//! A 5 ms raised-cosine crossfade is applied at every edit boundary to prevent
//! audible clicks.
//!
//! Classification:
//!
//! | Class       | Raw duration    | Target     |
//! |-------------|-----------------|------------|
//! | Sentence    | < 200 ms        | 120 ms     |
//! | Paragraph   | 200 ms – 800 ms | 400 ms     |
//! | Scene break | > 800 ms        | 700 ms     |

use crate::{analyse::rms_db, room_tone};

/// Upper bound of a sentence-level pause before it is capped.
pub const SENTENCE_THRESHOLD_MS: u32 = 200;
/// Upper bound of a paragraph-level pause before it is capped.
pub const PARAGRAPH_THRESHOLD_MS: u32 = 800;

/// Default sentence-pause target after capping (ms).
pub const DEFAULT_SENTENCE_TARGET_MS: u32 = 120;
/// Default paragraph-pause target after capping (ms).
pub const DEFAULT_PARAGRAPH_TARGET_MS: u32 = 400;
/// Default scene-break target after capping (ms).
pub const DEFAULT_SCENE_TARGET_MS: u32 = 700;

/// Silence detection threshold — windows whose RMS falls below this are
/// classified as pauses.
const PAUSE_THRESHOLD_DB: f32 = -55.0;
/// Crossfade length applied at edit boundaries (ms). Typed `u32` to match
/// the other millisecond constants in this module.
const CROSSFADE_MS: u32 = 5;

/// Normalise pauses using the module's default per-class targets.
///
/// Convenience wrapper around [`normalize_pauses_with_targets`] that passes
/// the `DEFAULT_*_TARGET_MS` constants. Returns a new sample buffer.
pub fn normalize_pauses(samples: &[i16], sample_rate: u32) -> Vec<i16> {
    let (sentence, paragraph, scene) = (
        DEFAULT_SENTENCE_TARGET_MS,
        DEFAULT_PARAGRAPH_TARGET_MS,
        DEFAULT_SCENE_TARGET_MS,
    );
    normalize_pauses_with_targets(samples, sample_rate, sentence, paragraph, scene)
}

/// Normalise pauses with explicit per-class targets. Returns a new sample buffer.
///
/// Silence runs are detected with a 10 ms RMS window against
/// [`PAUSE_THRESHOLD_DB`], classified by raw duration, and capped to the given
/// per-class targets (runs already shorter than their target keep their raw
/// length). Leading/trailing bookend silence is left untouched. A short
/// raised-cosine crossfade is applied at both edges of every replaced run so
/// edit boundaries are click-free.
pub fn normalize_pauses_with_targets(
    samples: &[i16],
    sample_rate: u32,
    sentence_target_ms: u32,
    paragraph_target_ms: u32,
    scene_target_ms: u32,
) -> Vec<i16> {
    if samples.is_empty() || sample_rate == 0 {
        return samples.to_vec();
    }

    let ms_to_samples = |ms: u32| (sample_rate as usize * ms as usize) / 1000;

    let sentence_thresh = ms_to_samples(SENTENCE_THRESHOLD_MS);
    let paragraph_thresh = ms_to_samples(PARAGRAPH_THRESHOLD_MS);

    let sentence_target = ms_to_samples(sentence_target_ms);
    let paragraph_target = ms_to_samples(paragraph_target_ms);
    let scene_target = ms_to_samples(scene_target_ms);

    let fade_len = ms_to_samples(CROSSFADE_MS as u32).max(2);

    // Room tone used as the fill for replaced runs. Sized for the largest
    // possible fill (scene target plus both fade regions) so chunks of any
    // needed length can be cut from it by cycling.
    let tone = room_tone::generate_room_tone(scene_target + fade_len * 2, -62.0);

    // Detect silence: classify fixed 10 ms windows by RMS, marking every
    // sample of a sub-threshold window as silent.
    let window = ms_to_samples(10).max(1);
    let n = samples.len();
    let mut is_silent = vec![false; n];
    let mut pos = 0;
    while pos < n {
        let end = (pos + window).min(n);
        let silent = rms_db(&samples[pos..end]) < PAUSE_THRESHOLD_DB;
        for v in is_silent[pos..end].iter_mut() {
            *v = silent;
        }
        pos = end;
    }

    // Collect maximal runs of silence as (start, end) pairs.
    struct Run {
        start: usize,
        end: usize,
    }
    let mut runs: Vec<Run> = Vec::new();
    let mut i = 0;
    while i < n {
        if is_silent[i] {
            let start = i;
            while i < n && is_silent[i] {
                i += 1;
            }
            runs.push(Run { start, end: i });
        } else {
            i += 1;
        }
    }

    if runs.is_empty() {
        return samples.to_vec();
    }

    // Build output: copy speech segments verbatim, replace silence runs with
    // capped versions and crossfade at boundaries.
    let mut out: Vec<i16> = Vec::with_capacity(n);
    let mut cursor = 0usize;

    for run in &runs {
        // Speech before this run.
        if cursor < run.start {
            out.extend_from_slice(&samples[cursor..run.start]);
        }

        let raw_len = run.end - run.start;

        // Leave leading/trailing silence intact — bookend regions are managed by
        // pad_bookends and the normalise pre-compensation assumes fixed bookend sizes.
        if run.start == 0 || run.end == n {
            out.extend_from_slice(&samples[run.start..run.end]);
            cursor = run.end;
            continue;
        }

        let target_len = if raw_len < sentence_thresh {
            sentence_target
        } else if raw_len < paragraph_thresh {
            paragraph_target
        } else {
            scene_target
        };

        // Cap at target_len if longer; keep the raw length if already shorter.
        let fill_len = raw_len.min(target_len).max(1);

        // Use room tone for the silence fill.
        let mut tone_chunk: Vec<i16> = tone.iter().cycle().take(fill_len).copied().collect();

        // Trailing-edge crossfade: blend the tail of the fill towards the first
        // samples of the speech that follows, so the tone -> speech boundary is
        // also click-free. (`run.end < n` here — full-trailing runs bailed above.)
        let fade_out = fade_len.min(fill_len / 2).min(n - run.end);
        for k in 0..fade_out {
            let t = k as f32 / fade_out as f32;
            let gain_tone = (std::f32::consts::FRAC_PI_2 * t).cos(); // 1 -> 0
            let gain_speech = (std::f32::consts::FRAC_PI_2 * (1.0 - t)).cos(); // 0 -> 1
            let idx = fill_len - fade_out + k;
            let blended =
                tone_chunk[idx] as f32 * gain_tone + samples[run.end + k] as f32 * gain_speech;
            tone_chunk[idx] = blended.round().clamp(i16::MIN as f32, i16::MAX as f32) as i16;
        }

        // Crossfade the leading edge by APPENDING a blended region (not overwriting
        // existing speech samples, which would shrink the output by `fade` per run).
        let fade = fade_len.min(fill_len / 2).min(out.len());
        if fade > 0 {
            let out_len = out.len();
            for k in 0..fade {
                let t = k as f32 / fade as f32;
                // Equal-power pair: existing speech fades OUT (gain 1 -> 0)
                // while the room tone fades IN (gain 0 -> 1), so both ends of
                // the blend are continuous with their neighbours.
                let gain_existing = (std::f32::consts::FRAC_PI_2 * t).cos();
                let gain_tone = (std::f32::consts::FRAC_PI_2 * (1.0 - t)).cos();
                let existing = out[out_len.saturating_sub(fade) + k] as f32;
                let tone_s = tone_chunk[k] as f32;
                out.push(
                    (existing * gain_existing + tone_s * gain_tone)
                        .round()
                        .clamp(i16::MIN as f32, i16::MAX as f32) as i16,
                );
            }
            out.extend_from_slice(&tone_chunk[fade..]);
        } else {
            out.extend_from_slice(&tone_chunk);
        }

        cursor = run.end;
    }

    // Append any trailing speech after the last run.
    if cursor < n {
        out.extend_from_slice(&samples[cursor..]);
    }

    out
}

#[cfg(test)]
mod tests {
    use super::*;

    const SR: u32 = 24_000;

    /// `ms` milliseconds of a 440 Hz sine at the given peak amplitude.
    fn speech_block(amplitude: f32, ms: u32) -> Vec<i16> {
        let len = SR as usize * ms as usize / 1000;
        let mut block = Vec::with_capacity(len);
        for i in 0..len {
            let phase = 2.0 * std::f32::consts::PI * 440.0 * i as f32 / SR as f32;
            let sample = amplitude * phase.sin();
            block.push(sample.clamp(i16::MIN as f32, i16::MAX as f32) as i16);
        }
        block
    }

    /// `ms` milliseconds of digital silence.
    fn silence_block(ms: u32) -> Vec<i16> {
        let len = SR as usize * ms as usize / 1000;
        vec![0i16; len]
    }

    #[test]
    fn empty_input_returns_empty() {
        assert!(normalize_pauses(&[], SR).is_empty());
    }

    #[test]
    fn no_silence_returns_same_length() {
        let speech = speech_block(5_000.0, 500);
        let processed = normalize_pauses(&speech, SR);
        assert_eq!(
            processed.len(),
            speech.len(),
            "Continuous speech should not change length"
        );
    }

    #[test]
    fn long_pause_is_capped() {
        // 500 ms silence between speech blocks.
        let input: Vec<i16> = [
            speech_block(5_000.0, 300),
            silence_block(500),
            speech_block(5_000.0, 300),
        ]
        .concat();

        let target_samples = (SR as usize * DEFAULT_PARAGRAPH_TARGET_MS as usize) / 1000;
        let processed = normalize_pauses(&input, SR);

        // Output should be shorter than input (pause capped from 500 ms to 400 ms).
        assert!(
            processed.len() < input.len(),
            "Long pause should be capped: in={} out={}",
            input.len(),
            processed.len()
        );
        // And no longer than input_speech + target_pause + small tolerance.
        let speech_len = (SR as usize * 600) / 1000;
        assert!(
            processed.len() <= speech_len + target_samples + 100,
            "Output longer than expected: {}",
            processed.len()
        );
    }

    #[test]
    fn very_long_pause_capped_to_scene_target() {
        let input: Vec<i16> = [
            speech_block(5_000.0, 300),
            silence_block(2_000), // 2 s -> scene break
            speech_block(5_000.0, 300),
        ]
        .concat();

        let processed = normalize_pauses(&input, SR);

        // 2 s capped to 700 ms — output must be shorter than input.
        assert!(
            processed.len() < input.len(),
            "Scene pause not capped: in={} out={}",
            input.len(),
            processed.len()
        );
    }
}