use crate::{analyse::rms_db, room_tone};
/// Silent runs shorter than this many milliseconds are classified as sentence pauses.
pub const SENTENCE_THRESHOLD_MS: u32 = 200;
/// Silent runs of at least [`SENTENCE_THRESHOLD_MS`] but shorter than this many
/// milliseconds are classified as paragraph pauses; anything longer is treated
/// as a scene break.
pub const PARAGRAPH_THRESHOLD_MS: u32 = 800;
/// Default maximum length, in milliseconds, of a sentence pause (used by [`normalize_pauses`]).
pub const DEFAULT_SENTENCE_TARGET_MS: u32 = 120;
/// Default maximum length, in milliseconds, of a paragraph pause (used by [`normalize_pauses`]).
pub const DEFAULT_PARAGRAPH_TARGET_MS: u32 = 400;
/// Default maximum length, in milliseconds, of a scene-break pause (used by [`normalize_pauses`]).
pub const DEFAULT_SCENE_TARGET_MS: u32 = 700;
/// An analysis window counts as silent when `rms_db` reports a level below this value.
/// NOTE(review): presumably dBFS — confirm against `analyse::rms_db`.
const PAUSE_THRESHOLD_DB: f32 = -55.0;
/// Nominal length, in milliseconds, of the crossfade from speech into the room-tone fill.
const CROSSFADE_MS: usize = 5;
/// Normalizes pauses in `samples` using the crate's default pause targets.
///
/// Convenience wrapper around [`normalize_pauses_with_targets`] that supplies
/// [`DEFAULT_SENTENCE_TARGET_MS`], [`DEFAULT_PARAGRAPH_TARGET_MS`] and
/// [`DEFAULT_SCENE_TARGET_MS`] as the three target lengths.
///
/// * `samples` – 16-bit PCM samples to process.
/// * `sample_rate` – sample rate of `samples`, in Hz.
///
/// Returns the processed samples as a newly allocated buffer.
pub fn normalize_pauses(samples: &[i16], sample_rate: u32) -> Vec<i16> {
    let sentence_ms = DEFAULT_SENTENCE_TARGET_MS;
    let paragraph_ms = DEFAULT_PARAGRAPH_TARGET_MS;
    let scene_ms = DEFAULT_SCENE_TARGET_MS;
    normalize_pauses_with_targets(samples, sample_rate, sentence_ms, paragraph_ms, scene_ms)
}
/// Caps the length of pauses inside `samples` and fills them with room tone.
///
/// The signal is analysed in 10 ms windows. A window is silent when `rms_db`
/// reports a level below `PAUSE_THRESHOLD_DB`; consecutive silent windows form
/// a run. Each interior run is classified by its length in samples:
///
/// * shorter than `SENTENCE_THRESHOLD_MS` – capped at `sentence_target_ms`;
/// * shorter than `PARAGRAPH_THRESHOLD_MS` – capped at `paragraph_target_ms`;
/// * otherwise – capped at `scene_target_ms`.
///
/// A run is never lengthened: its replacement is `min(run length, target)`
/// samples long (at least one sample) and consists of generated room tone. The
/// first few milliseconds of the replacement are an equal-power crossfade from
/// the preceding audio into the tone. Runs that touch the very start or the
/// very end of the input are copied through untouched.
///
/// # Arguments
///
/// * `samples` – 16-bit PCM samples to process.
/// * `sample_rate` – sample rate of `samples`, in Hz.
/// * `sentence_target_ms`, `paragraph_target_ms`, `scene_target_ms` – maximum
///   length, in milliseconds, for each pause class.
///
/// # Returns
///
/// A newly allocated buffer. The input is returned as an unmodified copy when
/// it is empty, when `sample_rate` is zero, or when no silent window is found.
pub fn normalize_pauses_with_targets(
    samples: &[i16],
    sample_rate: u32,
    sentence_target_ms: u32,
    paragraph_target_ms: u32,
    scene_target_ms: u32,
) -> Vec<i16> {
    if samples.is_empty() || sample_rate == 0 {
        return samples.to_vec();
    }

    let ms_to_samples = |ms: u32| (sample_rate as usize * ms as usize) / 1000;
    let sentence_thresh = ms_to_samples(SENTENCE_THRESHOLD_MS);
    let paragraph_thresh = ms_to_samples(PARAGRAPH_THRESHOLD_MS);
    let sentence_target = ms_to_samples(sentence_target_ms);
    let paragraph_target = ms_to_samples(paragraph_target_ms);
    let scene_target = ms_to_samples(scene_target_ms);
    // At least two samples so that a fade is possible even at very low sample rates.
    let fade_len = ms_to_samples(CROSSFADE_MS as u32).max(2);

    // One shared room-tone buffer; each pause takes (and, if needed, cycles) a prefix of it.
    // NOTE(review): -62.0 is presumably the tone level in dB — confirm against `room_tone`.
    let tone = room_tone::generate_room_tone(scene_target + fade_len * 2, -62.0);

    // Mark every sample with the silent/non-silent verdict of its 10 ms window.
    let window = ms_to_samples(10).max(1);
    let n = samples.len();
    let mut is_silent = vec![false; n];
    for (flags, chunk) in is_silent.chunks_mut(window).zip(samples.chunks(window)) {
        flags.fill(rms_db(chunk) < PAUSE_THRESHOLD_DB);
    }

    /// Half-open sample range `[start, end)` of consecutive silent samples.
    struct Run {
        start: usize,
        end: usize,
    }

    let mut runs: Vec<Run> = Vec::new();
    let mut i = 0;
    while i < n {
        if is_silent[i] {
            let start = i;
            while i < n && is_silent[i] {
                i += 1;
            }
            runs.push(Run { start, end: i });
        } else {
            i += 1;
        }
    }
    if runs.is_empty() {
        return samples.to_vec();
    }

    let mut out: Vec<i16> = Vec::with_capacity(n);
    let mut cursor = 0usize;
    for run in &runs {
        // Copy the audible audio between the previous run and this one.
        if cursor < run.start {
            out.extend_from_slice(&samples[cursor..run.start]);
        }

        // Leading and trailing silence is kept exactly as recorded.
        if run.start == 0 || run.end == n {
            out.extend_from_slice(&samples[run.start..run.end]);
            cursor = run.end;
            continue;
        }

        let raw_len = run.end - run.start;
        let target_len = if raw_len < sentence_thresh {
            sentence_target
        } else if raw_len < paragraph_thresh {
            paragraph_target
        } else {
            scene_target
        };
        // Only ever shorten a pause, and never remove it completely.
        let fill_len = raw_len.min(target_len).max(1);
        let tone_chunk: Vec<i16> = tone.iter().cycle().take(fill_len).copied().collect();

        // The fade may use at most half of the fill and cannot exceed the audio already written.
        let fade = fade_len.min(fill_len / 2).min(out.len());
        if fade > 0 {
            let tail_start = out.len() - fade;
            for k in 0..fade {
                // Equal-power crossfade: the preceding audio goes 1 -> 0 while the
                // tone goes 0 -> 1, so the blend ends at the level of the tone that
                // follows it. (Previously the two gains were swapped, which started
                // the pause with full-level tone and ended the fade on full-level
                // speech, producing a discontinuity at both ends.)
                let t = k as f32 / fade as f32;
                let gain_existing = (std::f32::consts::FRAC_PI_2 * t).cos();
                let gain_tone = (std::f32::consts::FRAC_PI_2 * t).sin();
                // NOTE(review): the blend re-reads the last `fade` samples already in
                // `out` and appends the result, so that tail is replayed while fading
                // out rather than being blended in place — confirm this is intended.
                let existing = out[tail_start + k] as f32;
                let tone_s = tone_chunk[k] as f32;
                out.push(
                    (existing * gain_existing + tone_s * gain_tone)
                        .round()
                        .clamp(i16::MIN as f32, i16::MAX as f32) as i16,
                );
            }
            out.extend_from_slice(&tone_chunk[fade..]);
        } else {
            out.extend_from_slice(&tone_chunk);
        }
        cursor = run.end;
    }

    // Whatever follows the last run.
    if cursor < n {
        out.extend_from_slice(&samples[cursor..]);
    }
    out
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Sample rate shared by every fixture in this module.
    const SR: u32 = 24_000;

    /// Number of samples covering `ms` milliseconds at `SR`.
    fn samples_for(ms: u32) -> usize {
        (SR as usize * ms as usize) / 1000
    }

    /// A 440 Hz sine of the given peak `amplitude`, `ms` milliseconds long.
    fn speech_block(amplitude: f32, ms: u32) -> Vec<i16> {
        let count = samples_for(ms);
        let mut block = Vec::with_capacity(count);
        for i in 0..count {
            let phase = 2.0 * std::f32::consts::PI * 440.0 * i as f32 / SR as f32;
            let v = amplitude * phase.sin();
            block.push(v.clamp(i16::MIN as f32, i16::MAX as f32) as i16);
        }
        block
    }

    /// Digital silence, `ms` milliseconds long.
    fn silence_block(ms: u32) -> Vec<i16> {
        vec![0i16; samples_for(ms)]
    }

    /// 300 ms of speech, a silent gap of `pause_ms`, then another 300 ms of speech.
    fn speech_pause_speech(pause_ms: u32) -> Vec<i16> {
        [
            speech_block(5_000.0, 300),
            silence_block(pause_ms),
            speech_block(5_000.0, 300),
        ]
        .concat()
    }

    #[test]
    fn empty_input_returns_empty() {
        assert!(normalize_pauses(&[], SR).is_empty());
    }

    #[test]
    fn no_silence_returns_same_length() {
        let samples = speech_block(5_000.0, 500);
        let out = normalize_pauses(&samples, SR);
        assert_eq!(
            out.len(),
            samples.len(),
            "Continuous speech should not change length"
        );
    }

    #[test]
    fn long_pause_is_capped() {
        let samples = speech_pause_speech(500);
        let out = normalize_pauses(&samples, SR);

        assert!(
            out.len() < samples.len(),
            "Long pause should be capped: in={} out={}",
            samples.len(),
            out.len()
        );

        // Two 300 ms speech blocks plus at most one paragraph-length pause (small slack).
        let speech_len = samples_for(600);
        let target_samples = samples_for(DEFAULT_PARAGRAPH_TARGET_MS);
        assert!(
            out.len() <= speech_len + target_samples + 100,
            "Output longer than expected: {}",
            out.len()
        );
    }

    #[test]
    fn very_long_pause_capped_to_scene_target() {
        let samples = speech_pause_speech(2_000);
        let out = normalize_pauses(&samples, SR);
        assert!(
            out.len() < samples.len(),
            "Scene pause not capped: in={} out={}",
            samples.len(),
            out.len()
        );
    }
}