#[cfg(feature = "long-form-vad")]
use crate::constants::VAD_WINDOW_SAMPLES;
use crate::constants::{MAX_MODEL_SAMPLES, SAMPLE_RATE, SAMPLES_PER_ENCODER_FRAME};
/// A span of audio expressed in sample indices.
///
/// The code in this file treats `end` as exclusive: span lengths are computed
/// as `end - start`, and spans with `end <= start` are discarded as empty.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct SampleRange {
/// Index of the first sample covered by the span.
pub start: usize,
/// Index one past the last sample covered by the span.
pub end: usize,
}
/// Top-level voice-activity-detection settings.
///
/// Only compiled when the `long-form-vad` feature is enabled.
#[cfg(feature = "long-form-vad")]
#[derive(Debug, Clone)]
pub struct VadConfig {
/// Speech-probability threshold; the `Default` impl sets it to 0.85.
/// NOTE(review): presumably this is the value passed as `threshold` to the
/// segmentation routines — confirm against the callers.
pub default_threshold: f32,
}
#[cfg(feature = "long-form-vad")]
impl Default for VadConfig {
    /// Builds the stock configuration: a speech threshold of 0.85.
    fn default() -> Self {
        VadConfig {
            default_threshold: 0.85,
        }
    }
}
/// Tunables for turning per-frame speech probabilities into speech regions
/// and for splitting over-long regions at silences.
///
/// All `*_duration` / `speech_padding` / `min_silence_at_max_speech` values
/// are in seconds; the code converts them to samples by multiplying with
/// `SAMPLE_RATE`.
#[cfg(feature = "long-form-vad")]
#[derive(Debug, Clone)]
pub struct VadSegmentationConfig {
/// Regions shorter than this (seconds) are dropped by `detect_speech_regions`.
pub min_speech_duration: f64,
/// Amount of continuous below-threshold audio (seconds) required before
/// `detect_speech_regions` closes the current region.
pub min_silence_duration: f64,
/// NOTE(review): not read by any code visible in this file; presumably the
/// maximum region length in seconds — confirm against the callers.
pub max_speech_duration: f64,
/// Padding (seconds) added around each detected region, shared between
/// neighbours when the gap between them is too small for full padding.
pub speech_padding: f64,
/// Frames whose probability is `<=` this value count as silence when
/// looking for split points in `plan_region_subsegments`.
pub silence_threshold_for_split: f32,
/// Explicit end-of-speech threshold. When `None`, the threshold is derived
/// from the onset threshold minus `negative_threshold_offset`.
pub negative_threshold: Option<f32>,
/// Amount subtracted from the onset threshold to obtain the end-of-speech
/// threshold when `negative_threshold` is `None` (result is floored at 0.01).
pub negative_threshold_offset: f32,
/// Minimum length (seconds) a silence span must have to be usable as a
/// split point in `plan_region_subsegments`.
pub min_silence_at_max_speech: f64,
/// NOTE(review): not read by any code visible in this file; the splitting
/// code here always picks the longest candidate silence — confirm whether
/// this flag is honoured elsewhere.
pub use_max_possible_silence_at_max_speech: bool,
}
#[cfg(feature = "long-form-vad")]
impl Default for VadSegmentationConfig {
    /// Builds the stock segmentation settings.
    ///
    /// Durations are in seconds; thresholds are probabilities.
    fn default() -> Self {
        VadSegmentationConfig {
            // Region detection.
            min_speech_duration: 0.15,
            min_silence_duration: 0.75,
            max_speech_duration: 14.0,
            speech_padding: 0.1,
            // Splitting of over-long regions.
            silence_threshold_for_split: 0.3,
            // End-of-speech hysteresis: no explicit override, derive from offset.
            negative_threshold: None,
            negative_threshold_offset: 0.15,
            min_silence_at_max_speech: 0.098,
            use_max_possible_silence_at_max_speech: true,
        }
    }
}
#[cfg(feature = "long-form-vad")]
impl VadSegmentationConfig {
    /// Returns the speech-onset threshold to use.
    ///
    /// This configuration carries no onset override of its own, so the
    /// supplied `default_threshold` is handed back unchanged.
    pub(crate) fn threshold(&self, default_threshold: f32) -> f32 {
        default_threshold
    }

    /// Returns the end-of-speech ("negative") threshold.
    ///
    /// An explicit `negative_threshold` wins as-is. Otherwise the value is
    /// `base_threshold - negative_threshold_offset`, floored at 0.01.
    fn effective_negative_threshold(&self, base_threshold: f32) -> f32 {
        match self.negative_threshold {
            Some(explicit) => explicit,
            None => (base_threshold - self.negative_threshold_offset).max(0.01),
        }
    }
}
/// Settings for cutting a long sample range into fixed-size, overlapping chunks.
#[derive(Debug, Clone)]
pub struct OverlapChunkConfig {
/// Requested overlap between consecutive chunks, in seconds. The effective
/// overlap is capped at half a chunk and rounded down to whole encoder frames.
pub overlap_seconds: f64,
/// Number of samples subtracted from `max_model_samples` before the chunk
/// size is derived.
/// NOTE(review): presumably room reserved for context audio fed alongside
/// each chunk — confirm against the caller.
pub context_samples: usize,
/// Upper bound, in samples, from which the chunk size is derived.
pub max_model_samples: usize,
}
impl Default for OverlapChunkConfig {
fn default() -> Self {
Self {
overlap_seconds: 2.0,
context_samples: SAMPLES_PER_ENCODER_FRAME,
max_model_samples: MAX_MODEL_SAMPLES,
}
}
}
impl OverlapChunkConfig {
    /// Returns the length, in samples, of one full chunk.
    ///
    /// Starts from `max_model_samples`, subtracts `context_samples` and one
    /// `MEL_HOP_SAMPLES`, never goes below one encoder frame, and rounds the
    /// result down to a whole number of encoder frames.
    pub(crate) fn chunk_samples(&self) -> usize {
        let max_actual_chunk = self.max_model_samples.saturating_sub(self.context_samples);
        let raw = max_actual_chunk
            .saturating_sub(crate::constants::MEL_HOP_SAMPLES)
            .max(SAMPLES_PER_ENCODER_FRAME);
        raw / SAMPLES_PER_ENCODER_FRAME * SAMPLES_PER_ENCODER_FRAME
    }

    /// Returns the overlap, in samples, between consecutive chunks.
    ///
    /// `overlap_seconds` is converted to samples (a negative or NaN value
    /// becomes 0 through the saturating float-to-integer cast), capped at half
    /// a chunk, and rounded down to a whole number of encoder frames.
    pub(crate) fn overlap_samples(&self) -> usize {
        let requested = (self.overlap_seconds * SAMPLE_RATE as f64) as usize;
        let capped = requested.min(self.chunk_samples() / 2);
        capped / SAMPLES_PER_ENCODER_FRAME * SAMPLES_PER_ENCODER_FRAME
    }

    /// Returns the distance, in samples, between the starts of consecutive
    /// chunks: chunk length minus overlap, at least one encoder frame, rounded
    /// down to a whole number of encoder frames.
    pub(crate) fn stride_samples(&self) -> usize {
        let raw = self.chunk_samples().saturating_sub(self.overlap_samples());
        raw.max(SAMPLES_PER_ENCODER_FRAME) / SAMPLES_PER_ENCODER_FRAME * SAMPLES_PER_ENCODER_FRAME
    }

    /// Splits `range` into overlapping chunks.
    ///
    /// Every chunk is `chunk_samples()` long except the last, which is
    /// truncated at `range.end`; consecutive chunks start `stride_samples()`
    /// apart. An empty or inverted `range` yields no chunks.
    pub(crate) fn plan(&self, range: SampleRange) -> Vec<SampleRange> {
        // Both sizes depend only on the configuration, so derive them once
        // instead of on every loop iteration.
        let chunk = self.chunk_samples();
        let stride = self.stride_samples();

        let mut chunks = Vec::new();
        let mut start = range.start;
        while start < range.end {
            // Saturate so a range ending near `usize::MAX` is clamped to
            // `range.end` instead of overflowing.
            let end = start.saturating_add(chunk).min(range.end);
            chunks.push(SampleRange { start, end });
            if end == range.end {
                break;
            }
            // Cannot overflow: reaching this point means `start + chunk` is
            // below `range.end`, and the stride never exceeds the chunk length.
            start += stride;
        }
        chunks
    }
}
/// Converts per-frame speech probabilities into padded speech regions.
///
/// Frame `i` of `probabilities` is taken to start at sample
/// `i * VAD_WINDOW_SAMPLES`. A region opens on the first frame whose
/// probability is `>= threshold` and closes once frames below the
/// end-of-speech threshold (see `effective_negative_threshold`) have lasted
/// at least `min_silence_duration` without an intervening frame at or above
/// `threshold`. Regions shorter than `min_speech_duration` are dropped. A
/// region still open at the end of the input runs to `audio_length_samples`.
///
/// Afterwards every region is widened by `speech_padding`; when the gap
/// between two neighbours is smaller than twice the padding, the gap is split
/// evenly between them instead. Ends are clamped to `audio_length_samples`
/// and regions that end up empty are removed.
///
/// Returns an empty vector when `probabilities` is empty or
/// `audio_length_samples` is zero.
#[cfg(feature = "long-form-vad")]
pub(crate) fn detect_speech_regions(
    probabilities: &[f32],
    audio_length_samples: usize,
    threshold: f32,
    config: &VadSegmentationConfig,
) -> Vec<SampleRange> {
    if audio_length_samples == 0 || probabilities.is_empty() {
        return Vec::new();
    }

    let seconds_to_samples = |seconds: f64| (seconds * SAMPLE_RATE as f64) as usize;
    let min_speech = seconds_to_samples(config.min_speech_duration);
    let min_silence = seconds_to_samples(config.min_silence_duration);
    let pad = seconds_to_samples(config.speech_padding);
    let release_threshold = config.effective_negative_threshold(threshold);

    let mut regions: Vec<SampleRange> = Vec::new();
    // `Some(start)` while inside a speech region.
    let mut speech_start: Option<usize> = None;
    // `Some(sample)` once a candidate end-of-speech has been seen.
    let mut silence_start: Option<usize> = None;

    for (frame, &probability) in probabilities.iter().enumerate() {
        let frame_start = frame * VAD_WINDOW_SAMPLES;

        if probability >= threshold {
            // Speech (re)confirmed: forget any pending end and open a region
            // if none is active.
            silence_start = None;
            if speech_start.is_none() {
                speech_start = Some(frame_start);
            }
            continue;
        }

        let Some(region_start) = speech_start else {
            continue;
        };
        // Frames between the two thresholds neither extend nor end the region.
        if probability < release_threshold {
            let candidate_end = *silence_start.get_or_insert(frame_start);
            if frame_start.saturating_sub(candidate_end) >= min_silence {
                if candidate_end.saturating_sub(region_start) >= min_speech {
                    regions.push(SampleRange {
                        start: region_start,
                        end: candidate_end,
                    });
                }
                speech_start = None;
                silence_start = None;
            }
        }
    }

    // A region still open at the end of the input extends to the end of the audio.
    if let Some(region_start) = speech_start {
        if audio_length_samples.saturating_sub(region_start) >= min_speech {
            regions.push(SampleRange {
                start: region_start,
                end: audio_length_samples,
            });
        }
    }

    if regions.is_empty() {
        return Vec::new();
    }

    // Padding pass: the first start and the last end get the full padding;
    // each interior gap is shared between the two regions it separates.
    let last = regions.len() - 1;
    regions[0].start = regions[0].start.saturating_sub(pad);
    for index in 0..last {
        let gap = regions[index + 1].start.saturating_sub(regions[index].end);
        let grow = if gap < 2 * pad { gap / 2 } else { pad };
        regions[index].end = (regions[index].end + grow).min(audio_length_samples);
        regions[index + 1].start = regions[index + 1].start.saturating_sub(grow);
    }
    regions[last].end = (regions[last].end + pad).min(audio_length_samples);

    regions.retain(|region| region.end > region.start);
    regions
}
/// Returns the frames of `probabilities` that overlap `region`.
///
/// The first frame is the one containing `region.start`; the end is rounded
/// up so a partially covered final frame is included. Both bounds are clamped
/// to the number of available frames.
///
/// # Panics
///
/// Panics if the clamped start frame is greater than the clamped end frame,
/// which can only happen for a `region` whose `start` lies beyond its `end`.
#[cfg(feature = "long-form-vad")]
pub(crate) fn region_probability_slice(probabilities: &[f32], region: SampleRange) -> &[f32] {
    let available = probabilities.len();
    let first_frame = (region.start / VAD_WINDOW_SAMPLES).min(available);
    let past_last_frame = region.end.div_ceil(VAD_WINDOW_SAMPLES).min(available);
    &probabilities[first_frame..past_last_frame]
}
/// Splits an over-long speech region at silences so that every piece fits in
/// `max_chunk_samples`.
///
/// A region that already fits is returned as the single element of the
/// result. Otherwise silence spans (frames with probability `<=`
/// `silence_threshold_for_split`, lasting at least
/// `min_silence_at_max_speech`) are collected, and the region is cut
/// repeatedly: among the spans that start strictly after the current cursor
/// and strictly before `cursor + max_chunk_samples`, the longest one is
/// chosen; the piece ends where that span starts and the next piece begins
/// where it ends. The silence itself is not part of any piece.
///
/// Returns `None` as soon as no suitable silence exists for a required cut.
#[cfg(feature = "long-form-vad")]
pub(crate) fn plan_region_subsegments(
    region: SampleRange,
    region_probabilities: &[f32],
    config: &VadSegmentationConfig,
    max_chunk_samples: usize,
) -> Option<Vec<SampleRange>> {
    let span_len = |span: &SampleRange| span.end.saturating_sub(span.start);

    if span_len(&region) <= max_chunk_samples {
        return Some(vec![region]);
    }

    let min_gap = (config.min_silence_at_max_speech * SAMPLE_RATE as f64) as usize;
    let candidates: Vec<SampleRange> = silence_spans(
        region,
        region_probabilities,
        config.silence_threshold_for_split,
    )
    .into_iter()
    .filter(|span| span_len(span) >= min_gap)
    .collect();

    let mut pieces = Vec::new();
    let mut cursor = region.start;
    while region.end.saturating_sub(cursor) > max_chunk_samples {
        let window_end = cursor + max_chunk_samples;
        // Longest silence that begins inside the current window; bail out
        // with `None` when there is none.
        let cut = candidates
            .iter()
            .filter(|span| cursor < span.start && span.start < window_end)
            .max_by_key(|span| span_len(span))
            .copied()?;
        pieces.push(SampleRange {
            start: cursor,
            end: cut.start,
        });
        cursor = cut.end;
    }

    // Whatever remains after the last cut fits in one chunk.
    if cursor < region.end {
        pieces.push(SampleRange {
            start: cursor,
            end: region.end,
        });
    }
    Some(pieces)
}
/// Finds maximal runs of frames whose probability is `<= threshold`.
///
/// `region_probabilities[0]` is taken to be the frame containing
/// `region.start`. Each run is reported in absolute sample indices: its start
/// is clamped up to `region.start`, and its end is the start of the first
/// non-silent frame after it, clamped down to `region.end`. A run that is
/// still open after the last frame ends at `region.end`.
#[cfg(feature = "long-form-vad")]
fn silence_spans(
    region: SampleRange,
    region_probabilities: &[f32],
    threshold: f32,
) -> Vec<SampleRange> {
    let first_frame = region.start / VAD_WINDOW_SAMPLES;
    let mut found = Vec::new();
    // Sample index at which the currently open silent run began, if any.
    let mut open_run: Option<usize> = None;

    for (offset, &probability) in region_probabilities.iter().enumerate() {
        let frame_sample = (first_frame + offset) * VAD_WINDOW_SAMPLES;
        if probability <= threshold {
            if open_run.is_none() {
                open_run = Some(frame_sample);
            }
        } else if let Some(run_start) = open_run.take() {
            found.push(SampleRange {
                start: run_start.max(region.start),
                end: frame_sample.min(region.end),
            });
        }
    }

    if let Some(run_start) = open_run {
        found.push(SampleRange {
            start: run_start.max(region.start),
            end: region.end,
        });
    }
    found
}
// Unit tests for overlap chunk planning and, when the `long-form-vad` feature
// is enabled, for the VAD-based region detection and splitting helpers.
#[cfg(test)]
mod tests {
use super::{OverlapChunkConfig, SampleRange};
#[cfg(feature = "long-form-vad")]
use super::{VadSegmentationConfig, detect_speech_regions, plan_region_subsegments};
#[cfg(feature = "long-form-vad")]
use crate::constants::VAD_WINDOW_SAMPLES;
// A single low-probability frame between speech frames is followed
// immediately by speech again, so the pending end is discarded and the
// whole input is reported as one region.
#[cfg(feature = "long-form-vad")]
#[test]
fn speech_regions_trim_short_silence() {
let probabilities = vec![0.9, 0.9, 0.1, 0.9, 0.9];
let regions = detect_speech_regions(
&probabilities,
probabilities.len() * VAD_WINDOW_SAMPLES,
0.85,
&VadSegmentationConfig::default(),
);
assert_eq!(regions.len(), 1);
}
// Ten frames with a four-frame silence (frames 3..7) and a five-frame chunk
// limit: the region must be cut at the silence, giving two pieces, the
// first of which fits within the limit.
#[cfg(feature = "long-form-vad")]
#[test]
fn silence_split_prefers_internal_gap() {
let probabilities = vec![0.9, 0.9, 0.9, 0.2, 0.2, 0.2, 0.2, 0.9, 0.9, 0.9];
let region = SampleRange {
start: 0,
end: probabilities.len() * VAD_WINDOW_SAMPLES,
};
let segments = plan_region_subsegments(
region,
&probabilities,
&VadSegmentationConfig::default(),
5 * VAD_WINDOW_SAMPLES,
)
.unwrap();
assert_eq!(segments.len(), 2);
assert!(segments[0].end <= 5 * VAD_WINDOW_SAMPLES);
}
// With no frame at or below the split threshold there is nowhere to cut an
// over-long region, so the planner reports `None`.
// NOTE(review): per the test name the caller is expected to fall back to
// overlap chunking in that case — confirm against the call site.
#[cfg(feature = "long-form-vad")]
#[test]
fn no_silence_returns_none_for_overlap_fallback() {
let probabilities = vec![0.9; 10];
let region = SampleRange {
start: 0,
end: probabilities.len() * VAD_WINDOW_SAMPLES,
};
assert!(
plan_region_subsegments(
region,
&probabilities,
&VadSegmentationConfig::default(),
4 * VAD_WINDOW_SAMPLES
)
.is_none()
);
}
// The default plan over 600 000 samples must need more than one chunk, and
// the second chunk must begin before the first one ends (i.e. they overlap).
#[test]
fn overlap_plan_advances_with_stride() {
let config = OverlapChunkConfig::default();
let chunks = config.plan(SampleRange {
start: 0,
end: 600_000,
});
assert!(chunks.len() > 1);
assert!(chunks[1].start < chunks[0].end);
}
}