use super::audio_processor::VadAudioProcessor;
use crate::config::VadConfig;
use crate::{Result, error::SubXError};
use log::{debug, trace, warn};
use std::time::{Duration, Instant};
use voice_activity_detector::{IteratorExt, LabeledAudio, VoiceActivityDetector};
/// On-device voice activity detector built on the `voice_activity_detector`
/// crate; no network access is required for detection.
pub struct LocalVadDetector {
    /// Detection tuning (sensitivity, padding chunks, minimum speech duration).
    config: VadConfig,
    /// Prepares raw audio into sample buffers the detector can consume.
    audio_processor: VadAudioProcessor,
}
impl LocalVadDetector {
    /// Builds a detector from `config`, initializing the audio processor.
    ///
    /// # Errors
    ///
    /// Returns an error when the [`VadAudioProcessor`] fails to initialize.
    pub fn new(config: VadConfig) -> Result<Self> {
        debug!("Initializing LocalVadDetector with config: {:?}", config);
        Ok(Self {
            config,
            audio_processor: VadAudioProcessor::new()?,
        })
    }

    /// Detects speech segments in pre-decoded audio data.
    ///
    /// Input whose sample rate is neither 8 kHz nor 16 kHz is resampled to
    /// 16 kHz first, because those are the only rates the VAD accepts (see
    /// [`Self::calculate_chunk_size`]).
    ///
    /// # Errors
    ///
    /// Returns an error when the sample buffer is empty, resampling fails,
    /// or the VAD instance cannot be created.
    pub async fn detect_speech_from_data(
        &self,
        mut audio_data: crate::services::vad::audio_processor::ProcessedAudioData,
    ) -> Result<VadResult> {
        debug!(
            "Starting speech detection for ProcessedAudioData: sample_rate={}, duration={}",
            audio_data.info.sample_rate, audio_data.info.duration_seconds
        );
        if audio_data.samples.is_empty() {
            return Err(SubXError::audio_processing(
                "Audio data is empty".to_string(),
            ));
        }
        let start_time = Instant::now();
        if audio_data.info.sample_rate != 8000 && audio_data.info.sample_rate != 16000 {
            debug!(
                "Resampling audio from {}Hz to 16000Hz...",
                audio_data.info.sample_rate
            );
            use crate::services::vad::resample::resample_to_target_rate;
            let resampled =
                resample_to_target_rate(&audio_data.samples, audio_data.info.sample_rate, 16000)?;
            let new_len = resampled.len();
            // Keep the metadata consistent with the resampled buffer.
            audio_data.samples = resampled;
            audio_data.info.sample_rate = 16000;
            audio_data.info.duration_seconds = new_len as f64 / 16000.0;
            audio_data.info.total_samples = new_len;
            debug!(
                "Resampling complete: new sample_rate=16000, total_samples={}, duration={:.3}s",
                new_len, audio_data.info.duration_seconds
            );
        }
        let chunk_size = self.calculate_chunk_size(audio_data.info.sample_rate);
        debug!(
            "Calculated VAD chunk_size={} for sample_rate={}",
            chunk_size, audio_data.info.sample_rate
        );
        let vad = VoiceActivityDetector::builder()
            .sample_rate(audio_data.info.sample_rate)
            .chunk_size(chunk_size)
            .build()
            .map_err(|e| {
                warn!("Failed to create VAD instance: {}", e);
                SubXError::audio_processing(format!("Failed to create VAD: {}", e))
            })?;
        trace!("Running speech segment detection");
        let speech_segments =
            self.detect_speech_segments(vad, &audio_data.samples, audio_data.info.sample_rate)?;
        let processing_duration = start_time.elapsed();
        // `{:?}` on `Duration` already prints a unit suffix (e.g. "1.5ms"),
        // so no literal "seconds" here — the old message misreported units.
        debug!(
            "Speech detection completed in {:?}, segments found: {}",
            processing_duration,
            speech_segments.len()
        );
        Ok(VadResult {
            speech_segments,
            processing_duration,
            audio_info: audio_data.info,
        })
    }

    /// Labels fixed-size chunks as speech / non-speech and merges runs of
    /// speech-labeled chunks into [`SpeechSegment`]s, discarding segments
    /// shorter than the configured minimum duration.
    fn detect_speech_segments(
        &self,
        mut vad: VoiceActivityDetector,
        samples: &[i16],
        sample_rate: u32,
    ) -> Result<Vec<SpeechSegment>> {
        trace!(
            "Detecting speech segments: samples={}, sample_rate={}",
            samples.len(),
            sample_rate
        );
        let mut segments = Vec::new();
        let chunk_size = self.calculate_chunk_size(sample_rate);
        let chunk_duration_seconds = chunk_size as f64 / sample_rate as f64;
        // Higher sensitivity lowers the probability threshold for speech.
        let vad_threshold = 1.0 - self.config.sensitivity;
        debug!(
            "VAD threshold set to {} (sensitivity={})",
            vad_threshold, self.config.sensitivity
        );
        let labels: Vec<LabeledAudio<i16>> = samples
            .iter()
            .copied()
            .label(&mut vad, vad_threshold, self.config.padding_chunks as usize)
            .collect();
        trace!("Labeling complete, total chunks: {}", labels.len());
        let total_chunks = labels.len();
        let mut current_speech_start: Option<f64> = None;
        for (chunk_index, label) in labels.into_iter().enumerate() {
            let chunk_start_time = chunk_index as f64 * chunk_duration_seconds;
            match label {
                LabeledAudio::Speech(_chunk) => {
                    if current_speech_start.is_none() {
                        trace!(
                            "Speech started at {:.3}s (chunk #{})",
                            chunk_start_time, chunk_index
                        );
                        current_speech_start = Some(chunk_start_time);
                    }
                }
                LabeledAudio::NonSpeech(_chunk) => {
                    if let Some(start_time) = current_speech_start.take() {
                        let end_time = chunk_start_time;
                        trace!(
                            "Speech ended at {:.3}s (duration {:.3}s)",
                            end_time,
                            end_time - start_time
                        );
                        self.finish_segment(&mut segments, start_time, end_time, false);
                    }
                }
            }
        }
        // A segment still open when the audio ends is closed at the final
        // chunk boundary.
        if let Some(start_time) = current_speech_start {
            let end_time = total_chunks as f64 * chunk_duration_seconds;
            trace!(
                "Final speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
                start_time,
                end_time,
                end_time - start_time
            );
            self.finish_segment(&mut segments, start_time, end_time, true);
        }
        debug!("Speech segments detected: {}", segments.len());
        Ok(segments)
    }

    /// Records `[start_time, end_time]` as a speech segment when it meets the
    /// configured minimum duration; otherwise logs and drops it. `is_final`
    /// only selects the trace message for the end-of-stream case.
    fn finish_segment(
        &self,
        segments: &mut Vec<SpeechSegment>,
        start_time: f64,
        end_time: f64,
        is_final: bool,
    ) {
        let duration = end_time - start_time;
        if duration >= self.config.min_speech_duration_ms as f64 / 1000.0 {
            trace!(
                "Detected speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
                start_time, end_time, duration
            );
            segments.push(SpeechSegment {
                start_time,
                end_time,
                duration,
            });
        } else if is_final {
            trace!(
                "Discarded short final segment: start={:.3}s, end={:.3}s, duration={:.3}s",
                start_time, end_time, duration
            );
        } else {
            trace!(
                "Discarded short segment: start={:.3}s, end={:.3}s, duration={:.3}s",
                start_time, end_time, duration
            );
        }
    }

    /// Returns the VAD chunk size (in samples) required for `sample_rate`.
    ///
    /// # Panics
    ///
    /// Panics for any rate other than 8 kHz (256-sample chunks) or 16 kHz
    /// (512-sample chunks); `detect_speech_from_data` resamples to 16 kHz
    /// beforehand, so the panic is unreachable from that path.
    pub fn calculate_chunk_size(&self, sample_rate: u32) -> usize {
        trace!("Calculating chunk size for sample_rate={}", sample_rate);
        let chunk_size = match sample_rate {
            8000 => 256,
            16000 => 512,
            _ => panic!(
                "Unsupported VAD sample_rate={}. Only 8kHz/256, 16kHz/512 are allowed.",
                sample_rate
            ),
        };
        debug!(
            "Final chunk_size for sample_rate {}: {}",
            sample_rate, chunk_size
        );
        chunk_size
    }

    /// Read access to the underlying audio processor.
    pub fn audio_processor(&self) -> &VadAudioProcessor {
        &self.audio_processor
    }
}
/// Outcome of one VAD run: the detected speech segments plus bookkeeping.
#[derive(Debug, Clone)]
pub struct VadResult {
    /// Detected speech segments in chronological order; each met the
    /// configured minimum duration.
    pub speech_segments: Vec<SpeechSegment>,
    /// Wall-clock time of the detection pass (includes any resampling).
    pub processing_duration: Duration,
    /// Metadata of the audio that was analyzed (post-resampling, if any).
    pub audio_info: AudioInfo,
}
/// A contiguous span of detected speech, measured in seconds from the start
/// of the audio.
#[derive(Debug, Clone)]
pub struct SpeechSegment {
    /// Segment start, in seconds.
    pub start_time: f64,
    /// Segment end, in seconds.
    pub end_time: f64,
    /// `end_time - start_time`, in seconds.
    pub duration: f64,
}
/// Basic properties of an audio buffer.
#[derive(Debug, Clone)]
pub struct AudioInfo {
    /// Samples per second (Hz).
    pub sample_rate: u32,
    /// Number of audio channels.
    pub channels: u16,
    /// Total length, in seconds.
    pub duration_seconds: f64,
    /// Total number of samples in the buffer.
    pub total_samples: usize,
}