use super::*;
use crate::{
audio::sts::pipeline::{
barge_in::EnergyBargeInDetector, chunker::FixedSizeAudioChunker,
turn_taking::SilenceTurnTakingPolicy,
},
error::Error,
};
use std::cell::RefCell;
/// Test VAD that decides speech vs. non-speech purely from frame energy.
struct MockVad {
    /// Minimum RMS (inclusive) at which a frame is reported as speech.
    threshold: f32,
}

impl VadFrameAdapter for MockVad {
    /// Returns `Ok(true)` when the frame's RMS, as computed by
    /// `EnergyBargeInDetector::rms`, is at least `self.threshold`.
    fn is_speech(&mut self, frame: &[f32]) -> Result<bool> {
        let energy = EnergyBargeInDetector::rms(frame);
        let loud_enough = energy >= self.threshold;
        Ok(loud_enough)
    }
}
/// Test STT that always answers with a canned transcript and remembers the
/// size of the last turn it was asked to transcribe.
struct MockStt {
    /// Sample count of the most recent `transcribe_turn` input.
    last_audio_len: RefCell<usize>,
    /// Transcript returned for every turn.
    text: String,
}

impl SttTurnAdapter for MockStt {
    /// Records `turn_audio.len()` and returns a copy of the canned transcript.
    fn transcribe_turn(&mut self, turn_audio: &[f32]) -> Result<String> {
        let sample_count = turn_audio.len();
        self.last_audio_len.replace(sample_count);
        let transcript = self.text.clone();
        Ok(transcript)
    }
}
/// Test LLM that echoes the user text behind a `re:` prefix and logs every
/// prompt it receives.
struct MockLlm {
    /// Every `user_text` passed to `respond`, in call order.
    seen: RefCell<Vec<String>>,
}

impl LlmResponderAdapter for MockLlm {
    /// Logs `user_text` and replies with `re:` followed by that same text.
    fn respond(&mut self, user_text: &str) -> Result<String> {
        let reply = format!("re:{user_text}");
        let mut log = self.seen.borrow_mut();
        log.push(String::from(user_text));
        Ok(reply)
    }
}
/// Test TTS that emits one fixed-size audio chunk per whitespace-separated
/// word and logs every text it is asked to speak.
struct MockTts {
    /// Every `text` passed to `synthesize_stream`, in call order.
    seen: RefCell<Vec<String>>,
    /// Length, in samples, of each emitted chunk.
    samples_per_word: usize,
}

impl TtsStreamAdapter for MockTts {
    /// Logs `text` and returns an iterator of `Ok` chunks, each holding
    /// `samples_per_word` samples of value `0.1`.
    ///
    /// The chunk count equals the number of whitespace-separated words in
    /// `text`, with a floor of one so empty input still yields a chunk.
    fn synthesize_stream<'a>(
        &'a mut self,
        text: &str,
    ) -> Result<Box<dyn Iterator<Item = Result<Vec<f32>>> + 'a>> {
        self.seen.borrow_mut().push(text.to_owned());

        let chunk_len = self.samples_per_word;
        let chunk_count = text.split_whitespace().count().max(1);

        // The closure only captures `chunk_len` by value, so the returned
        // iterator does not hold on to `self`.
        let chunks = std::iter::repeat_with(move || -> Result<Vec<f32>> {
            Ok(vec![0.1_f32; chunk_len])
        })
        .take(chunk_count);

        Ok(Box::new(chunks))
    }

    /// Fixed output rate reported by the mock.
    fn sample_rate(&self) -> u32 {
        24_000
    }
}
/// In-memory audio sink that records everything written to it and counts
/// the calls it receives.
struct MockSink {
    /// Concatenation of every sample passed to `write_samples`.
    recorded: Vec<f32>,
    /// Number of `write_samples` calls.
    write_count: usize,
    /// Number of `flush` calls.
    flush_count: usize,
    /// Set by `write_samples`, cleared by `stop`.
    running: bool,
}

impl MockSink {
    /// Creates an empty, non-running sink with all counters at zero.
    fn new() -> Self {
        let recorded = Vec::new();
        Self {
            recorded,
            write_count: 0,
            flush_count: 0,
            running: false,
        }
    }
}

impl AudioOutputStream for MockSink {
    /// Appends `samples` to the recording, bumps the write counter, marks the
    /// sink as running, and reports every sample as accepted.
    fn write_samples(&mut self, samples: &[f32]) -> Result<usize> {
        let accepted = samples.len();
        self.recorded.extend(samples.iter().copied());
        self.write_count += 1;
        self.running = true;
        Ok(accepted)
    }

    /// Counts the flush; nothing is buffered, so there is nothing else to do.
    fn flush(&mut self) -> Result<()> {
        self.flush_count += 1;
        Ok(())
    }

    /// Marks the sink as no longer running.
    fn stop(&mut self) -> Result<()> {
        self.running = false;
        Ok(())
    }

    /// Reports the current value of the `running` flag.
    fn is_running(&self) -> bool {
        self.running
    }
}
/// Builds the default session used by most tests: 16 kHz input, 20 ms frames,
/// 40 ms pre-roll, 200 ms silence thresholds, and the mock adapters defined in
/// this file.
///
/// The STT mock always transcribes to `"hello world"` and the TTS mock emits
/// 100 samples per word.
fn test_session() -> VoiceSession<
    MockVad,
    MockStt,
    MockLlm,
    MockTts,
    FixedSizeAudioChunker,
    EnergyBargeInDetector,
    SilenceTurnTakingPolicy,
> {
    let config = VoicePipelineConfig::new()
        .with_input_sample_rate(16_000)
        .with_frame_duration_ms(20)
        .with_preroll_ms(40)
        .with_vad_end_silence_ms(200)
        .with_turn_max_incomplete_silence_ms(200);

    // Samples per frame: sample rate (Hz) * frame duration (ms) / 1000.
    let chunk_size = (16_000 * 20) / 1_000;

    let vad = MockVad { threshold: 0.05 };
    let stt = MockStt {
        last_audio_len: RefCell::new(0),
        text: String::from("hello world"),
    };
    let llm = MockLlm {
        seen: RefCell::new(Vec::new()),
    };
    let tts = MockTts {
        seen: RefCell::new(Vec::new()),
        samples_per_word: 100,
    };
    let chunker = FixedSizeAudioChunker::new(chunk_size);
    let barge_in = EnergyBargeInDetector::default();
    let turn_policy = SilenceTurnTakingPolicy::new(200);

    let session = VoiceSession::new(config, vad, stt, llm, tts, chunker, barge_in, turn_policy);
    session.expect("test session input_sample_rate is non-zero")
}
/// Drives one full turn through `step` (one chunk of silence, ten chunks of
/// speech, eleven chunks of silence) and checks what every mock stage saw.
#[test]
fn end_to_end_drives_vad_stt_llm_tts_in_order() {
    let chunk_size = 320;
    let mut session = test_session();
    let mut sink = MockSink::new();

    let speech_chunk: Vec<f32> = (0..chunk_size).map(|i| 0.3 * ((i as f32).sin())).collect();
    let leading_silence: Vec<f32> = vec![0.0; chunk_size];
    let speech_run: Vec<f32> = speech_chunk.repeat(10);
    let trailing_silence: Vec<f32> = vec![0.0; 11 * chunk_size];

    session.step(&leading_silence, &mut sink, false).unwrap();
    session.step(&speech_run, &mut sink, false).unwrap();
    let finalized = session
        .step(&trailing_silence, &mut sink, false)
        .unwrap();
    assert_eq!(finalized, 1, "exactly one turn finalized");

    // The recorded turn carries the canned transcript and the echoed reply.
    let events = session.turn_events();
    assert_eq!(events.len(), 1);
    let turn = &events[0];
    assert_eq!(turn.user_text(), "hello world");
    assert_eq!(turn.assistant_text(), "re:hello world");
    assert!(!turn.barge_in_observed());

    // STT must have been handed at least the ten speech chunks.
    let stt_samples = *session.stt.last_audio_len.borrow();
    assert!(stt_samples >= 10 * chunk_size);

    // LLM saw the transcript; TTS saw the LLM reply.
    assert_eq!(session.llm.seen.borrow().as_slice(), &["hello world"]);
    assert_eq!(session.tts.seen.borrow().as_slice(), &["re:hello world"]);

    // "re:hello world" is two words and the TTS mock emits 100 samples each.
    assert_eq!(sink.recorded.len(), 200);
}
/// With `play_audio` switched off, a turn is still recorded but nothing is
/// ever written to the sink.
#[test]
fn play_audio_false_skips_tts() {
    let chunk_size = 320;
    let mut session = test_session();
    session.config = session.config.with_play_audio(false);
    let mut sink = MockSink::new();

    let speech_chunk: Vec<f32> = (0..chunk_size).map(|i| 0.3 * ((i as f32).sin())).collect();
    let speech_run: Vec<f32> = speech_chunk.repeat(5);
    let trailing_silence: Vec<f32> = vec![0.0; 11 * chunk_size];

    // Five chunks of speech followed by eleven chunks of silence.
    session.step(&speech_run, &mut sink, false).unwrap();
    session
        .step(&trailing_silence, &mut sink, false)
        .unwrap();

    assert_eq!(session.turn_events().len(), 1);
    assert_eq!(sink.recorded.len(), 0);
    assert_eq!(sink.write_count, 0);
}
/// `run` consumes a finite mic iterator (five speech chunks, then fifteen
/// silence chunks), records one turn, and flushes the sink exactly once.
#[test]
fn run_drives_mic_iterator_to_end() {
    let chunk_size = 320;
    let mut session = test_session();
    let mut sink = MockSink::new();

    let speech_chunk: Vec<f32> = (0..chunk_size).map(|i| 0.3 * ((i as f32).sin())).collect();
    let silence_chunk: Vec<f32> = vec![0.0; chunk_size];

    // `vec![x; n]` clones the chunk n times, matching a push-in-a-loop build.
    let mut mic: Vec<Vec<f32>> = vec![speech_chunk; 5];
    mic.extend(vec![silence_chunk; 15]);

    session.run(mic.into_iter(), &mut sink).unwrap();

    assert_eq!(session.turn_events().len(), 1);
    assert_eq!(sink.flush_count, 1, "sink flushed exactly once at run-end");
}
/// A mic stream that ends mid-speech (no trailing silence) must still yield
/// one recorded turn once `run` returns.
#[test]
fn run_flushes_in_progress_turn_at_mic_eof() {
    let chunk_size = 320;
    let speech_chunk: Vec<f32> = (0..chunk_size).map(|i| 0.3 * ((i as f32).sin())).collect();

    let mut session = test_session();
    let mut sink = MockSink::new();

    // Five speech chunks and then end-of-stream.
    let mic = std::iter::repeat_with(|| speech_chunk.clone()).take(5);
    session.run(mic, &mut sink).unwrap();

    assert_eq!(session.turn_events().len(), 1);
}
/// Speech delivered to `step` with the third argument set to `true`, while
/// the sink reports itself as running, must mark the turn as a barge-in.
#[test]
fn barge_in_observed_when_user_overlaps_tts() {
    let chunk_size = 320;
    let mut session = test_session();
    let mut sink = MockSink::new();
    sink.running = true;

    let speech_chunk: Vec<f32> = (0..chunk_size).map(|i| 0.3 * ((i as f32).sin())).collect();
    let trailing_silence: Vec<f32> = vec![0.0; 12 * chunk_size];

    session.step(&speech_chunk, &mut sink, true).unwrap();
    session
        .step(&trailing_silence, &mut sink, false)
        .unwrap();

    let events = session.turn_events();
    assert_eq!(events.len(), 1);
    assert!(events[0].barge_in_observed());
}
/// A sink whose `write_samples` reports zero samples accepted must make
/// `step` fail with an `InvariantViolation` that mentions the audio sink.
#[test]
fn write_samples_zero_is_backend_error() {
    /// Sink that accepts nothing: every write reports zero samples consumed.
    struct BadSink;

    impl AudioOutputStream for BadSink {
        fn write_samples(&mut self, _samples: &[f32]) -> Result<usize> {
            Ok(0)
        }
        fn flush(&mut self) -> Result<()> {
            Ok(())
        }
        fn stop(&mut self) -> Result<()> {
            Ok(())
        }
        fn is_running(&self) -> bool {
            false
        }
    }

    let chunk_size = 320;
    let mut session = test_session();
    let mut sink = BadSink;

    let speech_chunk: Vec<f32> = (0..chunk_size).map(|i| 0.3 * ((i as f32).sin())).collect();
    let trailing_silence: Vec<f32> = vec![0.0; 12 * chunk_size];

    // The speech step succeeds; the failure shows up on the silence step.
    session.step(&speech_chunk, &mut sink, false).unwrap();
    let failure = session
        .step(&trailing_silence, &mut sink, false)
        .unwrap_err();

    let msg = match failure {
        Error::InvariantViolation(p) => p.to_string(),
        other => panic!("expected InvariantViolation error, got: {other:?}"),
    };
    assert!(msg.contains("audio sink"), "got: {msg}");
}
/// `config()` exposes the configuration the session was built with.
#[test]
fn config_accessor_returns_bundled_config() {
    let session = test_session();
    let bundled = session.config();

    let sample_rate = bundled.input_sample_rate();
    let frame_ms = bundled.frame_duration_ms();

    assert_eq!(sample_rate, 16_000);
    assert_eq!(frame_ms, 20);
}
/// Regression test: the chunk that starts a turn must reach STT exactly once.
///
/// The speech chunk is filled with the marker value `0.5`, so counting marker
/// samples in the audio handed to STT reveals any duplication.
#[test]
fn voice_session_first_speech_chunk_is_not_duplicated_in_turn_audio() {
    /// STT stand-in that keeps a full copy of the last turn's audio.
    struct CapturingStt {
        audio: RefCell<Vec<f32>>,
    }

    impl SttTurnAdapter for CapturingStt {
        fn transcribe_turn(&mut self, turn_audio: &[f32]) -> Result<String> {
            self.audio.replace(turn_audio.to_vec());
            Ok(String::from("captured"))
        }
    }

    let chunk_size = 320;
    let config = VoicePipelineConfig::new()
        .with_input_sample_rate(16_000)
        .with_frame_duration_ms(20)
        .with_preroll_ms(40)
        .with_vad_end_silence_ms(200)
        .with_turn_max_incomplete_silence_ms(200);

    let vad = MockVad { threshold: 0.05 };
    let stt = CapturingStt {
        audio: RefCell::new(Vec::new()),
    };
    let llm = MockLlm {
        seen: RefCell::new(Vec::new()),
    };
    let tts = MockTts {
        seen: RefCell::new(Vec::new()),
        samples_per_word: 1,
    };
    let mut session = VoiceSession::new(
        config,
        vad,
        stt,
        llm,
        tts,
        FixedSizeAudioChunker::new(chunk_size),
        EnergyBargeInDetector::default(),
        SilenceTurnTakingPolicy::new(200),
    )
    .unwrap();
    let mut sink = MockSink::new();

    let silence_chunk: Vec<f32> = vec![0.0; chunk_size];
    let marker_chunk: Vec<f32> = vec![0.5_f32; chunk_size];
    let closing_silence: Vec<f32> = vec![0.0; chunk_size * 12];

    // Two silent chunks, then the marker chunk, then enough silence to
    // finalize the turn.
    for _ in 0..2 {
        session.step(&silence_chunk, &mut sink, false).unwrap();
    }
    session.step(&marker_chunk, &mut sink, false).unwrap();
    session.step(&closing_silence, &mut sink, false).unwrap();

    assert_eq!(session.turn_events().len(), 1);

    let captured = session.stt.audio.borrow();
    let n_marker = captured.iter().copied().filter(|s| *s == 0.5).count();
    assert_eq!(
        n_marker, chunk_size,
        "the speech chunk (320 samples of 0.5) must appear EXACTLY \
         ONCE in the turn audio — an idle branch that pushed it into \
         the pre-roll BEFORE the VAD branch ran would let \
         the start-of-turn snapshot copy it into \
         the turn audio AGAIN (640 marker samples total)."
    );
}
/// When the mic stream ends with a sub-chunk remainder (100 samples after five
/// full chunks), the audio given to STT must include that remainder.
#[test]
fn voice_session_run_at_mic_eof_with_partial_chunk_still_finalizes_full_audio() {
    let chunk_size = 320;
    let speech_chunk: Vec<f32> = (0..chunk_size).map(|i| 0.3 * ((i as f32).sin())).collect();
    let partial: Vec<f32> = (0..100)
        .map(|i| 0.3 * ((i as f32 + 1000.0).sin()))
        .collect();

    let mut session = test_session();
    let mut sink = MockSink::new();

    // Five full speech chunks followed by the short tail.
    let mut mic: Vec<Vec<f32>> = vec![speech_chunk; 5];
    mic.push(partial);

    session.run(mic.into_iter(), &mut sink).unwrap();

    assert_eq!(session.turn_events().len(), 1);
    let stt_len = *session.stt.last_audio_len.borrow();
    assert_eq!(
        stt_len,
        5 * 320 + 100,
        "STT must receive the 5 full speech chunks PLUS the 100 \
         residual samples buffered in the chunker at mic-EOF"
    );
}
/// `flush_in_progress_turn` must hand STT the three full chunks already
/// stepped plus the 50-sample remainder that never filled a chunk.
#[test]
fn voice_session_flush_in_progress_turn_drains_chunker_residual() {
    let chunk_size = 320;
    let speech_chunk: Vec<f32> = (0..chunk_size).map(|i| 0.3 * ((i as f32).sin())).collect();
    let partial: Vec<f32> = (0..50).map(|i| 0.3 * ((i as f32 + 7.0).sin())).collect();

    let mut session = test_session();
    let mut sink = MockSink::new();

    for _ in 0..3 {
        session.step(&speech_chunk, &mut sink, false).unwrap();
    }
    session.step(&partial, &mut sink, false).unwrap();

    let flushed = session.flush_in_progress_turn(&mut sink).unwrap();
    assert!(flushed);

    let stt_len = *session.stt.last_audio_len.borrow();
    assert_eq!(stt_len, 3 * 320 + 50);
}
/// Low-level noise seen before any turn starts (stepped with the third
/// argument `true`) must not taint a turn that begins later.
#[test]
fn voice_session_barge_in_observed_does_not_leak_from_idle_into_later_turn() {
    let chunk_size = 320;
    let mut session = test_session();
    let mut sink = MockSink::new();

    // 0.04 sits just under the 0.05 threshold `test_session` gives MockVad.
    let noisy_idle: Vec<f32> = vec![0.04_f32; chunk_size];
    let silence: Vec<f32> = vec![0.0; chunk_size];
    session.step(&noisy_idle, &mut sink, true).unwrap();
    session.step(&silence, &mut sink, false).unwrap();

    // Five back-to-back copies of the usual sine speech chunk.
    let speech_chunk: Vec<f32> = (0..chunk_size).map(|i| 0.3 * ((i as f32).sin())).collect();
    let speech_frame: Vec<f32> = speech_chunk.repeat(5);
    session.step(&speech_frame, &mut sink, false).unwrap();

    let silence_frame: Vec<f32> = vec![0.0; 12 * chunk_size];
    session.step(&silence_frame, &mut sink, false).unwrap();

    let events = session.turn_events();
    assert_eq!(events.len(), 1, "exactly one turn finalized");
    let leaked = events[0].barge_in_observed();
    assert!(
        !leaked,
        "idle-noise barge-in detection must NOT leak into a later \
         turn — only in-turn speech-while-TTS-playing counts"
    );
}
/// `VoiceSession::new` must refuse a zero input sample rate with an
/// `InvariantViolation` whose message mentions `sample_rate`.
#[test]
fn voice_session_new_rejects_zero_sample_rate() {
    let chunk_size = 320;
    let config = VoicePipelineConfig::new()
        .with_input_sample_rate(0)
        .with_frame_duration_ms(20)
        .with_preroll_ms(40)
        .with_vad_end_silence_ms(200)
        .with_turn_max_incomplete_silence_ms(200);

    let vad = MockVad { threshold: 0.05 };
    let stt = MockStt {
        last_audio_len: RefCell::new(0),
        text: String::from("x"),
    };
    let llm = MockLlm {
        seen: RefCell::new(Vec::new()),
    };
    let tts = MockTts {
        seen: RefCell::new(Vec::new()),
        samples_per_word: 1,
    };

    let result = VoiceSession::new(
        config,
        vad,
        stt,
        llm,
        tts,
        FixedSizeAudioChunker::new(chunk_size),
        EnergyBargeInDetector::default(),
        SilenceTurnTakingPolicy::new(200),
    );

    // Matched by hand so the `Ok` payload is never required to be `Debug`.
    let failure = match result {
        Ok(_) => panic!("expected Err, got Ok"),
        Err(e) => e,
    };
    let msg = match failure {
        Error::InvariantViolation(p) => p.to_string(),
        other => panic!("expected InvariantViolation error, got: {other:?}"),
    };
    assert!(
        msg.contains("sample_rate"),
        "expected InvariantViolation mentioning sample_rate, got: {msg}"
    );
}
#[test]
fn voice_session_silence_accounting_uses_per_chunk_duration_not_first_chunk() {
let speech_chunk: Vec<f32> = (0..320).map(|i| 0.3 * ((i as f32).sin())).collect();
let big_silence: Vec<f32> = vec![0.0; 320];
let small_silence: Vec<f32> = vec![0.0; 160];
struct OneShotChunker {
chunks: Option<Vec<Vec<f32>>>,
}
impl AudioChunker for OneShotChunker {
fn push_samples(&mut self, _samples: &[f32]) -> Result<Vec<Vec<f32>>> {
Ok(self.chunks.take().unwrap_or_default())
}
fn drain_residual(&mut self) -> Vec<f32> {
Vec::new()
}
fn reset(&mut self) {
self.chunks = None;
}
}
struct RecordingTurnTaking {
observed_silence_ms: RefCell<Vec<u32>>,
threshold_ms: u32,
}
impl TurnTakingPolicy for RecordingTurnTaking {
fn user_finished(&self, _recent_audio: &[f32], silence_ms: u32) -> bool {
self.observed_silence_ms.borrow_mut().push(silence_ms);
silence_ms >= self.threshold_ms
}
}
let one_shot = OneShotChunker {
chunks: Some(vec![
speech_chunk.clone(),
speech_chunk,
big_silence,
small_silence,
]),
};
let config = VoicePipelineConfig::new()
.with_input_sample_rate(16_000)
.with_frame_duration_ms(20)
.with_preroll_ms(0)
.with_vad_end_silence_ms(200)
.with_turn_max_incomplete_silence_ms(200);
let mut sess = VoiceSession::new(
config,
MockVad { threshold: 0.05 },
MockStt {
last_audio_len: RefCell::new(0),
text: "x".to_string(),
},
MockLlm {
seen: RefCell::new(Vec::new()),
},
MockTts {
seen: RefCell::new(Vec::new()),
samples_per_word: 1,
},
one_shot,
EnergyBargeInDetector::default(),
RecordingTurnTaking {
observed_silence_ms: RefCell::new(Vec::new()),
threshold_ms: 100,
},
)
.unwrap();
let mut sink = MockSink::new();
sess.step(&[], &mut sink, false).unwrap();
assert_eq!(
sess.turn_events().len(),
0,
"high threshold (100 ms) must NOT finalize on a 30 ms silence run",
);
let observed = sess.turn_policy.observed_silence_ms.borrow().clone();
assert_eq!(
observed,
vec![20, 30],
"per-chunk silence_ms must accumulate as variable-frame durations \
[20, 30]; a chunks[0].len() reuse would produce [20, 40]; got {observed:?}",
);
}
#[test]
fn voice_session_silence_accounting_records_uniform_chunk_size_correctly() {
let speech_chunk: Vec<f32> = (0..320).map(|i| 0.3 * ((i as f32).sin())).collect();
let silence_chunk: Vec<f32> = vec![0.0; 320];
struct OneShotChunker {
chunks: Option<Vec<Vec<f32>>>,
}
impl AudioChunker for OneShotChunker {
fn push_samples(&mut self, _samples: &[f32]) -> Result<Vec<Vec<f32>>> {
Ok(self.chunks.take().unwrap_or_default())
}
fn drain_residual(&mut self) -> Vec<f32> {
Vec::new()
}
fn reset(&mut self) {
self.chunks = None;
}
}
struct RecordingTurnTaking {
observed_silence_ms: RefCell<Vec<u32>>,
threshold_ms: u32,
}
impl TurnTakingPolicy for RecordingTurnTaking {
fn user_finished(&self, _recent_audio: &[f32], silence_ms: u32) -> bool {
self.observed_silence_ms.borrow_mut().push(silence_ms);
silence_ms >= self.threshold_ms
}
}
let one_shot = OneShotChunker {
chunks: Some(vec![
speech_chunk.clone(),
speech_chunk,
silence_chunk.clone(),
silence_chunk,
]),
};
let config = VoicePipelineConfig::new()
.with_input_sample_rate(16_000)
.with_frame_duration_ms(20)
.with_preroll_ms(0)
.with_vad_end_silence_ms(200)
.with_turn_max_incomplete_silence_ms(200);
let mut sess = VoiceSession::new(
config,
MockVad { threshold: 0.05 },
MockStt {
last_audio_len: RefCell::new(0),
text: "x".to_string(),
},
MockLlm {
seen: RefCell::new(Vec::new()),
},
MockTts {
seen: RefCell::new(Vec::new()),
samples_per_word: 1,
},
one_shot,
EnergyBargeInDetector::default(),
RecordingTurnTaking {
observed_silence_ms: RefCell::new(Vec::new()),
threshold_ms: 100,
},
)
.unwrap();
let mut sink = MockSink::new();
sess.step(&[], &mut sink, false).unwrap();
let observed = sess.turn_policy.observed_silence_ms.borrow().clone();
assert_eq!(observed, vec![20, 40], "uniform 320-sample silence chunks");
}