1use std::collections::VecDeque;
2
3use crate::stream_decode::TokenEmit;
4use crate::vad_silero::SileroVad;
5
/// Audio sample rate in Hz; every seconds <-> sample-count conversion in this file uses it.
const SAMPLE_RATE: u32 = 16_000;
/// Number of samples collected before one frame is handed to the VAD for a speech decision.
const VAD_FRAME_SIZE: usize = 512;
8
/// Tunable parameters for the streaming chunker.
///
/// All durations are expressed in seconds and converted to sample counts with
/// the file-level sample rate.
#[derive(Debug, Clone, PartialEq)]
pub struct WindowConfig {
    /// Maximum amount of audio kept in the window buffer; emitted windows are
    /// zero-padded to exactly this length.
    pub max_window_secs: f32,
    /// Amount of newly pushed audio after which a window may be emitted.
    pub stride_secs: f32,
    /// A VAD frame counts as speech when its probability is `>=` this value.
    pub vad_threshold: f32,
    /// Minimum accumulated speech required before the first stride window of
    /// an utterance is emitted.
    pub min_speech_secs: f32,
}
25
26impl Default for WindowConfig {
27 fn default() -> Self {
28 Self {
29 max_window_secs: 5.0,
30 stride_secs: 1.0,
31 vad_threshold: 0.5,
32 min_speech_secs: 0.25,
33 }
34 }
35}
36
/// One snapshot of the chunker's audio buffer together with the VAD-derived
/// metadata that describes it.
#[derive(Debug, Clone, PartialEq)]
pub struct StreamWindow {
    /// Buffered audio, zero-padded at the end up to the maximum window length.
    pub samples: Vec<f32>,
    /// Number of leading entries in `samples` that are real audio (not padding).
    pub real_samples: usize,
    /// Start of the window in seconds, derived from the chunker's utterance
    /// start sample.
    pub window_start_secs: f32,
    /// `true` when at least one retained VAD frame decision was speech.
    pub had_speech: bool,
    /// Seconds since the most recent speech frame, or since the utterance
    /// start when no speech frame has been seen yet.
    pub trailing_silence_secs: f32,
    /// `true` when the window was emitted because the buffer reached its cap.
    pub cap_hit: bool,
    /// `true` when the cap was hit again while a previous cap-hit was still
    /// pending (neither a trim nor an utterance reset happened in between).
    pub forced_eou: bool,
}
58
/// Accumulates streamed audio, runs the VAD over fixed-size frames and turns
/// the buffered audio into [`StreamWindow`]s.
pub struct Chunker {
    /// Window, stride and VAD settings.
    cfg: WindowConfig,
    /// VAD queried once per completed `VAD_FRAME_SIZE`-sample frame.
    vad: SileroVad,
    /// Audio of the current window; `push` stops appending once it holds
    /// `max_window_secs` worth of samples.
    buf: Vec<f32>,
    /// Samples consumed since the stride counter was last reset.
    pub samples_since_last_stride: usize,
    /// Total number of samples consumed by `push` so far.
    pub total_samples_seen: u64,
    /// Value of `total_samples_seen` when the most recent speech frame completed.
    pub last_speech_at_sample: Option<u64>,

    /// Speech samples counted for the current utterance, in whole VAD frames.
    speech_samples_accumulated: u64,
    /// Samples waiting to form the next complete VAD frame.
    vad_frame_buf: Vec<f32>,
    /// Whether a stride window has been emitted since the last utterance reset.
    window_ever_emitted: bool,
    /// Per-frame speech decisions, oldest first, bounded by the window length.
    vad_decisions: VecDeque<bool>,
    /// Value of `total_samples_seen` at the utterance start, advanced by trims.
    utterance_start_sample: u64,
    /// Set when a cap-hit window was returned and neither `trim_oldest` nor
    /// `reset_utterance` has cleared it since.
    cap_pending_handler: bool,
    /// Input samples left unprocessed when `push` returned early on a cap-hit;
    /// they are consumed first by the next `push`.
    deferred_samples: Vec<f32>,
}
90
91impl Chunker {
92 pub fn new(cfg: WindowConfig, vad: SileroVad) -> Self {
93 let max_samples = (cfg.max_window_secs * SAMPLE_RATE as f32) as usize;
94 let max_vad_frames = max_samples.div_ceil(VAD_FRAME_SIZE);
95 Self {
96 cfg,
97 vad,
98 buf: Vec::with_capacity(max_samples),
99 samples_since_last_stride: 0,
100 total_samples_seen: 0,
101 last_speech_at_sample: None,
102 speech_samples_accumulated: 0,
103 vad_frame_buf: Vec::with_capacity(VAD_FRAME_SIZE),
104 window_ever_emitted: false,
105 vad_decisions: VecDeque::with_capacity(max_vad_frames),
106 utterance_start_sample: 0,
107 cap_pending_handler: false,
108 deferred_samples: Vec::new(),
109 }
110 }
111
112 pub fn push(&mut self, samples: &[f32]) -> Option<StreamWindow> {
123 let stride_samples = (self.cfg.stride_secs * SAMPLE_RATE as f32) as usize;
124 let max_samples = (self.cfg.max_window_secs * SAMPLE_RATE as f32) as usize;
125 let min_speech_samples = (self.cfg.min_speech_secs * SAMPLE_RATE as f32) as u64;
126 let max_vad_frames = max_samples.div_ceil(VAD_FRAME_SIZE);
127
128 let mut all_samples: Vec<f32> = std::mem::take(&mut self.deferred_samples);
129 all_samples.extend_from_slice(samples);
130
131 let mut result: Option<StreamWindow> = None;
132
133 for (idx, &s) in all_samples.iter().enumerate() {
134 if self.buf.len() < max_samples {
135 self.buf.push(s);
136 }
137 self.total_samples_seen += 1;
138 self.samples_since_last_stride += 1;
139
140 self.vad_frame_buf.push(s);
142 if self.vad_frame_buf.len() == VAD_FRAME_SIZE {
143 let frame: &[f32; VAD_FRAME_SIZE] = self
144 .vad_frame_buf
145 .as_slice()
146 .try_into()
147 .expect("vad_frame_buf is exactly VAD_FRAME_SIZE");
148 let is_speech = self
149 .vad
150 .probability(frame)
151 .map(|p| p >= self.cfg.vad_threshold)
152 .unwrap_or(false);
153 self.vad_frame_buf.clear();
154
155 if is_speech {
156 self.last_speech_at_sample = Some(self.total_samples_seen);
157 self.speech_samples_accumulated += VAD_FRAME_SIZE as u64;
158 }
159
160 if self.vad_decisions.len() >= max_vad_frames {
161 self.vad_decisions.pop_front();
162 }
163 self.vad_decisions.push_back(is_speech);
164 }
165
166 if self.buf.len() >= max_samples {
174 let mut w = self.build_window(max_samples);
175 w.cap_hit = true;
176 if self.cap_pending_handler {
177 w.forced_eou = true;
178 }
179 self.cap_pending_handler = true;
180 self.samples_since_last_stride = 0;
181 self.deferred_samples = all_samples[idx + 1..].to_vec();
182 return Some(w);
183 }
184
185 if self.samples_since_last_stride >= stride_samples {
187 let enough_speech = self.speech_samples_accumulated >= min_speech_samples;
188 if enough_speech || self.window_ever_emitted {
189 result = Some(self.build_window(max_samples));
190 self.window_ever_emitted = true;
191 }
192 self.samples_since_last_stride = 0;
193 }
194 }
195
196 result
197 }
198
199 fn build_window(&self, max_samples: usize) -> StreamWindow {
200 let real_samples = self.buf.len();
201 let mut samples = Vec::with_capacity(max_samples);
202 samples.extend_from_slice(&self.buf);
203 samples.resize(max_samples, 0.0);
204
205 let trailing_silence_secs = match self.last_speech_at_sample {
206 Some(last) => self.total_samples_seen.saturating_sub(last) as f32 / SAMPLE_RATE as f32,
207 None => {
208 self.total_samples_seen
209 .saturating_sub(self.utterance_start_sample) as f32
210 / SAMPLE_RATE as f32
211 }
212 };
213
214 let window_start_secs = self.utterance_start_sample as f32 / SAMPLE_RATE as f32;
215 let had_speech = self.vad_decisions.iter().any(|&b| b);
216
217 StreamWindow {
218 samples,
219 real_samples,
220 window_start_secs,
221 had_speech,
222 trailing_silence_secs,
223 cap_hit: false,
224 forced_eou: false,
225 }
226 }
227
228 pub fn reset_utterance(&mut self) {
233 self.buf.clear();
234 self.samples_since_last_stride = 0;
235 self.vad_decisions.clear();
236 self.speech_samples_accumulated = 0;
237 self.window_ever_emitted = false;
238 self.utterance_start_sample = self.total_samples_seen;
239 self.cap_pending_handler = false;
240 self.deferred_samples.clear();
241 }
242
243 pub fn trim_oldest(&mut self, samples: usize) -> usize {
253 let trim_frames = samples / VAD_FRAME_SIZE;
254 let trim_samples = trim_frames * VAD_FRAME_SIZE;
255 if trim_samples == 0 || trim_samples >= self.buf.len() {
256 return 0;
257 }
258 self.buf.drain(..trim_samples);
259 for _ in 0..trim_frames {
260 if self.vad_decisions.pop_front().unwrap_or(false) {
261 self.speech_samples_accumulated = self
262 .speech_samples_accumulated
263 .saturating_sub(VAD_FRAME_SIZE as u64);
264 }
265 }
266 self.utterance_start_sample += trim_samples as u64;
267 self.cap_pending_handler = false;
268 trim_samples
269 }
270}
271
/// Output of the committer: either newly committed tokens or the current
/// tentative (not yet agreed) tail of the latest candidate.
pub enum CommitDelta {
    /// Tokens (and their concatenated non-special text) added to the
    /// committed transcript by this call.
    Committed {
        new_tokens: Vec<TokenEmit>,
        new_text: String,
    },
    /// Tokens of the latest candidate that follow the agreed prefix, with
    /// their concatenated non-special text.
    Tentative {
        tokens: Vec<TokenEmit>,
        text: String,
    },
}
285
286pub struct Committer {
291 last_candidate: Vec<TokenEmit>,
292 committed: Vec<TokenEmit>,
293 committed_text: String,
294 awaiting_reset: bool,
299 last_lcp_end_in_candidate: usize,
303}
304
305impl Committer {
306 pub fn new() -> Self {
307 Self {
308 last_candidate: Vec::new(),
309 committed: Vec::new(),
310 committed_text: String::new(),
311 awaiting_reset: false,
312 last_lcp_end_in_candidate: 0,
313 }
314 }
315
316 pub fn ingest(&mut self, candidate: Vec<TokenEmit>) -> (CommitDelta, CommitDelta) {
320 if self.awaiting_reset {
321 self.last_candidate.clear();
322 self.committed.clear();
323 self.committed_text.clear();
324 self.awaiting_reset = false;
325 }
326 let lcp = lcp_len(&self.last_candidate, &candidate);
327 self.last_lcp_end_in_candidate = lcp;
328 let prev_committed_len = self.committed.len();
329
330 if lcp > prev_committed_len {
334 for t in candidate[prev_committed_len..lcp].iter() {
335 self.committed.push(t.clone());
336 }
337 }
338
339 let new_text = build_text(&self.committed[prev_committed_len..]);
340 self.committed_text.push_str(&new_text);
341
342 let new_tokens: Vec<TokenEmit> = self.committed[prev_committed_len..].to_vec();
343 let tentative_tokens: Vec<TokenEmit> = candidate[lcp..].to_vec();
344 let tentative_text = build_text(&tentative_tokens);
345
346 self.last_candidate = candidate;
347
348 (
349 CommitDelta::Committed {
350 new_tokens,
351 new_text,
352 },
353 CommitDelta::Tentative {
354 tokens: tentative_tokens,
355 text: tentative_text,
356 },
357 )
358 }
359
360 pub fn finalize_utterance(&mut self) -> CommitDelta {
367 let start = self.committed.len();
368 let new_tokens: Vec<TokenEmit> = if start < self.last_candidate.len() {
369 self.last_candidate[start..].to_vec()
370 } else {
371 Vec::new()
372 };
373 let new_text = build_text(&new_tokens);
374 self.committed_text.push_str(&new_text);
375 self.committed.extend(new_tokens.iter().cloned());
376 self.last_candidate.clear();
377 self.awaiting_reset = true;
378 CommitDelta::Committed {
379 new_tokens,
380 new_text,
381 }
382 }
383
384 pub fn committed_tokens(&self) -> &[TokenEmit] {
385 &self.committed
386 }
387
388 pub fn committed_text(&self) -> &str {
389 &self.committed_text
390 }
391
392 pub fn last_candidate(&self) -> &[TokenEmit] {
395 &self.last_candidate
396 }
397
398 pub fn last_lcp_end_in_candidate(&self) -> usize {
402 self.last_lcp_end_in_candidate
403 }
404
405 pub fn on_trim(&mut self) -> CommitDelta {
417 let start = self.committed.len();
418 let new_tokens: Vec<TokenEmit> = if start < self.last_candidate.len() {
419 self.last_candidate[start..].to_vec()
420 } else {
421 Vec::new()
422 };
423 let new_text = build_text(&new_tokens);
424 self.committed_text.push_str(&new_text);
425 self.committed.extend(new_tokens.iter().cloned());
426 self.last_candidate.clear();
427 self.last_lcp_end_in_candidate = 0;
428 CommitDelta::Committed {
429 new_tokens,
430 new_text,
431 }
432 }
433}
434
435impl Default for Committer {
436 fn default() -> Self {
437 Self::new()
438 }
439}
440
441fn build_text(tokens: &[TokenEmit]) -> String {
443 tokens
444 .iter()
445 .filter(|t| !t.is_special)
446 .map(|t| t.text.as_str())
447 .collect()
448}
449
450fn lcp_len(a: &[TokenEmit], b: &[TokenEmit]) -> usize {
456 let a_content: Vec<(usize, u32)> = a
457 .iter()
458 .enumerate()
459 .filter(|(_, t)| t.window_ts_secs.is_none())
460 .map(|(i, t)| (i, t.id))
461 .collect();
462 let b_content: Vec<(usize, u32)> = b
463 .iter()
464 .enumerate()
465 .filter(|(_, t)| t.window_ts_secs.is_none())
466 .map(|(i, t)| (i, t.id))
467 .collect();
468
469 let lcp_n = a_content
470 .iter()
471 .zip(b_content.iter())
472 .take_while(|((_, aid), (_, bid))| aid == bid)
473 .count();
474
475 if lcp_n == 0 {
476 return 0;
477 }
478 b_content[lcp_n - 1].0 + 1
479}
480
/// Thresholds used by the endpointer to decide that an utterance has ended.
#[derive(Debug, Clone, PartialEq)]
pub struct EndpointConfig {
    /// Trailing silence, in seconds, that ends an utterance unconditionally
    /// once new committed text exists.
    pub silence_secs: f32,
    /// Shorter trailing silence, in seconds, that ends an utterance when the
    /// committed text ends in `.`, `!` or `?`.
    pub punct_silence_secs: f32,
    /// Minimum utterance duration, in seconds, before any endpoint may fire.
    pub min_utterance_secs: f32,
}
489
490impl Default for EndpointConfig {
491 fn default() -> Self {
492 Self {
493 silence_secs: 2.0,
494 punct_silence_secs: 0.8,
495 min_utterance_secs: 0.5,
496 }
497 }
498}
499
/// Decides, window by window, whether the current utterance has ended.
pub struct Endpointer {
    /// Silence and duration thresholds.
    cfg: EndpointConfig,
    /// Byte length of the committed text taken as the baseline; text beyond
    /// it counts as new for the current utterance.
    text_len_at_reset: usize,
    /// Set once `step` has returned `true`; blocks further firing until `reset`.
    fired: bool,
    /// Set by `reset`; makes the next `step` re-capture `text_len_at_reset`.
    needs_baseline_update: bool,
    /// `window_start_secs` of the first window in which new committed text
    /// was observed since the last reset.
    utterance_start_secs: Option<f32>,
}
516
517impl Endpointer {
518 pub fn new(cfg: EndpointConfig) -> Self {
519 Self {
520 cfg,
521 text_len_at_reset: 0,
522 fired: false,
523 needs_baseline_update: false,
524 utterance_start_secs: None,
525 }
526 }
527
528 pub fn step(&mut self, window: &StreamWindow, latest_committed_text: &str) -> bool {
530 if self.fired {
531 return false;
532 }
533
534 if self.needs_baseline_update {
535 self.text_len_at_reset = latest_committed_text.len();
536 self.needs_baseline_update = false;
537 }
538
539 let has_new_committed = latest_committed_text.len() > self.text_len_at_reset;
540
541 if has_new_committed && self.utterance_start_secs.is_none() {
542 self.utterance_start_secs = Some(window.window_start_secs);
543 }
544
545 let current_secs =
546 window.window_start_secs + window.real_samples as f32 / SAMPLE_RATE as f32;
547 let utterance_secs = self
548 .utterance_start_secs
549 .map(|start| current_secs - start)
550 .unwrap_or(0.0);
551 if utterance_secs < self.cfg.min_utterance_secs {
552 return false;
553 }
554
555 if has_new_committed && window.trailing_silence_secs >= self.cfg.silence_secs {
556 self.fired = true;
557 return true;
558 }
559
560 if has_new_committed {
561 let new_text = &latest_committed_text[self.text_len_at_reset..];
562 if matches!(new_text.chars().last(), Some('.') | Some('!') | Some('?'))
563 && window.trailing_silence_secs >= self.cfg.punct_silence_secs
564 {
565 self.fired = true;
566 return true;
567 }
568 }
569
570 false
571 }
572
573 pub fn reset(&mut self) {
576 self.fired = false;
577 self.utterance_start_secs = None;
578 self.needs_baseline_update = true;
579 }
580}
581
/// Holds the prompt token ids built from the previously committed tokens.
#[derive(Debug, Clone)]
pub struct PromptContext {
    /// Upper bound on the number of text tokens kept in the prompt.
    max_prompt_tokens: usize,
    /// Optional marker token id placed in front of a non-empty prompt.
    prevtext_token_id: Option<u32>,
    /// Prompt produced by the most recent update; empty after `reset`.
    current_prompt: Vec<u32>,
}
595
596impl PromptContext {
597 pub fn new(max_prompt_tokens: usize, prevtext_token_id: Option<u32>) -> Self {
600 Self {
601 max_prompt_tokens,
602 prevtext_token_id,
603 current_prompt: Vec::new(),
604 }
605 }
606
607 pub fn update_after_eou(&mut self, committed_tokens: &[TokenEmit], max_new_tokens: usize) {
613 let max_allowed = 448usize.saturating_sub(3 + max_new_tokens);
616 let text_slots = if self.prevtext_token_id.is_some() {
618 max_allowed.saturating_sub(1)
619 } else {
620 max_allowed
621 };
622 let cap = self.max_prompt_tokens.min(text_slots);
623
624 let regular_ids: Vec<u32> = committed_tokens
626 .iter()
627 .filter(|t| !t.is_special)
628 .map(|t| t.id)
629 .collect();
630
631 let start = regular_ids.len().saturating_sub(cap);
632 let tail = ®ular_ids[start..];
633
634 self.current_prompt.clear();
635 if let Some(prevtext) = self.prevtext_token_id
637 && !tail.is_empty()
638 {
639 self.current_prompt.push(prevtext);
640 }
641 self.current_prompt.extend_from_slice(tail);
642 }
643
644 pub fn prompt_tokens(&self) -> &[u32] {
647 &self.current_prompt
648 }
649
650 pub fn reset(&mut self) {
652 self.current_prompt.clear();
653 }
654}
655
// Unit tests for the chunker, committer, endpointer and prompt context.
// Tests that need the Silero VAD model (and, for one, a speech WAV) are
// marked `#[ignore]`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::vad_silero::ensure_silero_model;
    use anyhow::Result;
    use std::path::PathBuf;

    // `models` directory next to the crate directory (parent of CARGO_MANIFEST_DIR).
    fn models_dir() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .expect("workspace root")
            .join("models")
    }

    // Resolves the Silero model under `models_dir()` and opens a VAD on it.
    fn open_vad() -> Result<SileroVad> {
        let path = ensure_silero_model(&models_dir())?;
        SileroVad::open(&path)
    }

    // Minimal WAV reader for the tests: walks the chunks after the 12-byte
    // header, remembers the format tag from `fmt ` and decodes the `data`
    // chunk as f32 (format tag 3) or as i16 scaled by 1/32768 (anything else).
    // NOTE(review): channel count and bit depth are not inspected — assumes
    // the fixture is mono 16-bit PCM or 32-bit float; confirm.
    fn read_wav_samples(path: &std::path::Path) -> Result<Vec<f32>> {
        use anyhow::Context;
        let bytes = std::fs::read(path).with_context(|| format!("read {}", path.display()))?;
        // Skip the 12 header bytes that precede the first chunk.
        let mut pos = 12usize;
        let mut data_start = None;
        let mut data_len = 0usize;
        // Default format tag used when no `fmt ` chunk precedes `data`.
        let mut audio_format = 1u16;
        while pos + 8 <= bytes.len() {
            let id = &bytes[pos..pos + 4];
            let size = u32::from_le_bytes(bytes[pos + 4..pos + 8].try_into().unwrap()) as usize;
            if id == b"fmt " {
                audio_format = u16::from_le_bytes(bytes[pos + 8..pos + 10].try_into().unwrap());
            } else if id == b"data" {
                data_start = Some(pos + 8);
                data_len = size;
                break;
            }
            // Advance past the chunk header, its payload and the pad byte of
            // odd-sized chunks.
            pos += 8 + size + (size & 1);
        }
        let start = data_start.context("no 'data' chunk in WAV")?;
        // Clamp to the file length in case the declared size overruns it.
        let end = (start + data_len).min(bytes.len());
        if audio_format == 3 {
            let n = (end - start) / 4;
            let mut samples = Vec::with_capacity(n);
            for i in 0..n {
                let b: [u8; 4] = bytes[start + i * 4..start + i * 4 + 4].try_into().unwrap();
                samples.push(f32::from_le_bytes(b));
            }
            Ok(samples)
        } else {
            let n = (end - start) / 2;
            let mut samples = Vec::with_capacity(n);
            for i in 0..n {
                let b: [u8; 2] = bytes[start + i * 2..start + i * 2 + 2].try_into().unwrap();
                samples.push(i16::from_le_bytes(b) as f32 / 32768.0);
            }
            Ok(samples)
        }
    }

    // Feeds 4 s silence + up to 2 s speech + 4 s silence in 512-sample chunks
    // and checks that windows are emitted, that some report speech, and that
    // every speech window starts within 0..=11 s.
    #[test]
    #[ignore = "requires silero_vad.onnx in ./models AND test_data/LJ001-0001_16k.wav at repo root"]
    fn test_chunker_speech_windows_cluster() -> Result<()> {
        let vad = open_vad()?;
        let cfg = WindowConfig {
            max_window_secs: 28.0,
            stride_secs: 1.0,
            vad_threshold: 0.5,
            min_speech_secs: 0.25,
        };
        let mut chunker = Chunker::new(cfg, vad);

        let repo_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .expect("workspace root")
            .to_path_buf();
        let speech_wav = repo_root.join("test_data").join("LJ001-0001_16k.wav");
        assert!(
            speech_wav.exists(),
            "speech WAV not found: {}",
            speech_wav.display()
        );
        let speech_samples = read_wav_samples(&speech_wav)?;
        // Use at most the first 2 s of the recording.
        let speech_len = (2 * SAMPLE_RATE as usize).min(speech_samples.len());
        let speech_clip = &speech_samples[..speech_len];

        let silence_4s = vec![0.0f32; 4 * SAMPLE_RATE as usize];
        let silence_4s_end = vec![0.0f32; 4 * SAMPLE_RATE as usize];

        let chunk_size = 512usize;
        let mut windows: Vec<StreamWindow> = Vec::new();

        let all_samples: Vec<f32> = silence_4s
            .iter()
            .chain(speech_clip.iter())
            .chain(silence_4s_end.iter())
            .copied()
            .collect();

        for chunk in all_samples.chunks(chunk_size) {
            if let Some(w) = chunker.push(chunk) {
                windows.push(w);
            }
        }

        assert!(!windows.is_empty(), "no windows emitted");
        let speech_windows: Vec<&StreamWindow> = windows.iter().filter(|w| w.had_speech).collect();
        assert!(!speech_windows.is_empty(), "no speech windows detected");

        for w in &speech_windows {
            assert!(
                w.window_start_secs >= 0.0 && w.window_start_secs <= 11.0,
                "speech window start {:.2} s outside expected 0–11 s range",
                w.window_start_secs
            );
        }

        Ok(())
    }

    // Regular (non-special, non-timestamp) token whose text is `tok<id>`.
    fn make_token(id: u32) -> TokenEmit {
        TokenEmit {
            id,
            text: format!("tok{id}"),
            logprob: 0.0,
            window_ts_secs: None,
            is_special: false,
        }
    }

    // Special token carrying a window timestamp and no text.
    fn make_ts_token(id: u32, ts: f32) -> TokenEmit {
        TokenEmit {
            id,
            text: String::new(),
            logprob: 0.0,
            window_ts_secs: Some(ts),
            is_special: true,
        }
    }

    // Token ids of a `Committed` delta; empty for `Tentative`.
    fn committed_ids(delta: &CommitDelta) -> Vec<u32> {
        match delta {
            CommitDelta::Committed { new_tokens, .. } => new_tokens.iter().map(|t| t.id).collect(),
            CommitDelta::Tentative { .. } => vec![],
        }
    }

    // Token ids of a `Tentative` delta; empty for `Committed`.
    fn tentative_ids(delta: &CommitDelta) -> Vec<u32> {
        match delta {
            CommitDelta::Tentative { tokens, .. } => tokens.iter().map(|t| t.id).collect(),
            CommitDelta::Committed { .. } => vec![],
        }
    }

    // A token is committed once two consecutive candidates agree on it;
    // finalize commits the rest and the next ingest starts a new utterance.
    #[test]
    fn test_committer_local_agreement_2() -> Result<()> {
        let mut c = Committer::new();

        // Round 1: no previous candidate, so nothing can be agreed on yet.
        let (comm, tent) = c.ingest(vec![make_token(1), make_token(2), make_token(3)]);
        assert!(
            committed_ids(&comm).is_empty(),
            "round 1: nothing committed"
        );
        assert_eq!(
            tentative_ids(&tent),
            vec![1, 2, 3],
            "round 1: all tokens tentative"
        );

        // Round 2: the common prefix [1, 2] is committed.
        let (comm, tent) = c.ingest(vec![
            make_token(1),
            make_token(2),
            make_token(4),
            make_token(5),
        ]);
        assert_eq!(
            committed_ids(&comm),
            vec![1, 2],
            "round 2: [tok1, tok2] committed"
        );
        assert_eq!(
            tentative_ids(&tent),
            vec![4, 5],
            "round 2: tentative = [tok4, tok5]"
        );
        assert_eq!(c.committed_text(), "tok1tok2");

        // Round 3: agreement extends to token 4.
        let (comm, tent) = c.ingest(vec![
            make_token(1),
            make_token(2),
            make_token(4),
            make_token(6),
        ]);
        assert_eq!(
            committed_ids(&comm),
            vec![4],
            "round 3: [tok4] newly committed"
        );
        assert_eq!(tentative_ids(&tent), vec![6], "round 3: tentative = [tok6]");
        assert_eq!(c.committed_text(), "tok1tok2tok4");

        // Finalize force-commits the remaining tentative token.
        let final_delta = c.finalize_utterance();
        assert_eq!(
            committed_ids(&final_delta),
            vec![6],
            "finalize: [tok6] committed"
        );
        assert_eq!(c.committed_text(), "tok1tok2tok4tok6");
        assert_eq!(
            c.committed_tokens()
                .iter()
                .map(|t| t.id)
                .collect::<Vec<_>>(),
            vec![1, 2, 4, 6],
            "committed_tokens must remain readable after finalize for PromptContext"
        );

        // A second finalize has nothing left to commit.
        let noop = c.finalize_utterance();
        assert!(
            committed_ids(&noop).is_empty(),
            "second finalize must be a no-op"
        );

        // The first ingest after finalize resets the committer automatically.
        c.ingest(vec![make_token(10), make_token(20), make_token(30)]);
        let (comm, _) = c.ingest(vec![make_token(10), make_token(20), make_token(40)]);
        assert_eq!(
            committed_ids(&comm),
            vec![10, 20],
            "new utterance must commit [tok10, tok20] after auto-reset"
        );

        Ok(())
    }

    // Timestamp tokens with different ids must not break the agreement on the
    // content tokens between them.
    #[test]
    fn test_committer_lcp_ignores_timestamps() -> Result<()> {
        let mut c = Committer::new();
        c.ingest(vec![
            make_ts_token(50364, 0.0),
            make_token(1),
            make_ts_token(50366, 0.2),
            make_token(2),
        ]);
        let (comm, _tent) = c.ingest(vec![
            make_ts_token(50365, 0.02),
            make_token(1),
            make_ts_token(50367, 0.22),
            make_token(2),
        ]);
        assert_eq!(
            committed_ids(&comm),
            vec![50365, 1, 50367, 2],
            "content tokens tok1/tok2 match despite different timestamp IDs; both windows committed"
        );
        Ok(())
    }

    // 5 s window with the given trailing silence and start time; `had_speech`
    // is derived from the trailing silence being shorter than the window.
    fn make_window(trailing_silence_secs: f32, window_start_secs: f32) -> StreamWindow {
        let real_samples = 5 * SAMPLE_RATE as usize;
        StreamWindow {
            samples: vec![0.0f32; real_samples],
            real_samples,
            window_start_secs,
            had_speech: trailing_silence_secs < 5.0,
            trailing_silence_secs,
            cap_hit: false,
            forced_eou: false,
        }
    }

    // Long silence alone must not produce an endpoint without committed text.
    #[test]
    fn test_endpointer_no_eou_silence_no_text() -> Result<()> {
        let cfg = EndpointConfig {
            silence_secs: 2.0,
            punct_silence_secs: 0.8,
            min_utterance_secs: 0.0,
        };
        let mut ep = Endpointer::new(cfg);
        let window = make_window(2.5, 0.0);
        assert!(!ep.step(&window, ""), "no EOU when committed text is empty");
        Ok(())
    }

    // The hard-silence endpoint fires once and stays quiet until reset.
    #[test]
    fn test_endpointer_hard_eou_fires_once() -> Result<()> {
        let cfg = EndpointConfig {
            silence_secs: 2.0,
            punct_silence_secs: 0.8,
            min_utterance_secs: 0.0,
        };
        let mut ep = Endpointer::new(cfg);

        // 0.3 s of silence is below both thresholds.
        assert!(!ep.step(&make_window(0.3, 0.0), "hello world"));
        assert!(
            ep.step(&make_window(2.5, 1.0), "hello world"),
            "hard EOU should fire"
        );
        assert!(
            !ep.step(&make_window(3.0, 2.0), "hello world"),
            "must not re-fire before reset"
        );

        Ok(())
    }

    // Sentence-final punctuation lowers the required silence to 0.8 s.
    #[test]
    fn test_endpointer_soft_eou_punctuation() -> Result<()> {
        let cfg = EndpointConfig {
            silence_secs: 2.0,
            punct_silence_secs: 0.8,
            min_utterance_secs: 0.0,
        };
        let mut ep = Endpointer::new(cfg);
        assert!(
            ep.step(&make_window(0.9, 0.0), "Hello."),
            "soft EOU should fire after '.' + 0.9 s silence"
        );
        Ok(())
    }

    // Regular tokens for the given ids.
    fn make_committed_tokens(ids: &[u32]) -> Vec<TokenEmit> {
        ids.iter().map(|&id| make_token(id)).collect()
    }

    // With a prevtext id configured the prompt is [prevtext, tokens...].
    #[test]
    fn test_prompt_context_basic() -> Result<()> {
        let tokens = make_committed_tokens(&[1, 2, 3]);
        let mut ctx = PromptContext::new(60, Some(99_999));
        ctx.update_after_eou(&tokens, 128);
        let p = ctx.prompt_tokens();
        assert_eq!(p[0], 99_999, "must start with <|prevtext|>");
        assert_eq!(&p[1..], &[1, 2, 3]);
        Ok(())
    }

    // Without a prevtext id the prompt is just the tokens.
    #[test]
    fn test_prompt_context_no_prevtext() -> Result<()> {
        let tokens = make_committed_tokens(&[1, 2, 3]);
        let mut ctx = PromptContext::new(60, None);
        ctx.update_after_eou(&tokens, 128);
        assert_eq!(ctx.prompt_tokens(), &[1, 2, 3]);
        Ok(())
    }

    // Special (timestamp) tokens never reach the prompt.
    #[test]
    fn test_prompt_context_filters_specials() -> Result<()> {
        let tokens = vec![
            make_ts_token(50364, 0.0),
            make_token(1),
            make_ts_token(50366, 0.2),
            make_token(2),
        ];
        let mut ctx = PromptContext::new(60, None);
        ctx.update_after_eou(&tokens, 128);
        assert_eq!(
            ctx.prompt_tokens(),
            &[1, 2],
            "special/timestamp tokens must be excluded from prompt"
        );
        Ok(())
    }

    // 101 tokens with a cap of 60: only the newest 60 (ids 41..=100) remain.
    #[test]
    fn test_prompt_context_caps_max_prompt_tokens() -> Result<()> {
        let ids: Vec<u32> = (0u32..101).collect();
        let tokens = make_committed_tokens(&ids);
        let mut ctx = PromptContext::new(60, None);
        ctx.update_after_eou(&tokens, 128);
        assert_eq!(ctx.prompt_tokens().len(), 60);
        assert_eq!(
            ctx.prompt_tokens()[0],
            41,
            "should be the 42nd token (ID 41)"
        );
        assert_eq!(ctx.prompt_tokens()[59], 100);
        Ok(())
    }

    // max_new_tokens = 400 leaves 448 - 3 - 400 = 45 positions: the prevtext
    // marker plus 44 text tokens.
    #[test]
    fn test_prompt_context_caps_context_limit() -> Result<()> {
        let ids: Vec<u32> = (0u32..100).collect();
        let tokens = make_committed_tokens(&ids);
        let mut ctx = PromptContext::new(60, Some(99_999));
        ctx.update_after_eou(&tokens, 400);
        assert_eq!(
            ctx.prompt_tokens().len(),
            45,
            "prompt must fit: prevtext(1) + 44 text = 45 ≤ 448-3-400"
        );
        assert_eq!(
            ctx.prompt_tokens()[0],
            99_999,
            "first token must be <|prevtext|>"
        );
        Ok(())
    }

    // Only a special token was committed, so the prompt stays empty.
    #[test]
    fn test_prompt_context_no_prevtext_on_empty_commit() -> Result<()> {
        let tokens = vec![make_ts_token(50364, 0.0)];
        let mut ctx = PromptContext::new(60, Some(99_999));
        ctx.update_after_eou(&tokens, 128);
        assert!(
            ctx.prompt_tokens().is_empty(),
            "<|prevtext|> must not be emitted when no regular tokens were committed"
        );
        Ok(())
    }

    // `reset` empties a previously built prompt.
    #[test]
    fn test_prompt_context_reset() -> Result<()> {
        let tokens = make_committed_tokens(&[1, 2]);
        let mut ctx = PromptContext::new(60, None);
        ctx.update_after_eou(&tokens, 128);
        assert!(!ctx.prompt_tokens().is_empty());
        ctx.reset();
        assert!(ctx.prompt_tokens().is_empty());
        Ok(())
    }

    // With min_speech_secs = 0 the first window is emitted exactly at the
    // 1 s stride boundary, even for silence.
    #[test]
    #[ignore = "requires silero_vad.onnx in ./models"]
    fn test_chunker_first_window_at_stride_boundary() -> Result<()> {
        let vad = open_vad()?;
        let cfg = WindowConfig {
            max_window_secs: 3.0,
            stride_secs: 1.0,
            vad_threshold: 0.5,
            min_speech_secs: 0.0,
        };
        let mut chunker = Chunker::new(cfg, vad);

        let one_second = vec![0.0f32; SAMPLE_RATE as usize];

        let window = chunker.push(&one_second);

        let w = window.expect("first window should have fired after 1 s (stride_secs = 1.0)");

        assert_eq!(
            w.real_samples, SAMPLE_RATE as usize,
            "real_samples should equal exactly 1 s of input"
        );
        assert_eq!(
            w.samples.len(),
            3 * SAMPLE_RATE as usize,
            "samples vec must be max_window_secs × 16000 long"
        );
        assert!(
            (w.window_start_secs - 0.0).abs() < 1e-4,
            "window_start_secs should be 0.0"
        );
        assert!(!w.forced_eou, "first 1-s window should not be a forced EOU");

        Ok(())
    }

    // Filling the buffer to its cap yields a cap_hit window without forced_eou.
    #[test]
    #[ignore = "requires silero_vad.onnx in ./models"]
    fn test_chunker_cap_hit_no_auto_forced_eou() -> Result<()> {
        let vad = open_vad()?;
        let cfg = WindowConfig {
            max_window_secs: 2.0,
            stride_secs: 1.0,
            vad_threshold: 0.5,
            min_speech_secs: 0.0,
        };
        let mut chunker = Chunker::new(cfg, vad);

        let two_seconds = vec![0.0f32; 2 * SAMPLE_RATE as usize];
        let window = chunker
            .push(&two_seconds)
            .expect("cap-hit window should have fired");
        assert!(window.cap_hit, "cap_hit must be true at the cap");
        assert!(
            !window.forced_eou,
            "first cap-hit must NOT set forced_eou — trim_oldest is the primary path"
        );
        assert_eq!(window.real_samples, 2 * SAMPLE_RATE as usize);

        Ok(())
    }

    // A second cap-hit with no trim in between is flagged as forced_eou.
    #[test]
    #[ignore = "requires silero_vad.onnx in ./models"]
    fn test_chunker_cap_pending_then_forced_eou() -> Result<()> {
        let vad = open_vad()?;
        let cfg = WindowConfig {
            max_window_secs: 2.0,
            stride_secs: 1.0,
            vad_threshold: 0.5,
            min_speech_secs: 0.0,
        };
        let mut chunker = Chunker::new(cfg, vad);

        let two_seconds = vec![0.0f32; 2 * SAMPLE_RATE as usize];
        let w1 = chunker.push(&two_seconds).expect("first cap-hit");
        assert!(w1.cap_hit && !w1.forced_eou);

        // The buffer is still full, so a single extra sample hits the cap again.
        let w2 = chunker.push(&[0.0f32]).expect("second cap-hit");
        assert!(w2.cap_hit, "second window still flagged as cap_hit");
        assert!(
            w2.forced_eou,
            "second cap without trim must set forced_eou as safety net"
        );

        Ok(())
    }

    // trim_oldest rounds down to whole VAD frames and keeps the audio buffer
    // and the decision queue in step.
    #[test]
    #[ignore = "requires silero_vad.onnx in ./models"]
    fn test_chunker_trim_oldest_vad_frame_alignment() -> Result<()> {
        let vad = open_vad()?;
        let cfg = WindowConfig {
            max_window_secs: 2.0,
            stride_secs: 1.0,
            vad_threshold: 0.5,
            min_speech_secs: 0.0,
        };
        let mut chunker = Chunker::new(cfg, vad);

        let two_seconds = vec![0.0f32; 2 * SAMPLE_RATE as usize];
        chunker.push(&two_seconds).expect("cap-hit");
        let buf_before = chunker.buf.len();
        let vad_before = chunker.vad_decisions.len();
        assert_eq!(buf_before, 2 * SAMPLE_RATE as usize);
        // 32000 / 512 = 62.5 frames.
        assert!(vad_before == 62 || vad_before == 63);

        // 700 samples round down to one 512-sample frame.
        let trimmed = chunker.trim_oldest(700);
        assert_eq!(
            trimmed, VAD_FRAME_SIZE,
            "trim rounds down to frame multiple"
        );
        assert_eq!(
            chunker.buf.len(),
            buf_before - VAD_FRAME_SIZE,
            "buf shrunk by exactly one frame"
        );
        assert_eq!(
            chunker.vad_decisions.len(),
            vad_before - 1,
            "vad_decisions shrunk by exactly one entry"
        );

        // Less than one frame: nothing is trimmed.
        let trimmed_small = chunker.trim_oldest(100);
        assert_eq!(trimmed_small, 0);
        assert_eq!(chunker.buf.len(), buf_before - VAD_FRAME_SIZE);

        Ok(())
    }

    // A successful trim advances utterance_start_sample and clears the
    // pending cap-hit, so the next window is not a forced EOU.
    #[test]
    #[ignore = "requires silero_vad.onnx in ./models"]
    fn test_chunker_trim_oldest_advances_utterance_start_sample() -> Result<()> {
        let vad = open_vad()?;
        let cfg = WindowConfig {
            max_window_secs: 2.0,
            stride_secs: 1.0,
            vad_threshold: 0.5,
            min_speech_secs: 0.0,
        };
        let mut chunker = Chunker::new(cfg, vad);

        let two_seconds = vec![0.0f32; 2 * SAMPLE_RATE as usize];
        chunker.push(&two_seconds).expect("cap-hit");
        let start_before = chunker.utterance_start_sample;

        let target = SAMPLE_RATE as usize;
        let trimmed = chunker.trim_oldest(target);
        assert_eq!(
            trimmed,
            (target / VAD_FRAME_SIZE) * VAD_FRAME_SIZE,
            "trimmed exactly the aligned amount"
        );
        assert_eq!(
            chunker.utterance_start_sample,
            start_before + trimmed as u64,
            "utterance_start_sample advanced by trimmed samples"
        );

        let w = chunker.push(&[0.0f32]);
        if let Some(w) = w {
            assert!(
                !w.forced_eou,
                "after successful trim, the next push should not be flagged forced_eou"
            );
        }

        Ok(())
    }

    // on_trim commits the tentative tail, clears the last candidate and
    // keeps the committed text across the next ingest.
    #[test]
    fn test_committer_on_trim_force_commits_tentative_tail() -> Result<()> {
        let mut c = Committer::new();
        c.ingest(vec![make_token(1), make_token(2), make_token(3)]);
        c.ingest(vec![make_token(1), make_token(2), make_token(4)]);
        assert_eq!(c.committed_text(), "tok1tok2");

        let delta = c.on_trim();
        let trim_committed = committed_ids(&delta);
        assert_eq!(
            trim_committed,
            vec![4],
            "on_trim force-commits the tentative tail (tok4)"
        );
        assert!(
            c.last_candidate().is_empty(),
            "last_candidate cleared after on_trim"
        );
        assert_eq!(c.last_lcp_end_in_candidate(), 0, "LCP cursor reset");
        assert_eq!(
            c.committed_text(),
            "tok1tok2tok4",
            "committed_text now includes the force-committed tail"
        );

        let (comm, _) = c.ingest(vec![make_token(5), make_token(6)]);
        assert!(
            committed_ids(&comm).is_empty(),
            "first post-trim ingest commits nothing (no prior candidate)"
        );
        assert_eq!(
            c.committed_text(),
            "tok1tok2tok4",
            "committed_text not blown away by post-trim ingest"
        );

        Ok(())
    }

    // A candidate without timestamps followed by one with timestamps still
    // agrees on the content tokens.
    #[test]
    fn test_committer_lcp_mixed_timestamps() -> Result<()> {
        let mut c = Committer::new();
        c.ingest(vec![make_token(1), make_token(2)]);
        let (comm, _tent) = c.ingest(vec![
            make_ts_token(50364, 0.0),
            make_token(1),
            make_ts_token(50370, 0.2),
            make_token(2),
        ]);
        let ids = committed_ids(&comm);
        assert!(ids.contains(&1), "content token 1 must commit");
        assert!(ids.contains(&2), "content token 2 must commit");

        Ok(())
    }
}