whisperforge_core/
streaming.rs

1use std::collections::VecDeque;
2
3use crate::stream_decode::TokenEmit;
4use crate::vad_silero::SileroVad;
5
/// Audio sample rate in Hz; used throughout this module to convert between sample
/// counts and seconds.
const SAMPLE_RATE: u32 = 16_000;
/// Number of samples accumulated before each VAD probability query.
const VAD_FRAME_SIZE: usize = 512;
8
/// Tuning parameters for [`Chunker`]. All durations are in seconds of 16 kHz audio.
#[derive(Debug, Clone, PartialEq)]
pub struct WindowConfig {
    /// Hard cap on the growing decode buffer; once reached, the chunker emits a window with
    /// `cap_hit=true` and the consumer must call either [`Chunker::trim_oldest`] (continue
    /// the utterance by dropping the oldest ~1.5 s of audio) or [`Chunker::reset_utterance`]
    /// (forced EOU). Default 5.0 s — chosen so the autoregressive decoder on tiny.en CPU
    /// stays under `stride_secs` per window. If a second cap is hit without an intervening
    /// trim/reset, `forced_eou=true` fires as a safety net (see CLAUDE.md § Live-mic perf
    /// ceiling).
    pub max_window_secs: f32,
    /// How often to re-decode the growing buffer (default 1.0 s).
    pub stride_secs: f32,
    /// A VAD frame counts as speech when its probability is `>= vad_threshold`
    /// (default 0.5).
    pub vad_threshold: f32,
    /// Minimum accumulated speech before the first window of an utterance fires
    /// (debounces VAD flicker). Default 0.25 s.
    pub min_speech_secs: f32,
}

impl Default for WindowConfig {
    /// Returns the defaults quoted on each field: 5.0 s cap, 1.0 s stride,
    /// 0.5 VAD threshold, 0.25 s minimum speech.
    fn default() -> Self {
        Self {
            max_window_secs: 5.0,
            stride_secs: 1.0,
            vad_threshold: 0.5,
            min_speech_secs: 0.25,
        }
    }
}
36
/// One decode window emitted by [`Chunker::push`]: a padded snapshot of the current
/// utterance buffer plus the timing and VAD metadata the committer and endpointer need.
#[derive(Debug, Clone, PartialEq)]
pub struct StreamWindow {
    /// Exactly `max_window_secs × 16 000` samples; zero-padded at the BACK when the
    /// growing buffer hasn't reached `max_window_secs` yet.
    pub samples: Vec<f32>,
    /// Number of real (non-padding) samples in `samples`.
    pub real_samples: usize,
    /// Wall-clock offset since stream start of the first sample in the current utterance.
    pub window_start_secs: f32,
    /// True if any VAD-positive frame falls within the current utterance buffer.
    pub had_speech: bool,
    /// Seconds since the last VAD-positive frame; for use by the endpointer.
    pub trailing_silence_secs: f32,
    /// True when the buffer hit `max_window_secs`. The consumer must call either
    /// [`Chunker::trim_oldest`] or [`Chunker::reset_utterance`] before the next push;
    /// otherwise the next cap will additionally set `forced_eou=true`.
    pub cap_hit: bool,
    /// True only on the *second* consecutive cap without an intervening trim or reset.
    /// Treat as a forced EOU (finalize + `reset_utterance`) — the safety-net path when the
    /// trim wasn't applied (e.g. no prior commits to anchor against).
    pub forced_eou: bool,
}
58
/// Accumulates microphone samples into a growing per-utterance buffer, runs the VAD on
/// fixed-size frames, and emits [`StreamWindow`]s on stride boundaries and cap-hits.
pub struct Chunker {
    cfg: WindowConfig,
    vad: SileroVad,
    /// Growing audio buffer for the current utterance; never exceeds `max_window_secs * 16 kHz`.
    buf: Vec<f32>,
    /// Samples consumed since the last stride tick or cap-hit (both reset it to 0).
    pub samples_since_last_stride: usize,
    /// Total samples consumed by `push` since construction; not reset between utterances.
    pub total_samples_seen: u64,
    /// Value of `total_samples_seen` at the end of the most recent VAD-positive frame;
    /// `None` until the first speech frame is seen.
    pub last_speech_at_sample: Option<u64>,

    // --- private implementation details ---

    // Speech samples accumulated within the current utterance; gates the first stride.
    speech_samples_accumulated: u64,
    // Partial accumulation buffer for in-progress VAD frames.
    vad_frame_buf: Vec<f32>,
    // Once the first stride of the current utterance has fired, subsequent strides
    // fire even during silence (so the endpointer can see growing trailing silence).
    window_ever_emitted: bool,
    // VAD decisions for the current utterance's audio buffer.
    vad_decisions: VecDeque<bool>,
    // Sample index (in stream-relative terms) where the current utterance's audio[0] sits.
    utterance_start_sample: u64,
    // True after a cap_hit window is emitted, until `trim_oldest` or `reset_utterance`
    // is called. If a second cap fires while still pending, the safety-net `forced_eou`
    // path activates so a misbehaving caller can't lose audio indefinitely.
    cap_pending_handler: bool,
    // Samples held back when `push` returned early due to a cap-hit. Prepended to the
    // next `push` call so the audio is not lost. Eliminates the case where multiple
    // cap-hits fire within a single `push` batch and the consumer only sees the last.
    deferred_samples: Vec<f32>,
}
90
91impl Chunker {
92    pub fn new(cfg: WindowConfig, vad: SileroVad) -> Self {
93        let max_samples = (cfg.max_window_secs * SAMPLE_RATE as f32) as usize;
94        let max_vad_frames = max_samples.div_ceil(VAD_FRAME_SIZE);
95        Self {
96            cfg,
97            vad,
98            buf: Vec::with_capacity(max_samples),
99            samples_since_last_stride: 0,
100            total_samples_seen: 0,
101            last_speech_at_sample: None,
102            speech_samples_accumulated: 0,
103            vad_frame_buf: Vec::with_capacity(VAD_FRAME_SIZE),
104            window_ever_emitted: false,
105            vad_decisions: VecDeque::with_capacity(max_vad_frames),
106            utterance_start_sample: 0,
107            cap_pending_handler: false,
108            deferred_samples: Vec::new(),
109        }
110    }
111
112    /// Push new microphone samples; returns `Some(window)` when a stride boundary fires
113    /// or the buffer hits the `max_window_secs` cap.
114    ///
115    /// The buffer GROWS — it doesn't slide. Consumers must call [`trim_oldest`] or
116    /// [`reset_utterance`] after a cap-hit window so the next push has room.
117    ///
118    /// On cap-hit, the call returns early with the cap-hit window; any remaining input
119    /// samples are deferred internally and prepended to the next `push` call. This avoids
120    /// the failure mode where multiple cap-hits fire within one batch and only the last
121    /// (forced_eou) window reaches the consumer.
122    pub fn push(&mut self, samples: &[f32]) -> Option<StreamWindow> {
123        let stride_samples = (self.cfg.stride_secs * SAMPLE_RATE as f32) as usize;
124        let max_samples = (self.cfg.max_window_secs * SAMPLE_RATE as f32) as usize;
125        let min_speech_samples = (self.cfg.min_speech_secs * SAMPLE_RATE as f32) as u64;
126        let max_vad_frames = max_samples.div_ceil(VAD_FRAME_SIZE);
127
128        let mut all_samples: Vec<f32> = std::mem::take(&mut self.deferred_samples);
129        all_samples.extend_from_slice(samples);
130
131        let mut result: Option<StreamWindow> = None;
132
133        for (idx, &s) in all_samples.iter().enumerate() {
134            if self.buf.len() < max_samples {
135                self.buf.push(s);
136            }
137            self.total_samples_seen += 1;
138            self.samples_since_last_stride += 1;
139
140            // Accumulate into the current VAD frame.
141            self.vad_frame_buf.push(s);
142            if self.vad_frame_buf.len() == VAD_FRAME_SIZE {
143                let frame: &[f32; VAD_FRAME_SIZE] = self
144                    .vad_frame_buf
145                    .as_slice()
146                    .try_into()
147                    .expect("vad_frame_buf is exactly VAD_FRAME_SIZE");
148                let is_speech = self
149                    .vad
150                    .probability(frame)
151                    .map(|p| p >= self.cfg.vad_threshold)
152                    .unwrap_or(false);
153                self.vad_frame_buf.clear();
154
155                if is_speech {
156                    self.last_speech_at_sample = Some(self.total_samples_seen);
157                    self.speech_samples_accumulated += VAD_FRAME_SIZE as u64;
158                }
159
160                if self.vad_decisions.len() >= max_vad_frames {
161                    self.vad_decisions.pop_front();
162                }
163                self.vad_decisions.push_back(is_speech);
164            }
165
166            // Buffer hit the cap. Mark cap_hit=true; the consumer chooses between a
167            // stride-based trim (continue the utterance) or a reset (forced EOU). If
168            // a previous cap-hit is still pending unhandled, additionally set
169            // forced_eou=true as a safety net.
170            //
171            // We return early here so the consumer always sees the FIRST cap-hit of
172            // each batch (not the last). Remaining samples are deferred to next push.
173            if self.buf.len() >= max_samples {
174                let mut w = self.build_window(max_samples);
175                w.cap_hit = true;
176                if self.cap_pending_handler {
177                    w.forced_eou = true;
178                }
179                self.cap_pending_handler = true;
180                self.samples_since_last_stride = 0;
181                self.deferred_samples = all_samples[idx + 1..].to_vec();
182                return Some(w);
183            }
184
185            // Regular stride tick.
186            if self.samples_since_last_stride >= stride_samples {
187                let enough_speech = self.speech_samples_accumulated >= min_speech_samples;
188                if enough_speech || self.window_ever_emitted {
189                    result = Some(self.build_window(max_samples));
190                    self.window_ever_emitted = true;
191                }
192                self.samples_since_last_stride = 0;
193            }
194        }
195
196        result
197    }
198
199    fn build_window(&self, max_samples: usize) -> StreamWindow {
200        let real_samples = self.buf.len();
201        let mut samples = Vec::with_capacity(max_samples);
202        samples.extend_from_slice(&self.buf);
203        samples.resize(max_samples, 0.0);
204
205        let trailing_silence_secs = match self.last_speech_at_sample {
206            Some(last) => self.total_samples_seen.saturating_sub(last) as f32 / SAMPLE_RATE as f32,
207            None => {
208                self.total_samples_seen
209                    .saturating_sub(self.utterance_start_sample) as f32
210                    / SAMPLE_RATE as f32
211            }
212        };
213
214        let window_start_secs = self.utterance_start_sample as f32 / SAMPLE_RATE as f32;
215        let had_speech = self.vad_decisions.iter().any(|&b| b);
216
217        StreamWindow {
218            samples,
219            real_samples,
220            window_start_secs,
221            had_speech,
222            trailing_silence_secs,
223            cap_hit: false,
224            forced_eou: false,
225        }
226    }
227
228    /// Drop the current utterance buffer so the next stride starts fresh. Call after
229    /// `committer.finalize_utterance()` (either natural EOU or `forced_eou`). Wall-clock
230    /// counters and the last-speech timestamp persist so trailing-silence math stays valid
231    /// across utterance boundaries.
232    pub fn reset_utterance(&mut self) {
233        self.buf.clear();
234        self.samples_since_last_stride = 0;
235        self.vad_decisions.clear();
236        self.speech_samples_accumulated = 0;
237        self.window_ever_emitted = false;
238        self.utterance_start_sample = self.total_samples_seen;
239        self.cap_pending_handler = false;
240        self.deferred_samples.clear();
241    }
242
243    /// Drop the oldest `samples` from the front of the buffer at a committed-token
244    /// boundary, keeping the utterance going. Rounded down to a `VAD_FRAME_SIZE` multiple
245    /// so `vad_decisions` stays in lock-step with `buf`. Advances `utterance_start_sample`
246    /// to preserve wall-clock anchoring; `current_secs = window_start_secs + real_samples/SR`
247    /// is invariant (the increase in window_start exactly cancels the decrease in real_samples).
248    ///
249    /// Returns the number of samples actually trimmed (0 if the requested amount is less
250    /// than one VAD frame or would empty the buffer). Returning 0 leaves `cap_pending_handler`
251    /// set so the next cap escalates to forced-EOU.
252    pub fn trim_oldest(&mut self, samples: usize) -> usize {
253        let trim_frames = samples / VAD_FRAME_SIZE;
254        let trim_samples = trim_frames * VAD_FRAME_SIZE;
255        if trim_samples == 0 || trim_samples >= self.buf.len() {
256            return 0;
257        }
258        self.buf.drain(..trim_samples);
259        for _ in 0..trim_frames {
260            if self.vad_decisions.pop_front().unwrap_or(false) {
261                self.speech_samples_accumulated = self
262                    .speech_samples_accumulated
263                    .saturating_sub(VAD_FRAME_SIZE as u64);
264            }
265        }
266        self.utterance_start_sample += trim_samples as u64;
267        self.cap_pending_handler = false;
268        trim_samples
269    }
270}
271
/// Outcome of one `Committer::ingest`, `finalize_utterance`, or `on_trim` call.
pub enum CommitDelta {
    /// Newly stable tokens since the previous call. May be empty.
    Committed {
        /// Tokens appended to the committed list by this call.
        new_tokens: Vec<TokenEmit>,
        /// Concatenated text of the non-special tokens in `new_tokens`.
        new_text: String,
    },
    /// Current tentative tail (everything after the committed prefix). Updated every round.
    Tentative {
        /// Tokens of the latest candidate from the LCP boundary onward.
        tokens: Vec<TokenEmit>,
        /// Concatenated text of the non-special tokens in `tokens`.
        text: String,
    },
}
285
/// LocalAgreement-2 token committer.
///
/// Compares successive window decode outputs and commits the longest stable
/// prefix; everything past that prefix is tentative until the next round confirms it.
pub struct Committer {
    /// Candidate passed to the most recent `ingest`; cleared by `finalize_utterance`
    /// and `on_trim`.
    last_candidate: Vec<TokenEmit>,
    /// Tokens committed so far in the current utterance.
    committed: Vec<TokenEmit>,
    /// Text of the non-special tokens in `committed`, appended to as tokens are committed.
    committed_text: String,
    /// Set by `finalize_utterance`; the next `ingest` auto-resets the committer's per-
    /// utterance state. This lets the caller read `committed_tokens` / `committed_text`
    /// between finalize and the next ingest (e.g. for the `<|prevtext|>` prompt prefix)
    /// without smearing the previous utterance into the next one.
    awaiting_reset: bool,
    /// Exclusive end index of the most recent LCP within the most recent candidate's
    /// original token-list (timestamp-bearing) space. Used by the streaming caller to
    /// locate the latest timestamp inside the committed prefix when computing trim points.
    last_lcp_end_in_candidate: usize,
}
304
305impl Committer {
306    pub fn new() -> Self {
307        Self {
308            last_candidate: Vec::new(),
309            committed: Vec::new(),
310            committed_text: String::new(),
311            awaiting_reset: false,
312            last_lcp_end_in_candidate: 0,
313        }
314    }
315
316    /// Ingest a new full-window decode result.
317    ///
318    /// Returns `(committed_delta, tentative_delta)`.
319    pub fn ingest(&mut self, candidate: Vec<TokenEmit>) -> (CommitDelta, CommitDelta) {
320        if self.awaiting_reset {
321            self.last_candidate.clear();
322            self.committed.clear();
323            self.committed_text.clear();
324            self.awaiting_reset = false;
325        }
326        let lcp = lcp_len(&self.last_candidate, &candidate);
327        self.last_lcp_end_in_candidate = lcp;
328        let prev_committed_len = self.committed.len();
329
330        // Commit candidate[prev_committed_len..lcp] when lcp has advanced past what we've
331        // already committed. If lcp < prev_committed_len the new decode regressed on
332        // committed tokens — committed tokens are final, so we simply don't extend.
333        if lcp > prev_committed_len {
334            for t in candidate[prev_committed_len..lcp].iter() {
335                self.committed.push(t.clone());
336            }
337        }
338
339        let new_text = build_text(&self.committed[prev_committed_len..]);
340        self.committed_text.push_str(&new_text);
341
342        let new_tokens: Vec<TokenEmit> = self.committed[prev_committed_len..].to_vec();
343        let tentative_tokens: Vec<TokenEmit> = candidate[lcp..].to_vec();
344        let tentative_text = build_text(&tentative_tokens);
345
346        self.last_candidate = candidate;
347
348        (
349            CommitDelta::Committed {
350                new_tokens,
351                new_text,
352            },
353            CommitDelta::Tentative {
354                tokens: tentative_tokens,
355                text: tentative_text,
356            },
357        )
358    }
359
360    /// Force-commit everything tentative. Called by the endpointer on EOU.
361    ///
362    /// After this call, `committed_tokens` / `committed_text` still reflect the just-
363    /// finalised utterance (so the caller can build a `<|prevtext|>` prompt from it).
364    /// The next `ingest` will auto-reset the committer for the new utterance. Repeated
365    /// `finalize_utterance` calls without an intervening `ingest` are a no-op.
366    pub fn finalize_utterance(&mut self) -> CommitDelta {
367        let start = self.committed.len();
368        let new_tokens: Vec<TokenEmit> = if start < self.last_candidate.len() {
369            self.last_candidate[start..].to_vec()
370        } else {
371            Vec::new()
372        };
373        let new_text = build_text(&new_tokens);
374        self.committed_text.push_str(&new_text);
375        self.committed.extend(new_tokens.iter().cloned());
376        self.last_candidate.clear();
377        self.awaiting_reset = true;
378        CommitDelta::Committed {
379            new_tokens,
380            new_text,
381        }
382    }
383
384    pub fn committed_tokens(&self) -> &[TokenEmit] {
385        &self.committed
386    }
387
388    pub fn committed_text(&self) -> &str {
389        &self.committed_text
390    }
391
392    /// The most recent candidate (the argument to the most recent `ingest`). Empty
393    /// after `finalize_utterance` (until the next `ingest`) and after `on_trim`.
394    pub fn last_candidate(&self) -> &[TokenEmit] {
395        &self.last_candidate
396    }
397
398    /// Exclusive end index of the most recent LCP within `last_candidate()`. The streaming
399    /// caller uses this with `last_candidate()` to find the latest timestamp inside the
400    /// committed prefix when picking a trim boundary on a cap-hit window.
401    pub fn last_lcp_end_in_candidate(&self) -> usize {
402        self.last_lcp_end_in_candidate
403    }
404
405    /// Called after `Chunker::trim_oldest` succeeds. Force-commits the current tentative
406    /// tail (the content past the LCP boundary in the most recent decode) and clears
407    /// `last_candidate` so the next ingest establishes a fresh baseline. Returns the
408    /// force-committed text so the caller can mirror it to its output sink.
409    ///
410    /// Rationale: without force-commit, the tentative tail is lost after the trim — the
411    /// next stride decodes overlapping audio in a different buffer context, the non-causal
412    /// encoder produces different tokens, and LCP doesn't reconfirm. Empirically this loses
413    /// roughly 1.5 s of content per cap-hit. Force-committing trusts the cap-hit window's
414    /// tail; duplication risk is mitigated by clearing `last_candidate` (post-trim strides
415    /// can't LCP against the pre-trim tail).
416    pub fn on_trim(&mut self) -> CommitDelta {
417        let start = self.committed.len();
418        let new_tokens: Vec<TokenEmit> = if start < self.last_candidate.len() {
419            self.last_candidate[start..].to_vec()
420        } else {
421            Vec::new()
422        };
423        let new_text = build_text(&new_tokens);
424        self.committed_text.push_str(&new_text);
425        self.committed.extend(new_tokens.iter().cloned());
426        self.last_candidate.clear();
427        self.last_lcp_end_in_candidate = 0;
428        CommitDelta::Committed {
429            new_tokens,
430            new_text,
431        }
432    }
433}
434
435impl Default for Committer {
436    fn default() -> Self {
437        Self::new()
438    }
439}
440
441/// Join text pieces for non-special tokens.
442fn build_text(tokens: &[TokenEmit]) -> String {
443    tokens
444        .iter()
445        .filter(|t| !t.is_special)
446        .map(|t| t.text.as_str())
447        .collect()
448}
449
450/// Length of the longest common non-timestamp-token prefix between `a` and `b`,
451/// expressed as an exclusive end index in `b`.
452///
453/// Timestamp tokens (`window_ts_secs.is_some()`) are skipped in both sequences
454/// so that differing timestamp values between overlapping windows don't break the match.
455fn lcp_len(a: &[TokenEmit], b: &[TokenEmit]) -> usize {
456    let a_content: Vec<(usize, u32)> = a
457        .iter()
458        .enumerate()
459        .filter(|(_, t)| t.window_ts_secs.is_none())
460        .map(|(i, t)| (i, t.id))
461        .collect();
462    let b_content: Vec<(usize, u32)> = b
463        .iter()
464        .enumerate()
465        .filter(|(_, t)| t.window_ts_secs.is_none())
466        .map(|(i, t)| (i, t.id))
467        .collect();
468
469    let lcp_n = a_content
470        .iter()
471        .zip(b_content.iter())
472        .take_while(|((_, aid), (_, bid))| aid == bid)
473        .count();
474
475    if lcp_n == 0 {
476        return 0;
477    }
478    b_content[lcp_n - 1].0 + 1
479}
480
/// Thresholds for [`Endpointer`]. All values are in seconds.
#[derive(Debug, Clone, PartialEq)]
pub struct EndpointConfig {
    /// Hard EOU: minimum trailing silence (seconds) after which an utterance is ended (default 2.0).
    pub silence_secs: f32,
    /// Soft EOU: minimum trailing silence after a terminal punctuation mark (default 0.8).
    pub punct_silence_secs: f32,
    /// Suppress EOU if the utterance has been running for less than this many seconds (default 0.5).
    pub min_utterance_secs: f32,
}

impl Default for EndpointConfig {
    /// Returns the defaults quoted on each field: 2.0 s hard silence, 0.8 s
    /// post-punctuation silence, 0.5 s minimum utterance length.
    fn default() -> Self {
        Self {
            silence_secs: 2.0,
            punct_silence_secs: 0.8,
            min_utterance_secs: 0.5,
        }
    }
}
499
/// Hybrid silence + punctuation end-of-utterance detector.
///
/// Call `step` after each chunker + committer round. When it returns `true`, fire the EOU
/// event, then call `reset` before the next utterance.
pub struct Endpointer {
    cfg: EndpointConfig,
    /// Byte-length of committed text when this utterance baseline was captured.
    text_len_at_reset: usize,
    /// Prevents re-firing before the caller calls `reset`.
    fired: bool,
    /// Set by `reset`; the next `step` call captures the current committed-text length
    /// as the new baseline (avoids requiring committed text as a parameter to `reset`).
    needs_baseline_update: bool,
    /// `window_start_secs` of the window in which committed text beyond the baseline was
    /// first observed after the last reset; `None` until then.
    utterance_start_secs: Option<f32>,
}
516
517impl Endpointer {
518    pub fn new(cfg: EndpointConfig) -> Self {
519        Self {
520            cfg,
521            text_len_at_reset: 0,
522            fired: false,
523            needs_baseline_update: false,
524            utterance_start_secs: None,
525        }
526    }
527
528    /// Called after each chunker tick + committer round. Returns `true` if EOU should fire.
529    pub fn step(&mut self, window: &StreamWindow, latest_committed_text: &str) -> bool {
530        if self.fired {
531            return false;
532        }
533
534        if self.needs_baseline_update {
535            self.text_len_at_reset = latest_committed_text.len();
536            self.needs_baseline_update = false;
537        }
538
539        let has_new_committed = latest_committed_text.len() > self.text_len_at_reset;
540
541        if has_new_committed && self.utterance_start_secs.is_none() {
542            self.utterance_start_secs = Some(window.window_start_secs);
543        }
544
545        let current_secs =
546            window.window_start_secs + window.real_samples as f32 / SAMPLE_RATE as f32;
547        let utterance_secs = self
548            .utterance_start_secs
549            .map(|start| current_secs - start)
550            .unwrap_or(0.0);
551        if utterance_secs < self.cfg.min_utterance_secs {
552            return false;
553        }
554
555        if has_new_committed && window.trailing_silence_secs >= self.cfg.silence_secs {
556            self.fired = true;
557            return true;
558        }
559
560        if has_new_committed {
561            let new_text = &latest_committed_text[self.text_len_at_reset..];
562            if matches!(new_text.chars().last(), Some('.') | Some('!') | Some('?'))
563                && window.trailing_silence_secs >= self.cfg.punct_silence_secs
564            {
565                self.fired = true;
566                return true;
567            }
568        }
569
570        false
571    }
572
573    /// Reset state after an EOU has been handled. The next `step` call will capture the
574    /// current committed-text length as the baseline for the new utterance.
575    pub fn reset(&mut self) {
576        self.fired = false;
577        self.utterance_start_secs = None;
578        self.needs_baseline_update = true;
579    }
580}
581
/// Manages the cross-utterance prompt prefix fed to the streaming decoder.
///
/// Call `update_after_eou` immediately after an EOU fires, passing the full committed
/// token list from `Committer::committed_tokens`. Then pass `prompt_tokens()` into
/// `DecodeContext::prompt_tokens` for every window until the next EOU.
pub struct PromptContext {
    /// Maximum number of committed regular-token IDs to carry forward (default 60).
    max_prompt_tokens: usize,
    /// The `<|prevtext|>` token ID, or `None` for models that lack it (e.g. tiny.en).
    prevtext_token_id: Option<u32>,
    /// Prompt built after the last EOU; empty before the first EOU and after `reset`.
    current_prompt: Vec<u32>,
}
595
596impl PromptContext {
597    /// `prevtext_token_id`: result of `tokenizer.token_to_id("<|prevtext|>")`.
598    /// Pass `None` for English-only models that lack `<|prevtext|>` in their vocabulary.
599    pub fn new(max_prompt_tokens: usize, prevtext_token_id: Option<u32>) -> Self {
600        Self {
601            max_prompt_tokens,
602            prevtext_token_id,
603            current_prompt: Vec::new(),
604        }
605    }
606
607    /// Update the stored prompt after an EOU.
608    ///
609    /// Extracts the last `max_prompt_tokens` regular-token IDs (filter: `!is_special`),
610    /// prepends `<|prevtext|>` when available and the tail is non-empty, and caps the total
611    /// so that `prompt.len() + sot(1) + lang(1) + transcribe(1) + max_new_tokens ≤ 448`.
612    pub fn update_after_eou(&mut self, committed_tokens: &[TokenEmit], max_new_tokens: usize) {
613        // Whisper's decoder context window is 448 tokens.
614        // 3 slots are reserved for the sot + language + transcribe init tokens.
615        let max_allowed = 448usize.saturating_sub(3 + max_new_tokens);
616        // Reserve one additional slot for the <|prevtext|> prefix if it will be emitted.
617        let text_slots = if self.prevtext_token_id.is_some() {
618            max_allowed.saturating_sub(1)
619        } else {
620            max_allowed
621        };
622        let cap = self.max_prompt_tokens.min(text_slots);
623
624        // Collect regular (non-special) token IDs — same predicate as build_text.
625        let regular_ids: Vec<u32> = committed_tokens
626            .iter()
627            .filter(|t| !t.is_special)
628            .map(|t| t.id)
629            .collect();
630
631        let start = regular_ids.len().saturating_sub(cap);
632        let tail = &regular_ids[start..];
633
634        self.current_prompt.clear();
635        // Only emit <|prevtext|> when there are actual text tokens to accompany it.
636        if let Some(prevtext) = self.prevtext_token_id
637            && !tail.is_empty()
638        {
639            self.current_prompt.push(prevtext);
640        }
641        self.current_prompt.extend_from_slice(tail);
642    }
643
644    /// Returns the prompt tokens to pass into `DecodeContext::prompt_tokens`.
645    /// Empty before the first EOU or after `reset`.
646    pub fn prompt_tokens(&self) -> &[u32] {
647        &self.current_prompt
648    }
649
650    /// Clear the stored prompt (call on stream restart or explicit reset).
651    pub fn reset(&mut self) {
652        self.current_prompt.clear();
653    }
654}
655
656#[cfg(test)]
657mod tests {
658    use super::*;
659    use crate::vad_silero::ensure_silero_model;
660    use anyhow::Result;
661    use std::path::PathBuf;
662
663    fn models_dir() -> PathBuf {
664        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
665            .parent()
666            .expect("workspace root")
667            .join("models")
668    }
669
670    fn open_vad() -> Result<SileroVad> {
671        let path = ensure_silero_model(&models_dir())?;
672        SileroVad::open(&path)
673    }
674
675    /// Read all f32 samples from a 16-bit PCM or IEEE float32 WAV, returning raw f32.
676    fn read_wav_samples(path: &std::path::Path) -> Result<Vec<f32>> {
677        use anyhow::Context;
678        let bytes = std::fs::read(path).with_context(|| format!("read {}", path.display()))?;
679        let mut pos = 12usize;
680        let mut data_start = None;
681        let mut data_len = 0usize;
682        let mut audio_format = 1u16; // PCM default
683        while pos + 8 <= bytes.len() {
684            let id = &bytes[pos..pos + 4];
685            let size = u32::from_le_bytes(bytes[pos + 4..pos + 8].try_into().unwrap()) as usize;
686            if id == b"fmt " {
687                audio_format = u16::from_le_bytes(bytes[pos + 8..pos + 10].try_into().unwrap());
688            } else if id == b"data" {
689                data_start = Some(pos + 8);
690                data_len = size;
691                break;
692            }
693            pos += 8 + size + (size & 1);
694        }
695        let start = data_start.context("no 'data' chunk in WAV")?;
696        let end = (start + data_len).min(bytes.len());
697        if audio_format == 3 {
698            // IEEE float32
699            let n = (end - start) / 4;
700            let mut samples = Vec::with_capacity(n);
701            for i in 0..n {
702                let b: [u8; 4] = bytes[start + i * 4..start + i * 4 + 4].try_into().unwrap();
703                samples.push(f32::from_le_bytes(b));
704            }
705            Ok(samples)
706        } else {
707            // 16-bit PCM
708            let n = (end - start) / 2;
709            let mut samples = Vec::with_capacity(n);
710            for i in 0..n {
711                let b: [u8; 2] = bytes[start + i * 2..start + i * 2 + 2].try_into().unwrap();
712                samples.push(i16::from_le_bytes(b) as f32 / 32768.0);
713            }
714            Ok(samples)
715        }
716    }
717
718    #[test]
719    #[ignore = "requires silero_vad.onnx in ./models AND test_data/LJ001-0001_16k.wav at repo root"]
720    fn test_chunker_speech_windows_cluster() -> Result<()> {
721        let vad = open_vad()?;
722        let cfg = WindowConfig {
723            max_window_secs: 28.0,
724            stride_secs: 1.0,
725            vad_threshold: 0.5,
726            min_speech_secs: 0.25,
727        };
728        let mut chunker = Chunker::new(cfg, vad);
729
730        let repo_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
731            .parent()
732            .expect("workspace root")
733            .to_path_buf();
734        let speech_wav = repo_root.join("test_data").join("LJ001-0001_16k.wav");
735        assert!(
736            speech_wav.exists(),
737            "speech WAV not found: {}",
738            speech_wav.display()
739        );
740        let speech_samples = read_wav_samples(&speech_wav)?;
741        // Use up to 2 s of speech.
742        let speech_len = (2 * SAMPLE_RATE as usize).min(speech_samples.len());
743        let speech_clip = &speech_samples[..speech_len];
744
745        // 4 s silence + 2 s speech + 4 s silence ≈ 10 s total.
746        let silence_4s = vec![0.0f32; 4 * SAMPLE_RATE as usize];
747        let silence_4s_end = vec![0.0f32; 4 * SAMPLE_RATE as usize];
748
749        let chunk_size = 512usize;
750        let mut windows: Vec<StreamWindow> = Vec::new();
751
752        let all_samples: Vec<f32> = silence_4s
753            .iter()
754            .chain(speech_clip.iter())
755            .chain(silence_4s_end.iter())
756            .copied()
757            .collect();
758
759        for chunk in all_samples.chunks(chunk_size) {
760            if let Some(w) = chunker.push(chunk) {
761                windows.push(w);
762            }
763        }
764
765        // At least one window with speech should have fired (the growing buffer keeps the
766        // speech samples from the middle 2 s of audio for all subsequent windows).
767        assert!(!windows.is_empty(), "no windows emitted");
768        let speech_windows: Vec<&StreamWindow> = windows.iter().filter(|w| w.had_speech).collect();
769        assert!(!speech_windows.is_empty(), "no speech windows detected");
770
771        // window_start_secs anchors to where the *current utterance buffer* started — which
772        // here is 0 (we never reset). Just sanity-check it's a reasonable wall-clock value.
773        for w in &speech_windows {
774            assert!(
775                w.window_start_secs >= 0.0 && w.window_start_secs <= 11.0,
776                "speech window start {:.2} s outside expected 0–11 s range",
777                w.window_start_secs
778            );
779        }
780
781        Ok(())
782    }
783
784    fn make_token(id: u32) -> TokenEmit {
785        TokenEmit {
786            id,
787            text: format!("tok{id}"),
788            logprob: 0.0,
789            window_ts_secs: None,
790            is_special: false,
791        }
792    }
793
794    fn make_ts_token(id: u32, ts: f32) -> TokenEmit {
795        TokenEmit {
796            id,
797            text: String::new(),
798            logprob: 0.0,
799            window_ts_secs: Some(ts),
800            is_special: true,
801        }
802    }
803
804    fn committed_ids(delta: &CommitDelta) -> Vec<u32> {
805        match delta {
806            CommitDelta::Committed { new_tokens, .. } => new_tokens.iter().map(|t| t.id).collect(),
807            CommitDelta::Tentative { .. } => vec![],
808        }
809    }
810
811    fn tentative_ids(delta: &CommitDelta) -> Vec<u32> {
812        match delta {
813            CommitDelta::Tentative { tokens, .. } => tokens.iter().map(|t| t.id).collect(),
814            CommitDelta::Committed { .. } => vec![],
815        }
816    }
817
818    /// LocalAgreement-2: three-round scenario from the C7 acceptance criteria.
819    #[test]
820    fn test_committer_local_agreement_2() -> Result<()> {
821        let mut c = Committer::new();
822
823        // Round 1: [tok1, tok2, tok3] — nothing can be committed yet (no prior candidate).
824        let (comm, tent) = c.ingest(vec![make_token(1), make_token(2), make_token(3)]);
825        assert!(
826            committed_ids(&comm).is_empty(),
827            "round 1: nothing committed"
828        );
829        assert_eq!(
830            tentative_ids(&tent),
831            vec![1, 2, 3],
832            "round 1: all tokens tentative"
833        );
834
835        // Round 2: [tok1, tok2, tok4, tok5] — LCP with round-1 is [tok1, tok2].
836        let (comm, tent) = c.ingest(vec![
837            make_token(1),
838            make_token(2),
839            make_token(4),
840            make_token(5),
841        ]);
842        assert_eq!(
843            committed_ids(&comm),
844            vec![1, 2],
845            "round 2: [tok1, tok2] committed"
846        );
847        assert_eq!(
848            tentative_ids(&tent),
849            vec![4, 5],
850            "round 2: tentative = [tok4, tok5]"
851        );
852        assert_eq!(c.committed_text(), "tok1tok2");
853
854        // Round 3: [tok1, tok2, tok4, tok6] — LCP with round-2 is [tok1, tok2, tok4].
855        let (comm, tent) = c.ingest(vec![
856            make_token(1),
857            make_token(2),
858            make_token(4),
859            make_token(6),
860        ]);
861        assert_eq!(
862            committed_ids(&comm),
863            vec![4],
864            "round 3: [tok4] newly committed"
865        );
866        assert_eq!(tentative_ids(&tent), vec![6], "round 3: tentative = [tok6]");
867        assert_eq!(c.committed_text(), "tok1tok2tok4");
868
869        // finalize_utterance: force-commit the remaining [tok6].
870        let final_delta = c.finalize_utterance();
871        assert_eq!(
872            committed_ids(&final_delta),
873            vec![6],
874            "finalize: [tok6] committed"
875        );
876        assert_eq!(c.committed_text(), "tok1tok2tok4tok6");
877        assert_eq!(
878            c.committed_tokens()
879                .iter()
880                .map(|t| t.id)
881                .collect::<Vec<_>>(),
882            vec![1, 2, 4, 6],
883            "committed_tokens must remain readable after finalize for PromptContext"
884        );
885
886        // Repeated finalize without an intervening ingest must not panic and must yield empty.
887        let noop = c.finalize_utterance();
888        assert!(
889            committed_ids(&noop).is_empty(),
890            "second finalize must be a no-op"
891        );
892
893        // Next ingest auto-resets the committer; the new utterance must start fresh and
894        // commit normally rather than refusing because cumulative `committed.len()` is high.
895        c.ingest(vec![make_token(10), make_token(20), make_token(30)]);
896        let (comm, _) = c.ingest(vec![make_token(10), make_token(20), make_token(40)]);
897        assert_eq!(
898            committed_ids(&comm),
899            vec![10, 20],
900            "new utterance must commit [tok10, tok20] after auto-reset"
901        );
902
903        Ok(())
904    }
905
906    /// Timestamp tokens between words must not break LCP matching.
907    #[test]
908    fn test_committer_lcp_ignores_timestamps() -> Result<()> {
909        let mut c = Committer::new();
910        // Round 1: [ts:0.0, tok1, ts:0.2, tok2]
911        c.ingest(vec![
912            make_ts_token(50364, 0.0),
913            make_token(1),
914            make_ts_token(50366, 0.2),
915            make_token(2),
916        ]);
917        // Round 2: same content tokens but different timestamp values
918        let (comm, _tent) = c.ingest(vec![
919            make_ts_token(50365, 0.02),
920            make_token(1),
921            make_ts_token(50367, 0.22),
922            make_token(2),
923        ]);
924        assert_eq!(
925            committed_ids(&comm),
926            vec![50365, 1, 50367, 2],
927            "content tokens tok1/tok2 match despite different timestamp IDs; both windows committed"
928        );
929        Ok(())
930    }
931
932    fn make_window(trailing_silence_secs: f32, window_start_secs: f32) -> StreamWindow {
933        let real_samples = 5 * SAMPLE_RATE as usize;
934        StreamWindow {
935            samples: vec![0.0f32; real_samples],
936            real_samples,
937            window_start_secs,
938            had_speech: trailing_silence_secs < 5.0,
939            trailing_silence_secs,
940            cap_hit: false,
941            forced_eou: false,
942        }
943    }
944
945    #[test]
946    fn test_endpointer_no_eou_silence_no_text() -> Result<()> {
947        let cfg = EndpointConfig {
948            silence_secs: 2.0,
949            punct_silence_secs: 0.8,
950            min_utterance_secs: 0.0,
951        };
952        let mut ep = Endpointer::new(cfg);
953        let window = make_window(2.5, 0.0);
954        assert!(!ep.step(&window, ""), "no EOU when committed text is empty");
955        Ok(())
956    }
957
958    #[test]
959    fn test_endpointer_hard_eou_fires_once() -> Result<()> {
960        let cfg = EndpointConfig {
961            silence_secs: 2.0,
962            punct_silence_secs: 0.8,
963            min_utterance_secs: 0.0,
964        };
965        let mut ep = Endpointer::new(cfg);
966
967        // Speech in progress — not enough silence yet.
968        assert!(!ep.step(&make_window(0.3, 0.0), "hello world"));
969        // Long trailing silence — hard EOU fires.
970        assert!(
971            ep.step(&make_window(2.5, 1.0), "hello world"),
972            "hard EOU should fire"
973        );
974        // Without reset, must not re-fire.
975        assert!(
976            !ep.step(&make_window(3.0, 2.0), "hello world"),
977            "must not re-fire before reset"
978        );
979
980        Ok(())
981    }
982
983    #[test]
984    fn test_endpointer_soft_eou_punctuation() -> Result<()> {
985        let cfg = EndpointConfig {
986            silence_secs: 2.0,
987            punct_silence_secs: 0.8,
988            min_utterance_secs: 0.0,
989        };
990        let mut ep = Endpointer::new(cfg);
991        // Terminal period + 0.9 s silence → soft EOU.
992        assert!(
993            ep.step(&make_window(0.9, 0.0), "Hello."),
994            "soft EOU should fire after '.' + 0.9 s silence"
995        );
996        Ok(())
997    }
998
999    // --- PromptContext tests (no model files required) ---
1000
1001    fn make_committed_tokens(ids: &[u32]) -> Vec<TokenEmit> {
1002        ids.iter().map(|&id| make_token(id)).collect()
1003    }
1004
1005    #[test]
1006    fn test_prompt_context_basic() -> Result<()> {
1007        let tokens = make_committed_tokens(&[1, 2, 3]);
1008        let mut ctx = PromptContext::new(60, Some(99_999));
1009        ctx.update_after_eou(&tokens, 128);
1010        let p = ctx.prompt_tokens();
1011        assert_eq!(p[0], 99_999, "must start with <|prevtext|>");
1012        assert_eq!(&p[1..], &[1, 2, 3]);
1013        Ok(())
1014    }
1015
1016    #[test]
1017    fn test_prompt_context_no_prevtext() -> Result<()> {
1018        let tokens = make_committed_tokens(&[1, 2, 3]);
1019        let mut ctx = PromptContext::new(60, None);
1020        ctx.update_after_eou(&tokens, 128);
1021        assert_eq!(ctx.prompt_tokens(), &[1, 2, 3]);
1022        Ok(())
1023    }
1024
1025    #[test]
1026    fn test_prompt_context_filters_specials() -> Result<()> {
1027        let tokens = vec![
1028            make_ts_token(50364, 0.0),
1029            make_token(1),
1030            make_ts_token(50366, 0.2),
1031            make_token(2),
1032        ];
1033        let mut ctx = PromptContext::new(60, None);
1034        ctx.update_after_eou(&tokens, 128);
1035        assert_eq!(
1036            ctx.prompt_tokens(),
1037            &[1, 2],
1038            "special/timestamp tokens must be excluded from prompt"
1039        );
1040        Ok(())
1041    }
1042
1043    #[test]
1044    fn test_prompt_context_caps_max_prompt_tokens() -> Result<()> {
1045        // 101 regular tokens; cap = 60 → last 60 IDs (41 through 100).
1046        let ids: Vec<u32> = (0u32..101).collect();
1047        let tokens = make_committed_tokens(&ids);
1048        let mut ctx = PromptContext::new(60, None);
1049        ctx.update_after_eou(&tokens, 128);
1050        assert_eq!(ctx.prompt_tokens().len(), 60);
1051        assert_eq!(
1052            ctx.prompt_tokens()[0],
1053            41,
1054            "should be the 42nd token (ID 41)"
1055        );
1056        assert_eq!(ctx.prompt_tokens()[59], 100);
1057        Ok(())
1058    }
1059
1060    #[test]
1061    fn test_prompt_context_caps_context_limit() -> Result<()> {
1062        // max_new_tokens=400 → max_allowed = 448 - 3 - 400 = 45.
1063        // prevtext takes 1 slot → text_slots = 44; cap = min(60, 44) = 44.
1064        // Total = 1 (prevtext) + 44 = 45.
1065        let ids: Vec<u32> = (0u32..100).collect();
1066        let tokens = make_committed_tokens(&ids);
1067        let mut ctx = PromptContext::new(60, Some(99_999));
1068        ctx.update_after_eou(&tokens, 400);
1069        assert_eq!(
1070            ctx.prompt_tokens().len(),
1071            45,
1072            "prompt must fit: prevtext(1) + 44 text = 45 ≤ 448-3-400"
1073        );
1074        assert_eq!(
1075            ctx.prompt_tokens()[0],
1076            99_999,
1077            "first token must be <|prevtext|>"
1078        );
1079        Ok(())
1080    }
1081
1082    #[test]
1083    fn test_prompt_context_no_prevtext_on_empty_commit() -> Result<()> {
1084        // Only a timestamp token committed → regular tail is empty → no prevtext emitted.
1085        let tokens = vec![make_ts_token(50364, 0.0)];
1086        let mut ctx = PromptContext::new(60, Some(99_999));
1087        ctx.update_after_eou(&tokens, 128);
1088        assert!(
1089            ctx.prompt_tokens().is_empty(),
1090            "<|prevtext|> must not be emitted when no regular tokens were committed"
1091        );
1092        Ok(())
1093    }
1094
1095    #[test]
1096    fn test_prompt_context_reset() -> Result<()> {
1097        let tokens = make_committed_tokens(&[1, 2]);
1098        let mut ctx = PromptContext::new(60, None);
1099        ctx.update_after_eou(&tokens, 128);
1100        assert!(!ctx.prompt_tokens().is_empty());
1101        ctx.reset();
1102        assert!(ctx.prompt_tokens().is_empty());
1103        Ok(())
1104    }
1105
1106    #[test]
1107    #[ignore = "requires silero_vad.onnx in ./models"]
1108    fn test_chunker_first_window_at_stride_boundary() -> Result<()> {
1109        // min_speech_secs = 0 means the speech gate is always satisfied; the first
1110        // window must fire after exactly stride_secs regardless of VAD output.
1111        let vad = open_vad()?;
1112        let cfg = WindowConfig {
1113            max_window_secs: 3.0,
1114            stride_secs: 1.0,
1115            vad_threshold: 0.5,
1116            min_speech_secs: 0.0,
1117        };
1118        let mut chunker = Chunker::new(cfg, vad);
1119
1120        let one_second = vec![0.0f32; SAMPLE_RATE as usize];
1121
1122        // Push all at once.
1123        let window = chunker.push(&one_second);
1124
1125        let w = window.expect("first window should have fired after 1 s (stride_secs = 1.0)");
1126
1127        assert_eq!(
1128            w.real_samples, SAMPLE_RATE as usize,
1129            "real_samples should equal exactly 1 s of input"
1130        );
1131        assert_eq!(
1132            w.samples.len(),
1133            3 * SAMPLE_RATE as usize,
1134            "samples vec must be max_window_secs × 16000 long"
1135        );
1136        assert!(
1137            (w.window_start_secs - 0.0).abs() < 1e-4,
1138            "window_start_secs should be 0.0"
1139        );
1140        assert!(!w.forced_eou, "first 1-s window should not be a forced EOU");
1141
1142        Ok(())
1143    }
1144
1145    /// First cap-hit emits cap_hit=true but NOT forced_eou — the consumer gets a chance
1146    /// to call `trim_oldest` for timestamp-anchored continuation. forced_eou is reserved
1147    /// for the safety-net path when trimming fails.
1148    #[test]
1149    #[ignore = "requires silero_vad.onnx in ./models"]
1150    fn test_chunker_cap_hit_no_auto_forced_eou() -> Result<()> {
1151        let vad = open_vad()?;
1152        let cfg = WindowConfig {
1153            max_window_secs: 2.0,
1154            stride_secs: 1.0,
1155            vad_threshold: 0.5,
1156            min_speech_secs: 0.0,
1157        };
1158        let mut chunker = Chunker::new(cfg, vad);
1159
1160        // Push exactly max_window_secs of audio → first cap-hit fires once.
1161        let two_seconds = vec![0.0f32; 2 * SAMPLE_RATE as usize];
1162        let window = chunker
1163            .push(&two_seconds)
1164            .expect("cap-hit window should have fired");
1165        assert!(window.cap_hit, "cap_hit must be true at the cap");
1166        assert!(
1167            !window.forced_eou,
1168            "first cap-hit must NOT set forced_eou — trim_oldest is the primary path"
1169        );
1170        assert_eq!(window.real_samples, 2 * SAMPLE_RATE as usize);
1171
1172        Ok(())
1173    }
1174
1175    /// Second consecutive cap without an intervening trim_oldest/reset_utterance must
1176    /// set forced_eou=true so a misbehaving consumer can't lose audio indefinitely.
1177    #[test]
1178    #[ignore = "requires silero_vad.onnx in ./models"]
1179    fn test_chunker_cap_pending_then_forced_eou() -> Result<()> {
1180        let vad = open_vad()?;
1181        let cfg = WindowConfig {
1182            max_window_secs: 2.0,
1183            stride_secs: 1.0,
1184            vad_threshold: 0.5,
1185            min_speech_secs: 0.0,
1186        };
1187        let mut chunker = Chunker::new(cfg, vad);
1188
1189        // Push 2 s → first cap-hit.
1190        let two_seconds = vec![0.0f32; 2 * SAMPLE_RATE as usize];
1191        let w1 = chunker.push(&two_seconds).expect("first cap-hit");
1192        assert!(w1.cap_hit && !w1.forced_eou);
1193
1194        // Push one more sample without trimming or resetting — second cap fires with
1195        // forced_eou=true.
1196        let w2 = chunker.push(&[0.0f32]).expect("second cap-hit");
1197        assert!(w2.cap_hit, "second window still flagged as cap_hit");
1198        assert!(
1199            w2.forced_eou,
1200            "second cap without trim must set forced_eou as safety net"
1201        );
1202
1203        Ok(())
1204    }
1205
1206    /// trim_oldest rounds down to a VAD_FRAME_SIZE multiple so buf.len() and
1207    /// vad_decisions stay in lock-step. Verify alignment using a non-frame-multiple
1208    /// request and a known buffer.
1209    #[test]
1210    #[ignore = "requires silero_vad.onnx in ./models"]
1211    fn test_chunker_trim_oldest_vad_frame_alignment() -> Result<()> {
1212        let vad = open_vad()?;
1213        let cfg = WindowConfig {
1214            max_window_secs: 2.0,
1215            stride_secs: 1.0,
1216            vad_threshold: 0.5,
1217            min_speech_secs: 0.0,
1218        };
1219        let mut chunker = Chunker::new(cfg, vad);
1220
1221        // Fill the buffer to the cap.
1222        let two_seconds = vec![0.0f32; 2 * SAMPLE_RATE as usize];
1223        chunker.push(&two_seconds).expect("cap-hit");
1224        let buf_before = chunker.buf.len();
1225        let vad_before = chunker.vad_decisions.len();
1226        // Note: a 2s buffer is exactly 62.5 VAD frames; we expect 62 (truncating).
1227        assert_eq!(buf_before, 2 * SAMPLE_RATE as usize);
1228        assert!(vad_before == 62 || vad_before == 63);
1229
1230        // Request a non-frame-multiple trim: 700 samples (1.37 VAD frames).
1231        // Expected: trim_oldest rounds down to 1 frame = 512 samples.
1232        let trimmed = chunker.trim_oldest(700);
1233        assert_eq!(
1234            trimmed, VAD_FRAME_SIZE,
1235            "trim rounds down to frame multiple"
1236        );
1237        assert_eq!(
1238            chunker.buf.len(),
1239            buf_before - VAD_FRAME_SIZE,
1240            "buf shrunk by exactly one frame"
1241        );
1242        assert_eq!(
1243            chunker.vad_decisions.len(),
1244            vad_before - 1,
1245            "vad_decisions shrunk by exactly one entry"
1246        );
1247
1248        // Request less than one frame: trim returns 0, nothing changes.
1249        let trimmed_small = chunker.trim_oldest(100);
1250        assert_eq!(trimmed_small, 0);
1251        assert_eq!(chunker.buf.len(), buf_before - VAD_FRAME_SIZE);
1252
1253        Ok(())
1254    }
1255
1256    /// trim_oldest advances utterance_start_sample by exactly the trimmed amount and
1257    /// clears cap_pending_handler so subsequent caps are treated as first caps again.
1258    #[test]
1259    #[ignore = "requires silero_vad.onnx in ./models"]
1260    fn test_chunker_trim_oldest_advances_utterance_start_sample() -> Result<()> {
1261        let vad = open_vad()?;
1262        let cfg = WindowConfig {
1263            max_window_secs: 2.0,
1264            stride_secs: 1.0,
1265            vad_threshold: 0.5,
1266            min_speech_secs: 0.0,
1267        };
1268        let mut chunker = Chunker::new(cfg, vad);
1269
1270        let two_seconds = vec![0.0f32; 2 * SAMPLE_RATE as usize];
1271        chunker.push(&two_seconds).expect("cap-hit");
1272        let start_before = chunker.utterance_start_sample;
1273
1274        // Trim ~1 s — rounded down to 31 × 512 = 15872 (closest multiple below 16000).
1275        let target = SAMPLE_RATE as usize;
1276        let trimmed = chunker.trim_oldest(target);
1277        assert_eq!(
1278            trimmed,
1279            (target / VAD_FRAME_SIZE) * VAD_FRAME_SIZE,
1280            "trimmed exactly the aligned amount"
1281        );
1282        assert_eq!(
1283            chunker.utterance_start_sample,
1284            start_before + trimmed as u64,
1285            "utterance_start_sample advanced by trimmed samples"
1286        );
1287
1288        // Push 1 sample — should NOT trigger forced_eou because trim cleared cap_pending.
1289        let w = chunker.push(&[0.0f32]);
1290        if let Some(w) = w {
1291            assert!(
1292                !w.forced_eou,
1293                "after successful trim, the next push should not be flagged forced_eou"
1294            );
1295        }
1296
1297        Ok(())
1298    }
1299
1300    /// `Committer::on_trim` force-commits the tentative tail of the most recent decode
1301    /// (everything past the LCP boundary), clears last_candidate, and preserves
1302    /// already-committed state without setting awaiting_reset.
1303    #[test]
1304    fn test_committer_on_trim_force_commits_tentative_tail() -> Result<()> {
1305        let mut c = Committer::new();
1306        // Two rounds so LCP commits [tok1, tok2] and tok4 is tentative.
1307        c.ingest(vec![make_token(1), make_token(2), make_token(3)]);
1308        c.ingest(vec![make_token(1), make_token(2), make_token(4)]);
1309        assert_eq!(c.committed_text(), "tok1tok2");
1310
1311        let delta = c.on_trim();
1312        let trim_committed = committed_ids(&delta);
1313        assert_eq!(
1314            trim_committed,
1315            vec![4],
1316            "on_trim force-commits the tentative tail (tok4)"
1317        );
1318        assert!(
1319            c.last_candidate().is_empty(),
1320            "last_candidate cleared after on_trim"
1321        );
1322        assert_eq!(c.last_lcp_end_in_candidate(), 0, "LCP cursor reset");
1323        assert_eq!(
1324            c.committed_text(),
1325            "tok1tok2tok4",
1326            "committed_text now includes the force-committed tail"
1327        );
1328
1329        // Next ingest must NOT auto-clear committed (awaiting_reset must be false). It
1330        // simply starts a fresh LCP baseline.
1331        let (comm, _) = c.ingest(vec![make_token(5), make_token(6)]);
1332        assert!(
1333            committed_ids(&comm).is_empty(),
1334            "first post-trim ingest commits nothing (no prior candidate)"
1335        );
1336        assert_eq!(
1337            c.committed_text(),
1338            "tok1tok2tok4",
1339            "committed_text not blown away by post-trim ingest"
1340        );
1341
1342        Ok(())
1343    }
1344
1345    /// LCP must work even when one side has timestamps and the other doesn't (mixed-
1346    /// mode case from the adversarial review #9 — exercised when streaming flips
1347    /// timestamp mode mid-utterance, even though current design keeps it always-on).
1348    #[test]
1349    fn test_committer_lcp_mixed_timestamps() -> Result<()> {
1350        let mut c = Committer::new();
1351        // Round 1: no timestamps in candidate.
1352        c.ingest(vec![make_token(1), make_token(2)]);
1353        // Round 2: interspersed timestamps; content tokens still match.
1354        let (comm, _tent) = c.ingest(vec![
1355            make_ts_token(50364, 0.0),
1356            make_token(1),
1357            make_ts_token(50370, 0.2),
1358            make_token(2),
1359        ]);
1360        let ids = committed_ids(&comm);
1361        // The committed prefix should cover the content tokens 1, 2 (plus the
1362        // surrounding timestamps that fall within the LCP boundary in candidate space).
1363        assert!(ids.contains(&1), "content token 1 must commit");
1364        assert!(ids.contains(&2), "content token 2 must commit");
1365
1366        Ok(())
1367    }
1368}
whisperforge_core/streaming.rs

whisperforge_core/
streaming.rs