phostt 0.4.3

Local STT server powered by Zipformer-vi RNN-T — on-device Vietnamese speech recognition via ONNX Runtime
//! Streaming inference state and overlap-buffer/VAD pipeline.
//!
//! Holds per-connection [`StreamingState`], [`DecoderState`], and the
//! streaming half of [`Engine`](super::engine::Engine) (`process_chunk`,
//! `flush_state`, etc.).
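//!
//! # Example
//!
//! A minimal sketch of the streaming lifecycle; `engine`, `triplet`, and
//! `audio_chunks` are placeholders for values obtained elsewhere, so the
//! snippet is not compiled as a doctest:
//!
//! ```ignore
//! let mut state = engine.create_state(false)?;
//! for chunk in audio_chunks {
//!     // Each call may emit zero or more partial/final segments.
//!     for seg in engine.process_chunk(&chunk, &mut state, &mut triplet)? {
//!         let kind = if seg.is_final { "final" } else { "partial" };
//!         println!("[{kind}] {}", seg.text);
//!     }
//! }
//! // Emit any trailing text when the stream ends.
//! if let Some(seg) = engine.flush_state(&mut state, &mut triplet) {
//!     println!("[final] {}", seg.text);
//! }
//! ```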

use serde::Serialize;
use std::sync::Arc;

use crate::error::PhosttError;

use super::engine::Engine;
use super::features;
use super::{CONTEXT_SIZE, N_MELS, SessionTriplet, TARGET_SAMPLE_RATE};
use kaldi_native_fbank::fbank::FbankComputer;
use kaldi_native_fbank::online::{FeatureComputer, OnlineFeature};

/// Current wall-clock time as fractional seconds since the Unix epoch.
pub fn now_timestamp() -> f64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs_f64()
}

#[derive(Debug, Clone)]
pub struct StreamingConfig {
    /// Window size in mel frames (default 400 ≈ 4 s).
    pub window_frames: usize,
    /// Overlap between consecutive windows in mel frames (default 100 ≈ 1 s).
    pub overlap_frames: usize,
    /// Fuzzy-match threshold for the overlap merge (1.0 = exact match only,
    /// 0.0 = everything matches). Words with normalized Levenshtein similarity
    /// ≥ this value are treated as equal during boundary deduplication.
    pub fuzzy_match_threshold: f32,
}

impl Default for StreamingConfig {
    fn default() -> Self {
        Self {
            window_frames: 400,
            overlap_frames: 100,
            fuzzy_match_threshold: 1.0, // exact match by default
        }
    }
}

impl StreamingConfig {
    /// Shift between window starts in mel frames.
    pub fn shift_frames(&self) -> usize {
        self.window_frames.saturating_sub(self.overlap_frames)
    }

    /// Shift between window starts in encoder frames (subsampling-by-4).
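    ///
    /// With the default config (window 400, overlap 100) this is
    /// `(400 - 100) / 4 = 75` encoder frames.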
    pub fn shift_encoder_frames(&self) -> usize {
        self.shift_frames() / 4
    }

    /// Validate invariants. Returns an error message if config is invalid.
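    ///
    /// # Example
    ///
    /// A sketch of the invariants (not compiled as a doctest):
    ///
    /// ```ignore
    /// let mut cfg = StreamingConfig::default();
    /// assert!(cfg.validate().is_ok());
    ///
    /// // Overlap must be strictly smaller than the window.
    /// cfg.overlap_frames = cfg.window_frames;
    /// assert!(cfg.validate().is_err());
    ///
    /// // Both sizes must respect the encoder's subsampling-by-4.
    /// cfg = StreamingConfig::default();
    /// cfg.overlap_frames = 102;
    /// assert!(cfg.validate().is_err());
    /// ```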
    pub fn validate(&self) -> Result<(), String> {
        if self.window_frames == 0 {
            return Err("streaming window must be > 0 frames".into());
        }
        if self.overlap_frames >= self.window_frames {
            return Err("streaming overlap must be smaller than window".into());
        }
        if !self.window_frames.is_multiple_of(4) {
            return Err("streaming window must be a multiple of 4 (encoder subsampling)".into());
        }
        if !self.overlap_frames.is_multiple_of(4) {
            return Err("streaming overlap must be a multiple of 4 (encoder subsampling)".into());
        }
        if !(0.0..=1.0).contains(&self.fuzzy_match_threshold) {
            return Err("fuzzy_match_threshold must be in [0.0, 1.0]".into());
        }
        Ok(())
    }
}

#[non_exhaustive]
pub struct DecoderState {
    /// Last [`CONTEXT_SIZE`] non-blank token ids (left-padded with `blank_id`).
    pub tokens: Vec<i64>,
    /// Blank token id (cached so [`Self::push_token`] can reset state without
    /// re-reading it from the engine).
    pub blank_id: usize,
    /// Count of consecutive blank frames (used for endpointing).
    pub consecutive_blanks: usize,
}

impl StreamingState {
    /// Reset overlap-buffer state for the start of a new utterance.
    /// Called by the VAD path after a completed speech segment is emitted.
    pub fn reset_utterance_state(&mut self) {
        self.decoder = DecoderState::new(self.blank_id);
        self.accumulated_text = Arc::new(String::new());
        self.accumulated_words = Arc::new(Vec::new());
        self.feature_window.clear();
        self.prev_window_words.clear();
        self.total_frames = 0;
    }
}

impl DecoderState {
    /// Create a fresh decoder state with the context window left-padded with
    /// `blank_id` and zero blank streak.
    pub fn new(blank_id: usize) -> Self {
        Self {
            tokens: vec![blank_id as i64; CONTEXT_SIZE],
            blank_id,
            consecutive_blanks: 0,
        }
    }

    /// Slide a newly emitted non-blank token into the context window,
    /// dropping the oldest entry to keep the length at [`CONTEXT_SIZE`].
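    ///
    /// A sketch of the sliding window, assuming `CONTEXT_SIZE = 2` for
    /// illustration (not compiled as a doctest):
    ///
    /// ```ignore
    /// let mut st = DecoderState::new(0); // tokens = [0, 0]
    /// st.push_token(17);                 // tokens = [0, 17]
    /// st.push_token(42);                 // tokens = [17, 42]
    /// ```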
    pub fn push_token(&mut self, token: i64) {
        // VecDeque would be cleaner but the window is fixed at CONTEXT_SIZE,
        // so a `rotate_left + assign` keeps the tensor view contiguous.
        self.tokens.rotate_left(1);
        let last = self.tokens.last_mut().expect("CONTEXT_SIZE > 0");
        *last = token;
    }
}

/// A recognized word with timing and confidence metadata.
///
/// Produced by the RNN-T decoder during [`Engine::process_chunk`] or [`Engine::transcribe_file`].
/// Timestamps are in seconds relative to the start of the audio stream.
#[derive(Debug, Clone, Serialize)]
#[non_exhaustive]
#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
pub struct WordInfo {
    /// The recognized word text (BPE tokens joined, `▁` stripped).
    pub word: String,
    /// Start time in seconds from the beginning of the audio stream.
    pub start: f64,
    /// End time in seconds from the beginning of the audio stream.
    pub end: f64,
    /// Softmax confidence score (0.0–1.0), averaged over constituent BPE tokens.
    pub confidence: f32,
    /// Speaker label from diarization (zero-based index). Omitted if diarization is disabled.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speaker: Option<u32>,
}

/// Per-connection diarization state accumulating audio and speaker assignments.
#[cfg(feature = "diarization")]
pub struct DiarizationStreamState {
    /// Online diarizer that buffers audio, extracts embeddings, and clusters speakers.
    pub diarizer: polyvoice::OnlineDiarizer,
}

/// Per-connection streaming state that persists across audio chunks.
///
/// Created via [`Engine::create_state`]. Holds the decoder token context, the
/// online FBANK extractor, buffered mel frames, and accumulated transcript
/// text/words.
/// Pass this to [`Engine::process_chunk`] for each incoming audio chunk and
/// [`Engine::flush_state`] when the stream ends.
#[non_exhaustive]
pub struct StreamingState {
    /// Decoder state (persisted across chunks).
    pub decoder: DecoderState,
    /// Online FBANK feature extractor.
    pub online: OnlineFeature,
    /// Number of mel frames already consumed from `online`.
    pub frames_seen: usize,
    /// Accumulated transcript text across chunks (reset on endpointing).
    /// Arc allows O(1) clone when emitting partial segments.
    pub accumulated_text: Arc<String>,
    /// Accumulated words with timestamps (reset on endpointing).
    /// Arc allows O(1) clone when emitting partial segments.
    pub accumulated_words: Arc<Vec<WordInfo>>,
    /// Absolute encoder frame offset for the next window (after subsampling-by-4).
    pub total_frames: usize,
    /// Mel feature frames waiting to fill the next streaming window.
    pub feature_window: Vec<f32>,
    /// Words from the previous window, used for overlap merging.
    pub prev_window_words: Vec<WordInfo>,
    /// Streaming configuration (window / overlap sizes).
    pub config: StreamingConfig,
    /// Blank token id cached so the state can be reset between utterances
    /// without re-reading the tokenizer.
    pub blank_id: usize,
    /// Per-connection Silero VAD session (only when VAD is enabled).
    pub vad_session: Option<silero::Session>,
    /// Per-connection VAD stream state (recurrent memory + pending samples).
    pub vad_stream_state: Option<silero::StreamState>,
    /// Per-connection VAD speech segmenter.
    pub vad_segmenter: Option<silero::SpeechSegmenter>,
    /// Accumulated raw audio samples for VAD-based segmentation.
    pub vad_audio_buffer: Vec<f32>,
    /// Sample offset of `vad_audio_buffer[0]` relative to the start of the stream.
    pub vad_sample_offset: u64,
    /// Completed VAD utterances waiting for async offline ASR.
    /// Populated by `process_chunk_vad`, drained by the WebSocket handler.
    pub vad_pending_asr: Vec<Vec<f32>>,
    /// Diarization state (present only when diarization is enabled).
    #[cfg(feature = "diarization")]
    pub diarization_state: Option<DiarizationStreamState>,
}

impl Engine {
    /// Create fresh per-connection [`StreamingState`]: FBANK extractor,
    /// decoder context, optional Silero VAD sessions, and (with the
    /// `diarization` feature) diarization state when a speaker encoder is loaded.
    pub fn create_state(&self, diarization_enabled: bool) -> Result<StreamingState, PhosttError> {
        #[cfg(feature = "diarization")]
        let diarization_state = if diarization_enabled && self.speaker_encoder.is_some() {
            Some(DiarizationStreamState {
                diarizer: polyvoice::OnlineDiarizer::new(polyvoice::DiarizationConfig {
                    window_secs: 1.5,
                    hop_secs: 1.5, // same as window = no overlap, matching old behaviour
                    threshold: 0.5,
                    max_speakers: 64,
                    min_speech_secs: 0.25,
                    max_gap_secs: 0.5,
                    sample_rate: polyvoice::SampleRate::new(16000).expect("valid sample rate"),
                }),
            })
        } else {
            None
        };

        #[cfg(not(feature = "diarization"))]
        if diarization_enabled {
            tracing::warn!(
                "diarization_enabled=true ignored: build lacks the `diarization` feature"
            );
        }

        let computer = FbankComputer::new(features::phostt_fbank_options())
            .map_err(|e| PhosttError::Inference(format!("FBANK init failed: {e}")))?;
        let online = OnlineFeature::new(FeatureComputer::Fbank(computer));

        let (vad_session, vad_stream_state, vad_segmenter) = if self.vad_enabled {
            let session = silero::Session::bundled()
                .map_err(|e| PhosttError::ModelLoad(format!("silero VAD load failed: {e}")))?;
            let stream = silero::StreamState::new(silero::SampleRate::Rate16k);
            let segmenter = silero::SpeechSegmenter::new(silero::SpeechOptions::default());
            (Some(session), Some(stream), Some(segmenter))
        } else {
            (None, None, None)
        };

        let blank_id = self.tokenizer.blank_id();

        Ok(StreamingState {
            decoder: DecoderState::new(blank_id),
            online,
            frames_seen: 0,
            accumulated_text: Arc::new(String::new()),
            accumulated_words: Arc::new(Vec::new()),
            total_frames: 0,
            feature_window: Vec::new(),
            prev_window_words: Vec::new(),
            config: self.streaming_config.clone(),
            blank_id,
            vad_session,
            vad_stream_state,
            vad_segmenter,
            vad_audio_buffer: Vec::new(),
            vad_sample_offset: 0,
            vad_pending_asr: Vec::new(),
            #[cfg(feature = "diarization")]
            diarization_state,
        })
    }

    /// Process a chunk of 16kHz f32 audio samples and return any new transcript segments.
    ///
    /// Returns [`TranscriptSegment`] with `is_final == false` during speech (Partial),
    /// and `is_final == true` on endpointing (~600ms silence detected).
    /// Streaming state (decoder token context, buffered features, accumulated text) is maintained in `state`.
    ///
    /// # Errors
    ///
    /// Returns [`PhosttError::Inference`] if the ONNX runtime fails.
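    ///
    /// # Example
    ///
    /// A sketch of one call that inspects word-level output; `engine`, `state`,
    /// and `triplet` are placeholders obtained elsewhere, so the snippet is not
    /// compiled as a doctest:
    ///
    /// ```ignore
    /// let segs = engine.process_chunk(&samples, &mut state, &mut triplet)?;
    /// for seg in &segs {
    ///     for w in seg.words.iter() {
    ///         println!("{} [{:.2}-{:.2}s] conf {:.2}", w.word, w.start, w.end, w.confidence);
    ///     }
    /// }
    /// ```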
    pub fn process_chunk(
        &self,
        samples: &[f32],
        state: &mut StreamingState,
        triplet: &mut SessionTriplet,
    ) -> Result<Vec<TranscriptSegment>, PhosttError> {
        if samples.is_empty() {
            return Ok(vec![]);
        }

        // VAD path: segment speech with Silero VAD, transcribe each utterance offline.
        if state.vad_session.is_some() {
            return self.process_chunk_vad(samples, state, triplet);
        }

        self.process_chunk_overlap(samples, state, triplet)
    }

    /// Overlap-buffer streaming path (the original non-VAD logic).
    fn process_chunk_overlap(
        &self,
        samples: &[f32],
        state: &mut StreamingState,
        triplet: &mut SessionTriplet,
    ) -> Result<Vec<TranscriptSegment>, PhosttError> {
        state
            .online
            .accept_waveform(TARGET_SAMPLE_RATE as f32, samples);

        let ready = state.online.num_frames_ready();
        let new_frames = ready.saturating_sub(state.frames_seen);
        if new_frames == 0 {
            return Ok(vec![]);
        }

        let new_features =
            features::extract_online_frames(&state.online, state.frames_seen, new_frames);
        state.frames_seen = ready;
        state.feature_window.extend_from_slice(&new_features);

        let mut emitted_words: Vec<WordInfo> = Vec::new();
        let mut endpoint = false;

        while state.feature_window.len() / N_MELS >= state.config.window_frames {
            let num_frames = state.config.window_frames;
            let features = &state.feature_window[..num_frames * N_MELS];
            let frame_offset = state.total_frames;

            let (window_words, window_endpoint, _enc_len) = self
                .run_inference(
                    triplet,
                    features,
                    num_frames,
                    &mut state.decoder,
                    frame_offset,
                )
                .map_err(|e| PhosttError::Inference(format!("{e:#}")))?;

            let delta = super::delta_words(
                &window_words,
                &state.prev_window_words,
                state.config.fuzzy_match_threshold,
            );
            emitted_words.extend(delta);
            state.prev_window_words = window_words;

            // Shift window, keeping overlap
            let shift = state.config.shift_frames() * N_MELS;
            state.feature_window.drain(..shift);
            state.total_frames += state.config.shift_encoder_frames();

            if window_endpoint {
                endpoint = true;
                break;
            }
        }

        // --- Diarization: feed audio to polyvoice and annotate words ---
        #[cfg(feature = "diarization")]
        if let (Some(dia), Some(enc)) = (
            state.diarization_state.as_mut(),
            self.speaker_encoder.as_ref(),
        ) {
            if let Err(e) = dia.diarizer.feed(samples, enc) {
                tracing::warn!("Diarizer feed failed: {e:#}");
            }
            let speaker_id = dia.diarizer.current_speaker().map(|s| s.0);
            for w in &mut emitted_words {
                w.speaker = speaker_id;
            }
        }

        if emitted_words.is_empty() && !endpoint {
            return Ok(vec![]);
        }

        // Accumulate new words — make_mut clones only if refcount > 1
        let acc_text = Arc::make_mut(&mut state.accumulated_text);
        let acc_words = Arc::make_mut(&mut state.accumulated_words);
        for w in &emitted_words {
            if !acc_text.is_empty() {
                acc_text.push(' ');
            }
            acc_text.push_str(&w.word);
        }
        acc_words.extend(emitted_words);

        let text = Arc::clone(&state.accumulated_text);
        let words = Arc::clone(&state.accumulated_words);
        let ts = now_timestamp();

        if endpoint {
            // Endpoint detected: emit Final and reset accumulation
            state.accumulated_text = Arc::new(String::new());
            state.accumulated_words = Arc::new(Vec::new());
            state.decoder.consecutive_blanks = 0;
            state.prev_window_words.clear();
            Ok(vec![TranscriptSegment {
                text,
                words,
                is_final: true,
                timestamp: ts,
            }])
        } else {
            // Ongoing speech: emit Partial
            Ok(vec![TranscriptSegment {
                text,
                words,
                is_final: false,
                timestamp: ts,
            }])
        }
    }

    /// VAD-based streaming: feed samples to Silero VAD, emit a Final segment
    /// for each completed speech utterance.
    fn process_chunk_vad(
        &self,
        samples: &[f32],
        state: &mut StreamingState,
        triplet: &mut SessionTriplet,
    ) -> Result<Vec<TranscriptSegment>, PhosttError> {
        state.vad_audio_buffer.extend_from_slice(samples);

        // --- Run VAD inference in a scoped block so segmenter borrow ends ---
        let (speech_segments, is_active) = {
            let session = state.vad_session.as_mut().unwrap();
            let stream = state.vad_stream_state.as_mut().unwrap();
            let segmenter = state.vad_segmenter.as_mut().unwrap();

            let mut segments: Vec<silero::SpeechSegment> = Vec::new();
            session
                .process_stream(stream, samples, |probability| {
                    if let Some(segment) = segmenter.push_probability(probability) {
                        segments.push(segment);
                    }
                })
                .map_err(|e| PhosttError::Inference(format!("VAD inference failed: {e}")))?;

            let active = segmenter.is_active();
            (segments, active)
        };

        let mut emitted_segments: Vec<TranscriptSegment> = Vec::new();
        let buffer_start = state.vad_sample_offset;

        // Queue completed utterances for async offline ASR so VAD can keep
        // listening without blocking on the encoder.
        for segment in &speech_segments {
            let buf_start = segment.start_sample().saturating_sub(buffer_start) as usize;
            let buf_end = segment.end_sample().saturating_sub(buffer_start) as usize;
            if buf_end > state.vad_audio_buffer.len() {
                tracing::warn!("VAD segment extends beyond audio buffer, skipping");
                continue;
            }
            let speech_samples = &state.vad_audio_buffer[buf_start..buf_end];
            if speech_samples.is_empty() {
                continue;
            }
            state.vad_pending_asr.push(speech_samples.to_vec());
            state.reset_utterance_state();
        }

        // Drain processed audio from the buffer.
        if let Some(last_seg) = speech_segments.last() {
            let remove_up_to = (last_seg.end_sample().saturating_sub(buffer_start)) as usize;
            if remove_up_to <= state.vad_audio_buffer.len() {
                state.vad_audio_buffer.drain(..remove_up_to);
                state.vad_sample_offset += remove_up_to as u64;
            }
        }

        // Emit Partial segments while speech is still active.
        if is_active {
            let partials = self.process_chunk_overlap(samples, state, triplet)?;
            emitted_segments.extend(partials);
        }

        Ok(emitted_segments)
    }

    /// VAD flush: process any trailing pending samples and close the final
    /// open speech segment.
    fn flush_state_vad(
        &self,
        state: &mut StreamingState,
        triplet: &mut SessionTriplet,
    ) -> Option<TranscriptSegment> {
        let session = state.vad_session.as_mut()?;
        let stream = state.vad_stream_state.as_mut()?;
        let segmenter = state.vad_segmenter.as_mut()?;

        // Flush pending VAD samples.
        if let Ok(Some(probability)) = session.flush_stream(stream)
            && let Some(segment) = segmenter.push_probability(probability)
        {
            let buffer_start = state.vad_sample_offset;
            let buf_start = segment.start_sample().saturating_sub(buffer_start) as usize;
            let buf_end = (segment.end_sample().saturating_sub(buffer_start) as usize)
                .min(state.vad_audio_buffer.len());
            if buf_start < buf_end {
                let speech_samples = &state.vad_audio_buffer[buf_start..buf_end];
                if let Ok(result) = self.transcribe_samples(speech_samples, triplet)
                    && !result.text.is_empty()
                {
                    state.reset_utterance_state();
                    return Some(TranscriptSegment {
                        text: Arc::new(result.text),
                        words: Arc::new(result.words),
                        is_final: true,
                        timestamp: now_timestamp(),
                    });
                }
            }
        }

        // Close any trailing open segment.
        if let Some(segment) = segmenter.finish() {
            let buffer_start = state.vad_sample_offset;
            let buf_start = segment.start_sample().saturating_sub(buffer_start) as usize;
            let buf_end = (segment.end_sample().saturating_sub(buffer_start) as usize)
                .min(state.vad_audio_buffer.len());
            if buf_start < buf_end {
                let speech_samples = &state.vad_audio_buffer[buf_start..buf_end];
                if let Ok(result) = self.transcribe_samples(speech_samples, triplet)
                    && !result.text.is_empty()
                {
                    state.reset_utterance_state();
                    return Some(TranscriptSegment {
                        text: Arc::new(result.text),
                        words: Arc::new(result.words),
                        is_final: true,
                        timestamp: now_timestamp(),
                    });
                }
            }
        }

        None
    }

    /// Flush accumulated text as a Final segment (called on Stop/Close).
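    ///
    /// A sketch of a shutdown path; `send_final` is a hypothetical sink for
    /// outgoing segments (not compiled as a doctest):
    ///
    /// ```ignore
    /// if let Some(seg) = engine.flush_state(&mut state, &mut triplet) {
    ///     send_final(seg); // hypothetical: forward the trailing Final segment
    /// }
    /// ```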
    pub fn flush_state(
        &self,
        state: &mut StreamingState,
        triplet: &mut SessionTriplet,
    ) -> Option<TranscriptSegment> {
        // VAD path: flush pending samples and emit trailing segment.
        if state.vad_session.is_some() {
            return self.flush_state_vad(state, triplet);
        }

        state.online.input_finished();
        let ready = state.online.num_frames_ready();
        let new_frames = ready.saturating_sub(state.frames_seen);
        if new_frames > 0 {
            let new_features =
                features::extract_online_frames(&state.online, state.frames_seen, new_frames);
            state.feature_window.extend_from_slice(&new_features);
            state.frames_seen = ready;
        }

        if !state.feature_window.is_empty() {
            let num_frames = state.feature_window.len() / N_MELS;
            let features = &state.feature_window[..];
            let frame_offset = state.total_frames;
            let (window_words, _endpoint, _enc_len) = match self.run_inference(
                triplet,
                features,
                num_frames,
                &mut state.decoder,
                frame_offset,
            ) {
                Ok(r) => r,
                Err(e) => {
                    tracing::error!("flush_state inference failed: {e:#}");
                    return None;
                }
            };
            let delta = super::delta_words(
                &window_words,
                &state.prev_window_words,
                state.config.fuzzy_match_threshold,
            );
            let acc_text = Arc::make_mut(&mut state.accumulated_text);
            let acc_words = Arc::make_mut(&mut state.accumulated_words);
            for w in &delta {
                if !acc_text.is_empty() {
                    acc_text.push(' ');
                }
                acc_text.push_str(&w.word);
            }
            acc_words.extend(delta);
            state.prev_window_words = window_words;
            state.feature_window.clear();
            state.total_frames += num_frames / 4;
        }

        if state.accumulated_text.is_empty() {
            return None;
        }
        let seg = TranscriptSegment {
            text: Arc::clone(&state.accumulated_text),
            words: Arc::clone(&state.accumulated_words),
            is_final: true,
            timestamp: now_timestamp(),
        };
        Some(seg)
    }
}

/// Result of file transcription, including word-level details.
#[derive(Debug, Clone, Serialize)]
pub struct TranscribeResult {
    /// Full recognized text.
    pub text: String,
    /// Word-level timing and confidence details.
    pub words: Vec<WordInfo>,
    /// Duration of the transcribed audio in seconds.
    pub duration_s: f64,
}

/// A transcript segment emitted by the inference engine.
///
/// Partial segments (`is_final == false`) represent interim results that may change.
/// Final segments (`is_final == true`) represent completed utterances after endpointing.
#[derive(Debug, Clone, Serialize)]
#[non_exhaustive]
#[cfg_attr(feature = "openapi", derive(utoipa::ToSchema))]
pub struct TranscriptSegment {
    /// Recognized text for this segment.
    #[cfg_attr(feature = "openapi", schema(value_type = String))]
    pub text: Arc<String>,
    /// Individual words with timing and confidence metadata.
    #[cfg_attr(feature = "openapi", schema(value_type = Vec<WordInfo>))]
    pub words: Arc<Vec<WordInfo>>,
    /// Whether this segment is final (utterance complete) or partial (interim).
    pub is_final: bool,
    /// Unix timestamp (seconds since epoch) when this segment was produced.
    pub timestamp: f64,
}