atomr_agents_stt_core/
stream.rs

1//! Streaming-push session abstraction.
2
3use std::pin::Pin;
4
5use async_trait::async_trait;
6use bytes::Bytes;
7use futures::Stream;
8use serde::{Deserialize, Serialize};
9
10use crate::audio::AudioFormat;
11use crate::capabilities::Capabilities;
12use crate::error::{Result, SttError};
13use crate::transcript::{Segment, SpeakerTag, Word};
14
15/// Per-call options for opening a streaming session. Mirrors the
16/// "common knobs" across the four MVP backends.
17#[derive(Debug, Clone, Default)]
18pub struct StreamOptions {
19    /// Audio format the caller intends to push. Backends use this to
20    /// negotiate the WS handshake (`encoding=…&sample_rate=…`).
21    pub format: Option<AudioFormat>,
22    /// BCP-47 hint (`"en-US"`). `None` triggers detection on backends
23    /// that support it.
24    pub language: Option<String>,
25    /// Request diarization on backends whose CAPS support it.
26    pub diarize: bool,
27    /// Model override (e.g. Deepgram `"nova-3"`).
28    pub model: Option<String>,
29    /// Backend-specific extra knobs round-tripped as JSON. Avoids
30    /// adding a knob to this struct for every backend quirk.
31    pub extra: Option<serde_json::Value>,
32}
33
34/// Active streaming session. Caller alternates `push_audio`/`finish`
35/// with consuming the stream returned from `events`.
36#[async_trait]
37pub trait StreamingSession: Send {
38    fn capabilities(&self) -> &'static Capabilities;
39
40    async fn push_audio(&mut self, chunk: Bytes) -> Result<()>;
41
42    /// Mark end-of-stream. Backend flushes; `events` then drains.
43    async fn finish(&mut self) -> Result<()>;
44
45    /// Forcibly tear down the session (drop the WS, etc.).
46    async fn close(&mut self) -> Result<()>;
47
48    /// Stream of partial / final transcripts and metadata events.
49    /// Returned as a boxed pinned stream so the trait is dyn-safe.
50    fn events(
51        &mut self,
52    ) -> Pin<Box<dyn Stream<Item = std::result::Result<StreamEvent, SttError>> + Send + '_>>;
53}
54
55#[derive(Debug, Clone, Serialize, Deserialize)]
56#[serde(tag = "kind", rename_all = "snake_case")]
57pub enum StreamEvent {
58    /// In-progress transcript. May be revised by a later `Final`.
59    Partial {
60        text: String,
61        start_ms: u32,
62        end_ms: u32,
63        words: Vec<Word>,
64    },
65    /// Committed segment.
66    Final { segment: Segment },
67    /// Speaker-change detected at the given offset.
68    SpeakerTurn { speaker: SpeakerTag, at_ms: u32 },
69    /// VAD-detected end of utterance.
70    UtteranceEnd { at_ms: u32 },
71    /// Backend-specific metadata blob (round-tripped to Python as a
72    /// dict).
73    Metadata(serde_json::Value),
74}
atomr_agents_stt_core/stream.rs

atomr_agents_stt_core/
stream.rs