atomr_agents_stt_core/stream.rs
1//! Streaming-push session abstraction.
2
3use std::pin::Pin;
4
5use async_trait::async_trait;
6use bytes::Bytes;
7use futures::Stream;
8use serde::{Deserialize, Serialize};
9
10use crate::audio::AudioFormat;
11use crate::capabilities::Capabilities;
12use crate::error::{Result, SttError};
13use crate::transcript::{Segment, SpeakerTag, Word};
14
15/// Per-call options for opening a streaming session. Mirrors the
16/// "common knobs" across the four MVP backends.
17#[derive(Debug, Clone, Default)]
18pub struct StreamOptions {
19 /// Audio format the caller intends to push. Backends use this to
20 /// negotiate the WS handshake (`encoding=…&sample_rate=…`).
21 pub format: Option<AudioFormat>,
22 /// BCP-47 hint (`"en-US"`). `None` triggers detection on backends
23 /// that support it.
24 pub language: Option<String>,
25 /// Request diarization on backends whose CAPS support it.
26 pub diarize: bool,
27 /// Model override (e.g. Deepgram `"nova-3"`).
28 pub model: Option<String>,
29 /// Backend-specific extra knobs round-tripped as JSON. Avoids
30 /// adding a knob to this struct for every backend quirk.
31 pub extra: Option<serde_json::Value>,
32}
33
34/// Active streaming session. Caller alternates `push_audio`/`finish`
35/// with consuming the stream returned from `events`.
36#[async_trait]
37pub trait StreamingSession: Send {
38 fn capabilities(&self) -> &'static Capabilities;
39
40 async fn push_audio(&mut self, chunk: Bytes) -> Result<()>;
41
42 /// Mark end-of-stream. Backend flushes; `events` then drains.
43 async fn finish(&mut self) -> Result<()>;
44
45 /// Forcibly tear down the session (drop the WS, etc.).
46 async fn close(&mut self) -> Result<()>;
47
48 /// Stream of partial / final transcripts and metadata events.
49 /// Returned as a boxed pinned stream so the trait is dyn-safe.
50 fn events(
51 &mut self,
52 ) -> Pin<Box<dyn Stream<Item = std::result::Result<StreamEvent, SttError>> + Send + '_>>;
53}
54
55#[derive(Debug, Clone, Serialize, Deserialize)]
56#[serde(tag = "kind", rename_all = "snake_case")]
57pub enum StreamEvent {
58 /// In-progress transcript. May be revised by a later `Final`.
59 Partial {
60 text: String,
61 start_ms: u32,
62 end_ms: u32,
63 words: Vec<Word>,
64 },
65 /// Committed segment.
66 Final { segment: Segment },
67 /// Speaker-change detected at the given offset.
68 SpeakerTurn { speaker: SpeakerTag, at_ms: u32 },
69 /// VAD-detected end of utterance.
70 UtteranceEnd { at_ms: u32 },
71 /// Backend-specific metadata blob (round-tripped to Python as a
72 /// dict).
73 Metadata(serde_json::Value),
74}