// wavekat_asr/lib.rs
//! # wavekat-asr
//!
//! Streaming ASR trait surface, intended to wrap one or more speech-to-text
//! backends behind a common Rust API. Modeled on the same pattern as
//! [`wavekat-vad`] and [`wavekat-turn`].
//!
//! [`wavekat-vad`]: https://crates.io/crates/wavekat-vad
//! [`wavekat-turn`]: https://crates.io/crates/wavekat-turn
//!
//! # Status
//!
//! This crate is pre-1.0. The trait surface may iterate as more
//! backends land. Pin to an exact patch version.
//!
//! The bundled backend is [`backends::sherpa_onnx`] (behind the
//! `sherpa-onnx` Cargo feature): a local streaming Zipformer that
//! auto-downloads its model from HuggingFace on first use.
19pub mod backends;
20pub mod error;
21
22pub use error::AsrError;
23pub use wavekat_core::AudioFrame;
24
25/// Which side of a two-channel call the audio (or transcript) belongs to.
26///
27/// The daemon tees both RTP directions through one ASR instance, so every
28/// event needs to carry the channel it came from.
/// Which side of a two-channel call the audio (or transcript) belongs to.
///
/// The daemon tees both RTP directions through one ASR instance, so every
/// event needs to carry the channel it came from.
///
/// Derives `Hash` (in addition to `Eq`/`Copy`) so consumers can key
/// per-channel state — e.g. a `HashMap<Channel, UtteranceState>` — on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Channel {
    /// Audio captured from the local mic — what the user said.
    Local,
    /// Audio received over RTP — what the remote party said.
    Remote,
}
36
37/// One transcript event emitted by a [`StreamingAsr`] backend.
#[derive(Debug, Clone)]
pub enum TranscriptEvent {
    /// Backend has begun receiving speech on this channel.
    ///
    /// Optional — not every backend emits this; consumers must not gate
    /// finals on having seen a `SpeechStarted` first.
    SpeechStarted {
        /// Channel the speech was detected on.
        channel: Channel,
        /// Event timestamp in ms (presumably relative to stream start,
        /// matching `Partial`/`Final` — confirm against backend docs).
        ts_ms: u64,
    },

    /// Backend has detected end of speech on this channel.
    ///
    /// Optional, same caveat as [`SpeechStarted`](TranscriptEvent::SpeechStarted).
    SpeechEnded {
        /// Channel the speech ended on.
        channel: Channel,
        /// Event timestamp in ms (same time base as `SpeechStarted`).
        ts_ms: u64,
    },

    /// In-flight transcript that may be revised before becoming
    /// [`Final`](TranscriptEvent::Final). Render but do not persist.
    Partial {
        channel: Channel,
        /// Milliseconds since stream start at which this partial's
        /// segment begins.
        ts_ms: u64,
        /// Current best-effort transcript text; may be revised or
        /// replaced by later `Partial`s for the same segment.
        text: String,
    },

    /// Stable transcript for a segment. Persist this; drop any partials
    /// that share the same channel and overlapping `ts_ms..end_ms`.
    Final {
        channel: Channel,
        /// Segment start, ms since stream start.
        ts_ms: u64,
        /// Segment end, ms since stream start (exclusive bound of the
        /// `ts_ms..end_ms` range used for partial invalidation).
        end_ms: u64,
        /// Finalized transcript text for the segment.
        text: String,
        /// Backend-reported confidence in `[0.0, 1.0]`. Backends that
        /// don't report confidence should emit `1.0`.
        confidence: f32,
    },

    /// Backend hit a non-fatal error and continues. Fatal errors come
    /// back from [`StreamingAsr::push_audio`] / [`StreamingAsr::finish`]
    /// as `Err`.
    Warning(String),
}
77
78/// A streaming ASR session.
79///
80/// Implementations are expected to:
81///
82/// - Accept any [`AudioFrame`] sample rate; resample internally.
83/// - Be `Send` so the daemon can move them between tasks.
84/// - Emit [`TranscriptEvent`]s via the receiver returned at construction
85/// time (see backend docs for the constructor shape).
86///
87/// The trait is intentionally tiny in `0.0.1`. Expect additions
88/// (per-utterance reset, hot-swappable config, metric hooks) as real
89/// backends land in later releases.
pub trait StreamingAsr: Send {
    /// Push audio into the stream.
    ///
    /// Returns synchronously; transcript events are delivered on the
    /// backend's receiver, not as a return value here.
    ///
    /// # Errors
    ///
    /// Returns an [`AsrError`] on fatal backend failure; non-fatal
    /// problems surface as [`TranscriptEvent::Warning`] instead.
    fn push_audio(&mut self, frame: &AudioFrame, channel: Channel) -> Result<(), AsrError>;

    /// Signal end-of-stream. The backend should flush any remaining
    /// audio and emit a terminal [`Final`](TranscriptEvent::Final) per
    /// channel where applicable.
    ///
    /// # Errors
    ///
    /// Returns an [`AsrError`] if the backend fails while flushing.
    fn finish(&mut self) -> Result<(), AsrError>;

    /// Reset per-channel utterance state.
    ///
    /// Cheap on local backends; network-backed backends may drop and
    /// recreate their socket. The contract is only that the next
    /// `push_audio(frame, channel)` starts a fresh utterance on `channel`.
    ///
    /// # Errors
    ///
    /// Returns an [`AsrError`] if the backend cannot re-establish a
    /// fresh utterance state (e.g. a reconnect fails).
    fn reset(&mut self, channel: Channel) -> Result<(), AsrError>;
}
108}