wavekat_asr/lib.rs
1//! # wavekat-asr
2//!
3//! Streaming ASR trait surface, intended to wrap one or more speech-to-text
4//! backends behind a common Rust API. Modeled on the same pattern as
5//! [`wavekat-vad`] and [`wavekat-turn`].
6//!
7//! [`wavekat-vad`]: https://crates.io/crates/wavekat-vad
8//! [`wavekat-turn`]: https://crates.io/crates/wavekat-turn
9//!
10//! # Status
11//!
12//! This crate is pre-1.0. The trait surface may iterate as more
13//! backends land. Pin to an exact patch version.
14//!
15//! The bundled backend is [`backends::sherpa_onnx`] (behind the
16//! `sherpa-onnx` Cargo feature): a local streaming Zipformer that
17//! auto-downloads its model from HuggingFace on first use.
18
19pub mod backends;
20#[cfg(feature = "download")]
21pub mod download;
22pub mod error;
23
24#[cfg(feature = "download")]
25pub use download::DownloadProgress;
26pub use error::AsrError;
27pub use wavekat_core::AudioFrame;
28
/// Identifies which side of a two-channel call a piece of audio (or the
/// transcript derived from it) belongs to.
///
/// The daemon tees both RTP directions through one ASR instance, so every
/// event needs to carry the channel it came from.
///
/// `Channel` is a small `Copy` value that also implements `Hash`, so it can
/// be used directly as a key for per-channel state in a `HashMap` or
/// `HashSet`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Channel {
    /// Audio captured from the local mic — what the user said.
    Local,
    /// Audio received over RTP — what the remote party said.
    Remote,
}
40
41/// One transcript event emitted by a [`StreamingAsr`] backend.
42#[derive(Debug, Clone)]
43pub enum TranscriptEvent {
44 /// Backend has begun receiving speech on this channel.
45 ///
46 /// Optional — not every backend emits this; consumers must not gate
47 /// finals on having seen a `SpeechStarted` first.
48 SpeechStarted { channel: Channel, ts_ms: u64 },
49
50 /// Backend has detected end of speech on this channel.
51 ///
52 /// Optional, same caveat as [`SpeechStarted`](TranscriptEvent::SpeechStarted).
53 SpeechEnded { channel: Channel, ts_ms: u64 },
54
55 /// In-flight transcript that may be revised before becoming
56 /// [`Final`](TranscriptEvent::Final). Render but do not persist.
57 Partial {
58 channel: Channel,
59 /// Stream start time in ms.
60 ts_ms: u64,
61 text: String,
62 },
63
64 /// Stable transcript for a segment. Persist this; drop any partials
65 /// that share the same channel and overlapping `ts_ms..end_ms`.
66 Final {
67 channel: Channel,
68 ts_ms: u64,
69 end_ms: u64,
70 text: String,
71 /// Backend-reported confidence in `[0.0, 1.0]`. Backends that
72 /// don't report confidence should emit `1.0`.
73 confidence: f32,
74 },
75
76 /// Backend hit a non-fatal error and continues. Fatal errors come
77 /// back from [`StreamingAsr::push_audio`] / [`StreamingAsr::finish`]
78 /// as `Err`.
79 Warning(String),
80}
81
82/// A streaming ASR session.
83///
84/// Implementations are expected to:
85///
86/// - Accept any [`AudioFrame`] sample rate; resample internally.
87/// - Be `Send` so the daemon can move them between tasks.
88/// - Emit [`TranscriptEvent`]s via the receiver returned at construction
89/// time (see backend docs for the constructor shape).
90///
91/// The trait is intentionally tiny in `0.0.1`. Expect additions
92/// (per-utterance reset, hot-swappable config, metric hooks) as real
93/// backends land in later releases.
94pub trait StreamingAsr: Send {
95 /// Push audio into the stream.
96 ///
97 /// Returns synchronously; transcript events are delivered on the
98 /// backend's receiver, not as a return value here.
99 fn push_audio(&mut self, frame: &AudioFrame, channel: Channel) -> Result<(), AsrError>;
100
101 /// Signal end-of-stream. The backend should flush any remaining
102 /// audio and emit a terminal [`Final`](TranscriptEvent::Final) per
103 /// channel where applicable.
104 fn finish(&mut self) -> Result<(), AsrError>;
105
106 /// Reset per-channel utterance state.
107 ///
108 /// Cheap on local backends; network-backed backends may drop and
109 /// recreate their socket. The contract is only that the next
110 /// `push_audio(frame, channel)` starts a fresh utterance on `channel`.
111 fn reset(&mut self, channel: Channel) -> Result<(), AsrError>;
112}