Skip to main content

wavekat_asr/
lib.rs

1//! # wavekat-asr
2//!
3//! Streaming ASR trait surface, intended to wrap one or more speech-to-text
4//! backends behind a common Rust API. Modeled on the same pattern as
5//! [`wavekat-vad`] and [`wavekat-turn`].
6//!
7//! [`wavekat-vad`]: https://crates.io/crates/wavekat-vad
8//! [`wavekat-turn`]: https://crates.io/crates/wavekat-turn
9//!
10//! # Status
11//!
12//! This crate is pre-1.0. The trait surface may iterate as more
13//! backends land. Pin to an exact patch version.
14//!
15//! The bundled backend is [`backends::sherpa_onnx`] (behind the
16//! `sherpa-onnx` Cargo feature): a local streaming Zipformer that
17//! auto-downloads its model from HuggingFace on first use.
18
19pub mod backends;
20pub mod error;
21
22pub use error::AsrError;
23pub use wavekat_core::AudioFrame;
24
/// Which side of a two-channel call the audio (or transcript) belongs to.
///
/// The daemon tees both RTP directions through one ASR instance, so every
/// event needs to carry the channel it came from.
///
/// `Channel` is a small `Copy` tag. It derives `Hash` alongside `Eq` so
/// per-channel state can be keyed by it in a `HashMap` or `HashSet`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Channel {
    /// Audio captured from the local mic — what the user said.
    Local,
    /// Audio received over RTP — what the remote party said.
    Remote,
}
36
37/// One transcript event emitted by a [`StreamingAsr`] backend.
38#[derive(Debug, Clone)]
39pub enum TranscriptEvent {
40    /// Backend has begun receiving speech on this channel.
41    ///
42    /// Optional — not every backend emits this; consumers must not gate
43    /// finals on having seen a `SpeechStarted` first.
44    SpeechStarted { channel: Channel, ts_ms: u64 },
45
46    /// Backend has detected end of speech on this channel.
47    ///
48    /// Optional, same caveat as [`SpeechStarted`](TranscriptEvent::SpeechStarted).
49    SpeechEnded { channel: Channel, ts_ms: u64 },
50
51    /// In-flight transcript that may be revised before becoming
52    /// [`Final`](TranscriptEvent::Final). Render but do not persist.
53    Partial {
54        channel: Channel,
55        /// Stream start time in ms.
56        ts_ms: u64,
57        text: String,
58    },
59
60    /// Stable transcript for a segment. Persist this; drop any partials
61    /// that share the same channel and overlapping `ts_ms..end_ms`.
62    Final {
63        channel: Channel,
64        ts_ms: u64,
65        end_ms: u64,
66        text: String,
67        /// Backend-reported confidence in `[0.0, 1.0]`. Backends that
68        /// don't report confidence should emit `1.0`.
69        confidence: f32,
70    },
71
72    /// Backend hit a non-fatal error and continues. Fatal errors come
73    /// back from [`StreamingAsr::push_audio`] / [`StreamingAsr::finish`]
74    /// as `Err`.
75    Warning(String),
76}
77
78/// A streaming ASR session.
79///
80/// Implementations are expected to:
81///
82/// - Accept any [`AudioFrame`] sample rate; resample internally.
83/// - Be `Send` so the daemon can move them between tasks.
84/// - Emit [`TranscriptEvent`]s via the receiver returned at construction
85///   time (see backend docs for the constructor shape).
86///
87/// The trait is intentionally tiny in `0.0.1`. Expect additions
88/// (per-utterance reset, hot-swappable config, metric hooks) as real
89/// backends land in later releases.
90pub trait StreamingAsr: Send {
91    /// Push audio into the stream.
92    ///
93    /// Returns synchronously; transcript events are delivered on the
94    /// backend's receiver, not as a return value here.
95    fn push_audio(&mut self, frame: &AudioFrame, channel: Channel) -> Result<(), AsrError>;
96
97    /// Signal end-of-stream. The backend should flush any remaining
98    /// audio and emit a terminal [`Final`](TranscriptEvent::Final) per
99    /// channel where applicable.
100    fn finish(&mut self) -> Result<(), AsrError>;
101
102    /// Reset per-channel utterance state.
103    ///
104    /// Cheap on local backends; network-backed backends may drop and
105    /// recreate their socket. The contract is only that the next
106    /// `push_audio(frame, channel)` starts a fresh utterance on `channel`.
107    fn reset(&mut self, channel: Channel) -> Result<(), AsrError>;
108}