// wavekat_turn/lib.rs
1//! # wavekat-turn
2//!
3//! Unified turn detection with multiple backends.
4//!
5//! Provides a clean abstraction over turn-detection models that predict
6//! whether a user has finished speaking. Two trait families cover the
7//! two fundamental input modalities:
8//!
9//! - [`AudioTurnDetector`] — operates on raw audio frames (e.g. Pipecat Smart Turn)
10//! - [`TextTurnDetector`] — operates on ASR transcript text (e.g. LiveKit EOU)
11//!
12//! For most use cases, wrap a detector in [`TurnController`] to get
13//! automatic state tracking and soft-reset logic for VAD integration.
14//! See [`controller`] for details.
15//!
16//! # Feature flags
17//!
18//! | Feature | Backend | Input |
19//! |---------|---------|-------|
20//! | `pipecat` | Pipecat Smart Turn v3 (ONNX, embedded) | Audio (16 kHz) |
21//! | `wavekat-smart-turn` | WaveKat language-specialized fine-tunes (ONNX, runtime download) | Audio (16 kHz) |
22//! | `livekit` | LiveKit Turn Detector (ONNX) | Text |
23//!
24//! `wavekat-smart-turn` implies `pipecat` and adds an `hf-hub` runtime
25//! dependency. Weights live in
26//! [`wavekat/smart-turn-ONNX`](https://huggingface.co/wavekat/smart-turn-ONNX)
27//! and are cached under `$HF_HOME/hub/`. Set `WAVEKAT_TURN_MODEL_DIR` to a
28//! directory containing `<lang>/smart-turn-cpu.onnx` to skip the download.
29
30pub mod controller;
31pub mod error;
32
33#[cfg(any(feature = "pipecat", feature = "livekit"))]
34pub(crate) mod onnx;
35
36#[cfg(feature = "pipecat")]
37pub mod audio;
38
39#[cfg(feature = "livekit")]
40pub mod text;
41
42pub use controller::TurnController;
43pub use error::TurnError;
44pub use wavekat_core::AudioFrame;
45
/// The predicted turn state.
///
/// Fieldless and `Copy`; derives `Hash` so states can be used directly
/// as `HashMap`/`HashSet` keys (e.g. when aggregating prediction counts).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TurnState {
    /// User is done speaking — AI should respond.
    Finished,
    /// User is still speaking or thinking.
    Unfinished,
    /// User explicitly asked the AI to wait.
    Wait,
}
56
/// Per-stage timing entry.
///
/// Derives `PartialEq` (comparing both `name` and `us`) so entries can be
/// compared in tests and assertions; `Eq`/`Hash` are intentionally omitted
/// because `us` is an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct StageTiming {
    /// Stage name (e.g. "audio_prep", "mel", "onnx").
    pub name: &'static str,
    /// Time in microseconds for this stage.
    pub us: f64,
}
65
/// A turn detection prediction with confidence and timing metadata.
#[derive(Debug, Clone)]
pub struct TurnPrediction {
    /// The predicted turn state.
    pub state: TurnState,
    /// Backend confidence in `state` — presumably a probability in
    /// `[0.0, 1.0]`, but the range is backend-defined; confirm per backend.
    pub confidence: f32,
    /// End-to-end prediction latency in milliseconds.
    pub latency_ms: u64,
    /// Per-stage timing breakdown in pipeline order.
    pub stage_times: Vec<StageTiming>,
    /// Duration of audio in the detector's buffer at prediction time (ms).
    ///
    /// For PipecatSmartTurn this reflects how much of the 8 s ring buffer
    /// was filled. With soft reset the buffer may span multiple speech
    /// segments, so this can exceed the current segment duration.
    pub audio_duration_ms: u64,
}
81
/// A single turn in the conversation, for context-aware text detectors.
#[derive(Debug, Clone)]
pub struct ConversationTurn {
    /// Who produced this turn (user or assistant).
    pub role: Role,
    /// The transcript text of the turn.
    pub text: String,
}
88
/// Speaker role in a conversation turn.
///
/// Fieldless and `Copy`; derives `Hash` so roles can key maps/sets
/// (e.g. per-speaker statistics).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Role {
    /// The human user speaking to the assistant.
    User,
    /// The AI assistant responding to the user.
    Assistant,
}
95
/// Turn detector that operates on raw audio.
///
/// Implementations buffer audio internally and run prediction on demand.
///
/// **Most users should wrap this in [`TurnController`]** rather than calling
/// these methods directly. The controller tracks prediction state and provides
/// [`reset_if_finished`](TurnController::reset_if_finished) for correct
/// multi-utterance handling.
///
/// # Direct usage (advanced)
///
/// If you need full control over reset logic:
///
/// 1. **Every audio chunk** → [`push_audio`](AudioTurnDetector::push_audio)
/// 2. **VAD fires "speech stopped"** → [`predict`](AudioTurnDetector::predict)
/// 3. **New turn begins** → [`reset`](AudioTurnDetector::reset)
///
/// Note: calling `reset` unconditionally on every VAD speech-start will discard
/// audio context when the user pauses mid-sentence. See [`TurnController`] for
/// the recommended approach.
pub trait AudioTurnDetector: Send + Sync {
    /// Feed audio into the internal buffer.
    ///
    /// Call continuously with incoming audio frames (16 kHz mono).
    fn push_audio(&mut self, frame: &AudioFrame);

    /// Run prediction on buffered audio.
    ///
    /// Call when VAD detects end of speech. The buffer is **not** cleared
    /// after prediction — call [`reset`](AudioTurnDetector::reset) explicitly
    /// when starting a new turn.
    ///
    /// # Errors
    ///
    /// Returns [`TurnError`] when the backend cannot produce a prediction
    /// (e.g. inference failure) — see the implementation for specifics.
    fn predict(&mut self) -> Result<TurnPrediction, TurnError>;

    /// Unconditionally clear the internal buffer.
    ///
    /// Use when you are certain a new turn is starting (e.g. after the
    /// assistant finishes responding). For VAD speech-start events where
    /// the user may be continuing, prefer
    /// [`TurnController::reset_if_finished`].
    fn reset(&mut self);
}
137
/// Turn detector that operates on ASR transcript text.
///
/// Implementations receive the current (possibly partial) transcript
/// and optionally prior conversation turns for context.
pub trait TextTurnDetector: Send + Sync {
    /// Predict whether `transcript` ends the user's turn.
    ///
    /// `transcript` is the current (possibly partial) ASR text for the
    /// in-progress user turn. `context` carries prior conversation turns —
    /// pass an empty slice when no history is available. NOTE(review):
    /// expected ordering (oldest-first vs newest-first) is not specified
    /// here — confirm against the backend implementations.
    ///
    /// # Errors
    ///
    /// Returns [`TurnError`] when the backend cannot produce a prediction
    /// (e.g. inference failure).
    fn predict_text(
        &mut self,
        transcript: &str,
        context: &[ConversationTurn],
    ) -> Result<TurnPrediction, TurnError>;

    /// Clear any internal state so the detector starts the next turn fresh.
    fn reset(&mut self);
}
149}