// wavekat_turn — crate root (lib.rs)
1//! # wavekat-turn
2//!
3//! Unified turn detection with multiple backends.
4//!
5//! Provides a clean abstraction over turn-detection models that predict
6//! whether a user has finished speaking. Two trait families cover the
7//! two fundamental input modalities:
8//!
9//! - [`AudioTurnDetector`] — operates on raw audio frames (e.g. Pipecat Smart Turn)
10//! - [`TextTurnDetector`] — operates on ASR transcript text (e.g. LiveKit EOU)
11//!
12//! For most use cases, wrap a detector in [`TurnController`] to get
13//! automatic state tracking and soft-reset logic for VAD integration.
14//! See [`controller`] for details.
15//!
16//! # Feature flags
17//!
18//! | Feature | Backend | Input |
19//! |---------|---------|-------|
20//! | `pipecat` | Pipecat Smart Turn v3 (ONNX, embedded) | Audio (16 kHz) |
21//! | `wavekat-smart-turn` | WaveKat language-specialized fine-tunes (ONNX, runtime download) | Audio (16 kHz) |
22//! | `livekit` | LiveKit Turn Detector (ONNX) | Text |
23//!
24//! `wavekat-smart-turn` implies `pipecat` and adds an `hf-hub` runtime
25//! dependency. Weights live in
26//! [`wavekat/smart-turn-ONNX`](https://huggingface.co/wavekat/smart-turn-ONNX)
27//! and are cached under `$HF_HOME/hub/`. Set `WAVEKAT_TURN_MODEL_DIR` to a
28//! directory containing `<lang>/smart-turn-cpu.onnx` to skip the download.
29
30pub mod controller;
31pub mod error;
32
33#[cfg(any(feature = "pipecat", feature = "livekit"))]
34pub(crate) mod onnx;
35
36#[cfg(feature = "pipecat")]
37pub mod audio;
38
39#[cfg(feature = "livekit")]
40pub mod text;
41
42pub use controller::TurnController;
43pub use error::TurnError;
44pub use wavekat_core::AudioFrame;
45
/// The predicted turn state.
///
/// `Hash` is derived alongside `Eq`/`Copy` so the state can be used directly
/// as a key in hash-based collections (e.g. per-state counters in metrics).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TurnState {
    /// User is done speaking — AI should respond.
    Finished,
    /// User is still speaking or thinking.
    Unfinished,
    /// User explicitly asked the AI to wait.
    Wait,
}
56
/// Timing for a single pipeline stage of a prediction.
#[derive(Clone, Debug)]
pub struct StageTiming {
    /// Identifier of the stage (e.g. "audio_prep", "mel", "onnx").
    pub name: &'static str,
    /// Wall-clock time spent in this stage, in microseconds.
    pub us: f64,
}
65
66/// A turn detection prediction with confidence and timing metadata.
67#[derive(Debug, Clone)]
68pub struct TurnPrediction {
69    pub state: TurnState,
70    pub confidence: f32,
71    pub latency_ms: u64,
72    /// Per-stage timing breakdown in pipeline order.
73    pub stage_times: Vec<StageTiming>,
74    /// Duration of audio in the detector's buffer at prediction time (ms).
75    ///
76    /// For PipecatSmartTurn this reflects how much of the 8 s ring buffer
77    /// was filled. With soft reset the buffer may span multiple speech
78    /// segments, so this can exceed the current segment duration.
79    pub audio_duration_ms: u64,
80}
81
82/// A single turn in the conversation, for context-aware text detectors.
83#[derive(Debug, Clone)]
84pub struct ConversationTurn {
85    pub role: Role,
86    pub text: String,
87}
88
/// Speaker role in a conversation turn.
///
/// `Hash` is derived alongside `Eq`/`Copy` so roles can be used directly
/// as keys in hash-based collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Role {
    /// The human end user.
    User,
    /// The AI assistant.
    Assistant,
}
95
/// Turn detector that operates on raw audio.
///
/// Implementations buffer audio internally and run prediction on demand.
///
/// **Most users should wrap this in [`TurnController`]** rather than calling
/// these methods directly. The controller tracks prediction state and provides
/// [`reset_if_finished`](TurnController::reset_if_finished) for correct
/// multi-utterance handling.
///
/// # Direct usage (advanced)
///
/// If you need full control over reset logic:
///
/// 1. **Every audio chunk** → [`push_audio`](AudioTurnDetector::push_audio)
/// 2. **VAD fires "speech stopped"** → [`predict`](AudioTurnDetector::predict)
/// 3. **New turn begins** → [`reset`](AudioTurnDetector::reset)
///
/// Note: calling `reset` unconditionally on every VAD speech-start will discard
/// audio context when the user pauses mid-sentence. See [`TurnController`] for
/// the recommended approach.
pub trait AudioTurnDetector: Send + Sync {
    /// Feed audio into the internal buffer.
    ///
    /// Call continuously with incoming audio frames (16 kHz mono).
    fn push_audio(&mut self, frame: &AudioFrame);

    /// Run prediction on buffered audio.
    ///
    /// Call when VAD detects end of speech. The buffer is **not** cleared
    /// after prediction — call [`reset`](AudioTurnDetector::reset) explicitly
    /// when starting a new turn.
    ///
    /// # Errors
    ///
    /// Returns a [`TurnError`] when the backend cannot produce a prediction
    /// (backend-specific — e.g. inference failure; see the implementation's
    /// docs for exact conditions).
    fn predict(&mut self) -> Result<TurnPrediction, TurnError>;

    /// Unconditionally clear the internal buffer.
    ///
    /// Use when you are certain a new turn is starting (e.g. after the
    /// assistant finishes responding). For VAD speech-start events where
    /// the user may be continuing, prefer
    /// [`TurnController::reset_if_finished`].
    fn reset(&mut self);
}
137
/// Turn detector that operates on ASR transcript text.
///
/// Implementations receive the current (possibly partial) transcript
/// and optionally prior conversation turns for context.
pub trait TextTurnDetector: Send + Sync {
    /// Predict the turn state from the current transcript.
    ///
    /// `transcript` is the in-progress utterance as produced by ASR;
    /// `context` holds prior conversation turns (may be empty — detectors
    /// that ignore history can disregard it).
    ///
    /// # Errors
    ///
    /// Returns a [`TurnError`] when the backend cannot produce a prediction
    /// (backend-specific; see the implementation's docs).
    fn predict_text(
        &mut self,
        transcript: &str,
        context: &[ConversationTurn],
    ) -> Result<TurnPrediction, TurnError>;

    /// Reset internal state so the next prediction starts a fresh turn.
    fn reset(&mut self);
}