// wavekat_turn/lib.rs

//! # wavekat-turn
//!
//! Unified turn detection with multiple backends.
//!
//! Provides a clean abstraction over turn-detection models that predict
//! whether a user has finished speaking. Two trait families cover the
//! two fundamental input modalities:
//!
//! - [`AudioTurnDetector`] — operates on raw audio frames (e.g. Pipecat Smart Turn)
//! - [`TextTurnDetector`] — operates on ASR transcript text (e.g. LiveKit EOU)
//!
//! For most use cases, wrap a detector in [`TurnController`] to get
//! automatic state tracking and soft-reset logic for VAD integration.
//! See [`controller`] for details.
//!
//! # Feature flags
//!
//! | Feature | Backend | Input |
//! |---------|---------|-------|
//! | `pipecat` | Pipecat Smart Turn v3 (ONNX) | Audio (16 kHz) |
//! | `livekit` | LiveKit Turn Detector (ONNX) | Text |
23pub mod controller;
24pub mod error;
25
26#[cfg(any(feature = "pipecat", feature = "livekit"))]
27pub(crate) mod onnx;
28
29#[cfg(feature = "pipecat")]
30pub mod audio;
31
32#[cfg(feature = "livekit")]
33pub mod text;
34
35pub use controller::TurnController;
36pub use error::TurnError;
37pub use wavekat_core::AudioFrame;
38
/// The predicted turn state.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TurnState {
    /// User is done speaking — AI should respond.
    Finished,
    /// User is still speaking or thinking.
    Unfinished,
    /// User explicitly asked the AI to wait.
    Wait,
}
49
50/// Per-stage timing entry.
51#[derive(Debug, Clone)]
52pub struct StageTiming {
53 /// Stage name (e.g. "audio_prep", "mel", "onnx").
54 pub name: &'static str,
55 /// Time in microseconds for this stage.
56 pub us: f64,
57}
58
59/// A turn detection prediction with confidence and timing metadata.
60#[derive(Debug, Clone)]
61pub struct TurnPrediction {
62 pub state: TurnState,
63 pub confidence: f32,
64 pub latency_ms: u64,
65 /// Per-stage timing breakdown in pipeline order.
66 pub stage_times: Vec<StageTiming>,
67 /// Duration of audio in the detector's buffer at prediction time (ms).
68 ///
69 /// For PipecatSmartTurn this reflects how much of the 8 s ring buffer
70 /// was filled. With soft reset the buffer may span multiple speech
71 /// segments, so this can exceed the current segment duration.
72 pub audio_duration_ms: u64,
73}
74
75/// A single turn in the conversation, for context-aware text detectors.
76#[derive(Debug, Clone)]
77pub struct ConversationTurn {
78 pub role: Role,
79 pub text: String,
80}
81
82/// Speaker role in a conversation turn.
83#[derive(Debug, Clone, Copy, PartialEq, Eq)]
84pub enum Role {
85 User,
86 Assistant,
87}
88
89/// Turn detector that operates on raw audio.
90///
91/// Implementations buffer audio internally and run prediction on demand.
92///
93/// **Most users should wrap this in [`TurnController`]** rather than calling
94/// these methods directly. The controller tracks prediction state and provides
95/// [`reset_if_finished`](TurnController::reset_if_finished) for correct
96/// multi-utterance handling.
97///
98/// # Direct usage (advanced)
99///
100/// If you need full control over reset logic:
101///
102/// 1. **Every audio chunk** → [`push_audio`](AudioTurnDetector::push_audio)
103/// 2. **VAD fires "speech stopped"** → [`predict`](AudioTurnDetector::predict)
104/// 3. **New turn begins** → [`reset`](AudioTurnDetector::reset)
105///
106/// Note: calling `reset` unconditionally on every VAD speech-start will discard
107/// audio context when the user pauses mid-sentence. See [`TurnController`] for
108/// the recommended approach.
109pub trait AudioTurnDetector: Send + Sync {
110 /// Feed audio into the internal buffer.
111 ///
112 /// Call continuously with incoming audio frames (16 kHz mono).
113 fn push_audio(&mut self, frame: &AudioFrame);
114
115 /// Run prediction on buffered audio.
116 ///
117 /// Call when VAD detects end of speech. The buffer is **not** cleared
118 /// after prediction — call [`reset`](AudioTurnDetector::reset) explicitly
119 /// when starting a new turn.
120 fn predict(&mut self) -> Result<TurnPrediction, TurnError>;
121
122 /// Unconditionally clear the internal buffer.
123 ///
124 /// Use when you are certain a new turn is starting (e.g. after the
125 /// assistant finishes responding). For VAD speech-start events where
126 /// the user may be continuing, prefer
127 /// [`TurnController::reset_if_finished`].
128 fn reset(&mut self);
129}
130
131/// Turn detector that operates on ASR transcript text.
132///
133/// Implementations receive the current (possibly partial) transcript
134/// and optionally prior conversation turns for context.
135pub trait TextTurnDetector: Send + Sync {
136 fn predict_text(
137 &mut self,
138 transcript: &str,
139 context: &[ConversationTurn],
140 ) -> Result<TurnPrediction, TurnError>;
141 fn reset(&mut self);
142}