// wavekat_turn/lib.rs
1//! # wavekat-turn
2//!
3//! Unified turn detection with multiple backends.
4//!
5//! Provides a clean abstraction over turn-detection models that predict
6//! whether a user has finished speaking. Two trait families cover the
7//! two fundamental input modalities:
8//!
9//! - [`AudioTurnDetector`] — operates on raw audio frames (e.g. Pipecat Smart Turn)
10//! - [`TextTurnDetector`] — operates on ASR transcript text (e.g. LiveKit EOU)
11//!
12//! # Feature flags
13//!
14//! | Feature | Backend | Input |
15//! |---------|---------|-------|
16//! | `pipecat` | Pipecat Smart Turn v3 (ONNX) | Audio (16 kHz) |
17//! | `livekit` | LiveKit Turn Detector (ONNX) | Text |
18
19pub mod error;
20
21#[cfg(any(feature = "pipecat", feature = "livekit"))]
22pub(crate) mod onnx;
23
24#[cfg(feature = "pipecat")]
25pub mod audio;
26
27#[cfg(feature = "livekit")]
28pub mod text;
29
30pub use error::TurnError;
31pub use wavekat_core::AudioFrame;
32
/// The predicted turn state.
///
/// Derives `Hash` (in addition to `Eq`/`Copy`) so callers can use the
/// state as a key in `HashMap`/`HashSet`, e.g. for per-state metrics.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TurnState {
    /// User is done speaking — AI should respond.
    Finished,
    /// User is still speaking or thinking.
    Unfinished,
    /// User explicitly asked the AI to wait.
    Wait,
}
43
/// Per-stage timing entry.
///
/// Both fields are `Copy` (`&'static str` + `f64`), so the struct derives
/// `Copy` as well — passing it by value is a cheap bitwise copy.
/// `PartialEq` is derived for use in tests and comparisons (`Eq` is not
/// possible because `f64` is only `PartialEq`).
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct StageTiming {
    /// Stage name (e.g. "audio_prep", "mel", "onnx").
    pub name: &'static str,
    /// Time in microseconds for this stage.
    pub us: f64,
}
52
/// A turn detection prediction with confidence and timing metadata.
#[derive(Debug, Clone)]
pub struct TurnPrediction {
    /// The predicted turn state.
    pub state: TurnState,
    /// Backend confidence in `state`. NOTE(review): presumably a probability
    /// in `[0.0, 1.0]`, but the range is not enforced here — confirm per backend.
    pub confidence: f32,
    /// Wall-clock time the prediction took, in milliseconds.
    pub latency_ms: u64,
    /// Per-stage timing breakdown in pipeline order.
    pub stage_times: Vec<StageTiming>,
}
62
63/// A single turn in the conversation, for context-aware text detectors.
64#[derive(Debug, Clone)]
65pub struct ConversationTurn {
66 pub role: Role,
67 pub text: String,
68}
69
/// Speaker role in a conversation turn.
///
/// Derives `Hash` (in addition to `Eq`/`Copy`) so roles can key
/// `HashMap`/`HashSet` collections, consistent with [`TurnState`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Role {
    /// The human user speaking to the system.
    User,
    /// The AI assistant responding.
    Assistant,
}
76
/// Turn detector that operates on raw audio.
///
/// Implementations buffer audio internally and run prediction on demand.
/// The typical flow with VAD:
///
/// 1. **Every audio chunk** → [`push_audio`](AudioTurnDetector::push_audio)
/// 2. **VAD fires "speech started"** → [`reset`](AudioTurnDetector::reset)
/// 3. **VAD fires "speech stopped"** → [`predict`](AudioTurnDetector::predict)
pub trait AudioTurnDetector: Send + Sync {
    /// Feed audio into the internal buffer.
    ///
    /// Call continuously with incoming audio frames (16 kHz mono).
    fn push_audio(&mut self, frame: &AudioFrame);

    /// Run prediction on buffered audio.
    ///
    /// Call when VAD detects end of speech.
    ///
    /// # Errors
    ///
    /// Returns a [`TurnError`] when the backend cannot produce a
    /// prediction; the exact failure conditions are implementation-specific.
    fn predict(&mut self) -> Result<TurnPrediction, TurnError>;

    /// Clear the internal buffer. Call when a new speech turn begins.
    fn reset(&mut self);
}
99
/// Turn detector that operates on ASR transcript text.
///
/// Implementations receive the current (possibly partial) transcript
/// and optionally prior conversation turns for context.
pub trait TextTurnDetector: Send + Sync {
    /// Predict whether `transcript` represents a finished turn.
    ///
    /// `context` carries prior [`ConversationTurn`]s; pass an empty slice
    /// when no prior context exists. NOTE(review): the expected ordering
    /// (oldest-first vs. newest-first) is not enforced by the signature —
    /// confirm against implementations.
    ///
    /// # Errors
    ///
    /// Returns a [`TurnError`] when the backend cannot produce a
    /// prediction; the exact failure conditions are implementation-specific.
    fn predict_text(
        &mut self,
        transcript: &str,
        context: &[ConversationTurn],
    ) -> Result<TurnPrediction, TurnError>;

    /// Reset internal detector state. Call when a new conversation begins.
    fn reset(&mut self);
}
111}