// wavekat_turn/lib.rs
1//! # wavekat-turn
2//!
3//! Unified turn detection with multiple backends.
4//!
5//! Provides a clean abstraction over turn-detection models that predict
6//! whether a user has finished speaking. Two trait families cover the
7//! two fundamental input modalities:
8//!
9//! - [`AudioTurnDetector`] — operates on raw audio frames (e.g. Pipecat Smart Turn)
10//! - [`TextTurnDetector`] — operates on ASR transcript text (e.g. LiveKit EOU)
11//!
12//! # Feature flags
13//!
14//! | Feature | Backend | Input |
15//! |---------|---------|-------|
16//! | `pipecat` | Pipecat Smart Turn v3 (ONNX) | Audio (16 kHz) |
17//! | `livekit` | LiveKit Turn Detector (ONNX) | Text |
18
19pub mod error;
20
21#[cfg(feature = "pipecat")]
22pub mod audio;
23
24#[cfg(feature = "livekit")]
25pub mod text;
26
27pub use error::TurnError;
28pub use wavekat_core::AudioFrame;
29
/// The predicted turn state.
///
/// `Hash` is derived alongside `Copy`/`Eq` so states can be used directly
/// as map/set keys (e.g. per-state counters or metrics).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TurnState {
    /// User is done speaking — AI should respond.
    Finished,
    /// User is still speaking or thinking.
    Unfinished,
    /// User explicitly asked the AI to wait.
    Wait,
}
40
/// A turn detection prediction with confidence and timing metadata.
#[derive(Debug, Clone)]
pub struct TurnPrediction {
    // The predicted turn state for this utterance.
    pub state: TurnState,
    // Backend-reported confidence in `state`.
    // NOTE(review): presumably a probability in [0.0, 1.0] — confirm per backend.
    pub confidence: f32,
    // Time taken to produce this prediction, in milliseconds.
    // NOTE(review): assumed to cover model inference only — confirm what
    // interval the implementing backend actually measures.
    pub latency_ms: u64,
}
48
/// A single turn in the conversation, for context-aware text detectors.
#[derive(Debug, Clone)]
pub struct ConversationTurn {
    // Who produced this turn (user or assistant).
    pub role: Role,
    // The transcript text spoken/written during this turn.
    pub text: String,
}
55
/// Speaker role in a conversation turn.
///
/// `Hash` is derived alongside `Copy`/`Eq` so roles can be used directly
/// as map/set keys, matching the derive set of [`TurnState`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Role {
    /// The human speaker.
    User,
    /// The AI speaker.
    Assistant,
}
62
/// Turn detector that operates on raw audio.
///
/// Implementations buffer audio internally and run prediction on demand.
/// The typical flow with VAD:
///
/// 1. **Every audio chunk** → [`push_audio`](AudioTurnDetector::push_audio)
/// 2. **VAD fires "speech started"** → [`reset`](AudioTurnDetector::reset)
/// 3. **VAD fires "speech stopped"** → [`predict`](AudioTurnDetector::predict)
pub trait AudioTurnDetector: Send + Sync {
    /// Feed audio into the internal buffer.
    ///
    /// Call continuously with incoming audio frames (16 kHz mono).
    // NOTE(review): buffering policy (ring buffer vs. unbounded) is left to
    // the implementation — not specified by this trait.
    fn push_audio(&mut self, frame: &AudioFrame);

    /// Run prediction on buffered audio.
    ///
    /// Call when VAD detects end of speech.
    ///
    /// # Errors
    ///
    /// Returns a [`TurnError`] when a prediction cannot be produced; see the
    /// `error` module for the concrete failure modes.
    fn predict(&mut self) -> Result<TurnPrediction, TurnError>;

    /// Clear the internal buffer. Call when a new speech turn begins.
    fn reset(&mut self);
}
85
86/// Turn detector that operates on ASR transcript text.
87///
88/// Implementations receive the current (possibly partial) transcript
89/// and optionally prior conversation turns for context.
90pub trait TextTurnDetector: Send + Sync {
91 fn predict_text(
92 &mut self,
93 transcript: &str,
94 context: &[ConversationTurn],
95 ) -> Result<TurnPrediction, TurnError>;
96 fn reset(&mut self);
97}