Skip to main content

wavekat_turn/
lib.rs

1//! # wavekat-turn
2//!
3//! Unified turn detection with multiple backends.
4//!
5//! Provides a clean abstraction over turn-detection models that predict
6//! whether a user has finished speaking. Two trait families cover the
7//! two fundamental input modalities:
8//!
9//! - [`AudioTurnDetector`] — operates on raw audio frames (e.g. Pipecat Smart Turn)
10//! - [`TextTurnDetector`] — operates on ASR transcript text (e.g. LiveKit EOU)
11//!
12//! # Feature flags
13//!
14//! | Feature | Backend | Input |
15//! |---------|---------|-------|
16//! | `pipecat` | Pipecat Smart Turn v3 (ONNX) | Audio (16 kHz) |
17//! | `livekit` | LiveKit Turn Detector (ONNX) | Text |
18
19pub mod error;
20
21#[cfg(feature = "pipecat")]
22pub mod audio;
23
24#[cfg(feature = "livekit")]
25pub mod text;
26
27pub use error::TurnError;
28pub use wavekat_core::AudioFrame;
29
/// The predicted turn state.
///
/// This is the `state` carried by a [`TurnPrediction`]. The enum is a plain
/// field-less value type, so it is cheap to copy, compare, and use as a key
/// in hashed collections (e.g. to count predictions per state).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TurnState {
    /// User is done speaking — AI should respond.
    Finished,
    /// User is still speaking or thinking.
    Unfinished,
    /// User explicitly asked the AI to wait.
    Wait,
}
40
41/// A turn detection prediction with confidence and timing metadata.
42#[derive(Debug, Clone)]
43pub struct TurnPrediction {
44    pub state: TurnState,
45    pub confidence: f32,
46    pub latency_ms: u64,
47}
48
49/// A single turn in the conversation, for context-aware text detectors.
50#[derive(Debug, Clone)]
51pub struct ConversationTurn {
52    pub role: Role,
53    pub text: String,
54}
55
/// Speaker role in a conversation turn.
///
/// A plain field-less value type: cheap to copy, comparable, and usable as a
/// key in hashed collections (e.g. to group turns by speaker).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Role {
    /// The turn was produced by the user.
    User,
    /// The turn was produced by the assistant.
    Assistant,
}
62
63/// Turn detector that operates on raw audio.
64///
65/// Implementations buffer audio internally and run prediction on demand.
66/// The typical flow with VAD:
67///
68/// 1. **Every audio chunk** → [`push_audio`](AudioTurnDetector::push_audio)
69/// 2. **VAD fires "speech started"** → [`reset`](AudioTurnDetector::reset)
70/// 3. **VAD fires "speech stopped"** → [`predict`](AudioTurnDetector::predict)
71pub trait AudioTurnDetector: Send + Sync {
72    /// Feed audio into the internal buffer.
73    ///
74    /// Call continuously with incoming audio frames (16 kHz mono).
75    fn push_audio(&mut self, frame: &AudioFrame);
76
77    /// Run prediction on buffered audio.
78    ///
79    /// Call when VAD detects end of speech.
80    fn predict(&mut self) -> Result<TurnPrediction, TurnError>;
81
82    /// Clear the internal buffer. Call when a new speech turn begins.
83    fn reset(&mut self);
84}
85
86/// Turn detector that operates on ASR transcript text.
87///
88/// Implementations receive the current (possibly partial) transcript
89/// and optionally prior conversation turns for context.
90pub trait TextTurnDetector: Send + Sync {
91    fn predict_text(
92        &mut self,
93        transcript: &str,
94        context: &[ConversationTurn],
95    ) -> Result<TurnPrediction, TurnError>;
96    fn reset(&mut self);
97}