Skip to main content

wavekat_turn/
lib.rs

//! # wavekat-turn
//!
//! Unified turn detection with multiple backends.
//!
//! Provides a clean abstraction over turn-detection models that predict
//! whether a user has finished speaking. Two trait families cover the
//! two fundamental input modalities:
//!
//! - [`AudioTurnDetector`] — operates on raw audio frames (e.g. Pipecat Smart Turn)
//! - [`TextTurnDetector`] — operates on ASR transcript text (e.g. LiveKit EOU)
//!
//! # Feature flags
//!
//! | Feature | Backend | Input |
//! |---------|---------|-------|
//! | `pipecat` | Pipecat Smart Turn v3 (ONNX) | Audio (16 kHz) |
//! | `livekit` | LiveKit Turn Detector (ONNX) | Text |
19pub mod error;
20
21#[cfg(any(feature = "pipecat", feature = "livekit"))]
22pub(crate) mod onnx;
23
24#[cfg(feature = "pipecat")]
25pub mod audio;
26
27#[cfg(feature = "livekit")]
28pub mod text;
29
30pub use error::TurnError;
31pub use wavekat_core::AudioFrame;
32
/// The predicted turn state.
///
/// Returned inside a `TurnPrediction` by both audio and text detectors.
///
/// Derives `Hash` in addition to `Copy`/`Eq` so the state can be used
/// directly as a `HashMap`/`HashSet` key (e.g. for per-state metrics).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TurnState {
    /// User is done speaking — AI should respond.
    Finished,
    /// User is still speaking or thinking.
    Unfinished,
    /// User explicitly asked the AI to wait.
    Wait,
}
43
/// Per-stage timing entry.
///
/// Both fields are small `Copy` data (`&'static str` + `f64`), so the
/// struct derives `Copy` for cheap pass-by-value. `PartialEq` supports
/// direct comparison in tests; `Eq` is impossible because `us` is an
/// `f64` (NaN is not reflexively equal).
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct StageTiming {
    /// Stage name (e.g. "audio_prep", "mel", "onnx").
    pub name: &'static str,
    /// Time in microseconds for this stage.
    pub us: f64,
}
52
/// A turn detection prediction with confidence and timing metadata.
#[derive(Debug, Clone)]
pub struct TurnPrediction {
    /// The predicted turn state.
    pub state: TurnState,
    /// Model confidence in the prediction — presumably a probability
    /// in `[0.0, 1.0]`; TODO confirm range against backend implementations.
    pub confidence: f32,
    /// Wall-clock latency of producing this prediction, in milliseconds.
    pub latency_ms: u64,
    /// Per-stage timing breakdown in pipeline order.
    pub stage_times: Vec<StageTiming>,
}
62
/// A single turn in the conversation, for context-aware text detectors.
///
/// Derives `PartialEq`/`Eq` so turns can be compared directly in tests
/// and in context-deduplication logic (both fields were already `Eq`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ConversationTurn {
    /// Who produced this turn.
    pub role: Role,
    /// The transcript text of the turn.
    pub text: String,
}

/// Speaker role in a conversation turn.
///
/// Derives `Hash` in addition to `Copy`/`Eq` so roles can serve as
/// `HashMap`/`HashSet` keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Role {
    /// The human end user.
    User,
    /// The AI assistant.
    Assistant,
}
76
/// Turn detector that operates on raw audio.
///
/// Implementations buffer audio internally and run prediction on demand.
/// The typical flow with VAD:
///
/// 1. **Every audio chunk** → [`push_audio`](AudioTurnDetector::push_audio)
/// 2. **VAD fires "speech started"** → [`reset`](AudioTurnDetector::reset)
/// 3. **VAD fires "speech stopped"** → [`predict`](AudioTurnDetector::predict)
///
/// The `Send + Sync` supertrait bound allows detectors to be moved into
/// or shared across threads; all methods take `&mut self`, so mutation
/// still requires exclusive access.
pub trait AudioTurnDetector: Send + Sync {
    /// Feed audio into the internal buffer.
    ///
    /// Call continuously with incoming audio frames (16 kHz mono).
    fn push_audio(&mut self, frame: &AudioFrame);

    /// Run prediction on buffered audio.
    ///
    /// Call when VAD detects end of speech.
    ///
    /// # Errors
    ///
    /// Returns a [`TurnError`] when the backend cannot produce a
    /// prediction; exact failure modes are implementation-defined.
    fn predict(&mut self) -> Result<TurnPrediction, TurnError>;

    /// Clear the internal buffer. Call when a new speech turn begins.
    fn reset(&mut self);
}
99
/// Turn detector that operates on ASR transcript text.
///
/// Implementations receive the current (possibly partial) transcript
/// and optionally prior conversation turns for context.
pub trait TextTurnDetector: Send + Sync {
    /// Predict whether the user has finished their turn.
    ///
    /// * `transcript` — the current (possibly partial) ASR transcript.
    /// * `context` — prior conversation turns; pass an empty slice when
    ///   no context is available. NOTE(review): presumably ordered
    ///   oldest-first — confirm against implementations.
    ///
    /// # Errors
    ///
    /// Returns a [`TurnError`] when the backend cannot produce a
    /// prediction; exact failure modes are implementation-defined.
    fn predict_text(
        &mut self,
        transcript: &str,
        context: &[ConversationTurn],
    ) -> Result<TurnPrediction, TurnError>;

    /// Reset internal state (if any). Call when a new turn begins.
    fn reset(&mut self);
}