// wavekat_turn/lib.rs

//! # wavekat-turn
//!
//! Unified turn detection with multiple backends.
//!
//! Provides a clean abstraction over turn-detection models that predict
//! whether a user has finished speaking. Two trait families cover the
//! two fundamental input modalities:
//!
//! - [`AudioTurnDetector`] — operates on raw audio frames (e.g. Pipecat Smart Turn)
//! - [`TextTurnDetector`] — operates on ASR transcript text (e.g. LiveKit EOU)
//!
//! For most use cases, wrap a detector in [`TurnController`] to get
//! automatic state tracking and soft-reset logic for VAD integration.
//! See [`controller`] for details.
//!
//! # Feature flags
//!
//! | Feature | Backend | Input |
//! |---------|---------|-------|
//! | `pipecat` | Pipecat Smart Turn v3 (ONNX) | Audio (16 kHz) |
//! | `livekit` | LiveKit Turn Detector (ONNX) | Text |
23pub mod controller;
24pub mod error;
25
26#[cfg(any(feature = "pipecat", feature = "livekit"))]
27pub(crate) mod onnx;
28
29#[cfg(feature = "pipecat")]
30pub mod audio;
31
32#[cfg(feature = "livekit")]
33pub mod text;
34
35pub use controller::TurnController;
36pub use error::TurnError;
37pub use wavekat_core::AudioFrame;
38
/// The predicted turn state.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TurnState {
    /// User is done speaking — AI should respond.
    Finished,
    /// User is still speaking or thinking.
    Unfinished,
    /// User explicitly asked the AI to wait.
    Wait,
}
49
50/// Per-stage timing entry.
51#[derive(Debug, Clone)]
52pub struct StageTiming {
53 /// Stage name (e.g. "audio_prep", "mel", "onnx").
54 pub name: &'static str,
55 /// Time in microseconds for this stage.
56 pub us: f64,
57}
58
59/// A turn detection prediction with confidence and timing metadata.
60#[derive(Debug, Clone)]
61pub struct TurnPrediction {
62 pub state: TurnState,
63 pub confidence: f32,
64 pub latency_ms: u64,
65 /// Per-stage timing breakdown in pipeline order.
66 pub stage_times: Vec<StageTiming>,
67 /// Duration of audio in the detector's buffer at prediction time (ms).
68 ///
69 /// For PipecatSmartTurn this reflects how much of the 8 s ring buffer
70 /// was filled. With soft reset the buffer may span multiple speech
71 /// segments, so this can exceed the current segment duration.
72 pub audio_duration_ms: u64,
73}
74
75/// A single turn in the conversation, for context-aware text detectors.
76#[derive(Debug, Clone)]
77pub struct ConversationTurn {
78 pub role: Role,
79 pub text: String,
80}
81
82/// Speaker role in a conversation turn.
83#[derive(Debug, Clone, Copy, PartialEq, Eq)]
84pub enum Role {
85 User,
86 Assistant,
87}
88
89/// Turn detector that operates on raw audio.
90///
91/// Implementations buffer audio internally and run prediction on demand.
92///
93/// **Most users should wrap this in [`TurnController`]** rather than calling
94/// these methods directly. The controller tracks prediction state and provides
95/// [`reset_if_finished`](TurnController::reset_if_finished) for correct
96/// multi-utterance handling.
97///
98/// # Direct usage (advanced)
99///
100/// If you need full control over reset logic:
101///
102/// 1. **Every audio chunk** → [`push_audio`](AudioTurnDetector::push_audio)
103/// 2. **VAD fires "speech stopped"** → [`predict`](AudioTurnDetector::predict)
104/// 3. **New turn begins** → [`reset`](AudioTurnDetector::reset)
105///
106/// Note: calling `reset` unconditionally on every VAD speech-start will discard
107/// audio context when the user pauses mid-sentence. See [`TurnController`] for
108/// the recommended approach.
109pub trait AudioTurnDetector: Send + Sync {
110 /// Feed audio into the internal buffer.
111 ///
112 /// Call continuously with incoming audio frames (16 kHz mono).
113 fn push_audio(&mut self, frame: &AudioFrame);
114
115 /// Run prediction on buffered audio.
116 ///
117 /// Call when VAD detects end of speech. The buffer is **not** cleared
118 /// after prediction — call [`reset`](AudioTurnDetector::reset) explicitly
119 /// when starting a new turn.
120 fn predict(&mut self) -> Result<TurnPrediction, TurnError>;
121
122 /// Unconditionally clear the internal buffer.
123 ///
124 /// Use when you are certain a new turn is starting (e.g. after the
125 /// assistant finishes responding). For VAD speech-start events where
126 /// the user may be continuing, prefer
127 /// [`TurnController::reset_if_finished`].
128 fn reset(&mut self);
129}
130
131/// Turn detector that operates on ASR transcript text.
132///
133/// Implementations receive the current (possibly partial) transcript
134/// and optionally prior conversation turns for context.
135pub trait TextTurnDetector: Send + Sync {
136 fn predict_text(
137 &mut self,
138 transcript: &str,
139 context: &[ConversationTurn],
140 ) -> Result<TurnPrediction, TurnError>;
141 fn reset(&mut self);
142}