// wavekat_turn/audio/pipecat.rs

//! Pipecat Smart Turn v3 backend.
//!
//! Audio-based turn detection using the Smart Turn ONNX model.
//! Expects 16 kHz f32 PCM input. Telephony audio at 8 kHz must be
//! upsampled to 16 kHz before it is fed to this detector.
//!
//! - Model size: ~8 MB (int8 quantized ONNX)
//! - Inference: ~12 ms on CPU
//! - License: BSD 2-Clause
10
11use crate::{AudioFrame, AudioTurnDetector, TurnError, TurnPrediction};
12
/// Pipecat Smart Turn v3 detector.
///
/// Buffers up to 8 seconds of audio internally. When [`predict`](AudioTurnDetector::predict)
/// is called, it takes the last 8s (zero-padded at front if shorter),
/// extracts Whisper log-mel features, and runs ONNX inference.
///
/// NOTE: the paragraph above describes the intended design. The struct is
/// currently a field-less placeholder and none of that behavior exists yet.
pub struct PipecatSmartTurn {
    // TODO: ONNX session + audio ring buffer + state
}
21
impl PipecatSmartTurn {
    /// Create a new Smart Turn detector, loading the ONNX model.
    ///
    /// # Errors
    ///
    /// The signature returns [`TurnError`] on failure. Presumably this will
    /// cover model-loading failures, but no error path exists yet — TODO
    /// confirm once the loader is implemented.
    ///
    /// # Panics
    ///
    /// Always panics at present: the body is an unimplemented `todo!` stub.
    pub fn new() -> Result<Self, TurnError> {
        todo!("load Smart Turn v3 ONNX model")
    }
}
28
// Every method in this impl is currently a `todo!` stub and panics when called.
// The per-method docs describe the intended behavior as stated in the stub
// messages; none of it is implemented yet.
impl AudioTurnDetector for PipecatSmartTurn {
    /// Intended to append `_frame` to the detector's internal ring buffer.
    ///
    /// The parameter is underscore-prefixed because the stub does not read it.
    ///
    /// # Panics
    ///
    /// Always panics at present (unimplemented).
    fn push_audio(&mut self, _frame: &AudioFrame) {
        todo!("append to ring buffer")
    }

    /// Intended to truncate or pad the buffered audio to 8 seconds, extract
    /// mel features, and run ONNX inference to produce a [`TurnPrediction`].
    ///
    /// # Errors
    ///
    /// The signature returns [`TurnError`] on failure; the concrete failure
    /// cases are not defined yet — TODO confirm once implemented.
    ///
    /// # Panics
    ///
    /// Always panics at present (unimplemented).
    fn predict(&mut self) -> Result<TurnPrediction, TurnError> {
        todo!("truncate/pad to 8s, extract mel features, run ONNX inference")
    }

    /// Intended to clear the ring buffer and any other internal state.
    ///
    /// # Panics
    ///
    /// Always panics at present (unimplemented).
    fn reset(&mut self) {
        todo!("clear ring buffer and internal state")
    }
}
41}