1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
//! Events emitted by the voice listener.
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use crate::enrollment::TranscriptRole;
/// The event vocabulary a [`crate::Listener`] publishes while audio moves
/// through the capture, VAD, and STT stages.
///
/// Consumers ("channels") subscribe to the stream of events and decide for
/// themselves what to do with each one: show a transcript, forward it to a
/// session, drive a waveform animation, and so on.
///
/// When serialized, the variant is identified by a `"type"` field holding
/// the variant name in `snake_case` (for example `speech_start`).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum VoiceEvent {
    /// The VAD saw a speech segment begin.
    SpeechStart,

    /// The VAD saw the current speech segment end.
    SpeechEnd,

    /// The final transcript of the most recent speech segment.
    Transcript {
        /// The transcribed text.
        text: String,
        /// Length of the transcribed audio, in milliseconds.
        duration_ms: u64,
        /// The speaker the segment was attributed to. This is `Unknown`
        /// when no speaker pipeline is attached or when the classifier
        /// could not reach a decision.
        ///
        /// `serde(default)` lets events serialized before this field
        /// existed still deserialize.
        #[serde(default)]
        role: TranscriptRole,
    },

    /// An incremental result produced during streaming transcription.
    ///
    /// Only listeners backed by a streaming-capable STT engine emit this
    /// (Parakeet TDT today; future native-streaming backends would reuse
    /// the same variant). Partials *monotonically extend*: text that has
    /// been emitted is never revised. `Transcript` remains the canonical
    /// "this segment is done" signal; `Partial` exists purely for live
    /// transcript UX.
    Partial {
        /// All text decoded so far for the segment.
        text: String,
        /// Milliseconds of audio covered so far. Handy for UI progress
        /// indicators.
        duration_ms: u64,
    },

    /// A chunk of raw PCM captured from the microphone, for consumers that
    /// want to draw waveforms or route audio elsewhere. Optional: most
    /// consumers only care about transcripts.
    AudioChunk {
        /// Mono samples as 16-bit signed PCM.
        samples: Vec<i16>,
        /// Sample rate, in Hz.
        sample_rate: u32,
    },

    /// The user began speaking *while Tokhn was speaking*.
    ///
    /// The listener raises this when its (boosted) VAD picks up user voice
    /// loud enough to override the bed / TTS leakage. Channels should stop
    /// the active narration immediately so the user can be heard.
    BargeIn,

    /// A previously-armed enrollment capture succeeded and the voiceprint
    /// has been persisted at `save_path`.
    ///
    /// Downstream callers typically switch onboarding state and reload
    /// their [`crate::enrollment::SpeakerPipeline`] so the new enrollment
    /// takes effect.
    EnrollmentCaptured {
        /// Label attached to the captured enrollment.
        // NOTE(review): presumably the speaker/profile name; confirm
        // against the enrollment code that emits this event.
        label: String,
        /// Location the voiceprint was written to.
        save_path: PathBuf,
    },

    /// An enrollment capture attempt did not succeed.
    ///
    /// It is up to the caller whether to re-arm with a different
    /// sentence / threshold or to surface the error.
    EnrollmentFailed {
        /// Human-readable description of why the capture failed.
        reason: String,
    },
}