1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
//! Voice Activity Detection — energy-based with adaptive noise floor.
//!
//! Ported from `app/src-tauri/src/microphone/vad.rs`, which itself was
//! ported from VoiceRail's `EnergyDetector`. Algorithm:
//!
//! 1. Per-chunk RMS in dB.
//! 2. IIR-smoothed instantaneous level.
//! 3. Noise floor: bottom percentile of a warmup window, then slow drift.
//! 4. Hysteresis-gated speech state with minimum onset / silence durations.
//!
//! Tuning lives in [`crate::VoiceConfig`] so channels can override per
//! environment (the prod default of 15 dB is from commit `7033ca3`, raised
//! from 9 dB to keep the audio bed out of the AirPods microphone).
use crate::VoiceConfig;
use std::time::Instant;
/// Noise-floor (and initial smoothed-level) seed used before calibration
/// finishes, in dB relative to full scale.
const NOISE_FLOOR_DEFAULT_DB: f32 = -50.0;
/// Fraction of the *quietest* warmup samples averaged to produce the
/// initial noise floor (bottom 20% of the sorted calibration window).
const NOISE_FLOOR_PERCENTILE: f32 = 0.20;
/// Per-chunk IIR rate at which the floor drifts toward quieter levels
/// after calibration. Only applied when the smoothed level is *below*
/// the current floor, so the floor never adapts upward.
const NOISE_FLOOR_ADAPT_RATE: f32 = 0.05;
/// dB value assigned to chunks whose RMS is below one LSB at 16-bit
/// scale (rms < 1.0 after the ×32768 scaling) — effectively silence.
const SILENCE_FLOOR_DB: f32 = -96.0;
/// Milliseconds of audio ignored entirely at startup (presumably device
/// settling / AGC ramp — TODO confirm) before calibration begins.
const WARMUP_SKIP_MS: u64 = 800;
/// End of the calibration window; smoothed levels seen between
/// `WARMUP_SKIP_MS` and this instant feed the initial floor estimate.
const WARMUP_CALIBRATE_MS: u64 = 1500;
/// State machine for the adaptive energy VAD. One instance per capture
/// stream; fed f32 chunks via `process_samples` and queried through the
/// `is_speech_*` / `turn_ended` accessors.
pub struct VadState {
    /// IIR-smoothed instantaneous level, dB relative to full scale.
    smoothed_rms_db: f32,
    /// Current noise-floor estimate in dB; seeded at
    /// `NOISE_FLOOR_DEFAULT_DB`, calibrated during warmup, then slowly
    /// drifts toward quieter levels.
    noise_floor_db: f32,
    /// Raw hysteresis-gate output (level above/below threshold), before
    /// any onset/turn-end timing is applied.
    is_speaking: bool,
    /// True once the warmup window has produced a noise-floor estimate.
    calibrated: bool,
    /// Smoothed levels collected during the calibration window.
    calibration_samples: Vec<f32>,
    /// When the gate last flipped to speaking; `None` while silent.
    speech_onset_at: Option<Instant>,
    /// When the gate last flipped to silence after speech; `None` while
    /// speaking or before any speech has occurred.
    silence_onset_at: Option<Instant>,
    /// Construction time; drives the warmup/calibration schedule.
    start_time: Instant,
    // Stored but not read by the current algorithm (hence dead_code).
    #[allow(dead_code)]
    sample_rate: u32,
    /// Speech threshold in dB *above the noise floor* (from `VoiceConfig`).
    threshold_db: f32,
    /// IIR coefficient: weight given to the new chunk's level.
    smoothing_factor: f32,
    /// Extra dB the level must fall below threshold before speech ends.
    hysteresis_db: f32,
    /// Minimum ms above threshold before speech counts as confirmed.
    speech_onset_ms: u64,
    /// Ms of post-speech silence that ends the turn.
    turn_end_ms: u64,
    /// Temporary additive offset on top of `threshold_db`. The capture
    /// loop bumps this while Tokhn is speaking so only louder-than-echo
    /// user voice can register — that's barge-in. Set back to 0 once
    /// playback ends.
    threshold_boost_db: f32,
}
impl VadState {
    /// Create a VAD state from a [`VoiceConfig`]. Pulls all tuning knobs
    /// (threshold, smoothing, hysteresis, onset/turn timings) from config so
    /// callers don't need to know the constants.
    pub fn from_config(sample_rate: u32, config: &VoiceConfig) -> Self {
        Self {
            smoothed_rms_db: NOISE_FLOOR_DEFAULT_DB,
            noise_floor_db: NOISE_FLOOR_DEFAULT_DB,
            is_speaking: false,
            calibrated: false,
            calibration_samples: Vec::with_capacity(128),
            speech_onset_at: None,
            silence_onset_at: None,
            start_time: Instant::now(),
            sample_rate,
            threshold_db: config.vad_threshold_db,
            smoothing_factor: config.smoothing_factor,
            hysteresis_db: config.hysteresis_db,
            speech_onset_ms: config.speech_onset_ms as u64,
            turn_end_ms: config.turn_end_ms as u64,
            threshold_boost_db: 0.0,
        }
    }
    /// Add a temporary offset on top of the configured threshold.
    /// The capture loop bumps this by ~18 dB while Tokhn is speaking
    /// so only loud user speech (barge-in) registers; sets it back
    /// to 0 once playback ends.
    pub fn set_threshold_boost(&mut self, db: f32) {
        self.threshold_boost_db = db;
    }
    /// The currently calibrated noise floor in dB. The capture loop
    /// reads this to compute a finalized segment's SNR before
    /// deciding whether to send it to STT.
    pub fn noise_floor_db(&self) -> f32 {
        self.noise_floor_db
    }
    /// Has the noise-floor warmup finished?
    pub fn is_calibrated(&self) -> bool {
        self.calibrated
    }
    /// Process a chunk of f32 samples in `[-1.0, 1.0]`.
    ///
    /// Updates the smoothed level, noise floor, and hysteresis-gated
    /// speech state. An empty chunk is a no-op.
    pub fn process_samples(&mut self, samples: &[f32]) {
        // Guard: with an empty slice, `sum_sq / len` would be 0.0 / 0.0
        // = NaN. `NaN < 1.0` is false, so instant_db would become NaN and
        // permanently poison `smoothed_rms_db` through the IIR blend.
        if samples.is_empty() {
            return;
        }
        let elapsed_ms = self.start_time.elapsed().as_millis() as u64;
        // RMS in dB, computed at 16-bit full scale (×32768).
        let mut sum_sq: f32 = 0.0;
        for &s in samples {
            let scaled = s * 32768.0;
            sum_sq += scaled * scaled;
        }
        let rms = (sum_sq / samples.len() as f32).sqrt();
        // Below one LSB counts as silence; avoids log10(0) = -inf.
        let instant_db = if rms < 1.0 {
            SILENCE_FLOOR_DB
        } else {
            20.0 * (rms / 32768.0).log10()
        };
        // IIR smoothing: `smoothing_factor` weights the new chunk.
        self.smoothed_rms_db = self.smoothing_factor * instant_db
            + (1.0 - self.smoothing_factor) * self.smoothed_rms_db;
        // Noise floor calibration: skip warmup, then collect a window,
        // then finalize once, then drift slowly toward quieter levels.
        if elapsed_ms < WARMUP_SKIP_MS {
            return;
        } else if !self.calibrated && elapsed_ms < WARMUP_CALIBRATE_MS {
            self.calibration_samples.push(self.smoothed_rms_db);
            return;
        } else if !self.calibrated && !self.calibration_samples.is_empty() {
            // Finalize: average the quietest NOISE_FLOOR_PERCENTILE of
            // the window. `take` moves the buffer out (no clone) and
            // leaves it empty, freeing the memory once it's sorted and
            // consumed — it is never needed again after calibration.
            let mut sorted = std::mem::take(&mut self.calibration_samples);
            sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
            let cutoff = (sorted.len() as f32 * NOISE_FLOOR_PERCENTILE).max(1.0) as usize;
            let sum: f32 = sorted[..cutoff].iter().sum();
            self.noise_floor_db = sum / cutoff as f32;
            self.calibrated = true;
            tracing::info!(
                "[vad] noise floor calibrated: {:.1}dB from {} samples",
                self.noise_floor_db,
                sorted.len()
            );
        } else if self.calibrated && self.smoothed_rms_db < self.noise_floor_db {
            // Downward-only drift: the floor tracks quieter ambience but
            // never rises toward speech energy.
            self.noise_floor_db +=
                (self.smoothed_rms_db - self.noise_floor_db) * NOISE_FLOOR_ADAPT_RATE;
        }
        // Threshold with hysteresis. The temporary boost is added on
        // top of the configured threshold so we can require louder
        // speech during TTS playback (barge-in) without permanently
        // raising the bar.
        let threshold_db = self.noise_floor_db + self.threshold_db + self.threshold_boost_db;
        let was_speaking = self.is_speaking;
        if self.is_speaking {
            if self.smoothed_rms_db < threshold_db - self.hysteresis_db {
                self.is_speaking = false;
            }
        } else if self.smoothed_rms_db > threshold_db {
            self.is_speaking = true;
        }
        // Record state-transition timestamps for onset/turn-end timing.
        let now = Instant::now();
        if self.is_speaking && !was_speaking {
            self.speech_onset_at = Some(now);
            self.silence_onset_at = None;
        } else if !self.is_speaking && was_speaking {
            self.silence_onset_at = Some(now);
        }
    }
    /// Energy is currently above the speech threshold.
    pub fn is_speech_active(&self) -> bool {
        self.calibrated && self.is_speaking
    }
    /// Speech has been confirmed (above threshold for the onset duration).
    #[allow(dead_code)]
    pub fn is_speech_confirmed(&self) -> bool {
        if !self.calibrated || !self.is_speaking {
            return false;
        }
        match self.speech_onset_at {
            Some(onset) => onset.elapsed().as_millis() as u64 >= self.speech_onset_ms,
            None => false,
        }
    }
    /// The turn has ended (silence persisted for `turn_end_ms` after speech).
    pub fn turn_ended(&self) -> bool {
        match self.silence_onset_at {
            Some(silence_at) => silence_at.elapsed().as_millis() as u64 >= self.turn_end_ms,
            None => false,
        }
    }
    /// Clear the speech-state machine without re-running calibration.
    /// Used by the capture loop after Tokhn finishes speaking — the
    /// brief period during own playback can otherwise leave the VAD
    /// in a "speaking" state that immediately produces a false-positive
    /// segment from the trailing room reverb.
    pub fn reset_speech_state(&mut self) {
        self.is_speaking = false;
        self.speech_onset_at = None;
        self.silence_onset_at = None;
        self.smoothed_rms_db = self.noise_floor_db;
    }
}