use serde::{Deserialize, Serialize};
use std::sync::Arc;
use tokio::sync::watch;
/// Built-in voice prompt overlay.
///
/// `compose_voice_context` falls back to this text when
/// `VoiceConfig::voice_prompt_overlay` is `None`.
///
/// The literal spans several source lines, so the line breaks between them
/// are part of the string value; reflowing the text changes the prompt.
pub const DEFAULT_VOICE_PROMPT_OVERLAY: &str =
"[VOICE CONTEXT: This is a real-time voice call. Speed is critical — act immediately.
When checking email, get ALL emails in the inbox (not just unread — set unreadOnly to false).
The most important emails are ones that were read but never replied to or acted on.
Skip newsletters, marketing emails, and automated notifications.
Return ONLY subject lines and sender names. Do NOT include email bodies.
Focus on personal/work emails from real people that likely need a response.
When checking calendar, limit to the next 2 weeks and list only time, title, and attendees.
Do NOT ask clarifying questions — use sensible defaults and act.
Keep responses under 500 characters. The user is waiting on a live call.]";
/// Listening mode stored in `VoiceConfig::mode`.
///
/// Serialized with snake_case variant names (`"auto"`, `"push_to_talk"`,
/// `"wake_word"`). The `Default` impl in this file returns `ListenerMode::Auto`.
///
/// NOTE(review): the runtime behavior of each mode is implemented outside this
/// file; the per-variant notes below are inferred from the names only.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ListenerMode {
/// Default mode. Presumably hands-free listening — confirm against the listener.
Auto,
/// Presumably capture only while a key/button is held — confirm.
PushToTalk,
/// Presumably gated on one of `VoiceConfig::wake_words` — confirm.
WakeWord,
}
/// Text-to-speech backend stored in `VoiceConfig::tts_provider`.
///
/// Serialized with snake_case variant names (`"elevenlabs"`, `"local"`,
/// `"kokoro"`, `"apple_speech"`). The `Default` impl is platform-dependent:
/// `AppleSpeech` when compiled for macOS, `Elevenlabs` otherwise.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TtsProvider {
/// Selected by `TOKHN_TTS_PROVIDER` = `elevenlabs`, `eleven_labs` or `eleven-labs`.
Elevenlabs,
/// Selected by `TOKHN_TTS_PROVIDER` = `local`.
/// Presumably the backend configured by the `local_tts_*` fields — confirm.
Local,
/// Selected by `TOKHN_TTS_PROVIDER` = `kokoro`, `kokoro_native` or `kokoro-native`.
Kokoro,
/// Selected by `TOKHN_TTS_PROVIDER` = `apple_speech`, `apple-speech`, `apple`
/// or `avspeech`. Default on macOS builds.
AppleSpeech,
}
impl Default for TtsProvider {
fn default() -> Self {
#[cfg(target_os = "macos")]
{
Self::AppleSpeech
}
#[cfg(not(target_os = "macos"))]
{
Self::Elevenlabs
}
}
}
/// Speech-to-text backend stored in `VoiceConfig::stt_provider`.
///
/// Serialized with snake_case variant names (`"elevenlabs"`, `"whisper_cpp"`,
/// `"parakeet"`, `"apple_speech"`). The `Default` impl is platform-dependent:
/// `AppleSpeech` when compiled for macOS, `WhisperCpp` otherwise.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SttProvider {
/// Selected by `TOKHN_STT_PROVIDER` = `elevenlabs`, `eleven_labs` or `eleven-labs`.
Elevenlabs,
/// Selected by `TOKHN_STT_PROVIDER` = `whisper-cpp`, `whispercpp`, `whisper_cpp`
/// or `local`. Default on non-macOS builds.
WhisperCpp,
/// Selected by `TOKHN_STT_PROVIDER` = `parakeet`, `parakeet-tdt` or `parakeet_tdt`.
Parakeet,
/// Selected by `TOKHN_STT_PROVIDER` = `apple_speech`, `apple-speech`, `apple`
/// or `sfspeech`. Default on macOS builds.
AppleSpeech,
}
impl Default for SttProvider {
fn default() -> Self {
#[cfg(target_os = "macos")]
{
Self::AppleSpeech
}
#[cfg(not(target_os = "macos"))]
{
Self::WhisperCpp
}
}
}
impl Default for ListenerMode {
fn default() -> Self {
Self::Auto
}
}
/// Voice configuration: speech-to-text / text-to-speech provider selection plus
/// capture and segmentation tuning values.
///
/// Every field has a serde default, so fields missing from a serialized
/// document are filled in; the fallbacks are the same values that
/// `VoiceConfig::default()` uses. `VoiceConfig::from_env` layers `TOKHN_*`
/// environment overrides on top of those defaults.
///
/// NOTE(review): how each value is consumed is not visible in this file; field
/// notes marked "presumably" are inferred from names and should be confirmed.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct VoiceConfig {
/// Speech-to-text backend; falls back to `SttProvider::default()`.
/// `from_env` overrides it from `TOKHN_STT_PROVIDER`.
#[serde(default)]
pub stt_provider: SttProvider,
/// Text-to-speech backend; falls back to `TtsProvider::default()`.
/// `from_env` overrides it from `TOKHN_TTS_PROVIDER`.
#[serde(default)]
pub tts_provider: TtsProvider,
/// ElevenLabs API key; `None` when not configured.
#[serde(default)]
pub elevenlabs_api_key: Option<String>,
/// ElevenLabs voice id; defaults to `"UznIBkKIQe3ZG2tGydre"`.
#[serde(default = "default_voice_id")]
pub elevenlabs_voice_id: String,
/// ElevenLabs TTS model name; defaults to `"eleven_turbo_v2_5"`.
#[serde(default = "default_tts_model")]
pub elevenlabs_tts_model: String,
/// URL of the local TTS service; defaults to `"http://127.0.0.1:19280/v1"`.
/// `from_env` overrides it from `TOKHN_TTS_URL`.
#[serde(default = "default_local_tts_url")]
pub local_tts_url: String,
/// Local TTS model name; defaults to `"mlx-community/Kokoro-82M-bf16"`.
/// `from_env` overrides it from `TOKHN_TTS_MODEL`.
#[serde(default = "default_local_tts_model")]
pub local_tts_model: String,
/// whisper.cpp model name; defaults to `"large-v3-turbo-q5_0"`.
/// `from_env` overrides it from `TOKHN_STT_MODEL`.
#[serde(default = "default_whisper_cpp_model")]
pub whisper_cpp_model: String,
/// Local TTS voice name; defaults to `"af_heart"`.
#[serde(default = "default_local_tts_voice")]
pub local_tts_voice: String,
/// Local TTS speed; defaults to `1.0`. Presumably a playback-rate multiplier — confirm.
#[serde(default = "default_local_tts_speed")]
pub local_tts_speed: f32,
/// Local TTS temperature; defaults to `0.7`. Presumably a sampling temperature — confirm.
#[serde(default = "default_local_tts_temperature")]
pub local_tts_temperature: f32,
/// Optional reference audio for local TTS; `None` by default.
/// Presumably a path used for voice cloning — confirm.
#[serde(default)]
pub local_tts_ref_audio: Option<String>,
/// Optional reference text for local TTS; `None` by default.
/// Presumably the transcript of `local_tts_ref_audio` — confirm.
#[serde(default)]
pub local_tts_ref_text: Option<String>,
/// Optional instruction string for local TTS; `None` by default.
#[serde(default)]
pub local_tts_instruct: Option<String>,
/// Optional input device name; `None` by default.
/// Presumably `None` selects the system default device — confirm.
#[serde(default)]
pub input_device: Option<String>,
/// Sample rate; defaults to `16_000`. Presumably in Hz — confirm.
#[serde(default = "default_sample_rate")]
pub sample_rate: u32,
/// Language code; defaults to `"en"`.
#[serde(default = "default_language")]
pub language: String,
/// Listening mode; falls back to `ListenerMode::default()` (`Auto`).
#[serde(default)]
pub mode: ListenerMode,
/// Wake words; defaults to `["tokhn", "token", "talking"]`.
#[serde(default = "default_wake_words")]
pub wake_words: Vec<String>,
/// Defaults to `15.0`. Presumably the voice-activity-detection threshold in dB — confirm.
#[serde(default = "default_vad_threshold_db")]
pub vad_threshold_db: f32,
/// Defaults to `100`. Presumably milliseconds of speech required to open a turn — confirm.
#[serde(default = "default_speech_onset_ms")]
pub speech_onset_ms: u32,
/// Defaults to `1400`. Presumably milliseconds of silence that end a turn — confirm.
#[serde(default = "default_turn_end_ms")]
pub turn_end_ms: u32,
/// Defaults to `0.3`. Presumably a level-smoothing coefficient — confirm.
#[serde(default = "default_smoothing_factor")]
pub smoothing_factor: f32,
/// Defaults to `3.0`. Presumably a dB hysteresis band around the VAD threshold — confirm.
#[serde(default = "default_hysteresis_db")]
pub hysteresis_db: f32,
/// Defaults to `18.0`. Presumably extra dB required to barge in during playback — confirm.
#[serde(default = "default_barge_in_boost_db")]
pub barge_in_boost_db: f32,
/// Defaults to `500`. Presumably milliseconds the barge-in boost lingers — confirm.
#[serde(default = "default_boost_tail_ms")]
pub boost_tail_ms: u64,
/// Defaults to `6_000`. Presumably the maximum segment length in milliseconds — confirm.
#[serde(default = "default_max_segment_ms")]
pub max_segment_ms: u64,
/// Defaults to `8.0`. Presumably the minimum segment signal-to-noise ratio in dB — confirm.
#[serde(default = "default_segment_min_snr_db")]
pub segment_min_snr_db: f32,
/// Prompt overlay used by `compose_voice_context`: `None` selects
/// `DEFAULT_VOICE_PROMPT_OVERLAY`, and `Some("")` disables the overlay.
#[serde(default)]
pub voice_prompt_overlay: Option<String>,
/// Optional progress interval in seconds; `None` by default. Consumer not visible here.
#[serde(default)]
pub progress_interval_secs: Option<u64>,
/// Optional cap on progress attempts; `None` by default. Consumer not visible here.
#[serde(default)]
pub max_progress_attempts: Option<u32>,
}
impl VoiceConfig {
pub fn from_env() -> Self {
let mut cfg = Self::default();
if let Ok(v) = std::env::var("TOKHN_STT_PROVIDER") {
match v.to_lowercase().as_str() {
"whisper-cpp" | "whispercpp" | "whisper_cpp" | "local" => {
cfg.stt_provider = SttProvider::WhisperCpp;
}
"elevenlabs" | "eleven_labs" | "eleven-labs" => {
cfg.stt_provider = SttProvider::Elevenlabs;
}
"parakeet" | "parakeet-tdt" | "parakeet_tdt" => {
cfg.stt_provider = SttProvider::Parakeet;
}
"apple_speech" | "apple-speech" | "apple" | "sfspeech" => {
cfg.stt_provider = SttProvider::AppleSpeech;
}
_ => {}
}
}
if let Ok(v) = std::env::var("TOKHN_STT_MODEL") {
if !v.is_empty() {
cfg.whisper_cpp_model = v;
}
}
if let Ok(v) = std::env::var("TOKHN_TTS_PROVIDER") {
match v.to_lowercase().as_str() {
"local" => cfg.tts_provider = TtsProvider::Local,
"kokoro" | "kokoro_native" | "kokoro-native" => {
cfg.tts_provider = TtsProvider::Kokoro;
}
"elevenlabs" | "eleven_labs" | "eleven-labs" => {
cfg.tts_provider = TtsProvider::Elevenlabs;
}
"apple_speech" | "apple-speech" | "apple" | "avspeech" => {
cfg.tts_provider = TtsProvider::AppleSpeech;
}
_ => {}
}
}
if let Ok(v) = std::env::var("TOKHN_TTS_URL") {
if !v.is_empty() {
cfg.local_tts_url = v;
}
}
if let Ok(v) = std::env::var("TOKHN_TTS_MODEL") {
if !v.is_empty() {
cfg.local_tts_model = v;
}
}
cfg
}
}
impl Default for VoiceConfig {
fn default() -> Self {
Self {
stt_provider: SttProvider::default(),
tts_provider: TtsProvider::default(),
elevenlabs_api_key: None,
elevenlabs_voice_id: default_voice_id(),
elevenlabs_tts_model: default_tts_model(),
local_tts_url: default_local_tts_url(),
local_tts_model: default_local_tts_model(),
whisper_cpp_model: default_whisper_cpp_model(),
local_tts_voice: default_local_tts_voice(),
local_tts_speed: default_local_tts_speed(),
local_tts_temperature: default_local_tts_temperature(),
local_tts_ref_audio: None,
local_tts_ref_text: None,
local_tts_instruct: None,
input_device: None,
sample_rate: default_sample_rate(),
language: default_language(),
mode: ListenerMode::default(),
wake_words: default_wake_words(),
vad_threshold_db: default_vad_threshold_db(),
speech_onset_ms: default_speech_onset_ms(),
turn_end_ms: default_turn_end_ms(),
smoothing_factor: default_smoothing_factor(),
hysteresis_db: default_hysteresis_db(),
barge_in_boost_db: default_barge_in_boost_db(),
boost_tail_ms: default_boost_tail_ms(),
max_segment_ms: default_max_segment_ms(),
segment_min_snr_db: default_segment_min_snr_db(),
voice_prompt_overlay: None,
progress_interval_secs: None,
max_progress_attempts: None,
}
}
}
/// Combines the voice prompt overlay with an optional caller-supplied context.
///
/// The overlay is `config.voice_prompt_overlay`, or
/// [`DEFAULT_VOICE_PROMPT_OVERLAY`] when that field is `None`. An empty string
/// on either side counts as "nothing to contribute":
///
/// * neither part present → `None`
/// * only one part present → that part, unchanged
/// * both present → `"{overlay}\n\n{caller_context}"`
///
/// Setting `voice_prompt_overlay` to `Some("")` therefore disables the
/// overlay, and an empty `caller_context` is handled exactly like `None`
/// (no `Some("")` result and no dangling blank-line separator).
pub fn compose_voice_context(config: &VoiceConfig, caller_context: Option<&str>) -> Option<String> {
    let overlay = config
        .voice_prompt_overlay
        .as_deref()
        .unwrap_or(DEFAULT_VOICE_PROMPT_OVERLAY);

    // Normalize both inputs the same way so the match below only has to
    // reason about "present" vs "absent".
    let overlay = Some(overlay).filter(|text| !text.is_empty());
    let caller = caller_context.filter(|text| !text.is_empty());

    match (overlay, caller) {
        (None, None) => None,
        (None, Some(ctx)) => Some(ctx.to_owned()),
        (Some(overlay), None) => Some(overlay.to_owned()),
        (Some(overlay), Some(ctx)) => Some(format!("{overlay}\n\n{ctx}")),
    }
}
/// Publishing side of the `VoiceConfig` watch channel created by
/// `voice_config_watch`.
///
/// Clones share one underlying `tokio::sync::watch::Sender` through the `Arc`,
/// so an update sent from any clone is seen by every `VoiceConfigHandle`.
#[derive(Debug, Clone)]
pub struct VoiceConfigSender {
// Shared sender; wrapped in `Arc` so the wrapper can derive `Clone`.
tx: Arc<watch::Sender<VoiceConfig>>,
}
/// Subscribing side of the `VoiceConfig` watch channel created by
/// `voice_config_watch`.
///
/// Wraps a `tokio::sync::watch::Receiver`; use `current` for a snapshot and
/// `changed` to await the next update.
#[derive(Debug, Clone)]
pub struct VoiceConfigHandle {
// Receiver half of the watch channel.
rx: watch::Receiver<VoiceConfig>,
}
/// Creates a watch channel seeded with `initial` and returns its two ends:
/// the [`VoiceConfigSender`] used to publish updates and a
/// [`VoiceConfigHandle`] used to observe them.
pub fn voice_config_watch(initial: VoiceConfig) -> (VoiceConfigSender, VoiceConfigHandle) {
    let (sender, receiver) = watch::channel(initial);
    let publisher = VoiceConfigSender {
        tx: Arc::new(sender),
    };
    let subscriber = VoiceConfigHandle { rx: receiver };
    (publisher, subscriber)
}
impl VoiceConfigSender {
pub fn update(&self, config: VoiceConfig) {
self.tx.send_if_modified(|current| {
if *current != config {
*current = config;
true
} else {
false
}
});
}
pub fn current(&self) -> VoiceConfig {
self.tx.borrow().clone()
}
}
impl VoiceConfigHandle {
pub fn current(&self) -> VoiceConfig {
self.rx.borrow().clone()
}
pub async fn changed(&mut self) -> Option<VoiceConfig> {
self.rx.changed().await.ok()?;
Some(self.rx.borrow_and_update().clone())
}
}
/// Fallback for `VoiceConfig::elevenlabs_voice_id`.
fn default_voice_id() -> String {
    String::from("UznIBkKIQe3ZG2tGydre")
}
/// Fallback for `VoiceConfig::elevenlabs_tts_model`.
fn default_tts_model() -> String {
    String::from("eleven_turbo_v2_5")
}
/// Fallback for `VoiceConfig::sample_rate`.
fn default_sample_rate() -> u32 {
    16_000_u32
}
/// Fallback for `VoiceConfig::language`.
fn default_language() -> String {
    String::from("en")
}
/// Fallback for `VoiceConfig::wake_words`.
fn default_wake_words() -> Vec<String> {
    ["tokhn", "token", "talking"]
        .iter()
        .map(|word| String::from(*word))
        .collect()
}
/// Fallback for `VoiceConfig::vad_threshold_db`.
fn default_vad_threshold_db() -> f32 {
    15.0_f32
}
/// Fallback for `VoiceConfig::speech_onset_ms`.
fn default_speech_onset_ms() -> u32 {
    100_u32
}
/// Fallback for `VoiceConfig::turn_end_ms`.
fn default_turn_end_ms() -> u32 {
    1_400_u32
}
/// Fallback for `VoiceConfig::smoothing_factor`.
fn default_smoothing_factor() -> f32 {
    0.3_f32
}
/// Fallback for `VoiceConfig::hysteresis_db`.
fn default_hysteresis_db() -> f32 {
    3.0_f32
}
/// Fallback for `VoiceConfig::barge_in_boost_db`.
fn default_barge_in_boost_db() -> f32 {
    18.0_f32
}
/// Fallback for `VoiceConfig::boost_tail_ms`.
fn default_boost_tail_ms() -> u64 {
    500_u64
}
/// Fallback for `VoiceConfig::max_segment_ms`.
fn default_max_segment_ms() -> u64 {
    6_000_u64
}
/// Fallback for `VoiceConfig::segment_min_snr_db`.
fn default_segment_min_snr_db() -> f32 {
    8.0_f32
}
/// Fallback for `VoiceConfig::local_tts_url`.
fn default_local_tts_url() -> String {
    String::from("http://127.0.0.1:19280/v1")
}
/// Fallback for `VoiceConfig::local_tts_model`.
fn default_local_tts_model() -> String {
    String::from("mlx-community/Kokoro-82M-bf16")
}
/// Fallback for `VoiceConfig::whisper_cpp_model`.
fn default_whisper_cpp_model() -> String {
    String::from("large-v3-turbo-q5_0")
}
/// Fallback for `VoiceConfig::local_tts_voice`.
fn default_local_tts_voice() -> String {
    String::from("af_heart")
}
/// Fallback for `VoiceConfig::local_tts_speed`.
fn default_local_tts_speed() -> f32 {
    1.0_f32
}
/// Fallback for `VoiceConfig::local_tts_temperature`.
fn default_local_tts_temperature() -> f32 {
    0.7_f32
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Default configuration with only `voice_prompt_overlay` replaced.
    fn config_with_overlay(overlay: &str) -> VoiceConfig {
        VoiceConfig {
            voice_prompt_overlay: Some(overlay.to_owned()),
            ..VoiceConfig::default()
        }
    }

    /// With no overlay configured, the built-in overlay is used.
    #[test]
    fn compose_voice_context_uses_default_when_unset() {
        let composed = compose_voice_context(&VoiceConfig::default(), None)
            .expect("overlay present by default");
        assert!(composed.starts_with("[VOICE CONTEXT:"));
    }

    /// An explicitly empty overlay disables the overlay entirely.
    #[test]
    fn compose_voice_context_empty_overlay_disables() {
        let disabled = config_with_overlay("");

        let without_caller = compose_voice_context(&disabled, None);
        assert_eq!(without_caller, None);

        let with_caller = compose_voice_context(&disabled, Some("hi"));
        assert_eq!(with_caller, Some("hi".to_string()));
    }

    /// Overlay and caller context are joined by one blank line.
    #[test]
    fn compose_voice_context_concatenates_with_caller_prompt() {
        let custom = config_with_overlay("OVERLAY");
        let composed = compose_voice_context(&custom, Some("CALLER"));
        assert_eq!(composed, Some("OVERLAY\n\nCALLER".to_string()));
    }

    /// Without a caller context the overlay is returned unchanged.
    #[test]
    fn compose_voice_context_overlay_only_when_no_caller_prompt() {
        let custom = config_with_overlay("OVERLAY");
        let composed = compose_voice_context(&custom, None);
        assert_eq!(composed, Some("OVERLAY".to_string()));
    }
}