1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
//! Voice keyterms for improving STT accuracy in the voice_stream endpoint.
//!
//! Provides domain-specific vocabulary hints (Deepgram "keywords") so the STT
//! engine correctly recognises coding terminology, project names, and branch
//! names that would otherwise be misheard.
use std::collections::HashSet;
use std::path::Path;
use crate::bootstrap::state::get_project_root;
use crate::utils::git::get_branch;
// ─── Global keyterms ────────────────────────────────────────────────
/// Coding vocabulary that Deepgram consistently mangles unless it is given an
/// explicit keyword hint.
///
/// Maintenance notes:
/// - "Claude" and "Anthropic" are deliberately absent; they are already
///   server-side base keyterms.
/// - Only add terms that people pronounce the way they are spelled. Something
///   like `stdout` is spoken as "standard out", so listing it would not help.
const GLOBAL_KEYTERMS: &[&str] = &[
    "MCP",
    "symlink",
    "grep",
    "regex",
    "localhost",
    "codebase",
    "TypeScript",
    "JSON",
    "OAuth",
    "webhook",
    "gRPC",
    "dotfiles",
    "subagent",
    "worktree",
];
// ─── Helpers ────────────────────────────────────────────────────────
/// Split an identifier or path (camelCase, PascalCase, kebab-case, snake_case,
/// dotted names, `/` or `\` separated segments) into individual words.
///
/// A word boundary is:
/// - a transition from an ASCII lowercase letter to an ASCII uppercase letter
///   (`voiceKeyterms` → `voice`, `Keyterms`), or
/// - one of the separators `-`, `_`, `.`, `/`, `\`, or any whitespace.
///
/// Runs of uppercase letters are not split, so `HTTPServer` stays one word.
/// The original casing of every word is preserved.
///
/// Fragments of 2 characters or fewer, or longer than 20 characters, are
/// discarded to avoid noise. Lengths are counted in characters, not bytes.
/// Adjacent duplicate words are collapsed; non-adjacent duplicates are kept.
///
/// Returns the surviving words in their order of appearance; the result is
/// empty when `name` is empty or contains no fragment of acceptable length.
pub fn split_identifier(name: &str) -> Vec<String> {
    // Rewrite `name` into a whitespace-delimited string in a single pass.
    let mut spaced = String::with_capacity(name.len() + 8);
    let mut prev_is_lower = false;
    for c in name.chars() {
        match c {
            '-' | '_' | '.' | '/' | '\\' => {
                spaced.push(' ');
                prev_is_lower = false;
            }
            _ => {
                // camelCase boundary: lowercase immediately followed by uppercase.
                if prev_is_lower && c.is_ascii_uppercase() {
                    spaced.push(' ');
                }
                spaced.push(c);
                prev_is_lower = c.is_ascii_lowercase();
            }
        }
    }

    let mut words: Vec<String> = spaced
        .split_whitespace()
        .filter(|word| {
            let len = word.chars().count();
            len > 2 && len <= 20
        })
        .map(str::to_owned)
        .collect();
    // Only consecutive repeats are removed (e.g. "foo-foo-bar" → "foo", "bar").
    words.dedup();
    words
}
/// Break the file name of `file_path` (extension removed) into words via
/// `split_identifier`.
///
/// When the path has no file stem, or the stem is not valid UTF-8, an empty
/// string is split instead.
fn file_name_words(file_path: &str) -> Vec<String> {
    let path = Path::new(file_path);
    let base = match path.file_stem() {
        Some(os_stem) => os_stem.to_str().unwrap_or(""),
        None => "",
    };
    split_identifier(base)
}
// ─── Public API ─────────────────────────────────────────────────────
/// Upper bound on the number of keyterms returned by `get_voice_keyterms`;
/// also the threshold at which scanning of recent file names stops.
const MAX_KEYTERMS: usize = 50;
/// Build a list of keyterms for the voice_stream STT endpoint.
///
/// Combines hardcoded global coding terms with session context, without any
/// model calls. Terms are gathered in priority order:
///
/// 1. every entry of `GLOBAL_KEYTERMS`,
/// 2. the project root's directory name, kept as a single term,
/// 3. the words of the current git branch name,
/// 4. the words of the file names in `recent_files`.
///
/// Lookups of the project root and branch are best-effort: if either fails,
/// that source is simply skipped.
///
/// # Parameters
/// - `recent_files`: optional set of recently used file paths. Because it is
///   a `HashSet`, the order in which its entries are visited is arbitrary.
///
/// # Returns
/// At most `MAX_KEYTERMS` unique terms, in the priority order above. When
/// more terms are available than fit, the lowest-priority ones are the ones
/// dropped.
pub async fn get_voice_keyterms(recent_files: Option<&HashSet<String>>) -> Vec<String> {
    /// Append `term` to `terms` unless it has already been collected.
    fn push_unique(terms: &mut Vec<String>, seen: &mut HashSet<String>, term: String) {
        if seen.insert(term.clone()) {
            terms.push(term);
        }
    }

    // A Vec keeps insertion (priority) order; the set is only used for
    // de-duplication. Iterating a HashSet directly would make the final cap
    // discard an arbitrary subset of terms.
    let mut terms: Vec<String> = Vec::with_capacity(MAX_KEYTERMS);
    let mut seen: HashSet<String> = HashSet::with_capacity(MAX_KEYTERMS);

    for term in GLOBAL_KEYTERMS.iter() {
        push_unique(&mut terms, &mut seen, (*term).to_string());
    }

    // Project root basename as a single term — users say "claude CLI internal"
    // as a phrase, not isolated words. Keeping the whole basename lets the
    // STT's keyterm boosting match the phrase regardless of separator.
    if let Ok(project_root) = get_project_root() {
        let basename = Path::new(&project_root)
            .file_name()
            .and_then(|n| n.to_str());
        if let Some(name) = basename {
            if name.len() > 2 && name.len() <= 50 {
                push_unique(&mut terms, &mut seen, name.to_string());
            }
        }
    }

    // Git branch words (e.g. "feat/voice-keyterms" → "feat", "voice", "keyterms")
    if let Ok(Some(branch)) = get_branch() {
        for word in split_identifier(&branch) {
            push_unique(&mut terms, &mut seen, word);
        }
    }

    // Recent file names — only scan enough to fill remaining slots
    if let Some(files) = recent_files {
        for file_path in files {
            if terms.len() >= MAX_KEYTERMS {
                break;
            }
            for word in file_name_words(file_path) {
                push_unique(&mut terms, &mut seen, word);
            }
        }
    }

    // The last file scanned may have pushed the list past the cap.
    terms.truncate(MAX_KEYTERMS);
    terms
}