//! Subprocess wrapper around macOS `/usr/bin/say`.
//!
//! Used by the TTS feature (Ctrl+B S read-aloud,
//! greeting + goodbye) after the tts-rs / AVFoundation
//! approach hit a per-process state bug on at least one
//! macOS version: the first `Tts::default()` engine could
//! speak once, but every subsequent `speak()` call on the
//! same instance (or a freshly-recreated one) returned
//! Ok with a new utterance id but produced no audio.
//!
//! Each `Say::speak` call spawns a brand-new subprocess.
//! No shared state across calls → no reuse bug. The
//! macOS `say` binary itself wraps the same AVFoundation
//! engine, but each subprocess gets a fresh AVFoundation
//! context, side-stepping the per-process corruption.
//!
//! Trade-offs vs. an in-process tts crate:
//! * Per-call latency: ~50-150 ms (subprocess startup +
//! audio device init). Imperceptible for a chord-
//! triggered Ctrl+B S; the greeting / goodbye paths
//! pay it once each.
//! * No fine-grained progress callbacks — we can only
//! poll `try_wait()` to detect process exit, not
//! per-word events. The playback modal already shows
//! a spinner + elapsed time, which is enough.
//! * Voice picking is by string match against
//! `say -v "?"` output (the canonical macOS voice
//! listing).
//!
//! Platform scope: macOS only. Other platforms get a
//! "TTS is macOS-only in 1.2.9" modal from the caller —
//! we don't try to wrap a per-platform TTS abstraction
//! here.
use std::io::Write;
use std::path::Path;
use std::process::{Child, Command, Stdio};
/// Lifecycle wrapper for at most one `/usr/bin/say` subprocess.
///
/// Owns the spawned `Child` until it is released through one of two
/// paths visible in this module:
///
/// * `stop()` kills and reaps the process. It runs at the start of every
///   `speak()` call and from `Drop`; callers are also expected to invoke
///   it when the user dismisses the playback modal.
/// * `is_speaking()` discards the handle once `try_wait` reports that the
///   process has exited on its own.
///
/// `Default` yields an idle instance with no child attached.
#[derive(Debug, Default)]
pub(crate) struct Say {
/// Handle to the in-flight `say` process; `None` while idle.
child: Option<Child>,
}
impl Say {
/// Reports whether this host is able to run `/usr/bin/say`.
///
/// The caller uses the answer to choose between "spawn and speak" and
/// "show the macOS-only modal". The check is cheap: it looks at the
/// compile-time target OS and, on macOS only, at whether the binary
/// exists on disk. No subprocess is spawned.
///
/// # Errors
///
/// Returns a static reason string: the macOS-only notice on every other
/// target, or a "not found" message when the binary is missing.
pub(super) fn available() -> Result<(), &'static str> {
    if cfg!(not(target_os = "macos")) {
        Err("TTS is macOS-only in 1.2.9")
    } else if Path::new("/usr/bin/say").exists() {
        Ok(())
    } else {
        Err("/usr/bin/say not found")
    }
}
/// Speak `text` via `/usr/bin/say`, interrupting any speech in flight.
///
/// Any prior subprocess this `Say` owns is killed first, so a new
/// `Ctrl+B S` during playback interrupts cleanly (the UX the tts-rs
/// `interrupt: true` flag was supposed to provide). The text is fed
/// through stdin rather than argv to avoid command-line escaping issues
/// with non-ASCII content (Russian, em-dashes, smart quotes, etc.).
///
/// * `voice` — voice name as listed by `say -v "?"`; an empty string
///   omits `-v` so the system default voice is used.
/// * `rate_wpm` — words per minute, forwarded as `-r`; `None` omits the
///   flag so the voice picks its own default.
///
/// The call returns as soon as the text has been handed to the child;
/// poll `is_speaking()` to learn when playback has finished.
///
/// # Errors
///
/// Returns the underlying I/O error when the process cannot be spawned
/// or when writing the text to its stdin fails. In the latter case the
/// freshly spawned child is killed and reaped before returning, so a
/// failed call never leaves an orphaned `say` process behind.
pub(super) fn speak(
    &mut self,
    text: &str,
    voice: &str,
    rate_wpm: Option<u16>,
) -> std::io::Result<()> {
    // Kill any in-flight prior speech first.
    self.stop();

    let mut cmd = Command::new("/usr/bin/say");
    if !voice.is_empty() {
        cmd.arg("-v").arg(voice);
    }
    if let Some(r) = rate_wpm {
        cmd.arg("-r").arg(r.to_string());
    }
    cmd.stdin(Stdio::piped())
        .stdout(Stdio::null())
        .stderr(Stdio::null());

    let mut child = cmd.spawn()?;
    if let Some(mut stdin) = child.stdin.take() {
        if let Err(err) = stdin.write_all(text.as_bytes()) {
            // `Child` neither kills nor waits when dropped. Without this
            // teardown a write failure (e.g. a broken pipe because `say`
            // rejected the voice and exited) would leak the process,
            // since it is not yet tracked in `self.child`.
            let _ = child.kill();
            let _ = child.wait();
            return Err(err);
        }
        // stdin drops here, closing the pipe — `say` sees EOF and
        // starts producing audio.
    }
    self.child = Some(child);
    Ok(())
}
/// True while the spawned `say` is still running.
/// Cheap (`try_wait` is a non-blocking waitpid).
/// False when no child was spawned, after the child
/// exits naturally, or after `stop()`.
pub(super) fn is_speaking(&mut self) -> bool {
let Some(child) = self.child.as_mut() else {
return false;
};
match child.try_wait() {
Ok(None) => true,
Ok(Some(_)) => {
// Reap immediately so we don't leave a
// zombie in `child`.
self.child = None;
false
}
Err(_) => false,
}
}
/// Kills the attached `say` subprocess, if there is one, and reaps it.
///
/// Idempotent: the handle is taken out of `self`, so a second call finds
/// nothing to do. Failures from `kill` and `wait` are deliberately
/// ignored — teardown is best-effort.
pub(super) fn stop(&mut self) {
    let Some(mut proc) = self.child.take() else {
        return;
    };
    let _ = proc.kill();
    let _ = proc.wait();
}
/// Renders `text` into an audio file at `dest` via `/usr/bin/say -o`.
///
/// Blocks until the subprocess exits or `timeout` elapses, whichever
/// comes first, polling every 50 ms. The deadline starts only after the
/// text has been written to the child's stdin, so that write is not
/// covered by the timeout. Missing parent directories of `dest` are
/// created first. `voice` and `rate_wpm` map to `-v` / `-r` and are
/// omitted when empty / `None`.
///
/// On success returns the size of `dest` in bytes, or `0` when its
/// metadata cannot be read.
///
/// History (carried over from the original notes): extracted in 1.2.17
/// from the inline implementation in `commit_tts_save_as_audio` so the
/// `TtsEngine` dispatcher can route save-as-audio uniformly across
/// backends; behaviour is meant to stay identical to the 1.2.9 code.
///
/// # Errors
///
/// Returns a human-readable message when `say` is unavailable, the
/// parent directory cannot be created, spawning or writing to stdin
/// fails, polling the child fails, the child exits unsuccessfully (its
/// stderr is included), or the timeout fires (the child is killed and a
/// partial file may remain at `dest`).
pub(super) fn speak_to_file_blocking(
    text: &str,
    voice: &str,
    rate_wpm: Option<u16>,
    dest: &Path,
    timeout: std::time::Duration,
) -> Result<u64, String> {
    use std::io::Read;
    use std::time::{Duration, Instant};

    if let Err(reason) = Self::available() {
        return Err(reason.to_string());
    }
    if let Some(dir) = dest.parent() {
        if let Err(e) = std::fs::create_dir_all(dir) {
            return Err(format!("couldn't create {}: {e}", dir.display()));
        }
    }

    let mut say = Command::new("/usr/bin/say");
    say.arg("-o").arg(dest);
    if !voice.is_empty() {
        say.arg("-v").arg(voice);
    }
    if let Some(wpm) = rate_wpm {
        say.arg("-r").arg(wpm.to_string());
    }
    // stderr is captured so a failure can be reported with `say`'s own
    // diagnostics.
    say.stdin(Stdio::piped())
        .stdout(Stdio::null())
        .stderr(Stdio::piped());

    let mut proc = match say.spawn() {
        Ok(spawned) => spawned,
        Err(e) => return Err(format!("spawn failed: {e}")),
    };

    if let Some(mut pipe) = proc.stdin.take() {
        if let Err(e) = pipe.write_all(text.as_bytes()) {
            // Don't leave the child behind when it never got its input.
            let _ = proc.kill();
            let _ = proc.wait();
            return Err(format!("write stdin: {e}"));
        }
        // `pipe` drops here; the resulting EOF lets `say` start rendering.
    }

    let deadline = Instant::now() + timeout;
    let status = loop {
        match proc.try_wait() {
            Ok(Some(done)) => break done,
            Ok(None) if Instant::now() >= deadline => {
                let _ = proc.kill();
                let _ = proc.wait();
                return Err(format!(
                    "timed out after {}s — partial file at {}",
                    timeout.as_secs(),
                    dest.display(),
                ));
            }
            Ok(None) => std::thread::sleep(Duration::from_millis(50)),
            Err(e) => return Err(format!("wait failed: {e}")),
        }
    };

    if !status.success() {
        let mut complaint = String::new();
        if let Some(mut pipe) = proc.stderr.take() {
            let _ = pipe.read_to_string(&mut complaint);
        }
        return Err(format!(
            "`say` exited {} — {}",
            status.code().unwrap_or(-1),
            complaint.trim(),
        ));
    }

    Ok(std::fs::metadata(dest).map(|meta| meta.len()).unwrap_or(0))
}
/// Lists installed voices by running `say -v "?"` and parsing its output.
///
/// Each output line is expected to look like:
///
/// ```text
/// Milena (Enhanced)    ru-RU    # Sample text.
/// ```
///
/// Everything after the first `"# "` is the sample sentence. In the part
/// before it, the last whitespace-separated token is the locale (it may
/// be longer than five characters, e.g. "en-scotland") and the remaining
/// tokens, re-joined with single spaces, form the voice name. Lines
/// without both a name and a locale are skipped.
///
/// Returns `(name, locale, sample)` tuples in the order `say` printed
/// them. If the command cannot be run the result is an empty list, which
/// callers treat as "no voices known".
pub(super) fn list_voices() -> Vec<(String, String, String)> {
    let Ok(listing) = Command::new("/usr/bin/say").args(["-v", "?"]).output() else {
        return Vec::new();
    };
    let text = String::from_utf8_lossy(&listing.stdout);

    text.lines()
        .filter_map(|row| {
            // Peel the sample sentence off first; it is absent when the
            // line carries no "# " marker.
            let (columns, sample) = row.split_once("# ").unwrap_or((row, ""));

            let mut words: Vec<&str> = columns.split_whitespace().collect();
            // Last token is the locale; a blank line has none.
            let locale = words.pop()?;
            if words.is_empty() {
                // Only one token on the line, so there is no voice name.
                return None;
            }
            Some((words.join(" "), locale.to_string(), sample.to_string()))
        })
        .collect()
}
/// Resolves `needle` to an installed voice name.
///
/// Matching is a case-insensitive substring test against the names
/// returned by `list_voices`. Among the matches, names containing
/// "Enhanced" or "Premium" win, so `"Milena"` auto-upgrades to
/// `"Milena (Enhanced)"` when that variant is installed. Remaining ties
/// go to the shorter name (so plain "Milena" doesn't lose to a random
/// "Milena's Cousin"), and after that to whichever was listed first.
///
/// Returns the chosen name verbatim (suitable for the `-v` flag), or
/// `None` when `needle` is empty or nothing matches.
pub(super) fn pick_voice(needle: &str) -> Option<String> {
    if needle.is_empty() {
        return None;
    }
    let wanted = needle.to_lowercase();

    Self::list_voices()
        .into_iter()
        .filter_map(|(name, _locale, _sample)| {
            let folded = name.to_lowercase();
            if !folded.contains(&wanted) {
                return None;
            }
            let upgraded = folded.contains("enhanced") || folded.contains("premium");
            // Ranking key, smaller is better: upgraded voices first
            // (`false < true`), then fewer characters.
            Some((!upgraded, name.chars().count(), name))
        })
        // `min_by_key` keeps the first of several equal minima, so the
        // earliest-listed voice wins a complete tie.
        .min_by_key(|(plain, length, _)| (*plain, *length))
        .map(|(_, _, name)| name)
}
}
/// Tears down any attached `say` process when the wrapper goes out of
/// scope, so dropping a `Say` never leaves speech playing.
impl Drop for Say {
fn drop(&mut self) {
// Same kill-and-reap path as an explicit `stop()`; does nothing when
// no child is attached.
self.stop();
}
}