use std::sync::{Arc, Condvar, Mutex};
use std::time::{Duration, Instant};
use objc::runtime::Object;
#[allow(unused_imports)]
use objc::{msg_send, sel, sel_impl};
use tracing::{debug, info, warn};
use super::ffi::{ns_string_from_str, ns_string_to_rust, objc_class, release_objc_object};
use super::{AudioData, AudioError};
/// Speech-recognition authorization state, represented by the integer codes 0 through 3.
///
/// The explicit discriminants are the same raw values that
/// `SpeechAuthStatus::from_raw` maps back into variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SpeechAuthStatus {
    /// Raw value 0; also used for any raw value that is not recognised.
    NotDetermined = 0,
    /// Raw value 1.
    Denied = 1,
    /// Raw value 2.
    Restricted = 2,
    /// Raw value 3.
    Authorized = 3,
}

impl SpeechAuthStatus {
    /// Converts a raw status integer into the matching variant.
    ///
    /// Any value other than 1, 2 or 3 (including 0 and negatives) yields
    /// `NotDetermined`.
    fn from_raw(v: i64) -> Self {
        // Look the value up among the non-default variants by discriminant.
        [Self::Denied, Self::Restricted, Self::Authorized]
            .iter()
            .copied()
            .find(|candidate| *candidate as i64 == v)
            .unwrap_or(Self::NotDetermined)
    }
}
/// Transcribes `audio` to text.
///
/// Logs the number of samples at debug level and then delegates to
/// `transcribe_with_sf_speech`, returning its result unchanged.
///
/// # Errors
/// Propagates whatever `AudioError` the delegate returns.
pub fn transcribe(audio: &AudioData) -> Result<String, AudioError> {
    let sample_count = audio.samples.len();
    debug!(samples = sample_count, "transcribing audio");
    transcribe_with_sf_speech(audio)
}
/// Speaks `text` aloud and returns how long the synthesis call took.
///
/// Delegates to `speak_with_ns_speech_synthesizer` after validating the input.
///
/// # Errors
/// Returns `AudioError::Synthesis` when `text` is empty, and propagates any error
/// from the synthesizer backend.
pub fn speak(text: &str) -> Result<Duration, AudioError> {
    if text.is_empty() {
        return Err(AudioError::Synthesis(
            "Cannot speak empty string".to_string(),
        ));
    }
    // `str::len` is a byte count; the field is named `chars`, so count characters
    // to keep the log accurate for non-ASCII text.
    debug!(chars = text.chars().count(), "speaking text");
    speak_with_ns_speech_synthesizer(text)
}
/// Queries the current speech-recognition authorization status.
///
/// Sends `authorizationStatus` to the `SFSpeechRecognizer` class and converts the raw
/// integer via `SpeechAuthStatus::from_raw`. If the class cannot be looked up, the status
/// is reported as `Restricted`.
fn speech_authorization_status() -> SpeechAuthStatus {
    let recognizer_class = objc_class("SFSpeechRecognizer");
    if !recognizer_class.is_null() {
        // SAFETY: the class pointer was checked for null just above.
        let raw_status: i64 = unsafe { msg_send![recognizer_class, authorizationStatus] };
        SpeechAuthStatus::from_raw(raw_status)
    } else {
        SpeechAuthStatus::Restricted
    }
}
fn request_speech_authorization() -> Result<(), AudioError> {
let status = speech_authorization_status();
match status {
SpeechAuthStatus::Authorized => return Ok(()),
SpeechAuthStatus::Denied => {
return Err(AudioError::PermissionDenied);
}
SpeechAuthStatus::Restricted => {
return Err(AudioError::Transcription(
"Speech recognition is restricted on this device".to_string(),
));
}
SpeechAuthStatus::NotDetermined => {}
}
info!("Requesting SFSpeechRecognizer authorization from user");
let cls = objc_class("SFSpeechRecognizer");
if cls.is_null() {
return Err(AudioError::Transcription(
"SFSpeechRecognizer class not available (macOS 10.15+ required)".to_string(),
));
}
let granted_holder: Arc<Mutex<Option<SpeechAuthStatus>>> = Arc::new(Mutex::new(None));
let cvar = Arc::new(Condvar::new());
let granted_clone = Arc::clone(&granted_holder);
let cvar_clone = Arc::clone(&cvar);
let block = block::ConcreteBlock::new(move |raw_status: i64| {
let new_status = SpeechAuthStatus::from_raw(raw_status);
if let Ok(mut guard) = granted_clone.lock() {
*guard = Some(new_status);
}
cvar_clone.notify_one();
})
.copy();
unsafe {
let _: () = msg_send![cls, requestAuthorization: &*block];
}
let guard = granted_holder.lock().map_err(|_| {
AudioError::Transcription("Lock poisoned waiting for speech auth".to_string())
})?;
let (mut guard, timeout) = cvar
.wait_timeout(guard, Duration::from_secs(30))
.map_err(|_| AudioError::Transcription("Condvar wait failed".to_string()))?;
if timeout.timed_out() {
warn!("SFSpeechRecognizer authorization dialog timed out after 30s");
return Err(AudioError::PermissionDenied);
}
match guard.take().unwrap_or(SpeechAuthStatus::NotDetermined) {
SpeechAuthStatus::Authorized => Ok(()),
SpeechAuthStatus::Denied => Err(AudioError::PermissionDenied),
SpeechAuthStatus::Restricted => Err(AudioError::Transcription(
"Speech recognition is restricted on this device".to_string(),
)),
SpeechAuthStatus::NotDetermined => Err(AudioError::PermissionDenied),
}
}
/// Transcribes `audio` by writing it to a temporary WAV file and running the recognizer on it.
///
/// Steps: ensure authorization, encode the audio as WAV bytes, write them to a temp file,
/// run `run_sf_speech_recognizer` on that path, then delete the file (best effort) before
/// returning the recognizer's result.
///
/// # Errors
/// Propagates authorization and recognition errors; a failed temp-file write is reported as
/// `AudioError::Framework`.
fn transcribe_with_sf_speech(audio: &AudioData) -> Result<String, AudioError> {
    request_speech_authorization()?;

    let encoded = audio.to_wav_bytes();
    let wav_file = match write_temp_wav(&encoded) {
        Ok(path) => path,
        Err(e) => {
            return Err(AudioError::Framework(format!(
                "Temp file write failed: {e}"
            )))
        }
    };

    let outcome = run_sf_speech_recognizer(&wav_file);
    // Cleanup is best effort; a failed delete must not mask the recognition outcome.
    let _ = std::fs::remove_file(&wav_file);
    outcome
}
/// Writes `bytes` to a newly created, uniquely named `.wav` file under `/tmp` and returns
/// its path.
///
/// The file is opened with `create_new` (exclusive create) and mode `0o600`, so an existing
/// file or symlink at the chosen path is never followed, truncated or reused. The name
/// combines the process id, the wall clock's sub-second nanoseconds and a per-process
/// counter; if the name is nevertheless taken, a new one is tried a bounded number of times.
///
/// # Errors
/// Returns the underlying I/O error when the file cannot be created or written. When the
/// write fails, the partially written file is removed (best effort) before returning.
fn write_temp_wav(bytes: &[u8]) -> Result<String, std::io::Error> {
    use std::io::{Error, ErrorKind, Write};
    use std::os::unix::fs::OpenOptionsExt;
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    // Distinguishes files created by this process within the same clock tick.
    static SEQUENCE: AtomicU64 = AtomicU64::new(0);
    const MAX_ATTEMPTS: u32 = 16;

    for _ in 0..MAX_ATTEMPTS {
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map(|d| d.subsec_nanos())
            .unwrap_or(0);
        let seq = SEQUENCE.fetch_add(1, Ordering::Relaxed);
        let path = format!(
            "/tmp/axterminator_audio_{}_{}_{}.wav",
            std::process::id(),
            nanos,
            seq
        );

        let opened = std::fs::OpenOptions::new()
            .write(true)
            // Exclusive create: fail rather than open something that is already there.
            .create_new(true)
            .mode(0o600)
            .open(&path);
        let mut file = match opened {
            Ok(file) => file,
            // Name already taken (by anyone): pick another one.
            Err(e) if e.kind() == ErrorKind::AlreadyExists => continue,
            Err(e) => return Err(e),
        };

        if let Err(e) = file.write_all(bytes) {
            // Do not leave a truncated file behind.
            drop(file);
            let _ = std::fs::remove_file(&path);
            return Err(e);
        }
        return Ok(path);
    }

    Err(Error::new(
        ErrorKind::AlreadyExists,
        "could not find an unused temporary WAV file name",
    ))
}
fn run_sf_speech_recognizer(wav_path: &str) -> Result<String, AudioError> {
let recognizer = create_sf_speech_recognizer().ok_or_else(|| {
AudioError::Transcription(
"SFSpeechRecognizer unavailable — check that speech recognition \
is enabled and the locale (en-US) is supported on this device"
.to_string(),
)
})?;
let is_available: bool = unsafe { msg_send![recognizer, isAvailable] };
if !is_available {
return Err(AudioError::Transcription(
"SFSpeechRecognizer reports isAvailable=NO — \
on-device speech recognition may not be downloaded yet"
.to_string(),
));
}
let url = nsurl_from_path(wav_path)
.ok_or_else(|| AudioError::Transcription(format!("Cannot create NSURL for: {wav_path}")))?;
let request = create_sf_speech_url_recognition_request(url).ok_or_else(|| {
AudioError::Transcription("Failed to create recognition request".to_string())
})?;
set_requires_on_device_recognition(request, false);
let result_holder: Arc<Mutex<Option<Result<String, AudioError>>>> = Arc::new(Mutex::new(None));
let cvar = Arc::new(Condvar::new());
let result_clone = Arc::clone(&result_holder);
let cvar_clone = Arc::clone(&cvar);
recognize_async(
recognizer,
request,
move |transcript: Option<String>, error: Option<String>| {
let result = match (transcript, error) {
(Some(text), _) => Ok(text),
(None, Some(err)) => Err(AudioError::Transcription(err)),
(None, None) => Ok(String::new()),
};
if let Ok(mut guard) = result_clone.lock() {
*guard = Some(result);
}
cvar_clone.notify_one();
},
);
extern "C" {
fn CFRunLoopRunInMode(mode: *const Object, seconds: f64, ret: bool) -> i32;
static kCFRunLoopDefaultMode: *const Object;
}
let deadline = std::time::Instant::now() + Duration::from_secs(15);
loop {
if let Ok(guard) = result_holder.lock() {
if guard.is_some() {
break;
}
}
if std::time::Instant::now() >= deadline {
warn!("SFSpeechRecognizer timed out after 15s");
return Err(AudioError::Transcription(
"Recognition timed out — check that Speech Recognition is enabled in \
System Settings > Privacy & Security > Speech Recognition, and that \
the on-device dictation model is downloaded (System Settings > Keyboard > Dictation)"
.to_string(),
));
}
unsafe {
CFRunLoopRunInMode(kCFRunLoopDefaultMode, 0.1, false);
}
}
let mut guard = result_holder
.lock()
.map_err(|_| AudioError::Transcription("Lock poisoned".to_string()))?;
guard.take().unwrap_or(Ok(String::new()))
}
fn recognize_async(
recognizer: *mut Object,
request: *mut Object,
callback: impl Fn(Option<String>, Option<String>) + Send + 'static,
) {
let cb = Arc::new(Mutex::new(callback));
let task_block = block::ConcreteBlock::new(move |result: *mut Object, error: *mut Object| {
let is_final: bool = if result.is_null() {
true
} else {
unsafe { msg_send![result, isFinal] }
};
if !is_final {
return;
}
let transcript = if result.is_null() {
None
} else {
let best: *mut Object = unsafe { msg_send![result, bestTranscription] };
if best.is_null() {
None
} else {
let ns: *mut Object = unsafe { msg_send![best, formattedString] };
Some(ns_string_to_rust(ns))
}
};
let error_msg = if error.is_null() {
None
} else {
let desc: *mut Object = unsafe { msg_send![error, localizedDescription] };
Some(ns_string_to_rust(desc))
};
if let Ok(f) = cb.lock() {
f(transcript, error_msg);
}
})
.copy();
unsafe {
let _: *mut Object = msg_send![recognizer,
recognitionTaskWithRequest: request
resultHandler: &*task_block
];
}
}
/// Allocates an `SFSpeechRecognizer` initialised with the `en-US` locale.
///
/// Returns `None` when either the `SFSpeechRecognizer` or `NSLocale` class cannot be looked
/// up, or when `initWithLocale:` yields a null pointer.
fn create_sf_speech_recognizer() -> Option<*mut Object> {
    let recognizer_cls = objc_class("SFSpeechRecognizer");
    if recognizer_cls.is_null() {
        return None;
    }
    let ns_locale_cls = objc_class("NSLocale");
    if ns_locale_cls.is_null() {
        return None;
    }

    let identifier = ns_string_from_str("en-US");
    let recognizer: *mut Object = unsafe {
        let locale: *mut Object = msg_send![ns_locale_cls, localeWithLocaleIdentifier: identifier];
        let allocated: *mut Object = msg_send![recognizer_cls, alloc];
        msg_send![allocated, initWithLocale: locale]
    };

    Some(recognizer).filter(|ptr| !ptr.is_null())
}
/// Allocates an `SFSpeechURLRecognitionRequest` initialised with `url`.
///
/// Returns `None` when the class cannot be looked up or `initWithURL:` yields a null pointer.
fn create_sf_speech_url_recognition_request(url: *mut Object) -> Option<*mut Object> {
    let request_cls = objc_class("SFSpeechURLRecognitionRequest");
    if request_cls.is_null() {
        return None;
    }

    let request: *mut Object = unsafe {
        let allocated: *mut Object = msg_send![request_cls, alloc];
        msg_send![allocated, initWithURL: url]
    };

    Some(request).filter(|ptr| !ptr.is_null())
}
/// Sends `setRequiresOnDeviceRecognition:` with `value` to `request`.
///
/// `request` is messaged without a null check, so the caller supplies a valid object.
fn set_requires_on_device_recognition(request: *mut Object, value: bool) {
    let _: () = unsafe { msg_send![request, setRequiresOnDeviceRecognition: value] };
}
/// Builds a file `NSURL` for `path` via `fileURLWithPath:`.
///
/// Returns `None` when the `NSURL` class cannot be looked up or the resulting URL is null.
fn nsurl_from_path(path: &str) -> Option<*mut Object> {
    let url_cls = objc_class("NSURL");
    if url_cls.is_null() {
        return None;
    }

    let path_string = ns_string_from_str(path);
    let file_url: *mut Object = unsafe { msg_send![url_cls, fileURLWithPath: path_string] };

    Some(file_url).filter(|ptr| !ptr.is_null())
}
/// Speaks `text` with an `NSSpeechSynthesizer` and blocks until it stops speaking.
///
/// After `startSpeakingString:` succeeds, the synthesizer's `isSpeaking` flag is polled every
/// 10 ms until it reports false or 120 seconds have passed since the start. The synthesizer is
/// released on every path after it has been created.
///
/// Returns the time elapsed since just before the text was converted and speech was started.
///
/// # Errors
/// Returns `AudioError::Synthesis` when the synthesizer cannot be created, the text cannot be
/// converted to an `NSString`, or `startSpeakingString:` returns NO.
fn speak_with_ns_speech_synthesizer(text: &str) -> Result<Duration, AudioError> {
    let synth = match create_ns_speech_synthesizer() {
        Some(instance) => instance,
        None => {
            return Err(AudioError::Synthesis(
                "NSSpeechSynthesizer unavailable".to_string(),
            ))
        }
    };

    let started = Instant::now();
    let utterance = ns_string_from_str(text);
    if utterance.is_null() {
        release_objc_object(synth);
        return Err(AudioError::Synthesis(
            "Failed to create NSString for text".to_string(),
        ));
    }

    let accepted: bool = unsafe { msg_send![synth, startSpeakingString: utterance] };
    if !accepted {
        release_objc_object(synth);
        return Err(AudioError::Synthesis(
            "startSpeakingString: returned NO".to_string(),
        ));
    }

    let poll_interval = Duration::from_millis(10);
    let deadline = started + Duration::from_secs(120);
    loop {
        std::thread::sleep(poll_interval);
        let still_speaking: bool = unsafe { msg_send![synth, isSpeaking] };
        if !still_speaking {
            break;
        }
        // Upper bound on the wait: stop polling once the deadline has passed.
        if Instant::now() >= deadline {
            break;
        }
    }

    let elapsed = started.elapsed();
    release_objc_object(synth);
    Ok(elapsed)
}
/// Allocates an `NSSpeechSynthesizer`, passing a null voice to `initWithVoice:`.
///
/// Returns `None` when the class cannot be looked up or initialisation yields a null pointer.
fn create_ns_speech_synthesizer() -> Option<*mut Object> {
    let synth_cls = objc_class("NSSpeechSynthesizer");
    if synth_cls.is_null() {
        return None;
    }

    let synthesizer: *mut Object = unsafe {
        let allocated: *mut Object = msg_send![synth_cls, alloc];
        msg_send![allocated, initWithVoice: std::ptr::null_mut::<Object>()]
    };

    Some(synthesizer).filter(|ptr| !ptr.is_null())
}
/// Unit tests for the raw-status mapping, input validation in `speak`, and the temp WAV writer.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::audio::{encode_wav_pcm16, CHANNELS, SAMPLE_RATE};

    // --- SpeechAuthStatus::from_raw -------------------------------------------------------

    #[test]
    fn speech_auth_status_from_raw_authorized() {
        let status = SpeechAuthStatus::from_raw(3);
        assert_eq!(status, SpeechAuthStatus::Authorized);
    }

    #[test]
    fn speech_auth_status_from_raw_denied() {
        let status = SpeechAuthStatus::from_raw(1);
        assert_eq!(status, SpeechAuthStatus::Denied);
    }

    #[test]
    fn speech_auth_status_from_raw_restricted() {
        let status = SpeechAuthStatus::from_raw(2);
        assert_eq!(status, SpeechAuthStatus::Restricted);
    }

    #[test]
    fn speech_auth_status_from_raw_not_determined() {
        let status = SpeechAuthStatus::from_raw(0);
        assert_eq!(status, SpeechAuthStatus::NotDetermined);
    }

    #[test]
    fn speech_auth_status_from_raw_unknown_defaults_to_not_determined() {
        let status = SpeechAuthStatus::from_raw(99);
        assert_eq!(status, SpeechAuthStatus::NotDetermined);
    }

    // --- speak ----------------------------------------------------------------------------

    #[test]
    fn speak_empty_text_returns_synthesis_error() {
        let failure = speak("").unwrap_err();
        assert_eq!(failure.code(), "synthesis_error");
    }

    // --- write_temp_wav -------------------------------------------------------------------

    #[test]
    fn write_temp_wav_creates_readable_file() {
        use std::os::unix::fs::PermissionsExt;

        let wav = encode_wav_pcm16(&[], SAMPLE_RATE, CHANNELS);
        let path = write_temp_wav(&wav).unwrap();
        let permission_bits = std::fs::metadata(&path).unwrap().permissions().mode() & 0o777;
        assert_eq!(
            permission_bits, 0o600,
            "expected mode 0600, got {:o}",
            permission_bits
        );
        let _ = std::fs::remove_file(&path);
    }

    #[test]
    fn write_temp_wav_file_contains_wav_header() {
        let silence: Vec<f32> = vec![0.0; 16];
        let wav = encode_wav_pcm16(&silence, SAMPLE_RATE, CHANNELS);
        let path = write_temp_wav(&wav).unwrap();
        let on_disk = std::fs::read(&path).unwrap();
        assert_eq!(&on_disk[..4], b"RIFF");
        let _ = std::fs::remove_file(&path);
    }

    #[test]
    fn write_temp_wav_paths_are_unique_across_calls() {
        let wav = encode_wav_pcm16(&[], SAMPLE_RATE, CHANNELS);
        let first = write_temp_wav(&wav).unwrap();
        let second = write_temp_wav(&wav).unwrap();
        assert_ne!(first, second);
        for path in [&first, &second].iter() {
            let _ = std::fs::remove_file(path);
        }
    }
}