use std::env;
use std::ffi::{c_char, c_void, CStr, CString};
use std::mem;
use std::path::PathBuf;
use std::ptr;
use std::sync::OnceLock;
/// Name of the environment variable that `locate_espeak_data` consults first
/// when looking for a directory that contains the eSpeak-ng data folder.
const PIPER_ESPEAKNG_DATA_DIRECTORY: &str = "PIPER_ESPEAKNG_DATA_DIRECTORY";
/// Name of the folder that must exist inside a candidate directory for
/// `locate_espeak_data` to accept that directory.
const ESPEAKNG_DATA_DIR_NAME: &str = "espeak-ng-data";
/// Error type used throughout this module; it carries a human-readable message.
#[derive(Debug, Clone)]
pub struct ESpeakError(pub String);

impl std::fmt::Display for ESpeakError {
    /// Writes the wrapped message prefixed with `eSpeak-ng error: `.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(message) = self;
        write!(f, "eSpeak-ng error: {}", message)
    }
}

impl std::error::Error for ESpeakError {}
/// Convenience alias for results whose error type is [`ESpeakError`].
pub type ESpeakResult<T> = Result<T, ESpeakError>;
/// Holds the outcome of the first `init_espeak` run; `text_to_phonemes` reads it
/// through `get_or_init`, so initialization is attempted only once and its
/// result (success or error) is reused afterwards.
static ESPEAK_INIT: OnceLock<ESpeakResult<()>> = OnceLock::new();
fn init_espeak() -> ESpeakResult<()> {
let data_dir = locate_espeak_data();
let path_cstr = data_dir
.as_ref()
.and_then(|p| CString::new(p.to_string_lossy().as_ref()).ok());
let path_ptr = path_cstr.as_ref().map_or(ptr::null(), |c| c.as_ptr());
let sample_rate = unsafe {
espeak_rs_sys::espeak_Initialize(
espeak_rs_sys::espeak_AUDIO_OUTPUT_AUDIO_OUTPUT_RETRIEVAL,
0,
path_ptr,
espeak_rs_sys::espeakINITIALIZE_DONT_EXIT as i32,
)
};
if sample_rate <= 0 {
Err(ESpeakError(format!(
"Failed to initialize eSpeak-ng (code {sample_rate}). \
Try setting `{PIPER_ESPEAKNG_DATA_DIRECTORY}` to the directory containing `{ESPEAKNG_DATA_DIR_NAME}`."
)))
} else {
Ok(())
}
}
/// Finds a directory that contains the eSpeak-ng data folder.
///
/// Candidates are probed in this order, and the first one that contains an
/// entry named [`ESPEAKNG_DATA_DIR_NAME`] is returned:
///
/// 1. the directory named by the [`PIPER_ESPEAKNG_DATA_DIRECTORY`] environment
///    variable,
/// 2. the current working directory,
/// 3. the directory that holds the running executable.
///
/// Returns `None` when no candidate qualifies. Later candidates are only
/// queried when the earlier ones did not match.
fn locate_espeak_data() -> Option<PathBuf> {
    let has_data = |dir: &PathBuf| dir.join(ESPEAKNG_DATA_DIR_NAME).exists();

    // `var_os` is used instead of `var` so that a path which is not valid
    // Unicode is still honoured rather than silently skipped.
    env::var_os(PIPER_ESPEAKNG_DATA_DIRECTORY)
        .map(PathBuf::from)
        .filter(has_data)
        .or_else(|| env::current_dir().ok().filter(has_data))
        .or_else(|| {
            env::current_exe()
                .ok()
                .and_then(|exe| exe.parent().map(|dir| dir.to_path_buf()))
                .filter(has_data)
        })
}
/// Returns `s` with every parenthesized span removed, parentheses included.
///
/// Nested parentheses are tracked, so text is kept only while no `(` is open.
/// A stray `)` with nothing open is dropped and otherwise ignored.
fn strip_lang_switches(s: &str) -> String {
    let mut nesting = 0usize;
    let mut kept = String::with_capacity(s.len());
    kept.extend(s.chars().filter(|&ch| {
        if ch == '(' {
            nesting += 1;
            false
        } else if ch == ')' {
            // Saturate so an unmatched `)` cannot push the level below zero.
            nesting = nesting.saturating_sub(1);
            false
        } else {
            nesting == 0
        }
    }));
    kept
}
/// Converts `text` into IPA phoneme strings, grouped into sentences.
///
/// `text` is processed line by line. Within a line, eSpeak-ng is asked for one
/// clause at a time. Parenthesized spans are removed from each clause (see
/// `strip_lang_switches`) and the clauses are concatenated until the
/// accumulated string, ignoring trailing whitespace, ends in `.`, `?` or `!`.
/// At that point it is emitted as one sentence. Whatever is left at the end of
/// a line is emitted as a sentence of its own.
///
/// * `language` - voice name handed to `espeak_SetVoiceByName`.
/// * `phoneme_separator` - optional character inserted between phonemes.
///
/// # Errors
///
/// Returns an [`ESpeakError`] when:
/// * eSpeak-ng initialization failed (the stored error is returned each time),
/// * `language` or a line of `text` contains a NUL byte,
/// * the voice cannot be selected,
/// * eSpeak-ng yields no output and does not advance through the input.
pub fn text_to_phonemes(
    text: &str,
    language: &str,
    phoneme_separator: Option<char>,
) -> ESpeakResult<Vec<String>> {
    // Neither `espeak_SetVoiceByName` nor `espeak_TextToPhonemes` takes a
    // handle, so the selected voice lives in library-wide state. Calls are
    // serialized so one thread cannot switch the voice while another is in
    // the middle of phonemizing.
    static ESPEAK_CALLS: std::sync::Mutex<()> = std::sync::Mutex::new(());

    ESPEAK_INIT.get_or_init(init_espeak).clone()?;

    let lang_cstr = CString::new(language)
        .map_err(|_| ESpeakError("Language name contains a null byte".into()))?;

    // The mutex guards no data, so a poisoned lock carries no broken invariant
    // and can simply be reused. The guard is held until the function returns.
    let _espeak_guard = ESPEAK_CALLS
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);

    // SAFETY: `lang_cstr` is a valid NUL-terminated string that outlives the call.
    let set_voice = unsafe { espeak_rs_sys::espeak_SetVoiceByName(lang_cstr.as_ptr()) };
    if set_voice != espeak_rs_sys::espeak_ERROR_EE_OK {
        return Err(ESpeakError(format!("Failed to set voice: `{language}`")));
    }

    // The separator character is packed above the low flag byte.
    // NOTE(review): eSpeak-ng's header describes the separator as occupying
    // bits 8-23, so a separator above U+FFFF would spill into higher bits.
    // Confirm whether such separators should be rejected.
    let phoneme_mode = match phoneme_separator {
        Some(c) => ((c as u32) << 8) | espeak_rs_sys::espeakINITIALIZE_PHONEME_IPA,
        None => espeak_rs_sys::espeakINITIALIZE_PHONEME_IPA,
    } as i32;

    let mut sentences: Vec<String> = Vec::new();
    let mut current = String::new();

    for line in text.lines() {
        let text_cstr =
            CString::new(line).map_err(|_| ESpeakError("Text contains a null byte".into()))?;
        // eSpeak-ng moves this cursor forward clause by clause; the loop runs
        // until the cursor has been set to null.
        let mut text_ptr: *const c_char = text_cstr.as_ptr();

        while !text_ptr.is_null() {
            let cursor_before = text_ptr;
            // SAFETY: `text_ptr` points into `text_cstr`, which is alive for
            // the whole loop, and the first argument is a valid pointer to
            // that cursor.
            let res = unsafe {
                espeak_rs_sys::espeak_TextToPhonemes(
                    &mut text_ptr as *mut *const c_char as *mut *const c_void,
                    espeak_rs_sys::espeakCHARS_UTF8 as i32,
                    phoneme_mode,
                )
            };

            if res.is_null() {
                // No output and no progress through the input: calling again
                // with the same cursor would spin forever, so report it.
                if text_ptr == cursor_before {
                    return Err(ESpeakError(format!(
                        "eSpeak-ng produced no phonemes and made no progress for voice `{language}`"
                    )));
                }
                continue;
            }

            // SAFETY: `res` is non-null and is treated as a NUL-terminated
            // string owned by eSpeak-ng; it is copied into an owned `String`
            // before the library is called again.
            let clause = unsafe { CStr::from_ptr(res) }
                .to_string_lossy()
                .into_owned();
            let clause = strip_lang_switches(&clause);
            if clause.is_empty() {
                continue;
            }

            current.push_str(&clause);
            if matches!(current.trim_end().chars().last(), Some('.' | '?' | '!')) {
                sentences.push(mem::take(&mut current));
            }
        }

        // A line break always ends the sentence in progress.
        if !current.is_empty() {
            sentences.push(mem::take(&mut current));
        }
    }

    Ok(sentences)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Multi-sentence sample containing every clause breaker checked below.
    const TEXT_ALICE: &str =
        "Who are you? said the Caterpillar. Replied Alice , rather shyly, I hardly know, sir!";

    /// Phonemizes `text` and concatenates all returned sentences into one string.
    fn phonemize_joined(
        text: &str,
        language: &str,
        separator: Option<char>,
    ) -> ESpeakResult<String> {
        Ok(text_to_phonemes(text, language, separator)?.concat())
    }

    #[test]
    fn test_basic_en() -> ESpeakResult<()> {
        assert_eq!(phonemize_joined("test", "en-US", None)?, "tˈɛst.");
        Ok(())
    }

    #[test]
    fn test_it_splits_sentences() -> ESpeakResult<()> {
        let sentences = text_to_phonemes(TEXT_ALICE, "en-US", None)?;
        assert_eq!(sentences.len(), 3);
        Ok(())
    }

    #[test]
    fn test_it_adds_phoneme_separator() -> ESpeakResult<()> {
        assert_eq!(phonemize_joined("test", "en-US", Some('_'))?, "t_ˈɛ_s_t.");
        Ok(())
    }

    #[test]
    fn test_it_preserves_clause_breakers() -> ESpeakResult<()> {
        let phonemes = phonemize_joined(TEXT_ALICE, "en-US", None)?;
        ['.', ',', '?', '!'].into_iter().for_each(|c| {
            assert!(phonemes.contains(c), "Clause breaker `{c}` not preserved");
        });
        Ok(())
    }

    #[test]
    fn test_arabic() -> ESpeakResult<()> {
        let phonemes = phonemize_joined("مَرْحَبَاً بِكَ أَيُّهَا الْرَّجُلْ", "ar", None)?;
        assert_eq!(phonemes, "mˈarħabˌaː bikˌa ʔaˈiːuhˌaː alrrˈadʒul.");
        Ok(())
    }

    #[test]
    fn test_lang_switch_markers_stripped() -> ESpeakResult<()> {
        let phonemes = phonemize_joined("Hello معناها مرحباً", "ar", None)?;
        for marker in ["(en)", "(ar)"] {
            assert!(!phonemes.contains(marker));
        }
        Ok(())
    }

    #[test]
    fn test_line_splitting() -> ESpeakResult<()> {
        let sentences = text_to_phonemes("Hello\nThere\nAnd\nWelcome", "en-US", None)?;
        assert_eq!(sentences.len(), 4);
        Ok(())
    }
}