Skip to main content

rust_tts_wrapper/
lib.rs

1//! # rust-tts-wrapper
2//!
3//! Cross-platform TTS (Text-to-Speech) wrapper with a C ABI.
4//! Mirrors [`js-tts-wrapper`] and `SwiftTTSWrapper`, supporting 21 engines:
5//! system (speech-dispatcher), Sherpa-ONNX (191 local models), and 19 cloud providers.
6//!
7//! [`js-tts-wrapper`]: https://github.com/AACTools/js-tts-wrapper
8//!
9//! ## Quick start (C)
10//!
11//! ```c
12//! tts_ctx* ctx = tts_create("system", NULL);
13//! tts_speak(ctx, "Hello world");
14//! tts_destroy(ctx);
15//! ```
16
17#![allow(
18    clippy::missing_panics_doc,
19    clippy::not_unsafe_ptr_arg_deref,
20    clippy::cast_possible_truncation,
21    clippy::cast_possible_wrap,
22    clippy::cast_sign_loss,
23    clippy::ptr_as_ptr,
24    clippy::cast_ptr_alignment,
25    clippy::doc_markdown,
26    clippy::multiple_crate_versions,
27    clippy::field_reassign_with_default,
28    non_camel_case_types,
29    dead_code
30)]
31
32#[cfg(feature = "cloud")]
33mod cloud_engine;
34pub mod engine;
35pub mod factory;
36#[cfg(feature = "sherpaonnx")]
37mod sherpaonnx_engine;
38#[cfg(feature = "system")]
39mod system_engine;
40pub mod types;
41
42use std::ffi::{CStr, CString};
43use std::os::raw::c_char;
44use std::ptr;
45use std::sync::Mutex;
46
47use engine::TtsEngine;
48use factory::create_engine;
49
50type BoxedEngine = Box<dyn TtsEngine>;
51
/// C callback invoked with a chunk of audio bytes.
///
/// Arguments, in order: pointer to the chunk, chunk length in bytes, and the
/// opaque `userdata` pointer registered through [`tts_set_on_audio`].
/// The chunk pointer refers to a borrowed slice, so it is only valid for the
/// duration of the callback. `None` (a null function pointer) disables it.
pub type CAudioCb = Option<extern "C" fn(*const u8, usize, *mut std::ffi::c_void)>;
/// C callback invoked for word-boundary events.
///
/// Arguments, in order: the word as a NUL-terminated string (a temporary that
/// is freed as soon as the callback returns), the `start` and `end` values
/// reported by the engine (units are engine-defined — not visible in this
/// file), and the opaque `userdata` pointer registered through
/// [`tts_set_on_boundary`]. `None` disables it.
pub type CBoundaryCb = Option<extern "C" fn(*const c_char, f32, f32, *mut std::ffi::c_void)>;
/// Heap-allocated Rust closure that forwards audio chunks to a [`CAudioCb`].
type BoxedAudioCb = Box<dyn FnMut(&[u8])>;
/// Heap-allocated Rust closure that forwards boundary events to a [`CBoundaryCb`].
type BoxedBoundaryCb = Box<dyn FnMut(&str, f32, f32)>;

/// Opaque context holding an engine instance and its per-instance settings.
///
/// Created by [`tts_create`] and released by [`tts_destroy`]; C callers only
/// ever see a pointer to it. Every field sits behind its own `Mutex`, so each
/// setting is read and written independently of the others.
pub struct tts_ctx {
    /// The engine produced by the factory for this context.
    engine: Mutex<BoxedEngine>,
    /// Voice passed to the engine on each call; `None` until `tts_set_voice` is used.
    voice_id: Mutex<Option<String>>,
    /// Speech rate passed to the engine (initialised to 1.0).
    rate: Mutex<f32>,
    /// Speech pitch passed to the engine (initialised to 1.0).
    pitch: Mutex<f32>,
    /// Speech volume passed to the engine (initialised to 1.0).
    volume: Mutex<f32>,
    /// Text of the most recent engine error recorded on this context.
    last_error: Mutex<String>,
    /// Audio-chunk callback, if one has been registered.
    on_audio: Mutex<CAudioCb>,
    /// Opaque pointer handed back to `on_audio` unchanged.
    on_audio_userdata: Mutex<*mut std::ffi::c_void>,
    /// Word-boundary callback, if one has been registered.
    on_boundary: Mutex<CBoundaryCb>,
    /// Opaque pointer handed back to `on_boundary` unchanged.
    on_boundary_userdata: Mutex<*mut std::ffi::c_void>,
}
70
/// Process-wide slot holding the message returned by `tts_get_last_error`.
static LAST_ERROR: Mutex<Option<CString>> = Mutex::new(None);

/// Record `msg` as the most recent global error.
///
/// A C string cannot carry interior NUL bytes, so they are removed from the
/// message rather than discarding the whole text. A poisoned lock is recovered
/// instead of silently dropping the message: the slot only ever holds a plain
/// `Option<CString>`, so there is no invariant a panicking writer could have
/// broken.
fn set_error(msg: &str) {
    let sanitized: Vec<u8> = msg.bytes().filter(|&byte| byte != 0).collect();
    // Cannot fail once the NUL bytes are gone; the default (empty string) is
    // only a formality to avoid a panic path.
    let message = CString::new(sanitized).unwrap_or_default();
    let mut slot = LAST_ERROR
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    *slot = Some(message);
}
78
79/// Create a new TTS engine instance.
80///
81/// Returns an opaque context pointer on success, or null on failure.
82/// Call [`tts_get_last_error`] to retrieve the error message on failure.
83///
84/// # Safety
85///
86/// `engine_id` must be a valid null-terminated C string.
87/// `credentials_json` may be null or a valid null-terminated JSON string.
88#[no_mangle]
89pub extern "C" fn tts_create(
90    engine_id: *const c_char,
91    credentials_json: *const c_char,
92) -> *mut tts_ctx {
93    if engine_id.is_null() {
94        set_error("engine_id is null");
95        return ptr::null_mut();
96    }
97    let engine_id_str = unsafe { CStr::from_ptr(engine_id) }
98        .to_string_lossy()
99        .into_owned();
100    let creds = if credentials_json.is_null() {
101        String::new()
102    } else {
103        unsafe { CStr::from_ptr(credentials_json) }
104            .to_string_lossy()
105            .into_owned()
106    };
107
108    if let Some(engine) = create_engine(&engine_id_str, &creds) {
109        let ctx = Box::new(tts_ctx {
110            engine: Mutex::new(engine),
111            voice_id: Mutex::new(None),
112            rate: Mutex::new(1.0),
113            pitch: Mutex::new(1.0),
114            volume: Mutex::new(1.0),
115            last_error: Mutex::new(String::new()),
116            on_audio: Mutex::new(None),
117            on_audio_userdata: Mutex::new(ptr::null_mut()),
118            on_boundary: Mutex::new(None),
119            on_boundary_userdata: Mutex::new(ptr::null_mut()),
120        });
121        Box::into_raw(ctx)
122    } else {
123        set_error(&format!("Unknown engine: {engine_id_str}"));
124        ptr::null_mut()
125    }
126}
127
128/// Destroy a TTS context and free all associated resources.
129///
130/// # Safety
131///
132/// `ctx` must be a pointer previously returned by [`tts_create`],
133/// or null (no-op).
134#[no_mangle]
135pub extern "C" fn tts_destroy(ctx: *mut tts_ctx) {
136    if !ctx.is_null() {
137        unsafe {
138            drop(Box::from_raw(ctx));
139        }
140    }
141}
142
143/// Speak `text` asynchronously using the engine in `ctx`.
144///
145/// Returns 0 on success, -1 on failure.
146///
147/// # Safety
148///
149/// `ctx` must be a valid pointer from [`tts_create`].
150/// `text` must be a valid null-terminated C string.
151#[no_mangle]
152pub extern "C" fn tts_speak(ctx: *mut tts_ctx, text: *const c_char) -> i32 {
153    if ctx.is_null() || text.is_null() {
154        return -1;
155    }
156    let ctx_ref = unsafe { &*ctx };
157    let text_str = unsafe { CStr::from_ptr(text) }
158        .to_string_lossy()
159        .into_owned();
160    let voice = ctx_ref.voice_id.lock().unwrap().clone();
161    let rate = *ctx_ref.rate.lock().unwrap();
162    let pitch = *ctx_ref.pitch.lock().unwrap();
163    let volume = *ctx_ref.volume.lock().unwrap();
164
165    let audio_cb = *ctx_ref.on_audio.lock().unwrap();
166    let audio_userdata = *ctx_ref.on_audio_userdata.lock().unwrap();
167    let boundary_cb = *ctx_ref.on_boundary.lock().unwrap();
168    let boundary_userdata = *ctx_ref.on_boundary_userdata.lock().unwrap();
169
170    let mut on_audio_closure: Option<BoxedAudioCb> = match audio_cb {
171        Some(cb) => Some(Box::new(move |bytes: &[u8]| {
172            cb(bytes.as_ptr(), bytes.len(), audio_userdata);
173        })),
174        None => None,
175    };
176
177    let mut on_boundary_closure: Option<BoxedBoundaryCb> = match boundary_cb {
178        Some(cb) => Some(Box::new(move |word: &str, start: f32, end: f32| {
179            if let Ok(c_word) = CString::new(word) {
180                cb(c_word.as_ptr(), start, end, boundary_userdata);
181            }
182        })),
183        None => None,
184    };
185
186    let engine = ctx_ref.engine.lock().unwrap();
187    match engine.speak(
188        &text_str,
189        voice.as_deref(),
190        rate,
191        pitch,
192        volume,
193        on_audio_closure
194            .as_mut()
195            .map(|f| &mut **f as &mut dyn FnMut(&[u8])),
196        on_boundary_closure
197            .as_mut()
198            .map(|f| &mut **f as &mut dyn FnMut(&str, f32, f32)),
199    ) {
200        Ok(()) => 0,
201        Err(e) => {
202            *ctx_ref.last_error.lock().unwrap() = e.to_string();
203            -1
204        }
205    }
206}
207
208/// Speak `text` synchronously (blocks until complete).
209///
210/// Returns 0 on success, -1 on failure.
211///
212/// # Safety
213///
214/// `ctx` must be a valid pointer from [`tts_create`].
215/// `text` must be a valid null-terminated C string.
216#[no_mangle]
217pub extern "C" fn tts_speak_sync(ctx: *mut tts_ctx, text: *const c_char) -> i32 {
218    if ctx.is_null() || text.is_null() {
219        return -1;
220    }
221    let ctx_ref = unsafe { &*ctx };
222    let text_str = unsafe { CStr::from_ptr(text) }
223        .to_string_lossy()
224        .into_owned();
225    let voice = ctx_ref.voice_id.lock().unwrap().clone();
226    let rate = *ctx_ref.rate.lock().unwrap();
227    let pitch = *ctx_ref.pitch.lock().unwrap();
228    let volume = *ctx_ref.volume.lock().unwrap();
229
230    let audio_cb = *ctx_ref.on_audio.lock().unwrap();
231    let audio_userdata = *ctx_ref.on_audio_userdata.lock().unwrap();
232    let boundary_cb = *ctx_ref.on_boundary.lock().unwrap();
233    let boundary_userdata = *ctx_ref.on_boundary_userdata.lock().unwrap();
234
235    let mut on_audio_closure: Option<BoxedAudioCb> = match audio_cb {
236        Some(cb) => Some(Box::new(move |bytes: &[u8]| {
237            cb(bytes.as_ptr(), bytes.len(), audio_userdata);
238        })),
239        None => None,
240    };
241
242    let mut on_boundary_closure: Option<BoxedBoundaryCb> = match boundary_cb {
243        Some(cb) => Some(Box::new(move |word: &str, start: f32, end: f32| {
244            if let Ok(c_word) = CString::new(word) {
245                cb(c_word.as_ptr(), start, end, boundary_userdata);
246            }
247        })),
248        None => None,
249    };
250
251    let engine = ctx_ref.engine.lock().unwrap();
252    match engine.speak_sync(
253        &text_str,
254        voice.as_deref(),
255        rate,
256        pitch,
257        volume,
258        on_audio_closure
259            .as_mut()
260            .map(|f| &mut **f as &mut dyn FnMut(&[u8])),
261        on_boundary_closure
262            .as_mut()
263            .map(|f| &mut **f as &mut dyn FnMut(&str, f32, f32)),
264    ) {
265        Ok(()) => 0,
266        Err(e) => {
267            *ctx_ref.last_error.lock().unwrap() = e.to_string();
268            -1
269        }
270    }
271}
272
273/// Stop any in-progress speech.
274///
275/// # Safety
276///
277/// `ctx` must be a valid pointer from [`tts_create`].
278#[no_mangle]
279pub extern "C" fn tts_stop(ctx: *mut tts_ctx) {
280    if ctx.is_null() {
281        return;
282    }
283    let ctx_ref = unsafe { &*ctx };
284    let engine = ctx_ref.engine.lock().unwrap();
285    let _ = engine.stop();
286}
287
288/// Retrieve the list of available voices for the engine.
289///
290/// On success, writes a heap-allocated array to `*out_voices` and its length
291/// to `*out_count`. Caller must free with [`tts_free_voices`].
292///
293/// Returns 0 on success, -1 on failure.
294///
295/// # Safety
296///
297/// `ctx` must be valid. `out_voices` and `out_count` must be non-null.
298#[no_mangle]
299pub extern "C" fn tts_get_voices(
300    ctx: *mut tts_ctx,
301    out_voices: *mut *mut types::tts_voice,
302    out_count: *mut i32,
303) -> i32 {
304    if ctx.is_null() || out_voices.is_null() || out_count.is_null() {
305        return -1;
306    }
307    let ctx_ref = unsafe { &*ctx };
308    let engine = ctx_ref.engine.lock().unwrap();
309    match engine.get_voices() {
310        Ok(voices) => {
311            let len = voices.len();
312            if len == 0 {
313                unsafe {
314                    *out_voices = ptr::null_mut();
315                    *out_count = 0;
316                }
317                return 0;
318            }
319            let layout = std::alloc::Layout::array::<types::tts_voice>(len).unwrap();
320            let arr_ptr = unsafe { std::alloc::alloc(layout).cast::<types::tts_voice>() };
321            for (i, v) in voices.iter().enumerate() {
322                unsafe {
323                    let entry = arr_ptr.add(i);
324                    std::ptr::write(
325                        entry,
326                        types::tts_voice {
327                            id: CString::new(v.id.clone()).unwrap().into_raw(),
328                            name: CString::new(v.name.clone()).unwrap().into_raw(),
329                            language: CString::new(v.primary_language().to_string())
330                                .unwrap()
331                                .into_raw(),
332                            gender: CString::new(v.gender.to_string()).unwrap().into_raw(),
333                            engine: CString::new(v.provider.clone()).unwrap().into_raw(),
334                        },
335                    );
336                }
337            }
338            unsafe {
339                *out_voices = arr_ptr;
340                *out_count = len as i32;
341            }
342            0
343        }
344        Err(e) => {
345            *ctx_ref.last_error.lock().unwrap() = e.to_string();
346            -1
347        }
348    }
349}
350
351/// Free a voice array previously returned by [`tts_get_voices`].
352///
353/// # Safety
354///
355/// `voices` must be a pointer from `tts_get_voices` with the matching `count`.
356#[no_mangle]
357pub extern "C" fn tts_free_voices(voices: *mut types::tts_voice, count: i32) {
358    if voices.is_null() || count <= 0 {
359        return;
360    }
361    for i in 0..count {
362        unsafe {
363            let v = voices.add(i as usize);
364            if !(*v).id.is_null() {
365                let _ = CString::from_raw((*v).id);
366            }
367            if !(*v).name.is_null() {
368                let _ = CString::from_raw((*v).name);
369            }
370            if !(*v).language.is_null() {
371                let _ = CString::from_raw((*v).language);
372            }
373            if !(*v).gender.is_null() {
374                let _ = CString::from_raw((*v).gender);
375            }
376            if !(*v).engine.is_null() {
377                let _ = CString::from_raw((*v).engine);
378            }
379        }
380    }
381    let layout = std::alloc::Layout::array::<types::tts_voice>(count as usize).unwrap();
382    unsafe {
383        std::alloc::dealloc(voices.cast::<u8>(), layout);
384    }
385}
386
387/// Set the voice for subsequent speak calls.
388///
389/// # Safety
390///
391/// `ctx` must be valid. `voice_id` must be a valid null-terminated C string.
392#[no_mangle]
393pub extern "C" fn tts_set_voice(ctx: *mut tts_ctx, voice_id: *const c_char) {
394    if ctx.is_null() || voice_id.is_null() {
395        return;
396    }
397    let ctx_ref = unsafe { &*ctx };
398    let id = unsafe { CStr::from_ptr(voice_id) }
399        .to_string_lossy()
400        .into_owned();
401    *ctx_ref.voice_id.lock().unwrap() = Some(id);
402}
403
404/// Set the speech rate (1.0 = normal).
405///
406/// # Safety
407///
408/// `ctx` must be valid.
409#[no_mangle]
410pub extern "C" fn tts_set_rate(ctx: *mut tts_ctx, rate: f32) {
411    if ctx.is_null() {
412        return;
413    }
414    *unsafe { &*ctx }.rate.lock().unwrap() = rate;
415}
416
417/// Set the speech pitch (1.0 = normal).
418///
419/// # Safety
420///
421/// `ctx` must be valid.
422#[no_mangle]
423pub extern "C" fn tts_set_pitch(ctx: *mut tts_ctx, pitch: f32) {
424    if ctx.is_null() {
425        return;
426    }
427    *unsafe { &*ctx }.pitch.lock().unwrap() = pitch;
428}
429
430/// Set the speech volume (1.0 = normal).
431///
432/// # Safety
433///
434/// `ctx` must be valid.
435#[no_mangle]
436pub extern "C" fn tts_set_volume(ctx: *mut tts_ctx, volume: f32) {
437    if ctx.is_null() {
438        return;
439    }
440    *unsafe { &*ctx }.volume.lock().unwrap() = volume;
441}
442
443/// Set the callback for streaming audio chunks.
444///
445/// # Safety
446/// `ctx` must be valid.
447#[no_mangle]
448pub extern "C" fn tts_set_on_audio(
449    ctx: *mut tts_ctx,
450    cb: CAudioCb,
451    userdata: *mut std::ffi::c_void,
452) {
453    if ctx.is_null() {
454        return;
455    }
456    let ctx_ref = unsafe { &*ctx };
457    *ctx_ref.on_audio.lock().unwrap() = cb;
458    *ctx_ref.on_audio_userdata.lock().unwrap() = userdata;
459}
460
461/// Set the callback for word boundary events.
462///
463/// # Safety
464/// `ctx` must be valid.
465#[no_mangle]
466pub extern "C" fn tts_set_on_boundary(
467    ctx: *mut tts_ctx,
468    cb: CBoundaryCb,
469    userdata: *mut std::ffi::c_void,
470) {
471    if ctx.is_null() {
472        return;
473    }
474    let ctx_ref = unsafe { &*ctx };
475    *ctx_ref.on_boundary.lock().unwrap() = cb;
476    *ctx_ref.on_boundary_userdata.lock().unwrap() = userdata;
477}
478
479/// Return the number of registered engines.
480#[no_mangle]
481pub extern "C" fn tts_get_engine_count() -> i32 {
482    factory::engine_count() as i32
483}
484
485/// Write engine descriptors into a caller-allocated array.
486///
487/// `out_engines` must point to at least [`tts_get_engine_count`] entries.
488/// Caller must free each entry's strings and the array with [`tts_free_engine_info`].
489///
490/// # Safety
491///
492/// `out_engines` must be non-null and point to enough space.
493#[no_mangle]
494pub extern "C" fn tts_get_engines(out_engines: *mut types::tts_engine_info) {
495    if out_engines.is_null() {
496        return;
497    }
498    let engines = factory::engine_list();
499    for (i, e) in engines.iter().enumerate() {
500        unsafe {
501            let entry = out_engines.add(i);
502            std::ptr::write(
503                entry,
504                types::tts_engine_info {
505                    id: CString::new(e.id.clone()).unwrap().into_raw(),
506                    name: CString::new(e.name.clone()).unwrap().into_raw(),
507                    needs_credentials: e.needs_credentials,
508                    credential_keys_json: CString::new(e.credential_keys_json.clone())
509                        .unwrap()
510                        .into_raw(),
511                },
512            );
513        }
514    }
515}
516
517/// Free an engine info array previously returned by [`tts_get_engines`].
518///
519/// # Safety
520///
521/// `engines` must be a pointer from `tts_get_engines` with the matching `count`.
522#[no_mangle]
523pub extern "C" fn tts_free_engine_info(engines: *mut types::tts_engine_info, count: i32) {
524    if engines.is_null() || count <= 0 {
525        return;
526    }
527    for i in 0..count {
528        unsafe {
529            let e = engines.add(i as usize);
530            if !(*e).id.is_null() {
531                let _ = CString::from_raw((*e).id);
532            }
533            if !(*e).name.is_null() {
534                let _ = CString::from_raw((*e).name);
535            }
536            if !(*e).credential_keys_json.is_null() {
537                let _ = CString::from_raw((*e).credential_keys_json);
538            }
539        }
540    }
541    let layout = std::alloc::Layout::array::<types::tts_engine_info>(count as usize).unwrap();
542    unsafe {
543        std::alloc::dealloc(engines.cast::<u8>(), layout);
544    }
545}
546
547/// Return the last error message as a C string, or null if none.
548///
549/// The returned pointer is valid until the next call to any TTS function.
550#[no_mangle]
551pub extern "C" fn tts_get_last_error() -> *const c_char {
552    match LAST_ERROR.lock() {
553        Ok(guard) => match guard.as_ref() {
554            Some(cs) => cs.as_ptr(),
555            None => ptr::null(),
556        },
557        Err(_) => ptr::null(),
558    }
559}
560
561/// Pause in-progress speech.
562///
563/// # Safety
564/// `ctx` must be valid.
565#[no_mangle]
566pub extern "C" fn tts_pause(ctx: *mut tts_ctx) {
567    if ctx.is_null() {
568        return;
569    }
570    let ctx_ref = unsafe { &*ctx };
571    let engine = ctx_ref.engine.lock().unwrap();
572    let _ = engine.pause();
573}
574
575/// Resume paused speech.
576///
577/// # Safety
578/// `ctx` must be valid.
579#[no_mangle]
580pub extern "C" fn tts_resume(ctx: *mut tts_ctx) {
581    if ctx.is_null() {
582        return;
583    }
584    let ctx_ref = unsafe { &*ctx };
585    let engine = ctx_ref.engine.lock().unwrap();
586    let _ = engine.resume();
587}
588
589/// Synthesize text to audio bytes without playback.
590/// Writes a heap-allocated buffer to `*out_bytes` and its length to `*out_len`.
591/// Caller must free with [`tts_free_bytes`].
592/// Returns 0 on success, -1 on failure.
593///
594/// # Safety
595/// `ctx` must be valid. `out_bytes` and `out_len` must be non-null.
596#[no_mangle]
597pub extern "C" fn tts_synth_to_bytes(
598    ctx: *mut tts_ctx,
599    text: *const c_char,
600    out_bytes: *mut *mut u8,
601    out_len: *mut usize,
602) -> i32 {
603    if ctx.is_null() || text.is_null() || out_bytes.is_null() || out_len.is_null() {
604        return -1;
605    }
606    let ctx_ref = unsafe { &*ctx };
607    let text_str = unsafe { CStr::from_ptr(text) }
608        .to_string_lossy()
609        .into_owned();
610    let voice = ctx_ref.voice_id.lock().unwrap().clone();
611    let rate = *ctx_ref.rate.lock().unwrap();
612    let pitch = *ctx_ref.pitch.lock().unwrap();
613    let volume = *ctx_ref.volume.lock().unwrap();
614
615    let engine = ctx_ref.engine.lock().unwrap();
616    match engine.synth_to_bytes(&text_str, voice.as_deref(), rate, pitch, volume) {
617        Ok(data) => {
618            if data.is_empty() {
619                unsafe {
620                    *out_bytes = ptr::null_mut();
621                    *out_len = 0;
622                }
623                return 0;
624            }
625            let len = data.len();
626            let layout = std::alloc::Layout::array::<u8>(len).unwrap();
627            let ptr = unsafe { std::alloc::alloc(layout) };
628            unsafe {
629                ptr::copy_nonoverlapping(data.as_ptr(), ptr, len);
630                *out_bytes = ptr;
631                *out_len = len;
632            }
633            0
634        }
635        Err(e) => {
636            *ctx_ref.last_error.lock().unwrap() = e.to_string();
637            -1
638        }
639    }
640}
641
/// Release a byte buffer obtained from [`tts_synth_to_bytes`].
///
/// A null `bytes` or a zero `len` is a no-op.
///
/// # Safety
/// `bytes` must be from `tts_synth_to_bytes` with the matching `len`.
#[no_mangle]
pub extern "C" fn tts_free_bytes(bytes: *mut u8, len: usize) {
    let nothing_to_free = len == 0 || bytes.is_null();
    if !nothing_to_free {
        // The layout must match the one the buffer was allocated with.
        let buffer_layout = std::alloc::Layout::array::<u8>(len).unwrap();
        // SAFETY: the caller guarantees `bytes` was allocated by this library
        // for exactly `len` bytes.
        unsafe { std::alloc::dealloc(bytes, buffer_layout) };
    }
}