coqui_tts/
lib.rs

1//! Rust bindings for the coqui-TTS python library for Text-To-Speech
2
3use std::borrow::Borrow;
4
5use pyo3::{prelude::*, types::{PyDict, PyList}};
6
/// TTS Synthesizer. Equivalent to `TTS.utils.synthesizer.Synthesizer`.
///
/// The Python-side synthesizer object is not stored directly; it is kept under the
/// `coqui_tts` key of the `locals` dictionary and looked up on every call.
#[derive(Debug)]
pub struct Synthesizer {
    /// Python dictionary used as the local namespace of the initialization script run by
    /// [`Synthesizer::new`]; it holds the `coqui_tts` object the other methods operate on.
    locals: Py<PyDict>,
}
12
13impl Synthesizer {
14    /// Create a new Synthesizer, performing startup initialization (this method is NOT cheap to call, expect a few SECONDS of runtime)
15    ///
16    /// this will also download apropreate models if they are missing
17    ///
18    /// # Arguments
19    ///
20    /// model: the name of the TTS model to use. see https://github.com/coqui-ai/TTS for models.
21    ///
22    /// # Note
23    ///
24    /// this may spew out some text to stdout about initialization,
25    /// this is from the python library and there is nothing that can be done about it
26    ///
27    pub fn new(model: &str, use_cuda: bool) -> Self {
28        Python::with_gil(|py| {
29            let locals: Py<PyDict> = PyDict::new(py).into();
30            let locals_ref = locals.as_ref(py).borrow();
31            locals_ref.set_item("model_name", model).unwrap();
32            locals_ref.set_item("use_cuda", use_cuda).unwrap();
33            py.run(r#"
34from TTS.utils.synthesizer import Synthesizer
35from TTS.utils.manage import ModelManager
36# create instance of the coqui tts model manager
37manager = ModelManager()
38# download the model
39(
40    model_path,
41    config_path,
42    model_item,
43) = manager.download_model(model_name)
44# download the vocoder
45vocoder_path, vocoder_config_path, _ = manager.download_model(
46    model_item["default_vocoder"]
47)
48# create the coqui tts instance
49coqui_tts = Synthesizer(
50    model_path,
51    config_path,
52    vocoder_checkpoint=vocoder_path,
53    vocoder_config=vocoder_config_path,
54    use_cuda=use_cuda
55)
56            "#, None, Some(locals.as_ref(py).borrow())).unwrap();
57            Self { locals }
58        })
59    }
60
61    /// Synthesize some audio.
62    ///
63    /// # Returned format
64    /// channels: 1?
65    /// rate: see [`Synthesizer::sample_rate`]
66    ///
67    pub fn tts(&mut self, text: &str) -> Vec<f32> {
68        Python::with_gil(|py| {
69            let tts = self.locals.as_ref(py).borrow().get_item("coqui_tts").unwrap();
70            let audio = tts.call_method1("tts", (text,)).unwrap().downcast::<PyList>().unwrap();
71            audio.extract::<Vec<f32>>().unwrap()
72        })
73    }
74
75    pub fn sample_rate(&mut self) -> u64 {
76        Python::with_gil(|py| {
77            let tts = self.locals.as_ref(py).borrow().get_item("coqui_tts").unwrap();
78            tts.getattr("output_sample_rate").unwrap().extract::<u64>().unwrap()
79        })
80    }
81}