voicevox_dyn/
lib.rs

1//! # VoiceVox without hassle
2//! Downloads VOICEVOX CORE and dynamically loads it at runtime.
3//!
//! The intent of this crate is to make using VOICEVOX as easy as possible
//! and, in particular, to make it easy to distribute a single binary that
//! both sets up VOICEVOX and runs it.
7//!
8//! ### Alternatives
9//! If you prefer to dynamically link voicevox instead, I recommend using [vvcore](https://github.com/iwase22334/voicevox-core-rs).
10
11use color_eyre::eyre::bail;
12use libloading::Symbol;
13use std::{ffi::OsStr, path::PathBuf, process::Stdio};
14use tracing::info;
15
16pub struct VoiceVox {
17    fns: VoiceVoxFns,
18    init: bool,
19}
20
21#[ouroboros::self_referencing]
22pub struct VoiceVoxFns {
23    lib: libloading::Library,
24    #[covariant]
25    #[borrows(lib)]
26    init: Symbol<'this, unsafe extern "C" fn(InitOptions) -> ResultCode>,
27    #[covariant]
28    #[borrows(lib)]
29    load_model: Symbol<'this, unsafe extern "C" fn(u32) -> ResultCode>,
30    #[covariant]
31    #[borrows(lib)]
32    tts: Symbol<'this, TtsFn>,
33    #[covariant]
34    #[borrows(lib)]
35    wav_free: Symbol<'this, unsafe extern "C" fn(*mut u8)>,
36}
37
38type TtsFn = unsafe extern "C" fn(
39    text: *const ::std::os::raw::c_char,
40    speaker_id: u32,
41    options: TtsOptions,
42    output_wav_length: *mut usize,
43    output_wav: *mut *mut u8,
44) -> ResultCode;
45
46impl VoiceVox {
47    /// Creates a new VoiceVox instance and downloads all required files for running
48    /// voicevox into the directory of the executable.
49    ///
50    /// Note that `VoiceVox` is not initialized automatically, as initialization is expensive. To initialize `VoiceVox` call [`VoiceVox::init`].
51    ///
52    /// After initialization, `VoiceVox` can be used to synthesize speech with [`VoiceVox::tts`].
53    ///
54    /// By default the CPU runtime for voicevox is downloaded. For cuda support,
55    /// use [`VoiceVox::new_with_args`] with `["--device", "cuda"]` as the argument.
56    pub fn load() -> color_eyre::Result<Self> {
57        Self::load_with_args(std::iter::empty::<&str>())
58    }
59
60    /// Same as [`VoiceVox::new`] but allows passing arguments to the voicevox downloader.
61    ///
62    /// See [here](https://github.com/VOICEVOX/voicevox_core/blob/6a662757b8d42fc5d0902364b1d549684b50b5bc/crates/download/src/main.rs#L50) for a list of possible arguments.
63    pub fn load_with_args<S: AsRef<OsStr>>(
64        args: impl IntoIterator<Item = S>,
65    ) -> color_eyre::Result<Self> {
66        let exe_path = download_path()?;
67        let dll = exe_path.join("voicevox_core.dll");
68
69        if !dll.exists() {
70            // get the downloader
71            info!("Downloading voicevox downloader.");
72            let mut reader = ureq::get(&voicevox_downloader_url()?).call()?.into_reader();
73            let downloader_path = exe_path.join("voicevox_downloader");
74            let file = std::fs::File::create(&downloader_path)?;
75            std::io::copy(&mut reader, &mut std::io::BufWriter::new(file))?;
76
77            // use the downloader
78            let mut child = std::process::Command::new(downloader_path)
79                .args([
80                    "-o",
81                    exe_path.to_str().ok_or(color_eyre::eyre::eyre!(
82                        "failed to convert {:?} to str",
83                        exe_path
84                    ))?,
85                ])
86                .args(args)
87                .stdout(Stdio::piped())
88                .stderr(Stdio::piped())
89                .spawn()?;
90
91            info!("Downloading voicevox. This may take a while, roughly 700MB of data will be downloaded.");
92            // This doesn't output the progress bars, so not very useful.
93            // let mut out = child.stdout.take().unwrap();
94            // let mut err = child.stderr.take().unwrap();
95            // std::thread::spawn(move || {
96            //     std::io::copy(&mut out, &mut std::io::stderr()).unwrap();
97            // });
98            // std::thread::spawn(move || {
99            //     std::io::copy(&mut err, &mut std::io::stdout()).unwrap();
100            // });
101
102            child.wait()?;
103        }
104
105        unsafe {
106            let lib = libloading::Library::new(dll).unwrap();
107
108            Ok(Self {
109                fns: VoiceVoxFns::new(
110                    lib,
111                    |lib| lib.get(b"voicevox_initialize").unwrap(),
112                    |lib| lib.get(b"voicevox_load_model").unwrap(),
113                    |lib| lib.get(b"voicevox_tts").unwrap(),
114                    |lib| lib.get(b"voicevox_wav_free").unwrap(),
115                ),
116                init: false,
117            })
118        }
119    }
120
121    /// Initializes the voicevox runtime. This is expensive when called with
122    /// `load_all_models = true`, so it is recommended to instead load only
123    /// the models you need with [`VoiceVox::load_model`].
124    pub fn init(
125        &mut self,
126        acceleration_mode: AccelerationMode,
127        cpu_num_threads: u16,
128        load_all_models: bool,
129    ) -> color_eyre::Result<()> {
130        let opts = InitOptions::new(acceleration_mode, cpu_num_threads, load_all_models)?;
131
132        info!("Initializing voicevox. This can take a while.");
133        if self.init {
134            return Ok(());
135        }
136        match unsafe { (self.fns.borrow_init())(opts) } {
137            ResultCode::Ok => {
138                self.init = true;
139                Ok(())
140            }
141            e => Err(e.into()),
142        }
143    }
144
145    /// Loads one of the models.
146    pub fn load_model(&self, speaker_id: u32) -> Result<(), ResultCode> {
147        match unsafe { (self.fns.borrow_load_model())(speaker_id) } {
148            ResultCode::Ok => Ok(()),
149            e => Err(e),
150        }
151    }
152
153    /// Synthesizes speech from the given text.
154    ///
155    /// To get a list of speaker ids, run the [`VoiceVox::new`] once
156    /// and check `model/metas.json` in the directory of the executable.
157    pub fn tts(
158        &self,
159        text: impl AsRef<str>,
160        speaker_id: u32,
161        opts: TtsOptions,
162    ) -> Result<CPointerWrap<u8>, ResultCode> {
163        let text = text.as_ref();
164        info!("Synthesizing speech from: {}", text);
165
166        let text = std::ffi::CString::new(text).unwrap();
167        let mut output_wav_length = 0;
168        let mut output_wav = std::ptr::null_mut();
169
170        match unsafe {
171            (self.fns.borrow_tts())(
172                text.as_ptr(),
173                speaker_id,
174                opts,
175                &mut output_wav_length,
176                &mut output_wav,
177            )
178        } {
179            ResultCode::Ok => Ok(CPointerWrap::new(
180                output_wav,
181                output_wav_length,
182                self.fns.borrow_wav_free(),
183            )),
184            e => Err(e),
185        }
186    }
187}
188
189fn download_path() -> color_eyre::Result<PathBuf> {
190    let exe_path = std::env::current_exe()?;
191    Ok(exe_path
192        .parent()
193        .ok_or(color_eyre::eyre::eyre!("exe path has no parent directory"))?
194        .to_owned())
195}
196
197fn voicevox_downloader_url() -> color_eyre::Result<String> {
198    let os = match std::env::consts::OS {
199        os @ "windows" | os @ "linux" => os,
200        "macos" => "osx",
201        _ => bail!("unsupported os"),
202    };
203    let arch = match std::env::consts::ARCH {
204        "x86_64" => "x64",
205        "aarch64" => "arm64",
206        _ => bail!("unsupported arch"),
207    };
208    let extension = match os {
209        "windows" => ".exe",
210        _ => "",
211    };
212    let base = "https://github.com/VOICEVOX/voicevox_core/releases/latest/download/download-";
213    Ok(format!("{base}{os}-{arch}{extension}"))
214}
215
/// Options passed by value to `voicevox_tts`.
///
/// The layout is `repr(C)` because the struct crosses the FFI boundary; both
/// flags default to `false`.
#[repr(C)]
#[derive(Debug, Default, Clone, Copy)]
pub struct TtsOptions {
    /// NOTE(review): presumably makes the library treat the input as
    /// AquesTalk-style kana instead of plain text; confirm against the
    /// VOICEVOX CORE documentation.
    pub kana: bool,
    /// NOTE(review): presumably enables rising intonation for questions;
    /// confirm against the VOICEVOX CORE documentation.
    pub enable_interrogative_upspeak: bool,
}
222
/// Options passed by value to `voicevox_initialize`.
///
/// The layout is `repr(C)` because the struct crosses the FFI boundary.
///
/// `Clone` is implemented by hand rather than derived: the struct owns the C
/// string behind `open_jtalk_dict_dir` and frees it on drop, so a field-wise
/// copy of the pointer would lead to a double free.
#[repr(C)]
#[derive(Debug)]
pub struct InitOptions {
    /// Numeric acceleration mode; `InitOptions::new` writes `0`, `1` or `2`
    /// for `AccelerationMode::Auto`, `Cpu` and `Gpu` respectively.
    acceleration_mode: i32,
    /// Number of CPU threads, forwarded to the library unchanged.
    cpu_num_threads: u16,
    /// Whether the library should load every model during initialization.
    load_all_models: bool,
    /// Owned, NUL-terminated path of the Open JTalk dictionary directory,
    /// created with `CString::into_raw`.
    open_jtalk_dict_dir: *mut ::std::os::raw::c_char,
}

impl Clone for InitOptions {
    /// Deep-copies the options, giving the clone its own dictionary path
    /// allocation.
    fn clone(&self) -> Self {
        // SAFETY: `open_jtalk_dict_dir` always originates from
        // `CString::into_raw`, so it is non-null and points to a valid
        // NUL-terminated string owned by `self`.
        let dict_dir = unsafe { std::ffi::CStr::from_ptr(self.open_jtalk_dict_dir) }.to_owned();
        Self {
            acceleration_mode: self.acceleration_mode,
            cpu_num_threads: self.cpu_num_threads,
            load_all_models: self.load_all_models,
            open_jtalk_dict_dir: dict_dir.into_raw(),
        }
    }
}
231
/// Hardware acceleration mode requested when initializing voicevox.
///
/// `InitOptions::new` translates the variants into the integers `0`, `1` and
/// `2` (in declaration order) before handing them to the library.
#[derive(Debug, Copy, Clone)]
pub enum AccelerationMode {
    /// Sent to the library as mode `0`.
    Auto,
    /// Sent to the library as mode `1`.
    Cpu,
    /// Sent to the library as mode `2`.
    Gpu,
}
238
239impl InitOptions {
240    pub fn new(
241        acceleration_mode: AccelerationMode,
242        cpu_num_threads: u16,
243        load_all_models: bool,
244    ) -> color_eyre::Result<Self> {
245        let p = download_path()?
246            .join("open_jtalk_dic_utf_8-1.11")
247            .canonicalize()?;
248        let open_jtalk_dict_dir = p
249            .to_str()
250            .ok_or(color_eyre::eyre::eyre!("failed to convert {:?} to str", p))?;
251
252        Ok(Self {
253            acceleration_mode: match acceleration_mode {
254                AccelerationMode::Auto => 0,
255                AccelerationMode::Cpu => 1,
256                AccelerationMode::Gpu => 2,
257            },
258            cpu_num_threads,
259            load_all_models,
260            open_jtalk_dict_dir: std::ffi::CString::new(open_jtalk_dict_dir)
261                .unwrap()
262                .into_raw(),
263        })
264    }
265}
266
267impl Drop for InitOptions {
268    fn drop(&mut self) {
269        drop(unsafe { std::ffi::CString::from_raw(self.open_jtalk_dict_dir) })
270    }
271}
272
/// Status code returned by the VOICEVOX CORE C functions.
///
/// The enum is `repr(i32)` and used directly as the return type of the
/// foreign functions.
// NOTE(review): a value outside the listed discriminants cannot be represented
// by this enum; confirm the loaded library version never returns other codes.
#[repr(i32)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResultCode {
    /// Success
    Ok = 0,
    /// Failed to load Open JTalk dictionary file
    NotLoadedOpenjtalkDictError = 1,
    /// Failed to load the model
    LoadModelError = 2,
    /// Failed to get supported device information
    GetSupportedDevicesError = 3,
    /// GPU mode is not supported
    GpuSupportError = 4,
    /// Failed to load meta information
    LoadMetasError = 5,
    /// Status is uninitialized
    UninitializedStatusError = 6,
    /// Invalid speaker ID specified
    InvalidSpeakerIdError = 7,
    /// Invalid model index specified
    InvalidModelIndexError = 8,
    /// Inference failed
    InferenceError = 9,
    /// Failed to output context labels
    ExtractFullContextLabelError = 10,
    /// Invalid UTF-8 string input
    InvalidUtf8InputError = 11,
    /// Failed to parse Aquestalk-style text
    ParseKanaError = 12,
    /// Invalid AudioQuery
    InvalidAudioQueryError = 13,
}

impl std::fmt::Display for ResultCode {
    /// Writes the human-readable description of the status code.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            ResultCode::Ok => "Success",
            ResultCode::NotLoadedOpenjtalkDictError => "Failed to load Open JTalk dictionary file",
            ResultCode::LoadModelError => "Failed to load the model",
            ResultCode::GetSupportedDevicesError => "Failed to get supported device information",
            ResultCode::GpuSupportError => "GPU mode is not supported",
            ResultCode::LoadMetasError => "Failed to load meta information",
            ResultCode::UninitializedStatusError => "Status is uninitialized",
            ResultCode::InvalidSpeakerIdError => "Invalid speaker ID specified",
            ResultCode::InvalidModelIndexError => "Invalid model index specified",
            ResultCode::InferenceError => "Inference failed",
            ResultCode::ExtractFullContextLabelError => "Failed to output context labels",
            ResultCode::InvalidUtf8InputError => "Invalid UTF-8 string input",
            ResultCode::ParseKanaError => "Failed to parse Aquestalk-style text",
            ResultCode::InvalidAudioQueryError => "Invalid AudioQuery",
        })
    }
}

/// Lets a `ResultCode` be used as an error value, e.g. converted into a
/// `color_eyre` report.
impl std::error::Error for ResultCode {}
329
330/// Once dropped the memory is freed.
331pub struct CPointerWrap<'a, T> {
332    bytes: *mut T,
333    length: usize,
334    free_fn: &'a Symbol<'a, unsafe extern "C" fn(*mut T)>,
335}
336
337impl<'a, T> CPointerWrap<'a, T> {
338    pub fn new(
339        bytes: *mut T,
340        length: usize,
341        free_fn: &'a Symbol<'a, unsafe extern "C" fn(*mut T)>,
342    ) -> Self {
343        Self {
344            bytes,
345            length,
346            free_fn,
347        }
348    }
349
350    pub fn as_slice(&self) -> &[T] {
351        unsafe { std::slice::from_raw_parts(self.bytes, self.length) }
352    }
353}
354
355impl<'a, T> Drop for CPointerWrap<'a, T> {
356    fn drop(&mut self) {
357        unsafe { (self.free_fn)(self.bytes) };
358    }
359}
voicevox_dyn/lib.rs

voicevox_dyn/
lib.rs