1use color_eyre::eyre::bail;
12use libloading::Symbol;
13use std::{ffi::OsStr, path::PathBuf, process::Stdio};
14use tracing::info;
15
/// Handle to a dynamically loaded VOICEVOX core library.
///
/// Obtained from [`VoiceVox::load`] or [`VoiceVox::load_with_args`]; the
/// methods on this type forward to the foreign functions resolved from the
/// library.
pub struct VoiceVox {
    /// The loaded library together with the symbols resolved from it.
    fns: VoiceVoxFns,
    /// Set to `true` once the library's initialise call has returned
    /// `ResultCode::Ok`, so that `init` only initialises once.
    init: bool,
}
20
// Self-referential bundle of the loaded library and the function symbols
// resolved from it. The symbols borrow from `lib`, which is why the struct is
// built through `ouroboros` (it generates the constructor and the `borrow_*`
// accessors used by `VoiceVox`).
#[ouroboros::self_referencing]
pub struct VoiceVoxFns {
    // The dynamically loaded library; every symbol below borrows from it.
    lib: libloading::Library,
    // Initialisation entry point (resolved from `voicevox_initialize`).
    #[covariant]
    #[borrows(lib)]
    init: Symbol<'this, unsafe extern "C" fn(InitOptions) -> ResultCode>,
    // Loads the model for one speaker id (resolved from `voicevox_load_model`).
    #[covariant]
    #[borrows(lib)]
    load_model: Symbol<'this, unsafe extern "C" fn(u32) -> ResultCode>,
    // Text-to-speech entry point (resolved from `voicevox_tts`).
    #[covariant]
    #[borrows(lib)]
    tts: Symbol<'this, TtsFn>,
    // Releases a WAV buffer produced by `tts` (resolved from `voicevox_wav_free`).
    #[covariant]
    #[borrows(lib)]
    wav_free: Symbol<'this, unsafe extern "C" fn(*mut u8)>,
}
37
/// Signature of the foreign text-to-speech function stored in
/// `VoiceVoxFns::tts`.
///
/// `text` is a NUL-terminated C string. `output_wav_length` and `output_wav`
/// are out-parameters: the caller passes pointers to a length and to a buffer
/// pointer, and reads both back after a call that returns `ResultCode::Ok`.
type TtsFn = unsafe extern "C" fn(
    text: *const ::std::os::raw::c_char,
    speaker_id: u32,
    options: TtsOptions,
    output_wav_length: *mut usize,
    output_wav: *mut *mut u8,
) -> ResultCode;
45
46impl VoiceVox {
47 pub fn load() -> color_eyre::Result<Self> {
57 Self::load_with_args(std::iter::empty::<&str>())
58 }
59
60 pub fn load_with_args<S: AsRef<OsStr>>(
64 args: impl IntoIterator<Item = S>,
65 ) -> color_eyre::Result<Self> {
66 let exe_path = download_path()?;
67 let dll = exe_path.join("voicevox_core.dll");
68
69 if !dll.exists() {
70 info!("Downloading voicevox downloader.");
72 let mut reader = ureq::get(&voicevox_downloader_url()?).call()?.into_reader();
73 let downloader_path = exe_path.join("voicevox_downloader");
74 let file = std::fs::File::create(&downloader_path)?;
75 std::io::copy(&mut reader, &mut std::io::BufWriter::new(file))?;
76
77 let mut child = std::process::Command::new(downloader_path)
79 .args([
80 "-o",
81 exe_path.to_str().ok_or(color_eyre::eyre::eyre!(
82 "failed to convert {:?} to str",
83 exe_path
84 ))?,
85 ])
86 .args(args)
87 .stdout(Stdio::piped())
88 .stderr(Stdio::piped())
89 .spawn()?;
90
91 info!("Downloading voicevox. This may take a while, roughly 700MB of data will be downloaded.");
92 child.wait()?;
103 }
104
105 unsafe {
106 let lib = libloading::Library::new(dll).unwrap();
107
108 Ok(Self {
109 fns: VoiceVoxFns::new(
110 lib,
111 |lib| lib.get(b"voicevox_initialize").unwrap(),
112 |lib| lib.get(b"voicevox_load_model").unwrap(),
113 |lib| lib.get(b"voicevox_tts").unwrap(),
114 |lib| lib.get(b"voicevox_wav_free").unwrap(),
115 ),
116 init: false,
117 })
118 }
119 }
120
121 pub fn init(
125 &mut self,
126 acceleration_mode: AccelerationMode,
127 cpu_num_threads: u16,
128 load_all_models: bool,
129 ) -> color_eyre::Result<()> {
130 let opts = InitOptions::new(acceleration_mode, cpu_num_threads, load_all_models)?;
131
132 info!("Initializing voicevox. This can take a while.");
133 if self.init {
134 return Ok(());
135 }
136 match unsafe { (self.fns.borrow_init())(opts) } {
137 ResultCode::Ok => {
138 self.init = true;
139 Ok(())
140 }
141 e => Err(e.into()),
142 }
143 }
144
145 pub fn load_model(&self, speaker_id: u32) -> Result<(), ResultCode> {
147 match unsafe { (self.fns.borrow_load_model())(speaker_id) } {
148 ResultCode::Ok => Ok(()),
149 e => Err(e),
150 }
151 }
152
153 pub fn tts(
158 &self,
159 text: impl AsRef<str>,
160 speaker_id: u32,
161 opts: TtsOptions,
162 ) -> Result<CPointerWrap<u8>, ResultCode> {
163 let text = text.as_ref();
164 info!("Synthesizing speech from: {}", text);
165
166 let text = std::ffi::CString::new(text).unwrap();
167 let mut output_wav_length = 0;
168 let mut output_wav = std::ptr::null_mut();
169
170 match unsafe {
171 (self.fns.borrow_tts())(
172 text.as_ptr(),
173 speaker_id,
174 opts,
175 &mut output_wav_length,
176 &mut output_wav,
177 )
178 } {
179 ResultCode::Ok => Ok(CPointerWrap::new(
180 output_wav,
181 output_wav_length,
182 self.fns.borrow_wav_free(),
183 )),
184 e => Err(e),
185 }
186 }
187}
188
189fn download_path() -> color_eyre::Result<PathBuf> {
190 let exe_path = std::env::current_exe()?;
191 Ok(exe_path
192 .parent()
193 .ok_or(color_eyre::eyre::eyre!("exe path has no parent directory"))?
194 .to_owned())
195}
196
197fn voicevox_downloader_url() -> color_eyre::Result<String> {
198 let os = match std::env::consts::OS {
199 os @ "windows" | os @ "linux" => os,
200 "macos" => "osx",
201 _ => bail!("unsupported os"),
202 };
203 let arch = match std::env::consts::ARCH {
204 "x86_64" => "x64",
205 "aarch64" => "arm64",
206 _ => bail!("unsupported arch"),
207 };
208 let extension = match os {
209 "windows" => ".exe",
210 _ => "",
211 };
212 let base = "https://github.com/VOICEVOX/voicevox_core/releases/latest/download/download-";
213 Ok(format!("{base}{os}-{arch}{extension}"))
214}
215
/// Options passed by value to the foreign text-to-speech function.
///
/// `#[repr(C)]` keeps the field layout compatible with the C side.
#[repr(C)]
#[derive(Default, Debug, Copy, Clone)]
pub struct TtsOptions {
    /// NOTE(review): presumably tells the library to treat the input text as
    /// AquesTalk-style kana notation - confirm against the core API docs.
    pub kana: bool,
    /// NOTE(review): presumably enables rising intonation at the end of
    /// questions - confirm against the core API docs.
    pub enable_interrogative_upspeak: bool,
}
222
/// Options passed by value to the foreign initialise function.
///
/// `#[repr(C)]` keeps the field layout compatible with the C side.
/// `open_jtalk_dict_dir` is an owned, NUL-terminated string obtained from
/// `CString::into_raw`; each value owns its own allocation.
#[repr(C)]
#[derive(Debug)]
pub struct InitOptions {
    /// Numeric acceleration mode understood by the library.
    acceleration_mode: i32,
    /// Number of CPU threads the library should use.
    cpu_num_threads: u16,
    /// Whether the library should load every model during initialisation.
    load_all_models: bool,
    /// Owned C string holding the Open JTalk dictionary directory.
    open_jtalk_dict_dir: *mut ::std::os::raw::c_char,
}

/// Deep-copies the dictionary path.
///
/// A derived `Clone` would copy only the raw pointer, leaving two values that
/// both free the same allocation when dropped.
impl Clone for InitOptions {
    fn clone(&self) -> Self {
        // SAFETY: the pointer is expected to come from `CString::into_raw`
        // and therefore to reference a valid NUL-terminated string for as
        // long as `self` is alive.
        let dict_dir = unsafe { std::ffi::CStr::from_ptr(self.open_jtalk_dict_dir) }.to_owned();
        Self {
            acceleration_mode: self.acceleration_mode,
            cpu_num_threads: self.cpu_num_threads,
            load_all_models: self.load_all_models,
            open_jtalk_dict_dir: dict_dir.into_raw(),
        }
    }
}
231
/// Hardware acceleration choice handed to the library at initialisation.
///
/// `InitOptions::new` converts the variants to the integers 0, 1 and 2.
#[derive(Debug, Clone, Copy)]
pub enum AccelerationMode {
    /// Passed to the library as `0`.
    Auto,
    /// Passed to the library as `1`.
    Cpu,
    /// Passed to the library as `2`.
    Gpu,
}
238
239impl InitOptions {
240 pub fn new(
241 acceleration_mode: AccelerationMode,
242 cpu_num_threads: u16,
243 load_all_models: bool,
244 ) -> color_eyre::Result<Self> {
245 let p = download_path()?
246 .join("open_jtalk_dic_utf_8-1.11")
247 .canonicalize()?;
248 let open_jtalk_dict_dir = p
249 .to_str()
250 .ok_or(color_eyre::eyre::eyre!("failed to convert {:?} to str", p))?;
251
252 Ok(Self {
253 acceleration_mode: match acceleration_mode {
254 AccelerationMode::Auto => 0,
255 AccelerationMode::Cpu => 1,
256 AccelerationMode::Gpu => 2,
257 },
258 cpu_num_threads,
259 load_all_models,
260 open_jtalk_dict_dir: std::ffi::CString::new(open_jtalk_dict_dir)
261 .unwrap()
262 .into_raw(),
263 })
264 }
265}
266
267impl Drop for InitOptions {
268 fn drop(&mut self) {
269 drop(unsafe { std::ffi::CString::from_raw(self.open_jtalk_dict_dir) })
270 }
271}
272
/// Status code returned by the foreign library functions.
///
/// `#[repr(i32)]` with explicit discriminants so the values line up with the
/// integers returned across the FFI boundary.
#[repr(i32)]
#[derive(Debug, PartialEq, Eq)]
pub enum ResultCode {
    /// Success.
    Ok = 0,
    /// Failed to load the Open JTalk dictionary file.
    NotLoadedOpenjtalkDictError = 1,
    /// Failed to load the model.
    LoadModelError = 2,
    /// Failed to get supported device information.
    GetSupportedDevicesError = 3,
    /// GPU mode is not supported.
    GpuSupportError = 4,
    /// Failed to load meta information.
    LoadMetasError = 5,
    /// Status is uninitialized.
    UninitializedStatusError = 6,
    /// Invalid speaker ID specified.
    InvalidSpeakerIdError = 7,
    /// Invalid model index specified.
    InvalidModelIndexError = 8,
    /// Inference failed.
    InferenceError = 9,
    /// Failed to output context labels.
    ExtractFullContextLabelError = 10,
    /// Invalid UTF-8 string input.
    InvalidUtf8InputError = 11,
    /// Failed to parse AquesTalk-style text.
    ParseKanaError = 12,
    /// Invalid AudioQuery.
    InvalidAudioQueryError = 13,
}
305
306impl std::fmt::Display for ResultCode {
307 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
308 let s = match self {
309 ResultCode::Ok => "Success",
310 ResultCode::NotLoadedOpenjtalkDictError => "Failed to load Open JTalk dictionary file",
311 ResultCode::LoadModelError => "Failed to load the model",
312 ResultCode::GetSupportedDevicesError => "Failed to get supported device information",
313 ResultCode::GpuSupportError => "GPU mode is not supported",
314 ResultCode::LoadMetasError => "Failed to load meta information",
315 ResultCode::UninitializedStatusError => "Status is uninitialized",
316 ResultCode::InvalidSpeakerIdError => "Invalid speaker ID specified",
317 ResultCode::InvalidModelIndexError => "Invalid model index specified",
318 ResultCode::InferenceError => "Inference failed",
319 ResultCode::ExtractFullContextLabelError => "Failed to output context labels",
320 ResultCode::InvalidUtf8InputError => "Invalid UTF-8 string input",
321 ResultCode::ParseKanaError => "Failed to parse Aquestalk-style text",
322 ResultCode::InvalidAudioQueryError => "Invalid AudioQuery",
323 };
324 write!(f, "{}", s)
325 }
326}
327
/// Lets `ResultCode` be used as an error value, e.g. converted with `.into()`
/// into the report type returned by `VoiceVox::init`.
impl std::error::Error for ResultCode {}
329
/// Owning wrapper around a buffer allocated by foreign code.
///
/// The buffer is exposed as a slice and handed back to `free_fn` when the
/// wrapper is dropped.
pub struct CPointerWrap<'a, T> {
    /// Start of the foreign buffer.
    bytes: *mut T,
    /// Number of `T` elements in the buffer.
    length: usize,
    /// Foreign function that releases `bytes`; called once on drop.
    free_fn: &'a Symbol<'a, unsafe extern "C" fn(*mut T)>,
}
336
337impl<'a, T> CPointerWrap<'a, T> {
338 pub fn new(
339 bytes: *mut T,
340 length: usize,
341 free_fn: &'a Symbol<'a, unsafe extern "C" fn(*mut T)>,
342 ) -> Self {
343 Self {
344 bytes,
345 length,
346 free_fn,
347 }
348 }
349
350 pub fn as_slice(&self) -> &[T] {
351 unsafe { std::slice::from_raw_parts(self.bytes, self.length) }
352 }
353}
354
355impl<'a, T> Drop for CPointerWrap<'a, T> {
356 fn drop(&mut self) {
357 unsafe { (self.free_fn)(self.bytes) };
358 }
359}