// speech_synthesis/lib.rs

1use core::future::Future;
2
3pub use ::ssml;
4
5mod audio;
6pub use self::audio::{AudioChannels, AudioCodec, AudioContainer, AudioEncoding, AudioFormat, AudioFormatPreference};
7mod event;
8pub use self::event::{BasicViseme, BasicVisemeFrame, BlendShape, BlendShapeVisemeFrame, UtteranceEvent, UtteranceEventStream};
9
/// Configuration for a single speech synthesis utterance.
#[derive(Debug, Default, Clone)]
#[non_exhaustive]
pub struct UtteranceConfig {
	/// Whether to emit [`UtteranceEvent::WordBoundary`] events.
	pub emit_word_boundary_events: bool,
	/// Whether to emit [`UtteranceEvent::SentenceBoundary`] events.
	pub emit_sentence_boundary_events: bool,
	/// Whether to emit [`UtteranceEvent::VisemesChunk`]/[`UtteranceEvent::BlendShapeVisemesChunk`] events.
	pub emit_visemes: bool,
	/// The name of the voice to use for synthesis.
	///
	/// Generally only used for raw text synthesis; ignored with SSML synthesis
	/// (where the voice is specified in the document itself).
	pub voice: Option<Box<str>>,
	/// The language to use for raw text synthesis.
	///
	/// Generally only used for raw text synthesis; ignored with SSML synthesis.
	pub language: Option<Box<str>>
}
25
26impl UtteranceConfig {
27	/// Configures whether to emit [`UtteranceEvent::WordBoundary`] events.
28	pub fn with_emit_word_boundary_events(mut self, x: bool) -> Self {
29		self.emit_word_boundary_events = x;
30		self
31	}
32
33	/// Configures whether to emit [`UtteranceEvent::SentenceBoundary`] events.
34	pub fn with_emit_sentence_boundary_events(mut self, x: bool) -> Self {
35		self.emit_sentence_boundary_events = x;
36		self
37	}
38
39	/// Configures whether to emit [`UtteranceEvent::VisemesChunk`]/[`UtteranceEvent::BlendShapeVisemesChunk`] events.
40	pub fn with_emit_visemes(mut self, x: bool) -> Self {
41		self.emit_visemes = x;
42		self
43	}
44
45	/// Configures the name of the voice to use for synthesis.
46	///
47	/// This is generally only used for [text synthesis](SpeechSynthesiser::synthesise_text_stream) and will be ignored
48	/// with [SSML synthesis](SpeechSynthesiser::synthesise_ssml_stream).
49	pub fn with_voice(mut self, x: impl Into<Box<str>>) -> Self {
50		self.voice = Some(x.into());
51		self
52	}
53
54	/// Configures the language to use for raw text synthesis.
55	///
56	/// This is generally only used for [text synthesis](SpeechSynthesiser::synthesise_text_stream) and will be ignored
57	/// with [SSML synthesis](SpeechSynthesiser::synthesise_ssml_stream).
58	pub fn with_language(mut self, x: impl Into<Box<str>>) -> Self {
59		self.language = Some(x.into());
60		self
61	}
62}
63
/// Common trait for a speech synthesiser.
pub trait SpeechSynthesiser {
	/// The error type returned by this synthesiser's fallible operations, both when starting synthesis and when
	/// streaming events/audio.
	type Error: std::error::Error + Send + Sync + 'static;

	/// Negotiate an audio format supported by both the application and this synthesiser. The synthesiser returns `None`
	/// if:
	/// - Any requested sample rate is not supported.
	/// - Any requested container is not supported.
	/// - Any requested channel count is not supported.
	///
	/// If multiple values are provided for a preference by the application, the synthesiser should prioritise the
	/// highest quality configuration. For optional properties (such as bitrate), this should **not** fail, and instead
	/// return the highest quality bitrate closest to the user's preference.
	///
	/// i.e., for a synthesiser that only supports 44100 Hz, stereo MP3 at either 128 or 192 Kbps:
	/// - requesting a sample rate of `48000` or `22050` should return `None`,
	/// - requesting [`AudioChannels::Mono`] should return `None`,
	/// - requesting OGG format should return `None`,
	/// - and requesting 44100 Hz stereo MP3 at 160 Kbps should return an audio format of 44100 Hz stereo MP3 **at 192
	///   Kbps**.
	fn negotiate_audio_format(&self, pref: &AudioFormatPreference) -> Option<AudioFormat>;

	/// Stream the synthesis of an [`ssml`] document.
	///
	/// Audio will be streamed in chunks, in the format specified by the given [`AudioFormat`]. You can negotiate an
	/// audio format that both your application and the synthesiser supports via
	/// [`SpeechSynthesiser::negotiate_audio_format`].
	///
	/// You'll need to configure whether to receive events like visemes or boundaries with an [`UtteranceConfig`].
	fn synthesise_ssml_stream(
		&self,
		input: &ssml::Speak<'_>,
		audio_format: &AudioFormat,
		config: &UtteranceConfig
	) -> impl Future<Output = Result<impl UtteranceEventStream<Self::Error> + 'static, Self::Error>> + Send;

	/// Stream the synthesis of **raw text**.
	///
	/// Note that text is hardly controllable. For more advanced control of the synthesised speech, including prosody,
	/// pitch contour, or pronunciation of words, see [`SpeechSynthesiser::synthesise_ssml_stream`] and [`ssml`].
	///
	/// This method should **not** be able to accept a raw string of SSML. SSML should be handled exclusively through
	/// [`SpeechSynthesiser::synthesise_ssml_stream`].
	///
	/// Audio will be streamed in chunks, in the format specified by the given [`AudioFormat`]. You can negotiate an
	/// audio format that both your application and the synthesiser supports via
	/// [`SpeechSynthesiser::negotiate_audio_format`].
	///
	/// You'll need to configure whether to receive events like visemes or boundaries with an [`UtteranceConfig`].
	fn synthesise_text_stream(
		&self,
		input: &str,
		audio_format: &AudioFormat,
		config: &UtteranceConfig
	) -> impl Future<Output = Result<impl UtteranceEventStream<Self::Error> + 'static, Self::Error>> + Send;
}