speech_synthesis/lib.rs
use core::future::Future;

pub use ::ssml;

mod audio;
pub use self::audio::{AudioChannels, AudioCodec, AudioContainer, AudioEncoding, AudioFormat, AudioFormatPreference};
mod event;
pub use self::event::{BasicViseme, BasicVisemeFrame, BlendShape, BlendShapeVisemeFrame, UtteranceEvent, UtteranceEventStream};

/// Configuration for a single speech synthesis utterance.
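///
/// # Example
///
/// A minimal sketch of building a config with the builder methods below; the voice name is an illustrative
/// placeholder, as voice names are synthesiser-specific:
///
/// ```
/// # use speech_synthesis::UtteranceConfig;
/// let config = UtteranceConfig::default()
/// 	.with_emit_word_boundary_events(true)
/// 	.with_emit_visemes(true)
/// 	.with_voice("en-US-ExampleVoice") // placeholder voice name
/// 	.with_language("en-US");
/// ```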
#[derive(Debug, Default, Clone)]
#[non_exhaustive]
pub struct UtteranceConfig {
	/// Whether to emit [`UtteranceEvent::WordBoundary`] events.
	pub emit_word_boundary_events: bool,
	/// Whether to emit [`UtteranceEvent::SentenceBoundary`] events.
	pub emit_sentence_boundary_events: bool,
	/// Whether to emit [`UtteranceEvent::VisemesChunk`]/[`UtteranceEvent::BlendShapeVisemesChunk`] events.
	pub emit_visemes: bool,
	/// The name of the voice to use for synthesis.
	pub voice: Option<Box<str>>,
	/// The language to use for raw text synthesis.
	pub language: Option<Box<str>>
}

impl UtteranceConfig {
	/// Configures whether to emit [`UtteranceEvent::WordBoundary`] events.
	pub fn with_emit_word_boundary_events(mut self, x: bool) -> Self {
		self.emit_word_boundary_events = x;
		self
	}

	/// Configures whether to emit [`UtteranceEvent::SentenceBoundary`] events.
	pub fn with_emit_sentence_boundary_events(mut self, x: bool) -> Self {
		self.emit_sentence_boundary_events = x;
		self
	}

	/// Configures whether to emit [`UtteranceEvent::VisemesChunk`]/[`UtteranceEvent::BlendShapeVisemesChunk`] events.
	pub fn with_emit_visemes(mut self, x: bool) -> Self {
		self.emit_visemes = x;
		self
	}

	/// Configures the name of the voice to use for synthesis.
	///
	/// This is generally only used for [text synthesis](SpeechSynthesiser::synthesise_text_stream) and is ignored when
	/// using [SSML synthesis](SpeechSynthesiser::synthesise_ssml_stream).
	pub fn with_voice(mut self, x: impl Into<Box<str>>) -> Self {
		self.voice = Some(x.into());
		self
	}

	/// Configures the language to use for raw text synthesis.
	///
	/// This is generally only used for [text synthesis](SpeechSynthesiser::synthesise_text_stream) and is ignored when
	/// using [SSML synthesis](SpeechSynthesiser::synthesise_ssml_stream).
	pub fn with_language(mut self, x: impl Into<Box<str>>) -> Self {
		self.language = Some(x.into());
		self
	}
}

/// Common trait for a speech synthesiser.
pub trait SpeechSynthesiser {
	/// The type of error this synthesiser may return when negotiating formats or synthesising speech.
	type Error: std::error::Error + Send + Sync + 'static;

	/// Negotiate an audio format supported by both the application and this synthesiser. The synthesiser returns `None`
	/// if:
	/// - Any requested sample rate is not supported.
	/// - Any requested container is not supported.
	/// - Any requested channel count is not supported.
	///
	/// If the application provides multiple values for a preference, the synthesiser should prioritise the
	/// highest-quality configuration among them. For optional properties (such as bitrate), negotiation should **not**
	/// fail; the synthesiser should instead return the supported value closest to the application's preference,
	/// preferring higher quality.
	///
	/// For example, given a synthesiser that only supports 44100 Hz, stereo MP3 at either 128 or 192 Kbps:
	/// - requesting a sample rate of `48000` or `22050` should return `None`,
	/// - requesting [`AudioChannels::Mono`] should return `None`,
	/// - requesting OGG format should return `None`,
	/// - and requesting 44100 Hz stereo MP3 at 160 Kbps should return an audio format of 44100 Hz stereo MP3 **at 192
	///   Kbps**.
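	///
	/// # Example
	///
	/// A rough sketch of application-side negotiation. The `AudioFormatPreference` builder methods shown here are
	/// hypothetical placeholders, not confirmed API:
	///
	/// ```ignore
	/// // Ask for 44100 Hz stereo; `with_sample_rates`/`with_channels` are assumed names.
	/// let pref = AudioFormatPreference::default()
	/// 	.with_sample_rates([44100])
	/// 	.with_channels(AudioChannels::Stereo);
	/// let format = synthesiser
	/// 	.negotiate_audio_format(&pref)
	/// 	.expect("no mutually supported audio format");
	/// ```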
	fn negotiate_audio_format(&self, pref: &AudioFormatPreference) -> Option<AudioFormat>;

	/// Stream the synthesis of an [`ssml`] document.
	///
	/// Audio will be streamed in chunks, in the format specified by the given [`AudioFormat`]. You can negotiate an
	/// audio format that both your application and the synthesiser support via
	/// [`SpeechSynthesiser::negotiate_audio_format`].
	///
	/// You'll need to configure whether to receive events like visemes or boundaries with an [`UtteranceConfig`].
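	///
	/// # Example
	///
	/// A sketch of driving the stream, assuming the returned [`UtteranceEventStream`] can be consumed via
	/// `futures_util::TryStreamExt` and that `ssml::speak` constructs a document (both are assumptions here, not
	/// confirmed API):
	///
	/// ```ignore
	/// use futures_util::TryStreamExt;
	///
	/// let doc = ssml::speak(Some("en-US"), ["Hello, world!"]);
	/// let mut stream = synthesiser.synthesise_ssml_stream(&doc, &format, &UtteranceConfig::default()).await?;
	/// while let Some(event) = stream.try_next().await? {
	/// 	// handle audio chunks, word/sentence boundaries, visemes, ...
	/// }
	/// ```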
	fn synthesise_ssml_stream(
		&self,
		input: &ssml::Speak<'_>,
		audio_format: &AudioFormat,
		config: &UtteranceConfig
	) -> impl Future<Output = Result<impl UtteranceEventStream<Self::Error> + 'static, Self::Error>> + Send;

	/// Stream the synthesis of **raw text**.
	///
	/// Note that raw text offers very little control over the synthesised speech. For more advanced control, including
	/// prosody, pitch contour, or the pronunciation of words, see [`SpeechSynthesiser::synthesise_ssml_stream`] and
	/// [`ssml`].
	///
	/// This method should **not** accept a raw string of SSML; SSML must be handled exclusively through
	/// [`SpeechSynthesiser::synthesise_ssml_stream`].
	///
	/// Audio will be streamed in chunks, in the format specified by the given [`AudioFormat`]. You can negotiate an
	/// audio format that both your application and the synthesiser support via
	/// [`SpeechSynthesiser::negotiate_audio_format`].
	///
	/// You'll need to configure whether to receive events like visemes or boundaries with an [`UtteranceConfig`].
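	///
	/// # Example
	///
	/// A sketch under the same assumptions as the [`SpeechSynthesiser::synthesise_ssml_stream`] example:
	///
	/// ```ignore
	/// let config = UtteranceConfig::default().with_language("en-US");
	/// let mut stream = synthesiser.synthesise_text_stream("Hello, world!", &format, &config).await?;
	/// while let Some(event) = stream.try_next().await? {
	/// 	// handle events as they arrive
	/// }
	/// ```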
	fn synthesise_text_stream(
		&self,
		input: &str,
		audio_format: &AudioFormat,
		config: &UtteranceConfig
	) -> impl Future<Output = Result<impl UtteranceEventStream<Self::Error> + 'static, Self::Error>> + Send;
}