pub use ::ssml;

// TODO: in the future see if we can remove the anyhow dependency without too much annoyance
pub use anyhow::{Error, Result};
pub use async_trait::async_trait;

mod audio;
pub use self::audio::{AudioChannels, AudioCodec, AudioContainer, AudioEncoding, AudioFormat, AudioFormatPreference};
mod event;
pub use self::event::{BasicViseme, BasicVisemeFrame, BlendShape, BlendShapeVisemeFrame, UtteranceEvent, UtteranceEventStream};

/// Configuration for a single speech synthesis utterance.
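///
/// A minimal usage sketch: since the struct is `#[non_exhaustive]`, construct it with [`Default`]
/// and set the fields you need (the voice name below is purely illustrative).
///
/// ```ignore
/// let mut config = UtteranceConfig::default();
/// config.emit_word_boundary_events = true;
/// config.emit_visemes = true;
/// // hypothetical voice name; use one supported by your synthesiser
/// config.voice = Some("example-voice".into());
/// ```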
#[derive(Debug, Default, Clone)]
#[non_exhaustive]
pub struct UtteranceConfig {
	/// Whether to emit [`UtteranceEvent::WordBoundary`] events.
	pub emit_word_boundary_events: bool,
	/// Whether to emit [`UtteranceEvent::SentenceBoundary`] events.
	pub emit_sentence_boundary_events: bool,
	/// Whether to emit [`UtteranceEvent::VisemesChunk`]/[`UtteranceEvent::BlendShapeVisemesChunk`] events.
	pub emit_visemes: bool,
	/// The name of the voice to use for synthesis.
	pub voice: Option<Box<str>>
}

/// Common trait for a speech synthesiser.
#[async_trait]
pub trait SpeechSynthesiser {
	/// The type of event stream returned by [`SpeechSynthesiser::synthesise_ssml_stream`] and
	/// [`SpeechSynthesiser::synthesise_text_stream`].
	type EventStream: UtteranceEventStream;

	/// Negotiate an audio format supported by both the application and this synthesiser. The synthesiser returns `None`
	/// if:
	/// - none of the requested sample rates are supported,
	/// - none of the requested containers are supported, or
	/// - none of the requested channel counts are supported.
	///
	/// If the application provides multiple values for a preference, the synthesiser should prioritise the
	/// highest-quality configuration it supports. For optional properties (such as bitrate), negotiation should **not**
	/// fail; instead, return the supported value closest to the user's preference, favouring higher quality.
	///
	/// For example, for a synthesiser that only supports 44100 Hz stereo MP3 at either 128 or 192 Kbps:
	/// - requesting a sample rate of `48000` or `22050` should return `None`,
	/// - requesting [`AudioChannels::Mono`] should return `None`,
	/// - requesting OGG format should return `None`,
	/// - and requesting 44100 Hz stereo MP3 at 160 Kbps should return an audio format of 44100 Hz stereo MP3 **at 192
	///   Kbps**.
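	///
	/// A minimal sketch of how an application might drive negotiation, assuming `synthesiser` implements
	/// [`SpeechSynthesiser`] and `pref` is an [`AudioFormatPreference`] built by the application:
	///
	/// ```ignore
	/// match synthesiser.negotiate_audio_format(pref) {
	/// 	Some(format) => {
	/// 		// proceed to synthesise with `format`
	/// 	}
	/// 	None => {
	/// 		// no compatible format; fall back or report an error
	/// 	}
	/// }
	/// ```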
	fn negotiate_audio_format(&self, pref: AudioFormatPreference) -> Option<AudioFormat>;

	/// Stream the synthesis of an [`ssml`] document.
	///
	/// Audio will be streamed in chunks, in the format specified by the given [`AudioFormat`]. You can negotiate an
	/// audio format that both your application and the synthesiser support via
	/// [`SpeechSynthesiser::negotiate_audio_format`].
	///
	/// You'll need to configure whether to receive events like visemes or boundaries with an [`UtteranceConfig`].
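	///
	/// A minimal calling sketch, assuming `synthesiser` implements this trait, `doc` is an [`ssml::Speak`]
	/// document, and `format` was obtained from [`SpeechSynthesiser::negotiate_audio_format`]:
	///
	/// ```ignore
	/// let mut stream = synthesiser
	/// 	.synthesise_ssml_stream(doc, &format, &UtteranceConfig::default())
	/// 	.await?;
	/// // `stream` yields audio chunks and any boundary/viseme events enabled in the config.
	/// ```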
	async fn synthesise_ssml_stream(&self, input: ssml::Speak, audio_format: &AudioFormat, config: &UtteranceConfig) -> crate::Result<Self::EventStream>;

	/// Stream the synthesis of **raw text**.
	///
	/// Note that raw text offers little control over the synthesised speech. For more advanced control, including
	/// prosody, pitch contour, or the pronunciation of words, see [`SpeechSynthesiser::synthesise_ssml_stream`] and
	/// [`ssml`].
	///
	/// Implementations must **not** accept a raw string of SSML through this method. SSML should be handled
	/// exclusively through [`SpeechSynthesiser::synthesise_ssml_stream`].
	///
	/// Audio will be streamed in chunks, in the format specified by the given [`AudioFormat`]. You can negotiate an
	/// audio format that both your application and the synthesiser support via
	/// [`SpeechSynthesiser::negotiate_audio_format`].
	///
	/// You'll need to configure whether to receive events like visemes or boundaries with an [`UtteranceConfig`].
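	///
	/// A minimal calling sketch, assuming `synthesiser` implements this trait and `format` was obtained
	/// from [`SpeechSynthesiser::negotiate_audio_format`]:
	///
	/// ```ignore
	/// let mut config = UtteranceConfig::default();
	/// config.emit_word_boundary_events = true;
	/// let mut stream = synthesiser
	/// 	.synthesise_text_stream("Hello, world!", &format, &config)
	/// 	.await?;
	/// ```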
	async fn synthesise_text_stream(
		&self,
		input: impl AsRef<str> + Send,
		audio_format: &AudioFormat,
		config: &UtteranceConfig
	) -> crate::Result<Self::EventStream>;
}