// speech_synthesis/event.rs

use futures_core::Stream;

/// Key & weight information for a single blend shape as part of a [`BlendShapeVisemeFrame`].
// `PartialEq` derived per Rust API guidelines (C-COMMON-TRAITS); `Eq` is intentionally omitted
// because `weight` is an `f32`.
#[derive(Debug, Clone, PartialEq)]
pub struct BlendShape {
	/// Blend shape key, typically as an [ARKit](https://developer.apple.com/documentation/arkit/arblendshapelocation?language=objc) blend shape.
	pub key: Box<str>,
	/// Weight of the blend shape from `0.0` (no influence) to `1.0` (full influence).
	pub weight: f32
}
11
12/// A single frame for visemes in blend shape format.
13#[derive(Debug, Clone)]
14pub struct BlendShapeVisemeFrame {
15	pub blendshapes: Box<[BlendShape]>,
16	/// Offset of this blendshape frame relative to the beginning of the audio stream.
17	pub frame_offset: f32
18}
19
/// A 'basic' viseme.
///
/// The format for basic visemes is not currently defined due to conflicts between Azure Cognitive Speech Services &
/// Amazon Polly's viseme mappings.
// `Hash` derived so visemes can be used as map/set keys (e.g. viseme → mouth-shape lookup
// tables); `char` is `Hash`, so this is free.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct BasicViseme(pub char);
26
27/// A single frame of 'basic' visemes.
28#[derive(Debug, Clone)]
29pub struct BasicVisemeFrame {
30	pub viseme: BasicViseme,
31	/// Offset of this viseme frame relative to the beginning of the audio stream.
32	pub frame_offset: f32
33}
34
35/// An event emitted by a speech synthesiser's [`UtteranceEventStream`].
36#[derive(Debug)]
37#[non_exhaustive]
38pub enum UtteranceEvent {
39	/// Marks the audio offset of an [`ssml::Mark`].
40	SsmlMark {
41		/// The position in milliseconds the mark occurred, relative to the beginning of the audio stream.
42		at_millis: f32,
43		/// The name of the mark in SSML.
44		mark: Box<str>
45	},
46	/// Marks the time boundary of a spoken word in the audio.
47	WordBoundary {
48		/// The position in milliseconds the spoken word begun, relative to the beginning of the audio stream.
49		from_millis: f32,
50		/// The position in milliseconds the spoken word ended, relative to the beginning of the audio stream.
51		to_millis: f32,
52		/// The text of the single word spoken between this boundary.
53		text: Box<str>
54	},
55	/// Marks the time boundary of a sentence in the audio.
56	SentenceBoundary {
57		/// The position in milliseconds the sentence begun, relative to the beginning of the audio stream.
58		from_millis: f32,
59		/// The position in milliseconds the sentence ended, relative to the beginning of the audio stream.
60		to_millis: f32,
61		/// The text of the sentence spoken between this boundary.
62		text: Box<str>
63	},
64	/// A chunk of viseme frames in blend shape format.
65	BlendShapeVisemesChunk(Box<[BlendShapeVisemeFrame]>),
66	/// A chunk of frames of 'basic' visemes.
67	VisemesChunk(Box<[BasicVisemeFrame]>),
68	/// A chunk of synthesised speech audio in the requested format.
69	AudioChunk(Box<[u8]>)
70}
71
72/// A stream of [`UtteranceEvent`]s returned by the synthesiser.
73///
74/// May be an [`Err`][Result::Err] if an error was encountered during synthesis (i.e. a socket disconnect).
75pub trait UtteranceEventStream<E>: Stream<Item = Result<UtteranceEvent, E>> + Send {}
76
77impl<E, T: Stream<Item = Result<UtteranceEvent, E>> + Send> UtteranceEventStream<E> for T {}