use futures_core::Stream;

/// Key & weight information for a single blend shape as part of a [`BlendShapeVisemeFrame`].
#[derive(Debug, Clone)]
pub struct BlendShape {
	/// Blend shape key, typically an [ARKit](https://developer.apple.com/documentation/arkit/arblendshapelocation?language=objc) blend shape location.
	pub key: Box<str>,
	/// Weight of the blend shape from `0.0` (no influence) to `1.0` (full influence).
	pub weight: f32
}

/// A single frame for visemes in blend shape format.
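///
/// # Example
///
/// A minimal construction sketch; `jawOpen` is an ARKit blend shape key used purely for illustration, & the
/// values are arbitrary.
///
/// ```ignore
/// let frame = BlendShapeVisemeFrame {
/// 	blendshapes: Box::new([BlendShape { key: "jawOpen".into(), weight: 0.6 }]),
/// 	frame_offset: 0.0
/// };
/// ```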
#[derive(Debug, Clone)]
pub struct BlendShapeVisemeFrame {
	/// The blend shape keys & weights that make up this frame.
	pub blendshapes: Box<[BlendShape]>,
	/// Offset of this blend shape frame relative to the beginning of the audio stream.
	pub frame_offset: f32
}

/// A 'basic' viseme.
///
/// The format for basic visemes is not currently defined due to conflicts between Azure Cognitive Speech Services'
/// & Amazon Polly's viseme mappings.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BasicViseme(pub char);

/// A single frame of 'basic' visemes.
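///
/// # Example
///
/// A construction sketch; since the basic viseme format is not yet defined, the character used here is purely
/// illustrative.
///
/// ```ignore
/// let frame = BasicVisemeFrame { viseme: BasicViseme('p'), frame_offset: 0.0 };
/// ```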
#[derive(Debug, Clone)]
pub struct BasicVisemeFrame {
	/// The viseme spoken in this frame.
	pub viseme: BasicViseme,
	/// Offset of this viseme frame relative to the beginning of the audio stream.
	pub frame_offset: f32
}

/// An event emitted by a speech synthesiser's [`UtteranceEventStream`].
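///
/// # Example
///
/// A minimal handling sketch which collects audio into a `Vec<u8>` & logs word boundaries; the `handle` & `pcm`
/// names are illustrative only.
///
/// ```ignore
/// fn handle(event: UtteranceEvent, pcm: &mut Vec<u8>) {
/// 	match event {
/// 		UtteranceEvent::AudioChunk(chunk) => pcm.extend_from_slice(&chunk),
/// 		UtteranceEvent::WordBoundary { from_millis, to_millis, text } => {
/// 			println!("'{text}' spoken from {from_millis}ms to {to_millis}ms");
/// 		}
/// 		// `UtteranceEvent` is `#[non_exhaustive]`, so a wildcard arm is required.
/// 		_ => {}
/// 	}
/// }
/// ```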
#[derive(Debug)]
#[non_exhaustive]
pub enum UtteranceEvent {
	/// Marks the audio offset of an SSML `<mark />`.
	SsmlMark {
		/// The position in milliseconds at which the mark occurred, relative to the beginning of the audio stream.
		at_millis: f32,
		/// The name of the mark in SSML.
		mark: Box<str>
	},
	/// Marks the time boundary of a spoken word in the audio.
	WordBoundary {
		/// The position in milliseconds at which the spoken word began, relative to the beginning of the audio stream.
		from_millis: f32,
		/// The position in milliseconds at which the spoken word ended, relative to the beginning of the audio stream.
		to_millis: f32,
		/// The text of the single word spoken within this boundary.
		text: Box<str>
	},
	/// Marks the time boundary of a sentence in the audio.
	SentenceBoundary {
		/// The position in milliseconds at which the sentence began, relative to the beginning of the audio stream.
		from_millis: f32,
		/// The position in milliseconds at which the sentence ended, relative to the beginning of the audio stream.
		to_millis: f32,
		/// The text of the sentence spoken within this boundary.
		text: Box<str>
	},
	/// A chunk of viseme frames in blend shape format.
	BlendShapeVisemesChunk(Box<[BlendShapeVisemeFrame]>),
	/// A chunk of frames of 'basic' visemes.
	VisemesChunk(Box<[BasicVisemeFrame]>),
	/// A chunk of synthesised speech audio in the requested format.
	AudioChunk(Box<[u8]>)
}

/// A stream of [`UtteranceEvent`]s returned by the synthesiser.
///
/// Each item may be an [`Err`][Result::Err] if an error was encountered during synthesis (e.g. a socket disconnect).
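///
/// # Example
///
/// A consumption sketch, assuming a hypothetical `synthesise` method that returns an implementation of this
/// trait, a hypothetical `play_audio` function, & the `futures-util` crate for [`StreamExt::next`]:
///
/// ```ignore
/// use futures_util::StreamExt;
///
/// let mut stream = synthesiser.synthesise(text).await?;
/// while let Some(event) = stream.next().await {
/// 	match event? {
/// 		UtteranceEvent::AudioChunk(chunk) => play_audio(&chunk),
/// 		_ => {}
/// 	}
/// }
/// ```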
pub trait UtteranceEventStream: Stream<Item = crate::Result<UtteranceEvent>> + Send + Unpin {}
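
// A blanket implementation like the one below is one way to let any compatible stream be used as an
// [`UtteranceEventStream`]; this is a sketch & may already be provided elsewhere in the crate.
impl<S> UtteranceEventStream for S where S: Stream<Item = crate::Result<UtteranceEvent>> + Send + Unpin {}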