speech_synthesis/event.rs
use futures_core::Stream;

/// Key & weight information for a single blend shape as part of a [`BlendShapeVisemeFrame`].
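///
/// A minimal sketch of constructing a [`BlendShape`] by hand (the key & weight values here are purely illustrative):
///
/// ```
/// use speech_synthesis::BlendShape;
///
/// let jaw_open = BlendShape { key: "jawOpen".into(), weight: 0.62 };
/// assert!((0.0..=1.0).contains(&jaw_open.weight));
/// ```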
#[derive(Debug, Clone)]
pub struct BlendShape {
	/// Blend shape key, typically as an [ARKit](https://developer.apple.com/documentation/arkit/arblendshapelocation?language=objc) blend shape.
	pub key: Box<str>,
	/// Weight of the blend shape, from `0.0` (no influence) to `1.0` (full influence).
	pub weight: f32
}

/// A single frame for visemes in blend shape format.
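///
/// A minimal sketch of a frame (the shape keys, weights, & offset used here are purely illustrative):
///
/// ```
/// use speech_synthesis::{BlendShape, BlendShapeVisemeFrame};
///
/// let frame = BlendShapeVisemeFrame {
/// 	blendshapes: Box::new([
/// 		BlendShape { key: "jawOpen".into(), weight: 0.62 },
/// 		BlendShape { key: "mouthFunnel".into(), weight: 0.18 }
/// 	]),
/// 	frame_offset: 120.0
/// };
/// assert_eq!(frame.blendshapes.len(), 2);
/// ```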
#[derive(Debug, Clone)]
pub struct BlendShapeVisemeFrame {
	/// The key & weight of each blend shape in this frame.
	pub blendshapes: Box<[BlendShape]>,
	/// Offset of this blend shape frame relative to the beginning of the audio stream.
	pub frame_offset: f32
}

/// A 'basic' viseme.
///
/// The format for basic visemes is not currently defined due to conflicts between Azure Cognitive Speech Services &
/// Amazon Polly's viseme mappings.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BasicViseme(pub char);

/// A single frame containing a 'basic' viseme.
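///
/// A minimal sketch of a frame; since the basic viseme format is not currently defined, the character & offset used
/// here are purely illustrative:
///
/// ```
/// use speech_synthesis::{BasicViseme, BasicVisemeFrame};
///
/// let frame = BasicVisemeFrame { viseme: BasicViseme('p'), frame_offset: 120.0 };
/// assert_eq!(frame.viseme, BasicViseme('p'));
/// ```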
#[derive(Debug, Clone)]
pub struct BasicVisemeFrame {
	/// The viseme spoken in this frame.
	pub viseme: BasicViseme,
	/// Offset of this viseme frame relative to the beginning of the audio stream.
	pub frame_offset: f32
}

/// An event emitted by a speech synthesiser's [`UtteranceEventStream`].
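///
/// This enum is marked `#[non_exhaustive]`, so matches on it must include a fallback arm. A minimal sketch of
/// handling events (the `handle` function is hypothetical):
///
/// ```
/// use speech_synthesis::UtteranceEvent;
///
/// fn handle(event: UtteranceEvent) {
/// 	match event {
/// 		UtteranceEvent::AudioChunk(data) => println!("received {} bytes of audio", data.len()),
/// 		UtteranceEvent::WordBoundary { from_millis, to_millis, text } => {
/// 			println!("{text:?} spoken from {from_millis}ms to {to_millis}ms");
/// 		}
/// 		// `UtteranceEvent` is `#[non_exhaustive]`; ignore any events we don't handle.
/// 		_ => {}
/// 	}
/// }
/// ```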
#[derive(Debug)]
#[non_exhaustive]
pub enum UtteranceEvent {
	/// Marks the audio offset of an [`ssml::Mark`].
	SsmlMark {
		/// The position in milliseconds at which the mark occurred, relative to the beginning of the audio stream.
		at_millis: f32,
		/// The name of the mark in SSML.
		mark: Box<str>
	},
	/// Marks the time boundary of a spoken word in the audio.
	WordBoundary {
		/// The position in milliseconds at which the spoken word began, relative to the beginning of the audio stream.
		from_millis: f32,
		/// The position in milliseconds at which the spoken word ended, relative to the beginning of the audio stream.
		to_millis: f32,
		/// The text of the single word spoken within this boundary.
		text: Box<str>
	},
	/// Marks the time boundary of a sentence in the audio.
	SentenceBoundary {
		/// The position in milliseconds at which the sentence began, relative to the beginning of the audio stream.
		from_millis: f32,
		/// The position in milliseconds at which the sentence ended, relative to the beginning of the audio stream.
		to_millis: f32,
		/// The text of the sentence spoken within this boundary.
		text: Box<str>
	},
	/// A chunk of viseme frames in blend shape format.
	BlendShapeVisemesChunk(Box<[BlendShapeVisemeFrame]>),
	/// A chunk of frames of 'basic' visemes.
	VisemesChunk(Box<[BasicVisemeFrame]>),
	/// A chunk of synthesised speech audio in the requested format.
	AudioChunk(Box<[u8]>)
}

/// A stream of [`UtteranceEvent`]s returned by the synthesiser.
///
/// Each item may be an [`Err`][Result::Err] if an error was encountered during synthesis (e.g. a socket disconnect).
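///
/// A minimal sketch of draining a stream into a single audio buffer (`futures_util` is an assumed dependency here;
/// any means of polling the underlying [`Stream`] works):
///
/// ```no_run
/// use std::pin::Pin;
///
/// use futures_util::StreamExt;
/// use speech_synthesis::{UtteranceEvent, UtteranceEventStream};
///
/// async fn collect_audio<E>(mut events: Pin<Box<dyn UtteranceEventStream<E>>>) -> Result<Vec<u8>, E> {
/// 	let mut audio = Vec::new();
/// 	while let Some(event) = events.next().await {
/// 		// Propagate synthesis errors; append audio chunks & ignore any other events.
/// 		if let UtteranceEvent::AudioChunk(chunk) = event? {
/// 			audio.extend_from_slice(&chunk);
/// 		}
/// 	}
/// 	Ok(audio)
/// }
/// ```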
pub trait UtteranceEventStream<E>: Stream<Item = Result<UtteranceEvent, E>> + Send {}

impl<E, T: Stream<Item = Result<UtteranceEvent, E>> + Send> UtteranceEventStream<E> for T {}