use futures_core::Stream;
/// Key & weight information for a single blend shape as part of a [`BlendShapeVisemeFrame`].
#[derive(Debug, Clone)]
pub struct BlendShape {
	/// Blend shape key, typically as an [ARKit](https://developer.apple.com/documentation/arkit/arblendshapelocation?language=objc) blend shape.
	pub key: Box<str>,
	/// Weight of the blend shape from `0.0` (no influence) to `1.0` (full influence).
	pub weight: f32
}
/// A single frame for visemes in blend shape format.
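///
/// # Example
///
/// Constructing a frame directly; the keys below are illustrative ARKit blend shape locations and the offset
/// value is arbitrary:
/// ```ignore
/// let frame = BlendShapeVisemeFrame {
/// 	blendshapes: Box::new([
/// 		BlendShape { key: "jawOpen".into(), weight: 0.7 },
/// 		BlendShape { key: "mouthFunnel".into(), weight: 0.25 }
/// 	]),
/// 	frame_offset: 0.12
/// };
/// ```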
#[derive(Debug, Clone)]
pub struct BlendShapeVisemeFrame {
	/// The blend shapes, with their weights, that compose this frame.
	pub blendshapes: Box<[BlendShape]>,
	/// Offset of this blend shape frame relative to the beginning of the audio stream.
	pub frame_offset: f32
}
/// A 'basic' viseme.
///
/// The format for basic visemes is not currently defined due to conflicts between Azure Cognitive Speech Services &
/// Amazon Polly's viseme mappings.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BasicViseme(pub char);
/// A single frame of 'basic' visemes.
#[derive(Debug, Clone)]
pub struct BasicVisemeFrame {
	/// The viseme spoken in this frame.
	pub viseme: BasicViseme,
	/// Offset of this viseme frame relative to the beginning of the audio stream.
	pub frame_offset: f32
}
/// An event emitted by a speech synthesiser's [`UtteranceEventStream`].
#[derive(Debug)]
#[non_exhaustive]
pub enum UtteranceEvent {
	/// Marks the audio offset of an SSML `<mark />`.
	SsmlMark {
		/// The position in milliseconds at which the mark occurred, relative to the beginning of the audio stream.
		at_millis: f32,
		/// The name of the mark in SSML.
		mark: Box<str>
	},
	/// Marks the time boundary of a spoken word in the audio.
	WordBoundary {
		/// The position in milliseconds at which the spoken word began, relative to the beginning of the audio stream.
		from_millis: f32,
		/// The position in milliseconds at which the spoken word ended, relative to the beginning of the audio stream.
		to_millis: f32,
		/// The text of the single word spoken within this boundary.
		text: Box<str>
	},
	/// Marks the time boundary of a sentence in the audio.
	SentenceBoundary {
		/// The position in milliseconds at which the sentence began, relative to the beginning of the audio stream.
		from_millis: f32,
		/// The position in milliseconds at which the sentence ended, relative to the beginning of the audio stream.
		to_millis: f32,
		/// The text of the sentence spoken within this boundary.
		text: Box<str>
	},
	/// A chunk of viseme frames in blend shape format.
	BlendShapeVisemesChunk(Box<[BlendShapeVisemeFrame]>),
	/// A chunk of frames of 'basic' visemes.
	VisemesChunk(Box<[BasicVisemeFrame]>),
	/// A chunk of synthesised speech audio in the requested format.
	AudioChunk(Box<[u8]>)
}
/// A stream of [`UtteranceEvent`]s returned by the synthesiser.
///
/// An item may be an [`Err`][Result::Err] if an error was encountered during synthesis (e.g. a socket disconnect).
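///
/// # Example
///
/// Driving the stream from an async context; `synthesiser.synthesise()` and `audio_sink` are illustrative names,
/// not APIs defined by this crate:
/// ```ignore
/// use futures_util::StreamExt;
///
/// let mut stream = synthesiser.synthesise("Hello, world!").await?;
/// while let Some(event) = stream.next().await {
/// 	match event? {
/// 		UtteranceEvent::AudioChunk(bytes) => audio_sink.extend_from_slice(&bytes),
/// 		UtteranceEvent::WordBoundary { from_millis, to_millis, text } => {
/// 			println!("'{text}' spoken from {from_millis}ms to {to_millis}ms");
/// 		}
/// 		_ => {}
/// 	}
/// }
/// ```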
pub trait UtteranceEventStream: Stream<Item = crate::Result<UtteranceEvent>> + Send + Unpin {}
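
// A blanket implementation in the spirit of alias-style traits; a sketch assuming the trait is meant to be
// implemented automatically for any qualifying stream rather than opted into manually.
impl<S> UtteranceEventStream for S where S: Stream<Item = crate::Result<UtteranceEvent>> + Send + Unpin {}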