1use crate::formatters::base::Platform;
2use serde::{Deserialize, Serialize};
3
4#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
5pub struct SsmlCapability {
6 pub element: String,
7 pub description: String,
8 pub attributes: Vec<String>,
9 pub speech_markdown_syntax: Vec<String>,
10 pub example: String,
11}
12
13#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
14pub struct PlatformCapabilities {
15 pub platform: String,
16 pub ssml_elements: Vec<SsmlCapability>,
17 pub unsupported: Vec<String>,
18}
19
20pub fn get_supported_ssml(platform: Platform) -> PlatformCapabilities {
21 match platform {
22 Platform::AmazonAlexa => amazon_alexa_capabilities(),
23 Platform::GoogleAssistant => google_assistant_capabilities(),
24 Platform::MicrosoftAzure => microsoft_azure_capabilities(),
25 Platform::Apple => apple_capabilities(),
26 Platform::W3c => w3c_capabilities(),
27 Platform::SamsungBixby => samsung_bixby_capabilities(),
28 Platform::ElevenLabs => elevenlabs_capabilities(),
29 Platform::IbmWatson => ibm_watson_capabilities(),
30 }
31}
32
33fn break_element() -> SsmlCapability {
34 SsmlCapability {
35 element: "break".into(),
36 description: "Insert a pause".into(),
37 attributes: vec!["time".into(), "strength".into()],
38 speech_markdown_syntax: vec!["[2s]".into(), "[500ms]".into(), "[break:strong]".into()],
39 example: "Hello [2s] world".into(),
40 }
41}
42
43fn emphasis_element() -> SsmlCapability {
44 SsmlCapability {
45 element: "emphasis".into(),
46 description: "Emphasize text".into(),
47 attributes: vec!["level".into()],
48 speech_markdown_syntax: vec![
49 "+word+".into(),
50 "++word++".into(),
51 "-word-".into(),
52 "~word~".into(),
53 ],
54 example: "++important++".into(),
55 }
56}
57
58fn prosody_element() -> SsmlCapability {
59 SsmlCapability {
60 element: "prosody".into(),
61 description: "Control rate, pitch, and volume".into(),
62 attributes: vec!["rate".into(), "pitch".into(), "volume".into()],
63 speech_markdown_syntax: vec![
64 "(text)[rate:\"slow\"]".into(),
65 "(text)[pitch:\"high\"]".into(),
66 "(text)[volume:\"soft\"]".into(),
67 ],
68 example: "(read this)[rate:\"fast\";volume:\"loud\"]".into(),
69 }
70}
71
72fn audio_element() -> SsmlCapability {
73 SsmlCapability {
74 element: "audio".into(),
75 description: "Play an audio file".into(),
76 attributes: vec!["src".into()],
77 speech_markdown_syntax: vec".into(), "".into()],
78 example: "".into(),
79 }
80}
81
82fn say_as_element(sub_type: &str, _interpret_as: &str) -> SsmlCapability {
83 let (syntax, desc) = match sub_type {
84 "characters" => ("(ABC)[characters]", "Spell out characters"),
85 "number" => ("(42)[number]", "Read as cardinal number"),
86 "ordinal" => ("(1)[ordinal]", "Read as ordinal (first, second)"),
87 "fraction" => ("(1/2)[fraction]", "Read as fraction"),
88 "telephone" => ("(555-1234)[telephone]", "Read as phone number"),
89 "address" => ("(123 Main St)[address]", "Read as address"),
90 "unit" => ("(5kg)[unit]", "Read as unit of measurement"),
91 "time" => ("(2:30)[time:\"hms12\"]", "Read as time"),
92 "date" => ("(01/02/2024)[date:\"mdy\"]", "Read as date"),
93 "interjection" => ("(wow)[interjection]", "Read as interjection"),
94 "expletive" => ("(word)[expletive]", "Bleep/censor word"),
95 _ => ("", "Unknown"),
96 };
97 SsmlCapability {
98 element: "say-as".into(),
99 description: desc.into(),
100 attributes: vec!["interpret-as".into(), "format".into()],
101 speech_markdown_syntax: vec![syntax.into()],
102 example: syntax.into(),
103 }
104}
105
106fn sub_element() -> SsmlCapability {
107 SsmlCapability {
108 element: "sub".into(),
109 description: "Substitute pronunciation".into(),
110 attributes: vec!["alias".into()],
111 speech_markdown_syntax: vec!["{alias}text".into()],
112 example: "{AL}aluminum".into(),
113 }
114}
115
116fn mark_element() -> SsmlCapability {
117 SsmlCapability {
118 element: "mark".into(),
119 description: "Insert a named marker".into(),
120 attributes: vec!["name".into()],
121 speech_markdown_syntax: vec!["[mark:name]".into()],
122 example: "Hello [mark:chapter1] world".into(),
123 }
124}
125
126fn phoneme_element() -> SsmlCapability {
127 SsmlCapability {
128 element: "phoneme".into(),
129 description: "Custom pronunciation (IPA)".into(),
130 attributes: vec!["alphabet".into(), "ph".into()],
131 speech_markdown_syntax: vec!["(text)/phoneme".into()],
132 example: "(piccolo)/ˈpi.kɑː.loʊ".into(),
133 }
134}
135
136fn voice_element() -> SsmlCapability {
137 SsmlCapability {
138 element: "voice".into(),
139 description: "Switch to a different voice".into(),
140 attributes: vec!["name".into()],
141 speech_markdown_syntax: vec!["(text)[voice:\"Kendra\"]".into()],
142 example: "(hello)[voice:\"Brian\"]".into(),
143 }
144}
145
146fn lang_element() -> SsmlCapability {
147 SsmlCapability {
148 element: "lang".into(),
149 description: "Set language".into(),
150 attributes: vec!["xml:lang".into()],
151 speech_markdown_syntax: vec!["(text)[lang:\"fr-FR\"]".into()],
152 example: "(bonjour)[lang:\"fr-FR\"]".into(),
153 }
154}
155
156fn amazon_alexa_capabilities() -> PlatformCapabilities {
157 PlatformCapabilities {
158 platform: "amazon-alexa".into(),
159 ssml_elements: vec![
160 break_element(),
161 emphasis_element(),
162 prosody_element(),
163 audio_element(),
164 sub_element(),
165 phoneme_element(),
166 mark_element(),
167 lang_element(),
168 voice_element(),
169 say_as_element("characters", "characters"),
170 say_as_element("number", "number"),
171 say_as_element("ordinal", "ordinal"),
172 say_as_element("fraction", "fraction"),
173 say_as_element("telephone", "telephone"),
174 say_as_element("address", "address"),
175 say_as_element("unit", "unit"),
176 say_as_element("time", "time"),
177 say_as_element("date", "date"),
178 say_as_element("interjection", "interjection"),
179 say_as_element("expletive", "expletive"),
180 SsmlCapability {
181 element: "amazon:effect".into(),
182 description: "Whisper effect".into(),
183 attributes: vec!["name".into()],
184 speech_markdown_syntax: vec!["(text)[whisper]".into()],
185 example: "(hello)[whisper]".into(),
186 },
187 SsmlCapability {
188 element: "amazon:emotion".into(),
189 description: "Express emotion (excited/disappointed)".into(),
190 attributes: vec!["name".into(), "intensity".into()],
191 speech_markdown_syntax: vec![
192 "#[excited] text".into(),
193 "#[disappointed] text".into(),
194 ],
195 example: "#[excited] Great news!".into(),
196 },
197 SsmlCapability {
198 element: "amazon:domain".into(),
199 description: "Switch to news or music domain".into(),
200 attributes: vec!["name".into()],
201 speech_markdown_syntax: vec!["#[newscaster] text".into(), "#[dj] text".into()],
202 example: "#[newscaster] Breaking news today".into(),
203 },
204 ],
205 unsupported: vec!["google:style".into(), "mstts:express-as".into()],
206 }
207}
208
209fn google_assistant_capabilities() -> PlatformCapabilities {
210 PlatformCapabilities {
211 platform: "google-assistant".into(),
212 ssml_elements: vec![
213 break_element(),
214 emphasis_element(),
215 prosody_element(),
216 audio_element(),
217 sub_element(),
218 mark_element(),
219 say_as_element("characters", "characters"),
220 say_as_element("number", "number"),
221 say_as_element("ordinal", "ordinal"),
222 say_as_element("fraction", "fraction"),
223 say_as_element("telephone", "telephone"),
224 say_as_element("address", "address"),
225 say_as_element("unit", "unit"),
226 say_as_element("time", "time"),
227 say_as_element("date", "date"),
228 say_as_element("interjection", "interjection"),
229 say_as_element("expletive", "expletive"),
230 SsmlCapability {
231 element: "google:style".into(),
232 description: "Google speaking style".into(),
233 attributes: vec!["name".into()],
234 speech_markdown_syntax: vec!["(text)[style:\"name\"]".into()],
235 example: "(hello)[style:\"cheerful\"]".into(),
236 },
237 ],
238 unsupported: vec![
239 "voice".into(),
240 "lang".into(),
241 "phoneme".into(),
242 "amazon:effect".into(),
243 "amazon:emotion".into(),
244 "amazon:domain".into(),
245 "mstts:express-as".into(),
246 "excited section".into(),
247 "disappointed section".into(),
248 ],
249 }
250}
251
252fn microsoft_azure_capabilities() -> PlatformCapabilities {
253 PlatformCapabilities {
254 platform: "microsoft-azure".into(),
255 ssml_elements: vec![
256 break_element(),
257 prosody_element(),
258 audio_element(),
259 sub_element(),
260 phoneme_element(),
261 mark_element(),
262 lang_element(),
263 voice_element(),
264 say_as_element("characters", "characters"),
265 say_as_element("number", "cardinal"),
266 say_as_element("ordinal", "ordinal"),
267 say_as_element("fraction", "fraction"),
268 say_as_element("telephone", "telephone"),
269 say_as_element("address", "address"),
270 say_as_element("unit", "unit"),
271 say_as_element("time", "time"),
272 say_as_element("date", "date"),
273 say_as_element("interjection", "interjection"),
274 say_as_element("expletive", "expletive"),
275 SsmlCapability {
276 element: "mstts:express-as".into(),
277 description: "Express emotion/style (42 styles)".into(),
278 attributes: vec!["style".into()],
279 speech_markdown_syntax: vec![
280 "#[cheerful] text".into(),
281 "#[sad] text".into(),
282 "#[angry] text".into(),
283 "(text)[excited]".into(),
284 "(text)[disappointed]".into(),
285 ],
286 example: "#[cheerful] Hello there!".into(),
287 },
288 SsmlCapability {
289 element: "prosody (whisper)".into(),
290 description: "Whisper via prosody (rate:slow + volume:x-soft)".into(),
291 attributes: vec!["rate".into(), "volume".into()],
292 speech_markdown_syntax: vec!["(text)[whisper]".into()],
293 example: "(hello)[whisper]".into(),
294 },
295 ],
296 unsupported: vec![
297 "emphasis (not supported by Azure)".into(),
298 "amazon:effect".into(),
299 "amazon:emotion".into(),
300 "amazon:domain".into(),
301 ],
302 }
303}
304
305fn apple_capabilities() -> PlatformCapabilities {
306 PlatformCapabilities {
307 platform: "apple".into(),
308 ssml_elements: vec![
309 break_element(),
310 emphasis_element(),
311 prosody_element(),
312 audio_element(),
313 sub_element(),
314 phoneme_element(),
315 mark_element(),
316 lang_element(),
317 voice_element(),
318 say_as_element("characters", "characters"),
319 say_as_element("number", "number"),
320 say_as_element("ordinal", "ordinal"),
321 say_as_element("date", "date"),
322 say_as_element("time", "time"),
323 ],
324 unsupported: vec![
325 "amazon:effect".into(),
326 "amazon:emotion".into(),
327 "amazon:domain".into(),
328 "mstts:express-as".into(),
329 "google:style".into(),
330 ],
331 }
332}
333
334fn w3c_capabilities() -> PlatformCapabilities {
335 PlatformCapabilities {
336 platform: "w3c".into(),
337 ssml_elements: vec![
338 break_element(),
339 emphasis_element(),
340 prosody_element(),
341 audio_element(),
342 sub_element(),
343 phoneme_element(),
344 mark_element(),
345 lang_element(),
346 voice_element(),
347 say_as_element("characters", "characters"),
348 say_as_element("number", "number"),
349 say_as_element("ordinal", "ordinal"),
350 say_as_element("fraction", "fraction"),
351 say_as_element("telephone", "telephone"),
352 say_as_element("address", "address"),
353 say_as_element("unit", "unit"),
354 say_as_element("time", "time"),
355 say_as_element("date", "date"),
356 say_as_element("interjection", "interjection"),
357 say_as_element("expletive", "expletive"),
358 ],
359 unsupported: vec![
360 "amazon:effect".into(),
361 "amazon:emotion".into(),
362 "amazon:domain".into(),
363 "mstts:express-as".into(),
364 "google:style".into(),
365 ],
366 }
367}
368
369fn samsung_bixby_capabilities() -> PlatformCapabilities {
370 PlatformCapabilities {
371 platform: "samsung-bixby".into(),
372 ssml_elements: vec![
373 break_element(),
374 emphasis_element(),
375 prosody_element(),
376 audio_element(),
377 sub_element(),
378 mark_element(),
379 say_as_element("characters", "characters"),
380 say_as_element("number", "number"),
381 say_as_element("ordinal", "ordinal"),
382 say_as_element("date", "date"),
383 say_as_element("time", "time"),
384 ],
385 unsupported: vec![
386 "voice".into(),
387 "lang".into(),
388 "phoneme".into(),
389 "amazon:effect".into(),
390 "amazon:emotion".into(),
391 "amazon:domain".into(),
392 "mstts:express-as".into(),
393 "google:style".into(),
394 ],
395 }
396}
397
398fn elevenlabs_capabilities() -> PlatformCapabilities {
399 PlatformCapabilities {
400 platform: "elevenlabs".into(),
401 ssml_elements: vec![
402 break_element(),
403 prosody_element(),
404 audio_element(),
405 phoneme_element(),
406 mark_element(),
407 say_as_element("characters", "characters"),
408 say_as_element("number", "number"),
409 say_as_element("date", "date"),
410 say_as_element("time", "time"),
411 ],
412 unsupported: vec![
413 "emphasis".into(),
414 "voice".into(),
415 "lang".into(),
416 "sub".into(),
417 "amazon:effect".into(),
418 "amazon:emotion".into(),
419 "amazon:domain".into(),
420 "mstts:express-as".into(),
421 "google:style".into(),
422 ],
423 }
424}
425
426fn ibm_watson_capabilities() -> PlatformCapabilities {
427 PlatformCapabilities {
428 platform: "ibm-watson".into(),
429 ssml_elements: vec![
430 break_element(),
431 emphasis_element(),
432 prosody_element(),
433 audio_element(),
434 sub_element(),
435 mark_element(),
436 say_as_element("characters", "characters"),
437 say_as_element("number", "number"),
438 say_as_element("ordinal", "ordinal"),
439 say_as_element("date", "date"),
440 say_as_element("time", "time"),
441 ],
442 unsupported: vec![
443 "voice".into(),
444 "lang".into(),
445 "phoneme".into(),
446 "amazon:effect".into(),
447 "amazon:emotion".into(),
448 "amazon:domain".into(),
449 "mstts:express-as".into(),
450 "google:style".into(),
451 ],
452 }
453}
454
#[cfg(test)]
mod tests {
    use super::*;

    /// Every platform variant exercised by the table-wide tests.
    const ALL_PLATFORMS: [Platform; 8] = [
        Platform::AmazonAlexa,
        Platform::GoogleAssistant,
        Platform::MicrosoftAzure,
        Platform::Apple,
        Platform::W3c,
        Platform::SamsungBixby,
        Platform::ElevenLabs,
        Platform::IbmWatson,
    ];

    /// Reports whether `caps` lists an element with exactly this name.
    fn has_element(caps: &PlatformCapabilities, name: &str) -> bool {
        caps.ssml_elements.iter().any(|entry| entry.element == name)
    }

    #[test]
    fn test_all_platforms_have_capabilities() {
        for platform in ALL_PLATFORMS.iter().copied() {
            let caps = get_supported_ssml(platform);
            assert!(!caps.ssml_elements.is_empty(), "{:?} has no elements", platform);
            assert!(!caps.platform.is_empty());
        }
    }

    #[test]
    fn test_alexa_has_emotion() {
        let caps = get_supported_ssml(Platform::AmazonAlexa);
        assert!(has_element(&caps, "amazon:emotion"));
    }

    #[test]
    fn test_azure_has_express_as() {
        let caps = get_supported_ssml(Platform::MicrosoftAzure);
        assert!(has_element(&caps, "mstts:express-as"));
    }

    #[test]
    fn test_google_no_voice() {
        let caps = get_supported_ssml(Platform::GoogleAssistant);
        assert!(caps.unsupported.iter().any(|label| label == "voice"));
    }

    #[test]
    fn test_azure_no_emphasis() {
        // The Azure entry carries an explanatory suffix, so match by substring.
        let caps = get_supported_ssml(Platform::MicrosoftAzure);
        assert!(caps.unsupported.iter().any(|label| label.contains("emphasis")));
    }

    #[test]
    fn test_serialization() {
        // Round-trip through JSON and require structural equality.
        let caps = get_supported_ssml(Platform::AmazonAlexa);
        let json = serde_json::to_string(&caps).unwrap();
        assert!(json.contains("amazon:emotion"));
        let deserialized: PlatformCapabilities = serde_json::from_str(&json).unwrap();
        assert_eq!(caps, deserialized);
    }

    #[test]
    fn test_all_platforms_have_break() {
        for platform in ALL_PLATFORMS.iter().copied() {
            let caps = get_supported_ssml(platform);
            assert!(has_element(&caps, "break"), "{:?} missing break", platform);
        }
    }
}