Skip to main content

speechmarkdown_rust/
capabilities.rs

1use crate::formatters::base::Platform;
2use serde::{Deserialize, Serialize};
3
4#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
5pub struct SsmlCapability {
6    pub element: String,
7    pub description: String,
8    pub attributes: Vec<String>,
9    pub speech_markdown_syntax: Vec<String>,
10    pub example: String,
11}
12
13#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
14pub struct PlatformCapabilities {
15    pub platform: String,
16    pub ssml_elements: Vec<SsmlCapability>,
17    pub unsupported: Vec<String>,
18}
19
20pub fn get_supported_ssml(platform: Platform) -> PlatformCapabilities {
21    match platform {
22        Platform::AmazonAlexa => amazon_alexa_capabilities(),
23        Platform::GoogleAssistant => google_assistant_capabilities(),
24        Platform::MicrosoftAzure => microsoft_azure_capabilities(),
25        Platform::Apple => apple_capabilities(),
26        Platform::W3c => w3c_capabilities(),
27        Platform::SamsungBixby => samsung_bixby_capabilities(),
28        Platform::ElevenLabs => elevenlabs_capabilities(),
29        Platform::IbmWatson => ibm_watson_capabilities(),
30    }
31}
32
33fn break_element() -> SsmlCapability {
34    SsmlCapability {
35        element: "break".into(),
36        description: "Insert a pause".into(),
37        attributes: vec!["time".into(), "strength".into()],
38        speech_markdown_syntax: vec!["[2s]".into(), "[500ms]".into(), "[break:strong]".into()],
39        example: "Hello [2s] world".into(),
40    }
41}
42
43fn emphasis_element() -> SsmlCapability {
44    SsmlCapability {
45        element: "emphasis".into(),
46        description: "Emphasize text".into(),
47        attributes: vec!["level".into()],
48        speech_markdown_syntax: vec![
49            "+word+".into(),
50            "++word++".into(),
51            "-word-".into(),
52            "~word~".into(),
53        ],
54        example: "++important++".into(),
55    }
56}
57
58fn prosody_element() -> SsmlCapability {
59    SsmlCapability {
60        element: "prosody".into(),
61        description: "Control rate, pitch, and volume".into(),
62        attributes: vec!["rate".into(), "pitch".into(), "volume".into()],
63        speech_markdown_syntax: vec![
64            "(text)[rate:\"slow\"]".into(),
65            "(text)[pitch:\"high\"]".into(),
66            "(text)[volume:\"soft\"]".into(),
67        ],
68        example: "(read this)[rate:\"fast\";volume:\"loud\"]".into(),
69    }
70}
71
72fn audio_element() -> SsmlCapability {
73    SsmlCapability {
74        element: "audio".into(),
75        description: "Play an audio file".into(),
76        attributes: vec!["src".into()],
77        speech_markdown_syntax: vec!["![caption](url)".into(), "![](url)".into()],
78        example: "![sound](https://example.com/audio.mp3)".into(),
79    }
80}
81
82fn say_as_element(sub_type: &str, _interpret_as: &str) -> SsmlCapability {
83    let (syntax, desc) = match sub_type {
84        "characters" => ("(ABC)[characters]", "Spell out characters"),
85        "number" => ("(42)[number]", "Read as cardinal number"),
86        "ordinal" => ("(1)[ordinal]", "Read as ordinal (first, second)"),
87        "fraction" => ("(1/2)[fraction]", "Read as fraction"),
88        "telephone" => ("(555-1234)[telephone]", "Read as phone number"),
89        "address" => ("(123 Main St)[address]", "Read as address"),
90        "unit" => ("(5kg)[unit]", "Read as unit of measurement"),
91        "time" => ("(2:30)[time:\"hms12\"]", "Read as time"),
92        "date" => ("(01/02/2024)[date:\"mdy\"]", "Read as date"),
93        "interjection" => ("(wow)[interjection]", "Read as interjection"),
94        "expletive" => ("(word)[expletive]", "Bleep/censor word"),
95        _ => ("", "Unknown"),
96    };
97    SsmlCapability {
98        element: "say-as".into(),
99        description: desc.into(),
100        attributes: vec!["interpret-as".into(), "format".into()],
101        speech_markdown_syntax: vec![syntax.into()],
102        example: syntax.into(),
103    }
104}
105
106fn sub_element() -> SsmlCapability {
107    SsmlCapability {
108        element: "sub".into(),
109        description: "Substitute pronunciation".into(),
110        attributes: vec!["alias".into()],
111        speech_markdown_syntax: vec!["{alias}text".into()],
112        example: "{AL}aluminum".into(),
113    }
114}
115
116fn mark_element() -> SsmlCapability {
117    SsmlCapability {
118        element: "mark".into(),
119        description: "Insert a named marker".into(),
120        attributes: vec!["name".into()],
121        speech_markdown_syntax: vec!["[mark:name]".into()],
122        example: "Hello [mark:chapter1] world".into(),
123    }
124}
125
126fn phoneme_element() -> SsmlCapability {
127    SsmlCapability {
128        element: "phoneme".into(),
129        description: "Custom pronunciation (IPA)".into(),
130        attributes: vec!["alphabet".into(), "ph".into()],
131        speech_markdown_syntax: vec!["(text)/phoneme".into()],
132        example: "(piccolo)/ˈpi.kɑː.loʊ".into(),
133    }
134}
135
136fn voice_element() -> SsmlCapability {
137    SsmlCapability {
138        element: "voice".into(),
139        description: "Switch to a different voice".into(),
140        attributes: vec!["name".into()],
141        speech_markdown_syntax: vec!["(text)[voice:\"Kendra\"]".into()],
142        example: "(hello)[voice:\"Brian\"]".into(),
143    }
144}
145
146fn lang_element() -> SsmlCapability {
147    SsmlCapability {
148        element: "lang".into(),
149        description: "Set language".into(),
150        attributes: vec!["xml:lang".into()],
151        speech_markdown_syntax: vec!["(text)[lang:\"fr-FR\"]".into()],
152        example: "(bonjour)[lang:\"fr-FR\"]".into(),
153    }
154}
155
156fn amazon_alexa_capabilities() -> PlatformCapabilities {
157    PlatformCapabilities {
158        platform: "amazon-alexa".into(),
159        ssml_elements: vec![
160            break_element(),
161            emphasis_element(),
162            prosody_element(),
163            audio_element(),
164            sub_element(),
165            phoneme_element(),
166            mark_element(),
167            lang_element(),
168            voice_element(),
169            say_as_element("characters", "characters"),
170            say_as_element("number", "number"),
171            say_as_element("ordinal", "ordinal"),
172            say_as_element("fraction", "fraction"),
173            say_as_element("telephone", "telephone"),
174            say_as_element("address", "address"),
175            say_as_element("unit", "unit"),
176            say_as_element("time", "time"),
177            say_as_element("date", "date"),
178            say_as_element("interjection", "interjection"),
179            say_as_element("expletive", "expletive"),
180            SsmlCapability {
181                element: "amazon:effect".into(),
182                description: "Whisper effect".into(),
183                attributes: vec!["name".into()],
184                speech_markdown_syntax: vec!["(text)[whisper]".into()],
185                example: "(hello)[whisper]".into(),
186            },
187            SsmlCapability {
188                element: "amazon:emotion".into(),
189                description: "Express emotion (excited/disappointed)".into(),
190                attributes: vec!["name".into(), "intensity".into()],
191                speech_markdown_syntax: vec![
192                    "#[excited] text".into(),
193                    "#[disappointed] text".into(),
194                ],
195                example: "#[excited] Great news!".into(),
196            },
197            SsmlCapability {
198                element: "amazon:domain".into(),
199                description: "Switch to news or music domain".into(),
200                attributes: vec!["name".into()],
201                speech_markdown_syntax: vec!["#[newscaster] text".into(), "#[dj] text".into()],
202                example: "#[newscaster] Breaking news today".into(),
203            },
204        ],
205        unsupported: vec!["google:style".into(), "mstts:express-as".into()],
206    }
207}
208
209fn google_assistant_capabilities() -> PlatformCapabilities {
210    PlatformCapabilities {
211        platform: "google-assistant".into(),
212        ssml_elements: vec![
213            break_element(),
214            emphasis_element(),
215            prosody_element(),
216            audio_element(),
217            sub_element(),
218            mark_element(),
219            say_as_element("characters", "characters"),
220            say_as_element("number", "number"),
221            say_as_element("ordinal", "ordinal"),
222            say_as_element("fraction", "fraction"),
223            say_as_element("telephone", "telephone"),
224            say_as_element("address", "address"),
225            say_as_element("unit", "unit"),
226            say_as_element("time", "time"),
227            say_as_element("date", "date"),
228            say_as_element("interjection", "interjection"),
229            say_as_element("expletive", "expletive"),
230            SsmlCapability {
231                element: "google:style".into(),
232                description: "Google speaking style".into(),
233                attributes: vec!["name".into()],
234                speech_markdown_syntax: vec!["(text)[style:\"name\"]".into()],
235                example: "(hello)[style:\"cheerful\"]".into(),
236            },
237        ],
238        unsupported: vec![
239            "voice".into(),
240            "lang".into(),
241            "phoneme".into(),
242            "amazon:effect".into(),
243            "amazon:emotion".into(),
244            "amazon:domain".into(),
245            "mstts:express-as".into(),
246            "excited section".into(),
247            "disappointed section".into(),
248        ],
249    }
250}
251
252fn microsoft_azure_capabilities() -> PlatformCapabilities {
253    PlatformCapabilities {
254        platform: "microsoft-azure".into(),
255        ssml_elements: vec![
256            break_element(),
257            prosody_element(),
258            audio_element(),
259            sub_element(),
260            phoneme_element(),
261            mark_element(),
262            lang_element(),
263            voice_element(),
264            say_as_element("characters", "characters"),
265            say_as_element("number", "cardinal"),
266            say_as_element("ordinal", "ordinal"),
267            say_as_element("fraction", "fraction"),
268            say_as_element("telephone", "telephone"),
269            say_as_element("address", "address"),
270            say_as_element("unit", "unit"),
271            say_as_element("time", "time"),
272            say_as_element("date", "date"),
273            say_as_element("interjection", "interjection"),
274            say_as_element("expletive", "expletive"),
275            SsmlCapability {
276                element: "mstts:express-as".into(),
277                description: "Express emotion/style (42 styles)".into(),
278                attributes: vec!["style".into()],
279                speech_markdown_syntax: vec![
280                    "#[cheerful] text".into(),
281                    "#[sad] text".into(),
282                    "#[angry] text".into(),
283                    "(text)[excited]".into(),
284                    "(text)[disappointed]".into(),
285                ],
286                example: "#[cheerful] Hello there!".into(),
287            },
288            SsmlCapability {
289                element: "prosody (whisper)".into(),
290                description: "Whisper via prosody (rate:slow + volume:x-soft)".into(),
291                attributes: vec!["rate".into(), "volume".into()],
292                speech_markdown_syntax: vec!["(text)[whisper]".into()],
293                example: "(hello)[whisper]".into(),
294            },
295        ],
296        unsupported: vec![
297            "emphasis (not supported by Azure)".into(),
298            "amazon:effect".into(),
299            "amazon:emotion".into(),
300            "amazon:domain".into(),
301        ],
302    }
303}
304
305fn apple_capabilities() -> PlatformCapabilities {
306    PlatformCapabilities {
307        platform: "apple".into(),
308        ssml_elements: vec![
309            break_element(),
310            emphasis_element(),
311            prosody_element(),
312            audio_element(),
313            sub_element(),
314            phoneme_element(),
315            mark_element(),
316            lang_element(),
317            voice_element(),
318            say_as_element("characters", "characters"),
319            say_as_element("number", "number"),
320            say_as_element("ordinal", "ordinal"),
321            say_as_element("date", "date"),
322            say_as_element("time", "time"),
323        ],
324        unsupported: vec![
325            "amazon:effect".into(),
326            "amazon:emotion".into(),
327            "amazon:domain".into(),
328            "mstts:express-as".into(),
329            "google:style".into(),
330        ],
331    }
332}
333
334fn w3c_capabilities() -> PlatformCapabilities {
335    PlatformCapabilities {
336        platform: "w3c".into(),
337        ssml_elements: vec![
338            break_element(),
339            emphasis_element(),
340            prosody_element(),
341            audio_element(),
342            sub_element(),
343            phoneme_element(),
344            mark_element(),
345            lang_element(),
346            voice_element(),
347            say_as_element("characters", "characters"),
348            say_as_element("number", "number"),
349            say_as_element("ordinal", "ordinal"),
350            say_as_element("fraction", "fraction"),
351            say_as_element("telephone", "telephone"),
352            say_as_element("address", "address"),
353            say_as_element("unit", "unit"),
354            say_as_element("time", "time"),
355            say_as_element("date", "date"),
356            say_as_element("interjection", "interjection"),
357            say_as_element("expletive", "expletive"),
358        ],
359        unsupported: vec![
360            "amazon:effect".into(),
361            "amazon:emotion".into(),
362            "amazon:domain".into(),
363            "mstts:express-as".into(),
364            "google:style".into(),
365        ],
366    }
367}
368
369fn samsung_bixby_capabilities() -> PlatformCapabilities {
370    PlatformCapabilities {
371        platform: "samsung-bixby".into(),
372        ssml_elements: vec![
373            break_element(),
374            emphasis_element(),
375            prosody_element(),
376            audio_element(),
377            sub_element(),
378            mark_element(),
379            say_as_element("characters", "characters"),
380            say_as_element("number", "number"),
381            say_as_element("ordinal", "ordinal"),
382            say_as_element("date", "date"),
383            say_as_element("time", "time"),
384        ],
385        unsupported: vec![
386            "voice".into(),
387            "lang".into(),
388            "phoneme".into(),
389            "amazon:effect".into(),
390            "amazon:emotion".into(),
391            "amazon:domain".into(),
392            "mstts:express-as".into(),
393            "google:style".into(),
394        ],
395    }
396}
397
398fn elevenlabs_capabilities() -> PlatformCapabilities {
399    PlatformCapabilities {
400        platform: "elevenlabs".into(),
401        ssml_elements: vec![
402            break_element(),
403            prosody_element(),
404            audio_element(),
405            phoneme_element(),
406            mark_element(),
407            say_as_element("characters", "characters"),
408            say_as_element("number", "number"),
409            say_as_element("date", "date"),
410            say_as_element("time", "time"),
411        ],
412        unsupported: vec![
413            "emphasis".into(),
414            "voice".into(),
415            "lang".into(),
416            "sub".into(),
417            "amazon:effect".into(),
418            "amazon:emotion".into(),
419            "amazon:domain".into(),
420            "mstts:express-as".into(),
421            "google:style".into(),
422        ],
423    }
424}
425
426fn ibm_watson_capabilities() -> PlatformCapabilities {
427    PlatformCapabilities {
428        platform: "ibm-watson".into(),
429        ssml_elements: vec![
430            break_element(),
431            emphasis_element(),
432            prosody_element(),
433            audio_element(),
434            sub_element(),
435            mark_element(),
436            say_as_element("characters", "characters"),
437            say_as_element("number", "number"),
438            say_as_element("ordinal", "ordinal"),
439            say_as_element("date", "date"),
440            say_as_element("time", "time"),
441        ],
442        unsupported: vec![
443            "voice".into(),
444            "lang".into(),
445            "phoneme".into(),
446            "amazon:effect".into(),
447            "amazon:emotion".into(),
448            "amazon:domain".into(),
449            "mstts:express-as".into(),
450            "google:style".into(),
451        ],
452    }
453}
454
#[cfg(test)]
mod tests {
    use super::*;

    /// The platform variants exercised by the cross-platform tests.
    const ALL_PLATFORMS: [Platform; 8] = [
        Platform::AmazonAlexa,
        Platform::GoogleAssistant,
        Platform::MicrosoftAzure,
        Platform::Apple,
        Platform::W3c,
        Platform::SamsungBixby,
        Platform::ElevenLabs,
        Platform::IbmWatson,
    ];

    /// True when `caps` lists an element named exactly `name` as supported.
    fn supports(caps: &PlatformCapabilities, name: &str) -> bool {
        caps.ssml_elements.iter().any(|cap| cap.element == name)
    }

    #[test]
    fn test_all_platforms_have_capabilities() {
        for platform in ALL_PLATFORMS {
            let caps = get_supported_ssml(platform);
            assert!(!caps.ssml_elements.is_empty(), "{:?} has no elements", platform);
            assert!(!caps.platform.is_empty());
        }
    }

    #[test]
    fn test_alexa_has_emotion() {
        let alexa = get_supported_ssml(Platform::AmazonAlexa);
        assert!(supports(&alexa, "amazon:emotion"));
    }

    #[test]
    fn test_azure_has_express_as() {
        let azure = get_supported_ssml(Platform::MicrosoftAzure);
        assert!(supports(&azure, "mstts:express-as"));
    }

    #[test]
    fn test_google_no_voice() {
        let google = get_supported_ssml(Platform::GoogleAssistant);
        assert!(google.unsupported.iter().any(|entry| entry == "voice"));
    }

    #[test]
    fn test_azure_no_emphasis() {
        // Substring match: the Azure entry carries an explanatory suffix.
        let azure = get_supported_ssml(Platform::MicrosoftAzure);
        assert!(azure.unsupported.iter().any(|entry| entry.contains("emphasis")));
    }

    #[test]
    fn test_serialization() {
        // Round-trip through JSON and compare against the original value.
        let original = get_supported_ssml(Platform::AmazonAlexa);
        let encoded = serde_json::to_string(&original).unwrap();
        assert!(encoded.contains("amazon:emotion"));
        let decoded: PlatformCapabilities = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    #[test]
    fn test_all_platforms_have_break() {
        for platform in ALL_PLATFORMS {
            let caps = get_supported_ssml(platform);
            assert!(supports(&caps, "break"), "{:?} missing break", platform);
        }
    }
}