Skip to main content

speechmarkdown_rust/parser/
parser.rs

1use crate::ast::{AstNode, NodeType};
2use crate::capabilities::PlatformCapabilities;
3use crate::error::Result;
4use crate::formatters::base::{FormatterOptions, Platform};
5use crate::formatters::{create_formatter, Formatter, TextFormatter};
6use crate::ssml_to_smd;
7
/// Stateless entry point for SpeechMarkdown parsing and conversion.
///
/// Every operation is exposed as an associated function; the type carries no data.
pub struct SpeechMarkdownParser;
9
10impl SpeechMarkdownParser {
11    /// Parse SpeechMarkdown text into an AST
12    pub fn parse(input: &str) -> Result<AstNode> {
13        Self::parse_simple(input)
14    }
15
16    /// Convert SpeechMarkdown to plain text
17    pub fn to_text(input: &str) -> Result<String> {
18        let ast = Self::parse(input)?;
19        let formatter = TextFormatter::new();
20        formatter.format(&ast)
21    }
22
23    /// Convert SpeechMarkdown to SSML for the specified platform
24    pub fn to_ssml(input: &str, platform: Platform) -> Result<String> {
25        let ast = Self::parse(input)?;
26        let options = FormatterOptions {
27            platform,
28            ..Default::default()
29        };
30        let formatter = create_formatter(platform, options);
31        formatter.format(&ast)
32    }
33
34    /// Convert SSML to SpeechMarkdown (best-effort, lossy for unsupported elements)
35    pub fn to_smd(ssml: &str) -> Result<String> {
36        ssml_to_smd::ssml_to_smd(ssml)
37    }
38
39    /// Get supported SSML elements for a platform
40    pub fn supported_ssml(platform: Platform) -> PlatformCapabilities {
41        crate::capabilities::get_supported_ssml(platform)
42    }
43
44    /// Check if a string contains SpeechMarkdown syntax
45    pub fn is_speech_markdown(input: &str) -> bool {
46        if let Ok(ast) = Self::parse(input) {
47            ast.children.iter().any(|child| {
48                !matches!(
49                    child.node_type,
50                    NodeType::Document | NodeType::PlainText | NodeType::EmptyLine
51                )
52            })
53        } else {
54            false
55        }
56    }
57
58    /// Validate SpeechMarkdown input, returning an error message if invalid
59    pub fn validate(input: &str) -> Result<()> {
60        Self::parse(input)?;
61        Ok(())
62    }
63
64    /// Simple manual parser for basic SpeechMarkdown syntax
65    fn parse_simple(input: &str) -> Result<AstNode> {
66        let mut document = AstNode::document();
67        let mut current_text = String::new();
68        let mut chars = input.chars().peekable();
69
70        let flush_text = |doc: &mut AstNode, text: &mut String| {
71            if !text.is_empty() {
72                let node = AstNode::text(text.clone());
73                text.clear();
74                doc.children.push(node);
75            }
76        };
77
78        while let Some(c) = chars.next() {
79            match c {
80                '#' if chars.peek() == Some(&'[') => {
81                    flush_text(&mut document, &mut current_text);
82                    chars.next();
83                    let (section_content, found) = Self::read_until(&mut chars, ']');
84                    if found {
85                        let mut node = AstNode::new(NodeType::Section, section_content.clone());
86                        for modifier in section_content.split(';') {
87                            if let Some((key, value)) = modifier.split_once(':') {
88                                node = node
89                                    .with_attribute(key.trim(), Self::strip_quotes(value.trim()));
90                            } else {
91                                node = node.with_attribute("style", modifier.trim());
92                            }
93                        }
94                        document = document.add_child(node);
95                    } else {
96                        current_text.push('#');
97                        current_text.push('[');
98                        current_text.push_str(&section_content);
99                    }
100                }
101                '[' => {
102                    flush_text(&mut document, &mut current_text);
103                    let (bracket_content, found) = Self::read_until(&mut chars, ']');
104                    if found {
105                        if let Some(rest) = bracket_content.strip_prefix("break:") {
106                            let break_value = Self::strip_quotes(rest.trim());
107                            if Self::is_time_break(break_value) {
108                                document = document.add_child(AstNode::new(
109                                    NodeType::ShortBreak,
110                                    format!("[{}]", break_value),
111                                ));
112                            } else {
113                                let mut node =
114                                    AstNode::new(NodeType::Break, break_value.to_string());
115                                node = node.with_attribute("strength", break_value);
116                                document = document.add_child(node);
117                            }
118                        } else if let Some(rest) = bracket_content.strip_prefix("mark:") {
119                            let mark_value = Self::strip_quotes(rest.trim());
120                            document = document
121                                .add_child(AstNode::new(NodeType::Mark, mark_value.to_string()));
122                        } else if Self::is_time_break(&bracket_content) {
123                            document = document.add_child(AstNode::new(
124                                NodeType::ShortBreak,
125                                format!("[{}]", bracket_content),
126                            ));
127                        } else {
128                            current_text.push('[');
129                            current_text.push_str(&bracket_content);
130                            current_text.push(']');
131                        }
132                    } else {
133                        current_text.push('[');
134                        current_text.push_str(&bracket_content);
135                    }
136                }
137                '~' => {
138                    let prev_is_boundary = current_text.is_empty()
139                        || current_text.ends_with(|c: char| c.is_whitespace());
140                    if !prev_is_boundary {
141                        current_text.push('~');
142                    } else {
143                        flush_text(&mut document, &mut current_text);
144                        let mut emphasized_text = String::new();
145                        let mut found_end = false;
146                        while let Some(&next_c) = chars.peek() {
147                            chars.next();
148                            if next_c == '~' {
149                                found_end = true;
150                                break;
151                            }
152                            emphasized_text.push(next_c);
153                        }
154                        if found_end
155                            && !emphasized_text.is_empty()
156                            && !emphasized_text.contains(' ')
157                        {
158                            document = document.add_child(AstNode::new(
159                                NodeType::ShortEmphasisNone,
160                                emphasized_text,
161                            ));
162                        } else {
163                            current_text.push('~');
164                            current_text.push_str(&emphasized_text);
165                            if found_end {
166                                current_text.push('~');
167                            }
168                        }
169                    }
170                }
171                '-' => {
172                    let prev_is_boundary = current_text.is_empty()
173                        || current_text.ends_with(|c: char| c.is_whitespace());
174                    if !prev_is_boundary {
175                        current_text.push('-');
176                    } else {
177                        flush_text(&mut document, &mut current_text);
178                        let mut emphasized_text = String::new();
179                        let mut found_end = false;
180                        while let Some(&next_c) = chars.peek() {
181                            chars.next();
182                            if next_c == '\n' || next_c == '\r' {
183                                emphasized_text.push(next_c);
184                                break;
185                            }
186                            if next_c == '-' {
187                                let next_is_boundary =
188                                    chars.peek().is_none_or(|c| c.is_whitespace());
189                                if next_is_boundary {
190                                    found_end = true;
191                                    break;
192                                } else {
193                                    emphasized_text.push('-');
194                                }
195                            } else {
196                                emphasized_text.push(next_c);
197                            }
198                        }
199                        if found_end
200                            && !emphasized_text.is_empty()
201                            && !emphasized_text.contains(' ')
202                        {
203                            document = document.add_child(AstNode::new(
204                                NodeType::ShortEmphasisReduced,
205                                emphasized_text,
206                            ));
207                        } else {
208                            current_text.push('-');
209                            current_text.push_str(&emphasized_text);
210                            if found_end {
211                                current_text.push('-');
212                            }
213                        }
214                    }
215                }
216                '+' => {
217                    flush_text(&mut document, &mut current_text);
218                    let mut plus_count = 1;
219                    while chars.peek() == Some(&'+') {
220                        chars.next();
221                        plus_count += 1;
222                    }
223                    let mut emphasized_text = String::new();
224                    let mut found_end = false;
225                    while let Some(&next_c) = chars.peek() {
226                        if next_c == '+' {
227                            let mut closing_pluses = 0;
228                            while chars.peek() == Some(&'+') {
229                                chars.next();
230                                closing_pluses += 1;
231                            }
232                            if closing_pluses == plus_count {
233                                found_end = true;
234                                break;
235                            } else {
236                                for _ in 0..closing_pluses {
237                                    emphasized_text.push('+');
238                                }
239                            }
240                        } else {
241                            chars.next();
242                            emphasized_text.push(next_c);
243                        }
244                    }
245                    if found_end {
246                        let node_type = if plus_count >= 2 {
247                            NodeType::ShortEmphasisStrong
248                        } else {
249                            NodeType::ShortEmphasisModerate
250                        };
251                        document = document.add_child(AstNode::new(node_type, emphasized_text));
252                    } else {
253                        for _ in 0..plus_count {
254                            current_text.push('+');
255                        }
256                        current_text.push_str(&emphasized_text);
257                    }
258                }
259                '(' => {
260                    flush_text(&mut document, &mut current_text);
261                    let mut modifier_content = String::new();
262                    let mut found_closing_paren = false;
263                    while let Some(&next_c) = chars.peek() {
264                        chars.next();
265                        if next_c == ')' {
266                            found_closing_paren = true;
267                            break;
268                        }
269                        modifier_content.push(next_c);
270                    }
271
272                    if found_closing_paren {
273                        if chars.peek() == Some(&'[') {
274                            chars.next();
275                            let (modifiers, found_bracket) = Self::read_until(&mut chars, ']');
276                            if found_bracket {
277                                let mut node =
278                                    AstNode::new(NodeType::TextModifier, modifier_content);
279                                for modifier in modifiers.split(';') {
280                                    if let Some((key, value)) = modifier.split_once(':') {
281                                        node = node.with_attribute(
282                                            key.trim(),
283                                            Self::strip_quotes(value.trim()),
284                                        );
285                                    } else {
286                                        let key = modifier.trim();
287                                        if !key.is_empty() {
288                                            node = node.with_attribute(key, "");
289                                        }
290                                    }
291                                }
292                                document = document.add_child(node);
293                            } else {
294                                current_text.push('(');
295                                current_text.push_str(&modifier_content);
296                                current_text.push(')');
297                                current_text.push('[');
298                                current_text.push_str(&modifiers);
299                            }
300                        } else if chars.peek() == Some(&'{') {
301                            chars.next();
302                            let (alias_text, found_brace) = Self::read_until(&mut chars, '}');
303                            if found_brace {
304                                let mut node = AstNode::new(NodeType::ShortSub, modifier_content);
305                                if !alias_text.is_empty() {
306                                    node = node.with_attribute("alias", alias_text);
307                                }
308                                document = document.add_child(node);
309                            } else {
310                                current_text.push('(');
311                                current_text.push_str(&modifier_content);
312                                current_text.push(')');
313                                current_text.push('{');
314                                current_text.push_str(&alias_text);
315                            }
316                        } else if chars.peek() == Some(&'/') {
317                            chars.next();
318                            let mut phoneme = String::new();
319                            let mut found_slash = false;
320                            while let Some(&next_c) = chars.peek() {
321                                chars.next();
322                                if next_c == '/' {
323                                    found_slash = true;
324                                    break;
325                                }
326                                phoneme.push(next_c);
327                            }
328                            if found_slash {
329                                let mut node = AstNode::new(NodeType::ShortIpa, modifier_content);
330                                node = node.with_attribute("phoneme", phoneme);
331                                document = document.add_child(node);
332                            } else {
333                                current_text.push('(');
334                                current_text.push_str(&modifier_content);
335                                current_text.push(')');
336                                current_text.push('/');
337                                current_text.push_str(&phoneme);
338                            }
339                        } else {
340                            current_text.push('(');
341                            current_text.push_str(&modifier_content);
342                            current_text.push(')');
343                        }
344                    } else {
345                        current_text.push('(');
346                        current_text.push_str(&modifier_content);
347                    }
348                }
349                '/' => {
350                    flush_text(&mut document, &mut current_text);
351                    let mut ipa_content = String::new();
352                    let mut found_slash = false;
353                    while let Some(&next_c) = chars.peek() {
354                        if next_c == '/' {
355                            chars.next();
356                            found_slash = true;
357                            break;
358                        }
359                        if next_c == ' ' || next_c == '\n' || next_c == '\r' || next_c == '\t' {
360                            break;
361                        }
362                        chars.next();
363                        ipa_content.push(next_c);
364                    }
365                    if found_slash && !ipa_content.is_empty() {
366                        let mut node = AstNode::new(NodeType::BareIpa, "ipa".to_string());
367                        node = node.with_attribute("alphabet", "ipa");
368                        node = node.with_attribute("ph", ipa_content.trim().to_string());
369                        document = document.add_child(node);
370                    } else if found_slash {
371                        current_text.push('/');
372                        current_text.push('/');
373                    } else {
374                        current_text.push('/');
375                        current_text.push_str(&ipa_content);
376                    }
377                }
378                '{' => {
379                    flush_text(&mut document, &mut current_text);
380                    let (sub_text, found_brace) = Self::read_until(&mut chars, '}');
381                    if found_brace && !sub_text.is_empty() {
382                        let mut alias_text = String::new();
383                        while let Some(&next_c) = chars.peek() {
384                            if next_c.is_whitespace()
385                                || next_c == '('
386                                || next_c == '['
387                                || next_c == '+'
388                                || next_c == '~'
389                                || next_c == '!'
390                                || next_c == '/'
391                                || next_c == '{'
392                                || next_c == '}'
393                                || next_c == '#'
394                            {
395                                break;
396                            }
397                            chars.next();
398                            alias_text.push(next_c);
399                        }
400                        let mut node = AstNode::new(NodeType::ShortSub, sub_text);
401                        if !alias_text.is_empty() {
402                            node = node.with_attribute("alias", alias_text);
403                        }
404                        document = document.add_child(node);
405                    } else {
406                        current_text.push('{');
407                        current_text.push_str(&sub_text);
408                    }
409                }
410                '!' => {
411                    if chars.peek() == Some(&'[') {
412                        flush_text(&mut document, &mut current_text);
413                        chars.next();
414                        let (caption, found_caption_end) = Self::read_until(&mut chars, ']');
415
416                        if found_caption_end && chars.peek() == Some(&'(') {
417                            chars.next();
418                            let (url, found_url_end) = Self::read_until(&mut chars, ')');
419                            if found_url_end {
420                                let mut node = AstNode::new(NodeType::Audio, caption);
421                                node = node.with_attribute("src", Self::strip_quotes(&url));
422                                document = document.add_child(node);
423                            } else {
424                                current_text.push_str(&format!("![{}]", caption));
425                            }
426                        } else if found_caption_end && chars.peek() == Some(&'[') {
427                            chars.next();
428                            let (url, found_url_end) = Self::read_until(&mut chars, ']');
429                            if found_url_end {
430                                let mut node = AstNode::new(NodeType::Audio, caption);
431                                node = node.with_attribute("src", Self::strip_quotes(&url));
432                                document = document.add_child(node);
433                            } else {
434                                current_text.push_str(&format!("![{}]", caption));
435                            }
436                        } else if found_caption_end {
437                            let possible_url = Self::strip_quotes(&caption);
438                            if possible_url.starts_with("http://")
439                                || possible_url.starts_with("https://")
440                                || possible_url.starts_with("soundbank://")
441                                || possible_url.contains("://")
442                                || possible_url.contains('.')
443                            {
444                                let mut node = AstNode::new(NodeType::Audio, String::new());
445                                node = node.with_attribute("src", possible_url);
446                                document = document.add_child(node);
447                            } else {
448                                current_text.push_str(&format!("![{}]", caption));
449                            }
450                        } else {
451                            current_text.push_str(&format!("![{}", caption));
452                        }
453                    } else if chars.peek() == Some(&'(') {
454                        flush_text(&mut document, &mut current_text);
455                        chars.next();
456                        let (caption, found_caption_end) = Self::read_until(&mut chars, ')');
457                        if found_caption_end && chars.peek() == Some(&'[') {
458                            chars.next();
459                            let (url, found_url_end) = Self::read_until(&mut chars, ']');
460                            if found_url_end {
461                                let mut node = AstNode::new(NodeType::Audio, caption);
462                                node = node.with_attribute("src", Self::strip_quotes(&url));
463                                document = document.add_child(node);
464                            } else {
465                                current_text.push_str(&format!("!({}[", caption));
466                            }
467                        } else {
468                            current_text.push_str(&format!("!({}", caption));
469                        }
470                    } else {
471                        current_text.push('!');
472                    }
473                }
474                _ => {
475                    current_text.push(c);
476                }
477            }
478        }
479
480        if !current_text.is_empty() {
481            document = document.add_child(AstNode::text(current_text));
482        }
483
484        Ok(document)
485    }
486
487    fn strip_quotes(s: &str) -> &str {
488        let s = s.trim();
489        if s.len() >= 2 {
490            let first = s.chars().next().unwrap();
491            let last = s.chars().last().unwrap();
492            if (first == '"' && last == '"') || (first == '\'' && last == '\'') {
493                return &s[1..s.len() - 1];
494            }
495        }
496        s
497    }
498
499    fn is_time_break(s: &str) -> bool {
500        s.ends_with("s") || s.ends_with("ms")
501    }
502
503    fn read_until(chars: &mut std::iter::Peekable<std::str::Chars>, end: char) -> (String, bool) {
504        let mut content = String::new();
505        let mut found = false;
506        while let Some(&next_c) = chars.peek() {
507            chars.next();
508            if next_c == end {
509                found = true;
510                break;
511            }
512            content.push(next_c);
513        }
514        (content, found)
515    }
516}
517
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain text parses into a document node with at least one child.
    #[test]
    fn test_parse_simple_text() {
        let parsed = SpeechMarkdownParser::parse("Hello world");
        assert!(parsed.is_ok());

        let root = parsed.unwrap();
        assert_eq!(root.node_type, NodeType::Document);
        assert!(!root.children.is_empty());
    }

    /// A short break in the middle of text parses without error.
    #[test]
    fn test_parse_short_break() {
        assert!(SpeechMarkdownParser::parse("Sample [2s] text").is_ok());
    }

    /// Strong short emphasis parses without error.
    #[test]
    fn test_parse_emphasis_strong() {
        assert!(SpeechMarkdownParser::parse("++strong emphasis++").is_ok());
    }

    /// A text modifier with a quoted value parses without error.
    #[test]
    fn test_parse_text_modifier() {
        assert!(SpeechMarkdownParser::parse("(text)[voice:\"Kendra\"]").is_ok());
    }

    /// Audio with a caption and a quoted URL parses without error.
    #[test]
    fn test_parse_audio() {
        let parsed = SpeechMarkdownParser::parse("![caption](\"https://example.com/audio.mp3\")");
        assert!(parsed.is_ok());
    }

    /// Dumps the AST of a substitution; only asserts that parsing succeeds.
    #[test]
    fn test_debug_substitution() {
        let input = "{Al}aluminum";
        let parsed = SpeechMarkdownParser::parse(input);
        assert!(parsed.is_ok());

        let ast = parsed.unwrap();
        println!("=== Substitution Debug ===");
        println!("Input: {}", input);
        println!("AST: {:?}", ast);
        println!("Children: {:?}", ast.children);
        println!("========================");
    }

    /// Dumps the Alexa SSML produced for strong emphasis; makes no assertions.
    #[test]
    fn test_debug_emphasis_ssml() {
        let input = "++strong emphasis++";
        let platform = crate::formatters::base::Platform::AmazonAlexa;
        let result = SpeechMarkdownParser::to_ssml(input, platform);
        println!("=== Emphasis SSML Debug ===");
        println!("Input: {}", input);
        println!("SSML Result: {:?}", result);
        println!("==========================");
    }

    /// Plain text is not SpeechMarkdown; each supported construct is.
    #[test]
    fn test_is_speech_markdown() {
        for plain in ["Hello world", ""] {
            assert!(!SpeechMarkdownParser::is_speech_markdown(plain));
        }

        let marked_up = [
            "Hello (world)[emphasis:\"strong\"]",
            "Sample [2s] text",
            "++strong++",
            "~word~",
            "{Al}aluminum",
            "![audio](url)",
        ];
        for sample in marked_up {
            assert!(SpeechMarkdownParser::is_speech_markdown(sample));
        }
    }

    /// Both plain text and marked-up text validate successfully.
    #[test]
    fn test_validate() {
        let samples = [
            "Hello world",
            "Hello (world)[emphasis:\"strong\"]",
            "Sample [2s] text",
            "++strong++",
        ];
        for sample in samples {
            assert!(SpeechMarkdownParser::validate(sample).is_ok());
        }
    }
}
600}