Skip to main content

speechmarkdown_rust/parser/
parser.rs

1use crate::ast::{AstNode, NodeType};
2use crate::error::Result;
3use crate::formatters::base::{FormatterOptions, Platform};
4use crate::formatters::{create_formatter, Formatter, TextFormatter};
5
/// Stateless entry point for parsing and converting SpeechMarkdown.
///
/// The type carries no data; all functionality is exposed through associated
/// functions such as `SpeechMarkdownParser::parse`.
pub struct SpeechMarkdownParser;
7
8impl SpeechMarkdownParser {
9    /// Parse SpeechMarkdown text into an AST
10    pub fn parse(input: &str) -> Result<AstNode> {
11        Self::parse_simple(input)
12    }
13
14    /// Convert SpeechMarkdown to plain text
15    pub fn to_text(input: &str) -> Result<String> {
16        let ast = Self::parse(input)?;
17        let formatter = TextFormatter::new();
18        formatter.format(&ast)
19    }
20
21    /// Convert SpeechMarkdown to SSML for the specified platform
22    pub fn to_ssml(input: &str, platform: Platform) -> Result<String> {
23        let ast = Self::parse(input)?;
24        let options = FormatterOptions {
25            platform,
26            ..Default::default()
27        };
28        let formatter = create_formatter(platform, options);
29        formatter.format(&ast)
30    }
31
32    /// Check if a string contains SpeechMarkdown syntax
33    pub fn is_speech_markdown(input: &str) -> bool {
34        if let Ok(ast) = Self::parse(input) {
35            ast.children.iter().any(|child| {
36                !matches!(
37                    child.node_type,
38                    NodeType::Document | NodeType::PlainText | NodeType::EmptyLine
39                )
40            })
41        } else {
42            false
43        }
44    }
45
46    /// Validate SpeechMarkdown input, returning an error message if invalid
47    pub fn validate(input: &str) -> Result<()> {
48        Self::parse(input)?;
49        Ok(())
50    }
51
52    /// Simple manual parser for basic SpeechMarkdown syntax
53    fn parse_simple(input: &str) -> Result<AstNode> {
54        let mut document = AstNode::document();
55        let mut current_text = String::new();
56        let mut chars = input.chars().peekable();
57
58        let flush_text = |doc: &mut AstNode, text: &mut String| {
59            if !text.is_empty() {
60                let node = AstNode::text(text.clone());
61                text.clear();
62                doc.children.push(node);
63            }
64        };
65
66        while let Some(c) = chars.next() {
67            match c {
68                '#' if chars.peek() == Some(&'[') => {
69                    flush_text(&mut document, &mut current_text);
70                    chars.next();
71                    let (section_content, found) = Self::read_until(&mut chars, ']');
72                    if found {
73                        let mut node = AstNode::new(NodeType::Section, section_content.clone());
74                        for modifier in section_content.split(';') {
75                            if let Some((key, value)) = modifier.split_once(':') {
76                                node = node
77                                    .with_attribute(key.trim(), Self::strip_quotes(value.trim()));
78                            } else {
79                                node = node.with_attribute("style", modifier.trim());
80                            }
81                        }
82                        document = document.add_child(node);
83                    } else {
84                        current_text.push('#');
85                        current_text.push('[');
86                        current_text.push_str(&section_content);
87                    }
88                }
89                '[' => {
90                    flush_text(&mut document, &mut current_text);
91                    let (bracket_content, found) = Self::read_until(&mut chars, ']');
92                    if found {
93                        if let Some(rest) = bracket_content.strip_prefix("break:") {
94                            let break_value = Self::strip_quotes(rest.trim());
95                            if Self::is_time_break(break_value) {
96                                document = document.add_child(AstNode::new(
97                                    NodeType::ShortBreak,
98                                    format!("[{}]", break_value),
99                                ));
100                            } else {
101                                let mut node =
102                                    AstNode::new(NodeType::Break, break_value.to_string());
103                                node = node.with_attribute("strength", break_value);
104                                document = document.add_child(node);
105                            }
106                        } else if let Some(rest) = bracket_content.strip_prefix("mark:") {
107                            let mark_value = Self::strip_quotes(rest.trim());
108                            document = document
109                                .add_child(AstNode::new(NodeType::Mark, mark_value.to_string()));
110                        } else if Self::is_time_break(&bracket_content) {
111                            document = document.add_child(AstNode::new(
112                                NodeType::ShortBreak,
113                                format!("[{}]", bracket_content),
114                            ));
115                        } else {
116                            current_text.push('[');
117                            current_text.push_str(&bracket_content);
118                            current_text.push(']');
119                        }
120                    } else {
121                        current_text.push('[');
122                        current_text.push_str(&bracket_content);
123                    }
124                }
125                '~' => {
126                    let prev_is_boundary = current_text.is_empty()
127                        || current_text.ends_with(|c: char| c.is_whitespace());
128                    if !prev_is_boundary {
129                        current_text.push('~');
130                    } else {
131                        flush_text(&mut document, &mut current_text);
132                        let mut emphasized_text = String::new();
133                        let mut found_end = false;
134                        while let Some(&next_c) = chars.peek() {
135                            chars.next();
136                            if next_c == '~' {
137                                found_end = true;
138                                break;
139                            }
140                            emphasized_text.push(next_c);
141                        }
142                        if found_end
143                            && !emphasized_text.is_empty()
144                            && !emphasized_text.contains(' ')
145                        {
146                            document = document.add_child(AstNode::new(
147                                NodeType::ShortEmphasisNone,
148                                emphasized_text,
149                            ));
150                        } else {
151                            current_text.push('~');
152                            current_text.push_str(&emphasized_text);
153                            if found_end {
154                                current_text.push('~');
155                            }
156                        }
157                    }
158                }
159                '-' => {
160                    let prev_is_boundary = current_text.is_empty()
161                        || current_text.ends_with(|c: char| c.is_whitespace());
162                    if !prev_is_boundary {
163                        current_text.push('-');
164                    } else {
165                        flush_text(&mut document, &mut current_text);
166                        let mut emphasized_text = String::new();
167                        let mut found_end = false;
168                        while let Some(&next_c) = chars.peek() {
169                            chars.next();
170                            if next_c == '\n' || next_c == '\r' {
171                                emphasized_text.push(next_c);
172                                break;
173                            }
174                            if next_c == '-' {
175                                let next_is_boundary =
176                                    chars.peek().is_none_or(|c| c.is_whitespace());
177                                if next_is_boundary {
178                                    found_end = true;
179                                    break;
180                                } else {
181                                    emphasized_text.push('-');
182                                }
183                            } else {
184                                emphasized_text.push(next_c);
185                            }
186                        }
187                        if found_end
188                            && !emphasized_text.is_empty()
189                            && !emphasized_text.contains(' ')
190                        {
191                            document = document.add_child(AstNode::new(
192                                NodeType::ShortEmphasisReduced,
193                                emphasized_text,
194                            ));
195                        } else {
196                            current_text.push('-');
197                            current_text.push_str(&emphasized_text);
198                            if found_end {
199                                current_text.push('-');
200                            }
201                        }
202                    }
203                }
204                '+' => {
205                    flush_text(&mut document, &mut current_text);
206                    let mut plus_count = 1;
207                    while chars.peek() == Some(&'+') {
208                        chars.next();
209                        plus_count += 1;
210                    }
211                    let mut emphasized_text = String::new();
212                    let mut found_end = false;
213                    while let Some(&next_c) = chars.peek() {
214                        if next_c == '+' {
215                            let mut closing_pluses = 0;
216                            while chars.peek() == Some(&'+') {
217                                chars.next();
218                                closing_pluses += 1;
219                            }
220                            if closing_pluses == plus_count {
221                                found_end = true;
222                                break;
223                            } else {
224                                for _ in 0..closing_pluses {
225                                    emphasized_text.push('+');
226                                }
227                            }
228                        } else {
229                            chars.next();
230                            emphasized_text.push(next_c);
231                        }
232                    }
233                    if found_end {
234                        let node_type = if plus_count >= 2 {
235                            NodeType::ShortEmphasisStrong
236                        } else {
237                            NodeType::ShortEmphasisModerate
238                        };
239                        document = document.add_child(AstNode::new(node_type, emphasized_text));
240                    } else {
241                        for _ in 0..plus_count {
242                            current_text.push('+');
243                        }
244                        current_text.push_str(&emphasized_text);
245                    }
246                }
247                '(' => {
248                    flush_text(&mut document, &mut current_text);
249                    let mut modifier_content = String::new();
250                    let mut found_closing_paren = false;
251                    while let Some(&next_c) = chars.peek() {
252                        chars.next();
253                        if next_c == ')' {
254                            found_closing_paren = true;
255                            break;
256                        }
257                        modifier_content.push(next_c);
258                    }
259
260                    if found_closing_paren {
261                        if chars.peek() == Some(&'[') {
262                            chars.next();
263                            let (modifiers, found_bracket) = Self::read_until(&mut chars, ']');
264                            if found_bracket {
265                                let mut node =
266                                    AstNode::new(NodeType::TextModifier, modifier_content);
267                                for modifier in modifiers.split(';') {
268                                    if let Some((key, value)) = modifier.split_once(':') {
269                                        node = node.with_attribute(
270                                            key.trim(),
271                                            Self::strip_quotes(value.trim()),
272                                        );
273                                    } else {
274                                        let key = modifier.trim();
275                                        if !key.is_empty() {
276                                            node = node.with_attribute(key, "");
277                                        }
278                                    }
279                                }
280                                document = document.add_child(node);
281                            } else {
282                                current_text.push('(');
283                                current_text.push_str(&modifier_content);
284                                current_text.push(')');
285                                current_text.push('[');
286                                current_text.push_str(&modifiers);
287                            }
288                        } else if chars.peek() == Some(&'{') {
289                            chars.next();
290                            let (alias_text, found_brace) = Self::read_until(&mut chars, '}');
291                            if found_brace {
292                                let mut node = AstNode::new(NodeType::ShortSub, modifier_content);
293                                if !alias_text.is_empty() {
294                                    node = node.with_attribute("alias", alias_text);
295                                }
296                                document = document.add_child(node);
297                            } else {
298                                current_text.push('(');
299                                current_text.push_str(&modifier_content);
300                                current_text.push(')');
301                                current_text.push('{');
302                                current_text.push_str(&alias_text);
303                            }
304                        } else if chars.peek() == Some(&'/') {
305                            chars.next();
306                            let mut phoneme = String::new();
307                            let mut found_slash = false;
308                            while let Some(&next_c) = chars.peek() {
309                                chars.next();
310                                if next_c == '/' {
311                                    found_slash = true;
312                                    break;
313                                }
314                                phoneme.push(next_c);
315                            }
316                            if found_slash {
317                                let mut node = AstNode::new(NodeType::ShortIpa, modifier_content);
318                                node = node.with_attribute("phoneme", phoneme);
319                                document = document.add_child(node);
320                            } else {
321                                current_text.push('(');
322                                current_text.push_str(&modifier_content);
323                                current_text.push(')');
324                                current_text.push('/');
325                                current_text.push_str(&phoneme);
326                            }
327                        } else {
328                            current_text.push('(');
329                            current_text.push_str(&modifier_content);
330                            current_text.push(')');
331                        }
332                    } else {
333                        current_text.push('(');
334                        current_text.push_str(&modifier_content);
335                    }
336                }
337                '/' => {
338                    flush_text(&mut document, &mut current_text);
339                    let mut ipa_content = String::new();
340                    let mut found_slash = false;
341                    while let Some(&next_c) = chars.peek() {
342                        if next_c == '/' {
343                            chars.next();
344                            found_slash = true;
345                            break;
346                        }
347                        if next_c == ' ' || next_c == '\n' || next_c == '\r' || next_c == '\t' {
348                            break;
349                        }
350                        chars.next();
351                        ipa_content.push(next_c);
352                    }
353                    if found_slash && !ipa_content.is_empty() {
354                        let mut node = AstNode::new(NodeType::BareIpa, "ipa".to_string());
355                        node = node.with_attribute("alphabet", "ipa");
356                        node = node.with_attribute("ph", ipa_content.trim().to_string());
357                        document = document.add_child(node);
358                    } else if found_slash {
359                        current_text.push('/');
360                        current_text.push('/');
361                    } else {
362                        current_text.push('/');
363                        current_text.push_str(&ipa_content);
364                    }
365                }
366                '{' => {
367                    flush_text(&mut document, &mut current_text);
368                    let (sub_text, found_brace) = Self::read_until(&mut chars, '}');
369                    if found_brace && !sub_text.is_empty() {
370                        let mut alias_text = String::new();
371                        while let Some(&next_c) = chars.peek() {
372                            if next_c.is_whitespace()
373                                || next_c == '('
374                                || next_c == '['
375                                || next_c == '+'
376                                || next_c == '~'
377                                || next_c == '!'
378                                || next_c == '/'
379                                || next_c == '{'
380                                || next_c == '}'
381                                || next_c == '#'
382                            {
383                                break;
384                            }
385                            chars.next();
386                            alias_text.push(next_c);
387                        }
388                        let mut node = AstNode::new(NodeType::ShortSub, sub_text);
389                        if !alias_text.is_empty() {
390                            node = node.with_attribute("alias", alias_text);
391                        }
392                        document = document.add_child(node);
393                    } else {
394                        current_text.push('{');
395                        current_text.push_str(&sub_text);
396                    }
397                }
398                '!' => {
399                    if chars.peek() == Some(&'[') {
400                        flush_text(&mut document, &mut current_text);
401                        chars.next();
402                        let (caption, found_caption_end) = Self::read_until(&mut chars, ']');
403
404                        if found_caption_end && chars.peek() == Some(&'(') {
405                            chars.next();
406                            let (url, found_url_end) = Self::read_until(&mut chars, ')');
407                            if found_url_end {
408                                let mut node = AstNode::new(NodeType::Audio, caption);
409                                node = node.with_attribute("src", Self::strip_quotes(&url));
410                                document = document.add_child(node);
411                            } else {
412                                current_text.push_str(&format!("![{}]", caption));
413                            }
414                        } else if found_caption_end && chars.peek() == Some(&'[') {
415                            chars.next();
416                            let (url, found_url_end) = Self::read_until(&mut chars, ']');
417                            if found_url_end {
418                                let mut node = AstNode::new(NodeType::Audio, caption);
419                                node = node.with_attribute("src", Self::strip_quotes(&url));
420                                document = document.add_child(node);
421                            } else {
422                                current_text.push_str(&format!("![{}]", caption));
423                            }
424                        } else if found_caption_end {
425                            let possible_url = Self::strip_quotes(&caption);
426                            if possible_url.starts_with("http://")
427                                || possible_url.starts_with("https://")
428                                || possible_url.starts_with("soundbank://")
429                                || possible_url.contains("://")
430                                || possible_url.contains('.')
431                            {
432                                let mut node = AstNode::new(NodeType::Audio, String::new());
433                                node = node.with_attribute("src", possible_url);
434                                document = document.add_child(node);
435                            } else {
436                                current_text.push_str(&format!("![{}]", caption));
437                            }
438                        } else {
439                            current_text.push_str(&format!("![{}", caption));
440                        }
441                    } else if chars.peek() == Some(&'(') {
442                        flush_text(&mut document, &mut current_text);
443                        chars.next();
444                        let (caption, found_caption_end) = Self::read_until(&mut chars, ')');
445                        if found_caption_end && chars.peek() == Some(&'[') {
446                            chars.next();
447                            let (url, found_url_end) = Self::read_until(&mut chars, ']');
448                            if found_url_end {
449                                let mut node = AstNode::new(NodeType::Audio, caption);
450                                node = node.with_attribute("src", Self::strip_quotes(&url));
451                                document = document.add_child(node);
452                            } else {
453                                current_text.push_str(&format!("!({}[", caption));
454                            }
455                        } else {
456                            current_text.push_str(&format!("!({}", caption));
457                        }
458                    } else {
459                        current_text.push('!');
460                    }
461                }
462                _ => {
463                    current_text.push(c);
464                }
465            }
466        }
467
468        if !current_text.is_empty() {
469            document = document.add_child(AstNode::text(current_text));
470        }
471
472        Ok(document)
473    }
474
475    fn strip_quotes(s: &str) -> &str {
476        let s = s.trim();
477        if s.len() >= 2 {
478            let first = s.chars().next().unwrap();
479            let last = s.chars().last().unwrap();
480            if (first == '"' && last == '"') || (first == '\'' && last == '\'') {
481                return &s[1..s.len() - 1];
482            }
483        }
484        s
485    }
486
487    fn is_time_break(s: &str) -> bool {
488        s.ends_with("s") || s.ends_with("ms")
489    }
490
491    fn read_until(chars: &mut std::iter::Peekable<std::str::Chars>, end: char) -> (String, bool) {
492        let mut content = String::new();
493        let mut found = false;
494        while let Some(&next_c) = chars.peek() {
495            chars.next();
496            if next_c == end {
497                found = true;
498                break;
499            }
500            content.push(next_c);
501        }
502        (content, found)
503    }
504}
505
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `input` and reports whether any direct child of the document has
    /// the `expected` node type.
    fn has_child_of_type(input: &str, expected: NodeType) -> bool {
        let ast = SpeechMarkdownParser::parse(input).expect("input should parse");
        ast.children.iter().any(|child| child.node_type == expected)
    }

    #[test]
    fn test_parse_simple_text() {
        let ast = SpeechMarkdownParser::parse("Hello world").expect("plain text should parse");
        assert_eq!(ast.node_type, NodeType::Document);
        assert!(!ast.children.is_empty());
    }

    #[test]
    fn test_parse_short_break() {
        assert!(has_child_of_type("Sample [2s] text", NodeType::ShortBreak));
    }

    #[test]
    fn test_parse_emphasis_strong() {
        assert!(has_child_of_type(
            "++strong emphasis++",
            NodeType::ShortEmphasisStrong
        ));
    }

    #[test]
    fn test_parse_text_modifier() {
        assert!(has_child_of_type(
            "(text)[voice:\"Kendra\"]",
            NodeType::TextModifier
        ));
    }

    #[test]
    fn test_parse_audio() {
        assert!(has_child_of_type(
            "![caption](\"https://example.com/audio.mp3\")",
            NodeType::Audio
        ));
    }

    #[test]
    fn test_parse_substitution() {
        assert!(has_child_of_type("{Al}aluminum", NodeType::ShortSub));
    }

    /// Smoke test only: the SSML text is produced by a formatter outside this
    /// file, so this just checks that the conversion runs without panicking.
    /// The result is printed to help when debugging with `--nocapture`.
    #[test]
    fn test_debug_emphasis_ssml() {
        let input = "++strong emphasis++";
        let result =
            SpeechMarkdownParser::to_ssml(input, crate::formatters::base::Platform::AmazonAlexa);
        println!("Input: {}", input);
        println!("SSML Result: {:?}", result);
    }

    #[test]
    fn test_is_speech_markdown() {
        assert!(!SpeechMarkdownParser::is_speech_markdown("Hello world"));
        assert!(!SpeechMarkdownParser::is_speech_markdown(""));
        assert!(SpeechMarkdownParser::is_speech_markdown("Hello (world)[emphasis:\"strong\"]"));
        assert!(SpeechMarkdownParser::is_speech_markdown("Sample [2s] text"));
        assert!(SpeechMarkdownParser::is_speech_markdown("++strong++"));
        assert!(SpeechMarkdownParser::is_speech_markdown("~word~"));
        assert!(SpeechMarkdownParser::is_speech_markdown("{Al}aluminum"));
        assert!(SpeechMarkdownParser::is_speech_markdown("![audio](url)"));
    }

    #[test]
    fn test_validate() {
        assert!(SpeechMarkdownParser::validate("Hello world").is_ok());
        assert!(SpeechMarkdownParser::validate("Hello (world)[emphasis:\"strong\"]").is_ok());
        assert!(SpeechMarkdownParser::validate("Sample [2s] text").is_ok());
        assert!(SpeechMarkdownParser::validate("++strong++").is_ok());
    }
}