Skip to main content

speechmarkdown_rust/parser/
parser.rs

1use crate::ast::{AstNode, NodeType};
2use crate::error::Result;
3use crate::formatters::base::{FormatterOptions, Platform};
4use crate::formatters::{create_formatter, Formatter, TextFormatter};
5use crate::ssml_to_smd;
6
/// Entry point for parsing and converting SpeechMarkdown.
///
/// This is a stateless unit struct: everything is exposed as associated
/// functions (`parse`, `to_text`, `to_ssml`, `to_smd`, `is_speech_markdown`,
/// `validate`), so no instance is needed to use it.
pub struct SpeechMarkdownParser;
8
9impl SpeechMarkdownParser {
    /// Parse SpeechMarkdown text into an AST.
    ///
    /// Thin public wrapper around the private `parse_simple` parser; the
    /// returned node is the document root whose children are the parsed
    /// elements.
    ///
    /// # Errors
    ///
    /// Propagates whatever error `parse_simple` returns.
    pub fn parse(input: &str) -> Result<AstNode> {
        Self::parse_simple(input)
    }
14
15    /// Convert SpeechMarkdown to plain text
16    pub fn to_text(input: &str) -> Result<String> {
17        let ast = Self::parse(input)?;
18        let formatter = TextFormatter::new();
19        formatter.format(&ast)
20    }
21
22    /// Convert SpeechMarkdown to SSML for the specified platform
23    pub fn to_ssml(input: &str, platform: Platform) -> Result<String> {
24        let ast = Self::parse(input)?;
25        let options = FormatterOptions {
26            platform,
27            ..Default::default()
28        };
29        let formatter = create_formatter(platform, options);
30        formatter.format(&ast)
31    }
32
    /// Convert SSML to SpeechMarkdown (best-effort, lossy for unsupported elements).
    ///
    /// Delegates directly to `ssml_to_smd::ssml_to_smd`; no additional
    /// processing happens here.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `ssml_to_smd::ssml_to_smd`.
    pub fn to_smd(ssml: &str) -> Result<String> {
        ssml_to_smd::ssml_to_smd(ssml)
    }
37
38    /// Check if a string contains SpeechMarkdown syntax
39    pub fn is_speech_markdown(input: &str) -> bool {
40        if let Ok(ast) = Self::parse(input) {
41            ast.children.iter().any(|child| {
42                !matches!(
43                    child.node_type,
44                    NodeType::Document | NodeType::PlainText | NodeType::EmptyLine
45                )
46            })
47        } else {
48            false
49        }
50    }
51
52    /// Validate SpeechMarkdown input, returning an error message if invalid
53    pub fn validate(input: &str) -> Result<()> {
54        Self::parse(input)?;
55        Ok(())
56    }
57
58    /// Simple manual parser for basic SpeechMarkdown syntax
59    fn parse_simple(input: &str) -> Result<AstNode> {
60        let mut document = AstNode::document();
61        let mut current_text = String::new();
62        let mut chars = input.chars().peekable();
63
64        let flush_text = |doc: &mut AstNode, text: &mut String| {
65            if !text.is_empty() {
66                let node = AstNode::text(text.clone());
67                text.clear();
68                doc.children.push(node);
69            }
70        };
71
72        while let Some(c) = chars.next() {
73            match c {
74                '#' if chars.peek() == Some(&'[') => {
75                    flush_text(&mut document, &mut current_text);
76                    chars.next();
77                    let (section_content, found) = Self::read_until(&mut chars, ']');
78                    if found {
79                        let mut node = AstNode::new(NodeType::Section, section_content.clone());
80                        for modifier in section_content.split(';') {
81                            if let Some((key, value)) = modifier.split_once(':') {
82                                node = node
83                                    .with_attribute(key.trim(), Self::strip_quotes(value.trim()));
84                            } else {
85                                node = node.with_attribute("style", modifier.trim());
86                            }
87                        }
88                        document = document.add_child(node);
89                    } else {
90                        current_text.push('#');
91                        current_text.push('[');
92                        current_text.push_str(&section_content);
93                    }
94                }
95                '[' => {
96                    flush_text(&mut document, &mut current_text);
97                    let (bracket_content, found) = Self::read_until(&mut chars, ']');
98                    if found {
99                        if let Some(rest) = bracket_content.strip_prefix("break:") {
100                            let break_value = Self::strip_quotes(rest.trim());
101                            if Self::is_time_break(break_value) {
102                                document = document.add_child(AstNode::new(
103                                    NodeType::ShortBreak,
104                                    format!("[{}]", break_value),
105                                ));
106                            } else {
107                                let mut node =
108                                    AstNode::new(NodeType::Break, break_value.to_string());
109                                node = node.with_attribute("strength", break_value);
110                                document = document.add_child(node);
111                            }
112                        } else if let Some(rest) = bracket_content.strip_prefix("mark:") {
113                            let mark_value = Self::strip_quotes(rest.trim());
114                            document = document
115                                .add_child(AstNode::new(NodeType::Mark, mark_value.to_string()));
116                        } else if Self::is_time_break(&bracket_content) {
117                            document = document.add_child(AstNode::new(
118                                NodeType::ShortBreak,
119                                format!("[{}]", bracket_content),
120                            ));
121                        } else {
122                            current_text.push('[');
123                            current_text.push_str(&bracket_content);
124                            current_text.push(']');
125                        }
126                    } else {
127                        current_text.push('[');
128                        current_text.push_str(&bracket_content);
129                    }
130                }
131                '~' => {
132                    let prev_is_boundary = current_text.is_empty()
133                        || current_text.ends_with(|c: char| c.is_whitespace());
134                    if !prev_is_boundary {
135                        current_text.push('~');
136                    } else {
137                        flush_text(&mut document, &mut current_text);
138                        let mut emphasized_text = String::new();
139                        let mut found_end = false;
140                        while let Some(&next_c) = chars.peek() {
141                            chars.next();
142                            if next_c == '~' {
143                                found_end = true;
144                                break;
145                            }
146                            emphasized_text.push(next_c);
147                        }
148                        if found_end
149                            && !emphasized_text.is_empty()
150                            && !emphasized_text.contains(' ')
151                        {
152                            document = document.add_child(AstNode::new(
153                                NodeType::ShortEmphasisNone,
154                                emphasized_text,
155                            ));
156                        } else {
157                            current_text.push('~');
158                            current_text.push_str(&emphasized_text);
159                            if found_end {
160                                current_text.push('~');
161                            }
162                        }
163                    }
164                }
165                '-' => {
166                    let prev_is_boundary = current_text.is_empty()
167                        || current_text.ends_with(|c: char| c.is_whitespace());
168                    if !prev_is_boundary {
169                        current_text.push('-');
170                    } else {
171                        flush_text(&mut document, &mut current_text);
172                        let mut emphasized_text = String::new();
173                        let mut found_end = false;
174                        while let Some(&next_c) = chars.peek() {
175                            chars.next();
176                            if next_c == '\n' || next_c == '\r' {
177                                emphasized_text.push(next_c);
178                                break;
179                            }
180                            if next_c == '-' {
181                                let next_is_boundary =
182                                    chars.peek().is_none_or(|c| c.is_whitespace());
183                                if next_is_boundary {
184                                    found_end = true;
185                                    break;
186                                } else {
187                                    emphasized_text.push('-');
188                                }
189                            } else {
190                                emphasized_text.push(next_c);
191                            }
192                        }
193                        if found_end
194                            && !emphasized_text.is_empty()
195                            && !emphasized_text.contains(' ')
196                        {
197                            document = document.add_child(AstNode::new(
198                                NodeType::ShortEmphasisReduced,
199                                emphasized_text,
200                            ));
201                        } else {
202                            current_text.push('-');
203                            current_text.push_str(&emphasized_text);
204                            if found_end {
205                                current_text.push('-');
206                            }
207                        }
208                    }
209                }
210                '+' => {
211                    flush_text(&mut document, &mut current_text);
212                    let mut plus_count = 1;
213                    while chars.peek() == Some(&'+') {
214                        chars.next();
215                        plus_count += 1;
216                    }
217                    let mut emphasized_text = String::new();
218                    let mut found_end = false;
219                    while let Some(&next_c) = chars.peek() {
220                        if next_c == '+' {
221                            let mut closing_pluses = 0;
222                            while chars.peek() == Some(&'+') {
223                                chars.next();
224                                closing_pluses += 1;
225                            }
226                            if closing_pluses == plus_count {
227                                found_end = true;
228                                break;
229                            } else {
230                                for _ in 0..closing_pluses {
231                                    emphasized_text.push('+');
232                                }
233                            }
234                        } else {
235                            chars.next();
236                            emphasized_text.push(next_c);
237                        }
238                    }
239                    if found_end {
240                        let node_type = if plus_count >= 2 {
241                            NodeType::ShortEmphasisStrong
242                        } else {
243                            NodeType::ShortEmphasisModerate
244                        };
245                        document = document.add_child(AstNode::new(node_type, emphasized_text));
246                    } else {
247                        for _ in 0..plus_count {
248                            current_text.push('+');
249                        }
250                        current_text.push_str(&emphasized_text);
251                    }
252                }
253                '(' => {
254                    flush_text(&mut document, &mut current_text);
255                    let mut modifier_content = String::new();
256                    let mut found_closing_paren = false;
257                    while let Some(&next_c) = chars.peek() {
258                        chars.next();
259                        if next_c == ')' {
260                            found_closing_paren = true;
261                            break;
262                        }
263                        modifier_content.push(next_c);
264                    }
265
266                    if found_closing_paren {
267                        if chars.peek() == Some(&'[') {
268                            chars.next();
269                            let (modifiers, found_bracket) = Self::read_until(&mut chars, ']');
270                            if found_bracket {
271                                let mut node =
272                                    AstNode::new(NodeType::TextModifier, modifier_content);
273                                for modifier in modifiers.split(';') {
274                                    if let Some((key, value)) = modifier.split_once(':') {
275                                        node = node.with_attribute(
276                                            key.trim(),
277                                            Self::strip_quotes(value.trim()),
278                                        );
279                                    } else {
280                                        let key = modifier.trim();
281                                        if !key.is_empty() {
282                                            node = node.with_attribute(key, "");
283                                        }
284                                    }
285                                }
286                                document = document.add_child(node);
287                            } else {
288                                current_text.push('(');
289                                current_text.push_str(&modifier_content);
290                                current_text.push(')');
291                                current_text.push('[');
292                                current_text.push_str(&modifiers);
293                            }
294                        } else if chars.peek() == Some(&'{') {
295                            chars.next();
296                            let (alias_text, found_brace) = Self::read_until(&mut chars, '}');
297                            if found_brace {
298                                let mut node = AstNode::new(NodeType::ShortSub, modifier_content);
299                                if !alias_text.is_empty() {
300                                    node = node.with_attribute("alias", alias_text);
301                                }
302                                document = document.add_child(node);
303                            } else {
304                                current_text.push('(');
305                                current_text.push_str(&modifier_content);
306                                current_text.push(')');
307                                current_text.push('{');
308                                current_text.push_str(&alias_text);
309                            }
310                        } else if chars.peek() == Some(&'/') {
311                            chars.next();
312                            let mut phoneme = String::new();
313                            let mut found_slash = false;
314                            while let Some(&next_c) = chars.peek() {
315                                chars.next();
316                                if next_c == '/' {
317                                    found_slash = true;
318                                    break;
319                                }
320                                phoneme.push(next_c);
321                            }
322                            if found_slash {
323                                let mut node = AstNode::new(NodeType::ShortIpa, modifier_content);
324                                node = node.with_attribute("phoneme", phoneme);
325                                document = document.add_child(node);
326                            } else {
327                                current_text.push('(');
328                                current_text.push_str(&modifier_content);
329                                current_text.push(')');
330                                current_text.push('/');
331                                current_text.push_str(&phoneme);
332                            }
333                        } else {
334                            current_text.push('(');
335                            current_text.push_str(&modifier_content);
336                            current_text.push(')');
337                        }
338                    } else {
339                        current_text.push('(');
340                        current_text.push_str(&modifier_content);
341                    }
342                }
343                '/' => {
344                    flush_text(&mut document, &mut current_text);
345                    let mut ipa_content = String::new();
346                    let mut found_slash = false;
347                    while let Some(&next_c) = chars.peek() {
348                        if next_c == '/' {
349                            chars.next();
350                            found_slash = true;
351                            break;
352                        }
353                        if next_c == ' ' || next_c == '\n' || next_c == '\r' || next_c == '\t' {
354                            break;
355                        }
356                        chars.next();
357                        ipa_content.push(next_c);
358                    }
359                    if found_slash && !ipa_content.is_empty() {
360                        let mut node = AstNode::new(NodeType::BareIpa, "ipa".to_string());
361                        node = node.with_attribute("alphabet", "ipa");
362                        node = node.with_attribute("ph", ipa_content.trim().to_string());
363                        document = document.add_child(node);
364                    } else if found_slash {
365                        current_text.push('/');
366                        current_text.push('/');
367                    } else {
368                        current_text.push('/');
369                        current_text.push_str(&ipa_content);
370                    }
371                }
372                '{' => {
373                    flush_text(&mut document, &mut current_text);
374                    let (sub_text, found_brace) = Self::read_until(&mut chars, '}');
375                    if found_brace && !sub_text.is_empty() {
376                        let mut alias_text = String::new();
377                        while let Some(&next_c) = chars.peek() {
378                            if next_c.is_whitespace()
379                                || next_c == '('
380                                || next_c == '['
381                                || next_c == '+'
382                                || next_c == '~'
383                                || next_c == '!'
384                                || next_c == '/'
385                                || next_c == '{'
386                                || next_c == '}'
387                                || next_c == '#'
388                            {
389                                break;
390                            }
391                            chars.next();
392                            alias_text.push(next_c);
393                        }
394                        let mut node = AstNode::new(NodeType::ShortSub, sub_text);
395                        if !alias_text.is_empty() {
396                            node = node.with_attribute("alias", alias_text);
397                        }
398                        document = document.add_child(node);
399                    } else {
400                        current_text.push('{');
401                        current_text.push_str(&sub_text);
402                    }
403                }
404                '!' => {
405                    if chars.peek() == Some(&'[') {
406                        flush_text(&mut document, &mut current_text);
407                        chars.next();
408                        let (caption, found_caption_end) = Self::read_until(&mut chars, ']');
409
410                        if found_caption_end && chars.peek() == Some(&'(') {
411                            chars.next();
412                            let (url, found_url_end) = Self::read_until(&mut chars, ')');
413                            if found_url_end {
414                                let mut node = AstNode::new(NodeType::Audio, caption);
415                                node = node.with_attribute("src", Self::strip_quotes(&url));
416                                document = document.add_child(node);
417                            } else {
418                                current_text.push_str(&format!("![{}]", caption));
419                            }
420                        } else if found_caption_end && chars.peek() == Some(&'[') {
421                            chars.next();
422                            let (url, found_url_end) = Self::read_until(&mut chars, ']');
423                            if found_url_end {
424                                let mut node = AstNode::new(NodeType::Audio, caption);
425                                node = node.with_attribute("src", Self::strip_quotes(&url));
426                                document = document.add_child(node);
427                            } else {
428                                current_text.push_str(&format!("![{}]", caption));
429                            }
430                        } else if found_caption_end {
431                            let possible_url = Self::strip_quotes(&caption);
432                            if possible_url.starts_with("http://")
433                                || possible_url.starts_with("https://")
434                                || possible_url.starts_with("soundbank://")
435                                || possible_url.contains("://")
436                                || possible_url.contains('.')
437                            {
438                                let mut node = AstNode::new(NodeType::Audio, String::new());
439                                node = node.with_attribute("src", possible_url);
440                                document = document.add_child(node);
441                            } else {
442                                current_text.push_str(&format!("![{}]", caption));
443                            }
444                        } else {
445                            current_text.push_str(&format!("![{}", caption));
446                        }
447                    } else if chars.peek() == Some(&'(') {
448                        flush_text(&mut document, &mut current_text);
449                        chars.next();
450                        let (caption, found_caption_end) = Self::read_until(&mut chars, ')');
451                        if found_caption_end && chars.peek() == Some(&'[') {
452                            chars.next();
453                            let (url, found_url_end) = Self::read_until(&mut chars, ']');
454                            if found_url_end {
455                                let mut node = AstNode::new(NodeType::Audio, caption);
456                                node = node.with_attribute("src", Self::strip_quotes(&url));
457                                document = document.add_child(node);
458                            } else {
459                                current_text.push_str(&format!("!({}[", caption));
460                            }
461                        } else {
462                            current_text.push_str(&format!("!({}", caption));
463                        }
464                    } else {
465                        current_text.push('!');
466                    }
467                }
468                _ => {
469                    current_text.push(c);
470                }
471            }
472        }
473
474        if !current_text.is_empty() {
475            document = document.add_child(AstNode::text(current_text));
476        }
477
478        Ok(document)
479    }
480
481    fn strip_quotes(s: &str) -> &str {
482        let s = s.trim();
483        if s.len() >= 2 {
484            let first = s.chars().next().unwrap();
485            let last = s.chars().last().unwrap();
486            if (first == '"' && last == '"') || (first == '\'' && last == '\'') {
487                return &s[1..s.len() - 1];
488            }
489        }
490        s
491    }
492
493    fn is_time_break(s: &str) -> bool {
494        s.ends_with("s") || s.ends_with("ms")
495    }
496
497    fn read_until(chars: &mut std::iter::Peekable<std::str::Chars>, end: char) -> (String, bool) {
498        let mut content = String::new();
499        let mut found = false;
500        while let Some(&next_c) = chars.peek() {
501            chars.next();
502            if next_c == end {
503                found = true;
504                break;
505            }
506            content.push(next_c);
507        }
508        (content, found)
509    }
510}
511
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns `true` if any direct child of `ast` has the node type `expected`.
    fn has_child(ast: &AstNode, expected: NodeType) -> bool {
        ast.children.iter().any(|child| child.node_type == expected)
    }

    #[test]
    fn test_parse_simple_text() {
        let result = SpeechMarkdownParser::parse("Hello world");
        assert!(result.is_ok());

        let ast = result.unwrap();
        assert_eq!(ast.node_type, NodeType::Document);
        assert!(!ast.children.is_empty());
    }

    #[test]
    fn test_parse_short_break() {
        let result = SpeechMarkdownParser::parse("Sample [2s] text");
        assert!(result.is_ok());

        // `is_ok` alone cannot fail, so also check what was actually parsed.
        assert!(has_child(&result.unwrap(), NodeType::ShortBreak));
    }

    #[test]
    fn test_parse_emphasis_strong() {
        let result = SpeechMarkdownParser::parse("++strong emphasis++");
        assert!(result.is_ok());
        assert!(has_child(&result.unwrap(), NodeType::ShortEmphasisStrong));
    }

    #[test]
    fn test_parse_text_modifier() {
        let result = SpeechMarkdownParser::parse("(text)[voice:\"Kendra\"]");
        assert!(result.is_ok());
        assert!(has_child(&result.unwrap(), NodeType::TextModifier));
    }

    #[test]
    fn test_parse_audio() {
        let result = SpeechMarkdownParser::parse("![caption](\"https://example.com/audio.mp3\")");
        assert!(result.is_ok());
        assert!(has_child(&result.unwrap(), NodeType::Audio));
    }

    #[test]
    fn test_debug_substitution() {
        let input = "{Al}aluminum";
        let result = SpeechMarkdownParser::parse(input);
        assert!(result.is_ok());

        let ast = result.unwrap();
        assert_eq!(ast.node_type, NodeType::Document);
        assert!(has_child(&ast, NodeType::ShortSub));
    }

    /// Smoke test: only checks that the SSML conversion runs without
    /// panicking. The output is printed for manual inspection
    /// (`cargo test -- --nocapture`).
    #[test]
    fn test_debug_emphasis_ssml() {
        let input = "++strong emphasis++";
        let result =
            SpeechMarkdownParser::to_ssml(input, crate::formatters::base::Platform::AmazonAlexa);
        println!("=== Emphasis SSML Debug ===");
        println!("Input: {}", input);
        println!("SSML Result: {:?}", result);
        println!("==========================");
    }

    #[test]
    fn test_is_speech_markdown() {
        assert!(!SpeechMarkdownParser::is_speech_markdown("Hello world"));
        assert!(!SpeechMarkdownParser::is_speech_markdown(""));
        assert!(SpeechMarkdownParser::is_speech_markdown("Hello (world)[emphasis:\"strong\"]"));
        assert!(SpeechMarkdownParser::is_speech_markdown("Sample [2s] text"));
        assert!(SpeechMarkdownParser::is_speech_markdown("++strong++"));
        assert!(SpeechMarkdownParser::is_speech_markdown("~word~"));
        assert!(SpeechMarkdownParser::is_speech_markdown("{Al}aluminum"));
        assert!(SpeechMarkdownParser::is_speech_markdown("![audio](url)"));
    }

    #[test]
    fn test_validate() {
        assert!(SpeechMarkdownParser::validate("Hello world").is_ok());
        assert!(SpeechMarkdownParser::validate("Hello (world)[emphasis:\"strong\"]").is_ok());
        assert!(SpeechMarkdownParser::validate("Sample [2s] text").is_ok());
        assert!(SpeechMarkdownParser::validate("++strong++").is_ok());
    }
}