Skip to main content

speechmarkdown_rust/parser/
parser.rs

1use crate::ast::{AstNode, NodeType};
2use crate::error::Result;
3use crate::formatters::base::{FormatterOptions, Platform};
4use crate::formatters::{create_formatter, Formatter, TextFormatter};
5
/// Stateless entry point for parsing SpeechMarkdown and converting it to text or SSML.
///
/// The type carries no data; all functionality is exposed through associated functions.
#[derive(Debug, Clone, Copy, Default)]
pub struct SpeechMarkdownParser;
7
8impl SpeechMarkdownParser {
9    /// Parse SpeechMarkdown text into an AST
10    pub fn parse(input: &str) -> Result<AstNode> {
11        Self::parse_simple(input)
12    }
13
14    /// Convert SpeechMarkdown to plain text
15    pub fn to_text(input: &str) -> Result<String> {
16        let ast = Self::parse(input)?;
17        let formatter = TextFormatter::new();
18        formatter.format(&ast)
19    }
20
21    /// Convert SpeechMarkdown to SSML for the specified platform
22    pub fn to_ssml(input: &str, platform: Platform) -> Result<String> {
23        let ast = Self::parse(input)?;
24        let options = FormatterOptions {
25            platform,
26            ..Default::default()
27        };
28        let formatter = create_formatter(platform, options);
29        formatter.format(&ast)
30    }
31
32    /// Simple manual parser for basic SpeechMarkdown syntax
33    fn parse_simple(input: &str) -> Result<AstNode> {
34        let mut document = AstNode::document();
35        let mut current_text = String::new();
36        let mut chars = input.chars().peekable();
37
38        let flush_text = |doc: &mut AstNode, text: &mut String| {
39            if !text.is_empty() {
40                let node = AstNode::text(text.clone());
41                text.clear();
42                doc.children.push(node);
43            }
44        };
45
46        while let Some(c) = chars.next() {
47            match c {
48                '#' if chars.peek() == Some(&'[') => {
49                    flush_text(&mut document, &mut current_text);
50                    chars.next();
51                    let (section_content, found) = Self::read_until(&mut chars, ']');
52                    if found {
53                        let mut node = AstNode::new(NodeType::Section, section_content.clone());
54                        for modifier in section_content.split(';') {
55                            if let Some((key, value)) = modifier.split_once(':') {
56                                node = node
57                                    .with_attribute(key.trim(), Self::strip_quotes(value.trim()));
58                            } else {
59                                node = node.with_attribute("style", modifier.trim());
60                            }
61                        }
62                        document = document.add_child(node);
63                    } else {
64                        current_text.push('#');
65                        current_text.push('[');
66                        current_text.push_str(&section_content);
67                    }
68                }
69                '[' => {
70                    flush_text(&mut document, &mut current_text);
71                    let (bracket_content, found) = Self::read_until(&mut chars, ']');
72                    if found {
73                        if let Some(rest) = bracket_content.strip_prefix("break:") {
74                            let break_value = Self::strip_quotes(rest.trim());
75                            if Self::is_time_break(break_value) {
76                                document = document.add_child(AstNode::new(
77                                    NodeType::ShortBreak,
78                                    format!("[{}]", break_value),
79                                ));
80                            } else {
81                                let mut node =
82                                    AstNode::new(NodeType::Break, break_value.to_string());
83                                node = node.with_attribute("strength", break_value);
84                                document = document.add_child(node);
85                            }
86                        } else if let Some(rest) = bracket_content.strip_prefix("mark:") {
87                            let mark_value = Self::strip_quotes(rest.trim());
88                            document = document
89                                .add_child(AstNode::new(NodeType::Mark, mark_value.to_string()));
90                        } else if Self::is_time_break(&bracket_content) {
91                            document = document.add_child(AstNode::new(
92                                NodeType::ShortBreak,
93                                format!("[{}]", bracket_content),
94                            ));
95                        } else {
96                            current_text.push('[');
97                            current_text.push_str(&bracket_content);
98                            current_text.push(']');
99                        }
100                    } else {
101                        current_text.push('[');
102                        current_text.push_str(&bracket_content);
103                    }
104                }
105                '~' => {
106                    let prev_is_boundary = current_text.is_empty()
107                        || current_text.ends_with(|c: char| c.is_whitespace());
108                    if !prev_is_boundary {
109                        current_text.push('~');
110                    } else {
111                        flush_text(&mut document, &mut current_text);
112                        let mut emphasized_text = String::new();
113                        let mut found_end = false;
114                        while let Some(&next_c) = chars.peek() {
115                            chars.next();
116                            if next_c == '~' {
117                                found_end = true;
118                                break;
119                            }
120                            emphasized_text.push(next_c);
121                        }
122                        if found_end
123                            && !emphasized_text.is_empty()
124                            && !emphasized_text.contains(' ')
125                        {
126                            document = document.add_child(AstNode::new(
127                                NodeType::ShortEmphasisNone,
128                                emphasized_text,
129                            ));
130                        } else {
131                            current_text.push('~');
132                            current_text.push_str(&emphasized_text);
133                            if found_end {
134                                current_text.push('~');
135                            }
136                        }
137                    }
138                }
139                '-' => {
140                    let prev_is_boundary = current_text.is_empty()
141                        || current_text.ends_with(|c: char| c.is_whitespace());
142                    if !prev_is_boundary {
143                        current_text.push('-');
144                    } else {
145                        flush_text(&mut document, &mut current_text);
146                        let mut emphasized_text = String::new();
147                        let mut found_end = false;
148                        while let Some(&next_c) = chars.peek() {
149                            chars.next();
150                            if next_c == '\n' || next_c == '\r' {
151                                emphasized_text.push(next_c);
152                                break;
153                            }
154                            if next_c == '-' {
155                                let next_is_boundary =
156                                    chars.peek().is_none_or(|c| c.is_whitespace());
157                                if next_is_boundary {
158                                    found_end = true;
159                                    break;
160                                } else {
161                                    emphasized_text.push('-');
162                                }
163                            } else {
164                                emphasized_text.push(next_c);
165                            }
166                        }
167                        if found_end
168                            && !emphasized_text.is_empty()
169                            && !emphasized_text.contains(' ')
170                        {
171                            document = document.add_child(AstNode::new(
172                                NodeType::ShortEmphasisReduced,
173                                emphasized_text,
174                            ));
175                        } else {
176                            current_text.push('-');
177                            current_text.push_str(&emphasized_text);
178                            if found_end {
179                                current_text.push('-');
180                            }
181                        }
182                    }
183                }
184                '+' => {
185                    flush_text(&mut document, &mut current_text);
186                    let mut plus_count = 1;
187                    while chars.peek() == Some(&'+') {
188                        chars.next();
189                        plus_count += 1;
190                    }
191                    let mut emphasized_text = String::new();
192                    let mut found_end = false;
193                    while let Some(&next_c) = chars.peek() {
194                        if next_c == '+' {
195                            let mut closing_pluses = 0;
196                            while chars.peek() == Some(&'+') {
197                                chars.next();
198                                closing_pluses += 1;
199                            }
200                            if closing_pluses == plus_count {
201                                found_end = true;
202                                break;
203                            } else {
204                                for _ in 0..closing_pluses {
205                                    emphasized_text.push('+');
206                                }
207                            }
208                        } else {
209                            chars.next();
210                            emphasized_text.push(next_c);
211                        }
212                    }
213                    if found_end {
214                        let node_type = if plus_count >= 2 {
215                            NodeType::ShortEmphasisStrong
216                        } else {
217                            NodeType::ShortEmphasisModerate
218                        };
219                        document = document.add_child(AstNode::new(node_type, emphasized_text));
220                    } else {
221                        for _ in 0..plus_count {
222                            current_text.push('+');
223                        }
224                        current_text.push_str(&emphasized_text);
225                    }
226                }
227                '(' => {
228                    flush_text(&mut document, &mut current_text);
229                    let mut modifier_content = String::new();
230                    let mut found_closing_paren = false;
231                    while let Some(&next_c) = chars.peek() {
232                        chars.next();
233                        if next_c == ')' {
234                            found_closing_paren = true;
235                            break;
236                        }
237                        modifier_content.push(next_c);
238                    }
239
240                    if found_closing_paren {
241                        if chars.peek() == Some(&'[') {
242                            chars.next();
243                            let (modifiers, found_bracket) = Self::read_until(&mut chars, ']');
244                            if found_bracket {
245                                let mut node =
246                                    AstNode::new(NodeType::TextModifier, modifier_content);
247                                for modifier in modifiers.split(';') {
248                                    if let Some((key, value)) = modifier.split_once(':') {
249                                        node = node.with_attribute(
250                                            key.trim(),
251                                            Self::strip_quotes(value.trim()),
252                                        );
253                                    } else {
254                                        let key = modifier.trim();
255                                        if !key.is_empty() {
256                                            node = node.with_attribute(key, "");
257                                        }
258                                    }
259                                }
260                                document = document.add_child(node);
261                            } else {
262                                current_text.push('(');
263                                current_text.push_str(&modifier_content);
264                                current_text.push(')');
265                                current_text.push('[');
266                                current_text.push_str(&modifiers);
267                            }
268                        } else if chars.peek() == Some(&'{') {
269                            chars.next();
270                            let (alias_text, found_brace) = Self::read_until(&mut chars, '}');
271                            if found_brace {
272                                let mut node = AstNode::new(NodeType::ShortSub, modifier_content);
273                                if !alias_text.is_empty() {
274                                    node = node.with_attribute("alias", alias_text);
275                                }
276                                document = document.add_child(node);
277                            } else {
278                                current_text.push('(');
279                                current_text.push_str(&modifier_content);
280                                current_text.push(')');
281                                current_text.push('{');
282                                current_text.push_str(&alias_text);
283                            }
284                        } else if chars.peek() == Some(&'/') {
285                            chars.next();
286                            let mut phoneme = String::new();
287                            let mut found_slash = false;
288                            while let Some(&next_c) = chars.peek() {
289                                chars.next();
290                                if next_c == '/' {
291                                    found_slash = true;
292                                    break;
293                                }
294                                phoneme.push(next_c);
295                            }
296                            if found_slash {
297                                let mut node = AstNode::new(NodeType::ShortIpa, modifier_content);
298                                node = node.with_attribute("phoneme", phoneme);
299                                document = document.add_child(node);
300                            } else {
301                                current_text.push('(');
302                                current_text.push_str(&modifier_content);
303                                current_text.push(')');
304                                current_text.push('/');
305                                current_text.push_str(&phoneme);
306                            }
307                        } else {
308                            current_text.push('(');
309                            current_text.push_str(&modifier_content);
310                            current_text.push(')');
311                        }
312                    } else {
313                        current_text.push('(');
314                        current_text.push_str(&modifier_content);
315                    }
316                }
317                '/' => {
318                    flush_text(&mut document, &mut current_text);
319                    let mut ipa_content = String::new();
320                    let mut found_slash = false;
321                    while let Some(&next_c) = chars.peek() {
322                        if next_c == '/' {
323                            chars.next();
324                            found_slash = true;
325                            break;
326                        }
327                        if next_c == ' ' || next_c == '\n' || next_c == '\r' || next_c == '\t' {
328                            break;
329                        }
330                        chars.next();
331                        ipa_content.push(next_c);
332                    }
333                    if found_slash && !ipa_content.is_empty() {
334                        let mut node = AstNode::new(NodeType::BareIpa, "ipa".to_string());
335                        node = node.with_attribute("alphabet", "ipa");
336                        node = node.with_attribute("ph", ipa_content.trim().to_string());
337                        document = document.add_child(node);
338                    } else if found_slash {
339                        current_text.push('/');
340                        current_text.push('/');
341                    } else {
342                        current_text.push('/');
343                        current_text.push_str(&ipa_content);
344                    }
345                }
346                '{' => {
347                    flush_text(&mut document, &mut current_text);
348                    let (sub_text, found_brace) = Self::read_until(&mut chars, '}');
349                    if found_brace && !sub_text.is_empty() {
350                        let mut alias_text = String::new();
351                        while let Some(&next_c) = chars.peek() {
352                            if next_c.is_whitespace()
353                                || next_c == '('
354                                || next_c == '['
355                                || next_c == '+'
356                                || next_c == '~'
357                                || next_c == '!'
358                                || next_c == '/'
359                                || next_c == '{'
360                                || next_c == '}'
361                                || next_c == '#'
362                            {
363                                break;
364                            }
365                            chars.next();
366                            alias_text.push(next_c);
367                        }
368                        let mut node = AstNode::new(NodeType::ShortSub, sub_text);
369                        if !alias_text.is_empty() {
370                            node = node.with_attribute("alias", alias_text);
371                        }
372                        document = document.add_child(node);
373                    } else {
374                        current_text.push('{');
375                        current_text.push_str(&sub_text);
376                    }
377                }
378                '!' => {
379                    if chars.peek() == Some(&'[') {
380                        flush_text(&mut document, &mut current_text);
381                        chars.next();
382                        let (caption, found_caption_end) = Self::read_until(&mut chars, ']');
383
384                        if found_caption_end && chars.peek() == Some(&'(') {
385                            chars.next();
386                            let (url, found_url_end) = Self::read_until(&mut chars, ')');
387                            if found_url_end {
388                                let mut node = AstNode::new(NodeType::Audio, caption);
389                                node = node.with_attribute("src", Self::strip_quotes(&url));
390                                document = document.add_child(node);
391                            } else {
392                                current_text.push_str(&format!("![{}]", caption));
393                            }
394                        } else if found_caption_end && chars.peek() == Some(&'[') {
395                            chars.next();
396                            let (url, found_url_end) = Self::read_until(&mut chars, ']');
397                            if found_url_end {
398                                let mut node = AstNode::new(NodeType::Audio, caption);
399                                node = node.with_attribute("src", Self::strip_quotes(&url));
400                                document = document.add_child(node);
401                            } else {
402                                current_text.push_str(&format!("![{}]", caption));
403                            }
404                        } else if found_caption_end {
405                            let possible_url = Self::strip_quotes(&caption);
406                            if possible_url.starts_with("http://")
407                                || possible_url.starts_with("https://")
408                                || possible_url.starts_with("soundbank://")
409                                || possible_url.contains("://")
410                                || possible_url.contains('.')
411                            {
412                                let mut node = AstNode::new(NodeType::Audio, String::new());
413                                node = node.with_attribute("src", possible_url);
414                                document = document.add_child(node);
415                            } else {
416                                current_text.push_str(&format!("![{}]", caption));
417                            }
418                        } else {
419                            current_text.push_str(&format!("![{}", caption));
420                        }
421                    } else if chars.peek() == Some(&'(') {
422                        flush_text(&mut document, &mut current_text);
423                        chars.next();
424                        let (caption, found_caption_end) = Self::read_until(&mut chars, ')');
425                        if found_caption_end && chars.peek() == Some(&'[') {
426                            chars.next();
427                            let (url, found_url_end) = Self::read_until(&mut chars, ']');
428                            if found_url_end {
429                                let mut node = AstNode::new(NodeType::Audio, caption);
430                                node = node.with_attribute("src", Self::strip_quotes(&url));
431                                document = document.add_child(node);
432                            } else {
433                                current_text.push_str(&format!("!({}[", caption));
434                            }
435                        } else {
436                            current_text.push_str(&format!("!({}", caption));
437                        }
438                    } else {
439                        current_text.push('!');
440                    }
441                }
442                _ => {
443                    current_text.push(c);
444                }
445            }
446        }
447
448        if !current_text.is_empty() {
449            document = document.add_child(AstNode::text(current_text));
450        }
451
452        Ok(document)
453    }
454
455    fn strip_quotes(s: &str) -> &str {
456        let s = s.trim();
457        if s.len() >= 2 {
458            let first = s.chars().next().unwrap();
459            let last = s.chars().last().unwrap();
460            if (first == '"' && last == '"') || (first == '\'' && last == '\'') {
461                return &s[1..s.len() - 1];
462            }
463        }
464        s
465    }
466
467    fn is_time_break(s: &str) -> bool {
468        s.ends_with("s") || s.ends_with("ms")
469    }
470
471    fn read_until(chars: &mut std::iter::Peekable<std::str::Chars>, end: char) -> (String, bool) {
472        let mut content = String::new();
473        let mut found = false;
474        while let Some(&next_c) = chars.peek() {
475            chars.next();
476            if next_c == end {
477                found = true;
478                break;
479            }
480            content.push(next_c);
481        }
482        (content, found)
483    }
484}
485
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain text parses into a document node with at least one child.
    #[test]
    fn test_parse_simple_text() {
        let outcome = SpeechMarkdownParser::parse("Hello world");
        assert!(outcome.is_ok());

        let root = outcome.unwrap();
        assert_eq!(root.node_type, NodeType::Document);
        assert!(!root.children.is_empty());
    }

    /// A bracketed duration in the middle of text is accepted.
    #[test]
    fn test_parse_short_break() {
        assert!(SpeechMarkdownParser::parse("Sample [2s] text").is_ok());
    }

    /// Double-plus emphasis is accepted.
    #[test]
    fn test_parse_emphasis_strong() {
        assert!(SpeechMarkdownParser::parse("++strong emphasis++").is_ok());
    }

    /// A parenthesised text with a bracketed modifier list is accepted.
    #[test]
    fn test_parse_text_modifier() {
        assert!(SpeechMarkdownParser::parse("(text)[voice:\"Kendra\"]").is_ok());
    }

    /// The caption-plus-url audio form is accepted.
    #[test]
    fn test_parse_audio() {
        let source = "![caption](\"https://example.com/audio.mp3\")";
        assert!(SpeechMarkdownParser::parse(source).is_ok());
    }

    /// Parses a substitution and dumps the tree for manual inspection.
    #[test]
    fn test_debug_substitution() {
        let input = "{Al}aluminum";
        let outcome = SpeechMarkdownParser::parse(input);
        assert!(outcome.is_ok());

        let ast = outcome.unwrap();
        println!("=== Substitution Debug ===");
        println!("Input: {}", input);
        println!("AST: {:?}", ast);
        println!("Children: {:?}", ast.children);
        println!("========================");
    }

    /// Converts strong emphasis to SSML and dumps the result for manual inspection.
    /// Nothing is asserted here; the output is only printed.
    #[test]
    fn test_debug_emphasis_ssml() {
        let input = "++strong emphasis++";
        let ssml = SpeechMarkdownParser::to_ssml(input, Platform::AmazonAlexa);
        println!("=== Emphasis SSML Debug ===");
        println!("Input: {}", input);
        println!("SSML Result: {:?}", ssml);
        println!("==========================");
    }
}