speechmarkdown_rust/parser/
parser.rs1use crate::ast::{AstNode, NodeType};
2use crate::error::Result;
3use crate::formatters::base::{FormatterOptions, Platform};
4use crate::formatters::{create_formatter, Formatter, TextFormatter};
5
/// Stateless entry point for parsing and converting Speech Markdown.
///
/// The type carries no data; all functionality is exposed through associated
/// functions such as `parse`, `to_text` and `to_ssml`.
#[derive(Debug, Clone, Copy, Default)]
pub struct SpeechMarkdownParser;
7
8impl SpeechMarkdownParser {
9 pub fn parse(input: &str) -> Result<AstNode> {
11 Self::parse_simple(input)
12 }
13
14 pub fn to_text(input: &str) -> Result<String> {
16 let ast = Self::parse(input)?;
17 let formatter = TextFormatter::new();
18 formatter.format(&ast)
19 }
20
21 pub fn to_ssml(input: &str, platform: Platform) -> Result<String> {
23 let ast = Self::parse(input)?;
24 let options = FormatterOptions {
25 platform,
26 ..Default::default()
27 };
28 let formatter = create_formatter(platform, options);
29 formatter.format(&ast)
30 }
31
32 pub fn is_speech_markdown(input: &str) -> bool {
34 if let Ok(ast) = Self::parse(input) {
35 ast.children.iter().any(|child| {
36 !matches!(
37 child.node_type,
38 NodeType::Document | NodeType::PlainText | NodeType::EmptyLine
39 )
40 })
41 } else {
42 false
43 }
44 }
45
46 pub fn validate(input: &str) -> Result<()> {
48 Self::parse(input)?;
49 Ok(())
50 }
51
52 fn parse_simple(input: &str) -> Result<AstNode> {
54 let mut document = AstNode::document();
55 let mut current_text = String::new();
56 let mut chars = input.chars().peekable();
57
58 let flush_text = |doc: &mut AstNode, text: &mut String| {
59 if !text.is_empty() {
60 let node = AstNode::text(text.clone());
61 text.clear();
62 doc.children.push(node);
63 }
64 };
65
66 while let Some(c) = chars.next() {
67 match c {
68 '#' if chars.peek() == Some(&'[') => {
69 flush_text(&mut document, &mut current_text);
70 chars.next();
71 let (section_content, found) = Self::read_until(&mut chars, ']');
72 if found {
73 let mut node = AstNode::new(NodeType::Section, section_content.clone());
74 for modifier in section_content.split(';') {
75 if let Some((key, value)) = modifier.split_once(':') {
76 node = node
77 .with_attribute(key.trim(), Self::strip_quotes(value.trim()));
78 } else {
79 node = node.with_attribute("style", modifier.trim());
80 }
81 }
82 document = document.add_child(node);
83 } else {
84 current_text.push('#');
85 current_text.push('[');
86 current_text.push_str(§ion_content);
87 }
88 }
89 '[' => {
90 flush_text(&mut document, &mut current_text);
91 let (bracket_content, found) = Self::read_until(&mut chars, ']');
92 if found {
93 if let Some(rest) = bracket_content.strip_prefix("break:") {
94 let break_value = Self::strip_quotes(rest.trim());
95 if Self::is_time_break(break_value) {
96 document = document.add_child(AstNode::new(
97 NodeType::ShortBreak,
98 format!("[{}]", break_value),
99 ));
100 } else {
101 let mut node =
102 AstNode::new(NodeType::Break, break_value.to_string());
103 node = node.with_attribute("strength", break_value);
104 document = document.add_child(node);
105 }
106 } else if let Some(rest) = bracket_content.strip_prefix("mark:") {
107 let mark_value = Self::strip_quotes(rest.trim());
108 document = document
109 .add_child(AstNode::new(NodeType::Mark, mark_value.to_string()));
110 } else if Self::is_time_break(&bracket_content) {
111 document = document.add_child(AstNode::new(
112 NodeType::ShortBreak,
113 format!("[{}]", bracket_content),
114 ));
115 } else {
116 current_text.push('[');
117 current_text.push_str(&bracket_content);
118 current_text.push(']');
119 }
120 } else {
121 current_text.push('[');
122 current_text.push_str(&bracket_content);
123 }
124 }
125 '~' => {
126 let prev_is_boundary = current_text.is_empty()
127 || current_text.ends_with(|c: char| c.is_whitespace());
128 if !prev_is_boundary {
129 current_text.push('~');
130 } else {
131 flush_text(&mut document, &mut current_text);
132 let mut emphasized_text = String::new();
133 let mut found_end = false;
134 while let Some(&next_c) = chars.peek() {
135 chars.next();
136 if next_c == '~' {
137 found_end = true;
138 break;
139 }
140 emphasized_text.push(next_c);
141 }
142 if found_end
143 && !emphasized_text.is_empty()
144 && !emphasized_text.contains(' ')
145 {
146 document = document.add_child(AstNode::new(
147 NodeType::ShortEmphasisNone,
148 emphasized_text,
149 ));
150 } else {
151 current_text.push('~');
152 current_text.push_str(&emphasized_text);
153 if found_end {
154 current_text.push('~');
155 }
156 }
157 }
158 }
159 '-' => {
160 let prev_is_boundary = current_text.is_empty()
161 || current_text.ends_with(|c: char| c.is_whitespace());
162 if !prev_is_boundary {
163 current_text.push('-');
164 } else {
165 flush_text(&mut document, &mut current_text);
166 let mut emphasized_text = String::new();
167 let mut found_end = false;
168 while let Some(&next_c) = chars.peek() {
169 chars.next();
170 if next_c == '\n' || next_c == '\r' {
171 emphasized_text.push(next_c);
172 break;
173 }
174 if next_c == '-' {
175 let next_is_boundary =
176 chars.peek().is_none_or(|c| c.is_whitespace());
177 if next_is_boundary {
178 found_end = true;
179 break;
180 } else {
181 emphasized_text.push('-');
182 }
183 } else {
184 emphasized_text.push(next_c);
185 }
186 }
187 if found_end
188 && !emphasized_text.is_empty()
189 && !emphasized_text.contains(' ')
190 {
191 document = document.add_child(AstNode::new(
192 NodeType::ShortEmphasisReduced,
193 emphasized_text,
194 ));
195 } else {
196 current_text.push('-');
197 current_text.push_str(&emphasized_text);
198 if found_end {
199 current_text.push('-');
200 }
201 }
202 }
203 }
204 '+' => {
205 flush_text(&mut document, &mut current_text);
206 let mut plus_count = 1;
207 while chars.peek() == Some(&'+') {
208 chars.next();
209 plus_count += 1;
210 }
211 let mut emphasized_text = String::new();
212 let mut found_end = false;
213 while let Some(&next_c) = chars.peek() {
214 if next_c == '+' {
215 let mut closing_pluses = 0;
216 while chars.peek() == Some(&'+') {
217 chars.next();
218 closing_pluses += 1;
219 }
220 if closing_pluses == plus_count {
221 found_end = true;
222 break;
223 } else {
224 for _ in 0..closing_pluses {
225 emphasized_text.push('+');
226 }
227 }
228 } else {
229 chars.next();
230 emphasized_text.push(next_c);
231 }
232 }
233 if found_end {
234 let node_type = if plus_count >= 2 {
235 NodeType::ShortEmphasisStrong
236 } else {
237 NodeType::ShortEmphasisModerate
238 };
239 document = document.add_child(AstNode::new(node_type, emphasized_text));
240 } else {
241 for _ in 0..plus_count {
242 current_text.push('+');
243 }
244 current_text.push_str(&emphasized_text);
245 }
246 }
247 '(' => {
248 flush_text(&mut document, &mut current_text);
249 let mut modifier_content = String::new();
250 let mut found_closing_paren = false;
251 while let Some(&next_c) = chars.peek() {
252 chars.next();
253 if next_c == ')' {
254 found_closing_paren = true;
255 break;
256 }
257 modifier_content.push(next_c);
258 }
259
260 if found_closing_paren {
261 if chars.peek() == Some(&'[') {
262 chars.next();
263 let (modifiers, found_bracket) = Self::read_until(&mut chars, ']');
264 if found_bracket {
265 let mut node =
266 AstNode::new(NodeType::TextModifier, modifier_content);
267 for modifier in modifiers.split(';') {
268 if let Some((key, value)) = modifier.split_once(':') {
269 node = node.with_attribute(
270 key.trim(),
271 Self::strip_quotes(value.trim()),
272 );
273 } else {
274 let key = modifier.trim();
275 if !key.is_empty() {
276 node = node.with_attribute(key, "");
277 }
278 }
279 }
280 document = document.add_child(node);
281 } else {
282 current_text.push('(');
283 current_text.push_str(&modifier_content);
284 current_text.push(')');
285 current_text.push('[');
286 current_text.push_str(&modifiers);
287 }
288 } else if chars.peek() == Some(&'{') {
289 chars.next();
290 let (alias_text, found_brace) = Self::read_until(&mut chars, '}');
291 if found_brace {
292 let mut node = AstNode::new(NodeType::ShortSub, modifier_content);
293 if !alias_text.is_empty() {
294 node = node.with_attribute("alias", alias_text);
295 }
296 document = document.add_child(node);
297 } else {
298 current_text.push('(');
299 current_text.push_str(&modifier_content);
300 current_text.push(')');
301 current_text.push('{');
302 current_text.push_str(&alias_text);
303 }
304 } else if chars.peek() == Some(&'/') {
305 chars.next();
306 let mut phoneme = String::new();
307 let mut found_slash = false;
308 while let Some(&next_c) = chars.peek() {
309 chars.next();
310 if next_c == '/' {
311 found_slash = true;
312 break;
313 }
314 phoneme.push(next_c);
315 }
316 if found_slash {
317 let mut node = AstNode::new(NodeType::ShortIpa, modifier_content);
318 node = node.with_attribute("phoneme", phoneme);
319 document = document.add_child(node);
320 } else {
321 current_text.push('(');
322 current_text.push_str(&modifier_content);
323 current_text.push(')');
324 current_text.push('/');
325 current_text.push_str(&phoneme);
326 }
327 } else {
328 current_text.push('(');
329 current_text.push_str(&modifier_content);
330 current_text.push(')');
331 }
332 } else {
333 current_text.push('(');
334 current_text.push_str(&modifier_content);
335 }
336 }
337 '/' => {
338 flush_text(&mut document, &mut current_text);
339 let mut ipa_content = String::new();
340 let mut found_slash = false;
341 while let Some(&next_c) = chars.peek() {
342 if next_c == '/' {
343 chars.next();
344 found_slash = true;
345 break;
346 }
347 if next_c == ' ' || next_c == '\n' || next_c == '\r' || next_c == '\t' {
348 break;
349 }
350 chars.next();
351 ipa_content.push(next_c);
352 }
353 if found_slash && !ipa_content.is_empty() {
354 let mut node = AstNode::new(NodeType::BareIpa, "ipa".to_string());
355 node = node.with_attribute("alphabet", "ipa");
356 node = node.with_attribute("ph", ipa_content.trim().to_string());
357 document = document.add_child(node);
358 } else if found_slash {
359 current_text.push('/');
360 current_text.push('/');
361 } else {
362 current_text.push('/');
363 current_text.push_str(&ipa_content);
364 }
365 }
366 '{' => {
367 flush_text(&mut document, &mut current_text);
368 let (sub_text, found_brace) = Self::read_until(&mut chars, '}');
369 if found_brace && !sub_text.is_empty() {
370 let mut alias_text = String::new();
371 while let Some(&next_c) = chars.peek() {
372 if next_c.is_whitespace()
373 || next_c == '('
374 || next_c == '['
375 || next_c == '+'
376 || next_c == '~'
377 || next_c == '!'
378 || next_c == '/'
379 || next_c == '{'
380 || next_c == '}'
381 || next_c == '#'
382 {
383 break;
384 }
385 chars.next();
386 alias_text.push(next_c);
387 }
388 let mut node = AstNode::new(NodeType::ShortSub, sub_text);
389 if !alias_text.is_empty() {
390 node = node.with_attribute("alias", alias_text);
391 }
392 document = document.add_child(node);
393 } else {
394 current_text.push('{');
395 current_text.push_str(&sub_text);
396 }
397 }
398 '!' => {
399 if chars.peek() == Some(&'[') {
400 flush_text(&mut document, &mut current_text);
401 chars.next();
402 let (caption, found_caption_end) = Self::read_until(&mut chars, ']');
403
404 if found_caption_end && chars.peek() == Some(&'(') {
405 chars.next();
406 let (url, found_url_end) = Self::read_until(&mut chars, ')');
407 if found_url_end {
408 let mut node = AstNode::new(NodeType::Audio, caption);
409 node = node.with_attribute("src", Self::strip_quotes(&url));
410 document = document.add_child(node);
411 } else {
412 current_text.push_str(&format!("![{}]", caption));
413 }
414 } else if found_caption_end && chars.peek() == Some(&'[') {
415 chars.next();
416 let (url, found_url_end) = Self::read_until(&mut chars, ']');
417 if found_url_end {
418 let mut node = AstNode::new(NodeType::Audio, caption);
419 node = node.with_attribute("src", Self::strip_quotes(&url));
420 document = document.add_child(node);
421 } else {
422 current_text.push_str(&format!("![{}]", caption));
423 }
424 } else if found_caption_end {
425 let possible_url = Self::strip_quotes(&caption);
426 if possible_url.starts_with("http://")
427 || possible_url.starts_with("https://")
428 || possible_url.starts_with("soundbank://")
429 || possible_url.contains("://")
430 || possible_url.contains('.')
431 {
432 let mut node = AstNode::new(NodeType::Audio, String::new());
433 node = node.with_attribute("src", possible_url);
434 document = document.add_child(node);
435 } else {
436 current_text.push_str(&format!("![{}]", caption));
437 }
438 } else {
439 current_text.push_str(&format!("![{}", caption));
440 }
441 } else if chars.peek() == Some(&'(') {
442 flush_text(&mut document, &mut current_text);
443 chars.next();
444 let (caption, found_caption_end) = Self::read_until(&mut chars, ')');
445 if found_caption_end && chars.peek() == Some(&'[') {
446 chars.next();
447 let (url, found_url_end) = Self::read_until(&mut chars, ']');
448 if found_url_end {
449 let mut node = AstNode::new(NodeType::Audio, caption);
450 node = node.with_attribute("src", Self::strip_quotes(&url));
451 document = document.add_child(node);
452 } else {
453 current_text.push_str(&format!("!({}[", caption));
454 }
455 } else {
456 current_text.push_str(&format!("!({}", caption));
457 }
458 } else {
459 current_text.push('!');
460 }
461 }
462 _ => {
463 current_text.push(c);
464 }
465 }
466 }
467
468 if !current_text.is_empty() {
469 document = document.add_child(AstNode::text(current_text));
470 }
471
472 Ok(document)
473 }
474
/// Trims surrounding whitespace from `s` and removes one pair of matching
/// quotes (`"..."` or `'...'`) if the trimmed text is wrapped in them.
///
/// The text between the quotes is returned as-is; input without a matching
/// pair of quotes is returned trimmed but otherwise unchanged.
fn strip_quotes(s: &str) -> &str {
    let trimmed = s.trim();
    for quote in ['"', '\''] {
        let unwrapped = trimmed
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote));
        if let Some(inner) = unwrapped {
            return inner;
        }
    }
    trimmed
}
486
/// Returns `true` when `s` is a duration such as `2s`, `500ms` or `1.5s`.
///
/// The value must be a non-negative decimal number (ASCII digits with an
/// optional fractional part) immediately followed by the unit `s` or `ms`.
/// Ordinary words that merely end in `s` are rejected, so bracketed text like
/// `[brackets]` is not mistaken for a break.
fn is_time_break(s: &str) -> bool {
    let all_digits = |part: &str| !part.is_empty() && part.bytes().all(|b| b.is_ascii_digit());

    // Try the longer unit first so that "500ms" is not read as "500m" + "s".
    s.strip_suffix("ms")
        .or_else(|| s.strip_suffix('s'))
        .is_some_and(|number| match number.split_once('.') {
            Some((whole, fraction)) => all_digits(whole) && all_digits(fraction),
            None => all_digits(number),
        })
}
490
/// Consumes characters from `chars` up to and including the first `end`.
///
/// Returns the characters read before `end` together with a flag that is
/// `true` when `end` was found and `false` when the input ran out first. The
/// terminator itself is consumed but not included in the returned text.
fn read_until(chars: &mut std::iter::Peekable<std::str::Chars>, end: char) -> (String, bool) {
    let mut collected = String::new();
    for ch in chars.by_ref() {
        if ch == end {
            return (collected, true);
        }
        collected.push(ch);
    }
    (collected, false)
}
504}
505
506#[cfg(test)]
507mod tests {
508 use super::*;
509
/// Plain text parses into a `Document` root with at least one child.
#[test]
fn test_parse_simple_text() {
    let parsed = SpeechMarkdownParser::parse("Hello world");
    assert!(parsed.is_ok());

    let document = parsed.unwrap();
    assert_eq!(document.node_type, NodeType::Document);
    assert!(!document.children.is_empty());
}
519
/// A bracketed duration such as `[2s]` must produce a `ShortBreak` node.
#[test]
fn test_parse_short_break() {
    let ast = SpeechMarkdownParser::parse("Sample [2s] text").expect("parsing should succeed");
    // Checking `is_ok()` alone proves nothing because the parser never fails;
    // assert on the node that the markup is expected to produce.
    assert!(ast
        .children
        .iter()
        .any(|child| matches!(child.node_type, NodeType::ShortBreak)));
}
525
/// Text wrapped in `++` must produce a `ShortEmphasisStrong` node.
#[test]
fn test_parse_emphasis_strong() {
    let ast =
        SpeechMarkdownParser::parse("++strong emphasis++").expect("parsing should succeed");
    // Checking `is_ok()` alone proves nothing because the parser never fails;
    // assert on the node that the markup is expected to produce.
    assert!(ast
        .children
        .iter()
        .any(|child| matches!(child.node_type, NodeType::ShortEmphasisStrong)));
}
531
/// `(text)[key:"value"]` must produce a `TextModifier` node.
#[test]
fn test_parse_text_modifier() {
    let ast = SpeechMarkdownParser::parse("(text)[voice:\"Kendra\"]")
        .expect("parsing should succeed");
    // Checking `is_ok()` alone proves nothing because the parser never fails;
    // assert on the node that the markup is expected to produce.
    assert!(ast
        .children
        .iter()
        .any(|child| matches!(child.node_type, NodeType::TextModifier)));
}
537
/// `![caption](src)` must produce an `Audio` node.
#[test]
fn test_parse_audio() {
    // The input must contain real audio markup; an empty string exercises nothing.
    let ast = SpeechMarkdownParser::parse("![caption](https://example.com/sound.mp3)")
        .expect("parsing should succeed");
    assert!(ast
        .children
        .iter()
        .any(|child| matches!(child.node_type, NodeType::Audio)));
}
543
/// Prints the AST produced for a `{sub}alias` substitution (debugging aid).
#[test]
fn test_debug_substitution() {
    let input = "{Al}aluminum";
    let parsed = SpeechMarkdownParser::parse(input);
    assert!(parsed.is_ok());

    let ast = parsed.unwrap();
    println!("=== Substitution Debug ===");
    println!("Input: {input}");
    println!("AST: {ast:?}");
    println!("Children: {:?}", ast.children);
    println!("========================");
}
557
/// Prints the SSML produced for strong emphasis on the Amazon Alexa platform
/// (debugging aid; makes no assertions).
#[test]
fn test_debug_emphasis_ssml() {
    let input = "++strong emphasis++";
    let platform = crate::formatters::base::Platform::AmazonAlexa;
    let ssml = SpeechMarkdownParser::to_ssml(input, platform);
    println!("=== Emphasis SSML Debug ===");
    println!("Input: {input}");
    println!("SSML Result: {ssml:?}");
    println!("==========================");
}
568
/// Plain or empty input is not Speech Markdown; every supported markup form is.
#[test]
fn test_is_speech_markdown() {
    assert!(!SpeechMarkdownParser::is_speech_markdown("Hello world"));
    assert!(!SpeechMarkdownParser::is_speech_markdown(""));
    assert!(SpeechMarkdownParser::is_speech_markdown("Hello (world)[emphasis:\"strong\"]"));
    assert!(SpeechMarkdownParser::is_speech_markdown("Sample [2s] text"));
    assert!(SpeechMarkdownParser::is_speech_markdown("++strong++"));
    assert!(SpeechMarkdownParser::is_speech_markdown("~word~"));
    assert!(SpeechMarkdownParser::is_speech_markdown("{Al}aluminum"));
    // Audio markup; an empty string here would contradict the assertion above.
    assert!(SpeechMarkdownParser::is_speech_markdown(
        "![caption](https://example.com/sound.mp3)"
    ));
}
580
/// `validate` accepts plain text as well as each sampled markup form.
#[test]
fn test_validate() {
    let samples = [
        "Hello world",
        "Hello (world)[emphasis:\"strong\"]",
        "Sample [2s] text",
        "++strong++",
    ];
    for sample in samples {
        assert!(SpeechMarkdownParser::validate(sample).is_ok());
    }
}
588}