// speechmarkdown_rust/parser/parser.rs
use crate::ast::{AstNode, NodeType};
2use crate::error::Result;
3use crate::formatters::base::{FormatterOptions, Platform};
4use crate::formatters::{create_formatter, Formatter, TextFormatter};
5use crate::ssml_to_smd;
6
/// Stateless entry point for working with Speech Markdown.
///
/// The type carries no data; every operation is an associated function
/// (`SpeechMarkdownParser::parse(..)`, `SpeechMarkdownParser::to_ssml(..)`, ...).
/// The derives make the marker type printable and freely copyable so it can be
/// embedded in other `Debug`/`Clone`/`Default` types without ceremony.
#[derive(Debug, Clone, Copy, Default)]
pub struct SpeechMarkdownParser;
8
9impl SpeechMarkdownParser {
10 pub fn parse(input: &str) -> Result<AstNode> {
12 Self::parse_simple(input)
13 }
14
15 pub fn to_text(input: &str) -> Result<String> {
17 let ast = Self::parse(input)?;
18 let formatter = TextFormatter::new();
19 formatter.format(&ast)
20 }
21
22 pub fn to_ssml(input: &str, platform: Platform) -> Result<String> {
24 let ast = Self::parse(input)?;
25 let options = FormatterOptions {
26 platform,
27 ..Default::default()
28 };
29 let formatter = create_formatter(platform, options);
30 formatter.format(&ast)
31 }
32
33 pub fn to_smd(ssml: &str) -> Result<String> {
35 ssml_to_smd::ssml_to_smd(ssml)
36 }
37
38 pub fn is_speech_markdown(input: &str) -> bool {
40 if let Ok(ast) = Self::parse(input) {
41 ast.children.iter().any(|child| {
42 !matches!(
43 child.node_type,
44 NodeType::Document | NodeType::PlainText | NodeType::EmptyLine
45 )
46 })
47 } else {
48 false
49 }
50 }
51
52 pub fn validate(input: &str) -> Result<()> {
54 Self::parse(input)?;
55 Ok(())
56 }
57
58 fn parse_simple(input: &str) -> Result<AstNode> {
60 let mut document = AstNode::document();
61 let mut current_text = String::new();
62 let mut chars = input.chars().peekable();
63
64 let flush_text = |doc: &mut AstNode, text: &mut String| {
65 if !text.is_empty() {
66 let node = AstNode::text(text.clone());
67 text.clear();
68 doc.children.push(node);
69 }
70 };
71
72 while let Some(c) = chars.next() {
73 match c {
74 '#' if chars.peek() == Some(&'[') => {
75 flush_text(&mut document, &mut current_text);
76 chars.next();
77 let (section_content, found) = Self::read_until(&mut chars, ']');
78 if found {
79 let mut node = AstNode::new(NodeType::Section, section_content.clone());
80 for modifier in section_content.split(';') {
81 if let Some((key, value)) = modifier.split_once(':') {
82 node = node
83 .with_attribute(key.trim(), Self::strip_quotes(value.trim()));
84 } else {
85 node = node.with_attribute("style", modifier.trim());
86 }
87 }
88 document = document.add_child(node);
89 } else {
90 current_text.push('#');
91 current_text.push('[');
92 current_text.push_str(§ion_content);
93 }
94 }
95 '[' => {
96 flush_text(&mut document, &mut current_text);
97 let (bracket_content, found) = Self::read_until(&mut chars, ']');
98 if found {
99 if let Some(rest) = bracket_content.strip_prefix("break:") {
100 let break_value = Self::strip_quotes(rest.trim());
101 if Self::is_time_break(break_value) {
102 document = document.add_child(AstNode::new(
103 NodeType::ShortBreak,
104 format!("[{}]", break_value),
105 ));
106 } else {
107 let mut node =
108 AstNode::new(NodeType::Break, break_value.to_string());
109 node = node.with_attribute("strength", break_value);
110 document = document.add_child(node);
111 }
112 } else if let Some(rest) = bracket_content.strip_prefix("mark:") {
113 let mark_value = Self::strip_quotes(rest.trim());
114 document = document
115 .add_child(AstNode::new(NodeType::Mark, mark_value.to_string()));
116 } else if Self::is_time_break(&bracket_content) {
117 document = document.add_child(AstNode::new(
118 NodeType::ShortBreak,
119 format!("[{}]", bracket_content),
120 ));
121 } else {
122 current_text.push('[');
123 current_text.push_str(&bracket_content);
124 current_text.push(']');
125 }
126 } else {
127 current_text.push('[');
128 current_text.push_str(&bracket_content);
129 }
130 }
131 '~' => {
132 let prev_is_boundary = current_text.is_empty()
133 || current_text.ends_with(|c: char| c.is_whitespace());
134 if !prev_is_boundary {
135 current_text.push('~');
136 } else {
137 flush_text(&mut document, &mut current_text);
138 let mut emphasized_text = String::new();
139 let mut found_end = false;
140 while let Some(&next_c) = chars.peek() {
141 chars.next();
142 if next_c == '~' {
143 found_end = true;
144 break;
145 }
146 emphasized_text.push(next_c);
147 }
148 if found_end
149 && !emphasized_text.is_empty()
150 && !emphasized_text.contains(' ')
151 {
152 document = document.add_child(AstNode::new(
153 NodeType::ShortEmphasisNone,
154 emphasized_text,
155 ));
156 } else {
157 current_text.push('~');
158 current_text.push_str(&emphasized_text);
159 if found_end {
160 current_text.push('~');
161 }
162 }
163 }
164 }
165 '-' => {
166 let prev_is_boundary = current_text.is_empty()
167 || current_text.ends_with(|c: char| c.is_whitespace());
168 if !prev_is_boundary {
169 current_text.push('-');
170 } else {
171 flush_text(&mut document, &mut current_text);
172 let mut emphasized_text = String::new();
173 let mut found_end = false;
174 while let Some(&next_c) = chars.peek() {
175 chars.next();
176 if next_c == '\n' || next_c == '\r' {
177 emphasized_text.push(next_c);
178 break;
179 }
180 if next_c == '-' {
181 let next_is_boundary =
182 chars.peek().is_none_or(|c| c.is_whitespace());
183 if next_is_boundary {
184 found_end = true;
185 break;
186 } else {
187 emphasized_text.push('-');
188 }
189 } else {
190 emphasized_text.push(next_c);
191 }
192 }
193 if found_end
194 && !emphasized_text.is_empty()
195 && !emphasized_text.contains(' ')
196 {
197 document = document.add_child(AstNode::new(
198 NodeType::ShortEmphasisReduced,
199 emphasized_text,
200 ));
201 } else {
202 current_text.push('-');
203 current_text.push_str(&emphasized_text);
204 if found_end {
205 current_text.push('-');
206 }
207 }
208 }
209 }
210 '+' => {
211 flush_text(&mut document, &mut current_text);
212 let mut plus_count = 1;
213 while chars.peek() == Some(&'+') {
214 chars.next();
215 plus_count += 1;
216 }
217 let mut emphasized_text = String::new();
218 let mut found_end = false;
219 while let Some(&next_c) = chars.peek() {
220 if next_c == '+' {
221 let mut closing_pluses = 0;
222 while chars.peek() == Some(&'+') {
223 chars.next();
224 closing_pluses += 1;
225 }
226 if closing_pluses == plus_count {
227 found_end = true;
228 break;
229 } else {
230 for _ in 0..closing_pluses {
231 emphasized_text.push('+');
232 }
233 }
234 } else {
235 chars.next();
236 emphasized_text.push(next_c);
237 }
238 }
239 if found_end {
240 let node_type = if plus_count >= 2 {
241 NodeType::ShortEmphasisStrong
242 } else {
243 NodeType::ShortEmphasisModerate
244 };
245 document = document.add_child(AstNode::new(node_type, emphasized_text));
246 } else {
247 for _ in 0..plus_count {
248 current_text.push('+');
249 }
250 current_text.push_str(&emphasized_text);
251 }
252 }
253 '(' => {
254 flush_text(&mut document, &mut current_text);
255 let mut modifier_content = String::new();
256 let mut found_closing_paren = false;
257 while let Some(&next_c) = chars.peek() {
258 chars.next();
259 if next_c == ')' {
260 found_closing_paren = true;
261 break;
262 }
263 modifier_content.push(next_c);
264 }
265
266 if found_closing_paren {
267 if chars.peek() == Some(&'[') {
268 chars.next();
269 let (modifiers, found_bracket) = Self::read_until(&mut chars, ']');
270 if found_bracket {
271 let mut node =
272 AstNode::new(NodeType::TextModifier, modifier_content);
273 for modifier in modifiers.split(';') {
274 if let Some((key, value)) = modifier.split_once(':') {
275 node = node.with_attribute(
276 key.trim(),
277 Self::strip_quotes(value.trim()),
278 );
279 } else {
280 let key = modifier.trim();
281 if !key.is_empty() {
282 node = node.with_attribute(key, "");
283 }
284 }
285 }
286 document = document.add_child(node);
287 } else {
288 current_text.push('(');
289 current_text.push_str(&modifier_content);
290 current_text.push(')');
291 current_text.push('[');
292 current_text.push_str(&modifiers);
293 }
294 } else if chars.peek() == Some(&'{') {
295 chars.next();
296 let (alias_text, found_brace) = Self::read_until(&mut chars, '}');
297 if found_brace {
298 let mut node = AstNode::new(NodeType::ShortSub, modifier_content);
299 if !alias_text.is_empty() {
300 node = node.with_attribute("alias", alias_text);
301 }
302 document = document.add_child(node);
303 } else {
304 current_text.push('(');
305 current_text.push_str(&modifier_content);
306 current_text.push(')');
307 current_text.push('{');
308 current_text.push_str(&alias_text);
309 }
310 } else if chars.peek() == Some(&'/') {
311 chars.next();
312 let mut phoneme = String::new();
313 let mut found_slash = false;
314 while let Some(&next_c) = chars.peek() {
315 chars.next();
316 if next_c == '/' {
317 found_slash = true;
318 break;
319 }
320 phoneme.push(next_c);
321 }
322 if found_slash {
323 let mut node = AstNode::new(NodeType::ShortIpa, modifier_content);
324 node = node.with_attribute("phoneme", phoneme);
325 document = document.add_child(node);
326 } else {
327 current_text.push('(');
328 current_text.push_str(&modifier_content);
329 current_text.push(')');
330 current_text.push('/');
331 current_text.push_str(&phoneme);
332 }
333 } else {
334 current_text.push('(');
335 current_text.push_str(&modifier_content);
336 current_text.push(')');
337 }
338 } else {
339 current_text.push('(');
340 current_text.push_str(&modifier_content);
341 }
342 }
343 '/' => {
344 flush_text(&mut document, &mut current_text);
345 let mut ipa_content = String::new();
346 let mut found_slash = false;
347 while let Some(&next_c) = chars.peek() {
348 if next_c == '/' {
349 chars.next();
350 found_slash = true;
351 break;
352 }
353 if next_c == ' ' || next_c == '\n' || next_c == '\r' || next_c == '\t' {
354 break;
355 }
356 chars.next();
357 ipa_content.push(next_c);
358 }
359 if found_slash && !ipa_content.is_empty() {
360 let mut node = AstNode::new(NodeType::BareIpa, "ipa".to_string());
361 node = node.with_attribute("alphabet", "ipa");
362 node = node.with_attribute("ph", ipa_content.trim().to_string());
363 document = document.add_child(node);
364 } else if found_slash {
365 current_text.push('/');
366 current_text.push('/');
367 } else {
368 current_text.push('/');
369 current_text.push_str(&ipa_content);
370 }
371 }
372 '{' => {
373 flush_text(&mut document, &mut current_text);
374 let (sub_text, found_brace) = Self::read_until(&mut chars, '}');
375 if found_brace && !sub_text.is_empty() {
376 let mut alias_text = String::new();
377 while let Some(&next_c) = chars.peek() {
378 if next_c.is_whitespace()
379 || next_c == '('
380 || next_c == '['
381 || next_c == '+'
382 || next_c == '~'
383 || next_c == '!'
384 || next_c == '/'
385 || next_c == '{'
386 || next_c == '}'
387 || next_c == '#'
388 {
389 break;
390 }
391 chars.next();
392 alias_text.push(next_c);
393 }
394 let mut node = AstNode::new(NodeType::ShortSub, sub_text);
395 if !alias_text.is_empty() {
396 node = node.with_attribute("alias", alias_text);
397 }
398 document = document.add_child(node);
399 } else {
400 current_text.push('{');
401 current_text.push_str(&sub_text);
402 }
403 }
404 '!' => {
405 if chars.peek() == Some(&'[') {
406 flush_text(&mut document, &mut current_text);
407 chars.next();
408 let (caption, found_caption_end) = Self::read_until(&mut chars, ']');
409
410 if found_caption_end && chars.peek() == Some(&'(') {
411 chars.next();
412 let (url, found_url_end) = Self::read_until(&mut chars, ')');
413 if found_url_end {
414 let mut node = AstNode::new(NodeType::Audio, caption);
415 node = node.with_attribute("src", Self::strip_quotes(&url));
416 document = document.add_child(node);
417 } else {
418 current_text.push_str(&format!("![{}]", caption));
419 }
420 } else if found_caption_end && chars.peek() == Some(&'[') {
421 chars.next();
422 let (url, found_url_end) = Self::read_until(&mut chars, ']');
423 if found_url_end {
424 let mut node = AstNode::new(NodeType::Audio, caption);
425 node = node.with_attribute("src", Self::strip_quotes(&url));
426 document = document.add_child(node);
427 } else {
428 current_text.push_str(&format!("![{}]", caption));
429 }
430 } else if found_caption_end {
431 let possible_url = Self::strip_quotes(&caption);
432 if possible_url.starts_with("http://")
433 || possible_url.starts_with("https://")
434 || possible_url.starts_with("soundbank://")
435 || possible_url.contains("://")
436 || possible_url.contains('.')
437 {
438 let mut node = AstNode::new(NodeType::Audio, String::new());
439 node = node.with_attribute("src", possible_url);
440 document = document.add_child(node);
441 } else {
442 current_text.push_str(&format!("![{}]", caption));
443 }
444 } else {
445 current_text.push_str(&format!("![{}", caption));
446 }
447 } else if chars.peek() == Some(&'(') {
448 flush_text(&mut document, &mut current_text);
449 chars.next();
450 let (caption, found_caption_end) = Self::read_until(&mut chars, ')');
451 if found_caption_end && chars.peek() == Some(&'[') {
452 chars.next();
453 let (url, found_url_end) = Self::read_until(&mut chars, ']');
454 if found_url_end {
455 let mut node = AstNode::new(NodeType::Audio, caption);
456 node = node.with_attribute("src", Self::strip_quotes(&url));
457 document = document.add_child(node);
458 } else {
459 current_text.push_str(&format!("!({}[", caption));
460 }
461 } else {
462 current_text.push_str(&format!("!({}", caption));
463 }
464 } else {
465 current_text.push('!');
466 }
467 }
468 _ => {
469 current_text.push(c);
470 }
471 }
472 }
473
474 if !current_text.is_empty() {
475 document = document.add_child(AstNode::text(current_text));
476 }
477
478 Ok(document)
479 }
480
/// Trims surrounding whitespace from `s` and removes one pair of matching
/// quotes (`"..."` or `'...'`) if the trimmed value is wrapped in them.
///
/// Values that are unquoted, quoted on one side only, or wrapped in
/// mismatched quote characters are returned trimmed but otherwise unchanged.
fn strip_quotes(s: &str) -> &str {
    let trimmed = s.trim();
    // `strip_suffix` runs on what is left after the opening quote, so a lone
    // quote character is never treated as a complete pair.
    let double_quoted = trimmed
        .strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'));
    let single_quoted = trimmed
        .strip_prefix('\'')
        .and_then(|rest| rest.strip_suffix('\''));
    double_quoted.or(single_quoted).unwrap_or(trimmed)
}
492
/// Returns `true` when `s` is a duration: a non-negative decimal number
/// immediately followed by the unit `s` or `ms` (for example `2s`, `1.5s`,
/// `500ms`).
///
/// Merely ending in `s` is not enough: ordinary bracketed words such as
/// `yes` or `brackets` are not durations and must not be turned into breaks.
fn is_time_break(s: &str) -> bool {
    // Try "ms" first: every "ms" value also ends in "s".
    let Some(amount) = s.strip_suffix("ms").or_else(|| s.strip_suffix('s')) else {
        return false;
    };

    let mut seen_digit = false;
    let mut seen_dot = false;
    for ch in amount.chars() {
        match ch {
            '0'..='9' => seen_digit = true,
            // At most one decimal point is allowed.
            '.' if !seen_dot => seen_dot = true,
            _ => return false,
        }
    }
    // Rejects a bare unit ("s", "ms") and a lone ".".
    seen_digit
}
496
/// Consumes characters from `chars` up to and including the first `end`.
///
/// Returns the characters read before `end` together with a flag telling
/// whether `end` was actually found. When it was not, the iterator has been
/// drained and the returned string holds everything that was left in it.
fn read_until(chars: &mut std::iter::Peekable<std::str::Chars>, end: char) -> (String, bool) {
    let mut collected = String::new();
    for ch in chars.by_ref() {
        if ch == end {
            // The terminator itself is consumed but not returned.
            return (collected, true);
        }
        collected.push(ch);
    }
    (collected, false)
}
510}
511
#[cfg(test)]
mod tests {
    use super::*;

    /// Audio markup used by the audio-related tests.
    const AUDIO_INPUT: &str = "![chime](https://example.com/chime.mp3)";

    /// Returns `true` when a direct child of `ast` has the given node type.
    fn has_node(ast: &AstNode, kind: NodeType) -> bool {
        ast.children.iter().any(|child| child.node_type == kind)
    }

    #[test]
    fn test_parse_simple_text() {
        let result = SpeechMarkdownParser::parse("Hello world");
        assert!(result.is_ok());

        let ast = result.unwrap();
        assert_eq!(ast.node_type, NodeType::Document);
        assert!(!ast.children.is_empty());
    }

    #[test]
    fn test_parse_short_break() {
        let ast = SpeechMarkdownParser::parse("Sample [2s] text").expect("input should parse");
        assert!(has_node(&ast, NodeType::ShortBreak));
    }

    #[test]
    fn test_parse_emphasis_strong() {
        let ast = SpeechMarkdownParser::parse("++strong emphasis++").expect("input should parse");
        assert!(has_node(&ast, NodeType::ShortEmphasisStrong));
    }

    #[test]
    fn test_parse_text_modifier() {
        let ast =
            SpeechMarkdownParser::parse("(text)[voice:\"Kendra\"]").expect("input should parse");
        assert!(has_node(&ast, NodeType::TextModifier));
    }

    #[test]
    fn test_parse_audio() {
        let ast = SpeechMarkdownParser::parse(AUDIO_INPUT).expect("input should parse");
        assert!(has_node(&ast, NodeType::Audio));
    }

    #[test]
    fn test_debug_substitution() {
        let input = "{Al}aluminum";
        let result = SpeechMarkdownParser::parse(input);
        assert!(result.is_ok());

        let ast = result.unwrap();
        assert!(has_node(&ast, NodeType::ShortSub));

        // Output is only visible with `cargo test -- --nocapture`.
        println!("=== Substitution Debug ===");
        println!("Input: {}", input);
        println!("AST: {:?}", ast);
        println!("Children: {:?}", ast.children);
        println!("========================");
    }

    #[test]
    fn test_debug_emphasis_ssml() {
        let input = "++strong emphasis++";
        let result =
            SpeechMarkdownParser::to_ssml(input, crate::formatters::base::Platform::AmazonAlexa);

        // Output is only visible with `cargo test -- --nocapture`.
        println!("=== Emphasis SSML Debug ===");
        println!("Input: {}", input);
        println!("SSML Result: {:?}", result);
        println!("==========================");
    }

    #[test]
    fn test_is_speech_markdown() {
        // Plain text and empty input contain no markup.
        assert!(!SpeechMarkdownParser::is_speech_markdown("Hello world"));
        assert!(!SpeechMarkdownParser::is_speech_markdown(""));

        // One representative input per construct.
        assert!(SpeechMarkdownParser::is_speech_markdown("Hello (world)[emphasis:\"strong\"]"));
        assert!(SpeechMarkdownParser::is_speech_markdown("Sample [2s] text"));
        assert!(SpeechMarkdownParser::is_speech_markdown("++strong++"));
        assert!(SpeechMarkdownParser::is_speech_markdown("~word~"));
        assert!(SpeechMarkdownParser::is_speech_markdown("{Al}aluminum"));
        assert!(SpeechMarkdownParser::is_speech_markdown(AUDIO_INPUT));
    }

    #[test]
    fn test_validate() {
        assert!(SpeechMarkdownParser::validate("Hello world").is_ok());
        assert!(SpeechMarkdownParser::validate("Hello (world)[emphasis:\"strong\"]").is_ok());
        assert!(SpeechMarkdownParser::validate("Sample [2s] text").is_ok());
        assert!(SpeechMarkdownParser::validate("++strong++").is_ok());
    }
}