speechmarkdown_rust/parser/
parser.rs1use crate::ast::{AstNode, NodeType};
2use crate::capabilities::PlatformCapabilities;
3use crate::error::Result;
4use crate::formatters::base::{FormatterOptions, Platform};
5use crate::formatters::{create_formatter, Formatter, TextFormatter};
6use crate::ssml_to_smd;
7
/// Stateless entry point for Speech Markdown parsing and conversion.
///
/// This is a unit struct: it holds no data, and every operation is exposed as an
/// associated function (none of them take `self`).
pub struct SpeechMarkdownParser;
9
10impl SpeechMarkdownParser {
11 pub fn parse(input: &str) -> Result<AstNode> {
13 Self::parse_simple(input)
14 }
15
16 pub fn to_text(input: &str) -> Result<String> {
18 let ast = Self::parse(input)?;
19 let formatter = TextFormatter::new();
20 formatter.format(&ast)
21 }
22
23 pub fn to_ssml(input: &str, platform: Platform) -> Result<String> {
25 let ast = Self::parse(input)?;
26 let options = FormatterOptions {
27 platform,
28 ..Default::default()
29 };
30 let formatter = create_formatter(platform, options);
31 formatter.format(&ast)
32 }
33
34 pub fn to_smd(ssml: &str) -> Result<String> {
36 ssml_to_smd::ssml_to_smd(ssml)
37 }
38
39 pub fn supported_ssml(platform: Platform) -> PlatformCapabilities {
41 crate::capabilities::get_supported_ssml(platform)
42 }
43
44 pub fn is_speech_markdown(input: &str) -> bool {
46 if let Ok(ast) = Self::parse(input) {
47 ast.children.iter().any(|child| {
48 !matches!(
49 child.node_type,
50 NodeType::Document | NodeType::PlainText | NodeType::EmptyLine
51 )
52 })
53 } else {
54 false
55 }
56 }
57
58 pub fn validate(input: &str) -> Result<()> {
60 Self::parse(input)?;
61 Ok(())
62 }
63
64 fn parse_simple(input: &str) -> Result<AstNode> {
66 let mut document = AstNode::document();
67 let mut current_text = String::new();
68 let mut chars = input.chars().peekable();
69
70 let flush_text = |doc: &mut AstNode, text: &mut String| {
71 if !text.is_empty() {
72 let node = AstNode::text(text.clone());
73 text.clear();
74 doc.children.push(node);
75 }
76 };
77
78 while let Some(c) = chars.next() {
79 match c {
80 '#' if chars.peek() == Some(&'[') => {
81 flush_text(&mut document, &mut current_text);
82 chars.next();
83 let (section_content, found) = Self::read_until(&mut chars, ']');
84 if found {
85 let mut node = AstNode::new(NodeType::Section, section_content.clone());
86 for modifier in section_content.split(';') {
87 if let Some((key, value)) = modifier.split_once(':') {
88 node = node
89 .with_attribute(key.trim(), Self::strip_quotes(value.trim()));
90 } else {
91 node = node.with_attribute("style", modifier.trim());
92 }
93 }
94 document = document.add_child(node);
95 } else {
96 current_text.push('#');
97 current_text.push('[');
98 current_text.push_str(§ion_content);
99 }
100 }
101 '[' => {
102 flush_text(&mut document, &mut current_text);
103 let (bracket_content, found) = Self::read_until(&mut chars, ']');
104 if found {
105 if let Some(rest) = bracket_content.strip_prefix("break:") {
106 let break_value = Self::strip_quotes(rest.trim());
107 if Self::is_time_break(break_value) {
108 document = document.add_child(AstNode::new(
109 NodeType::ShortBreak,
110 format!("[{}]", break_value),
111 ));
112 } else {
113 let mut node =
114 AstNode::new(NodeType::Break, break_value.to_string());
115 node = node.with_attribute("strength", break_value);
116 document = document.add_child(node);
117 }
118 } else if let Some(rest) = bracket_content.strip_prefix("mark:") {
119 let mark_value = Self::strip_quotes(rest.trim());
120 document = document
121 .add_child(AstNode::new(NodeType::Mark, mark_value.to_string()));
122 } else if Self::is_time_break(&bracket_content) {
123 document = document.add_child(AstNode::new(
124 NodeType::ShortBreak,
125 format!("[{}]", bracket_content),
126 ));
127 } else {
128 current_text.push('[');
129 current_text.push_str(&bracket_content);
130 current_text.push(']');
131 }
132 } else {
133 current_text.push('[');
134 current_text.push_str(&bracket_content);
135 }
136 }
137 '~' => {
138 let prev_is_boundary = current_text.is_empty()
139 || current_text.ends_with(|c: char| c.is_whitespace());
140 if !prev_is_boundary {
141 current_text.push('~');
142 } else {
143 flush_text(&mut document, &mut current_text);
144 let mut emphasized_text = String::new();
145 let mut found_end = false;
146 while let Some(&next_c) = chars.peek() {
147 chars.next();
148 if next_c == '~' {
149 found_end = true;
150 break;
151 }
152 emphasized_text.push(next_c);
153 }
154 if found_end
155 && !emphasized_text.is_empty()
156 && !emphasized_text.contains(' ')
157 {
158 document = document.add_child(AstNode::new(
159 NodeType::ShortEmphasisNone,
160 emphasized_text,
161 ));
162 } else {
163 current_text.push('~');
164 current_text.push_str(&emphasized_text);
165 if found_end {
166 current_text.push('~');
167 }
168 }
169 }
170 }
171 '-' => {
172 let prev_is_boundary = current_text.is_empty()
173 || current_text.ends_with(|c: char| c.is_whitespace());
174 if !prev_is_boundary {
175 current_text.push('-');
176 } else {
177 flush_text(&mut document, &mut current_text);
178 let mut emphasized_text = String::new();
179 let mut found_end = false;
180 while let Some(&next_c) = chars.peek() {
181 chars.next();
182 if next_c == '\n' || next_c == '\r' {
183 emphasized_text.push(next_c);
184 break;
185 }
186 if next_c == '-' {
187 let next_is_boundary =
188 chars.peek().is_none_or(|c| c.is_whitespace());
189 if next_is_boundary {
190 found_end = true;
191 break;
192 } else {
193 emphasized_text.push('-');
194 }
195 } else {
196 emphasized_text.push(next_c);
197 }
198 }
199 if found_end
200 && !emphasized_text.is_empty()
201 && !emphasized_text.contains(' ')
202 {
203 document = document.add_child(AstNode::new(
204 NodeType::ShortEmphasisReduced,
205 emphasized_text,
206 ));
207 } else {
208 current_text.push('-');
209 current_text.push_str(&emphasized_text);
210 if found_end {
211 current_text.push('-');
212 }
213 }
214 }
215 }
216 '+' => {
217 flush_text(&mut document, &mut current_text);
218 let mut plus_count = 1;
219 while chars.peek() == Some(&'+') {
220 chars.next();
221 plus_count += 1;
222 }
223 let mut emphasized_text = String::new();
224 let mut found_end = false;
225 while let Some(&next_c) = chars.peek() {
226 if next_c == '+' {
227 let mut closing_pluses = 0;
228 while chars.peek() == Some(&'+') {
229 chars.next();
230 closing_pluses += 1;
231 }
232 if closing_pluses == plus_count {
233 found_end = true;
234 break;
235 } else {
236 for _ in 0..closing_pluses {
237 emphasized_text.push('+');
238 }
239 }
240 } else {
241 chars.next();
242 emphasized_text.push(next_c);
243 }
244 }
245 if found_end {
246 let node_type = if plus_count >= 2 {
247 NodeType::ShortEmphasisStrong
248 } else {
249 NodeType::ShortEmphasisModerate
250 };
251 document = document.add_child(AstNode::new(node_type, emphasized_text));
252 } else {
253 for _ in 0..plus_count {
254 current_text.push('+');
255 }
256 current_text.push_str(&emphasized_text);
257 }
258 }
259 '(' => {
260 flush_text(&mut document, &mut current_text);
261 let mut modifier_content = String::new();
262 let mut found_closing_paren = false;
263 while let Some(&next_c) = chars.peek() {
264 chars.next();
265 if next_c == ')' {
266 found_closing_paren = true;
267 break;
268 }
269 modifier_content.push(next_c);
270 }
271
272 if found_closing_paren {
273 if chars.peek() == Some(&'[') {
274 chars.next();
275 let (modifiers, found_bracket) = Self::read_until(&mut chars, ']');
276 if found_bracket {
277 let mut node =
278 AstNode::new(NodeType::TextModifier, modifier_content);
279 for modifier in modifiers.split(';') {
280 if let Some((key, value)) = modifier.split_once(':') {
281 node = node.with_attribute(
282 key.trim(),
283 Self::strip_quotes(value.trim()),
284 );
285 } else {
286 let key = modifier.trim();
287 if !key.is_empty() {
288 node = node.with_attribute(key, "");
289 }
290 }
291 }
292 document = document.add_child(node);
293 } else {
294 current_text.push('(');
295 current_text.push_str(&modifier_content);
296 current_text.push(')');
297 current_text.push('[');
298 current_text.push_str(&modifiers);
299 }
300 } else if chars.peek() == Some(&'{') {
301 chars.next();
302 let (alias_text, found_brace) = Self::read_until(&mut chars, '}');
303 if found_brace {
304 let mut node = AstNode::new(NodeType::ShortSub, modifier_content);
305 if !alias_text.is_empty() {
306 node = node.with_attribute("alias", alias_text);
307 }
308 document = document.add_child(node);
309 } else {
310 current_text.push('(');
311 current_text.push_str(&modifier_content);
312 current_text.push(')');
313 current_text.push('{');
314 current_text.push_str(&alias_text);
315 }
316 } else if chars.peek() == Some(&'/') {
317 chars.next();
318 let mut phoneme = String::new();
319 let mut found_slash = false;
320 while let Some(&next_c) = chars.peek() {
321 chars.next();
322 if next_c == '/' {
323 found_slash = true;
324 break;
325 }
326 phoneme.push(next_c);
327 }
328 if found_slash {
329 let mut node = AstNode::new(NodeType::ShortIpa, modifier_content);
330 node = node.with_attribute("phoneme", phoneme);
331 document = document.add_child(node);
332 } else {
333 current_text.push('(');
334 current_text.push_str(&modifier_content);
335 current_text.push(')');
336 current_text.push('/');
337 current_text.push_str(&phoneme);
338 }
339 } else {
340 current_text.push('(');
341 current_text.push_str(&modifier_content);
342 current_text.push(')');
343 }
344 } else {
345 current_text.push('(');
346 current_text.push_str(&modifier_content);
347 }
348 }
349 '/' => {
350 flush_text(&mut document, &mut current_text);
351 let mut ipa_content = String::new();
352 let mut found_slash = false;
353 while let Some(&next_c) = chars.peek() {
354 if next_c == '/' {
355 chars.next();
356 found_slash = true;
357 break;
358 }
359 if next_c == ' ' || next_c == '\n' || next_c == '\r' || next_c == '\t' {
360 break;
361 }
362 chars.next();
363 ipa_content.push(next_c);
364 }
365 if found_slash && !ipa_content.is_empty() {
366 let mut node = AstNode::new(NodeType::BareIpa, "ipa".to_string());
367 node = node.with_attribute("alphabet", "ipa");
368 node = node.with_attribute("ph", ipa_content.trim().to_string());
369 document = document.add_child(node);
370 } else if found_slash {
371 current_text.push('/');
372 current_text.push('/');
373 } else {
374 current_text.push('/');
375 current_text.push_str(&ipa_content);
376 }
377 }
378 '{' => {
379 flush_text(&mut document, &mut current_text);
380 let (sub_text, found_brace) = Self::read_until(&mut chars, '}');
381 if found_brace && !sub_text.is_empty() {
382 let mut alias_text = String::new();
383 while let Some(&next_c) = chars.peek() {
384 if next_c.is_whitespace()
385 || next_c == '('
386 || next_c == '['
387 || next_c == '+'
388 || next_c == '~'
389 || next_c == '!'
390 || next_c == '/'
391 || next_c == '{'
392 || next_c == '}'
393 || next_c == '#'
394 {
395 break;
396 }
397 chars.next();
398 alias_text.push(next_c);
399 }
400 let mut node = AstNode::new(NodeType::ShortSub, sub_text);
401 if !alias_text.is_empty() {
402 node = node.with_attribute("alias", alias_text);
403 }
404 document = document.add_child(node);
405 } else {
406 current_text.push('{');
407 current_text.push_str(&sub_text);
408 }
409 }
410 '!' => {
411 if chars.peek() == Some(&'[') {
412 flush_text(&mut document, &mut current_text);
413 chars.next();
414 let (caption, found_caption_end) = Self::read_until(&mut chars, ']');
415
416 if found_caption_end && chars.peek() == Some(&'(') {
417 chars.next();
418 let (url, found_url_end) = Self::read_until(&mut chars, ')');
419 if found_url_end {
420 let mut node = AstNode::new(NodeType::Audio, caption);
421 node = node.with_attribute("src", Self::strip_quotes(&url));
422 document = document.add_child(node);
423 } else {
424 current_text.push_str(&format!("![{}]", caption));
425 }
426 } else if found_caption_end && chars.peek() == Some(&'[') {
427 chars.next();
428 let (url, found_url_end) = Self::read_until(&mut chars, ']');
429 if found_url_end {
430 let mut node = AstNode::new(NodeType::Audio, caption);
431 node = node.with_attribute("src", Self::strip_quotes(&url));
432 document = document.add_child(node);
433 } else {
434 current_text.push_str(&format!("![{}]", caption));
435 }
436 } else if found_caption_end {
437 let possible_url = Self::strip_quotes(&caption);
438 if possible_url.starts_with("http://")
439 || possible_url.starts_with("https://")
440 || possible_url.starts_with("soundbank://")
441 || possible_url.contains("://")
442 || possible_url.contains('.')
443 {
444 let mut node = AstNode::new(NodeType::Audio, String::new());
445 node = node.with_attribute("src", possible_url);
446 document = document.add_child(node);
447 } else {
448 current_text.push_str(&format!("![{}]", caption));
449 }
450 } else {
451 current_text.push_str(&format!("![{}", caption));
452 }
453 } else if chars.peek() == Some(&'(') {
454 flush_text(&mut document, &mut current_text);
455 chars.next();
456 let (caption, found_caption_end) = Self::read_until(&mut chars, ')');
457 if found_caption_end && chars.peek() == Some(&'[') {
458 chars.next();
459 let (url, found_url_end) = Self::read_until(&mut chars, ']');
460 if found_url_end {
461 let mut node = AstNode::new(NodeType::Audio, caption);
462 node = node.with_attribute("src", Self::strip_quotes(&url));
463 document = document.add_child(node);
464 } else {
465 current_text.push_str(&format!("!({}[", caption));
466 }
467 } else {
468 current_text.push_str(&format!("!({}", caption));
469 }
470 } else {
471 current_text.push('!');
472 }
473 }
474 _ => {
475 current_text.push(c);
476 }
477 }
478 }
479
480 if !current_text.is_empty() {
481 document = document.add_child(AstNode::text(current_text));
482 }
483
484 Ok(document)
485 }
486
/// Trims surrounding whitespace and removes one pair of matching quotes, if present.
///
/// Both `"..."` and `'...'` are recognised. The opening and closing quote must be the
/// same character; a lone quote or a mismatched pair is returned as-is (trimmed).
/// Whitespace inside the quotes is preserved.
fn strip_quotes(s: &str) -> &str {
    let trimmed = s.trim();
    // `strip_prefix` consumes the opening quote first, so a single-character string
    // such as `"` can never match both ends.
    let unquote = |quote: char| {
        trimmed
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote))
    };
    unquote('"').or_else(|| unquote('\'')).unwrap_or(trimmed)
}
498
/// Returns `true` when `s` is a duration literal such as `2s`, `500ms` or `1.5s`.
///
/// The value must be ASCII digits, optionally followed by `.` and more digits, and
/// then the unit `s` or `ms`. Ordinary words that merely end in `s` (for example
/// `links`) are rejected, so bracketed text is not mistaken for a break.
fn is_time_break(s: &str) -> bool {
    let is_digits = |part: &str| !part.is_empty() && part.bytes().all(|b| b.is_ascii_digit());

    // Try the longer unit first so `500ms` is not read as `500m` + `s`.
    s.strip_suffix("ms")
        .or_else(|| s.strip_suffix('s'))
        .map_or(false, |number| match number.split_once('.') {
            Some((whole, fraction)) => is_digits(whole) && is_digits(fraction),
            None => is_digits(number),
        })
}
502
/// Consumes characters from `chars` up to and including the first `end`.
///
/// Returns the characters read before `end` (the delimiter itself is consumed but not
/// included) together with a flag telling whether `end` was found. When the input
/// runs out first, everything read is returned with `false` and `chars` is left
/// exhausted.
fn read_until(chars: &mut std::iter::Peekable<std::str::Chars>, end: char) -> (String, bool) {
    let mut content = String::new();
    // `by_ref` keeps the caller's iterator usable after the delimiter.
    for ch in chars.by_ref() {
        if ch == end {
            return (content, true);
        }
        content.push(ch);
    }
    (content, false)
}
516}
517
#[cfg(test)]
mod tests {
    use super::*;

    /// Sample audio markup in the `![caption](src)` form handled by the parser.
    const AUDIO_SAMPLE: &str = "![caption](https://example.com/sound.mp3)";

    /// Returns `true` when `ast` has a direct child of the given node type.
    fn has_child(ast: &AstNode, kind: NodeType) -> bool {
        ast.children.iter().any(|child| child.node_type == kind)
    }

    #[test]
    fn test_parse_simple_text() {
        let result = SpeechMarkdownParser::parse("Hello world");
        assert!(result.is_ok());

        let ast = result.unwrap();
        assert_eq!(ast.node_type, NodeType::Document);
        assert!(!ast.children.is_empty());
    }

    #[test]
    fn test_parse_short_break() {
        let ast = SpeechMarkdownParser::parse("Sample [2s] text").unwrap();
        assert!(has_child(&ast, NodeType::ShortBreak));
    }

    #[test]
    fn test_parse_emphasis_strong() {
        let ast = SpeechMarkdownParser::parse("++strong emphasis++").unwrap();
        assert!(has_child(&ast, NodeType::ShortEmphasisStrong));
    }

    #[test]
    fn test_parse_text_modifier() {
        let ast = SpeechMarkdownParser::parse("(text)[voice:\"Kendra\"]").unwrap();
        assert!(has_child(&ast, NodeType::TextModifier));
    }

    #[test]
    fn test_parse_audio() {
        // The input must actually contain audio markup; an empty string exercises nothing.
        let ast = SpeechMarkdownParser::parse(AUDIO_SAMPLE).unwrap();
        assert!(has_child(&ast, NodeType::Audio));
    }

    #[test]
    fn test_debug_substitution() {
        let input = "{Al}aluminum";
        let result = SpeechMarkdownParser::parse(input);
        assert!(result.is_ok());

        let ast = result.unwrap();
        println!("=== Substitution Debug ===");
        println!("Input: {}", input);
        println!("AST: {:?}", ast);
        println!("Children: {:?}", ast.children);
        println!("========================");
        assert!(has_child(&ast, NodeType::ShortSub));
    }

    #[test]
    fn test_debug_emphasis_ssml() {
        // Diagnostic only: prints the formatter output without asserting on it.
        let input = "++strong emphasis++";
        let result =
            SpeechMarkdownParser::to_ssml(input, crate::formatters::base::Platform::AmazonAlexa);
        println!("=== Emphasis SSML Debug ===");
        println!("Input: {}", input);
        println!("SSML Result: {:?}", result);
        println!("==========================");
    }

    #[test]
    fn test_is_speech_markdown() {
        assert!(!SpeechMarkdownParser::is_speech_markdown("Hello world"));
        assert!(!SpeechMarkdownParser::is_speech_markdown(""));
        assert!(SpeechMarkdownParser::is_speech_markdown("Hello (world)[emphasis:\"strong\"]"));
        assert!(SpeechMarkdownParser::is_speech_markdown("Sample [2s] text"));
        assert!(SpeechMarkdownParser::is_speech_markdown("++strong++"));
        assert!(SpeechMarkdownParser::is_speech_markdown("~word~"));
        assert!(SpeechMarkdownParser::is_speech_markdown("{Al}aluminum"));
        // Previously asserted on "" here, contradicting the negative assertion above.
        assert!(SpeechMarkdownParser::is_speech_markdown(AUDIO_SAMPLE));
    }

    #[test]
    fn test_validate() {
        assert!(SpeechMarkdownParser::validate("Hello world").is_ok());
        assert!(SpeechMarkdownParser::validate("Hello (world)[emphasis:\"strong\"]").is_ok());
        assert!(SpeechMarkdownParser::validate("Sample [2s] text").is_ok());
        assert!(SpeechMarkdownParser::validate("++strong++").is_ok());
    }
}