speechmarkdown_rust/parser/
parser.rs1use crate::ast::{AstNode, NodeType};
2use crate::error::Result;
3use crate::formatters::base::{FormatterOptions, Platform};
4use crate::formatters::{create_formatter, Formatter, TextFormatter};
5
/// Entry point for turning Speech Markdown source into an AST, plain text, or SSML.
///
/// The type carries no state; every operation is an associated function.
#[derive(Debug, Clone, Copy, Default)]
pub struct SpeechMarkdownParser;
7
8impl SpeechMarkdownParser {
9 pub fn parse(input: &str) -> Result<AstNode> {
11 Self::parse_simple(input)
12 }
13
14 pub fn to_text(input: &str) -> Result<String> {
16 let ast = Self::parse(input)?;
17 let formatter = TextFormatter::new();
18 formatter.format(&ast)
19 }
20
21 pub fn to_ssml(input: &str, platform: Platform) -> Result<String> {
23 let ast = Self::parse(input)?;
24 let options = FormatterOptions {
25 platform,
26 ..Default::default()
27 };
28 let formatter = create_formatter(platform, options);
29 formatter.format(&ast)
30 }
31
32 fn parse_simple(input: &str) -> Result<AstNode> {
34 let mut document = AstNode::document();
35 let mut current_text = String::new();
36 let mut chars = input.chars().peekable();
37
38 let flush_text = |doc: &mut AstNode, text: &mut String| {
39 if !text.is_empty() {
40 let node = AstNode::text(text.clone());
41 text.clear();
42 doc.children.push(node);
43 }
44 };
45
46 while let Some(c) = chars.next() {
47 match c {
48 '#' if chars.peek() == Some(&'[') => {
49 flush_text(&mut document, &mut current_text);
50 chars.next();
51 let (section_content, found) = Self::read_until(&mut chars, ']');
52 if found {
53 let mut node = AstNode::new(NodeType::Section, section_content.clone());
54 for modifier in section_content.split(';') {
55 if let Some((key, value)) = modifier.split_once(':') {
56 node = node
57 .with_attribute(key.trim(), Self::strip_quotes(value.trim()));
58 } else {
59 node = node.with_attribute("style", modifier.trim());
60 }
61 }
62 document = document.add_child(node);
63 } else {
64 current_text.push('#');
65 current_text.push('[');
66 current_text.push_str(§ion_content);
67 }
68 }
69 '[' => {
70 flush_text(&mut document, &mut current_text);
71 let (bracket_content, found) = Self::read_until(&mut chars, ']');
72 if found {
73 if let Some(rest) = bracket_content.strip_prefix("break:") {
74 let break_value = Self::strip_quotes(rest.trim());
75 if Self::is_time_break(break_value) {
76 document = document.add_child(AstNode::new(
77 NodeType::ShortBreak,
78 format!("[{}]", break_value),
79 ));
80 } else {
81 let mut node =
82 AstNode::new(NodeType::Break, break_value.to_string());
83 node = node.with_attribute("strength", break_value);
84 document = document.add_child(node);
85 }
86 } else if let Some(rest) = bracket_content.strip_prefix("mark:") {
87 let mark_value = Self::strip_quotes(rest.trim());
88 document = document
89 .add_child(AstNode::new(NodeType::Mark, mark_value.to_string()));
90 } else if Self::is_time_break(&bracket_content) {
91 document = document.add_child(AstNode::new(
92 NodeType::ShortBreak,
93 format!("[{}]", bracket_content),
94 ));
95 } else {
96 current_text.push('[');
97 current_text.push_str(&bracket_content);
98 current_text.push(']');
99 }
100 } else {
101 current_text.push('[');
102 current_text.push_str(&bracket_content);
103 }
104 }
105 '~' => {
106 let prev_is_boundary = current_text.is_empty()
107 || current_text.ends_with(|c: char| c.is_whitespace());
108 if !prev_is_boundary {
109 current_text.push('~');
110 } else {
111 flush_text(&mut document, &mut current_text);
112 let mut emphasized_text = String::new();
113 let mut found_end = false;
114 while let Some(&next_c) = chars.peek() {
115 chars.next();
116 if next_c == '~' {
117 found_end = true;
118 break;
119 }
120 emphasized_text.push(next_c);
121 }
122 if found_end
123 && !emphasized_text.is_empty()
124 && !emphasized_text.contains(' ')
125 {
126 document = document.add_child(AstNode::new(
127 NodeType::ShortEmphasisNone,
128 emphasized_text,
129 ));
130 } else {
131 current_text.push('~');
132 current_text.push_str(&emphasized_text);
133 if found_end {
134 current_text.push('~');
135 }
136 }
137 }
138 }
139 '-' => {
140 let prev_is_boundary = current_text.is_empty()
141 || current_text.ends_with(|c: char| c.is_whitespace());
142 if !prev_is_boundary {
143 current_text.push('-');
144 } else {
145 flush_text(&mut document, &mut current_text);
146 let mut emphasized_text = String::new();
147 let mut found_end = false;
148 while let Some(&next_c) = chars.peek() {
149 chars.next();
150 if next_c == '\n' || next_c == '\r' {
151 emphasized_text.push(next_c);
152 break;
153 }
154 if next_c == '-' {
155 let next_is_boundary =
156 chars.peek().is_none_or(|c| c.is_whitespace());
157 if next_is_boundary {
158 found_end = true;
159 break;
160 } else {
161 emphasized_text.push('-');
162 }
163 } else {
164 emphasized_text.push(next_c);
165 }
166 }
167 if found_end
168 && !emphasized_text.is_empty()
169 && !emphasized_text.contains(' ')
170 {
171 document = document.add_child(AstNode::new(
172 NodeType::ShortEmphasisReduced,
173 emphasized_text,
174 ));
175 } else {
176 current_text.push('-');
177 current_text.push_str(&emphasized_text);
178 if found_end {
179 current_text.push('-');
180 }
181 }
182 }
183 }
184 '+' => {
185 flush_text(&mut document, &mut current_text);
186 let mut plus_count = 1;
187 while chars.peek() == Some(&'+') {
188 chars.next();
189 plus_count += 1;
190 }
191 let mut emphasized_text = String::new();
192 let mut found_end = false;
193 while let Some(&next_c) = chars.peek() {
194 if next_c == '+' {
195 let mut closing_pluses = 0;
196 while chars.peek() == Some(&'+') {
197 chars.next();
198 closing_pluses += 1;
199 }
200 if closing_pluses == plus_count {
201 found_end = true;
202 break;
203 } else {
204 for _ in 0..closing_pluses {
205 emphasized_text.push('+');
206 }
207 }
208 } else {
209 chars.next();
210 emphasized_text.push(next_c);
211 }
212 }
213 if found_end {
214 let node_type = if plus_count >= 2 {
215 NodeType::ShortEmphasisStrong
216 } else {
217 NodeType::ShortEmphasisModerate
218 };
219 document = document.add_child(AstNode::new(node_type, emphasized_text));
220 } else {
221 for _ in 0..plus_count {
222 current_text.push('+');
223 }
224 current_text.push_str(&emphasized_text);
225 }
226 }
227 '(' => {
228 flush_text(&mut document, &mut current_text);
229 let mut modifier_content = String::new();
230 let mut found_closing_paren = false;
231 while let Some(&next_c) = chars.peek() {
232 chars.next();
233 if next_c == ')' {
234 found_closing_paren = true;
235 break;
236 }
237 modifier_content.push(next_c);
238 }
239
240 if found_closing_paren {
241 if chars.peek() == Some(&'[') {
242 chars.next();
243 let (modifiers, found_bracket) = Self::read_until(&mut chars, ']');
244 if found_bracket {
245 let mut node =
246 AstNode::new(NodeType::TextModifier, modifier_content);
247 for modifier in modifiers.split(';') {
248 if let Some((key, value)) = modifier.split_once(':') {
249 node = node.with_attribute(
250 key.trim(),
251 Self::strip_quotes(value.trim()),
252 );
253 } else {
254 let key = modifier.trim();
255 if !key.is_empty() {
256 node = node.with_attribute(key, "");
257 }
258 }
259 }
260 document = document.add_child(node);
261 } else {
262 current_text.push('(');
263 current_text.push_str(&modifier_content);
264 current_text.push(')');
265 current_text.push('[');
266 current_text.push_str(&modifiers);
267 }
268 } else if chars.peek() == Some(&'{') {
269 chars.next();
270 let (alias_text, found_brace) = Self::read_until(&mut chars, '}');
271 if found_brace {
272 let mut node = AstNode::new(NodeType::ShortSub, modifier_content);
273 if !alias_text.is_empty() {
274 node = node.with_attribute("alias", alias_text);
275 }
276 document = document.add_child(node);
277 } else {
278 current_text.push('(');
279 current_text.push_str(&modifier_content);
280 current_text.push(')');
281 current_text.push('{');
282 current_text.push_str(&alias_text);
283 }
284 } else if chars.peek() == Some(&'/') {
285 chars.next();
286 let mut phoneme = String::new();
287 let mut found_slash = false;
288 while let Some(&next_c) = chars.peek() {
289 chars.next();
290 if next_c == '/' {
291 found_slash = true;
292 break;
293 }
294 phoneme.push(next_c);
295 }
296 if found_slash {
297 let mut node = AstNode::new(NodeType::ShortIpa, modifier_content);
298 node = node.with_attribute("phoneme", phoneme);
299 document = document.add_child(node);
300 } else {
301 current_text.push('(');
302 current_text.push_str(&modifier_content);
303 current_text.push(')');
304 current_text.push('/');
305 current_text.push_str(&phoneme);
306 }
307 } else {
308 current_text.push('(');
309 current_text.push_str(&modifier_content);
310 current_text.push(')');
311 }
312 } else {
313 current_text.push('(');
314 current_text.push_str(&modifier_content);
315 }
316 }
317 '/' => {
318 flush_text(&mut document, &mut current_text);
319 let mut ipa_content = String::new();
320 let mut found_slash = false;
321 while let Some(&next_c) = chars.peek() {
322 if next_c == '/' {
323 chars.next();
324 found_slash = true;
325 break;
326 }
327 if next_c == ' ' || next_c == '\n' || next_c == '\r' || next_c == '\t' {
328 break;
329 }
330 chars.next();
331 ipa_content.push(next_c);
332 }
333 if found_slash && !ipa_content.is_empty() {
334 let mut node = AstNode::new(NodeType::BareIpa, "ipa".to_string());
335 node = node.with_attribute("alphabet", "ipa");
336 node = node.with_attribute("ph", ipa_content.trim().to_string());
337 document = document.add_child(node);
338 } else if found_slash {
339 current_text.push('/');
340 current_text.push('/');
341 } else {
342 current_text.push('/');
343 current_text.push_str(&ipa_content);
344 }
345 }
346 '{' => {
347 flush_text(&mut document, &mut current_text);
348 let (sub_text, found_brace) = Self::read_until(&mut chars, '}');
349 if found_brace && !sub_text.is_empty() {
350 let mut alias_text = String::new();
351 while let Some(&next_c) = chars.peek() {
352 if next_c.is_whitespace()
353 || next_c == '('
354 || next_c == '['
355 || next_c == '+'
356 || next_c == '~'
357 || next_c == '!'
358 || next_c == '/'
359 || next_c == '{'
360 || next_c == '}'
361 || next_c == '#'
362 {
363 break;
364 }
365 chars.next();
366 alias_text.push(next_c);
367 }
368 let mut node = AstNode::new(NodeType::ShortSub, sub_text);
369 if !alias_text.is_empty() {
370 node = node.with_attribute("alias", alias_text);
371 }
372 document = document.add_child(node);
373 } else {
374 current_text.push('{');
375 current_text.push_str(&sub_text);
376 }
377 }
378 '!' => {
379 if chars.peek() == Some(&'[') {
380 flush_text(&mut document, &mut current_text);
381 chars.next();
382 let (caption, found_caption_end) = Self::read_until(&mut chars, ']');
383
384 if found_caption_end && chars.peek() == Some(&'(') {
385 chars.next();
386 let (url, found_url_end) = Self::read_until(&mut chars, ')');
387 if found_url_end {
388 let mut node = AstNode::new(NodeType::Audio, caption);
389 node = node.with_attribute("src", Self::strip_quotes(&url));
390 document = document.add_child(node);
391 } else {
392 current_text.push_str(&format!("![{}]", caption));
393 }
394 } else if found_caption_end && chars.peek() == Some(&'[') {
395 chars.next();
396 let (url, found_url_end) = Self::read_until(&mut chars, ']');
397 if found_url_end {
398 let mut node = AstNode::new(NodeType::Audio, caption);
399 node = node.with_attribute("src", Self::strip_quotes(&url));
400 document = document.add_child(node);
401 } else {
402 current_text.push_str(&format!("![{}]", caption));
403 }
404 } else if found_caption_end {
405 let possible_url = Self::strip_quotes(&caption);
406 if possible_url.starts_with("http://")
407 || possible_url.starts_with("https://")
408 || possible_url.starts_with("soundbank://")
409 || possible_url.contains("://")
410 || possible_url.contains('.')
411 {
412 let mut node = AstNode::new(NodeType::Audio, String::new());
413 node = node.with_attribute("src", possible_url);
414 document = document.add_child(node);
415 } else {
416 current_text.push_str(&format!("![{}]", caption));
417 }
418 } else {
419 current_text.push_str(&format!("![{}", caption));
420 }
421 } else if chars.peek() == Some(&'(') {
422 flush_text(&mut document, &mut current_text);
423 chars.next();
424 let (caption, found_caption_end) = Self::read_until(&mut chars, ')');
425 if found_caption_end && chars.peek() == Some(&'[') {
426 chars.next();
427 let (url, found_url_end) = Self::read_until(&mut chars, ']');
428 if found_url_end {
429 let mut node = AstNode::new(NodeType::Audio, caption);
430 node = node.with_attribute("src", Self::strip_quotes(&url));
431 document = document.add_child(node);
432 } else {
433 current_text.push_str(&format!("!({}[", caption));
434 }
435 } else {
436 current_text.push_str(&format!("!({}", caption));
437 }
438 } else {
439 current_text.push('!');
440 }
441 }
442 _ => {
443 current_text.push(c);
444 }
445 }
446 }
447
448 if !current_text.is_empty() {
449 document = document.add_child(AstNode::text(current_text));
450 }
451
452 Ok(document)
453 }
454
/// Trims surrounding whitespace from `s` and, when the result is wrapped in a
/// matching pair of double or single quotes, returns the text between them.
///
/// A lone quote character or mismatched quotes are returned unchanged (after
/// trimming).
fn strip_quotes(s: &str) -> &str {
    let trimmed = s.trim();
    ['"', '\'']
        .into_iter()
        .find_map(|quote| {
            trimmed
                .strip_prefix(quote)
                .and_then(|rest| rest.strip_suffix(quote))
        })
        .unwrap_or(trimmed)
}
466
/// Returns `true` when `s` is a break duration: a number immediately followed
/// by `s` or `ms`, such as `2s`, `1.5s` or `500ms`.
///
/// The numeric part must contain at least one ASCII digit and at most one
/// decimal point. Checking only the suffix is not enough: it would classify
/// every bracketed word ending in "s" (for example `[notes]`) as a break.
fn is_time_break(s: &str) -> bool {
    // Try "ms" first: every "ms" value also ends in "s", and stripping only the
    // "s" would leave a stray "m" in the numeric part.
    let Some(number) = s.strip_suffix("ms").or_else(|| s.strip_suffix('s')) else {
        return false;
    };
    let digit_count = number.chars().filter(char::is_ascii_digit).count();
    let dot_count = number.chars().filter(|&c| c == '.').count();
    digit_count > 0 && dot_count <= 1 && digit_count + dot_count == number.chars().count()
}
470
/// Consumes characters from `chars` up to and including the first `end`.
///
/// Returns the characters read before `end` together with a flag telling
/// whether `end` was found. When the flag is `false` the iterator is exhausted
/// and the string holds everything that was left.
fn read_until(chars: &mut std::iter::Peekable<std::str::Chars>, end: char) -> (String, bool) {
    let mut collected = String::new();
    for ch in chars.by_ref() {
        if ch == end {
            return (collected, true);
        }
        collected.push(ch);
    }
    (collected, false)
}
484}
485
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns `true` when any direct child of `ast` has the given node type.
    fn has_child(ast: &AstNode, node_type: NodeType) -> bool {
        ast.children
            .iter()
            .any(|child| child.node_type == node_type)
    }

    #[test]
    fn test_parse_simple_text() {
        let result = SpeechMarkdownParser::parse("Hello world");
        assert!(result.is_ok());

        let ast = result.unwrap();
        assert_eq!(ast.node_type, NodeType::Document);
        assert!(!ast.children.is_empty());
    }

    #[test]
    fn test_parse_short_break() {
        let ast = SpeechMarkdownParser::parse("Sample [2s] text").unwrap();
        assert!(has_child(&ast, NodeType::ShortBreak));
    }

    #[test]
    fn test_parse_emphasis_strong() {
        let ast = SpeechMarkdownParser::parse("++strong emphasis++").unwrap();
        assert!(has_child(&ast, NodeType::ShortEmphasisStrong));
    }

    #[test]
    fn test_parse_text_modifier() {
        let ast = SpeechMarkdownParser::parse("(text)[voice:\"Kendra\"]").unwrap();
        assert!(has_child(&ast, NodeType::TextModifier));
    }

    #[test]
    fn test_parse_audio() {
        // An empty input never reaches the audio branch, so use real audio markup.
        let ast = SpeechMarkdownParser::parse("![chime](https://example.com/chime.mp3)").unwrap();
        assert!(has_child(&ast, NodeType::Audio));
    }

    #[test]
    fn test_debug_substitution() {
        let input = "{Al}aluminum";
        let result = SpeechMarkdownParser::parse(input);
        assert!(result.is_ok());

        let ast = result.unwrap();
        println!("=== Substitution Debug ===");
        println!("Input: {}", input);
        println!("AST: {:?}", ast);
        println!("Children: {:?}", ast.children);
        println!("========================");
        assert!(has_child(&ast, NodeType::ShortSub));
    }

    #[test]
    fn test_debug_emphasis_ssml() {
        // Diagnostic only: prints the rendered SSML without asserting on its shape.
        let input = "++strong emphasis++";
        let result =
            SpeechMarkdownParser::to_ssml(input, crate::formatters::base::Platform::AmazonAlexa);
        println!("=== Emphasis SSML Debug ===");
        println!("Input: {}", input);
        println!("SSML Result: {:?}", result);
        println!("==========================");
    }
}