1use crate::entities::{ListStyle, TextDirection};
2
/// A run of text that shares one set of inline formatting attributes.
///
/// Both [`parse_markdown`] and [`parse_html`] produce these; a block's
/// visible text is the concatenation of the `text` of its spans.
///
/// `PartialEq`/`Eq` are derived so spans can be compared directly, for
/// example with `assert_eq!` in tests.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ParsedSpan {
    /// The literal text of this run (may be empty).
    pub text: String,
    /// Set for text inside strong emphasis (`**x**`, `<b>`, `<strong>`).
    pub bold: bool,
    /// Set for text inside emphasis (`*x*`, `<i>`, `<em>`).
    pub italic: bool,
    /// Set for text inside `<u>` / `<ins>`; the Markdown parser never sets it.
    pub underline: bool,
    /// Set for struck-through text (`~~x~~`, `<s>`, `<del>`, `<strike>`).
    pub strikeout: bool,
    /// Set for inline code and for text inside a code block.
    pub code: bool,
    /// Link target when the run is part of a hyperlink.
    pub link_href: Option<String>,
}
14
15#[derive(Debug, Clone)]
17pub struct ParsedBlock {
18 pub spans: Vec<ParsedSpan>,
19 pub heading_level: Option<i64>,
20 pub list_style: Option<ListStyle>,
21 pub is_code_block: bool,
22 pub line_height: Option<i64>,
23 pub non_breakable_lines: Option<bool>,
24 pub direction: Option<TextDirection>,
25 pub background_color: Option<String>,
26}
27
28pub fn parse_markdown(markdown: &str) -> Vec<ParsedBlock> {
31 use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
32
33 let options =
34 Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_TASKLISTS;
35 let parser = Parser::new_ext(markdown, options);
36
37 let mut blocks: Vec<ParsedBlock> = Vec::new();
38 let mut current_spans: Vec<ParsedSpan> = Vec::new();
39 let mut current_heading: Option<i64> = None;
40 let mut current_list_style: Option<ListStyle> = None;
41 let mut is_code_block = false;
42 let mut in_block = false;
43
44 let mut bold = false;
46 let mut italic = false;
47 let mut strikeout = false;
48 let mut link_href: Option<String> = None;
49
50 let mut list_stack: Vec<Option<ListStyle>> = Vec::new();
52
53 for event in parser {
54 match event {
55 Event::Start(Tag::Paragraph) => {
56 in_block = true;
57 current_heading = None;
58 is_code_block = false;
59 }
60 Event::End(TagEnd::Paragraph) => {
61 if !current_spans.is_empty() || in_block {
62 blocks.push(ParsedBlock {
63 spans: std::mem::take(&mut current_spans),
64 heading_level: current_heading.take(),
65 list_style: current_list_style.clone(),
66 is_code_block: false,
67 line_height: None,
68 non_breakable_lines: None,
69 direction: None,
70 background_color: None,
71 });
72 }
73 in_block = false;
74 current_list_style = None;
75 }
76 Event::Start(Tag::Heading { level, .. }) => {
77 in_block = true;
78 current_heading = Some(heading_level_to_i64(level));
79 is_code_block = false;
80 }
81 Event::End(TagEnd::Heading(_)) => {
82 blocks.push(ParsedBlock {
83 spans: std::mem::take(&mut current_spans),
84 heading_level: current_heading.take(),
85 list_style: None,
86 is_code_block: false,
87 line_height: None,
88 non_breakable_lines: None,
89 direction: None,
90 background_color: None,
91 });
92 in_block = false;
93 }
94 Event::Start(Tag::List(ordered)) => {
95 let style = if ordered.is_some() {
96 Some(ListStyle::Decimal)
97 } else {
98 Some(ListStyle::Disc)
99 };
100 list_stack.push(style);
101 }
102 Event::End(TagEnd::List(_)) => {
103 list_stack.pop();
104 }
105 Event::Start(Tag::Item) => {
106 in_block = true;
107 current_list_style = list_stack.last().cloned().flatten();
108 }
109 Event::End(TagEnd::Item) => {
110 if !current_spans.is_empty() {
113 blocks.push(ParsedBlock {
114 spans: std::mem::take(&mut current_spans),
115 heading_level: None,
116 list_style: current_list_style.clone(),
117 is_code_block: false,
118 line_height: None,
119 non_breakable_lines: None,
120 direction: None,
121 background_color: None,
122 });
123 }
124 in_block = false;
125 current_list_style = None;
126 }
127 Event::Start(Tag::CodeBlock(_)) => {
128 in_block = true;
129 is_code_block = true;
130 }
131 Event::End(TagEnd::CodeBlock) => {
132 blocks.push(ParsedBlock {
133 spans: std::mem::take(&mut current_spans),
134 heading_level: None,
135 list_style: None,
136 is_code_block: true,
137 line_height: None,
138 non_breakable_lines: None,
139 direction: None,
140 background_color: None,
141 });
142 in_block = false;
143 is_code_block = false;
144 }
145 Event::Start(Tag::Emphasis) => {
146 italic = true;
147 }
148 Event::End(TagEnd::Emphasis) => {
149 italic = false;
150 }
151 Event::Start(Tag::Strong) => {
152 bold = true;
153 }
154 Event::End(TagEnd::Strong) => {
155 bold = false;
156 }
157 Event::Start(Tag::Strikethrough) => {
158 strikeout = true;
159 }
160 Event::End(TagEnd::Strikethrough) => {
161 strikeout = false;
162 }
163 Event::Start(Tag::Link { dest_url, .. }) => {
164 link_href = Some(dest_url.to_string());
165 }
166 Event::End(TagEnd::Link) => {
167 link_href = None;
168 }
169 Event::Text(text) => {
170 if !in_block {
171 in_block = true;
173 }
174 current_spans.push(ParsedSpan {
175 text: text.to_string(),
176 bold,
177 italic,
178 underline: false,
179 strikeout,
180 code: is_code_block,
181 link_href: link_href.clone(),
182 });
183 }
184 Event::Code(text) => {
185 if !in_block {
186 in_block = true;
187 }
188 current_spans.push(ParsedSpan {
189 text: text.to_string(),
190 bold,
191 italic,
192 underline: false,
193 strikeout,
194 code: true,
195 link_href: link_href.clone(),
196 });
197 }
198 Event::SoftBreak => {
199 current_spans.push(ParsedSpan {
201 text: " ".to_string(),
202 bold,
203 italic,
204 underline: false,
205 strikeout,
206 code: false,
207 link_href: link_href.clone(),
208 });
209 }
210 Event::HardBreak => {
211 if !current_spans.is_empty() || in_block {
213 blocks.push(ParsedBlock {
214 spans: std::mem::take(&mut current_spans),
215 heading_level: current_heading.take(),
216 list_style: current_list_style.clone(),
217 is_code_block,
218 line_height: None,
219 non_breakable_lines: None,
220 direction: None,
221 background_color: None,
222 });
223 }
224 }
225 _ => {}
226 }
227 }
228
229 if !current_spans.is_empty() {
231 blocks.push(ParsedBlock {
232 spans: std::mem::take(&mut current_spans),
233 heading_level: current_heading,
234 list_style: current_list_style,
235 is_code_block,
236 line_height: None,
237 non_breakable_lines: None,
238 direction: None,
239 background_color: None,
240 });
241 }
242
243 if blocks.is_empty() {
245 blocks.push(ParsedBlock {
246 spans: vec![ParsedSpan {
247 text: String::new(),
248 ..Default::default()
249 }],
250 heading_level: None,
251 list_style: None,
252 is_code_block: false,
253 line_height: None,
254 non_breakable_lines: None,
255 direction: None,
256 background_color: None,
257 });
258 }
259
260 blocks
261}
262
263fn heading_level_to_i64(level: pulldown_cmark::HeadingLevel) -> i64 {
264 use pulldown_cmark::HeadingLevel;
265 match level {
266 HeadingLevel::H1 => 1,
267 HeadingLevel::H2 => 2,
268 HeadingLevel::H3 => 3,
269 HeadingLevel::H4 => 4,
270 HeadingLevel::H5 => 5,
271 HeadingLevel::H6 => 6,
272 }
273}
274
275use scraper::Node;
278
279#[derive(Debug, Clone, Default)]
281struct BlockStyles {
282 line_height: Option<i64>,
283 non_breakable_lines: Option<bool>,
284 direction: Option<TextDirection>,
285 background_color: Option<String>,
286}
287
288fn parse_block_styles(style: &str) -> BlockStyles {
291 let mut result = BlockStyles::default();
292 for part in style.split(';') {
293 let part = part.trim();
294 if let Some((prop, val)) = part.split_once(':') {
295 let prop = prop.trim().to_ascii_lowercase();
296 let val = val.trim();
297 match prop.as_str() {
298 "line-height" => {
299 if let Ok(v) = val.parse::<f64>() {
301 result.line_height = Some((v * 1000.0) as i64);
302 }
303 }
304 "white-space" => {
305 if val == "pre" || val == "nowrap" || val == "pre-wrap" {
306 result.non_breakable_lines = Some(true);
307 }
308 }
309 "direction" => {
310 if val.eq_ignore_ascii_case("rtl") {
311 result.direction = Some(TextDirection::RightToLeft);
312 } else if val.eq_ignore_ascii_case("ltr") {
313 result.direction = Some(TextDirection::LeftToRight);
314 }
315 }
316 "background-color" | "background" => {
317 result.background_color = Some(val.to_string());
318 }
319 _ => {}
320 }
321 }
322 }
323 result
324}
325
326pub fn parse_html(html: &str) -> Vec<ParsedBlock> {
327 use scraper::Html;
328
329 let fragment = Html::parse_fragment(html);
330 let mut blocks: Vec<ParsedBlock> = Vec::new();
331
332 let root = fragment.root_element();
334
335 #[derive(Clone, Default)]
336 struct FmtState {
337 bold: bool,
338 italic: bool,
339 underline: bool,
340 strikeout: bool,
341 code: bool,
342 link_href: Option<String>,
343 }
344
345 const MAX_RECURSION_DEPTH: usize = 256;
346
347 fn walk_node(
348 node: ego_tree::NodeRef<Node>,
349 state: &FmtState,
350 blocks: &mut Vec<ParsedBlock>,
351 current_list_style: &Option<ListStyle>,
352 depth: usize,
353 ) {
354 if depth > MAX_RECURSION_DEPTH {
355 return;
356 }
357 match node.value() {
358 Node::Element(el) => {
359 let tag = el.name();
360 let mut new_state = state.clone();
361 let mut new_list_style = current_list_style.clone();
362
363 let is_block_tag = matches!(
365 tag,
366 "p" | "div"
367 | "h1"
368 | "h2"
369 | "h3"
370 | "h4"
371 | "h5"
372 | "h6"
373 | "li"
374 | "pre"
375 | "br"
376 | "blockquote"
377 );
378
379 match tag {
381 "b" | "strong" => new_state.bold = true,
382 "i" | "em" => new_state.italic = true,
383 "u" | "ins" => new_state.underline = true,
384 "s" | "del" | "strike" => new_state.strikeout = true,
385 "code" => new_state.code = true,
386 "a" => {
387 if let Some(href) = el.attr("href") {
388 new_state.link_href = Some(href.to_string());
389 }
390 }
391 "ul" => {
392 new_list_style = Some(ListStyle::Disc);
393 }
394 "ol" => {
395 new_list_style = Some(ListStyle::Decimal);
396 }
397 _ => {}
398 }
399
400 let heading_level = match tag {
402 "h1" => Some(1),
403 "h2" => Some(2),
404 "h3" => Some(3),
405 "h4" => Some(4),
406 "h5" => Some(5),
407 "h6" => Some(6),
408 _ => None,
409 };
410
411 let is_code_block = tag == "pre";
412
413 let css = if is_block_tag {
415 el.attr("style").map(parse_block_styles).unwrap_or_default()
416 } else {
417 BlockStyles::default()
418 };
419
420 if tag == "br" {
421 blocks.push(ParsedBlock {
423 spans: vec![ParsedSpan {
424 text: String::new(),
425 ..Default::default()
426 }],
427 heading_level: None,
428 list_style: None,
429 is_code_block: false,
430 line_height: None,
431 non_breakable_lines: None,
432 direction: None,
433 background_color: None,
434 });
435 return;
436 }
437
438 if is_block_tag && tag != "br" {
439 let mut spans: Vec<ParsedSpan> = Vec::new();
441 collect_inline_spans(
442 node,
443 &new_state,
444 &mut spans,
445 &new_list_style,
446 blocks,
447 depth + 1,
448 );
449
450 let list_style_for_block = if tag == "li" {
451 new_list_style.clone()
452 } else {
453 None
454 };
455
456 if !spans.is_empty() || heading_level.is_some() {
457 blocks.push(ParsedBlock {
458 spans,
459 heading_level,
460 list_style: list_style_for_block,
461 is_code_block,
462 line_height: css.line_height,
463 non_breakable_lines: css.non_breakable_lines,
464 direction: css.direction,
465 background_color: css.background_color,
466 });
467 }
468 } else if matches!(tag, "ul" | "ol" | "table" | "thead" | "tbody" | "tr") {
469 for child in node.children() {
471 walk_node(child, &new_state, blocks, &new_list_style, depth + 1);
472 }
473 } else {
474 for child in node.children() {
476 walk_node(child, &new_state, blocks, current_list_style, depth + 1);
477 }
478 }
479 }
480 Node::Text(text) => {
481 let t = text.text.to_string();
482 let trimmed = t.trim();
483 if !trimmed.is_empty() {
484 blocks.push(ParsedBlock {
486 spans: vec![ParsedSpan {
487 text: trimmed.to_string(),
488 bold: state.bold,
489 italic: state.italic,
490 underline: state.underline,
491 strikeout: state.strikeout,
492 code: state.code,
493 link_href: state.link_href.clone(),
494 }],
495 heading_level: None,
496 list_style: None,
497 is_code_block: false,
498 line_height: None,
499 non_breakable_lines: None,
500 direction: None,
501 background_color: None,
502 });
503 }
504 }
505 _ => {
506 for child in node.children() {
508 walk_node(child, state, blocks, current_list_style, depth + 1);
509 }
510 }
511 }
512 }
513
514 fn collect_inline_spans(
518 node: ego_tree::NodeRef<Node>,
519 state: &FmtState,
520 spans: &mut Vec<ParsedSpan>,
521 current_list_style: &Option<ListStyle>,
522 blocks: &mut Vec<ParsedBlock>,
523 depth: usize,
524 ) {
525 if depth > MAX_RECURSION_DEPTH {
526 return;
527 }
528 for child in node.children() {
529 match child.value() {
530 Node::Text(text) => {
531 let t = text.text.to_string();
532 if !t.is_empty() {
533 spans.push(ParsedSpan {
534 text: t,
535 bold: state.bold,
536 italic: state.italic,
537 underline: state.underline,
538 strikeout: state.strikeout,
539 code: state.code,
540 link_href: state.link_href.clone(),
541 });
542 }
543 }
544 Node::Element(el) => {
545 let tag = el.name();
546 let mut new_state = state.clone();
547
548 match tag {
549 "b" | "strong" => new_state.bold = true,
550 "i" | "em" => new_state.italic = true,
551 "u" | "ins" => new_state.underline = true,
552 "s" | "del" | "strike" => new_state.strikeout = true,
553 "code" => new_state.code = true,
554 "a" => {
555 if let Some(href) = el.attr("href") {
556 new_state.link_href = Some(href.to_string());
557 }
558 }
559 _ => {}
560 }
561
562 let nested_block = matches!(
564 tag,
565 "p" | "div"
566 | "h1"
567 | "h2"
568 | "h3"
569 | "h4"
570 | "h5"
571 | "h6"
572 | "li"
573 | "pre"
574 | "blockquote"
575 | "ul"
576 | "ol"
577 );
578
579 if tag == "br" {
580 spans.push(ParsedSpan {
583 text: String::new(),
584 ..Default::default()
585 });
586 } else if nested_block {
587 walk_node(child, &new_state, blocks, current_list_style, depth + 1);
589 } else {
590 collect_inline_spans(
592 child,
593 &new_state,
594 spans,
595 current_list_style,
596 blocks,
597 depth + 1,
598 );
599 }
600 }
601 _ => {}
602 }
603 }
604 }
605
606 let initial_state = FmtState::default();
607 for child in root.children() {
608 walk_node(child, &initial_state, &mut blocks, &None, 0);
609 }
610
611 if blocks.is_empty() {
613 blocks.push(ParsedBlock {
614 spans: vec![ParsedSpan {
615 text: String::new(),
616 ..Default::default()
617 }],
618 heading_level: None,
619 list_style: None,
620 is_code_block: false,
621 line_height: None,
622 non_breakable_lines: None,
623 direction: None,
624 background_color: None,
625 });
626 }
627
628 blocks
629}
630
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the first span of `block` whose text is exactly `text`.
    fn span_with_text<'a>(block: &'a ParsedBlock, text: &str) -> &'a ParsedSpan {
        block
            .spans
            .iter()
            .find(|span| span.text == text)
            .expect("no span with the expected text")
    }

    /// Asserts that `blocks` consists of exactly one block and returns it.
    fn only_block(blocks: &[ParsedBlock]) -> &ParsedBlock {
        assert_eq!(blocks.len(), 1);
        &blocks[0]
    }

    // ---- Markdown ----

    #[test]
    fn test_parse_markdown_simple_paragraph() {
        let blocks = parse_markdown("Hello **world**");
        let block = only_block(&blocks);
        assert!(block.spans.len() >= 2);

        let plain_span = block
            .spans
            .iter()
            .find(|span| span.text.contains("Hello"))
            .expect("no span containing the plain text");
        assert!(!plain_span.bold);
        assert!(span_with_text(block, "world").bold);
    }

    #[test]
    fn test_parse_markdown_heading() {
        let blocks = parse_markdown("# Title");
        let block = only_block(&blocks);
        assert_eq!(block.heading_level, Some(1));
        assert_eq!(block.spans[0].text, "Title");
    }

    #[test]
    fn test_parse_markdown_list() {
        let blocks = parse_markdown("- item1\n- item2");
        assert!(blocks.len() >= 2);
        for block in &blocks[..2] {
            assert_eq!(block.list_style, Some(ListStyle::Disc));
        }
    }

    #[test]
    fn test_parse_markdown_ordered_list() {
        let blocks = parse_markdown("1. first\n2. second");
        assert!(blocks.len() >= 2);
        assert_eq!(blocks[0].list_style, Some(ListStyle::Decimal));
    }

    #[test]
    fn test_parse_markdown_code_block() {
        let blocks = parse_markdown("```\nfn main() {}\n```");
        let block = only_block(&blocks);
        assert!(block.is_code_block);
        assert!(block.spans[0].code);
    }

    #[test]
    fn test_parse_markdown_nested_formatting() {
        let blocks = parse_markdown("***bold italic***");
        let first = &only_block(&blocks).spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    #[test]
    fn test_parse_markdown_link() {
        let blocks = parse_markdown("[click](http://example.com)");
        let first = &only_block(&blocks).spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href, Some("http://example.com".to_string()));
    }

    #[test]
    fn test_parse_markdown_empty() {
        let blocks = parse_markdown("");
        assert!(only_block(&blocks).spans[0].text.is_empty());
    }

    // ---- HTML ----

    #[test]
    fn test_parse_html_simple() {
        let blocks = parse_html("<p>Hello <b>world</b></p>");
        let block = only_block(&blocks);
        assert!(block.spans.len() >= 2);
        assert!(span_with_text(block, "world").bold);
    }

    #[test]
    fn test_parse_html_multiple_paragraphs() {
        let blocks = parse_html("<p>A</p><p>B</p>");
        assert_eq!(blocks.len(), 2);
    }

    #[test]
    fn test_parse_html_heading() {
        let blocks = parse_html("<h2>Subtitle</h2>");
        assert_eq!(only_block(&blocks).heading_level, Some(2));
    }

    #[test]
    fn test_parse_html_list() {
        let blocks = parse_html("<ul><li>one</li><li>two</li></ul>");
        assert!(blocks.len() >= 2);
        assert_eq!(blocks[0].list_style, Some(ListStyle::Disc));
    }

    #[test]
    fn test_parse_html_ordered_list() {
        let blocks = parse_html("<ol><li>first</li><li>second</li></ol>");
        assert!(blocks.len() >= 2);
        assert_eq!(blocks[0].list_style, Some(ListStyle::Decimal));
    }

    #[test]
    fn test_parse_html_empty() {
        let blocks = parse_html("");
        assert!(only_block(&blocks).spans[0].text.is_empty());
    }

    #[test]
    fn test_parse_html_nested_formatting() {
        let blocks = parse_html("<p><b><i>bold italic</i></b></p>");
        let first = &only_block(&blocks).spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    #[test]
    fn test_parse_html_link() {
        let blocks = parse_html("<p><a href=\"http://example.com\">click</a></p>");
        let first = &only_block(&blocks).spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href, Some("http://example.com".to_string()));
    }

    #[test]
    fn test_parse_html_blockquote_nested() {
        let blocks = parse_html("<p>before</p><blockquote>quoted</blockquote><p>after</p>");
        assert!(blocks.len() >= 3);
    }

    // ---- Inline CSS ----

    #[test]
    fn test_parse_block_styles_line_height() {
        let styles = parse_block_styles("line-height: 1.5");
        assert_eq!(styles.line_height, Some(1500));
    }

    #[test]
    fn test_parse_block_styles_direction_rtl() {
        let styles = parse_block_styles("direction: rtl");
        assert_eq!(styles.direction, Some(TextDirection::RightToLeft));
    }

    #[test]
    fn test_parse_block_styles_background_color() {
        let styles = parse_block_styles("background-color: #ff0000");
        assert_eq!(styles.background_color, Some("#ff0000".to_string()));
    }

    #[test]
    fn test_parse_block_styles_white_space_pre() {
        let styles = parse_block_styles("white-space: pre");
        assert_eq!(styles.non_breakable_lines, Some(true));
    }

    #[test]
    fn test_parse_block_styles_multiple() {
        let styles = parse_block_styles("line-height: 2.0; direction: rtl; background-color: blue");
        assert_eq!(styles.line_height, Some(2000));
        assert_eq!(styles.direction, Some(TextDirection::RightToLeft));
        assert_eq!(styles.background_color, Some("blue".to_string()));
    }

    #[test]
    fn test_parse_html_block_styles_extracted() {
        let blocks = parse_html(
            r#"<p style="line-height: 1.5; direction: rtl; background-color: #ccc">text</p>"#,
        );
        let block = only_block(&blocks);
        assert_eq!(block.line_height, Some(1500));
        assert_eq!(block.direction, Some(TextDirection::RightToLeft));
        assert_eq!(block.background_color, Some("#ccc".to_string()));
    }

    #[test]
    fn test_parse_html_white_space_pre() {
        let blocks = parse_html(r#"<p style="white-space: pre">code</p>"#);
        assert_eq!(only_block(&blocks).non_breakable_lines, Some(true));
    }

    #[test]
    fn test_parse_html_no_styles_returns_none() {
        let blocks = parse_html("<p>plain</p>");
        let block = only_block(&blocks);
        assert_eq!(block.line_height, None);
        assert_eq!(block.direction, None);
        assert_eq!(block.background_color, None);
        assert_eq!(block.non_breakable_lines, None);
    }
}