1use crate::entities::{ListStyle, TextDirection};
2
/// A run of text sharing a single set of inline formatting attributes.
///
/// Spans are the leaves produced by [`parse_markdown`] and [`parse_html`];
/// each [`ParsedBlock`] owns an ordered list of them.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ParsedSpan {
    /// The literal text of the span (may be empty, e.g. for placeholder spans).
    pub text: String,
    /// `true` when the text is rendered bold (`**x**`, `<b>`, `<strong>`).
    pub bold: bool,
    /// `true` when the text is rendered italic (`*x*`, `<i>`, `<em>`).
    pub italic: bool,
    /// `true` when the text is underlined (`<u>`, `<ins>`); never set by the
    /// Markdown parser.
    pub underline: bool,
    /// `true` when the text is struck through (`~~x~~`, `<s>`, `<del>`, `<strike>`).
    pub strikeout: bool,
    /// `true` for inline code and for text inside a Markdown code block.
    pub code: bool,
    /// Link target when the span is part of a hyperlink.
    pub link_href: Option<String>,
}
14
15#[derive(Debug, Clone)]
17pub struct ParsedBlock {
18 pub spans: Vec<ParsedSpan>,
19 pub heading_level: Option<i64>,
20 pub list_style: Option<ListStyle>,
21 pub is_code_block: bool,
22 pub line_height: Option<i64>,
23 pub non_breakable_lines: Option<bool>,
24 pub direction: Option<TextDirection>,
25 pub background_color: Option<String>,
26}
27
28impl ParsedBlock {
29 pub fn is_inline_only(&self) -> bool {
32 self.heading_level.is_none()
33 && self.list_style.is_none()
34 && !self.is_code_block
35 && self.line_height.is_none()
36 && self.non_breakable_lines.is_none()
37 && self.direction.is_none()
38 && self.background_color.is_none()
39 }
40}
41
42pub fn parse_markdown(markdown: &str) -> Vec<ParsedBlock> {
45 use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
46
47 let options =
48 Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_TASKLISTS;
49 let parser = Parser::new_ext(markdown, options);
50
51 let mut blocks: Vec<ParsedBlock> = Vec::new();
52 let mut current_spans: Vec<ParsedSpan> = Vec::new();
53 let mut current_heading: Option<i64> = None;
54 let mut current_list_style: Option<ListStyle> = None;
55 let mut is_code_block = false;
56 let mut in_block = false;
57
58 let mut bold = false;
60 let mut italic = false;
61 let mut strikeout = false;
62 let mut link_href: Option<String> = None;
63
64 let mut list_stack: Vec<Option<ListStyle>> = Vec::new();
66
67 for event in parser {
68 match event {
69 Event::Start(Tag::Paragraph) => {
70 in_block = true;
71 current_heading = None;
72 is_code_block = false;
73 }
74 Event::End(TagEnd::Paragraph) => {
75 if !current_spans.is_empty() || in_block {
76 blocks.push(ParsedBlock {
77 spans: std::mem::take(&mut current_spans),
78 heading_level: current_heading.take(),
79 list_style: current_list_style.clone(),
80 is_code_block: false,
81 line_height: None,
82 non_breakable_lines: None,
83 direction: None,
84 background_color: None,
85 });
86 }
87 in_block = false;
88 current_list_style = None;
89 }
90 Event::Start(Tag::Heading { level, .. }) => {
91 in_block = true;
92 current_heading = Some(heading_level_to_i64(level));
93 is_code_block = false;
94 }
95 Event::End(TagEnd::Heading(_)) => {
96 blocks.push(ParsedBlock {
97 spans: std::mem::take(&mut current_spans),
98 heading_level: current_heading.take(),
99 list_style: None,
100 is_code_block: false,
101 line_height: None,
102 non_breakable_lines: None,
103 direction: None,
104 background_color: None,
105 });
106 in_block = false;
107 }
108 Event::Start(Tag::List(ordered)) => {
109 let style = if ordered.is_some() {
110 Some(ListStyle::Decimal)
111 } else {
112 Some(ListStyle::Disc)
113 };
114 list_stack.push(style);
115 }
116 Event::End(TagEnd::List(_)) => {
117 list_stack.pop();
118 }
119 Event::Start(Tag::Item) => {
120 in_block = true;
121 current_list_style = list_stack.last().cloned().flatten();
122 }
123 Event::End(TagEnd::Item) => {
124 if !current_spans.is_empty() {
127 blocks.push(ParsedBlock {
128 spans: std::mem::take(&mut current_spans),
129 heading_level: None,
130 list_style: current_list_style.clone(),
131 is_code_block: false,
132 line_height: None,
133 non_breakable_lines: None,
134 direction: None,
135 background_color: None,
136 });
137 }
138 in_block = false;
139 current_list_style = None;
140 }
141 Event::Start(Tag::CodeBlock(_)) => {
142 in_block = true;
143 is_code_block = true;
144 }
145 Event::End(TagEnd::CodeBlock) => {
146 blocks.push(ParsedBlock {
147 spans: std::mem::take(&mut current_spans),
148 heading_level: None,
149 list_style: None,
150 is_code_block: true,
151 line_height: None,
152 non_breakable_lines: None,
153 direction: None,
154 background_color: None,
155 });
156 in_block = false;
157 is_code_block = false;
158 }
159 Event::Start(Tag::Emphasis) => {
160 italic = true;
161 }
162 Event::End(TagEnd::Emphasis) => {
163 italic = false;
164 }
165 Event::Start(Tag::Strong) => {
166 bold = true;
167 }
168 Event::End(TagEnd::Strong) => {
169 bold = false;
170 }
171 Event::Start(Tag::Strikethrough) => {
172 strikeout = true;
173 }
174 Event::End(TagEnd::Strikethrough) => {
175 strikeout = false;
176 }
177 Event::Start(Tag::Link { dest_url, .. }) => {
178 link_href = Some(dest_url.to_string());
179 }
180 Event::End(TagEnd::Link) => {
181 link_href = None;
182 }
183 Event::Text(text) => {
184 if !in_block {
185 in_block = true;
187 }
188 current_spans.push(ParsedSpan {
189 text: text.to_string(),
190 bold,
191 italic,
192 underline: false,
193 strikeout,
194 code: is_code_block,
195 link_href: link_href.clone(),
196 });
197 }
198 Event::Code(text) => {
199 if !in_block {
200 in_block = true;
201 }
202 current_spans.push(ParsedSpan {
203 text: text.to_string(),
204 bold,
205 italic,
206 underline: false,
207 strikeout,
208 code: true,
209 link_href: link_href.clone(),
210 });
211 }
212 Event::SoftBreak => {
213 current_spans.push(ParsedSpan {
215 text: " ".to_string(),
216 bold,
217 italic,
218 underline: false,
219 strikeout,
220 code: false,
221 link_href: link_href.clone(),
222 });
223 }
224 Event::HardBreak => {
225 if !current_spans.is_empty() || in_block {
227 blocks.push(ParsedBlock {
228 spans: std::mem::take(&mut current_spans),
229 heading_level: current_heading.take(),
230 list_style: current_list_style.clone(),
231 is_code_block,
232 line_height: None,
233 non_breakable_lines: None,
234 direction: None,
235 background_color: None,
236 });
237 }
238 }
239 _ => {}
240 }
241 }
242
243 if !current_spans.is_empty() {
245 blocks.push(ParsedBlock {
246 spans: std::mem::take(&mut current_spans),
247 heading_level: current_heading,
248 list_style: current_list_style,
249 is_code_block,
250 line_height: None,
251 non_breakable_lines: None,
252 direction: None,
253 background_color: None,
254 });
255 }
256
257 if blocks.is_empty() {
259 blocks.push(ParsedBlock {
260 spans: vec![ParsedSpan {
261 text: String::new(),
262 ..Default::default()
263 }],
264 heading_level: None,
265 list_style: None,
266 is_code_block: false,
267 line_height: None,
268 non_breakable_lines: None,
269 direction: None,
270 background_color: None,
271 });
272 }
273
274 blocks
275}
276
277fn heading_level_to_i64(level: pulldown_cmark::HeadingLevel) -> i64 {
278 use pulldown_cmark::HeadingLevel;
279 match level {
280 HeadingLevel::H1 => 1,
281 HeadingLevel::H2 => 2,
282 HeadingLevel::H3 => 3,
283 HeadingLevel::H4 => 4,
284 HeadingLevel::H5 => 5,
285 HeadingLevel::H6 => 6,
286 }
287}
288
289use scraper::Node;
292
293#[derive(Debug, Clone, Default)]
295struct BlockStyles {
296 line_height: Option<i64>,
297 non_breakable_lines: Option<bool>,
298 direction: Option<TextDirection>,
299 background_color: Option<String>,
300}
301
302fn parse_block_styles(style: &str) -> BlockStyles {
305 let mut result = BlockStyles::default();
306 for part in style.split(';') {
307 let part = part.trim();
308 if let Some((prop, val)) = part.split_once(':') {
309 let prop = prop.trim().to_ascii_lowercase();
310 let val = val.trim();
311 match prop.as_str() {
312 "line-height" => {
313 if let Ok(v) = val.parse::<f64>() {
315 result.line_height = Some((v * 1000.0) as i64);
316 }
317 }
318 "white-space" => {
319 if val == "pre" || val == "nowrap" || val == "pre-wrap" {
320 result.non_breakable_lines = Some(true);
321 }
322 }
323 "direction" => {
324 if val.eq_ignore_ascii_case("rtl") {
325 result.direction = Some(TextDirection::RightToLeft);
326 } else if val.eq_ignore_ascii_case("ltr") {
327 result.direction = Some(TextDirection::LeftToRight);
328 }
329 }
330 "background-color" | "background" => {
331 result.background_color = Some(val.to_string());
332 }
333 _ => {}
334 }
335 }
336 }
337 result
338}
339
340pub fn parse_html(html: &str) -> Vec<ParsedBlock> {
341 use scraper::Html;
342
343 let fragment = Html::parse_fragment(html);
344 let mut blocks: Vec<ParsedBlock> = Vec::new();
345
346 let root = fragment.root_element();
348
349 #[derive(Clone, Default)]
350 struct FmtState {
351 bold: bool,
352 italic: bool,
353 underline: bool,
354 strikeout: bool,
355 code: bool,
356 link_href: Option<String>,
357 }
358
359 const MAX_RECURSION_DEPTH: usize = 256;
360
361 fn walk_node(
362 node: ego_tree::NodeRef<Node>,
363 state: &FmtState,
364 blocks: &mut Vec<ParsedBlock>,
365 current_list_style: &Option<ListStyle>,
366 depth: usize,
367 ) {
368 if depth > MAX_RECURSION_DEPTH {
369 return;
370 }
371 match node.value() {
372 Node::Element(el) => {
373 let tag = el.name();
374 let mut new_state = state.clone();
375 let mut new_list_style = current_list_style.clone();
376
377 let is_block_tag = matches!(
379 tag,
380 "p" | "div"
381 | "h1"
382 | "h2"
383 | "h3"
384 | "h4"
385 | "h5"
386 | "h6"
387 | "li"
388 | "pre"
389 | "br"
390 | "blockquote"
391 );
392
393 match tag {
395 "b" | "strong" => new_state.bold = true,
396 "i" | "em" => new_state.italic = true,
397 "u" | "ins" => new_state.underline = true,
398 "s" | "del" | "strike" => new_state.strikeout = true,
399 "code" => new_state.code = true,
400 "a" => {
401 if let Some(href) = el.attr("href") {
402 new_state.link_href = Some(href.to_string());
403 }
404 }
405 "ul" => {
406 new_list_style = Some(ListStyle::Disc);
407 }
408 "ol" => {
409 new_list_style = Some(ListStyle::Decimal);
410 }
411 _ => {}
412 }
413
414 let heading_level = match tag {
416 "h1" => Some(1),
417 "h2" => Some(2),
418 "h3" => Some(3),
419 "h4" => Some(4),
420 "h5" => Some(5),
421 "h6" => Some(6),
422 _ => None,
423 };
424
425 let is_code_block = tag == "pre";
426
427 let css = if is_block_tag {
429 el.attr("style").map(parse_block_styles).unwrap_or_default()
430 } else {
431 BlockStyles::default()
432 };
433
434 if tag == "br" {
435 blocks.push(ParsedBlock {
437 spans: vec![ParsedSpan {
438 text: String::new(),
439 ..Default::default()
440 }],
441 heading_level: None,
442 list_style: None,
443 is_code_block: false,
444 line_height: None,
445 non_breakable_lines: None,
446 direction: None,
447 background_color: None,
448 });
449 return;
450 }
451
452 if is_block_tag && tag != "br" {
453 let mut spans: Vec<ParsedSpan> = Vec::new();
455 collect_inline_spans(
456 node,
457 &new_state,
458 &mut spans,
459 &new_list_style,
460 blocks,
461 depth + 1,
462 );
463
464 let list_style_for_block = if tag == "li" {
465 new_list_style.clone()
466 } else {
467 None
468 };
469
470 if !spans.is_empty() || heading_level.is_some() {
471 blocks.push(ParsedBlock {
472 spans,
473 heading_level,
474 list_style: list_style_for_block,
475 is_code_block,
476 line_height: css.line_height,
477 non_breakable_lines: css.non_breakable_lines,
478 direction: css.direction,
479 background_color: css.background_color,
480 });
481 }
482 } else if matches!(tag, "ul" | "ol" | "table" | "thead" | "tbody" | "tr") {
483 for child in node.children() {
485 walk_node(child, &new_state, blocks, &new_list_style, depth + 1);
486 }
487 } else {
488 for child in node.children() {
490 walk_node(child, &new_state, blocks, current_list_style, depth + 1);
491 }
492 }
493 }
494 Node::Text(text) => {
495 let t = text.text.to_string();
496 let trimmed = t.trim();
497 if !trimmed.is_empty() {
498 blocks.push(ParsedBlock {
500 spans: vec![ParsedSpan {
501 text: trimmed.to_string(),
502 bold: state.bold,
503 italic: state.italic,
504 underline: state.underline,
505 strikeout: state.strikeout,
506 code: state.code,
507 link_href: state.link_href.clone(),
508 }],
509 heading_level: None,
510 list_style: None,
511 is_code_block: false,
512 line_height: None,
513 non_breakable_lines: None,
514 direction: None,
515 background_color: None,
516 });
517 }
518 }
519 _ => {
520 for child in node.children() {
522 walk_node(child, state, blocks, current_list_style, depth + 1);
523 }
524 }
525 }
526 }
527
528 fn collect_inline_spans(
532 node: ego_tree::NodeRef<Node>,
533 state: &FmtState,
534 spans: &mut Vec<ParsedSpan>,
535 current_list_style: &Option<ListStyle>,
536 blocks: &mut Vec<ParsedBlock>,
537 depth: usize,
538 ) {
539 if depth > MAX_RECURSION_DEPTH {
540 return;
541 }
542 for child in node.children() {
543 match child.value() {
544 Node::Text(text) => {
545 let t = text.text.to_string();
546 if !t.is_empty() {
547 spans.push(ParsedSpan {
548 text: t,
549 bold: state.bold,
550 italic: state.italic,
551 underline: state.underline,
552 strikeout: state.strikeout,
553 code: state.code,
554 link_href: state.link_href.clone(),
555 });
556 }
557 }
558 Node::Element(el) => {
559 let tag = el.name();
560 let mut new_state = state.clone();
561
562 match tag {
563 "b" | "strong" => new_state.bold = true,
564 "i" | "em" => new_state.italic = true,
565 "u" | "ins" => new_state.underline = true,
566 "s" | "del" | "strike" => new_state.strikeout = true,
567 "code" => new_state.code = true,
568 "a" => {
569 if let Some(href) = el.attr("href") {
570 new_state.link_href = Some(href.to_string());
571 }
572 }
573 _ => {}
574 }
575
576 let nested_block = matches!(
578 tag,
579 "p" | "div"
580 | "h1"
581 | "h2"
582 | "h3"
583 | "h4"
584 | "h5"
585 | "h6"
586 | "li"
587 | "pre"
588 | "blockquote"
589 | "ul"
590 | "ol"
591 );
592
593 if tag == "br" {
594 spans.push(ParsedSpan {
597 text: String::new(),
598 ..Default::default()
599 });
600 } else if nested_block {
601 walk_node(child, &new_state, blocks, current_list_style, depth + 1);
603 } else {
604 collect_inline_spans(
606 child,
607 &new_state,
608 spans,
609 current_list_style,
610 blocks,
611 depth + 1,
612 );
613 }
614 }
615 _ => {}
616 }
617 }
618 }
619
620 let initial_state = FmtState::default();
621 for child in root.children() {
622 walk_node(child, &initial_state, &mut blocks, &None, 0);
623 }
624
625 if blocks.is_empty() {
627 blocks.push(ParsedBlock {
628 spans: vec![ParsedSpan {
629 text: String::new(),
630 ..Default::default()
631 }],
632 heading_level: None,
633 list_style: None,
634 is_code_block: false,
635 line_height: None,
636 non_breakable_lines: None,
637 direction: None,
638 background_color: None,
639 });
640 }
641
642 blocks
643}
644
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the span of `block` whose text is exactly `text`.
    fn span_with_text<'a>(block: &'a ParsedBlock, text: &str) -> &'a ParsedSpan {
        block.spans.iter().find(|s| s.text == text).unwrap()
    }

    // ---- Markdown -------------------------------------------------------

    #[test]
    fn test_parse_markdown_simple_paragraph() {
        let parsed = parse_markdown("Hello **world**");
        assert_eq!(parsed.len(), 1);

        let paragraph = &parsed[0];
        assert!(paragraph.spans.len() >= 2);

        let plain = paragraph
            .spans
            .iter()
            .find(|s| s.text.contains("Hello"))
            .unwrap();
        assert!(!plain.bold);
        assert!(span_with_text(paragraph, "world").bold);
    }

    #[test]
    fn test_parse_markdown_heading() {
        let parsed = parse_markdown("# Title");
        assert_eq!(parsed.len(), 1);

        let heading = &parsed[0];
        assert_eq!(heading.heading_level, Some(1));
        assert_eq!(heading.spans[0].text, "Title");
    }

    #[test]
    fn test_parse_markdown_list() {
        let parsed = parse_markdown("- item1\n- item2");
        assert!(parsed.len() >= 2);
        for item in &parsed[..2] {
            assert_eq!(item.list_style, Some(ListStyle::Disc));
        }
    }

    // ---- HTML -----------------------------------------------------------

    #[test]
    fn test_parse_html_simple() {
        let parsed = parse_html("<p>Hello <b>world</b></p>");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans.len() >= 2);
        assert!(span_with_text(&parsed[0], "world").bold);
    }

    #[test]
    fn test_parse_html_multiple_paragraphs() {
        assert_eq!(parse_html("<p>A</p><p>B</p>").len(), 2);
    }

    #[test]
    fn test_parse_html_heading() {
        let parsed = parse_html("<h2>Subtitle</h2>");
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].heading_level, Some(2));
    }

    #[test]
    fn test_parse_html_list() {
        let parsed = parse_html("<ul><li>one</li><li>two</li></ul>");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Disc));
    }

    #[test]
    fn test_parse_markdown_code_block() {
        let parsed = parse_markdown("```\nfn main() {}\n```");
        assert_eq!(parsed.len(), 1);

        let code = &parsed[0];
        assert!(code.is_code_block);
        assert!(code.spans[0].code);
    }

    #[test]
    fn test_parse_markdown_nested_formatting() {
        let parsed = parse_markdown("***bold italic***");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    #[test]
    fn test_parse_markdown_link() {
        let parsed = parse_markdown("[click](http://example.com)");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href, Some("http://example.com".to_string()));
    }

    #[test]
    fn test_parse_markdown_empty() {
        let parsed = parse_markdown("");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans[0].text.is_empty());
    }

    #[test]
    fn test_parse_html_empty() {
        let parsed = parse_html("");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans[0].text.is_empty());
    }

    #[test]
    fn test_parse_html_nested_formatting() {
        let parsed = parse_html("<p><b><i>bold italic</i></b></p>");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    #[test]
    fn test_parse_html_link() {
        let parsed = parse_html("<p><a href=\"http://example.com\">click</a></p>");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href, Some("http://example.com".to_string()));
    }

    #[test]
    fn test_parse_html_ordered_list() {
        let parsed = parse_html("<ol><li>first</li><li>second</li></ol>");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Decimal));
    }

    #[test]
    fn test_parse_markdown_ordered_list() {
        let parsed = parse_markdown("1. first\n2. second");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Decimal));
    }

    #[test]
    fn test_parse_html_blockquote_nested() {
        let parsed = parse_html("<p>before</p><blockquote>quoted</blockquote><p>after</p>");
        assert!(parsed.len() >= 3);
    }

    // ---- inline `style` attribute parsing ---------------------------------

    #[test]
    fn test_parse_block_styles_line_height() {
        assert_eq!(parse_block_styles("line-height: 1.5").line_height, Some(1500));
    }

    #[test]
    fn test_parse_block_styles_direction_rtl() {
        assert_eq!(
            parse_block_styles("direction: rtl").direction,
            Some(TextDirection::RightToLeft)
        );
    }

    #[test]
    fn test_parse_block_styles_background_color() {
        assert_eq!(
            parse_block_styles("background-color: #ff0000").background_color,
            Some("#ff0000".to_string())
        );
    }

    #[test]
    fn test_parse_block_styles_white_space_pre() {
        assert_eq!(
            parse_block_styles("white-space: pre").non_breakable_lines,
            Some(true)
        );
    }

    #[test]
    fn test_parse_block_styles_multiple() {
        let css = parse_block_styles("line-height: 2.0; direction: rtl; background-color: blue");
        assert_eq!(css.line_height, Some(2000));
        assert_eq!(css.direction, Some(TextDirection::RightToLeft));
        assert_eq!(css.background_color, Some("blue".to_string()));
    }

    #[test]
    fn test_parse_html_block_styles_extracted() {
        let parsed = parse_html(
            r#"<p style="line-height: 1.5; direction: rtl; background-color: #ccc">text</p>"#,
        );
        assert_eq!(parsed.len(), 1);

        let styled = &parsed[0];
        assert_eq!(styled.line_height, Some(1500));
        assert_eq!(styled.direction, Some(TextDirection::RightToLeft));
        assert_eq!(styled.background_color, Some("#ccc".to_string()));
    }

    #[test]
    fn test_parse_html_white_space_pre() {
        let parsed = parse_html(r#"<p style="white-space: pre">code</p>"#);
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].non_breakable_lines, Some(true));
    }

    #[test]
    fn test_parse_html_no_styles_returns_none() {
        let parsed = parse_html("<p>plain</p>");
        assert_eq!(parsed.len(), 1);

        let plain = &parsed[0];
        assert_eq!(plain.line_height, None);
        assert_eq!(plain.direction, None);
        assert_eq!(plain.background_color, None);
        assert_eq!(plain.non_breakable_lines, None);
    }
}