1use crate::entities::{ListStyle, TextDirection};
2
/// A run of text that shares one set of inline formatting attributes.
///
/// Produced by both the Markdown and the HTML parser in this file. The
/// `Default` value is an empty, unformatted span.
#[derive(Debug, Clone, Default)]
pub struct ParsedSpan {
    /// Literal text of the run (may be empty, e.g. for placeholder spans).
    pub text: String,
    /// Text is inside strong emphasis (`**…**`, `<b>`, `<strong>`).
    pub bold: bool,
    /// Text is inside emphasis (`*…*`, `<i>`, `<em>`).
    pub italic: bool,
    /// Text is underlined (`<u>`, `<ins>`); the Markdown parser always sets this to `false`.
    pub underline: bool,
    /// Text is struck out (`~~…~~`, `<s>`, `<del>`, `<strike>`).
    pub strikeout: bool,
    /// Text is code: inline code, text inside a Markdown code block, or inside `<code>`.
    pub code: bool,
    /// Destination of the enclosing link, if any.
    pub link_href: Option<String>,
}
14
/// One table cell, holding only inline content.
#[derive(Debug, Clone)]
pub struct ParsedTableCell {
    /// Formatted text runs making up the cell, in document order.
    pub spans: Vec<ParsedSpan>,
}
20
/// A table as a list of rows, each row being a list of cells.
#[derive(Debug, Clone)]
pub struct ParsedTable {
    /// Number of rows in `rows` that are header rows.
    ///
    /// The Markdown parser pushes the header row first and counts it; the HTML
    /// parser counts rows found under `<thead>` and falls back to `1` when it
    /// found none but the table has rows.
    pub header_rows: usize,
    /// All rows (header rows included), in the order they were encountered.
    pub rows: Vec<Vec<ParsedTableCell>>,
}
29
/// A top-level item produced by the parsers: either a text block or a table.
#[derive(Debug, Clone)]
pub enum ParsedElement {
    /// A paragraph-like block (paragraph, heading, list item, code block, …).
    Block(ParsedBlock),
    /// A table with its rows and cells.
    Table(ParsedTable),
}
36
37impl ParsedElement {
38 pub fn flatten_to_blocks(elements: Vec<ParsedElement>) -> Vec<ParsedBlock> {
41 let mut blocks = Vec::new();
42 for elem in elements {
43 match elem {
44 ParsedElement::Block(b) => blocks.push(b),
45 ParsedElement::Table(t) => {
46 for row in t.rows {
47 for cell in row {
48 blocks.push(ParsedBlock {
49 spans: cell.spans,
50 heading_level: None,
51 list_style: None,
52 list_indent: 0,
53 is_code_block: false,
54 code_language: None,
55 blockquote_depth: 0,
56 line_height: None,
57 non_breakable_lines: None,
58 direction: None,
59 background_color: None,
60 });
61 }
62 }
63 }
64 }
65 }
66 if blocks.is_empty() {
67 blocks.push(ParsedBlock {
68 spans: vec![ParsedSpan {
69 text: String::new(),
70 ..Default::default()
71 }],
72 heading_level: None,
73 list_style: None,
74 list_indent: 0,
75 is_code_block: false,
76 code_language: None,
77 blockquote_depth: 0,
78 line_height: None,
79 non_breakable_lines: None,
80 direction: None,
81 background_color: None,
82 });
83 }
84 blocks
85 }
86}
87
/// A paragraph-like block of inline spans plus its block-level formatting.
#[derive(Debug, Clone)]
pub struct ParsedBlock {
    /// Inline content of the block, in document order.
    pub spans: Vec<ParsedSpan>,
    /// Heading level (the parsers in this file produce 1 through 6), or `None` for non-headings.
    pub heading_level: Option<i64>,
    /// List marker style when the block is a list item.
    pub list_style: Option<ListStyle>,
    /// Zero-based list nesting depth (0 for top-level items and non-list blocks).
    pub list_indent: u32,
    /// Whether the block is a code block (Markdown code block or HTML `<pre>`).
    pub is_code_block: bool,
    /// Language tag of a code block (fenced info string or `language-*` class), if present.
    pub code_language: Option<String>,
    /// Number of enclosing block quotes.
    pub blockquote_depth: u32,
    /// CSS `line-height` multiplier scaled by 1000; only set by the HTML parser.
    pub line_height: Option<i64>,
    /// Set from CSS `white-space`; only set by the HTML parser.
    pub non_breakable_lines: Option<bool>,
    /// Text direction from CSS `direction`; only set by the HTML parser.
    pub direction: Option<TextDirection>,
    /// Raw CSS background value; only set by the HTML parser.
    pub background_color: Option<String>,
}
103
104impl ParsedBlock {
105 pub fn is_inline_only(&self) -> bool {
108 self.heading_level.is_none()
109 && self.list_style.is_none()
110 && !self.is_code_block
111 && self.blockquote_depth == 0
112 && self.line_height.is_none()
113 && self.non_breakable_lines.is_none()
114 && self.direction.is_none()
115 && self.background_color.is_none()
116 }
117}
118
119pub fn parse_markdown(markdown: &str) -> Vec<ParsedElement> {
122 use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
123
124 let options =
125 Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_TASKLISTS;
126 let parser = Parser::new_ext(markdown, options);
127
128 let mut elements: Vec<ParsedElement> = Vec::new();
129 let mut current_spans: Vec<ParsedSpan> = Vec::new();
130 let mut current_heading: Option<i64> = None;
131 let mut current_list_style: Option<ListStyle> = None;
132 let mut is_code_block = false;
133 let mut code_language: Option<String> = None;
134 let mut blockquote_depth: u32 = 0;
135 let mut in_block = false;
136
137 let mut bold = false;
139 let mut italic = false;
140 let mut strikeout = false;
141 let mut link_href: Option<String> = None;
142
143 let mut list_stack: Vec<Option<ListStyle>> = Vec::new();
145 let mut current_list_indent: u32 = 0;
146
147 let mut in_table = false;
149 let mut in_table_head = false;
150 let mut table_rows: Vec<Vec<ParsedTableCell>> = Vec::new();
151 let mut current_row_cells: Vec<ParsedTableCell> = Vec::new();
152 let mut current_cell_spans: Vec<ParsedSpan> = Vec::new();
153 let mut table_header_rows: usize = 0;
154
155 for event in parser {
156 match event {
157 Event::Start(Tag::Paragraph) => {
158 in_block = true;
159 current_heading = None;
160 is_code_block = false;
161 }
162 Event::End(TagEnd::Paragraph) => {
163 if !current_spans.is_empty() || in_block {
164 elements.push(ParsedElement::Block(ParsedBlock {
165 spans: std::mem::take(&mut current_spans),
166 heading_level: current_heading.take(),
167 list_style: current_list_style.clone(),
168 list_indent: current_list_indent,
169 is_code_block: false,
170 code_language: None,
171 blockquote_depth,
172 line_height: None,
173 non_breakable_lines: None,
174 direction: None,
175 background_color: None,
176 }));
177 }
178 in_block = false;
179 current_list_style = None;
180 }
181 Event::Start(Tag::Heading { level, .. }) => {
182 in_block = true;
183 current_heading = Some(heading_level_to_i64(level));
184 is_code_block = false;
185 }
186 Event::End(TagEnd::Heading(_)) => {
187 elements.push(ParsedElement::Block(ParsedBlock {
188 spans: std::mem::take(&mut current_spans),
189 heading_level: current_heading.take(),
190 list_style: None,
191 list_indent: 0,
192 is_code_block: false,
193 code_language: None,
194 blockquote_depth,
195 line_height: None,
196 non_breakable_lines: None,
197 direction: None,
198 background_color: None,
199 }));
200 in_block = false;
201 }
202 Event::Start(Tag::List(ordered)) => {
203 let style = if ordered.is_some() {
204 Some(ListStyle::Decimal)
205 } else {
206 Some(ListStyle::Disc)
207 };
208 list_stack.push(style);
209 }
210 Event::End(TagEnd::List(_)) => {
211 list_stack.pop();
212 }
213 Event::Start(Tag::Item) => {
214 if !current_spans.is_empty() {
217 elements.push(ParsedElement::Block(ParsedBlock {
218 spans: std::mem::take(&mut current_spans),
219 heading_level: None,
220 list_style: current_list_style.clone(),
221 list_indent: current_list_indent,
222 is_code_block: false,
223 code_language: None,
224 blockquote_depth,
225 line_height: None,
226 non_breakable_lines: None,
227 direction: None,
228 background_color: None,
229 }));
230 }
231 in_block = true;
232 current_list_style = list_stack.last().cloned().flatten();
233 current_list_indent = if list_stack.is_empty() {
234 0
235 } else {
236 (list_stack.len() - 1) as u32
237 };
238 }
239 Event::End(TagEnd::Item) => {
240 if !current_spans.is_empty() {
243 elements.push(ParsedElement::Block(ParsedBlock {
244 spans: std::mem::take(&mut current_spans),
245 heading_level: None,
246 list_style: current_list_style.clone(),
247 list_indent: current_list_indent,
248 is_code_block: false,
249 code_language: None,
250 blockquote_depth,
251 line_height: None,
252 non_breakable_lines: None,
253 direction: None,
254 background_color: None,
255 }));
256 }
257 in_block = false;
258 current_list_style = None;
259 }
260 Event::Start(Tag::CodeBlock(kind)) => {
261 in_block = true;
262 is_code_block = true;
263 code_language = match &kind {
264 pulldown_cmark::CodeBlockKind::Fenced(lang) if !lang.is_empty() => {
265 Some(lang.to_string())
266 }
267 _ => None,
268 };
269 }
270 Event::End(TagEnd::CodeBlock) => {
271 if let Some(last) = current_spans.last_mut()
273 && last.text.ends_with('\n')
274 {
275 last.text.truncate(last.text.len() - 1);
276 }
277 elements.push(ParsedElement::Block(ParsedBlock {
278 spans: std::mem::take(&mut current_spans),
279 heading_level: None,
280 list_style: None,
281 list_indent: 0,
282 is_code_block: true,
283 code_language: code_language.take(),
284 blockquote_depth,
285 line_height: None,
286 non_breakable_lines: None,
287 direction: None,
288 background_color: None,
289 }));
290 in_block = false;
291 is_code_block = false;
292 }
293 Event::Start(Tag::Table(_)) => {
295 in_table = true;
296 in_table_head = false;
297 table_rows.clear();
298 current_row_cells.clear();
299 current_cell_spans.clear();
300 table_header_rows = 0;
301 }
302 Event::End(TagEnd::Table) => {
303 elements.push(ParsedElement::Table(ParsedTable {
304 header_rows: table_header_rows,
305 rows: std::mem::take(&mut table_rows),
306 }));
307 in_table = false;
308 }
309 Event::Start(Tag::TableHead) => {
310 in_table_head = true;
311 current_row_cells.clear();
312 }
313 Event::End(TagEnd::TableHead) => {
314 table_rows.push(std::mem::take(&mut current_row_cells));
316 table_header_rows += 1;
317 in_table_head = false;
318 }
319 Event::Start(Tag::TableRow) => {
320 current_row_cells.clear();
321 }
322 Event::End(TagEnd::TableRow) => {
323 if !in_table_head {
325 table_rows.push(std::mem::take(&mut current_row_cells));
326 }
327 }
328 Event::Start(Tag::TableCell) => {
329 current_cell_spans.clear();
330 }
331 Event::End(TagEnd::TableCell) => {
332 current_row_cells.push(ParsedTableCell {
333 spans: std::mem::take(&mut current_cell_spans),
334 });
335 }
336 Event::Start(Tag::Emphasis) => {
338 italic = true;
339 }
340 Event::End(TagEnd::Emphasis) => {
341 italic = false;
342 }
343 Event::Start(Tag::Strong) => {
344 bold = true;
345 }
346 Event::End(TagEnd::Strong) => {
347 bold = false;
348 }
349 Event::Start(Tag::Strikethrough) => {
350 strikeout = true;
351 }
352 Event::End(TagEnd::Strikethrough) => {
353 strikeout = false;
354 }
355 Event::Start(Tag::Link { dest_url, .. }) => {
356 link_href = Some(dest_url.to_string());
357 }
358 Event::End(TagEnd::Link) => {
359 link_href = None;
360 }
361 Event::Text(text) => {
362 let span = ParsedSpan {
363 text: text.to_string(),
364 bold,
365 italic,
366 underline: false,
367 strikeout,
368 code: is_code_block,
369 link_href: link_href.clone(),
370 };
371 if in_table {
372 current_cell_spans.push(span);
373 } else {
374 if !in_block {
375 in_block = true;
376 }
377 current_spans.push(span);
378 }
379 }
380 Event::Code(text) => {
381 let span = ParsedSpan {
382 text: text.to_string(),
383 bold,
384 italic,
385 underline: false,
386 strikeout,
387 code: true,
388 link_href: link_href.clone(),
389 };
390 if in_table {
391 current_cell_spans.push(span);
392 } else {
393 if !in_block {
394 in_block = true;
395 }
396 current_spans.push(span);
397 }
398 }
399 Event::SoftBreak => {
400 let span = ParsedSpan {
401 text: " ".to_string(),
402 bold,
403 italic,
404 underline: false,
405 strikeout,
406 code: false,
407 link_href: link_href.clone(),
408 };
409 if in_table {
410 current_cell_spans.push(span);
411 } else {
412 current_spans.push(span);
413 }
414 }
415 Event::HardBreak => {
416 if !current_spans.is_empty() || in_block {
418 elements.push(ParsedElement::Block(ParsedBlock {
419 spans: std::mem::take(&mut current_spans),
420 heading_level: current_heading.take(),
421 list_style: current_list_style.clone(),
422 list_indent: current_list_indent,
423 is_code_block,
424 code_language: code_language.clone(),
425 blockquote_depth,
426 line_height: None,
427 non_breakable_lines: None,
428 direction: None,
429 background_color: None,
430 }));
431 }
432 }
433 Event::Start(Tag::BlockQuote(_)) => {
434 blockquote_depth += 1;
435 }
436 Event::End(TagEnd::BlockQuote(_)) => {
437 blockquote_depth = blockquote_depth.saturating_sub(1);
438 }
439 _ => {}
440 }
441 }
442
443 if !current_spans.is_empty() {
445 elements.push(ParsedElement::Block(ParsedBlock {
446 spans: std::mem::take(&mut current_spans),
447 heading_level: current_heading,
448 list_style: current_list_style,
449 list_indent: current_list_indent,
450 is_code_block,
451 code_language: code_language.take(),
452 blockquote_depth,
453 line_height: None,
454 non_breakable_lines: None,
455 direction: None,
456 background_color: None,
457 }));
458 }
459
460 if elements.is_empty() {
462 elements.push(ParsedElement::Block(ParsedBlock {
463 spans: vec![ParsedSpan {
464 text: String::new(),
465 ..Default::default()
466 }],
467 heading_level: None,
468 list_style: None,
469 list_indent: 0,
470 is_code_block: false,
471 code_language: None,
472 blockquote_depth: 0,
473 line_height: None,
474 non_breakable_lines: None,
475 direction: None,
476 background_color: None,
477 }));
478 }
479
480 elements
481}
482
483fn heading_level_to_i64(level: pulldown_cmark::HeadingLevel) -> i64 {
484 use pulldown_cmark::HeadingLevel;
485 match level {
486 HeadingLevel::H1 => 1,
487 HeadingLevel::H2 => 2,
488 HeadingLevel::H3 => 3,
489 HeadingLevel::H4 => 4,
490 HeadingLevel::H5 => 5,
491 HeadingLevel::H6 => 6,
492 }
493}
494
495use scraper::Node;
498
/// Block-level properties extracted from an inline CSS `style` attribute.
///
/// Every field is `None` unless the corresponding declaration was found and
/// accepted by `parse_block_styles`.
#[derive(Debug, Clone, Default)]
struct BlockStyles {
    /// `line-height` multiplier scaled by 1000 (e.g. `1.5` → `1500`).
    line_height: Option<i64>,
    /// `Some(true)` when `white-space` was one of the recognised keywords.
    non_breakable_lines: Option<bool>,
    /// Text direction from `direction: rtl` / `direction: ltr`.
    direction: Option<TextDirection>,
    /// Raw value of `background-color` or `background`.
    background_color: Option<String>,
}
507
508fn parse_block_styles(style: &str) -> BlockStyles {
511 let mut result = BlockStyles::default();
512 for part in style.split(';') {
513 let part = part.trim();
514 if let Some((prop, val)) = part.split_once(':') {
515 let prop = prop.trim().to_ascii_lowercase();
516 let val = val.trim();
517 match prop.as_str() {
518 "line-height" => {
519 if let Ok(v) = val.parse::<f64>() {
521 result.line_height = Some((v * 1000.0) as i64);
522 }
523 }
524 "white-space" => {
525 if val == "pre" || val == "nowrap" || val == "pre-wrap" {
526 result.non_breakable_lines = Some(true);
527 }
528 }
529 "direction" => {
530 if val.eq_ignore_ascii_case("rtl") {
531 result.direction = Some(TextDirection::RightToLeft);
532 } else if val.eq_ignore_ascii_case("ltr") {
533 result.direction = Some(TextDirection::LeftToRight);
534 }
535 }
536 "background-color" | "background" => {
537 result.background_color = Some(val.to_string());
538 }
539 _ => {}
540 }
541 }
542 }
543 result
544}
545
546pub fn parse_html(html: &str) -> Vec<ParsedBlock> {
547 ParsedElement::flatten_to_blocks(parse_html_elements(html))
548}
549
550pub fn parse_html_elements(html: &str) -> Vec<ParsedElement> {
551 use scraper::Html;
552
553 let fragment = Html::parse_fragment(html);
554 let mut elements: Vec<ParsedElement> = Vec::new();
555
556 let root = fragment.root_element();
558
559 #[derive(Clone, Default)]
560 struct FmtState {
561 bold: bool,
562 italic: bool,
563 underline: bool,
564 strikeout: bool,
565 code: bool,
566 link_href: Option<String>,
567 }
568
569 const MAX_RECURSION_DEPTH: usize = 256;
570
571 fn collect_cell_spans(
573 node: ego_tree::NodeRef<Node>,
574 state: &FmtState,
575 spans: &mut Vec<ParsedSpan>,
576 depth: usize,
577 ) {
578 if depth > MAX_RECURSION_DEPTH {
579 return;
580 }
581 for child in node.children() {
582 match child.value() {
583 Node::Text(text) => {
584 let t = text.text.to_string();
585 if !t.is_empty() {
586 spans.push(ParsedSpan {
587 text: t,
588 bold: state.bold,
589 italic: state.italic,
590 underline: state.underline,
591 strikeout: state.strikeout,
592 code: state.code,
593 link_href: state.link_href.clone(),
594 });
595 }
596 }
597 Node::Element(el) => {
598 let tag = el.name();
599 let mut new_state = state.clone();
600 match tag {
601 "b" | "strong" => new_state.bold = true,
602 "i" | "em" => new_state.italic = true,
603 "u" | "ins" => new_state.underline = true,
604 "s" | "del" | "strike" => new_state.strikeout = true,
605 "code" => new_state.code = true,
606 "a" => {
607 if let Some(href) = el.attr("href") {
608 new_state.link_href = Some(href.to_string());
609 }
610 }
611 _ => {}
612 }
613 collect_cell_spans(child, &new_state, spans, depth + 1);
614 }
615 _ => {}
616 }
617 }
618 }
619
620 fn parse_table_element(table_node: ego_tree::NodeRef<Node>) -> ParsedTable {
622 let mut rows: Vec<Vec<ParsedTableCell>> = Vec::new();
623 let mut header_rows: usize = 0;
624
625 fn collect_rows(
626 node: ego_tree::NodeRef<Node>,
627 rows: &mut Vec<Vec<ParsedTableCell>>,
628 header_rows: &mut usize,
629 in_thead: bool,
630 ) {
631 for child in node.children() {
632 if let Node::Element(el) = child.value() {
633 match el.name() {
634 "thead" => collect_rows(child, rows, header_rows, true),
635 "tbody" | "tfoot" => collect_rows(child, rows, header_rows, false),
636 "tr" => {
637 let mut cells: Vec<ParsedTableCell> = Vec::new();
638 for td in child.children() {
639 if let Node::Element(td_el) = td.value()
640 && matches!(td_el.name(), "td" | "th")
641 {
642 let mut spans = Vec::new();
643 let state = FmtState::default();
644 collect_cell_spans(td, &state, &mut spans, 0);
645 if spans.is_empty() {
646 spans.push(ParsedSpan::default());
647 }
648 cells.push(ParsedTableCell { spans });
649 }
650 }
651 if !cells.is_empty() {
652 rows.push(cells);
653 if in_thead {
654 *header_rows += 1;
655 }
656 }
657 }
658 _ => {}
659 }
660 }
661 }
662 }
663
664 collect_rows(table_node, &mut rows, &mut header_rows, false);
665
666 if header_rows == 0 && !rows.is_empty() {
668 header_rows = 1;
669 }
670
671 ParsedTable { header_rows, rows }
672 }
673
674 fn walk_node(
675 node: ego_tree::NodeRef<Node>,
676 state: &FmtState,
677 elements: &mut Vec<ParsedElement>,
678 current_list_style: &Option<ListStyle>,
679 blockquote_depth: u32,
680 list_depth: u32,
681 depth: usize,
682 ) {
683 if depth > MAX_RECURSION_DEPTH {
684 return;
685 }
686 match node.value() {
687 Node::Element(el) => {
688 let tag = el.name();
689 let mut new_state = state.clone();
690 let mut new_list_style = current_list_style.clone();
691 let mut bq_depth = blockquote_depth;
692 let mut new_list_depth = list_depth;
693
694 let is_block_tag = matches!(
696 tag,
697 "p" | "div"
698 | "h1"
699 | "h2"
700 | "h3"
701 | "h4"
702 | "h5"
703 | "h6"
704 | "li"
705 | "pre"
706 | "br"
707 | "blockquote"
708 | "body"
709 | "html"
710 );
711
712 match tag {
714 "b" | "strong" => new_state.bold = true,
715 "i" | "em" => new_state.italic = true,
716 "u" | "ins" => new_state.underline = true,
717 "s" | "del" | "strike" => new_state.strikeout = true,
718 "code" => new_state.code = true,
719 "a" => {
720 if let Some(href) = el.attr("href") {
721 new_state.link_href = Some(href.to_string());
722 }
723 }
724 "ul" => {
725 new_list_style = Some(ListStyle::Disc);
726 new_list_depth = list_depth + 1;
727 }
728 "ol" => {
729 new_list_style = Some(ListStyle::Decimal);
730 new_list_depth = list_depth + 1;
731 }
732 "blockquote" => {
733 bq_depth += 1;
734 }
735 _ => {}
736 }
737
738 let heading_level = match tag {
740 "h1" => Some(1),
741 "h2" => Some(2),
742 "h3" => Some(3),
743 "h4" => Some(4),
744 "h5" => Some(5),
745 "h6" => Some(6),
746 _ => None,
747 };
748
749 let is_code_block = tag == "pre";
750
751 let code_language = if is_code_block {
753 node.children().find_map(|child| {
754 if let Node::Element(cel) = child.value()
755 && cel.name() == "code"
756 && let Some(cls) = cel.attr("class")
757 {
758 return cls
759 .split_whitespace()
760 .find_map(|c| c.strip_prefix("language-"))
761 .map(|l| l.to_string());
762 }
763 None
764 })
765 } else {
766 None
767 };
768
769 let css = if is_block_tag {
771 el.attr("style").map(parse_block_styles).unwrap_or_default()
772 } else {
773 BlockStyles::default()
774 };
775
776 if tag == "table" {
777 let parsed_table = parse_table_element(node);
779 if !parsed_table.rows.is_empty() {
780 elements.push(ParsedElement::Table(parsed_table));
781 }
782 return;
783 }
784
785 if tag == "br" {
786 elements.push(ParsedElement::Block(ParsedBlock {
788 spans: vec![ParsedSpan {
789 text: String::new(),
790 ..Default::default()
791 }],
792 heading_level: None,
793 list_style: None,
794 list_indent: 0,
795 is_code_block: false,
796 code_language: None,
797 blockquote_depth: bq_depth,
798 line_height: None,
799 non_breakable_lines: None,
800 direction: None,
801 background_color: None,
802 }));
803 return;
804 }
805
806 if tag == "blockquote" {
807 for child in node.children() {
809 walk_node(
810 child,
811 &new_state,
812 elements,
813 &new_list_style,
814 bq_depth,
815 new_list_depth,
816 depth + 1,
817 );
818 }
819 } else if is_block_tag && tag != "br" {
820 let mut spans: Vec<ParsedSpan> = Vec::new();
825 let mut nested_elements: Vec<ParsedElement> = Vec::new();
826 collect_inline_spans(
827 node,
828 &new_state,
829 &mut spans,
830 &new_list_style,
831 &mut nested_elements,
832 bq_depth,
833 new_list_depth,
834 depth + 1,
835 );
836
837 let list_style_for_block = if tag == "li" {
838 new_list_style.clone()
839 } else {
840 None
841 };
842
843 let list_indent_for_block = if tag == "li" {
844 new_list_depth.saturating_sub(1)
845 } else {
846 0
847 };
848
849 if !spans.is_empty() || heading_level.is_some() {
850 elements.push(ParsedElement::Block(ParsedBlock {
851 spans,
852 heading_level,
853 list_style: list_style_for_block,
854 list_indent: list_indent_for_block,
855 is_code_block,
856 code_language,
857 blockquote_depth: bq_depth,
858 line_height: css.line_height,
859 non_breakable_lines: css.non_breakable_lines,
860 direction: css.direction,
861 background_color: css.background_color,
862 }));
863 }
864 elements.append(&mut nested_elements);
866 } else if matches!(tag, "ul" | "ol" | "thead" | "tbody" | "tr") {
867 for child in node.children() {
869 walk_node(
870 child,
871 &new_state,
872 elements,
873 &new_list_style,
874 bq_depth,
875 new_list_depth,
876 depth + 1,
877 );
878 }
879 } else {
880 for child in node.children() {
882 walk_node(
883 child,
884 &new_state,
885 elements,
886 current_list_style,
887 bq_depth,
888 list_depth,
889 depth + 1,
890 );
891 }
892 }
893 }
894 Node::Text(text) => {
895 let t = text.text.to_string();
896 let trimmed = t.trim();
897 if !trimmed.is_empty() {
898 elements.push(ParsedElement::Block(ParsedBlock {
900 spans: vec![ParsedSpan {
901 text: trimmed.to_string(),
902 bold: state.bold,
903 italic: state.italic,
904 underline: state.underline,
905 strikeout: state.strikeout,
906 code: state.code,
907 link_href: state.link_href.clone(),
908 }],
909 heading_level: None,
910 list_style: None,
911 list_indent: 0,
912 is_code_block: false,
913 code_language: None,
914 blockquote_depth,
915 line_height: None,
916 non_breakable_lines: None,
917 direction: None,
918 background_color: None,
919 }));
920 }
921 }
922 _ => {
923 for child in node.children() {
925 walk_node(
926 child,
927 state,
928 elements,
929 current_list_style,
930 blockquote_depth,
931 list_depth,
932 depth + 1,
933 );
934 }
935 }
936 }
937 }
938
939 #[allow(clippy::too_many_arguments)]
943 fn collect_inline_spans(
944 node: ego_tree::NodeRef<Node>,
945 state: &FmtState,
946 spans: &mut Vec<ParsedSpan>,
947 current_list_style: &Option<ListStyle>,
948 elements: &mut Vec<ParsedElement>,
949 blockquote_depth: u32,
950 list_depth: u32,
951 depth: usize,
952 ) {
953 if depth > MAX_RECURSION_DEPTH {
954 return;
955 }
956 for child in node.children() {
957 match child.value() {
958 Node::Text(text) => {
959 let t = text.text.to_string();
960 if !t.is_empty() {
961 spans.push(ParsedSpan {
962 text: t,
963 bold: state.bold,
964 italic: state.italic,
965 underline: state.underline,
966 strikeout: state.strikeout,
967 code: state.code,
968 link_href: state.link_href.clone(),
969 });
970 }
971 }
972 Node::Element(el) => {
973 let tag = el.name();
974 let mut new_state = state.clone();
975
976 match tag {
977 "b" | "strong" => new_state.bold = true,
978 "i" | "em" => new_state.italic = true,
979 "u" | "ins" => new_state.underline = true,
980 "s" | "del" | "strike" => new_state.strikeout = true,
981 "code" => new_state.code = true,
982 "a" => {
983 if let Some(href) = el.attr("href") {
984 new_state.link_href = Some(href.to_string());
985 }
986 }
987 _ => {}
988 }
989
990 let nested_block = matches!(
992 tag,
993 "p" | "div"
994 | "h1"
995 | "h2"
996 | "h3"
997 | "h4"
998 | "h5"
999 | "h6"
1000 | "li"
1001 | "pre"
1002 | "blockquote"
1003 | "ul"
1004 | "ol"
1005 );
1006
1007 if tag == "br" {
1008 spans.push(ParsedSpan {
1011 text: String::new(),
1012 ..Default::default()
1013 });
1014 } else if nested_block || tag == "table" {
1015 walk_node(
1017 child,
1018 &new_state,
1019 elements,
1020 current_list_style,
1021 blockquote_depth,
1022 list_depth,
1023 depth + 1,
1024 );
1025 } else {
1026 collect_inline_spans(
1028 child,
1029 &new_state,
1030 spans,
1031 current_list_style,
1032 elements,
1033 blockquote_depth,
1034 list_depth,
1035 depth + 1,
1036 );
1037 }
1038 }
1039 _ => {}
1040 }
1041 }
1042 }
1043
1044 let initial_state = FmtState::default();
1045 let mut root_spans: Vec<ParsedSpan> = Vec::new();
1049 collect_inline_spans(
1050 *root,
1051 &initial_state,
1052 &mut root_spans,
1053 &None,
1054 &mut elements,
1055 0,
1056 0,
1057 0,
1058 );
1059 if !root_spans.is_empty() {
1060 elements.push(ParsedElement::Block(ParsedBlock {
1061 spans: root_spans,
1062 heading_level: None,
1063 list_style: None,
1064 list_indent: 0,
1065 is_code_block: false,
1066 code_language: None,
1067 blockquote_depth: 0,
1068 line_height: None,
1069 non_breakable_lines: None,
1070 direction: None,
1071 background_color: None,
1072 }));
1073 }
1074
1075 if elements.is_empty() {
1077 elements.push(ParsedElement::Block(ParsedBlock {
1078 spans: vec![ParsedSpan {
1079 text: String::new(),
1080 ..Default::default()
1081 }],
1082 heading_level: None,
1083 list_style: None,
1084 list_indent: 0,
1085 is_code_block: false,
1086 code_language: None,
1087 blockquote_depth: 0,
1088 line_height: None,
1089 non_breakable_lines: None,
1090 direction: None,
1091 background_color: None,
1092 }));
1093 }
1094
1095 elements
1096}
1097
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses Markdown and flattens the result so tests can inspect plain blocks.
    fn parse_markdown_blocks(md: &str) -> Vec<ParsedBlock> {
        ParsedElement::flatten_to_blocks(parse_markdown(md))
    }

    /// Returns the table inside `element`, failing the test for any other variant.
    fn expect_table(element: &ParsedElement) -> &ParsedTable {
        match element {
            ParsedElement::Table(table) => table,
            _ => panic!("Expected ParsedElement::Table"),
        }
    }

    #[test]
    fn test_parse_markdown_simple_paragraph() {
        let blocks = parse_markdown_blocks("Hello **world**");
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].spans.len() >= 2);
        let plain_span = blocks[0]
            .spans
            .iter()
            .find(|s| s.text.contains("Hello"))
            .unwrap();
        assert!(!plain_span.bold);
        let bold_span = blocks[0].spans.iter().find(|s| s.text == "world").unwrap();
        assert!(bold_span.bold);
    }

    #[test]
    fn test_parse_markdown_heading() {
        let blocks = parse_markdown_blocks("# Title");
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].heading_level, Some(1));
        assert_eq!(blocks[0].spans[0].text, "Title");
    }

    #[test]
    fn test_parse_markdown_list() {
        let blocks = parse_markdown_blocks("- item1\n- item2");
        assert!(blocks.len() >= 2);
        assert_eq!(blocks[0].list_style, Some(ListStyle::Disc));
        assert_eq!(blocks[1].list_style, Some(ListStyle::Disc));
    }

    #[test]
    fn test_parse_html_simple() {
        let blocks = parse_html("<p>Hello <b>world</b></p>");
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].spans.len() >= 2);
        let bold_span = blocks[0].spans.iter().find(|s| s.text == "world").unwrap();
        assert!(bold_span.bold);
    }

    #[test]
    fn test_parse_html_multiple_paragraphs() {
        let blocks = parse_html("<p>A</p><p>B</p>");
        assert_eq!(blocks.len(), 2);
    }

    #[test]
    fn test_parse_html_heading() {
        let blocks = parse_html("<h2>Subtitle</h2>");
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].heading_level, Some(2));
    }

    #[test]
    fn test_parse_html_list() {
        let blocks = parse_html("<ul><li>one</li><li>two</li></ul>");
        assert!(blocks.len() >= 2);
        assert_eq!(blocks[0].list_style, Some(ListStyle::Disc));
    }

    #[test]
    fn test_parse_markdown_code_block() {
        let blocks = parse_markdown_blocks("```\nfn main() {}\n```");
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].is_code_block);
        assert!(blocks[0].spans[0].code);
        let text: String = blocks[0].spans.iter().map(|s| s.text.as_str()).collect();
        assert_eq!(
            text, "fn main() {}",
            "code block text should not have trailing newline"
        );
    }

    #[test]
    fn test_parse_markdown_nested_formatting() {
        let blocks = parse_markdown_blocks("***bold italic***");
        assert_eq!(blocks.len(), 1);
        let span = &blocks[0].spans[0];
        assert!(span.bold);
        assert!(span.italic);
    }

    #[test]
    fn test_parse_markdown_link() {
        let blocks = parse_markdown_blocks("[click](http://example.com)");
        assert_eq!(blocks.len(), 1);
        let span = &blocks[0].spans[0];
        assert_eq!(span.text, "click");
        assert_eq!(span.link_href, Some("http://example.com".to_string()));
    }

    #[test]
    fn test_parse_markdown_empty() {
        let blocks = parse_markdown_blocks("");
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].spans[0].text.is_empty());
    }

    #[test]
    fn test_parse_html_empty() {
        let blocks = parse_html("");
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].spans[0].text.is_empty());
    }

    #[test]
    fn test_parse_html_nested_formatting() {
        let blocks = parse_html("<p><b><i>bold italic</i></b></p>");
        assert_eq!(blocks.len(), 1);
        let span = &blocks[0].spans[0];
        assert!(span.bold);
        assert!(span.italic);
    }

    #[test]
    fn test_parse_html_link() {
        let blocks = parse_html("<p><a href=\"http://example.com\">click</a></p>");
        assert_eq!(blocks.len(), 1);
        let span = &blocks[0].spans[0];
        assert_eq!(span.text, "click");
        assert_eq!(span.link_href, Some("http://example.com".to_string()));
    }

    #[test]
    fn test_parse_html_ordered_list() {
        let blocks = parse_html("<ol><li>first</li><li>second</li></ol>");
        assert!(blocks.len() >= 2);
        assert_eq!(blocks[0].list_style, Some(ListStyle::Decimal));
    }

    #[test]
    fn test_parse_markdown_ordered_list() {
        let blocks = parse_markdown_blocks("1. first\n2. second");
        assert!(blocks.len() >= 2);
        assert_eq!(blocks[0].list_style, Some(ListStyle::Decimal));
    }

    #[test]
    fn test_parse_html_blockquote_nested() {
        let blocks = parse_html("<p>before</p><blockquote>quoted</blockquote><p>after</p>");
        assert!(blocks.len() >= 3);
    }

    #[test]
    fn test_parse_block_styles_line_height() {
        let styles = parse_block_styles("line-height: 1.5");
        assert_eq!(styles.line_height, Some(1500));
    }

    #[test]
    fn test_parse_block_styles_direction_rtl() {
        let styles = parse_block_styles("direction: rtl");
        assert_eq!(styles.direction, Some(TextDirection::RightToLeft));
    }

    #[test]
    fn test_parse_block_styles_background_color() {
        let styles = parse_block_styles("background-color: #ff0000");
        assert_eq!(styles.background_color, Some("#ff0000".to_string()));
    }

    #[test]
    fn test_parse_block_styles_white_space_pre() {
        let styles = parse_block_styles("white-space: pre");
        assert_eq!(styles.non_breakable_lines, Some(true));
    }

    #[test]
    fn test_parse_block_styles_multiple() {
        let styles = parse_block_styles("line-height: 2.0; direction: rtl; background-color: blue");
        assert_eq!(styles.line_height, Some(2000));
        assert_eq!(styles.direction, Some(TextDirection::RightToLeft));
        assert_eq!(styles.background_color, Some("blue".to_string()));
    }

    #[test]
    fn test_parse_html_block_styles_extracted() {
        let blocks = parse_html(
            r#"<p style="line-height: 1.5; direction: rtl; background-color: #ccc">text</p>"#,
        );
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].line_height, Some(1500));
        assert_eq!(blocks[0].direction, Some(TextDirection::RightToLeft));
        assert_eq!(blocks[0].background_color, Some("#ccc".to_string()));
    }

    #[test]
    fn test_parse_html_white_space_pre() {
        let blocks = parse_html(r#"<p style="white-space: pre">code</p>"#);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].non_breakable_lines, Some(true));
    }

    #[test]
    fn test_parse_html_no_styles_returns_none() {
        let blocks = parse_html("<p>plain</p>");
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].line_height, None);
        assert_eq!(blocks[0].direction, None);
        assert_eq!(blocks[0].background_color, None);
        assert_eq!(blocks[0].non_breakable_lines, None);
    }

    #[test]
    fn test_parse_markdown_nested_list_indent() {
        // Each level is indented to the content column of its parent item
        // (two columns per `- ` marker); less indentation would make the
        // items siblings instead of children.
        let md = "- top\n  - nested\n    - deep";
        let blocks = parse_markdown_blocks(md);
        assert_eq!(blocks.len(), 3);
        assert_eq!(blocks[0].list_style, Some(ListStyle::Disc));
        assert_eq!(blocks[0].list_indent, 0);
        assert_eq!(blocks[1].list_style, Some(ListStyle::Disc));
        assert_eq!(blocks[1].list_indent, 1);
        assert_eq!(blocks[2].list_style, Some(ListStyle::Disc));
        assert_eq!(blocks[2].list_indent, 2);
    }

    #[test]
    fn test_parse_markdown_nested_ordered_list_indent() {
        // Nested items are indented to the content column of `1. ` (three
        // columns) so that they belong to the first item.
        let md = "1. first\n   1. nested\n   2. nested2";
        let blocks = parse_markdown_blocks(md);
        assert_eq!(blocks.len(), 3);
        assert_eq!(blocks[0].list_indent, 0);
        assert_eq!(blocks[1].list_indent, 1);
        assert_eq!(blocks[2].list_indent, 1);
    }

    #[test]
    fn test_parse_html_nested_list_indent() {
        let html = "<ul><li>top</li><ul><li>nested</li></ul></ul>";
        let blocks = parse_html(html);
        assert!(blocks.len() >= 2);
        assert_eq!(blocks[0].list_indent, 0);
        assert_eq!(blocks[1].list_indent, 1);
    }

    #[test]
    fn test_parse_markdown_table() {
        let md = "| A | B |\n|---|---|\n| 1 | 2 |";
        let elements = parse_markdown(md);
        assert_eq!(elements.len(), 1);
        let table = expect_table(&elements[0]);
        assert_eq!(table.header_rows, 1);
        // Header row plus one body row.
        assert_eq!(table.rows.len(), 2);
        assert_eq!(table.rows[0].len(), 2);
        assert_eq!(table.rows[0][0].spans[0].text, "A");
        assert_eq!(table.rows[0][1].spans[0].text, "B");
        assert_eq!(table.rows[1].len(), 2);
        assert_eq!(table.rows[1][0].spans[0].text, "1");
        assert_eq!(table.rows[1][1].spans[0].text, "2");
    }

    #[test]
    fn test_parse_markdown_table_with_formatting() {
        let md = "| **bold** | `code` | *italic* |\n|---|---|---|\n| ~~strike~~ | plain | [link](http://x.com) |";
        let elements = parse_markdown(md);
        assert_eq!(elements.len(), 1);
        let table = expect_table(&elements[0]);
        assert_eq!(table.rows.len(), 2);
        assert!(table.rows[0][0].spans[0].bold);
        assert!(table.rows[0][1].spans[0].code);
        assert!(table.rows[0][2].spans[0].italic);
        assert!(table.rows[1][0].spans[0].strikeout);
        assert_eq!(
            table.rows[1][2].spans[0].link_href,
            Some("http://x.com".to_string())
        );
    }

    #[test]
    fn test_parse_markdown_mixed_content_with_table() {
        let md = "Before\n\n| A | B |\n|---|---|\n| 1 | 2 |\n\nAfter";
        let elements = parse_markdown(md);
        assert_eq!(elements.len(), 3);
        assert!(matches!(&elements[0], ParsedElement::Block(_)));
        assert!(matches!(&elements[1], ParsedElement::Table(_)));
        assert!(matches!(&elements[2], ParsedElement::Block(_)));
    }
}