1use crate::entities::{ListStyle, TextDirection};
2
/// A run of text that shares a single set of inline formatting attributes.
///
/// Both the Markdown and the HTML parser in this file emit their text as
/// sequences of these spans.
#[derive(Debug, Clone, Default)]
pub struct ParsedSpan {
    /// The text of the run. May be empty: the parsers use an empty span as a
    /// placeholder (for example for an HTML `<br>` inside a block).
    pub text: String,
    /// Bold text (Markdown strong emphasis, HTML `<b>` / `<strong>`).
    pub bold: bool,
    /// Italic text (Markdown emphasis, HTML `<i>` / `<em>`).
    pub italic: bool,
    /// Underlined text (HTML `<u>` / `<ins>`); the Markdown parser never sets it.
    pub underline: bool,
    /// Struck-out text (Markdown strikethrough, HTML `<s>` / `<del>` / `<strike>`).
    pub strikeout: bool,
    /// Code text (Markdown inline code and code-block text, HTML `<code>`).
    pub code: bool,
    /// Link destination when the run is inside a link, otherwise `None`.
    pub link_href: Option<String>,
}
14
/// One cell of a parsed table.
#[derive(Debug, Clone)]
pub struct ParsedTableCell {
    /// Inline content of the cell, in document order.
    pub spans: Vec<ParsedSpan>,
}
20
/// A table extracted from Markdown or HTML input.
#[derive(Debug, Clone)]
pub struct ParsedTable {
    /// Number of rows that were recognised as header rows.
    pub header_rows: usize,
    /// Every row of the table in the order it was encountered; header rows
    /// are stored here together with the body rows.
    pub rows: Vec<Vec<ParsedTableCell>>,
}
29
/// A top-level element produced by the parsers: either a text block or a table.
#[derive(Debug, Clone)]
pub enum ParsedElement {
    /// A paragraph-like block of spans with optional block-level formatting.
    Block(ParsedBlock),
    /// A table made of rows of cells.
    Table(ParsedTable),
}
36
37impl ParsedElement {
38 pub fn flatten_to_blocks(elements: Vec<ParsedElement>) -> Vec<ParsedBlock> {
41 let mut blocks = Vec::new();
42 for elem in elements {
43 match elem {
44 ParsedElement::Block(b) => blocks.push(b),
45 ParsedElement::Table(t) => {
46 for row in t.rows {
47 for cell in row {
48 blocks.push(ParsedBlock {
49 spans: cell.spans,
50 heading_level: None,
51 list_style: None,
52 list_indent: 0,
53 is_code_block: false,
54 code_language: None,
55 blockquote_depth: 0,
56 line_height: None,
57 non_breakable_lines: None,
58 direction: None,
59 background_color: None,
60 });
61 }
62 }
63 }
64 }
65 }
66 if blocks.is_empty() {
67 blocks.push(ParsedBlock {
68 spans: vec![ParsedSpan {
69 text: String::new(),
70 ..Default::default()
71 }],
72 heading_level: None,
73 list_style: None,
74 list_indent: 0,
75 is_code_block: false,
76 code_language: None,
77 blockquote_depth: 0,
78 line_height: None,
79 non_breakable_lines: None,
80 direction: None,
81 background_color: None,
82 });
83 }
84 blocks
85 }
86}
87
/// A paragraph-like unit of parsed content together with its block-level formatting.
#[derive(Debug, Clone)]
pub struct ParsedBlock {
    /// Inline content of the block, in document order.
    pub spans: Vec<ParsedSpan>,
    /// Heading level 1–6 when the block is a heading, otherwise `None`.
    pub heading_level: Option<i64>,
    /// List marker style when the block is a list item, otherwise `None`.
    pub list_style: Option<ListStyle>,
    /// Zero-based list nesting depth; meaningful together with `list_style`.
    pub list_indent: u32,
    /// `true` for code blocks (Markdown code blocks, HTML `<pre>`).
    pub is_code_block: bool,
    /// Language tag of a code block, when one was given.
    pub code_language: Option<String>,
    /// Number of enclosing block quotes.
    pub blockquote_depth: u32,
    /// Line height taken from the CSS `line-height` property, scaled by 1000.
    /// Only the HTML parser fills this in.
    pub line_height: Option<i64>,
    /// `Some(true)` when CSS `white-space` is `pre`, `nowrap` or `pre-wrap`.
    /// Only the HTML parser fills this in.
    pub non_breakable_lines: Option<bool>,
    /// Text direction taken from the CSS `direction` property.
    /// Only the HTML parser fills this in.
    pub direction: Option<TextDirection>,
    /// Raw value of the CSS `background-color` / `background` property.
    /// Only the HTML parser fills this in.
    pub background_color: Option<String>,
}
103
104impl ParsedBlock {
105 pub fn is_inline_only(&self) -> bool {
108 self.heading_level.is_none()
109 && self.list_style.is_none()
110 && !self.is_code_block
111 && self.blockquote_depth == 0
112 && self.line_height.is_none()
113 && self.non_breakable_lines.is_none()
114 && self.direction.is_none()
115 && self.background_color.is_none()
116 }
117}
118
119pub fn parse_markdown(markdown: &str) -> Vec<ParsedElement> {
122 use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
123
124 let options =
125 Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_TASKLISTS;
126 let parser = Parser::new_ext(markdown, options);
127
128 let mut elements: Vec<ParsedElement> = Vec::new();
129 let mut current_spans: Vec<ParsedSpan> = Vec::new();
130 let mut current_heading: Option<i64> = None;
131 let mut current_list_style: Option<ListStyle> = None;
132 let mut is_code_block = false;
133 let mut code_language: Option<String> = None;
134 let mut blockquote_depth: u32 = 0;
135 let mut in_block = false;
136
137 let mut bold = false;
139 let mut italic = false;
140 let mut strikeout = false;
141 let mut link_href: Option<String> = None;
142
143 let mut list_stack: Vec<Option<ListStyle>> = Vec::new();
145 let mut current_list_indent: u32 = 0;
146
147 let mut in_table = false;
149 let mut in_table_head = false;
150 let mut table_rows: Vec<Vec<ParsedTableCell>> = Vec::new();
151 let mut current_row_cells: Vec<ParsedTableCell> = Vec::new();
152 let mut current_cell_spans: Vec<ParsedSpan> = Vec::new();
153 let mut table_header_rows: usize = 0;
154
155 for event in parser {
156 match event {
157 Event::Start(Tag::Paragraph) => {
158 in_block = true;
159 current_heading = None;
160 is_code_block = false;
161 }
162 Event::End(TagEnd::Paragraph) => {
163 if !current_spans.is_empty() || in_block {
164 elements.push(ParsedElement::Block(ParsedBlock {
165 spans: std::mem::take(&mut current_spans),
166 heading_level: current_heading.take(),
167 list_style: current_list_style.clone(),
168 list_indent: current_list_indent,
169 is_code_block: false,
170 code_language: None,
171 blockquote_depth,
172 line_height: None,
173 non_breakable_lines: None,
174 direction: None,
175 background_color: None,
176 }));
177 }
178 in_block = false;
179 current_list_style = None;
180 }
181 Event::Start(Tag::Heading { level, .. }) => {
182 in_block = true;
183 current_heading = Some(heading_level_to_i64(level));
184 is_code_block = false;
185 }
186 Event::End(TagEnd::Heading(_)) => {
187 elements.push(ParsedElement::Block(ParsedBlock {
188 spans: std::mem::take(&mut current_spans),
189 heading_level: current_heading.take(),
190 list_style: None,
191 list_indent: 0,
192 is_code_block: false,
193 code_language: None,
194 blockquote_depth,
195 line_height: None,
196 non_breakable_lines: None,
197 direction: None,
198 background_color: None,
199 }));
200 in_block = false;
201 }
202 Event::Start(Tag::List(ordered)) => {
203 let style = if ordered.is_some() {
204 Some(ListStyle::Decimal)
205 } else {
206 Some(ListStyle::Disc)
207 };
208 list_stack.push(style);
209 }
210 Event::End(TagEnd::List(_)) => {
211 list_stack.pop();
212 }
213 Event::Start(Tag::Item) => {
214 if !current_spans.is_empty() {
217 elements.push(ParsedElement::Block(ParsedBlock {
218 spans: std::mem::take(&mut current_spans),
219 heading_level: None,
220 list_style: current_list_style.clone(),
221 list_indent: current_list_indent,
222 is_code_block: false,
223 code_language: None,
224 blockquote_depth,
225 line_height: None,
226 non_breakable_lines: None,
227 direction: None,
228 background_color: None,
229 }));
230 }
231 in_block = true;
232 current_list_style = list_stack.last().cloned().flatten();
233 current_list_indent = if list_stack.is_empty() {
234 0
235 } else {
236 (list_stack.len() - 1) as u32
237 };
238 }
239 Event::End(TagEnd::Item) => {
240 if !current_spans.is_empty() {
243 elements.push(ParsedElement::Block(ParsedBlock {
244 spans: std::mem::take(&mut current_spans),
245 heading_level: None,
246 list_style: current_list_style.clone(),
247 list_indent: current_list_indent,
248 is_code_block: false,
249 code_language: None,
250 blockquote_depth,
251 line_height: None,
252 non_breakable_lines: None,
253 direction: None,
254 background_color: None,
255 }));
256 }
257 in_block = false;
258 current_list_style = None;
259 }
260 Event::Start(Tag::CodeBlock(kind)) => {
261 in_block = true;
262 is_code_block = true;
263 code_language = match &kind {
264 pulldown_cmark::CodeBlockKind::Fenced(lang) if !lang.is_empty() => {
265 Some(lang.to_string())
266 }
267 _ => None,
268 };
269 }
270 Event::End(TagEnd::CodeBlock) => {
271 if let Some(last) = current_spans.last_mut()
273 && last.text.ends_with('\n')
274 {
275 last.text.truncate(last.text.len() - 1);
276 }
277 elements.push(ParsedElement::Block(ParsedBlock {
278 spans: std::mem::take(&mut current_spans),
279 heading_level: None,
280 list_style: None,
281 list_indent: 0,
282 is_code_block: true,
283 code_language: code_language.take(),
284 blockquote_depth,
285 line_height: None,
286 non_breakable_lines: None,
287 direction: None,
288 background_color: None,
289 }));
290 in_block = false;
291 is_code_block = false;
292 }
293 Event::Start(Tag::Table(_)) => {
295 in_table = true;
296 in_table_head = false;
297 table_rows.clear();
298 current_row_cells.clear();
299 current_cell_spans.clear();
300 table_header_rows = 0;
301 }
302 Event::End(TagEnd::Table) => {
303 elements.push(ParsedElement::Table(ParsedTable {
304 header_rows: table_header_rows,
305 rows: std::mem::take(&mut table_rows),
306 }));
307 in_table = false;
308 }
309 Event::Start(Tag::TableHead) => {
310 in_table_head = true;
311 current_row_cells.clear();
312 }
313 Event::End(TagEnd::TableHead) => {
314 table_rows.push(std::mem::take(&mut current_row_cells));
316 table_header_rows += 1;
317 in_table_head = false;
318 }
319 Event::Start(Tag::TableRow) => {
320 current_row_cells.clear();
321 }
322 Event::End(TagEnd::TableRow) if !in_table_head => {
323 table_rows.push(std::mem::take(&mut current_row_cells));
325 }
326 Event::Start(Tag::TableCell) => {
327 current_cell_spans.clear();
328 }
329 Event::End(TagEnd::TableCell) => {
330 current_row_cells.push(ParsedTableCell {
331 spans: std::mem::take(&mut current_cell_spans),
332 });
333 }
334 Event::Start(Tag::Emphasis) => {
336 italic = true;
337 }
338 Event::End(TagEnd::Emphasis) => {
339 italic = false;
340 }
341 Event::Start(Tag::Strong) => {
342 bold = true;
343 }
344 Event::End(TagEnd::Strong) => {
345 bold = false;
346 }
347 Event::Start(Tag::Strikethrough) => {
348 strikeout = true;
349 }
350 Event::End(TagEnd::Strikethrough) => {
351 strikeout = false;
352 }
353 Event::Start(Tag::Link { dest_url, .. }) => {
354 link_href = Some(dest_url.to_string());
355 }
356 Event::End(TagEnd::Link) => {
357 link_href = None;
358 }
359 Event::Text(text) => {
360 let span = ParsedSpan {
361 text: text.to_string(),
362 bold,
363 italic,
364 underline: false,
365 strikeout,
366 code: is_code_block,
367 link_href: link_href.clone(),
368 };
369 if in_table {
370 current_cell_spans.push(span);
371 } else {
372 if !in_block {
373 in_block = true;
374 }
375 current_spans.push(span);
376 }
377 }
378 Event::Code(text) => {
379 let span = ParsedSpan {
380 text: text.to_string(),
381 bold,
382 italic,
383 underline: false,
384 strikeout,
385 code: true,
386 link_href: link_href.clone(),
387 };
388 if in_table {
389 current_cell_spans.push(span);
390 } else {
391 if !in_block {
392 in_block = true;
393 }
394 current_spans.push(span);
395 }
396 }
397 Event::SoftBreak => {
398 let span = ParsedSpan {
399 text: " ".to_string(),
400 bold,
401 italic,
402 underline: false,
403 strikeout,
404 code: false,
405 link_href: link_href.clone(),
406 };
407 if in_table {
408 current_cell_spans.push(span);
409 } else {
410 current_spans.push(span);
411 }
412 }
413 Event::HardBreak if !current_spans.is_empty() || in_block => {
414 elements.push(ParsedElement::Block(ParsedBlock {
416 spans: std::mem::take(&mut current_spans),
417 heading_level: current_heading.take(),
418 list_style: current_list_style.clone(),
419 list_indent: current_list_indent,
420 is_code_block,
421 code_language: code_language.clone(),
422 blockquote_depth,
423 line_height: None,
424 non_breakable_lines: None,
425 direction: None,
426 background_color: None,
427 }));
428 }
429 Event::Start(Tag::BlockQuote(_)) => {
430 blockquote_depth += 1;
431 }
432 Event::End(TagEnd::BlockQuote(_)) => {
433 blockquote_depth = blockquote_depth.saturating_sub(1);
434 }
435 _ => {}
436 }
437 }
438
439 if !current_spans.is_empty() {
441 elements.push(ParsedElement::Block(ParsedBlock {
442 spans: std::mem::take(&mut current_spans),
443 heading_level: current_heading,
444 list_style: current_list_style,
445 list_indent: current_list_indent,
446 is_code_block,
447 code_language: code_language.take(),
448 blockquote_depth,
449 line_height: None,
450 non_breakable_lines: None,
451 direction: None,
452 background_color: None,
453 }));
454 }
455
456 if elements.is_empty() {
458 elements.push(ParsedElement::Block(ParsedBlock {
459 spans: vec![ParsedSpan {
460 text: String::new(),
461 ..Default::default()
462 }],
463 heading_level: None,
464 list_style: None,
465 list_indent: 0,
466 is_code_block: false,
467 code_language: None,
468 blockquote_depth: 0,
469 line_height: None,
470 non_breakable_lines: None,
471 direction: None,
472 background_color: None,
473 }));
474 }
475
476 elements
477}
478
479fn heading_level_to_i64(level: pulldown_cmark::HeadingLevel) -> i64 {
480 use pulldown_cmark::HeadingLevel;
481 match level {
482 HeadingLevel::H1 => 1,
483 HeadingLevel::H2 => 2,
484 HeadingLevel::H3 => 3,
485 HeadingLevel::H4 => 4,
486 HeadingLevel::H5 => 5,
487 HeadingLevel::H6 => 6,
488 }
489}
490
491use scraper::Node;
494
/// Block-level attributes extracted from an inline CSS `style` attribute.
///
/// Every field stays `None` unless the corresponding property was present
/// and understood.
#[derive(Debug, Clone, Default)]
struct BlockStyles {
    /// Value of `line-height`, multiplied by 1000.
    line_height: Option<i64>,
    /// `Some(true)` when `white-space` is `pre`, `nowrap` or `pre-wrap`.
    non_breakable_lines: Option<bool>,
    /// Value of `direction` (`rtl` or `ltr`).
    direction: Option<TextDirection>,
    /// Raw value of `background-color` or `background`.
    background_color: Option<String>,
}
503
504fn parse_block_styles(style: &str) -> BlockStyles {
507 let mut result = BlockStyles::default();
508 for part in style.split(';') {
509 let part = part.trim();
510 if let Some((prop, val)) = part.split_once(':') {
511 let prop = prop.trim().to_ascii_lowercase();
512 let val = val.trim();
513 match prop.as_str() {
514 "line-height" => {
515 if let Ok(v) = val.parse::<f64>() {
517 result.line_height = Some((v * 1000.0) as i64);
518 }
519 }
520 "white-space" if val == "pre" || val == "nowrap" || val == "pre-wrap" => {
521 result.non_breakable_lines = Some(true);
522 }
523 "direction" => {
524 if val.eq_ignore_ascii_case("rtl") {
525 result.direction = Some(TextDirection::RightToLeft);
526 } else if val.eq_ignore_ascii_case("ltr") {
527 result.direction = Some(TextDirection::LeftToRight);
528 }
529 }
530 "background-color" | "background" => {
531 result.background_color = Some(val.to_string());
532 }
533 _ => {}
534 }
535 }
536 }
537 result
538}
539
540pub fn parse_html(html: &str) -> Vec<ParsedBlock> {
541 ParsedElement::flatten_to_blocks(parse_html_elements(html))
542}
543
544pub fn parse_html_elements(html: &str) -> Vec<ParsedElement> {
545 use scraper::Html;
546
547 let fragment = Html::parse_fragment(html);
548 let mut elements: Vec<ParsedElement> = Vec::new();
549
550 let root = fragment.root_element();
552
553 #[derive(Clone, Default)]
554 struct FmtState {
555 bold: bool,
556 italic: bool,
557 underline: bool,
558 strikeout: bool,
559 code: bool,
560 link_href: Option<String>,
561 }
562
563 const MAX_RECURSION_DEPTH: usize = 256;
564
565 fn collect_cell_spans(
567 node: ego_tree::NodeRef<Node>,
568 state: &FmtState,
569 spans: &mut Vec<ParsedSpan>,
570 depth: usize,
571 ) {
572 if depth > MAX_RECURSION_DEPTH {
573 return;
574 }
575 for child in node.children() {
576 match child.value() {
577 Node::Text(text) => {
578 let t = text.text.to_string();
579 if !t.is_empty() {
580 spans.push(ParsedSpan {
581 text: t,
582 bold: state.bold,
583 italic: state.italic,
584 underline: state.underline,
585 strikeout: state.strikeout,
586 code: state.code,
587 link_href: state.link_href.clone(),
588 });
589 }
590 }
591 Node::Element(el) => {
592 let tag = el.name();
593 let mut new_state = state.clone();
594 match tag {
595 "b" | "strong" => new_state.bold = true,
596 "i" | "em" => new_state.italic = true,
597 "u" | "ins" => new_state.underline = true,
598 "s" | "del" | "strike" => new_state.strikeout = true,
599 "code" => new_state.code = true,
600 "a" => {
601 if let Some(href) = el.attr("href") {
602 new_state.link_href = Some(href.to_string());
603 }
604 }
605 _ => {}
606 }
607 collect_cell_spans(child, &new_state, spans, depth + 1);
608 }
609 _ => {}
610 }
611 }
612 }
613
614 fn parse_table_element(table_node: ego_tree::NodeRef<Node>) -> ParsedTable {
616 let mut rows: Vec<Vec<ParsedTableCell>> = Vec::new();
617 let mut header_rows: usize = 0;
618
619 fn collect_rows(
620 node: ego_tree::NodeRef<Node>,
621 rows: &mut Vec<Vec<ParsedTableCell>>,
622 header_rows: &mut usize,
623 in_thead: bool,
624 ) {
625 for child in node.children() {
626 if let Node::Element(el) = child.value() {
627 match el.name() {
628 "thead" => collect_rows(child, rows, header_rows, true),
629 "tbody" | "tfoot" => collect_rows(child, rows, header_rows, false),
630 "tr" => {
631 let mut cells: Vec<ParsedTableCell> = Vec::new();
632 for td in child.children() {
633 if let Node::Element(td_el) = td.value()
634 && matches!(td_el.name(), "td" | "th")
635 {
636 let mut spans = Vec::new();
637 let state = FmtState::default();
638 collect_cell_spans(td, &state, &mut spans, 0);
639 if spans.is_empty() {
640 spans.push(ParsedSpan::default());
641 }
642 cells.push(ParsedTableCell { spans });
643 }
644 }
645 if !cells.is_empty() {
646 rows.push(cells);
647 if in_thead {
648 *header_rows += 1;
649 }
650 }
651 }
652 _ => {}
653 }
654 }
655 }
656 }
657
658 collect_rows(table_node, &mut rows, &mut header_rows, false);
659
660 if header_rows == 0 && !rows.is_empty() {
662 header_rows = 1;
663 }
664
665 ParsedTable { header_rows, rows }
666 }
667
668 fn walk_node(
669 node: ego_tree::NodeRef<Node>,
670 state: &FmtState,
671 elements: &mut Vec<ParsedElement>,
672 current_list_style: &Option<ListStyle>,
673 blockquote_depth: u32,
674 list_depth: u32,
675 depth: usize,
676 ) {
677 if depth > MAX_RECURSION_DEPTH {
678 return;
679 }
680 match node.value() {
681 Node::Element(el) => {
682 let tag = el.name();
683 let mut new_state = state.clone();
684 let mut new_list_style = current_list_style.clone();
685 let mut bq_depth = blockquote_depth;
686 let mut new_list_depth = list_depth;
687
688 let is_block_tag = matches!(
690 tag,
691 "p" | "div"
692 | "h1"
693 | "h2"
694 | "h3"
695 | "h4"
696 | "h5"
697 | "h6"
698 | "li"
699 | "pre"
700 | "br"
701 | "blockquote"
702 | "body"
703 | "html"
704 );
705
706 match tag {
708 "b" | "strong" => new_state.bold = true,
709 "i" | "em" => new_state.italic = true,
710 "u" | "ins" => new_state.underline = true,
711 "s" | "del" | "strike" => new_state.strikeout = true,
712 "code" => new_state.code = true,
713 "a" => {
714 if let Some(href) = el.attr("href") {
715 new_state.link_href = Some(href.to_string());
716 }
717 }
718 "ul" => {
719 new_list_style = Some(ListStyle::Disc);
720 new_list_depth = list_depth + 1;
721 }
722 "ol" => {
723 new_list_style = Some(ListStyle::Decimal);
724 new_list_depth = list_depth + 1;
725 }
726 "blockquote" => {
727 bq_depth += 1;
728 }
729 _ => {}
730 }
731
732 let heading_level = match tag {
734 "h1" => Some(1),
735 "h2" => Some(2),
736 "h3" => Some(3),
737 "h4" => Some(4),
738 "h5" => Some(5),
739 "h6" => Some(6),
740 _ => None,
741 };
742
743 let is_code_block = tag == "pre";
744
745 let code_language = if is_code_block {
747 node.children().find_map(|child| {
748 if let Node::Element(cel) = child.value()
749 && cel.name() == "code"
750 && let Some(cls) = cel.attr("class")
751 {
752 return cls
753 .split_whitespace()
754 .find_map(|c| c.strip_prefix("language-"))
755 .map(|l| l.to_string());
756 }
757 None
758 })
759 } else {
760 None
761 };
762
763 let css = if is_block_tag {
765 el.attr("style").map(parse_block_styles).unwrap_or_default()
766 } else {
767 BlockStyles::default()
768 };
769
770 if tag == "table" {
771 let parsed_table = parse_table_element(node);
773 if !parsed_table.rows.is_empty() {
774 elements.push(ParsedElement::Table(parsed_table));
775 }
776 return;
777 }
778
779 if tag == "br" {
780 elements.push(ParsedElement::Block(ParsedBlock {
782 spans: vec![ParsedSpan {
783 text: String::new(),
784 ..Default::default()
785 }],
786 heading_level: None,
787 list_style: None,
788 list_indent: 0,
789 is_code_block: false,
790 code_language: None,
791 blockquote_depth: bq_depth,
792 line_height: None,
793 non_breakable_lines: None,
794 direction: None,
795 background_color: None,
796 }));
797 return;
798 }
799
800 if tag == "blockquote" {
801 for child in node.children() {
803 walk_node(
804 child,
805 &new_state,
806 elements,
807 &new_list_style,
808 bq_depth,
809 new_list_depth,
810 depth + 1,
811 );
812 }
813 } else if is_block_tag && tag != "br" {
814 let mut spans: Vec<ParsedSpan> = Vec::new();
819 let mut nested_elements: Vec<ParsedElement> = Vec::new();
820 collect_inline_spans(
821 node,
822 &new_state,
823 &mut spans,
824 &new_list_style,
825 &mut nested_elements,
826 bq_depth,
827 new_list_depth,
828 depth + 1,
829 );
830
831 let list_style_for_block = if tag == "li" {
832 new_list_style.clone()
833 } else {
834 None
835 };
836
837 let list_indent_for_block = if tag == "li" {
838 new_list_depth.saturating_sub(1)
839 } else {
840 0
841 };
842
843 if !spans.is_empty() || heading_level.is_some() {
844 elements.push(ParsedElement::Block(ParsedBlock {
845 spans,
846 heading_level,
847 list_style: list_style_for_block,
848 list_indent: list_indent_for_block,
849 is_code_block,
850 code_language,
851 blockquote_depth: bq_depth,
852 line_height: css.line_height,
853 non_breakable_lines: css.non_breakable_lines,
854 direction: css.direction,
855 background_color: css.background_color,
856 }));
857 }
858 elements.append(&mut nested_elements);
860 } else if matches!(tag, "ul" | "ol" | "thead" | "tbody" | "tr") {
861 for child in node.children() {
863 walk_node(
864 child,
865 &new_state,
866 elements,
867 &new_list_style,
868 bq_depth,
869 new_list_depth,
870 depth + 1,
871 );
872 }
873 } else {
874 for child in node.children() {
876 walk_node(
877 child,
878 &new_state,
879 elements,
880 current_list_style,
881 bq_depth,
882 list_depth,
883 depth + 1,
884 );
885 }
886 }
887 }
888 Node::Text(text) => {
889 let t = text.text.to_string();
890 let trimmed = t.trim();
891 if !trimmed.is_empty() {
892 elements.push(ParsedElement::Block(ParsedBlock {
894 spans: vec![ParsedSpan {
895 text: trimmed.to_string(),
896 bold: state.bold,
897 italic: state.italic,
898 underline: state.underline,
899 strikeout: state.strikeout,
900 code: state.code,
901 link_href: state.link_href.clone(),
902 }],
903 heading_level: None,
904 list_style: None,
905 list_indent: 0,
906 is_code_block: false,
907 code_language: None,
908 blockquote_depth,
909 line_height: None,
910 non_breakable_lines: None,
911 direction: None,
912 background_color: None,
913 }));
914 }
915 }
916 _ => {
917 for child in node.children() {
919 walk_node(
920 child,
921 state,
922 elements,
923 current_list_style,
924 blockquote_depth,
925 list_depth,
926 depth + 1,
927 );
928 }
929 }
930 }
931 }
932
933 #[allow(clippy::too_many_arguments)]
937 fn collect_inline_spans(
938 node: ego_tree::NodeRef<Node>,
939 state: &FmtState,
940 spans: &mut Vec<ParsedSpan>,
941 current_list_style: &Option<ListStyle>,
942 elements: &mut Vec<ParsedElement>,
943 blockquote_depth: u32,
944 list_depth: u32,
945 depth: usize,
946 ) {
947 if depth > MAX_RECURSION_DEPTH {
948 return;
949 }
950 for child in node.children() {
951 match child.value() {
952 Node::Text(text) => {
953 let t = text.text.to_string();
954 if !t.is_empty() {
955 spans.push(ParsedSpan {
956 text: t,
957 bold: state.bold,
958 italic: state.italic,
959 underline: state.underline,
960 strikeout: state.strikeout,
961 code: state.code,
962 link_href: state.link_href.clone(),
963 });
964 }
965 }
966 Node::Element(el) => {
967 let tag = el.name();
968 let mut new_state = state.clone();
969
970 match tag {
971 "b" | "strong" => new_state.bold = true,
972 "i" | "em" => new_state.italic = true,
973 "u" | "ins" => new_state.underline = true,
974 "s" | "del" | "strike" => new_state.strikeout = true,
975 "code" => new_state.code = true,
976 "a" => {
977 if let Some(href) = el.attr("href") {
978 new_state.link_href = Some(href.to_string());
979 }
980 }
981 _ => {}
982 }
983
984 let nested_block = matches!(
986 tag,
987 "p" | "div"
988 | "h1"
989 | "h2"
990 | "h3"
991 | "h4"
992 | "h5"
993 | "h6"
994 | "li"
995 | "pre"
996 | "blockquote"
997 | "ul"
998 | "ol"
999 );
1000
1001 if tag == "br" {
1002 spans.push(ParsedSpan {
1005 text: String::new(),
1006 ..Default::default()
1007 });
1008 } else if nested_block || tag == "table" {
1009 walk_node(
1011 child,
1012 &new_state,
1013 elements,
1014 current_list_style,
1015 blockquote_depth,
1016 list_depth,
1017 depth + 1,
1018 );
1019 } else {
1020 collect_inline_spans(
1022 child,
1023 &new_state,
1024 spans,
1025 current_list_style,
1026 elements,
1027 blockquote_depth,
1028 list_depth,
1029 depth + 1,
1030 );
1031 }
1032 }
1033 _ => {}
1034 }
1035 }
1036 }
1037
1038 let initial_state = FmtState::default();
1039 let mut root_spans: Vec<ParsedSpan> = Vec::new();
1043 collect_inline_spans(
1044 *root,
1045 &initial_state,
1046 &mut root_spans,
1047 &None,
1048 &mut elements,
1049 0,
1050 0,
1051 0,
1052 );
1053 if !root_spans.is_empty() {
1054 elements.push(ParsedElement::Block(ParsedBlock {
1055 spans: root_spans,
1056 heading_level: None,
1057 list_style: None,
1058 list_indent: 0,
1059 is_code_block: false,
1060 code_language: None,
1061 blockquote_depth: 0,
1062 line_height: None,
1063 non_breakable_lines: None,
1064 direction: None,
1065 background_color: None,
1066 }));
1067 }
1068
1069 if elements.is_empty() {
1071 elements.push(ParsedElement::Block(ParsedBlock {
1072 spans: vec![ParsedSpan {
1073 text: String::new(),
1074 ..Default::default()
1075 }],
1076 heading_level: None,
1077 list_style: None,
1078 list_indent: 0,
1079 is_code_block: false,
1080 code_language: None,
1081 blockquote_depth: 0,
1082 line_height: None,
1083 non_breakable_lines: None,
1084 direction: None,
1085 background_color: None,
1086 }));
1087 }
1088
1089 elements
1090}
1091
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses Markdown and flattens the result into plain blocks.
    fn parse_markdown_blocks(md: &str) -> Vec<ParsedBlock> {
        let elements = parse_markdown(md);
        ParsedElement::flatten_to_blocks(elements)
    }

    /// Returns the first span of `block` whose text is exactly `text`.
    fn span_with_text<'a>(block: &'a ParsedBlock, text: &str) -> &'a ParsedSpan {
        block.spans.iter().find(|s| s.text == text).unwrap()
    }

    /// Asserts that `elements` holds exactly one element and that it is a table.
    fn expect_single_table(elements: &[ParsedElement]) -> &ParsedTable {
        assert_eq!(elements.len(), 1);
        match &elements[0] {
            ParsedElement::Table(table) => table,
            _ => panic!("Expected ParsedElement::Table"),
        }
    }

    // ---- Markdown: inline formatting and basic blocks ----

    #[test]
    fn test_parse_markdown_simple_paragraph() {
        let blocks = parse_markdown_blocks("Hello **world**");
        assert_eq!(blocks.len(), 1);
        let paragraph = &blocks[0];
        assert!(paragraph.spans.len() >= 2);

        let plain_span = paragraph
            .spans
            .iter()
            .find(|s| s.text.contains("Hello"))
            .unwrap();
        assert!(!plain_span.bold);

        assert!(span_with_text(paragraph, "world").bold);
    }

    #[test]
    fn test_parse_markdown_heading() {
        let blocks = parse_markdown_blocks("# Title");
        assert_eq!(blocks.len(), 1);
        let heading = &blocks[0];
        assert_eq!(heading.heading_level, Some(1));
        assert_eq!(heading.spans[0].text, "Title");
    }

    #[test]
    fn test_parse_markdown_list() {
        let blocks = parse_markdown_blocks("- item1\n- item2");
        assert!(blocks.len() >= 2);
        for item in &blocks[..2] {
            assert_eq!(item.list_style, Some(ListStyle::Disc));
        }
    }

    // ---- HTML: basic blocks ----

    #[test]
    fn test_parse_html_simple() {
        let blocks = parse_html("<p>Hello <b>world</b></p>");
        assert_eq!(blocks.len(), 1);
        let paragraph = &blocks[0];
        assert!(paragraph.spans.len() >= 2);
        assert!(span_with_text(paragraph, "world").bold);
    }

    #[test]
    fn test_parse_html_multiple_paragraphs() {
        assert_eq!(parse_html("<p>A</p><p>B</p>").len(), 2);
    }

    #[test]
    fn test_parse_html_heading() {
        let blocks = parse_html("<h2>Subtitle</h2>");
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].heading_level, Some(2));
    }

    #[test]
    fn test_parse_html_list() {
        let blocks = parse_html("<ul><li>one</li><li>two</li></ul>");
        assert!(blocks.len() >= 2);
        assert_eq!(blocks[0].list_style, Some(ListStyle::Disc));
    }

    #[test]
    fn test_parse_markdown_code_block() {
        let blocks = parse_markdown_blocks("```\nfn main() {}\n```");
        assert_eq!(blocks.len(), 1);
        let code_block = &blocks[0];
        assert!(code_block.is_code_block);
        assert!(code_block.spans[0].code);

        let mut text = String::new();
        for span in &code_block.spans {
            text.push_str(&span.text);
        }
        assert_eq!(
            text, "fn main() {}",
            "code block text should not have trailing newline"
        );
    }

    #[test]
    fn test_parse_markdown_nested_formatting() {
        let blocks = parse_markdown_blocks("***bold italic***");
        assert_eq!(blocks.len(), 1);
        let first = &blocks[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    #[test]
    fn test_parse_markdown_link() {
        let blocks = parse_markdown_blocks("[click](http://example.com)");
        assert_eq!(blocks.len(), 1);
        let first = &blocks[0].spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href, Some("http://example.com".to_string()));
    }

    // ---- Empty input ----

    #[test]
    fn test_parse_markdown_empty() {
        let blocks = parse_markdown_blocks("");
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].spans[0].text.is_empty());
    }

    #[test]
    fn test_parse_html_empty() {
        let blocks = parse_html("");
        assert_eq!(blocks.len(), 1);
        assert!(blocks[0].spans[0].text.is_empty());
    }

    #[test]
    fn test_parse_html_nested_formatting() {
        let blocks = parse_html("<p><b><i>bold italic</i></b></p>");
        assert_eq!(blocks.len(), 1);
        let first = &blocks[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    #[test]
    fn test_parse_html_link() {
        let blocks = parse_html("<p><a href=\"http://example.com\">click</a></p>");
        assert_eq!(blocks.len(), 1);
        let first = &blocks[0].spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href, Some("http://example.com".to_string()));
    }

    #[test]
    fn test_parse_html_ordered_list() {
        let blocks = parse_html("<ol><li>first</li><li>second</li></ol>");
        assert!(blocks.len() >= 2);
        assert_eq!(blocks[0].list_style, Some(ListStyle::Decimal));
    }

    #[test]
    fn test_parse_markdown_ordered_list() {
        let blocks = parse_markdown_blocks("1. first\n2. second");
        assert!(blocks.len() >= 2);
        assert_eq!(blocks[0].list_style, Some(ListStyle::Decimal));
    }

    #[test]
    fn test_parse_html_blockquote_nested() {
        let blocks = parse_html("<p>before</p><blockquote>quoted</blockquote><p>after</p>");
        assert!(blocks.len() >= 3);
    }

    // ---- Inline CSS parsing ----

    #[test]
    fn test_parse_block_styles_line_height() {
        assert_eq!(parse_block_styles("line-height: 1.5").line_height, Some(1500));
    }

    #[test]
    fn test_parse_block_styles_direction_rtl() {
        assert_eq!(
            parse_block_styles("direction: rtl").direction,
            Some(TextDirection::RightToLeft)
        );
    }

    #[test]
    fn test_parse_block_styles_background_color() {
        assert_eq!(
            parse_block_styles("background-color: #ff0000").background_color,
            Some("#ff0000".to_string())
        );
    }

    #[test]
    fn test_parse_block_styles_white_space_pre() {
        assert_eq!(
            parse_block_styles("white-space: pre").non_breakable_lines,
            Some(true)
        );
    }

    #[test]
    fn test_parse_block_styles_multiple() {
        let BlockStyles {
            line_height,
            direction,
            background_color,
            ..
        } = parse_block_styles("line-height: 2.0; direction: rtl; background-color: blue");
        assert_eq!(line_height, Some(2000));
        assert_eq!(direction, Some(TextDirection::RightToLeft));
        assert_eq!(background_color, Some("blue".to_string()));
    }

    #[test]
    fn test_parse_html_block_styles_extracted() {
        let blocks = parse_html(
            r#"<p style="line-height: 1.5; direction: rtl; background-color: #ccc">text</p>"#,
        );
        assert_eq!(blocks.len(), 1);
        let styled = &blocks[0];
        assert_eq!(styled.line_height, Some(1500));
        assert_eq!(styled.direction, Some(TextDirection::RightToLeft));
        assert_eq!(styled.background_color, Some("#ccc".to_string()));
    }

    #[test]
    fn test_parse_html_white_space_pre() {
        let blocks = parse_html(r#"<p style="white-space: pre">code</p>"#);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].non_breakable_lines, Some(true));
    }

    #[test]
    fn test_parse_html_no_styles_returns_none() {
        let blocks = parse_html("<p>plain</p>");
        assert_eq!(blocks.len(), 1);
        let plain = &blocks[0];
        assert_eq!(plain.line_height, None);
        assert_eq!(plain.direction, None);
        assert_eq!(plain.background_color, None);
        assert_eq!(plain.non_breakable_lines, None);
    }

    // ---- List nesting ----

    #[test]
    fn test_parse_markdown_nested_list_indent() {
        let md = "- top\n - nested\n - deep";
        let blocks = parse_markdown_blocks(md);
        assert_eq!(blocks.len(), 3);
        for (block, expected_indent) in blocks.iter().zip([0u32, 1, 2]) {
            assert_eq!(block.list_style, Some(ListStyle::Disc));
            assert_eq!(block.list_indent, expected_indent);
        }
    }

    #[test]
    fn test_parse_markdown_nested_ordered_list_indent() {
        let md = "1. first\n 1. nested\n 2. nested2";
        let blocks = parse_markdown_blocks(md);
        assert_eq!(blocks.len(), 3);
        for (block, expected_indent) in blocks.iter().zip([0u32, 1, 1]) {
            assert_eq!(block.list_indent, expected_indent);
        }
    }

    #[test]
    fn test_parse_html_nested_list_indent() {
        let html = "<ul><li>top</li><ul><li>nested</li></ul></ul>";
        let blocks = parse_html(html);
        assert!(blocks.len() >= 2);
        assert_eq!(blocks[0].list_indent, 0);
        assert_eq!(blocks[1].list_indent, 1);
    }

    // ---- Tables ----

    #[test]
    fn test_parse_markdown_table() {
        let md = "| A | B |\n|---|---|\n| 1 | 2 |";
        let elements = parse_markdown(md);
        let table = expect_single_table(&elements);

        assert_eq!(table.header_rows, 1);
        assert_eq!(table.rows.len(), 2);

        // First row is the header, second row is the body.
        let expected = [["A", "B"], ["1", "2"]];
        for (row, expected_row) in table.rows.iter().zip(expected) {
            assert_eq!(row.len(), 2);
            for (cell, expected_text) in row.iter().zip(expected_row) {
                assert_eq!(cell.spans[0].text, expected_text);
            }
        }
    }

    #[test]
    fn test_parse_markdown_table_with_formatting() {
        let md = "| **bold** | `code` | *italic* |\n|---|---|---|\n| ~~strike~~ | plain | [link](http://x.com) |";
        let elements = parse_markdown(md);
        let table = expect_single_table(&elements);

        assert_eq!(table.rows.len(), 2);

        let header = &table.rows[0];
        assert!(header[0].spans[0].bold);
        assert!(header[1].spans[0].code);
        assert!(header[2].spans[0].italic);

        let body = &table.rows[1];
        assert!(body[0].spans[0].strikeout);
        assert_eq!(
            body[2].spans[0].link_href,
            Some("http://x.com".to_string())
        );
    }

    #[test]
    fn test_parse_markdown_mixed_content_with_table() {
        let md = "Before\n\n| A | B |\n|---|---|\n| 1 | 2 |\n\nAfter";
        let elements = parse_markdown(md);
        assert_eq!(elements.len(), 3);
        assert!(matches!(&elements[0], ParsedElement::Block(_)));
        assert!(matches!(&elements[1], ParsedElement::Table(_)));
        assert!(matches!(&elements[2], ParsedElement::Block(_)));
    }
}