// text_document_common/parser_tools/content_parser.rs

1use crate::entities::{ListStyle, TextDirection};
2
/// A run of inline text together with the character-level formatting that
/// applies to all of it.
///
/// `Default` yields an empty, unformatted span. `PartialEq`/`Eq` are derived
/// so spans can be compared directly (e.g. in assertions).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ParsedSpan {
    /// The literal text of the span.
    pub text: String,
    /// Bold (`**x**` in Markdown, `<b>`/`<strong>` in HTML).
    pub bold: bool,
    /// Italic (`*x*` in Markdown, `<i>`/`<em>` in HTML).
    pub italic: bool,
    /// Underlined (`<u>`/`<ins>` in HTML; the Markdown parser never sets it).
    pub underline: bool,
    /// Struck through (`~~x~~` in Markdown, `<s>`/`<del>`/`<strike>` in HTML).
    pub strikeout: bool,
    /// Monospace code: inline code, or text that sits inside a code block.
    pub code: bool,
    /// Target of the enclosing hyperlink, if the span is inside one.
    pub link_href: Option<String>,
}
14
15/// A parsed table cell containing inline spans.
16#[derive(Debug, Clone)]
17pub struct ParsedTableCell {
18    pub spans: Vec<ParsedSpan>,
19}
20
21/// A parsed table extracted from markdown or HTML.
22#[derive(Debug, Clone)]
23pub struct ParsedTable {
24    /// Number of header rows (typically 1 for markdown tables).
25    pub header_rows: usize,
26    /// All rows (header + body), each containing cells with their inline spans.
27    pub rows: Vec<Vec<ParsedTableCell>>,
28}
29
30/// A parsed element: either a block or a table.
31#[derive(Debug, Clone)]
32pub enum ParsedElement {
33    Block(ParsedBlock),
34    Table(ParsedTable),
35}
36
37impl ParsedElement {
38    /// Extract blocks, flattening tables into one block per cell.
39    /// Use when table structure is not needed.
40    pub fn flatten_to_blocks(elements: Vec<ParsedElement>) -> Vec<ParsedBlock> {
41        let mut blocks = Vec::new();
42        for elem in elements {
43            match elem {
44                ParsedElement::Block(b) => blocks.push(b),
45                ParsedElement::Table(t) => {
46                    for row in t.rows {
47                        for cell in row {
48                            blocks.push(ParsedBlock {
49                                spans: cell.spans,
50                                heading_level: None,
51                                list_style: None,
52                                list_indent: 0,
53                                is_code_block: false,
54                                code_language: None,
55                                blockquote_depth: 0,
56                                line_height: None,
57                                non_breakable_lines: None,
58                                direction: None,
59                                background_color: None,
60                            });
61                        }
62                    }
63                }
64            }
65        }
66        if blocks.is_empty() {
67            blocks.push(ParsedBlock {
68                spans: vec![ParsedSpan {
69                    text: String::new(),
70                    ..Default::default()
71                }],
72                heading_level: None,
73                list_style: None,
74                list_indent: 0,
75                is_code_block: false,
76                code_language: None,
77                blockquote_depth: 0,
78                line_height: None,
79                non_breakable_lines: None,
80                direction: None,
81                background_color: None,
82            });
83        }
84        blocks
85    }
86}
87
88/// A parsed block (paragraph, heading, list item, code block)
89#[derive(Debug, Clone)]
90pub struct ParsedBlock {
91    pub spans: Vec<ParsedSpan>,
92    pub heading_level: Option<i64>,
93    pub list_style: Option<ListStyle>,
94    pub list_indent: u32,
95    pub is_code_block: bool,
96    pub code_language: Option<String>,
97    pub blockquote_depth: u32,
98    pub line_height: Option<i64>,
99    pub non_breakable_lines: Option<bool>,
100    pub direction: Option<TextDirection>,
101    pub background_color: Option<String>,
102}
103
104impl ParsedBlock {
105    /// Returns `true` when this block carries no block-level formatting,
106    /// meaning its content is purely inline.
107    pub fn is_inline_only(&self) -> bool {
108        self.heading_level.is_none()
109            && self.list_style.is_none()
110            && !self.is_code_block
111            && self.blockquote_depth == 0
112            && self.line_height.is_none()
113            && self.non_breakable_lines.is_none()
114            && self.direction.is_none()
115            && self.background_color.is_none()
116    }
117}
118
119// ─── Markdown parsing ────────────────────────────────────────────────
120
121pub fn parse_markdown(markdown: &str) -> Vec<ParsedElement> {
122    use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
123
124    let options =
125        Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_TASKLISTS;
126    let parser = Parser::new_ext(markdown, options);
127
128    let mut elements: Vec<ParsedElement> = Vec::new();
129    let mut current_spans: Vec<ParsedSpan> = Vec::new();
130    let mut current_heading: Option<i64> = None;
131    let mut current_list_style: Option<ListStyle> = None;
132    let mut is_code_block = false;
133    let mut code_language: Option<String> = None;
134    let mut blockquote_depth: u32 = 0;
135    let mut in_block = false;
136
137    // Formatting state stack
138    let mut bold = false;
139    let mut italic = false;
140    let mut strikeout = false;
141    let mut link_href: Option<String> = None;
142
143    // List style stack for nested lists (also tracks nesting depth)
144    let mut list_stack: Vec<Option<ListStyle>> = Vec::new();
145    let mut current_list_indent: u32 = 0;
146
147    // Table tracking state
148    let mut in_table = false;
149    let mut in_table_head = false;
150    let mut table_rows: Vec<Vec<ParsedTableCell>> = Vec::new();
151    let mut current_row_cells: Vec<ParsedTableCell> = Vec::new();
152    let mut current_cell_spans: Vec<ParsedSpan> = Vec::new();
153    let mut table_header_rows: usize = 0;
154
155    for event in parser {
156        match event {
157            Event::Start(Tag::Paragraph) => {
158                in_block = true;
159                current_heading = None;
160                is_code_block = false;
161            }
162            Event::End(TagEnd::Paragraph) => {
163                if !current_spans.is_empty() || in_block {
164                    elements.push(ParsedElement::Block(ParsedBlock {
165                        spans: std::mem::take(&mut current_spans),
166                        heading_level: current_heading.take(),
167                        list_style: current_list_style.clone(),
168                        list_indent: current_list_indent,
169                        is_code_block: false,
170                        code_language: None,
171                        blockquote_depth,
172                        line_height: None,
173                        non_breakable_lines: None,
174                        direction: None,
175                        background_color: None,
176                    }));
177                }
178                in_block = false;
179                current_list_style = None;
180            }
181            Event::Start(Tag::Heading { level, .. }) => {
182                in_block = true;
183                current_heading = Some(heading_level_to_i64(level));
184                is_code_block = false;
185            }
186            Event::End(TagEnd::Heading(_)) => {
187                elements.push(ParsedElement::Block(ParsedBlock {
188                    spans: std::mem::take(&mut current_spans),
189                    heading_level: current_heading.take(),
190                    list_style: None,
191                    list_indent: 0,
192                    is_code_block: false,
193                    code_language: None,
194                    blockquote_depth,
195                    line_height: None,
196                    non_breakable_lines: None,
197                    direction: None,
198                    background_color: None,
199                }));
200                in_block = false;
201            }
202            Event::Start(Tag::List(ordered)) => {
203                let style = if ordered.is_some() {
204                    Some(ListStyle::Decimal)
205                } else {
206                    Some(ListStyle::Disc)
207                };
208                list_stack.push(style);
209            }
210            Event::End(TagEnd::List(_)) => {
211                list_stack.pop();
212            }
213            Event::Start(Tag::Item) => {
214                // Flush any accumulated spans from the parent item before
215                // starting a child item in a tight list
216                if !current_spans.is_empty() {
217                    elements.push(ParsedElement::Block(ParsedBlock {
218                        spans: std::mem::take(&mut current_spans),
219                        heading_level: None,
220                        list_style: current_list_style.clone(),
221                        list_indent: current_list_indent,
222                        is_code_block: false,
223                        code_language: None,
224                        blockquote_depth,
225                        line_height: None,
226                        non_breakable_lines: None,
227                        direction: None,
228                        background_color: None,
229                    }));
230                }
231                in_block = true;
232                current_list_style = list_stack.last().cloned().flatten();
233                current_list_indent = if list_stack.is_empty() {
234                    0
235                } else {
236                    (list_stack.len() - 1) as u32
237                };
238            }
239            Event::End(TagEnd::Item) => {
240                // The paragraph inside the item will have already been flushed,
241                // but if there was no inner paragraph (tight list), flush now.
242                if !current_spans.is_empty() {
243                    elements.push(ParsedElement::Block(ParsedBlock {
244                        spans: std::mem::take(&mut current_spans),
245                        heading_level: None,
246                        list_style: current_list_style.clone(),
247                        list_indent: current_list_indent,
248                        is_code_block: false,
249                        code_language: None,
250                        blockquote_depth,
251                        line_height: None,
252                        non_breakable_lines: None,
253                        direction: None,
254                        background_color: None,
255                    }));
256                }
257                in_block = false;
258                current_list_style = None;
259            }
260            Event::Start(Tag::CodeBlock(kind)) => {
261                in_block = true;
262                is_code_block = true;
263                code_language = match &kind {
264                    pulldown_cmark::CodeBlockKind::Fenced(lang) if !lang.is_empty() => {
265                        Some(lang.to_string())
266                    }
267                    _ => None,
268                };
269            }
270            Event::End(TagEnd::CodeBlock) => {
271                // pulldown-cmark appends a trailing '\n' to code block text — strip it
272                if let Some(last) = current_spans.last_mut()
273                    && last.text.ends_with('\n')
274                {
275                    last.text.truncate(last.text.len() - 1);
276                }
277                elements.push(ParsedElement::Block(ParsedBlock {
278                    spans: std::mem::take(&mut current_spans),
279                    heading_level: None,
280                    list_style: None,
281                    list_indent: 0,
282                    is_code_block: true,
283                    code_language: code_language.take(),
284                    blockquote_depth,
285                    line_height: None,
286                    non_breakable_lines: None,
287                    direction: None,
288                    background_color: None,
289                }));
290                in_block = false;
291                is_code_block = false;
292            }
293            // ─── Table events ───────────────────────────────────────
294            Event::Start(Tag::Table(_)) => {
295                in_table = true;
296                in_table_head = false;
297                table_rows.clear();
298                current_row_cells.clear();
299                current_cell_spans.clear();
300                table_header_rows = 0;
301            }
302            Event::End(TagEnd::Table) => {
303                elements.push(ParsedElement::Table(ParsedTable {
304                    header_rows: table_header_rows,
305                    rows: std::mem::take(&mut table_rows),
306                }));
307                in_table = false;
308            }
309            Event::Start(Tag::TableHead) => {
310                in_table_head = true;
311                current_row_cells.clear();
312            }
313            Event::End(TagEnd::TableHead) => {
314                // Flush the header row
315                table_rows.push(std::mem::take(&mut current_row_cells));
316                table_header_rows += 1;
317                in_table_head = false;
318            }
319            Event::Start(Tag::TableRow) => {
320                current_row_cells.clear();
321            }
322            Event::End(TagEnd::TableRow) => {
323                // Body rows only — header row is flushed in End(TableHead)
324                if !in_table_head {
325                    table_rows.push(std::mem::take(&mut current_row_cells));
326                }
327            }
328            Event::Start(Tag::TableCell) => {
329                current_cell_spans.clear();
330            }
331            Event::End(TagEnd::TableCell) => {
332                current_row_cells.push(ParsedTableCell {
333                    spans: std::mem::take(&mut current_cell_spans),
334                });
335            }
336            // ─── Inline formatting ──────────────────────────────────
337            Event::Start(Tag::Emphasis) => {
338                italic = true;
339            }
340            Event::End(TagEnd::Emphasis) => {
341                italic = false;
342            }
343            Event::Start(Tag::Strong) => {
344                bold = true;
345            }
346            Event::End(TagEnd::Strong) => {
347                bold = false;
348            }
349            Event::Start(Tag::Strikethrough) => {
350                strikeout = true;
351            }
352            Event::End(TagEnd::Strikethrough) => {
353                strikeout = false;
354            }
355            Event::Start(Tag::Link { dest_url, .. }) => {
356                link_href = Some(dest_url.to_string());
357            }
358            Event::End(TagEnd::Link) => {
359                link_href = None;
360            }
361            Event::Text(text) => {
362                let span = ParsedSpan {
363                    text: text.to_string(),
364                    bold,
365                    italic,
366                    underline: false,
367                    strikeout,
368                    code: is_code_block,
369                    link_href: link_href.clone(),
370                };
371                if in_table {
372                    current_cell_spans.push(span);
373                } else {
374                    if !in_block {
375                        in_block = true;
376                    }
377                    current_spans.push(span);
378                }
379            }
380            Event::Code(text) => {
381                let span = ParsedSpan {
382                    text: text.to_string(),
383                    bold,
384                    italic,
385                    underline: false,
386                    strikeout,
387                    code: true,
388                    link_href: link_href.clone(),
389                };
390                if in_table {
391                    current_cell_spans.push(span);
392                } else {
393                    if !in_block {
394                        in_block = true;
395                    }
396                    current_spans.push(span);
397                }
398            }
399            Event::SoftBreak => {
400                let span = ParsedSpan {
401                    text: " ".to_string(),
402                    bold,
403                    italic,
404                    underline: false,
405                    strikeout,
406                    code: false,
407                    link_href: link_href.clone(),
408                };
409                if in_table {
410                    current_cell_spans.push(span);
411                } else {
412                    current_spans.push(span);
413                }
414            }
415            Event::HardBreak => {
416                // Finalize current block
417                if !current_spans.is_empty() || in_block {
418                    elements.push(ParsedElement::Block(ParsedBlock {
419                        spans: std::mem::take(&mut current_spans),
420                        heading_level: current_heading.take(),
421                        list_style: current_list_style.clone(),
422                        list_indent: current_list_indent,
423                        is_code_block,
424                        code_language: code_language.clone(),
425                        blockquote_depth,
426                        line_height: None,
427                        non_breakable_lines: None,
428                        direction: None,
429                        background_color: None,
430                    }));
431                }
432            }
433            Event::Start(Tag::BlockQuote(_)) => {
434                blockquote_depth += 1;
435            }
436            Event::End(TagEnd::BlockQuote(_)) => {
437                blockquote_depth = blockquote_depth.saturating_sub(1);
438            }
439            _ => {}
440        }
441    }
442
443    // Flush any remaining content
444    if !current_spans.is_empty() {
445        elements.push(ParsedElement::Block(ParsedBlock {
446            spans: std::mem::take(&mut current_spans),
447            heading_level: current_heading,
448            list_style: current_list_style,
449            list_indent: current_list_indent,
450            is_code_block,
451            code_language: code_language.take(),
452            blockquote_depth,
453            line_height: None,
454            non_breakable_lines: None,
455            direction: None,
456            background_color: None,
457        }));
458    }
459
460    // If no elements were parsed, create a single empty paragraph
461    if elements.is_empty() {
462        elements.push(ParsedElement::Block(ParsedBlock {
463            spans: vec![ParsedSpan {
464                text: String::new(),
465                ..Default::default()
466            }],
467            heading_level: None,
468            list_style: None,
469            list_indent: 0,
470            is_code_block: false,
471            code_language: None,
472            blockquote_depth: 0,
473            line_height: None,
474            non_breakable_lines: None,
475            direction: None,
476            background_color: None,
477        }));
478    }
479
480    elements
481}
482
483fn heading_level_to_i64(level: pulldown_cmark::HeadingLevel) -> i64 {
484    use pulldown_cmark::HeadingLevel;
485    match level {
486        HeadingLevel::H1 => 1,
487        HeadingLevel::H2 => 2,
488        HeadingLevel::H3 => 3,
489        HeadingLevel::H4 => 4,
490        HeadingLevel::H5 => 5,
491        HeadingLevel::H6 => 6,
492    }
493}
494
495// ─── HTML parsing ────────────────────────────────────────────────────
496
497use scraper::Node;
498
499/// Parsed CSS block-level styles from an inline `style` attribute.
500#[derive(Debug, Clone, Default)]
501struct BlockStyles {
502    line_height: Option<i64>,
503    non_breakable_lines: Option<bool>,
504    direction: Option<TextDirection>,
505    background_color: Option<String>,
506}
507
508/// Parse relevant CSS properties from an inline style string.
509/// Handles: line-height, white-space, direction, background-color.
510fn parse_block_styles(style: &str) -> BlockStyles {
511    let mut result = BlockStyles::default();
512    for part in style.split(';') {
513        let part = part.trim();
514        if let Some((prop, val)) = part.split_once(':') {
515            let prop = prop.trim().to_ascii_lowercase();
516            let val = val.trim();
517            match prop.as_str() {
518                "line-height" => {
519                    // Try parsing as a plain number (multiplier)
520                    if let Ok(v) = val.parse::<f64>() {
521                        result.line_height = Some((v * 1000.0) as i64);
522                    }
523                }
524                "white-space" => {
525                    if val == "pre" || val == "nowrap" || val == "pre-wrap" {
526                        result.non_breakable_lines = Some(true);
527                    }
528                }
529                "direction" => {
530                    if val.eq_ignore_ascii_case("rtl") {
531                        result.direction = Some(TextDirection::RightToLeft);
532                    } else if val.eq_ignore_ascii_case("ltr") {
533                        result.direction = Some(TextDirection::LeftToRight);
534                    }
535                }
536                "background-color" | "background" => {
537                    result.background_color = Some(val.to_string());
538                }
539                _ => {}
540            }
541        }
542    }
543    result
544}
545
546pub fn parse_html(html: &str) -> Vec<ParsedBlock> {
547    ParsedElement::flatten_to_blocks(parse_html_elements(html))
548}
549
550pub fn parse_html_elements(html: &str) -> Vec<ParsedElement> {
551    use scraper::Html;
552
553    let fragment = Html::parse_fragment(html);
554    let mut elements: Vec<ParsedElement> = Vec::new();
555
556    // Walk the DOM tree starting from the root
557    let root = fragment.root_element();
558
559    #[derive(Clone, Default)]
560    struct FmtState {
561        bold: bool,
562        italic: bool,
563        underline: bool,
564        strikeout: bool,
565        code: bool,
566        link_href: Option<String>,
567    }
568
569    const MAX_RECURSION_DEPTH: usize = 256;
570
571    /// Collect inline spans from a `<td>` or `<th>` cell element.
572    fn collect_cell_spans(
573        node: ego_tree::NodeRef<Node>,
574        state: &FmtState,
575        spans: &mut Vec<ParsedSpan>,
576        depth: usize,
577    ) {
578        if depth > MAX_RECURSION_DEPTH {
579            return;
580        }
581        for child in node.children() {
582            match child.value() {
583                Node::Text(text) => {
584                    let t = text.text.to_string();
585                    if !t.is_empty() {
586                        spans.push(ParsedSpan {
587                            text: t,
588                            bold: state.bold,
589                            italic: state.italic,
590                            underline: state.underline,
591                            strikeout: state.strikeout,
592                            code: state.code,
593                            link_href: state.link_href.clone(),
594                        });
595                    }
596                }
597                Node::Element(el) => {
598                    let tag = el.name();
599                    let mut new_state = state.clone();
600                    match tag {
601                        "b" | "strong" => new_state.bold = true,
602                        "i" | "em" => new_state.italic = true,
603                        "u" | "ins" => new_state.underline = true,
604                        "s" | "del" | "strike" => new_state.strikeout = true,
605                        "code" => new_state.code = true,
606                        "a" => {
607                            if let Some(href) = el.attr("href") {
608                                new_state.link_href = Some(href.to_string());
609                            }
610                        }
611                        _ => {}
612                    }
613                    collect_cell_spans(child, &new_state, spans, depth + 1);
614                }
615                _ => {}
616            }
617        }
618    }
619
620    /// Parse a `<table>` element into a ParsedTable.
621    fn parse_table_element(table_node: ego_tree::NodeRef<Node>) -> ParsedTable {
622        let mut rows: Vec<Vec<ParsedTableCell>> = Vec::new();
623        let mut header_rows: usize = 0;
624
625        fn collect_rows(
626            node: ego_tree::NodeRef<Node>,
627            rows: &mut Vec<Vec<ParsedTableCell>>,
628            header_rows: &mut usize,
629            in_thead: bool,
630        ) {
631            for child in node.children() {
632                if let Node::Element(el) = child.value() {
633                    match el.name() {
634                        "thead" => collect_rows(child, rows, header_rows, true),
635                        "tbody" | "tfoot" => collect_rows(child, rows, header_rows, false),
636                        "tr" => {
637                            let mut cells: Vec<ParsedTableCell> = Vec::new();
638                            for td in child.children() {
639                                if let Node::Element(td_el) = td.value()
640                                    && matches!(td_el.name(), "td" | "th")
641                                {
642                                    let mut spans = Vec::new();
643                                    let state = FmtState::default();
644                                    collect_cell_spans(td, &state, &mut spans, 0);
645                                    if spans.is_empty() {
646                                        spans.push(ParsedSpan::default());
647                                    }
648                                    cells.push(ParsedTableCell { spans });
649                                }
650                            }
651                            if !cells.is_empty() {
652                                rows.push(cells);
653                                if in_thead {
654                                    *header_rows += 1;
655                                }
656                            }
657                        }
658                        _ => {}
659                    }
660                }
661            }
662        }
663
664        collect_rows(table_node, &mut rows, &mut header_rows, false);
665
666        // Tables without explicit <thead> but with <th> cells: treat first row as header
667        if header_rows == 0 && !rows.is_empty() {
668            header_rows = 1;
669        }
670
671        ParsedTable { header_rows, rows }
672    }
673
674    fn walk_node(
675        node: ego_tree::NodeRef<Node>,
676        state: &FmtState,
677        elements: &mut Vec<ParsedElement>,
678        current_list_style: &Option<ListStyle>,
679        blockquote_depth: u32,
680        list_depth: u32,
681        depth: usize,
682    ) {
683        if depth > MAX_RECURSION_DEPTH {
684            return;
685        }
686        match node.value() {
687            Node::Element(el) => {
688                let tag = el.name();
689                let mut new_state = state.clone();
690                let mut new_list_style = current_list_style.clone();
691                let mut bq_depth = blockquote_depth;
692                let mut new_list_depth = list_depth;
693
694                // Determine if this is a block-level element
695                let is_block_tag = matches!(
696                    tag,
697                    "p" | "div"
698                        | "h1"
699                        | "h2"
700                        | "h3"
701                        | "h4"
702                        | "h5"
703                        | "h6"
704                        | "li"
705                        | "pre"
706                        | "br"
707                        | "blockquote"
708                        | "body"
709                        | "html"
710                );
711
712                // Update formatting state
713                match tag {
714                    "b" | "strong" => new_state.bold = true,
715                    "i" | "em" => new_state.italic = true,
716                    "u" | "ins" => new_state.underline = true,
717                    "s" | "del" | "strike" => new_state.strikeout = true,
718                    "code" => new_state.code = true,
719                    "a" => {
720                        if let Some(href) = el.attr("href") {
721                            new_state.link_href = Some(href.to_string());
722                        }
723                    }
724                    "ul" => {
725                        new_list_style = Some(ListStyle::Disc);
726                        new_list_depth = list_depth + 1;
727                    }
728                    "ol" => {
729                        new_list_style = Some(ListStyle::Decimal);
730                        new_list_depth = list_depth + 1;
731                    }
732                    "blockquote" => {
733                        bq_depth += 1;
734                    }
735                    _ => {}
736                }
737
738                // Determine heading level
739                let heading_level = match tag {
740                    "h1" => Some(1),
741                    "h2" => Some(2),
742                    "h3" => Some(3),
743                    "h4" => Some(4),
744                    "h5" => Some(5),
745                    "h6" => Some(6),
746                    _ => None,
747                };
748
749                let is_code_block = tag == "pre";
750
751                // Extract code language from <pre><code class="language-xxx">
752                let code_language = if is_code_block {
753                    node.children().find_map(|child| {
754                        if let Node::Element(cel) = child.value()
755                            && cel.name() == "code"
756                            && let Some(cls) = cel.attr("class")
757                        {
758                            return cls
759                                .split_whitespace()
760                                .find_map(|c| c.strip_prefix("language-"))
761                                .map(|l| l.to_string());
762                        }
763                        None
764                    })
765                } else {
766                    None
767                };
768
769                // Extract CSS styles from block-level elements
770                let css = if is_block_tag {
771                    el.attr("style").map(parse_block_styles).unwrap_or_default()
772                } else {
773                    BlockStyles::default()
774                };
775
776                if tag == "table" {
777                    // Parse table structure into a ParsedTable
778                    let parsed_table = parse_table_element(node);
779                    if !parsed_table.rows.is_empty() {
780                        elements.push(ParsedElement::Table(parsed_table));
781                    }
782                    return;
783                }
784
785                if tag == "br" {
786                    // <br> creates a new block
787                    elements.push(ParsedElement::Block(ParsedBlock {
788                        spans: vec![ParsedSpan {
789                            text: String::new(),
790                            ..Default::default()
791                        }],
792                        heading_level: None,
793                        list_style: None,
794                        list_indent: 0,
795                        is_code_block: false,
796                        code_language: None,
797                        blockquote_depth: bq_depth,
798                        line_height: None,
799                        non_breakable_lines: None,
800                        direction: None,
801                        background_color: None,
802                    }));
803                    return;
804                }
805
806                if tag == "blockquote" {
807                    // Blockquote is a container — recurse into children with increased depth
808                    for child in node.children() {
809                        walk_node(
810                            child,
811                            &new_state,
812                            elements,
813                            &new_list_style,
814                            bq_depth,
815                            new_list_depth,
816                            depth + 1,
817                        );
818                    }
819                } else if is_block_tag && tag != "br" {
820                    // Start collecting spans for a new block.
821                    // Use a temporary buffer so that nested block-level
822                    // elements (e.g. sub-lists inside <li>) are collected
823                    // separately and appended *after* the parent block.
824                    let mut spans: Vec<ParsedSpan> = Vec::new();
825                    let mut nested_elements: Vec<ParsedElement> = Vec::new();
826                    collect_inline_spans(
827                        node,
828                        &new_state,
829                        &mut spans,
830                        &new_list_style,
831                        &mut nested_elements,
832                        bq_depth,
833                        new_list_depth,
834                        depth + 1,
835                    );
836
837                    let list_style_for_block = if tag == "li" {
838                        new_list_style.clone()
839                    } else {
840                        None
841                    };
842
843                    let list_indent_for_block = if tag == "li" {
844                        new_list_depth.saturating_sub(1)
845                    } else {
846                        0
847                    };
848
849                    if !spans.is_empty() || heading_level.is_some() {
850                        elements.push(ParsedElement::Block(ParsedBlock {
851                            spans,
852                            heading_level,
853                            list_style: list_style_for_block,
854                            list_indent: list_indent_for_block,
855                            is_code_block,
856                            code_language,
857                            blockquote_depth: bq_depth,
858                            line_height: css.line_height,
859                            non_breakable_lines: css.non_breakable_lines,
860                            direction: css.direction,
861                            background_color: css.background_color,
862                        }));
863                    }
864                    // Append nested block elements after the parent block
865                    elements.append(&mut nested_elements);
866                } else if matches!(tag, "ul" | "ol" | "thead" | "tbody" | "tr") {
867                    // Container elements: recurse into children
868                    for child in node.children() {
869                        walk_node(
870                            child,
871                            &new_state,
872                            elements,
873                            &new_list_style,
874                            bq_depth,
875                            new_list_depth,
876                            depth + 1,
877                        );
878                    }
879                } else {
880                    // Inline element or unknown: recurse
881                    for child in node.children() {
882                        walk_node(
883                            child,
884                            &new_state,
885                            elements,
886                            current_list_style,
887                            bq_depth,
888                            list_depth,
889                            depth + 1,
890                        );
891                    }
892                }
893            }
894            Node::Text(text) => {
895                let t = text.text.to_string();
896                let trimmed = t.trim();
897                if !trimmed.is_empty() {
898                    // Bare text not in a block — create a paragraph
899                    elements.push(ParsedElement::Block(ParsedBlock {
900                        spans: vec![ParsedSpan {
901                            text: trimmed.to_string(),
902                            bold: state.bold,
903                            italic: state.italic,
904                            underline: state.underline,
905                            strikeout: state.strikeout,
906                            code: state.code,
907                            link_href: state.link_href.clone(),
908                        }],
909                        heading_level: None,
910                        list_style: None,
911                        list_indent: 0,
912                        is_code_block: false,
913                        code_language: None,
914                        blockquote_depth,
915                        line_height: None,
916                        non_breakable_lines: None,
917                        direction: None,
918                        background_color: None,
919                    }));
920                }
921            }
922            _ => {
923                // Document, Comment, etc. — recurse children
924                for child in node.children() {
925                    walk_node(
926                        child,
927                        state,
928                        elements,
929                        current_list_style,
930                        blockquote_depth,
931                        list_depth,
932                        depth + 1,
933                    );
934                }
935            }
936        }
937    }
938
939    /// Collect inline spans from a block-level element's children.
940    /// If a nested block-level element is encountered, it is flushed as a
941    /// separate block.
942    #[allow(clippy::too_many_arguments)]
943    fn collect_inline_spans(
944        node: ego_tree::NodeRef<Node>,
945        state: &FmtState,
946        spans: &mut Vec<ParsedSpan>,
947        current_list_style: &Option<ListStyle>,
948        elements: &mut Vec<ParsedElement>,
949        blockquote_depth: u32,
950        list_depth: u32,
951        depth: usize,
952    ) {
953        if depth > MAX_RECURSION_DEPTH {
954            return;
955        }
956        for child in node.children() {
957            match child.value() {
958                Node::Text(text) => {
959                    let t = text.text.to_string();
960                    if !t.is_empty() {
961                        spans.push(ParsedSpan {
962                            text: t,
963                            bold: state.bold,
964                            italic: state.italic,
965                            underline: state.underline,
966                            strikeout: state.strikeout,
967                            code: state.code,
968                            link_href: state.link_href.clone(),
969                        });
970                    }
971                }
972                Node::Element(el) => {
973                    let tag = el.name();
974                    let mut new_state = state.clone();
975
976                    match tag {
977                        "b" | "strong" => new_state.bold = true,
978                        "i" | "em" => new_state.italic = true,
979                        "u" | "ins" => new_state.underline = true,
980                        "s" | "del" | "strike" => new_state.strikeout = true,
981                        "code" => new_state.code = true,
982                        "a" => {
983                            if let Some(href) = el.attr("href") {
984                                new_state.link_href = Some(href.to_string());
985                            }
986                        }
987                        _ => {}
988                    }
989
990                    // Check for nested block elements
991                    let nested_block = matches!(
992                        tag,
993                        "p" | "div"
994                            | "h1"
995                            | "h2"
996                            | "h3"
997                            | "h4"
998                            | "h5"
999                            | "h6"
1000                            | "li"
1001                            | "pre"
1002                            | "blockquote"
1003                            | "ul"
1004                            | "ol"
1005                    );
1006
1007                    if tag == "br" {
1008                        // br within a block: treat as splitting into new block
1009                        // For simplicity, just add a newline to current span
1010                        spans.push(ParsedSpan {
1011                            text: String::new(),
1012                            ..Default::default()
1013                        });
1014                    } else if nested_block || tag == "table" {
1015                        // Flush as separate element
1016                        walk_node(
1017                            child,
1018                            &new_state,
1019                            elements,
1020                            current_list_style,
1021                            blockquote_depth,
1022                            list_depth,
1023                            depth + 1,
1024                        );
1025                    } else {
1026                        // Inline element: recurse
1027                        collect_inline_spans(
1028                            child,
1029                            &new_state,
1030                            spans,
1031                            current_list_style,
1032                            elements,
1033                            blockquote_depth,
1034                            list_depth,
1035                            depth + 1,
1036                        );
1037                    }
1038                }
1039                _ => {}
1040            }
1041        }
1042    }
1043
1044    let initial_state = FmtState::default();
1045    // Treat the root element as a block-level container so that
1046    // top-level inline elements (e.g. `<b>Bold</b> <em>Italic</em>`)
1047    // are grouped into a single block instead of becoming separate blocks.
1048    let mut root_spans: Vec<ParsedSpan> = Vec::new();
1049    collect_inline_spans(
1050        *root,
1051        &initial_state,
1052        &mut root_spans,
1053        &None,
1054        &mut elements,
1055        0,
1056        0,
1057        0,
1058    );
1059    if !root_spans.is_empty() {
1060        elements.push(ParsedElement::Block(ParsedBlock {
1061            spans: root_spans,
1062            heading_level: None,
1063            list_style: None,
1064            list_indent: 0,
1065            is_code_block: false,
1066            code_language: None,
1067            blockquote_depth: 0,
1068            line_height: None,
1069            non_breakable_lines: None,
1070            direction: None,
1071            background_color: None,
1072        }));
1073    }
1074
1075    // If no elements were parsed, create a single empty paragraph
1076    if elements.is_empty() {
1077        elements.push(ParsedElement::Block(ParsedBlock {
1078            spans: vec![ParsedSpan {
1079                text: String::new(),
1080                ..Default::default()
1081            }],
1082            heading_level: None,
1083            list_style: None,
1084            list_indent: 0,
1085            is_code_block: false,
1086            code_language: None,
1087            blockquote_depth: 0,
1088            line_height: None,
1089            non_breakable_lines: None,
1090            direction: None,
1091            background_color: None,
1092        }));
1093    }
1094
1095    elements
1096}
1097
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `md` and flattens the result to plain blocks, for tests that do
    /// not inspect table structure.
    fn parse_markdown_blocks(md: &str) -> Vec<ParsedBlock> {
        let elements = parse_markdown(md);
        ParsedElement::flatten_to_blocks(elements)
    }

    #[test]
    fn test_parse_markdown_simple_paragraph() {
        let blocks = parse_markdown_blocks("Hello **world**");
        assert_eq!(blocks.len(), 1);

        let spans = &blocks[0].spans;
        assert!(spans.len() >= 2);

        // The leading text is plain; only "world" carries the bold flag.
        let hello = spans
            .iter()
            .find(|span| span.text.contains("Hello"))
            .unwrap();
        assert!(!hello.bold);
        let world = spans.iter().find(|span| span.text == "world").unwrap();
        assert!(world.bold);
    }

    #[test]
    fn test_parse_markdown_heading() {
        let blocks = parse_markdown_blocks("# Title");
        assert_eq!(blocks.len(), 1);

        let heading = &blocks[0];
        assert_eq!(heading.heading_level, Some(1));
        assert_eq!(heading.spans[0].text, "Title");
    }

    #[test]
    fn test_parse_markdown_list() {
        let blocks = parse_markdown_blocks("- item1\n- item2");
        assert!(blocks.len() >= 2);

        // Both leading blocks are bullet items.
        for item in &blocks[..2] {
            assert_eq!(item.list_style, Some(ListStyle::Disc));
        }
    }

    #[test]
    fn test_parse_html_simple() {
        let blocks = parse_html("<p>Hello <b>world</b></p>");
        assert_eq!(blocks.len(), 1);

        let spans = &blocks[0].spans;
        assert!(spans.len() >= 2);
        let world = spans.iter().find(|span| span.text == "world").unwrap();
        assert!(world.bold);
    }

    #[test]
    fn test_parse_html_multiple_paragraphs() {
        let paragraph_count = parse_html("<p>A</p><p>B</p>").len();
        assert_eq!(paragraph_count, 2);
    }

    #[test]
    fn test_parse_html_heading() {
        let blocks = parse_html("<h2>Subtitle</h2>");
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].heading_level, Some(2));
    }

    #[test]
    fn test_parse_html_list() {
        let blocks = parse_html("<ul><li>one</li><li>two</li></ul>");
        assert!(blocks.len() >= 2);

        let first_item = &blocks[0];
        assert_eq!(first_item.list_style, Some(ListStyle::Disc));
    }

    #[test]
    fn test_parse_markdown_code_block() {
        let blocks = parse_markdown_blocks("```\nfn main() {}\n```");
        assert_eq!(blocks.len(), 1);

        let code_block = &blocks[0];
        assert!(code_block.is_code_block);
        assert!(code_block.spans[0].code);

        // pulldown-cmark appends a trailing \n to code block text — the
        // concatenated span text must not contain it.
        let mut joined = String::new();
        for span in &code_block.spans {
            joined.push_str(&span.text);
        }
        assert_eq!(
            joined, "fn main() {}",
            "code block text should not have trailing newline"
        );
    }

    #[test]
    fn test_parse_markdown_nested_formatting() {
        let blocks = parse_markdown_blocks("***bold italic***");
        assert_eq!(blocks.len(), 1);

        let first_span = &blocks[0].spans[0];
        assert!(first_span.bold);
        assert!(first_span.italic);
    }

    #[test]
    fn test_parse_markdown_link() {
        let blocks = parse_markdown_blocks("[click](http://example.com)");
        assert_eq!(blocks.len(), 1);

        let link = &blocks[0].spans[0];
        assert_eq!(link.text, "click");
        assert_eq!(link.link_href, Some("http://example.com".to_string()));
    }

    #[test]
    fn test_parse_markdown_empty() {
        // Empty input still yields one block holding an empty span.
        let blocks = parse_markdown_blocks("");
        assert_eq!(blocks.len(), 1);

        let only_span = &blocks[0].spans[0];
        assert!(only_span.text.is_empty());
    }

    #[test]
    fn test_parse_html_empty() {
        // Empty input still yields one block holding an empty span.
        let blocks = parse_html("");
        assert_eq!(blocks.len(), 1);

        let only_span = &blocks[0].spans[0];
        assert!(only_span.text.is_empty());
    }

    #[test]
    fn test_parse_html_nested_formatting() {
        let blocks = parse_html("<p><b><i>bold italic</i></b></p>");
        assert_eq!(blocks.len(), 1);

        let first_span = &blocks[0].spans[0];
        assert!(first_span.bold);
        assert!(first_span.italic);
    }

    #[test]
    fn test_parse_html_link() {
        let blocks = parse_html("<p><a href=\"http://example.com\">click</a></p>");
        assert_eq!(blocks.len(), 1);

        let link = &blocks[0].spans[0];
        assert_eq!(link.text, "click");
        assert_eq!(link.link_href, Some("http://example.com".to_string()));
    }

    #[test]
    fn test_parse_html_ordered_list() {
        let blocks = parse_html("<ol><li>first</li><li>second</li></ol>");
        assert!(blocks.len() >= 2);

        let first_item = &blocks[0];
        assert_eq!(first_item.list_style, Some(ListStyle::Decimal));
    }

    #[test]
    fn test_parse_markdown_ordered_list() {
        let blocks = parse_markdown_blocks("1. first\n2. second");
        assert!(blocks.len() >= 2);

        let first_item = &blocks[0];
        assert_eq!(first_item.list_style, Some(ListStyle::Decimal));
    }

    #[test]
    fn test_parse_html_blockquote_nested() {
        let html = "<p>before</p><blockquote>quoted</blockquote><p>after</p>";
        let block_count = parse_html(html).len();
        assert!(block_count >= 3);
    }

    #[test]
    fn test_parse_block_styles_line_height() {
        let parsed = parse_block_styles("line-height: 1.5");
        assert_eq!(parsed.line_height, Some(1500));
    }

    #[test]
    fn test_parse_block_styles_direction_rtl() {
        let parsed = parse_block_styles("direction: rtl");
        assert_eq!(parsed.direction, Some(TextDirection::RightToLeft));
    }

    #[test]
    fn test_parse_block_styles_background_color() {
        let parsed = parse_block_styles("background-color: #ff0000");
        assert_eq!(parsed.background_color, Some("#ff0000".to_string()));
    }

    #[test]
    fn test_parse_block_styles_white_space_pre() {
        let parsed = parse_block_styles("white-space: pre");
        assert_eq!(parsed.non_breakable_lines, Some(true));
    }

    #[test]
    fn test_parse_block_styles_multiple() {
        // Several declarations in one style string are all picked up.
        let declaration = "line-height: 2.0; direction: rtl; background-color: blue";
        let parsed = parse_block_styles(declaration);
        assert_eq!(parsed.line_height, Some(2000));
        assert_eq!(parsed.direction, Some(TextDirection::RightToLeft));
        assert_eq!(parsed.background_color, Some("blue".to_string()));
    }

    #[test]
    fn test_parse_html_block_styles_extracted() {
        let html = r#"<p style="line-height: 1.5; direction: rtl; background-color: #ccc">text</p>"#;
        let blocks = parse_html(html);
        assert_eq!(blocks.len(), 1);

        let styled = &blocks[0];
        assert_eq!(styled.line_height, Some(1500));
        assert_eq!(styled.direction, Some(TextDirection::RightToLeft));
        assert_eq!(styled.background_color, Some("#ccc".to_string()));
    }

    #[test]
    fn test_parse_html_white_space_pre() {
        let blocks = parse_html(r#"<p style="white-space: pre">code</p>"#);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].non_breakable_lines, Some(true));
    }

    #[test]
    fn test_parse_html_no_styles_returns_none() {
        let blocks = parse_html("<p>plain</p>");
        assert_eq!(blocks.len(), 1);

        // Without a style attribute every CSS-derived field stays unset.
        let plain = &blocks[0];
        assert_eq!(plain.line_height, None);
        assert_eq!(plain.direction, None);
        assert_eq!(plain.background_color, None);
        assert_eq!(plain.non_breakable_lines, None);
    }

    #[test]
    fn test_parse_markdown_nested_list_indent() {
        let blocks = parse_markdown_blocks("- top\n  - nested\n    - deep");
        assert_eq!(blocks.len(), 3);

        // Each nesting level is one indent deeper than the previous one.
        let expected_indents = [0, 1, 2];
        for (block, indent) in blocks.iter().zip(expected_indents) {
            assert_eq!(block.list_style, Some(ListStyle::Disc));
            assert_eq!(block.list_indent, indent);
        }
    }

    #[test]
    fn test_parse_markdown_nested_ordered_list_indent() {
        let blocks = parse_markdown_blocks("1. first\n   1. nested\n   2. nested2");
        assert_eq!(blocks.len(), 3);

        // The two nested items share the same indent level.
        let expected_indents = [0, 1, 1];
        for (block, indent) in blocks.iter().zip(expected_indents) {
            assert_eq!(block.list_indent, indent);
        }
    }

    #[test]
    fn test_parse_html_nested_list_indent() {
        let blocks = parse_html("<ul><li>top</li><ul><li>nested</li></ul></ul>");
        assert!(blocks.len() >= 2);

        let (outer, inner) = (&blocks[0], &blocks[1]);
        assert_eq!(outer.list_indent, 0);
        assert_eq!(inner.list_indent, 1);
    }

    #[test]
    fn test_parse_markdown_table() {
        let elements = parse_markdown("| A | B |\n|---|---|\n| 1 | 2 |");
        assert_eq!(elements.len(), 1);

        let ParsedElement::Table(table) = &elements[0] else {
            panic!("Expected ParsedElement::Table");
        };

        assert_eq!(table.header_rows, 1);
        assert_eq!(table.rows.len(), 2); // 1 header + 1 body

        let header = &table.rows[0];
        assert_eq!(header.len(), 2);
        assert_eq!(header[0].spans[0].text, "A");
        assert_eq!(header[1].spans[0].text, "B");

        let body = &table.rows[1];
        assert_eq!(body.len(), 2);
        assert_eq!(body[0].spans[0].text, "1");
        assert_eq!(body[1].spans[0].text, "2");
    }

    #[test]
    fn test_parse_markdown_table_with_formatting() {
        let md = "| **bold** | `code` | *italic* |\n|---|---|---|\n| ~~strike~~ | plain | [link](http://x.com) |";
        let elements = parse_markdown(md);
        assert_eq!(elements.len(), 1);

        let ParsedElement::Table(table) = &elements[0] else {
            panic!("Expected ParsedElement::Table");
        };
        assert_eq!(table.rows.len(), 2);

        // Header cells: bold, inline code, italic.
        let header = &table.rows[0];
        assert!(header[0].spans[0].bold);
        assert!(header[1].spans[0].code);
        assert!(header[2].spans[0].italic);

        // Body cells: strikeout in the first, a link in the third.
        let body = &table.rows[1];
        assert!(body[0].spans[0].strikeout);
        assert_eq!(
            body[2].spans[0].link_href,
            Some("http://x.com".to_string())
        );
    }

    #[test]
    fn test_parse_markdown_mixed_content_with_table() {
        let md = "Before\n\n| A | B |\n|---|---|\n| 1 | 2 |\n\nAfter";
        let elements = parse_markdown(md);
        assert_eq!(elements.len(), 3);

        // Paragraph, then table, then paragraph — in document order.
        assert!(matches!(
            elements.as_slice(),
            [
                ParsedElement::Block(_),
                ParsedElement::Table(_),
                ParsedElement::Block(_)
            ]
        ));
    }
}