Skip to main content

text_document_common/parser_tools/
content_parser.rs

1use crate::entities::{ListStyle, TextDirection};
2
/// A parsed inline span with formatting info
///
/// One run of text together with the inline formatting that was active when
/// the parser encountered it.
#[derive(Debug, Clone, Default)]
pub struct ParsedSpan {
    /// The text content of this run.
    pub text: String,
    /// Set inside `**strong**` markdown or `<b>`/`<strong>` HTML.
    pub bold: bool,
    /// Set inside `*emphasis*` markdown or `<i>`/`<em>` HTML.
    pub italic: bool,
    /// Set inside `<u>`/`<ins>` HTML; the markdown parser always leaves it `false`.
    pub underline: bool,
    /// Set inside `~~strikethrough~~` markdown or `<s>`/`<del>`/`<strike>` HTML.
    pub strikeout: bool,
    /// Set for inline code, for text inside a markdown code block, and inside `<code>` HTML.
    pub code: bool,
    /// Destination of the enclosing link, if the text sits inside one.
    pub link_href: Option<String>,
}
14
/// A parsed table cell containing inline spans.
#[derive(Debug, Clone)]
pub struct ParsedTableCell {
    /// Inline content of the cell, in document order.
    /// The HTML table parser stores a single default span for a cell with no text.
    pub spans: Vec<ParsedSpan>,
}
20
/// A parsed table extracted from markdown or HTML.
#[derive(Debug, Clone)]
pub struct ParsedTable {
    /// Number of header rows (typically 1 for markdown tables).
    /// The HTML parser counts rows found under `<thead>` and falls back to 1
    /// when a non-empty table has none.
    pub header_rows: usize,
    /// All rows (header + body), each containing cells with their inline spans.
    pub rows: Vec<Vec<ParsedTableCell>>,
}
29
/// A parsed element: either a block or a table.
///
/// This is the unit both parsers emit, so callers that care about table
/// structure can keep it while others flatten it away.
#[derive(Debug, Clone)]
pub enum ParsedElement {
    /// A paragraph, heading, list item or code block.
    Block(ParsedBlock),
    /// A table with its rows and cells preserved.
    Table(ParsedTable),
}
36
37impl ParsedElement {
38    /// Extract blocks, flattening tables into one block per cell.
39    /// Use when table structure is not needed.
40    pub fn flatten_to_blocks(elements: Vec<ParsedElement>) -> Vec<ParsedBlock> {
41        let mut blocks = Vec::new();
42        for elem in elements {
43            match elem {
44                ParsedElement::Block(b) => blocks.push(b),
45                ParsedElement::Table(t) => {
46                    for row in t.rows {
47                        for cell in row {
48                            blocks.push(ParsedBlock {
49                                spans: cell.spans,
50                                heading_level: None,
51                                list_style: None,
52                                list_indent: 0,
53                                is_code_block: false,
54                                code_language: None,
55                                blockquote_depth: 0,
56                                line_height: None,
57                                non_breakable_lines: None,
58                                direction: None,
59                                background_color: None,
60                            });
61                        }
62                    }
63                }
64            }
65        }
66        if blocks.is_empty() {
67            blocks.push(ParsedBlock {
68                spans: vec![ParsedSpan {
69                    text: String::new(),
70                    ..Default::default()
71                }],
72                heading_level: None,
73                list_style: None,
74                list_indent: 0,
75                is_code_block: false,
76                code_language: None,
77                blockquote_depth: 0,
78                line_height: None,
79                non_breakable_lines: None,
80                direction: None,
81                background_color: None,
82            });
83        }
84        blocks
85    }
86}
87
/// A parsed block (paragraph, heading, list item, code block)
///
/// Holds the block's inline content plus every block-level attribute the
/// parsers can attach to it.
#[derive(Debug, Clone)]
pub struct ParsedBlock {
    /// Inline content of the block, in document order.
    pub spans: Vec<ParsedSpan>,
    /// Heading level for heading blocks; the parsers in this file produce 1 through 6.
    pub heading_level: Option<i64>,
    /// List marker style when the block is a list item.
    pub list_style: Option<ListStyle>,
    /// Zero-based list nesting depth (0 for a top-level item or a non-list block).
    pub list_indent: u32,
    /// Whether the block is a code block (fenced/indented markdown or `<pre>`).
    pub is_code_block: bool,
    /// Language tag of the code block, when one was given.
    pub code_language: Option<String>,
    /// Number of blockquotes enclosing the block.
    pub blockquote_depth: u32,
    /// Never set by the markdown parser.
    /// NOTE(review): presumably the CSS `line-height` multiplier scaled by 1000,
    /// as produced by `parse_block_styles` — confirm against the HTML parser.
    pub line_height: Option<i64>,
    /// Never set by the markdown parser.
    /// NOTE(review): presumably derived from CSS `white-space` — confirm.
    pub non_breakable_lines: Option<bool>,
    /// Never set by the markdown parser.
    /// NOTE(review): presumably derived from CSS `direction` — confirm.
    pub direction: Option<TextDirection>,
    /// Never set by the markdown parser.
    /// NOTE(review): presumably the raw CSS `background-color` value — confirm.
    pub background_color: Option<String>,
}
103
104impl ParsedBlock {
105    /// Returns `true` when this block carries no block-level formatting,
106    /// meaning its content is purely inline.
107    pub fn is_inline_only(&self) -> bool {
108        self.heading_level.is_none()
109            && self.list_style.is_none()
110            && !self.is_code_block
111            && self.blockquote_depth == 0
112            && self.line_height.is_none()
113            && self.non_breakable_lines.is_none()
114            && self.direction.is_none()
115            && self.background_color.is_none()
116    }
117}
118
119// ─── Markdown parsing ────────────────────────────────────────────────
120
121pub fn parse_markdown(markdown: &str) -> Vec<ParsedElement> {
122    use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
123
124    let options =
125        Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_TASKLISTS;
126    let parser = Parser::new_ext(markdown, options);
127
128    let mut elements: Vec<ParsedElement> = Vec::new();
129    let mut current_spans: Vec<ParsedSpan> = Vec::new();
130    let mut current_heading: Option<i64> = None;
131    let mut current_list_style: Option<ListStyle> = None;
132    let mut is_code_block = false;
133    let mut code_language: Option<String> = None;
134    let mut blockquote_depth: u32 = 0;
135    let mut in_block = false;
136
137    // Formatting state stack
138    let mut bold = false;
139    let mut italic = false;
140    let mut strikeout = false;
141    let mut link_href: Option<String> = None;
142
143    // List style stack for nested lists (also tracks nesting depth)
144    let mut list_stack: Vec<Option<ListStyle>> = Vec::new();
145    let mut current_list_indent: u32 = 0;
146
147    // Table tracking state
148    let mut in_table = false;
149    let mut in_table_head = false;
150    let mut table_rows: Vec<Vec<ParsedTableCell>> = Vec::new();
151    let mut current_row_cells: Vec<ParsedTableCell> = Vec::new();
152    let mut current_cell_spans: Vec<ParsedSpan> = Vec::new();
153    let mut table_header_rows: usize = 0;
154
155    for event in parser {
156        match event {
157            Event::Start(Tag::Paragraph) => {
158                in_block = true;
159                current_heading = None;
160                is_code_block = false;
161            }
162            Event::End(TagEnd::Paragraph) => {
163                if !current_spans.is_empty() || in_block {
164                    elements.push(ParsedElement::Block(ParsedBlock {
165                        spans: std::mem::take(&mut current_spans),
166                        heading_level: current_heading.take(),
167                        list_style: current_list_style.clone(),
168                        list_indent: current_list_indent,
169                        is_code_block: false,
170                        code_language: None,
171                        blockquote_depth,
172                        line_height: None,
173                        non_breakable_lines: None,
174                        direction: None,
175                        background_color: None,
176                    }));
177                }
178                in_block = false;
179                current_list_style = None;
180            }
181            Event::Start(Tag::Heading { level, .. }) => {
182                in_block = true;
183                current_heading = Some(heading_level_to_i64(level));
184                is_code_block = false;
185            }
186            Event::End(TagEnd::Heading(_)) => {
187                elements.push(ParsedElement::Block(ParsedBlock {
188                    spans: std::mem::take(&mut current_spans),
189                    heading_level: current_heading.take(),
190                    list_style: None,
191                    list_indent: 0,
192                    is_code_block: false,
193                    code_language: None,
194                    blockquote_depth,
195                    line_height: None,
196                    non_breakable_lines: None,
197                    direction: None,
198                    background_color: None,
199                }));
200                in_block = false;
201            }
202            Event::Start(Tag::List(ordered)) => {
203                let style = if ordered.is_some() {
204                    Some(ListStyle::Decimal)
205                } else {
206                    Some(ListStyle::Disc)
207                };
208                list_stack.push(style);
209            }
210            Event::End(TagEnd::List(_)) => {
211                list_stack.pop();
212            }
213            Event::Start(Tag::Item) => {
214                // Flush any accumulated spans from the parent item before
215                // starting a child item in a tight list
216                if !current_spans.is_empty() {
217                    elements.push(ParsedElement::Block(ParsedBlock {
218                        spans: std::mem::take(&mut current_spans),
219                        heading_level: None,
220                        list_style: current_list_style.clone(),
221                        list_indent: current_list_indent,
222                        is_code_block: false,
223                        code_language: None,
224                        blockquote_depth,
225                        line_height: None,
226                        non_breakable_lines: None,
227                        direction: None,
228                        background_color: None,
229                    }));
230                }
231                in_block = true;
232                current_list_style = list_stack.last().cloned().flatten();
233                current_list_indent = if list_stack.is_empty() {
234                    0
235                } else {
236                    (list_stack.len() - 1) as u32
237                };
238            }
239            Event::End(TagEnd::Item) => {
240                // The paragraph inside the item will have already been flushed,
241                // but if there was no inner paragraph (tight list), flush now.
242                if !current_spans.is_empty() {
243                    elements.push(ParsedElement::Block(ParsedBlock {
244                        spans: std::mem::take(&mut current_spans),
245                        heading_level: None,
246                        list_style: current_list_style.clone(),
247                        list_indent: current_list_indent,
248                        is_code_block: false,
249                        code_language: None,
250                        blockquote_depth,
251                        line_height: None,
252                        non_breakable_lines: None,
253                        direction: None,
254                        background_color: None,
255                    }));
256                }
257                in_block = false;
258                current_list_style = None;
259            }
260            Event::Start(Tag::CodeBlock(kind)) => {
261                in_block = true;
262                is_code_block = true;
263                code_language = match &kind {
264                    pulldown_cmark::CodeBlockKind::Fenced(lang) if !lang.is_empty() => {
265                        Some(lang.to_string())
266                    }
267                    _ => None,
268                };
269            }
270            Event::End(TagEnd::CodeBlock) => {
271                // pulldown-cmark appends a trailing '\n' to code block text — strip it
272                if let Some(last) = current_spans.last_mut()
273                    && last.text.ends_with('\n')
274                {
275                    last.text.truncate(last.text.len() - 1);
276                }
277                elements.push(ParsedElement::Block(ParsedBlock {
278                    spans: std::mem::take(&mut current_spans),
279                    heading_level: None,
280                    list_style: None,
281                    list_indent: 0,
282                    is_code_block: true,
283                    code_language: code_language.take(),
284                    blockquote_depth,
285                    line_height: None,
286                    non_breakable_lines: None,
287                    direction: None,
288                    background_color: None,
289                }));
290                in_block = false;
291                is_code_block = false;
292            }
293            // ─── Table events ───────────────────────────────────────
294            Event::Start(Tag::Table(_)) => {
295                in_table = true;
296                in_table_head = false;
297                table_rows.clear();
298                current_row_cells.clear();
299                current_cell_spans.clear();
300                table_header_rows = 0;
301            }
302            Event::End(TagEnd::Table) => {
303                elements.push(ParsedElement::Table(ParsedTable {
304                    header_rows: table_header_rows,
305                    rows: std::mem::take(&mut table_rows),
306                }));
307                in_table = false;
308            }
309            Event::Start(Tag::TableHead) => {
310                in_table_head = true;
311                current_row_cells.clear();
312            }
313            Event::End(TagEnd::TableHead) => {
314                // Flush the header row
315                table_rows.push(std::mem::take(&mut current_row_cells));
316                table_header_rows += 1;
317                in_table_head = false;
318            }
319            Event::Start(Tag::TableRow) => {
320                current_row_cells.clear();
321            }
322            Event::End(TagEnd::TableRow) => {
323                // Body rows only — header row is flushed in End(TableHead)
324                if !in_table_head {
325                    table_rows.push(std::mem::take(&mut current_row_cells));
326                }
327            }
328            Event::Start(Tag::TableCell) => {
329                current_cell_spans.clear();
330            }
331            Event::End(TagEnd::TableCell) => {
332                current_row_cells.push(ParsedTableCell {
333                    spans: std::mem::take(&mut current_cell_spans),
334                });
335            }
336            // ─── Inline formatting ──────────────────────────────────
337            Event::Start(Tag::Emphasis) => {
338                italic = true;
339            }
340            Event::End(TagEnd::Emphasis) => {
341                italic = false;
342            }
343            Event::Start(Tag::Strong) => {
344                bold = true;
345            }
346            Event::End(TagEnd::Strong) => {
347                bold = false;
348            }
349            Event::Start(Tag::Strikethrough) => {
350                strikeout = true;
351            }
352            Event::End(TagEnd::Strikethrough) => {
353                strikeout = false;
354            }
355            Event::Start(Tag::Link { dest_url, .. }) => {
356                link_href = Some(dest_url.to_string());
357            }
358            Event::End(TagEnd::Link) => {
359                link_href = None;
360            }
361            Event::Text(text) => {
362                let span = ParsedSpan {
363                    text: text.to_string(),
364                    bold,
365                    italic,
366                    underline: false,
367                    strikeout,
368                    code: is_code_block,
369                    link_href: link_href.clone(),
370                };
371                if in_table {
372                    current_cell_spans.push(span);
373                } else {
374                    if !in_block {
375                        in_block = true;
376                    }
377                    current_spans.push(span);
378                }
379            }
380            Event::Code(text) => {
381                let span = ParsedSpan {
382                    text: text.to_string(),
383                    bold,
384                    italic,
385                    underline: false,
386                    strikeout,
387                    code: true,
388                    link_href: link_href.clone(),
389                };
390                if in_table {
391                    current_cell_spans.push(span);
392                } else {
393                    if !in_block {
394                        in_block = true;
395                    }
396                    current_spans.push(span);
397                }
398            }
399            Event::SoftBreak => {
400                let span = ParsedSpan {
401                    text: " ".to_string(),
402                    bold,
403                    italic,
404                    underline: false,
405                    strikeout,
406                    code: false,
407                    link_href: link_href.clone(),
408                };
409                if in_table {
410                    current_cell_spans.push(span);
411                } else {
412                    current_spans.push(span);
413                }
414            }
415            Event::HardBreak => {
416                // Finalize current block
417                if !current_spans.is_empty() || in_block {
418                    elements.push(ParsedElement::Block(ParsedBlock {
419                        spans: std::mem::take(&mut current_spans),
420                        heading_level: current_heading.take(),
421                        list_style: current_list_style.clone(),
422                        list_indent: current_list_indent,
423                        is_code_block,
424                        code_language: code_language.clone(),
425                        blockquote_depth,
426                        line_height: None,
427                        non_breakable_lines: None,
428                        direction: None,
429                        background_color: None,
430                    }));
431                }
432            }
433            Event::Start(Tag::BlockQuote(_)) => {
434                blockquote_depth += 1;
435            }
436            Event::End(TagEnd::BlockQuote(_)) => {
437                blockquote_depth = blockquote_depth.saturating_sub(1);
438            }
439            _ => {}
440        }
441    }
442
443    // Flush any remaining content
444    if !current_spans.is_empty() {
445        elements.push(ParsedElement::Block(ParsedBlock {
446            spans: std::mem::take(&mut current_spans),
447            heading_level: current_heading,
448            list_style: current_list_style,
449            list_indent: current_list_indent,
450            is_code_block,
451            code_language: code_language.take(),
452            blockquote_depth,
453            line_height: None,
454            non_breakable_lines: None,
455            direction: None,
456            background_color: None,
457        }));
458    }
459
460    // If no elements were parsed, create a single empty paragraph
461    if elements.is_empty() {
462        elements.push(ParsedElement::Block(ParsedBlock {
463            spans: vec![ParsedSpan {
464                text: String::new(),
465                ..Default::default()
466            }],
467            heading_level: None,
468            list_style: None,
469            list_indent: 0,
470            is_code_block: false,
471            code_language: None,
472            blockquote_depth: 0,
473            line_height: None,
474            non_breakable_lines: None,
475            direction: None,
476            background_color: None,
477        }));
478    }
479
480    elements
481}
482
483fn heading_level_to_i64(level: pulldown_cmark::HeadingLevel) -> i64 {
484    use pulldown_cmark::HeadingLevel;
485    match level {
486        HeadingLevel::H1 => 1,
487        HeadingLevel::H2 => 2,
488        HeadingLevel::H3 => 3,
489        HeadingLevel::H4 => 4,
490        HeadingLevel::H5 => 5,
491        HeadingLevel::H6 => 6,
492    }
493}
494
495// ─── HTML parsing ────────────────────────────────────────────────────
496
497use scraper::Node;
498
499/// Parsed CSS block-level styles from an inline `style` attribute.
500#[derive(Debug, Clone, Default)]
501struct BlockStyles {
502    line_height: Option<i64>,
503    non_breakable_lines: Option<bool>,
504    direction: Option<TextDirection>,
505    background_color: Option<String>,
506}
507
508/// Parse relevant CSS properties from an inline style string.
509/// Handles: line-height, white-space, direction, background-color.
510fn parse_block_styles(style: &str) -> BlockStyles {
511    let mut result = BlockStyles::default();
512    for part in style.split(';') {
513        let part = part.trim();
514        if let Some((prop, val)) = part.split_once(':') {
515            let prop = prop.trim().to_ascii_lowercase();
516            let val = val.trim();
517            match prop.as_str() {
518                "line-height" => {
519                    // Try parsing as a plain number (multiplier)
520                    if let Ok(v) = val.parse::<f64>() {
521                        result.line_height = Some((v * 1000.0) as i64);
522                    }
523                }
524                "white-space" => {
525                    if val == "pre" || val == "nowrap" || val == "pre-wrap" {
526                        result.non_breakable_lines = Some(true);
527                    }
528                }
529                "direction" => {
530                    if val.eq_ignore_ascii_case("rtl") {
531                        result.direction = Some(TextDirection::RightToLeft);
532                    } else if val.eq_ignore_ascii_case("ltr") {
533                        result.direction = Some(TextDirection::LeftToRight);
534                    }
535                }
536                "background-color" | "background" => {
537                    result.background_color = Some(val.to_string());
538                }
539                _ => {}
540            }
541        }
542    }
543    result
544}
545
546pub fn parse_html(html: &str) -> Vec<ParsedBlock> {
547    ParsedElement::flatten_to_blocks(parse_html_elements(html))
548}
549
550pub fn parse_html_elements(html: &str) -> Vec<ParsedElement> {
551    use scraper::Html;
552
553    let fragment = Html::parse_fragment(html);
554    let mut elements: Vec<ParsedElement> = Vec::new();
555
556    // Walk the DOM tree starting from the root
557    let root = fragment.root_element();
558
559    #[derive(Clone, Default)]
560    struct FmtState {
561        bold: bool,
562        italic: bool,
563        underline: bool,
564        strikeout: bool,
565        code: bool,
566        link_href: Option<String>,
567    }
568
569    const MAX_RECURSION_DEPTH: usize = 256;
570
571    /// Collect inline spans from a `<td>` or `<th>` cell element.
572    fn collect_cell_spans(
573        node: ego_tree::NodeRef<Node>,
574        state: &FmtState,
575        spans: &mut Vec<ParsedSpan>,
576        depth: usize,
577    ) {
578        if depth > MAX_RECURSION_DEPTH {
579            return;
580        }
581        for child in node.children() {
582            match child.value() {
583                Node::Text(text) => {
584                    let t = text.text.to_string();
585                    if !t.is_empty() {
586                        spans.push(ParsedSpan {
587                            text: t,
588                            bold: state.bold,
589                            italic: state.italic,
590                            underline: state.underline,
591                            strikeout: state.strikeout,
592                            code: state.code,
593                            link_href: state.link_href.clone(),
594                        });
595                    }
596                }
597                Node::Element(el) => {
598                    let tag = el.name();
599                    let mut new_state = state.clone();
600                    match tag {
601                        "b" | "strong" => new_state.bold = true,
602                        "i" | "em" => new_state.italic = true,
603                        "u" | "ins" => new_state.underline = true,
604                        "s" | "del" | "strike" => new_state.strikeout = true,
605                        "code" => new_state.code = true,
606                        "a" => {
607                            if let Some(href) = el.attr("href") {
608                                new_state.link_href = Some(href.to_string());
609                            }
610                        }
611                        _ => {}
612                    }
613                    collect_cell_spans(child, &new_state, spans, depth + 1);
614                }
615                _ => {}
616            }
617        }
618    }
619
620    /// Parse a `<table>` element into a ParsedTable.
621    fn parse_table_element(table_node: ego_tree::NodeRef<Node>) -> ParsedTable {
622        let mut rows: Vec<Vec<ParsedTableCell>> = Vec::new();
623        let mut header_rows: usize = 0;
624
625        fn collect_rows(
626            node: ego_tree::NodeRef<Node>,
627            rows: &mut Vec<Vec<ParsedTableCell>>,
628            header_rows: &mut usize,
629            in_thead: bool,
630        ) {
631            for child in node.children() {
632                if let Node::Element(el) = child.value() {
633                    match el.name() {
634                        "thead" => collect_rows(child, rows, header_rows, true),
635                        "tbody" | "tfoot" => collect_rows(child, rows, header_rows, false),
636                        "tr" => {
637                            let mut cells: Vec<ParsedTableCell> = Vec::new();
638                            for td in child.children() {
639                                if let Node::Element(td_el) = td.value()
640                                    && matches!(td_el.name(), "td" | "th")
641                                {
642                                    let mut spans = Vec::new();
643                                    let state = FmtState::default();
644                                    collect_cell_spans(td, &state, &mut spans, 0);
645                                    if spans.is_empty() {
646                                        spans.push(ParsedSpan::default());
647                                    }
648                                    cells.push(ParsedTableCell { spans });
649                                }
650                            }
651                            if !cells.is_empty() {
652                                rows.push(cells);
653                                if in_thead {
654                                    *header_rows += 1;
655                                }
656                            }
657                        }
658                        _ => {}
659                    }
660                }
661            }
662        }
663
664        collect_rows(table_node, &mut rows, &mut header_rows, false);
665
666        // Tables without explicit <thead> but with <th> cells: treat first row as header
667        if header_rows == 0 && !rows.is_empty() {
668            header_rows = 1;
669        }
670
671        ParsedTable { header_rows, rows }
672    }
673
674    fn walk_node(
675        node: ego_tree::NodeRef<Node>,
676        state: &FmtState,
677        elements: &mut Vec<ParsedElement>,
678        current_list_style: &Option<ListStyle>,
679        blockquote_depth: u32,
680        list_depth: u32,
681        depth: usize,
682    ) {
683        if depth > MAX_RECURSION_DEPTH {
684            return;
685        }
686        match node.value() {
687            Node::Element(el) => {
688                let tag = el.name();
689                let mut new_state = state.clone();
690                let mut new_list_style = current_list_style.clone();
691                let mut bq_depth = blockquote_depth;
692                let mut new_list_depth = list_depth;
693
694                // Determine if this is a block-level element
695                let is_block_tag = matches!(
696                    tag,
697                    "p" | "div"
698                        | "h1"
699                        | "h2"
700                        | "h3"
701                        | "h4"
702                        | "h5"
703                        | "h6"
704                        | "li"
705                        | "pre"
706                        | "br"
707                        | "blockquote"
708                );
709
710                // Update formatting state
711                match tag {
712                    "b" | "strong" => new_state.bold = true,
713                    "i" | "em" => new_state.italic = true,
714                    "u" | "ins" => new_state.underline = true,
715                    "s" | "del" | "strike" => new_state.strikeout = true,
716                    "code" => new_state.code = true,
717                    "a" => {
718                        if let Some(href) = el.attr("href") {
719                            new_state.link_href = Some(href.to_string());
720                        }
721                    }
722                    "ul" => {
723                        new_list_style = Some(ListStyle::Disc);
724                        new_list_depth = list_depth + 1;
725                    }
726                    "ol" => {
727                        new_list_style = Some(ListStyle::Decimal);
728                        new_list_depth = list_depth + 1;
729                    }
730                    "blockquote" => {
731                        bq_depth += 1;
732                    }
733                    _ => {}
734                }
735
736                // Determine heading level
737                let heading_level = match tag {
738                    "h1" => Some(1),
739                    "h2" => Some(2),
740                    "h3" => Some(3),
741                    "h4" => Some(4),
742                    "h5" => Some(5),
743                    "h6" => Some(6),
744                    _ => None,
745                };
746
747                let is_code_block = tag == "pre";
748
749                // Extract code language from <pre><code class="language-xxx">
750                let code_language = if is_code_block {
751                    node.children().find_map(|child| {
752                        if let Node::Element(cel) = child.value()
753                            && cel.name() == "code"
754                            && let Some(cls) = cel.attr("class")
755                        {
756                            return cls
757                                .split_whitespace()
758                                .find_map(|c| c.strip_prefix("language-"))
759                                .map(|l| l.to_string());
760                        }
761                        None
762                    })
763                } else {
764                    None
765                };
766
767                // Extract CSS styles from block-level elements
768                let css = if is_block_tag {
769                    el.attr("style").map(parse_block_styles).unwrap_or_default()
770                } else {
771                    BlockStyles::default()
772                };
773
774                if tag == "table" {
775                    // Parse table structure into a ParsedTable
776                    let parsed_table = parse_table_element(node);
777                    if !parsed_table.rows.is_empty() {
778                        elements.push(ParsedElement::Table(parsed_table));
779                    }
780                    return;
781                }
782
783                if tag == "br" {
784                    // <br> creates a new block
785                    elements.push(ParsedElement::Block(ParsedBlock {
786                        spans: vec![ParsedSpan {
787                            text: String::new(),
788                            ..Default::default()
789                        }],
790                        heading_level: None,
791                        list_style: None,
792                        list_indent: 0,
793                        is_code_block: false,
794                        code_language: None,
795                        blockquote_depth: bq_depth,
796                        line_height: None,
797                        non_breakable_lines: None,
798                        direction: None,
799                        background_color: None,
800                    }));
801                    return;
802                }
803
804                if tag == "blockquote" {
805                    // Blockquote is a container — recurse into children with increased depth
806                    for child in node.children() {
807                        walk_node(
808                            child,
809                            &new_state,
810                            elements,
811                            &new_list_style,
812                            bq_depth,
813                            new_list_depth,
814                            depth + 1,
815                        );
816                    }
817                } else if is_block_tag && tag != "br" {
818                    // Start collecting spans for a new block
819                    let mut spans: Vec<ParsedSpan> = Vec::new();
820                    collect_inline_spans(
821                        node,
822                        &new_state,
823                        &mut spans,
824                        &new_list_style,
825                        elements,
826                        bq_depth,
827                        new_list_depth,
828                        depth + 1,
829                    );
830
831                    let list_style_for_block = if tag == "li" {
832                        new_list_style.clone()
833                    } else {
834                        None
835                    };
836
837                    let list_indent_for_block = if tag == "li" {
838                        new_list_depth.saturating_sub(1)
839                    } else {
840                        0
841                    };
842
843                    if !spans.is_empty() || heading_level.is_some() {
844                        elements.push(ParsedElement::Block(ParsedBlock {
845                            spans,
846                            heading_level,
847                            list_style: list_style_for_block,
848                            list_indent: list_indent_for_block,
849                            is_code_block,
850                            code_language,
851                            blockquote_depth: bq_depth,
852                            line_height: css.line_height,
853                            non_breakable_lines: css.non_breakable_lines,
854                            direction: css.direction,
855                            background_color: css.background_color,
856                        }));
857                    }
858                } else if matches!(tag, "ul" | "ol" | "thead" | "tbody" | "tr") {
859                    // Container elements: recurse into children
860                    for child in node.children() {
861                        walk_node(
862                            child,
863                            &new_state,
864                            elements,
865                            &new_list_style,
866                            bq_depth,
867                            new_list_depth,
868                            depth + 1,
869                        );
870                    }
871                } else {
872                    // Inline element or unknown: recurse
873                    for child in node.children() {
874                        walk_node(
875                            child,
876                            &new_state,
877                            elements,
878                            current_list_style,
879                            bq_depth,
880                            list_depth,
881                            depth + 1,
882                        );
883                    }
884                }
885            }
886            Node::Text(text) => {
887                let t = text.text.to_string();
888                let trimmed = t.trim();
889                if !trimmed.is_empty() {
890                    // Bare text not in a block — create a paragraph
891                    elements.push(ParsedElement::Block(ParsedBlock {
892                        spans: vec![ParsedSpan {
893                            text: trimmed.to_string(),
894                            bold: state.bold,
895                            italic: state.italic,
896                            underline: state.underline,
897                            strikeout: state.strikeout,
898                            code: state.code,
899                            link_href: state.link_href.clone(),
900                        }],
901                        heading_level: None,
902                        list_style: None,
903                        list_indent: 0,
904                        is_code_block: false,
905                        code_language: None,
906                        blockquote_depth,
907                        line_height: None,
908                        non_breakable_lines: None,
909                        direction: None,
910                        background_color: None,
911                    }));
912                }
913            }
914            _ => {
915                // Document, Comment, etc. — recurse children
916                for child in node.children() {
917                    walk_node(
918                        child,
919                        state,
920                        elements,
921                        current_list_style,
922                        blockquote_depth,
923                        list_depth,
924                        depth + 1,
925                    );
926                }
927            }
928        }
929    }
930
931    /// Collect inline spans from a block-level element's children.
932    /// If a nested block-level element is encountered, it is flushed as a
933    /// separate block.
934    #[allow(clippy::too_many_arguments)]
935    fn collect_inline_spans(
936        node: ego_tree::NodeRef<Node>,
937        state: &FmtState,
938        spans: &mut Vec<ParsedSpan>,
939        current_list_style: &Option<ListStyle>,
940        elements: &mut Vec<ParsedElement>,
941        blockquote_depth: u32,
942        list_depth: u32,
943        depth: usize,
944    ) {
945        if depth > MAX_RECURSION_DEPTH {
946            return;
947        }
948        for child in node.children() {
949            match child.value() {
950                Node::Text(text) => {
951                    let t = text.text.to_string();
952                    if !t.is_empty() {
953                        spans.push(ParsedSpan {
954                            text: t,
955                            bold: state.bold,
956                            italic: state.italic,
957                            underline: state.underline,
958                            strikeout: state.strikeout,
959                            code: state.code,
960                            link_href: state.link_href.clone(),
961                        });
962                    }
963                }
964                Node::Element(el) => {
965                    let tag = el.name();
966                    let mut new_state = state.clone();
967
968                    match tag {
969                        "b" | "strong" => new_state.bold = true,
970                        "i" | "em" => new_state.italic = true,
971                        "u" | "ins" => new_state.underline = true,
972                        "s" | "del" | "strike" => new_state.strikeout = true,
973                        "code" => new_state.code = true,
974                        "a" => {
975                            if let Some(href) = el.attr("href") {
976                                new_state.link_href = Some(href.to_string());
977                            }
978                        }
979                        _ => {}
980                    }
981
982                    // Check for nested block elements
983                    let nested_block = matches!(
984                        tag,
985                        "p" | "div"
986                            | "h1"
987                            | "h2"
988                            | "h3"
989                            | "h4"
990                            | "h5"
991                            | "h6"
992                            | "li"
993                            | "pre"
994                            | "blockquote"
995                            | "ul"
996                            | "ol"
997                    );
998
999                    if tag == "br" {
1000                        // br within a block: treat as splitting into new block
1001                        // For simplicity, just add a newline to current span
1002                        spans.push(ParsedSpan {
1003                            text: String::new(),
1004                            ..Default::default()
1005                        });
1006                    } else if nested_block || tag == "table" {
1007                        // Flush as separate element
1008                        walk_node(
1009                            child,
1010                            &new_state,
1011                            elements,
1012                            current_list_style,
1013                            blockquote_depth,
1014                            list_depth,
1015                            depth + 1,
1016                        );
1017                    } else {
1018                        // Inline element: recurse
1019                        collect_inline_spans(
1020                            child,
1021                            &new_state,
1022                            spans,
1023                            current_list_style,
1024                            elements,
1025                            blockquote_depth,
1026                            list_depth,
1027                            depth + 1,
1028                        );
1029                    }
1030                }
1031                _ => {}
1032            }
1033        }
1034    }
1035
1036    let initial_state = FmtState::default();
1037    for child in root.children() {
1038        walk_node(child, &initial_state, &mut elements, &None, 0, 0, 0);
1039    }
1040
1041    // If no elements were parsed, create a single empty paragraph
1042    if elements.is_empty() {
1043        elements.push(ParsedElement::Block(ParsedBlock {
1044            spans: vec![ParsedSpan {
1045                text: String::new(),
1046                ..Default::default()
1047            }],
1048            heading_level: None,
1049            list_style: None,
1050            list_indent: 0,
1051            is_code_block: false,
1052            code_language: None,
1053            blockquote_depth: 0,
1054            line_height: None,
1055            non_breakable_lines: None,
1056            direction: None,
1057            background_color: None,
1058        }));
1059    }
1060
1061    elements
1062}
1063
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `md` and flattens the result to plain blocks, for tests that
    /// do not need to inspect table structure.
    fn parse_markdown_blocks(md: &str) -> Vec<ParsedBlock> {
        let elements = parse_markdown(md);
        ParsedElement::flatten_to_blocks(elements)
    }

    /// Returns the first span whose text is exactly `wanted`; panics if no
    /// such span exists.
    fn span_with_text<'a>(spans: &'a [ParsedSpan], wanted: &str) -> &'a ParsedSpan {
        spans.iter().find(|span| span.text == wanted).unwrap()
    }

    /// Returns the first span of the cell at (`row`, `col`) of `table`.
    fn first_span(table: &ParsedTable, row: usize, col: usize) -> &ParsedSpan {
        &table.rows[row][col].spans[0]
    }

    // ── Markdown / HTML: basic blocks ───────────────────────────────────

    #[test]
    fn test_parse_markdown_simple_paragraph() {
        let parsed = parse_markdown_blocks("Hello **world**");
        assert_eq!(parsed.len(), 1);

        let spans = &parsed[0].spans;
        assert!(spans.len() >= 2);

        // The leading "Hello " run carries no emphasis …
        let plain = spans
            .iter()
            .find(|span| span.text.contains("Hello"))
            .unwrap();
        assert!(!plain.bold);
        // … while "world" is its own bold span.
        assert!(span_with_text(spans, "world").bold);
    }

    #[test]
    fn test_parse_markdown_heading() {
        let parsed = parse_markdown_blocks("# Title");
        assert_eq!(parsed.len(), 1);

        let heading = &parsed[0];
        assert_eq!(heading.heading_level, Some(1));
        assert_eq!(heading.spans[0].text, "Title");
    }

    #[test]
    fn test_parse_markdown_list() {
        let items = parse_markdown_blocks("- item1\n- item2");
        assert!(items.len() >= 2);
        for item in &items[..2] {
            assert_eq!(item.list_style, Some(ListStyle::Disc));
        }
    }

    #[test]
    fn test_parse_html_simple() {
        let parsed = parse_html("<p>Hello <b>world</b></p>");
        assert_eq!(parsed.len(), 1);

        let spans = &parsed[0].spans;
        assert!(spans.len() >= 2);
        assert!(span_with_text(spans, "world").bold);
    }

    #[test]
    fn test_parse_html_multiple_paragraphs() {
        let parsed = parse_html("<p>A</p><p>B</p>");
        assert_eq!(parsed.len(), 2);
    }

    #[test]
    fn test_parse_html_heading() {
        let parsed = parse_html("<h2>Subtitle</h2>");
        assert_eq!(parsed.len(), 1);

        let heading = &parsed[0];
        assert_eq!(heading.heading_level, Some(2));
    }

    #[test]
    fn test_parse_html_list() {
        let items = parse_html("<ul><li>one</li><li>two</li></ul>");
        assert!(items.len() >= 2);

        let first = &items[0];
        assert_eq!(first.list_style, Some(ListStyle::Disc));
    }

    #[test]
    fn test_parse_markdown_code_block() {
        let parsed = parse_markdown_blocks("```\nfn main() {}\n```");
        assert_eq!(parsed.len(), 1);

        let code = &parsed[0];
        assert!(code.is_code_block);
        assert!(code.spans[0].code);

        // pulldown-cmark appends a trailing \n to code block text; the parser
        // is expected to strip it.
        let mut text = String::new();
        for span in &code.spans {
            text.push_str(&span.text);
        }
        assert_eq!(
            text, "fn main() {}",
            "code block text should not have trailing newline"
        );
    }

    // ── Inline formatting and links ─────────────────────────────────────

    #[test]
    fn test_parse_markdown_nested_formatting() {
        let parsed = parse_markdown_blocks("***bold italic***");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    #[test]
    fn test_parse_markdown_link() {
        let parsed = parse_markdown_blocks("[click](http://example.com)");
        assert_eq!(parsed.len(), 1);

        let link = &parsed[0].spans[0];
        assert_eq!(link.text, "click");
        assert_eq!(link.link_href.as_deref(), Some("http://example.com"));
    }

    #[test]
    fn test_parse_markdown_empty() {
        // Empty input still yields one block holding one empty span.
        let parsed = parse_markdown_blocks("");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans[0].text.is_empty());
    }

    #[test]
    fn test_parse_html_empty() {
        // Empty input still yields one block holding one empty span.
        let parsed = parse_html("");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans[0].text.is_empty());
    }

    #[test]
    fn test_parse_html_nested_formatting() {
        let parsed = parse_html("<p><b><i>bold italic</i></b></p>");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    #[test]
    fn test_parse_html_link() {
        let parsed = parse_html("<p><a href=\"http://example.com\">click</a></p>");
        assert_eq!(parsed.len(), 1);

        let link = &parsed[0].spans[0];
        assert_eq!(link.text, "click");
        assert_eq!(link.link_href.as_deref(), Some("http://example.com"));
    }

    // ── Lists and blockquotes ───────────────────────────────────────────

    #[test]
    fn test_parse_html_ordered_list() {
        let items = parse_html("<ol><li>first</li><li>second</li></ol>");
        assert!(items.len() >= 2);

        let first = &items[0];
        assert_eq!(first.list_style, Some(ListStyle::Decimal));
    }

    #[test]
    fn test_parse_markdown_ordered_list() {
        let items = parse_markdown_blocks("1. first\n2. second");
        assert!(items.len() >= 2);

        let first = &items[0];
        assert_eq!(first.list_style, Some(ListStyle::Decimal));
    }

    #[test]
    fn test_parse_html_blockquote_nested() {
        let parsed = parse_html("<p>before</p><blockquote>quoted</blockquote><p>after</p>");
        assert!(parsed.len() >= 3);
    }

    // ── Inline CSS on block elements ────────────────────────────────────

    #[test]
    fn test_parse_block_styles_line_height() {
        let css = parse_block_styles("line-height: 1.5");
        assert_eq!(css.line_height, Some(1500));
    }

    #[test]
    fn test_parse_block_styles_direction_rtl() {
        let css = parse_block_styles("direction: rtl");
        assert_eq!(css.direction, Some(TextDirection::RightToLeft));
    }

    #[test]
    fn test_parse_block_styles_background_color() {
        let css = parse_block_styles("background-color: #ff0000");
        assert_eq!(css.background_color.as_deref(), Some("#ff0000"));
    }

    #[test]
    fn test_parse_block_styles_white_space_pre() {
        let css = parse_block_styles("white-space: pre");
        assert_eq!(css.non_breakable_lines, Some(true));
    }

    #[test]
    fn test_parse_block_styles_multiple() {
        let css = parse_block_styles("line-height: 2.0; direction: rtl; background-color: blue");
        assert_eq!(css.line_height, Some(2000));
        assert_eq!(css.direction, Some(TextDirection::RightToLeft));
        assert_eq!(css.background_color.as_deref(), Some("blue"));
    }

    #[test]
    fn test_parse_html_block_styles_extracted() {
        let html = r#"<p style="line-height: 1.5; direction: rtl; background-color: #ccc">text</p>"#;
        let parsed = parse_html(html);
        assert_eq!(parsed.len(), 1);

        let styled = &parsed[0];
        assert_eq!(styled.line_height, Some(1500));
        assert_eq!(styled.direction, Some(TextDirection::RightToLeft));
        assert_eq!(styled.background_color.as_deref(), Some("#ccc"));
    }

    #[test]
    fn test_parse_html_white_space_pre() {
        let parsed = parse_html(r#"<p style="white-space: pre">code</p>"#);
        assert_eq!(parsed.len(), 1);

        let styled = &parsed[0];
        assert_eq!(styled.non_breakable_lines, Some(true));
    }

    #[test]
    fn test_parse_html_no_styles_returns_none() {
        let parsed = parse_html("<p>plain</p>");
        assert_eq!(parsed.len(), 1);

        // Without a `style` attribute every style-derived field stays unset.
        let unstyled = &parsed[0];
        assert_eq!(unstyled.line_height, None);
        assert_eq!(unstyled.direction, None);
        assert_eq!(unstyled.background_color, None);
        assert_eq!(unstyled.non_breakable_lines, None);
    }

    // ── Nested list indentation ─────────────────────────────────────────

    #[test]
    fn test_parse_markdown_nested_list_indent() {
        let parsed = parse_markdown_blocks("- top\n  - nested\n    - deep");
        assert_eq!(parsed.len(), 3);

        // Each nesting level increases the indent by one.
        for (item, expected_indent) in parsed.iter().zip([0, 1, 2]) {
            assert_eq!(item.list_style, Some(ListStyle::Disc));
            assert_eq!(item.list_indent, expected_indent);
        }
    }

    #[test]
    fn test_parse_markdown_nested_ordered_list_indent() {
        let parsed = parse_markdown_blocks("1. first\n   1. nested\n   2. nested2");
        assert_eq!(parsed.len(), 3);

        let indents: Vec<_> = parsed.iter().map(|item| item.list_indent).collect();
        assert_eq!(indents, vec![0, 1, 1]);
    }

    #[test]
    fn test_parse_html_nested_list_indent() {
        let parsed = parse_html("<ul><li>top</li><ul><li>nested</li></ul></ul>");
        assert!(parsed.len() >= 2);

        let (outer, inner) = (&parsed[0], &parsed[1]);
        assert_eq!(outer.list_indent, 0);
        assert_eq!(inner.list_indent, 1);
    }

    // ── Tables ──────────────────────────────────────────────────────────

    #[test]
    fn test_parse_markdown_table() {
        let elements = parse_markdown("| A | B |\n|---|---|\n| 1 | 2 |");
        assert_eq!(elements.len(), 1);

        let ParsedElement::Table(table) = &elements[0] else {
            panic!("Expected ParsedElement::Table");
        };
        assert_eq!(table.header_rows, 1);
        assert_eq!(table.rows.len(), 2); // 1 header + 1 body

        // First the header row, then the single body row.
        let expected_cells = [["A", "B"], ["1", "2"]];
        for (row, expected_row) in table.rows.iter().zip(expected_cells) {
            assert_eq!(row.len(), 2);
            for (cell, expected_text) in row.iter().zip(expected_row) {
                assert_eq!(cell.spans[0].text, expected_text);
            }
        }
    }

    #[test]
    fn test_parse_markdown_table_with_formatting() {
        let elements = parse_markdown(
            "| **bold** | `code` | *italic* |\n|---|---|---|\n| ~~strike~~ | plain | [link](http://x.com) |",
        );
        assert_eq!(elements.len(), 1);

        let ParsedElement::Table(table) = &elements[0] else {
            panic!("Expected ParsedElement::Table");
        };
        assert_eq!(table.rows.len(), 2);

        // Header row: bold, code and italic cells.
        assert!(first_span(table, 0, 0).bold);
        assert!(first_span(table, 0, 1).code);
        assert!(first_span(table, 0, 2).italic);
        // Body row: strikeout and link cells.
        assert!(first_span(table, 1, 0).strikeout);
        assert_eq!(
            first_span(table, 1, 2).link_href.as_deref(),
            Some("http://x.com")
        );
    }

    #[test]
    fn test_parse_markdown_mixed_content_with_table() {
        let elements = parse_markdown("Before\n\n| A | B |\n|---|---|\n| 1 | 2 |\n\nAfter");
        assert_eq!(elements.len(), 3);

        // Paragraph, table, paragraph — in document order.
        assert!(matches!(
            elements.as_slice(),
            [
                ParsedElement::Block(_),
                ParsedElement::Table(_),
                ParsedElement::Block(_)
            ]
        ));
    }
}