// common/parser_tools/content_parser.rs

1use crate::entities::{ListStyle, TextDirection};
2
/// A run of inline text together with the character-level formatting that
/// applies to the whole run.
#[derive(Debug, Clone, Default)]
pub struct ParsedSpan {
    /// The literal text of the run.
    pub text: String,
    /// Strong emphasis (`**text**`, `<b>`, `<strong>`).
    pub bold: bool,
    /// Emphasis (`*text*`, `<i>`, `<em>`).
    pub italic: bool,
    /// Underline (`<u>`, `<ins>`); markdown has no syntax for it, so the
    /// markdown parser always leaves this `false`.
    pub underline: bool,
    /// Strikethrough (`~~text~~`, `<s>`, `<del>`, `<strike>`).
    pub strikeout: bool,
    /// Inline code, or text that belongs to a code block.
    pub code: bool,
    /// Destination of the enclosing link, if the run is inside one.
    pub link_href: Option<String>,
}
14
15/// A parsed table cell containing inline spans.
16#[derive(Debug, Clone)]
17pub struct ParsedTableCell {
18    pub spans: Vec<ParsedSpan>,
19}
20
21/// A parsed table extracted from markdown or HTML.
22#[derive(Debug, Clone)]
23pub struct ParsedTable {
24    /// Number of header rows (typically 1 for markdown tables).
25    pub header_rows: usize,
26    /// All rows (header + body), each containing cells with their inline spans.
27    pub rows: Vec<Vec<ParsedTableCell>>,
28}
29
30/// A parsed element: either a block or a table.
31#[derive(Debug, Clone)]
32pub enum ParsedElement {
33    Block(ParsedBlock),
34    Table(ParsedTable),
35}
36
37impl ParsedElement {
38    /// Extract blocks, flattening tables into one block per cell.
39    /// Use when table structure is not needed.
40    pub fn flatten_to_blocks(elements: Vec<ParsedElement>) -> Vec<ParsedBlock> {
41        let mut blocks = Vec::new();
42        for elem in elements {
43            match elem {
44                ParsedElement::Block(b) => blocks.push(b),
45                ParsedElement::Table(t) => {
46                    for row in t.rows {
47                        for cell in row {
48                            blocks.push(ParsedBlock {
49                                spans: cell.spans,
50                                heading_level: None,
51                                list_style: None,
52                                list_indent: 0,
53                                is_code_block: false,
54                                code_language: None,
55                                blockquote_depth: 0,
56                                line_height: None,
57                                non_breakable_lines: None,
58                                direction: None,
59                                background_color: None,
60                            });
61                        }
62                    }
63                }
64            }
65        }
66        if blocks.is_empty() {
67            blocks.push(ParsedBlock {
68                spans: vec![ParsedSpan {
69                    text: String::new(),
70                    ..Default::default()
71                }],
72                heading_level: None,
73                list_style: None,
74                list_indent: 0,
75                is_code_block: false,
76                code_language: None,
77                blockquote_depth: 0,
78                line_height: None,
79                non_breakable_lines: None,
80                direction: None,
81                background_color: None,
82            });
83        }
84        blocks
85    }
86}
87
88/// A parsed block (paragraph, heading, list item, code block)
89#[derive(Debug, Clone)]
90pub struct ParsedBlock {
91    pub spans: Vec<ParsedSpan>,
92    pub heading_level: Option<i64>,
93    pub list_style: Option<ListStyle>,
94    pub list_indent: u32,
95    pub is_code_block: bool,
96    pub code_language: Option<String>,
97    pub blockquote_depth: u32,
98    pub line_height: Option<i64>,
99    pub non_breakable_lines: Option<bool>,
100    pub direction: Option<TextDirection>,
101    pub background_color: Option<String>,
102}
103
104impl ParsedBlock {
105    /// Returns `true` when this block carries no block-level formatting,
106    /// meaning its content is purely inline.
107    pub fn is_inline_only(&self) -> bool {
108        self.heading_level.is_none()
109            && self.list_style.is_none()
110            && !self.is_code_block
111            && self.blockquote_depth == 0
112            && self.line_height.is_none()
113            && self.non_breakable_lines.is_none()
114            && self.direction.is_none()
115            && self.background_color.is_none()
116    }
117}
118
119// ─── Markdown parsing ────────────────────────────────────────────────
120
121pub fn parse_markdown(markdown: &str) -> Vec<ParsedElement> {
122    use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
123
124    let options =
125        Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_TASKLISTS;
126    let parser = Parser::new_ext(markdown, options);
127
128    let mut elements: Vec<ParsedElement> = Vec::new();
129    let mut current_spans: Vec<ParsedSpan> = Vec::new();
130    let mut current_heading: Option<i64> = None;
131    let mut current_list_style: Option<ListStyle> = None;
132    let mut is_code_block = false;
133    let mut code_language: Option<String> = None;
134    let mut blockquote_depth: u32 = 0;
135    let mut in_block = false;
136
137    // Formatting state stack
138    let mut bold = false;
139    let mut italic = false;
140    let mut strikeout = false;
141    let mut link_href: Option<String> = None;
142
143    // List style stack for nested lists (also tracks nesting depth)
144    let mut list_stack: Vec<Option<ListStyle>> = Vec::new();
145    let mut current_list_indent: u32 = 0;
146
147    // Table tracking state
148    let mut in_table = false;
149    let mut in_table_head = false;
150    let mut table_rows: Vec<Vec<ParsedTableCell>> = Vec::new();
151    let mut current_row_cells: Vec<ParsedTableCell> = Vec::new();
152    let mut current_cell_spans: Vec<ParsedSpan> = Vec::new();
153    let mut table_header_rows: usize = 0;
154
155    for event in parser {
156        match event {
157            Event::Start(Tag::Paragraph) => {
158                in_block = true;
159                current_heading = None;
160                is_code_block = false;
161            }
162            Event::End(TagEnd::Paragraph) => {
163                if !current_spans.is_empty() || in_block {
164                    elements.push(ParsedElement::Block(ParsedBlock {
165                        spans: std::mem::take(&mut current_spans),
166                        heading_level: current_heading.take(),
167                        list_style: current_list_style.clone(),
168                        list_indent: current_list_indent,
169                        is_code_block: false,
170                        code_language: None,
171                        blockquote_depth,
172                        line_height: None,
173                        non_breakable_lines: None,
174                        direction: None,
175                        background_color: None,
176                    }));
177                }
178                in_block = false;
179                current_list_style = None;
180            }
181            Event::Start(Tag::Heading { level, .. }) => {
182                in_block = true;
183                current_heading = Some(heading_level_to_i64(level));
184                is_code_block = false;
185            }
186            Event::End(TagEnd::Heading(_)) => {
187                elements.push(ParsedElement::Block(ParsedBlock {
188                    spans: std::mem::take(&mut current_spans),
189                    heading_level: current_heading.take(),
190                    list_style: None,
191                    list_indent: 0,
192                    is_code_block: false,
193                    code_language: None,
194                    blockquote_depth,
195                    line_height: None,
196                    non_breakable_lines: None,
197                    direction: None,
198                    background_color: None,
199                }));
200                in_block = false;
201            }
202            Event::Start(Tag::List(ordered)) => {
203                let style = if ordered.is_some() {
204                    Some(ListStyle::Decimal)
205                } else {
206                    Some(ListStyle::Disc)
207                };
208                list_stack.push(style);
209            }
210            Event::End(TagEnd::List(_)) => {
211                list_stack.pop();
212            }
213            Event::Start(Tag::Item) => {
214                // Flush any accumulated spans from the parent item before
215                // starting a child item in a tight list
216                if !current_spans.is_empty() {
217                    elements.push(ParsedElement::Block(ParsedBlock {
218                        spans: std::mem::take(&mut current_spans),
219                        heading_level: None,
220                        list_style: current_list_style.clone(),
221                        list_indent: current_list_indent,
222                        is_code_block: false,
223                        code_language: None,
224                        blockquote_depth,
225                        line_height: None,
226                        non_breakable_lines: None,
227                        direction: None,
228                        background_color: None,
229                    }));
230                }
231                in_block = true;
232                current_list_style = list_stack.last().cloned().flatten();
233                current_list_indent = if list_stack.is_empty() {
234                    0
235                } else {
236                    (list_stack.len() - 1) as u32
237                };
238            }
239            Event::End(TagEnd::Item) => {
240                // The paragraph inside the item will have already been flushed,
241                // but if there was no inner paragraph (tight list), flush now.
242                if !current_spans.is_empty() {
243                    elements.push(ParsedElement::Block(ParsedBlock {
244                        spans: std::mem::take(&mut current_spans),
245                        heading_level: None,
246                        list_style: current_list_style.clone(),
247                        list_indent: current_list_indent,
248                        is_code_block: false,
249                        code_language: None,
250                        blockquote_depth,
251                        line_height: None,
252                        non_breakable_lines: None,
253                        direction: None,
254                        background_color: None,
255                    }));
256                }
257                in_block = false;
258                current_list_style = None;
259            }
260            Event::Start(Tag::CodeBlock(kind)) => {
261                in_block = true;
262                is_code_block = true;
263                code_language = match &kind {
264                    pulldown_cmark::CodeBlockKind::Fenced(lang) if !lang.is_empty() => {
265                        Some(lang.to_string())
266                    }
267                    _ => None,
268                };
269            }
270            Event::End(TagEnd::CodeBlock) => {
271                // pulldown-cmark appends a trailing '\n' to code block text — strip it
272                if let Some(last) = current_spans.last_mut()
273                    && last.text.ends_with('\n')
274                {
275                    last.text.truncate(last.text.len() - 1);
276                }
277                elements.push(ParsedElement::Block(ParsedBlock {
278                    spans: std::mem::take(&mut current_spans),
279                    heading_level: None,
280                    list_style: None,
281                    list_indent: 0,
282                    is_code_block: true,
283                    code_language: code_language.take(),
284                    blockquote_depth,
285                    line_height: None,
286                    non_breakable_lines: None,
287                    direction: None,
288                    background_color: None,
289                }));
290                in_block = false;
291                is_code_block = false;
292            }
293            // ─── Table events ───────────────────────────────────────
294            Event::Start(Tag::Table(_)) => {
295                in_table = true;
296                in_table_head = false;
297                table_rows.clear();
298                current_row_cells.clear();
299                current_cell_spans.clear();
300                table_header_rows = 0;
301            }
302            Event::End(TagEnd::Table) => {
303                elements.push(ParsedElement::Table(ParsedTable {
304                    header_rows: table_header_rows,
305                    rows: std::mem::take(&mut table_rows),
306                }));
307                in_table = false;
308            }
309            Event::Start(Tag::TableHead) => {
310                in_table_head = true;
311                current_row_cells.clear();
312            }
313            Event::End(TagEnd::TableHead) => {
314                // Flush the header row
315                table_rows.push(std::mem::take(&mut current_row_cells));
316                table_header_rows += 1;
317                in_table_head = false;
318            }
319            Event::Start(Tag::TableRow) => {
320                current_row_cells.clear();
321            }
322            Event::End(TagEnd::TableRow) if !in_table_head => {
323                // Body rows only — header row is flushed in End(TableHead)
324                table_rows.push(std::mem::take(&mut current_row_cells));
325            }
326            Event::Start(Tag::TableCell) => {
327                current_cell_spans.clear();
328            }
329            Event::End(TagEnd::TableCell) => {
330                current_row_cells.push(ParsedTableCell {
331                    spans: std::mem::take(&mut current_cell_spans),
332                });
333            }
334            // ─── Inline formatting ──────────────────────────────────
335            Event::Start(Tag::Emphasis) => {
336                italic = true;
337            }
338            Event::End(TagEnd::Emphasis) => {
339                italic = false;
340            }
341            Event::Start(Tag::Strong) => {
342                bold = true;
343            }
344            Event::End(TagEnd::Strong) => {
345                bold = false;
346            }
347            Event::Start(Tag::Strikethrough) => {
348                strikeout = true;
349            }
350            Event::End(TagEnd::Strikethrough) => {
351                strikeout = false;
352            }
353            Event::Start(Tag::Link { dest_url, .. }) => {
354                link_href = Some(dest_url.to_string());
355            }
356            Event::End(TagEnd::Link) => {
357                link_href = None;
358            }
359            Event::Text(text) => {
360                let span = ParsedSpan {
361                    text: text.to_string(),
362                    bold,
363                    italic,
364                    underline: false,
365                    strikeout,
366                    code: is_code_block,
367                    link_href: link_href.clone(),
368                };
369                if in_table {
370                    current_cell_spans.push(span);
371                } else {
372                    if !in_block {
373                        in_block = true;
374                    }
375                    current_spans.push(span);
376                }
377            }
378            Event::Code(text) => {
379                let span = ParsedSpan {
380                    text: text.to_string(),
381                    bold,
382                    italic,
383                    underline: false,
384                    strikeout,
385                    code: true,
386                    link_href: link_href.clone(),
387                };
388                if in_table {
389                    current_cell_spans.push(span);
390                } else {
391                    if !in_block {
392                        in_block = true;
393                    }
394                    current_spans.push(span);
395                }
396            }
397            Event::SoftBreak => {
398                let span = ParsedSpan {
399                    text: " ".to_string(),
400                    bold,
401                    italic,
402                    underline: false,
403                    strikeout,
404                    code: false,
405                    link_href: link_href.clone(),
406                };
407                if in_table {
408                    current_cell_spans.push(span);
409                } else {
410                    current_spans.push(span);
411                }
412            }
413            Event::HardBreak if !current_spans.is_empty() || in_block => {
414                // Finalize current block
415                elements.push(ParsedElement::Block(ParsedBlock {
416                    spans: std::mem::take(&mut current_spans),
417                    heading_level: current_heading.take(),
418                    list_style: current_list_style.clone(),
419                    list_indent: current_list_indent,
420                    is_code_block,
421                    code_language: code_language.clone(),
422                    blockquote_depth,
423                    line_height: None,
424                    non_breakable_lines: None,
425                    direction: None,
426                    background_color: None,
427                }));
428            }
429            Event::Start(Tag::BlockQuote(_)) => {
430                blockquote_depth += 1;
431            }
432            Event::End(TagEnd::BlockQuote(_)) => {
433                blockquote_depth = blockquote_depth.saturating_sub(1);
434            }
435            _ => {}
436        }
437    }
438
439    // Flush any remaining content
440    if !current_spans.is_empty() {
441        elements.push(ParsedElement::Block(ParsedBlock {
442            spans: std::mem::take(&mut current_spans),
443            heading_level: current_heading,
444            list_style: current_list_style,
445            list_indent: current_list_indent,
446            is_code_block,
447            code_language: code_language.take(),
448            blockquote_depth,
449            line_height: None,
450            non_breakable_lines: None,
451            direction: None,
452            background_color: None,
453        }));
454    }
455
456    // If no elements were parsed, create a single empty paragraph
457    if elements.is_empty() {
458        elements.push(ParsedElement::Block(ParsedBlock {
459            spans: vec![ParsedSpan {
460                text: String::new(),
461                ..Default::default()
462            }],
463            heading_level: None,
464            list_style: None,
465            list_indent: 0,
466            is_code_block: false,
467            code_language: None,
468            blockquote_depth: 0,
469            line_height: None,
470            non_breakable_lines: None,
471            direction: None,
472            background_color: None,
473        }));
474    }
475
476    elements
477}
478
479fn heading_level_to_i64(level: pulldown_cmark::HeadingLevel) -> i64 {
480    use pulldown_cmark::HeadingLevel;
481    match level {
482        HeadingLevel::H1 => 1,
483        HeadingLevel::H2 => 2,
484        HeadingLevel::H3 => 3,
485        HeadingLevel::H4 => 4,
486        HeadingLevel::H5 => 5,
487        HeadingLevel::H6 => 6,
488    }
489}
490
491// ─── HTML parsing ────────────────────────────────────────────────────
492
493use scraper::Node;
494
495/// Parsed CSS block-level styles from an inline `style` attribute.
496#[derive(Debug, Clone, Default)]
497struct BlockStyles {
498    line_height: Option<i64>,
499    non_breakable_lines: Option<bool>,
500    direction: Option<TextDirection>,
501    background_color: Option<String>,
502}
503
504/// Parse relevant CSS properties from an inline style string.
505/// Handles: line-height, white-space, direction, background-color.
506fn parse_block_styles(style: &str) -> BlockStyles {
507    let mut result = BlockStyles::default();
508    for part in style.split(';') {
509        let part = part.trim();
510        if let Some((prop, val)) = part.split_once(':') {
511            let prop = prop.trim().to_ascii_lowercase();
512            let val = val.trim();
513            match prop.as_str() {
514                "line-height" => {
515                    // Try parsing as a plain number (multiplier)
516                    if let Ok(v) = val.parse::<f64>() {
517                        result.line_height = Some((v * 1000.0) as i64);
518                    }
519                }
520                "white-space" if val == "pre" || val == "nowrap" || val == "pre-wrap" => {
521                    result.non_breakable_lines = Some(true);
522                }
523                "direction" => {
524                    if val.eq_ignore_ascii_case("rtl") {
525                        result.direction = Some(TextDirection::RightToLeft);
526                    } else if val.eq_ignore_ascii_case("ltr") {
527                        result.direction = Some(TextDirection::LeftToRight);
528                    }
529                }
530                "background-color" | "background" => {
531                    result.background_color = Some(val.to_string());
532                }
533                _ => {}
534            }
535        }
536    }
537    result
538}
539
540pub fn parse_html(html: &str) -> Vec<ParsedBlock> {
541    ParsedElement::flatten_to_blocks(parse_html_elements(html))
542}
543
544pub fn parse_html_elements(html: &str) -> Vec<ParsedElement> {
545    use scraper::Html;
546
547    let fragment = Html::parse_fragment(html);
548    let mut elements: Vec<ParsedElement> = Vec::new();
549
550    // Walk the DOM tree starting from the root
551    let root = fragment.root_element();
552
553    #[derive(Clone, Default)]
554    struct FmtState {
555        bold: bool,
556        italic: bool,
557        underline: bool,
558        strikeout: bool,
559        code: bool,
560        link_href: Option<String>,
561    }
562
563    const MAX_RECURSION_DEPTH: usize = 256;
564
565    /// Collect inline spans from a `<td>` or `<th>` cell element.
566    fn collect_cell_spans(
567        node: ego_tree::NodeRef<Node>,
568        state: &FmtState,
569        spans: &mut Vec<ParsedSpan>,
570        depth: usize,
571    ) {
572        if depth > MAX_RECURSION_DEPTH {
573            return;
574        }
575        for child in node.children() {
576            match child.value() {
577                Node::Text(text) => {
578                    let t = text.text.to_string();
579                    if !t.is_empty() {
580                        spans.push(ParsedSpan {
581                            text: t,
582                            bold: state.bold,
583                            italic: state.italic,
584                            underline: state.underline,
585                            strikeout: state.strikeout,
586                            code: state.code,
587                            link_href: state.link_href.clone(),
588                        });
589                    }
590                }
591                Node::Element(el) => {
592                    let tag = el.name();
593                    let mut new_state = state.clone();
594                    match tag {
595                        "b" | "strong" => new_state.bold = true,
596                        "i" | "em" => new_state.italic = true,
597                        "u" | "ins" => new_state.underline = true,
598                        "s" | "del" | "strike" => new_state.strikeout = true,
599                        "code" => new_state.code = true,
600                        "a" => {
601                            if let Some(href) = el.attr("href") {
602                                new_state.link_href = Some(href.to_string());
603                            }
604                        }
605                        _ => {}
606                    }
607                    collect_cell_spans(child, &new_state, spans, depth + 1);
608                }
609                _ => {}
610            }
611        }
612    }
613
614    /// Parse a `<table>` element into a ParsedTable.
615    fn parse_table_element(table_node: ego_tree::NodeRef<Node>) -> ParsedTable {
616        let mut rows: Vec<Vec<ParsedTableCell>> = Vec::new();
617        let mut header_rows: usize = 0;
618
619        fn collect_rows(
620            node: ego_tree::NodeRef<Node>,
621            rows: &mut Vec<Vec<ParsedTableCell>>,
622            header_rows: &mut usize,
623            in_thead: bool,
624        ) {
625            for child in node.children() {
626                if let Node::Element(el) = child.value() {
627                    match el.name() {
628                        "thead" => collect_rows(child, rows, header_rows, true),
629                        "tbody" | "tfoot" => collect_rows(child, rows, header_rows, false),
630                        "tr" => {
631                            let mut cells: Vec<ParsedTableCell> = Vec::new();
632                            for td in child.children() {
633                                if let Node::Element(td_el) = td.value()
634                                    && matches!(td_el.name(), "td" | "th")
635                                {
636                                    let mut spans = Vec::new();
637                                    let state = FmtState::default();
638                                    collect_cell_spans(td, &state, &mut spans, 0);
639                                    if spans.is_empty() {
640                                        spans.push(ParsedSpan::default());
641                                    }
642                                    cells.push(ParsedTableCell { spans });
643                                }
644                            }
645                            if !cells.is_empty() {
646                                rows.push(cells);
647                                if in_thead {
648                                    *header_rows += 1;
649                                }
650                            }
651                        }
652                        _ => {}
653                    }
654                }
655            }
656        }
657
658        collect_rows(table_node, &mut rows, &mut header_rows, false);
659
660        // Tables without explicit <thead> but with <th> cells: treat first row as header
661        if header_rows == 0 && !rows.is_empty() {
662            header_rows = 1;
663        }
664
665        ParsedTable { header_rows, rows }
666    }
667
668    fn walk_node(
669        node: ego_tree::NodeRef<Node>,
670        state: &FmtState,
671        elements: &mut Vec<ParsedElement>,
672        current_list_style: &Option<ListStyle>,
673        blockquote_depth: u32,
674        list_depth: u32,
675        depth: usize,
676    ) {
677        if depth > MAX_RECURSION_DEPTH {
678            return;
679        }
680        match node.value() {
681            Node::Element(el) => {
682                let tag = el.name();
683                let mut new_state = state.clone();
684                let mut new_list_style = current_list_style.clone();
685                let mut bq_depth = blockquote_depth;
686                let mut new_list_depth = list_depth;
687
688                // Determine if this is a block-level element
689                let is_block_tag = matches!(
690                    tag,
691                    "p" | "div"
692                        | "h1"
693                        | "h2"
694                        | "h3"
695                        | "h4"
696                        | "h5"
697                        | "h6"
698                        | "li"
699                        | "pre"
700                        | "br"
701                        | "blockquote"
702                        | "body"
703                        | "html"
704                );
705
706                // Update formatting state
707                match tag {
708                    "b" | "strong" => new_state.bold = true,
709                    "i" | "em" => new_state.italic = true,
710                    "u" | "ins" => new_state.underline = true,
711                    "s" | "del" | "strike" => new_state.strikeout = true,
712                    "code" => new_state.code = true,
713                    "a" => {
714                        if let Some(href) = el.attr("href") {
715                            new_state.link_href = Some(href.to_string());
716                        }
717                    }
718                    "ul" => {
719                        new_list_style = Some(ListStyle::Disc);
720                        new_list_depth = list_depth + 1;
721                    }
722                    "ol" => {
723                        new_list_style = Some(ListStyle::Decimal);
724                        new_list_depth = list_depth + 1;
725                    }
726                    "blockquote" => {
727                        bq_depth += 1;
728                    }
729                    _ => {}
730                }
731
732                // Determine heading level
733                let heading_level = match tag {
734                    "h1" => Some(1),
735                    "h2" => Some(2),
736                    "h3" => Some(3),
737                    "h4" => Some(4),
738                    "h5" => Some(5),
739                    "h6" => Some(6),
740                    _ => None,
741                };
742
743                let is_code_block = tag == "pre";
744
745                // Extract code language from <pre><code class="language-xxx">
746                let code_language = if is_code_block {
747                    node.children().find_map(|child| {
748                        if let Node::Element(cel) = child.value()
749                            && cel.name() == "code"
750                            && let Some(cls) = cel.attr("class")
751                        {
752                            return cls
753                                .split_whitespace()
754                                .find_map(|c| c.strip_prefix("language-"))
755                                .map(|l| l.to_string());
756                        }
757                        None
758                    })
759                } else {
760                    None
761                };
762
763                // Extract CSS styles from block-level elements
764                let css = if is_block_tag {
765                    el.attr("style").map(parse_block_styles).unwrap_or_default()
766                } else {
767                    BlockStyles::default()
768                };
769
770                if tag == "table" {
771                    // Parse table structure into a ParsedTable
772                    let parsed_table = parse_table_element(node);
773                    if !parsed_table.rows.is_empty() {
774                        elements.push(ParsedElement::Table(parsed_table));
775                    }
776                    return;
777                }
778
779                if tag == "br" {
780                    // <br> creates a new block
781                    elements.push(ParsedElement::Block(ParsedBlock {
782                        spans: vec![ParsedSpan {
783                            text: String::new(),
784                            ..Default::default()
785                        }],
786                        heading_level: None,
787                        list_style: None,
788                        list_indent: 0,
789                        is_code_block: false,
790                        code_language: None,
791                        blockquote_depth: bq_depth,
792                        line_height: None,
793                        non_breakable_lines: None,
794                        direction: None,
795                        background_color: None,
796                    }));
797                    return;
798                }
799
800                if tag == "blockquote" {
801                    // Blockquote is a container — recurse into children with increased depth
802                    for child in node.children() {
803                        walk_node(
804                            child,
805                            &new_state,
806                            elements,
807                            &new_list_style,
808                            bq_depth,
809                            new_list_depth,
810                            depth + 1,
811                        );
812                    }
813                } else if is_block_tag && tag != "br" {
814                    // Start collecting spans for a new block.
815                    // Use a temporary buffer so that nested block-level
816                    // elements (e.g. sub-lists inside <li>) are collected
817                    // separately and appended *after* the parent block.
818                    let mut spans: Vec<ParsedSpan> = Vec::new();
819                    let mut nested_elements: Vec<ParsedElement> = Vec::new();
820                    collect_inline_spans(
821                        node,
822                        &new_state,
823                        &mut spans,
824                        &new_list_style,
825                        &mut nested_elements,
826                        bq_depth,
827                        new_list_depth,
828                        depth + 1,
829                    );
830
831                    let list_style_for_block = if tag == "li" {
832                        new_list_style.clone()
833                    } else {
834                        None
835                    };
836
837                    let list_indent_for_block = if tag == "li" {
838                        new_list_depth.saturating_sub(1)
839                    } else {
840                        0
841                    };
842
843                    if !spans.is_empty() || heading_level.is_some() {
844                        elements.push(ParsedElement::Block(ParsedBlock {
845                            spans,
846                            heading_level,
847                            list_style: list_style_for_block,
848                            list_indent: list_indent_for_block,
849                            is_code_block,
850                            code_language,
851                            blockquote_depth: bq_depth,
852                            line_height: css.line_height,
853                            non_breakable_lines: css.non_breakable_lines,
854                            direction: css.direction,
855                            background_color: css.background_color,
856                        }));
857                    }
858                    // Append nested block elements after the parent block
859                    elements.append(&mut nested_elements);
860                } else if matches!(tag, "ul" | "ol" | "thead" | "tbody" | "tr") {
861                    // Container elements: recurse into children
862                    for child in node.children() {
863                        walk_node(
864                            child,
865                            &new_state,
866                            elements,
867                            &new_list_style,
868                            bq_depth,
869                            new_list_depth,
870                            depth + 1,
871                        );
872                    }
873                } else {
874                    // Inline element or unknown: recurse
875                    for child in node.children() {
876                        walk_node(
877                            child,
878                            &new_state,
879                            elements,
880                            current_list_style,
881                            bq_depth,
882                            list_depth,
883                            depth + 1,
884                        );
885                    }
886                }
887            }
888            Node::Text(text) => {
889                let t = text.text.to_string();
890                let trimmed = t.trim();
891                if !trimmed.is_empty() {
892                    // Bare text not in a block — create a paragraph
893                    elements.push(ParsedElement::Block(ParsedBlock {
894                        spans: vec![ParsedSpan {
895                            text: trimmed.to_string(),
896                            bold: state.bold,
897                            italic: state.italic,
898                            underline: state.underline,
899                            strikeout: state.strikeout,
900                            code: state.code,
901                            link_href: state.link_href.clone(),
902                        }],
903                        heading_level: None,
904                        list_style: None,
905                        list_indent: 0,
906                        is_code_block: false,
907                        code_language: None,
908                        blockquote_depth,
909                        line_height: None,
910                        non_breakable_lines: None,
911                        direction: None,
912                        background_color: None,
913                    }));
914                }
915            }
916            _ => {
917                // Document, Comment, etc. — recurse children
918                for child in node.children() {
919                    walk_node(
920                        child,
921                        state,
922                        elements,
923                        current_list_style,
924                        blockquote_depth,
925                        list_depth,
926                        depth + 1,
927                    );
928                }
929            }
930        }
931    }
932
933    /// Collect inline spans from a block-level element's children.
934    /// If a nested block-level element is encountered, it is flushed as a
935    /// separate block.
936    #[allow(clippy::too_many_arguments)]
937    fn collect_inline_spans(
938        node: ego_tree::NodeRef<Node>,
939        state: &FmtState,
940        spans: &mut Vec<ParsedSpan>,
941        current_list_style: &Option<ListStyle>,
942        elements: &mut Vec<ParsedElement>,
943        blockquote_depth: u32,
944        list_depth: u32,
945        depth: usize,
946    ) {
947        if depth > MAX_RECURSION_DEPTH {
948            return;
949        }
950        for child in node.children() {
951            match child.value() {
952                Node::Text(text) => {
953                    let t = text.text.to_string();
954                    if !t.is_empty() {
955                        spans.push(ParsedSpan {
956                            text: t,
957                            bold: state.bold,
958                            italic: state.italic,
959                            underline: state.underline,
960                            strikeout: state.strikeout,
961                            code: state.code,
962                            link_href: state.link_href.clone(),
963                        });
964                    }
965                }
966                Node::Element(el) => {
967                    let tag = el.name();
968                    let mut new_state = state.clone();
969
970                    match tag {
971                        "b" | "strong" => new_state.bold = true,
972                        "i" | "em" => new_state.italic = true,
973                        "u" | "ins" => new_state.underline = true,
974                        "s" | "del" | "strike" => new_state.strikeout = true,
975                        "code" => new_state.code = true,
976                        "a" => {
977                            if let Some(href) = el.attr("href") {
978                                new_state.link_href = Some(href.to_string());
979                            }
980                        }
981                        _ => {}
982                    }
983
984                    // Check for nested block elements
985                    let nested_block = matches!(
986                        tag,
987                        "p" | "div"
988                            | "h1"
989                            | "h2"
990                            | "h3"
991                            | "h4"
992                            | "h5"
993                            | "h6"
994                            | "li"
995                            | "pre"
996                            | "blockquote"
997                            | "ul"
998                            | "ol"
999                    );
1000
1001                    if tag == "br" {
1002                        // br within a block: treat as splitting into new block
1003                        // For simplicity, just add a newline to current span
1004                        spans.push(ParsedSpan {
1005                            text: String::new(),
1006                            ..Default::default()
1007                        });
1008                    } else if nested_block || tag == "table" {
1009                        // Flush as separate element
1010                        walk_node(
1011                            child,
1012                            &new_state,
1013                            elements,
1014                            current_list_style,
1015                            blockquote_depth,
1016                            list_depth,
1017                            depth + 1,
1018                        );
1019                    } else {
1020                        // Inline element: recurse
1021                        collect_inline_spans(
1022                            child,
1023                            &new_state,
1024                            spans,
1025                            current_list_style,
1026                            elements,
1027                            blockquote_depth,
1028                            list_depth,
1029                            depth + 1,
1030                        );
1031                    }
1032                }
1033                _ => {}
1034            }
1035        }
1036    }
1037
1038    let initial_state = FmtState::default();
1039    // Treat the root element as a block-level container so that
1040    // top-level inline elements (e.g. `<b>Bold</b> <em>Italic</em>`)
1041    // are grouped into a single block instead of becoming separate blocks.
1042    let mut root_spans: Vec<ParsedSpan> = Vec::new();
1043    collect_inline_spans(
1044        *root,
1045        &initial_state,
1046        &mut root_spans,
1047        &None,
1048        &mut elements,
1049        0,
1050        0,
1051        0,
1052    );
1053    if !root_spans.is_empty() {
1054        elements.push(ParsedElement::Block(ParsedBlock {
1055            spans: root_spans,
1056            heading_level: None,
1057            list_style: None,
1058            list_indent: 0,
1059            is_code_block: false,
1060            code_language: None,
1061            blockquote_depth: 0,
1062            line_height: None,
1063            non_breakable_lines: None,
1064            direction: None,
1065            background_color: None,
1066        }));
1067    }
1068
1069    // If no elements were parsed, create a single empty paragraph
1070    if elements.is_empty() {
1071        elements.push(ParsedElement::Block(ParsedBlock {
1072            spans: vec![ParsedSpan {
1073                text: String::new(),
1074                ..Default::default()
1075            }],
1076            heading_level: None,
1077            list_style: None,
1078            list_indent: 0,
1079            is_code_block: false,
1080            code_language: None,
1081            blockquote_depth: 0,
1082            line_height: None,
1083            non_breakable_lines: None,
1084            direction: None,
1085            background_color: None,
1086        }));
1087    }
1088
1089    elements
1090}
1091
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse markdown and flatten the result to plain blocks, for tests that
    /// do not need table structure.
    fn parse_markdown_blocks(md: &str) -> Vec<ParsedBlock> {
        let elements = parse_markdown(md);
        ParsedElement::flatten_to_blocks(elements)
    }

    /// Unwrap a parsed element as a table, panicking on any other variant.
    fn expect_table(element: &ParsedElement) -> &ParsedTable {
        let ParsedElement::Table(table) = element else {
            panic!("Expected ParsedElement::Table");
        };
        table
    }

    /// Plain text followed by bold text yields one block with distinct spans.
    #[test]
    fn test_parse_markdown_simple_paragraph() {
        let blocks = parse_markdown_blocks("Hello **world**");
        assert_eq!(blocks.len(), 1);

        let spans = &blocks[0].spans;
        assert!(spans.len() >= 2);

        // The "Hello " run is unformatted, the "world" run is bold.
        let plain = spans
            .iter()
            .find(|span| span.text.contains("Hello"))
            .unwrap();
        assert!(!plain.bold);
        let bold = spans.iter().find(|span| span.text == "world").unwrap();
        assert!(bold.bold);
    }

    /// A single `#` produces a level-1 heading block.
    #[test]
    fn test_parse_markdown_heading() {
        let blocks = parse_markdown_blocks("# Title");
        assert_eq!(blocks.len(), 1);

        let heading = &blocks[0];
        assert_eq!(heading.heading_level, Some(1));
        assert_eq!(heading.spans[0].text, "Title");
    }

    /// Dash-prefixed items are parsed as disc list entries.
    #[test]
    fn test_parse_markdown_list() {
        let blocks = parse_markdown_blocks("- item1\n- item2");
        assert!(blocks.len() >= 2);
        for item in blocks.iter().take(2) {
            assert_eq!(item.list_style, Some(ListStyle::Disc));
        }
    }

    /// `<b>` inside a paragraph marks only its own text as bold.
    #[test]
    fn test_parse_html_simple() {
        let blocks = parse_html("<p>Hello <b>world</b></p>");
        assert_eq!(blocks.len(), 1);

        let spans = &blocks[0].spans;
        assert!(spans.len() >= 2);
        let bold = spans.iter().find(|span| span.text == "world").unwrap();
        assert!(bold.bold);
    }

    /// Sibling paragraphs become separate blocks.
    #[test]
    fn test_parse_html_multiple_paragraphs() {
        let block_count = parse_html("<p>A</p><p>B</p>").len();
        assert_eq!(block_count, 2);
    }

    /// `<h2>` maps to heading level 2.
    #[test]
    fn test_parse_html_heading() {
        let blocks = parse_html("<h2>Subtitle</h2>");
        assert_eq!(blocks.len(), 1);
        let heading = &blocks[0];
        assert_eq!(heading.heading_level, Some(2));
    }

    /// Items of a `<ul>` carry the disc list style.
    #[test]
    fn test_parse_html_list() {
        let blocks = parse_html("<ul><li>one</li><li>two</li></ul>");
        assert!(blocks.len() >= 2);
        let first_item = &blocks[0];
        assert_eq!(first_item.list_style, Some(ListStyle::Disc));
    }

    /// A fenced code block becomes one code block without a trailing newline.
    #[test]
    fn test_parse_markdown_code_block() {
        let blocks = parse_markdown_blocks("```\nfn main() {}\n```");
        assert_eq!(blocks.len(), 1);

        let code = &blocks[0];
        assert!(code.is_code_block);
        assert!(code.spans[0].code);

        // pulldown-cmark appends a trailing \n to code block text — verify it's stripped
        let mut text = String::new();
        for span in &code.spans {
            text.push_str(&span.text);
        }
        assert_eq!(
            text, "fn main() {}",
            "code block text should not have trailing newline"
        );
    }

    /// Triple asterisks set both bold and italic on the same span.
    #[test]
    fn test_parse_markdown_nested_formatting() {
        let blocks = parse_markdown_blocks("***bold italic***");
        assert_eq!(blocks.len(), 1);

        let first = &blocks[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    /// An inline link keeps its label as text and its target as `link_href`.
    #[test]
    fn test_parse_markdown_link() {
        let blocks = parse_markdown_blocks("[click](http://example.com)");
        assert_eq!(blocks.len(), 1);

        let link = &blocks[0].spans[0];
        assert_eq!(link.text, "click");
        assert_eq!(link.link_href.as_deref(), Some("http://example.com"));
    }

    /// Empty markdown still yields one block holding an empty span.
    #[test]
    fn test_parse_markdown_empty() {
        let blocks = parse_markdown_blocks("");
        assert_eq!(blocks.len(), 1);
        let placeholder = &blocks[0].spans[0];
        assert!(placeholder.text.is_empty());
    }

    /// Empty HTML still yields one block holding an empty span.
    #[test]
    fn test_parse_html_empty() {
        let blocks = parse_html("");
        assert_eq!(blocks.len(), 1);
        let placeholder = &blocks[0].spans[0];
        assert!(placeholder.text.is_empty());
    }

    /// Nested `<b><i>` sets both flags on the enclosed text.
    #[test]
    fn test_parse_html_nested_formatting() {
        let blocks = parse_html("<p><b><i>bold italic</i></b></p>");
        assert_eq!(blocks.len(), 1);

        let first = &blocks[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    /// `<a href>` keeps its text and records the target.
    #[test]
    fn test_parse_html_link() {
        let blocks = parse_html("<p><a href=\"http://example.com\">click</a></p>");
        assert_eq!(blocks.len(), 1);

        let link = &blocks[0].spans[0];
        assert_eq!(link.text, "click");
        assert_eq!(link.link_href.as_deref(), Some("http://example.com"));
    }

    /// Items of an `<ol>` carry the decimal list style.
    #[test]
    fn test_parse_html_ordered_list() {
        let blocks = parse_html("<ol><li>first</li><li>second</li></ol>");
        assert!(blocks.len() >= 2);
        let first_item = &blocks[0];
        assert_eq!(first_item.list_style, Some(ListStyle::Decimal));
    }

    /// Numbered markdown items carry the decimal list style.
    #[test]
    fn test_parse_markdown_ordered_list() {
        let blocks = parse_markdown_blocks("1. first\n2. second");
        assert!(blocks.len() >= 2);
        let first_item = &blocks[0];
        assert_eq!(first_item.list_style, Some(ListStyle::Decimal));
    }

    /// A blockquote between two paragraphs contributes at least one block.
    #[test]
    fn test_parse_html_blockquote_nested() {
        let blocks = parse_html("<p>before</p><blockquote>quoted</blockquote><p>after</p>");
        let block_count = blocks.len();
        assert!(block_count >= 3);
    }

    /// `line-height: 1.5` is reported as 1500.
    #[test]
    fn test_parse_block_styles_line_height() {
        let line_height = parse_block_styles("line-height: 1.5").line_height;
        assert_eq!(line_height, Some(1500));
    }

    /// `direction: rtl` maps to `TextDirection::RightToLeft`.
    #[test]
    fn test_parse_block_styles_direction_rtl() {
        let direction = parse_block_styles("direction: rtl").direction;
        assert_eq!(direction, Some(TextDirection::RightToLeft));
    }

    /// The background colour value is kept verbatim.
    #[test]
    fn test_parse_block_styles_background_color() {
        let styles = parse_block_styles("background-color: #ff0000");
        assert_eq!(styles.background_color.as_deref(), Some("#ff0000"));
    }

    /// `white-space: pre` turns on non-breakable lines.
    #[test]
    fn test_parse_block_styles_white_space_pre() {
        let non_breakable = parse_block_styles("white-space: pre").non_breakable_lines;
        assert_eq!(non_breakable, Some(true));
    }

    /// Several declarations in one style string are all picked up.
    #[test]
    fn test_parse_block_styles_multiple() {
        let styles = parse_block_styles("line-height: 2.0; direction: rtl; background-color: blue");
        assert_eq!(styles.line_height, Some(2000));
        assert_eq!(styles.direction, Some(TextDirection::RightToLeft));
        assert_eq!(styles.background_color.as_deref(), Some("blue"));
    }

    /// Inline `style` on a paragraph is propagated onto the parsed block.
    #[test]
    fn test_parse_html_block_styles_extracted() {
        let blocks = parse_html(
            r#"<p style="line-height: 1.5; direction: rtl; background-color: #ccc">text</p>"#,
        );
        assert_eq!(blocks.len(), 1);

        let styled = &blocks[0];
        assert_eq!(styled.line_height, Some(1500));
        assert_eq!(styled.direction, Some(TextDirection::RightToLeft));
        assert_eq!(styled.background_color.as_deref(), Some("#ccc"));
    }

    /// `white-space: pre` on a paragraph sets `non_breakable_lines`.
    #[test]
    fn test_parse_html_white_space_pre() {
        let blocks = parse_html(r#"<p style="white-space: pre">code</p>"#);
        assert_eq!(blocks.len(), 1);
        let styled = &blocks[0];
        assert_eq!(styled.non_breakable_lines, Some(true));
    }

    /// A paragraph without a `style` attribute leaves every style field unset.
    #[test]
    fn test_parse_html_no_styles_returns_none() {
        let blocks = parse_html("<p>plain</p>");
        assert_eq!(blocks.len(), 1);

        let plain = &blocks[0];
        assert!(plain.line_height.is_none());
        assert!(plain.direction.is_none());
        assert!(plain.background_color.is_none());
        assert!(plain.non_breakable_lines.is_none());
    }

    /// Each nesting level of a bullet list increases `list_indent` by one.
    #[test]
    fn test_parse_markdown_nested_list_indent() {
        let blocks = parse_markdown_blocks("- top\n  - nested\n    - deep");
        assert_eq!(blocks.len(), 3);

        for item in &blocks {
            assert_eq!(item.list_style, Some(ListStyle::Disc));
        }
        let indents: Vec<_> = blocks.iter().map(|item| item.list_indent).collect();
        assert_eq!(indents, [0, 1, 2]);
    }

    /// Sibling items of a nested ordered list share the same indent.
    #[test]
    fn test_parse_markdown_nested_ordered_list_indent() {
        let blocks = parse_markdown_blocks("1. first\n   1. nested\n   2. nested2");
        assert_eq!(blocks.len(), 3);

        let indents: Vec<_> = blocks.iter().map(|item| item.list_indent).collect();
        assert_eq!(indents, [0, 1, 1]);
    }

    /// A `<ul>` placed directly inside another `<ul>` indents its items.
    #[test]
    fn test_parse_html_nested_list_indent() {
        let blocks = parse_html("<ul><li>top</li><ul><li>nested</li></ul></ul>");
        assert!(blocks.len() >= 2);

        let outer_item = &blocks[0];
        let inner_item = &blocks[1];
        assert_eq!(outer_item.list_indent, 0);
        assert_eq!(inner_item.list_indent, 1);
    }

    /// A pipe table parses into a single table element with header and body.
    #[test]
    fn test_parse_markdown_table() {
        let elements = parse_markdown("| A | B |\n|---|---|\n| 1 | 2 |");
        assert_eq!(elements.len(), 1);

        let table = expect_table(&elements[0]);
        assert_eq!(table.header_rows, 1);
        assert_eq!(table.rows.len(), 2); // 1 header + 1 body

        // First entry is the header row, second the body row.
        let expected_cells = [["A", "B"], ["1", "2"]];
        for (row, expected_row) in table.rows.iter().zip(expected_cells) {
            assert_eq!(row.len(), 2);
            for (cell, expected_text) in row.iter().zip(expected_row) {
                assert_eq!(cell.spans[0].text, expected_text);
            }
        }
    }

    /// Inline formatting inside table cells is preserved per cell.
    #[test]
    fn test_parse_markdown_table_with_formatting() {
        let md = "| **bold** | `code` | *italic* |\n|---|---|---|\n| ~~strike~~ | plain | [link](http://x.com) |";
        let elements = parse_markdown(md);
        assert_eq!(elements.len(), 1);

        let table = expect_table(&elements[0]);
        assert_eq!(table.rows.len(), 2);

        // Header cells: bold, code, italic.
        let header = &table.rows[0];
        assert!(header[0].spans[0].bold);
        assert!(header[1].spans[0].code);
        assert!(header[2].spans[0].italic);

        // Body cells: strikeout in the first, link in the last.
        let body = &table.rows[1];
        assert!(body[0].spans[0].strikeout);
        assert_eq!(body[2].spans[0].link_href.as_deref(), Some("http://x.com"));
    }

    /// Paragraphs around a table keep their order relative to it.
    #[test]
    fn test_parse_markdown_mixed_content_with_table() {
        let elements = parse_markdown("Before\n\n| A | B |\n|---|---|\n| 1 | 2 |\n\nAfter");
        assert_eq!(elements.len(), 3);
        assert!(matches!(
            elements.as_slice(),
            [
                ParsedElement::Block(_),
                ParsedElement::Table(_),
                ParsedElement::Block(_)
            ]
        ));
    }
}