Skip to main content

text_document_common/parser_tools/
content_parser.rs

1use crate::entities::ListStyle;
2
/// A run of inline text together with the character formatting that applies to it.
///
/// Spans are produced by [`parse_markdown`] and [`parse_html`] and grouped into
/// [`ParsedBlock`]s. The `Default` value is an empty, unformatted span.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ParsedSpan {
    /// The text content of the span (may be empty).
    pub text: String,
    /// Bold / strong emphasis.
    pub bold: bool,
    /// Italic / emphasis.
    pub italic: bool,
    /// Underline. The Markdown parser never sets this; the HTML parser sets it
    /// for `<u>` and `<ins>`.
    pub underline: bool,
    /// Strike-through.
    pub strikeout: bool,
    /// Monospace / code formatting (inline code or text inside a code block).
    pub code: bool,
    /// Link target when the span is part of a hyperlink.
    pub link_href: Option<String>,
}
14
15/// A parsed block (paragraph, heading, list item, code block)
16#[derive(Debug, Clone)]
17pub struct ParsedBlock {
18    pub spans: Vec<ParsedSpan>,
19    pub heading_level: Option<i64>,
20    pub list_style: Option<ListStyle>,
21    pub is_code_block: bool,
22}
23
24// ─── Markdown parsing ────────────────────────────────────────────────
25
26pub fn parse_markdown(markdown: &str) -> Vec<ParsedBlock> {
27    use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
28
29    let options =
30        Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_TASKLISTS;
31    let parser = Parser::new_ext(markdown, options);
32
33    let mut blocks: Vec<ParsedBlock> = Vec::new();
34    let mut current_spans: Vec<ParsedSpan> = Vec::new();
35    let mut current_heading: Option<i64> = None;
36    let mut current_list_style: Option<ListStyle> = None;
37    let mut is_code_block = false;
38    let mut in_block = false;
39
40    // Formatting state stack
41    let mut bold = false;
42    let mut italic = false;
43    let mut strikeout = false;
44    let mut link_href: Option<String> = None;
45
46    // List style stack for nested lists
47    let mut list_stack: Vec<Option<ListStyle>> = Vec::new();
48
49    for event in parser {
50        match event {
51            Event::Start(Tag::Paragraph) => {
52                in_block = true;
53                current_heading = None;
54                is_code_block = false;
55            }
56            Event::End(TagEnd::Paragraph) => {
57                if !current_spans.is_empty() || in_block {
58                    blocks.push(ParsedBlock {
59                        spans: std::mem::take(&mut current_spans),
60                        heading_level: current_heading.take(),
61                        list_style: current_list_style.clone(),
62                        is_code_block: false,
63                    });
64                }
65                in_block = false;
66                current_list_style = None;
67            }
68            Event::Start(Tag::Heading { level, .. }) => {
69                in_block = true;
70                current_heading = Some(heading_level_to_i64(level));
71                is_code_block = false;
72            }
73            Event::End(TagEnd::Heading(_)) => {
74                blocks.push(ParsedBlock {
75                    spans: std::mem::take(&mut current_spans),
76                    heading_level: current_heading.take(),
77                    list_style: None,
78                    is_code_block: false,
79                });
80                in_block = false;
81            }
82            Event::Start(Tag::List(ordered)) => {
83                let style = if ordered.is_some() {
84                    Some(ListStyle::Decimal)
85                } else {
86                    Some(ListStyle::Disc)
87                };
88                list_stack.push(style);
89            }
90            Event::End(TagEnd::List(_)) => {
91                list_stack.pop();
92            }
93            Event::Start(Tag::Item) => {
94                in_block = true;
95                current_list_style = list_stack.last().cloned().flatten();
96            }
97            Event::End(TagEnd::Item) => {
98                // The paragraph inside the item will have already been flushed,
99                // but if there was no inner paragraph (tight list), flush now.
100                if !current_spans.is_empty() {
101                    blocks.push(ParsedBlock {
102                        spans: std::mem::take(&mut current_spans),
103                        heading_level: None,
104                        list_style: current_list_style.clone(),
105                        is_code_block: false,
106                    });
107                }
108                in_block = false;
109                current_list_style = None;
110            }
111            Event::Start(Tag::CodeBlock(_)) => {
112                in_block = true;
113                is_code_block = true;
114            }
115            Event::End(TagEnd::CodeBlock) => {
116                blocks.push(ParsedBlock {
117                    spans: std::mem::take(&mut current_spans),
118                    heading_level: None,
119                    list_style: None,
120                    is_code_block: true,
121                });
122                in_block = false;
123                is_code_block = false;
124            }
125            Event::Start(Tag::Emphasis) => {
126                italic = true;
127            }
128            Event::End(TagEnd::Emphasis) => {
129                italic = false;
130            }
131            Event::Start(Tag::Strong) => {
132                bold = true;
133            }
134            Event::End(TagEnd::Strong) => {
135                bold = false;
136            }
137            Event::Start(Tag::Strikethrough) => {
138                strikeout = true;
139            }
140            Event::End(TagEnd::Strikethrough) => {
141                strikeout = false;
142            }
143            Event::Start(Tag::Link { dest_url, .. }) => {
144                link_href = Some(dest_url.to_string());
145            }
146            Event::End(TagEnd::Link) => {
147                link_href = None;
148            }
149            Event::Text(text) => {
150                if !in_block {
151                    // Bare text outside any block — create an implicit paragraph
152                    in_block = true;
153                }
154                current_spans.push(ParsedSpan {
155                    text: text.to_string(),
156                    bold,
157                    italic,
158                    underline: false,
159                    strikeout,
160                    code: is_code_block,
161                    link_href: link_href.clone(),
162                });
163            }
164            Event::Code(text) => {
165                if !in_block {
166                    in_block = true;
167                }
168                current_spans.push(ParsedSpan {
169                    text: text.to_string(),
170                    bold,
171                    italic,
172                    underline: false,
173                    strikeout,
174                    code: true,
175                    link_href: link_href.clone(),
176                });
177            }
178            Event::SoftBreak => {
179                // Add a space
180                current_spans.push(ParsedSpan {
181                    text: " ".to_string(),
182                    bold,
183                    italic,
184                    underline: false,
185                    strikeout,
186                    code: false,
187                    link_href: link_href.clone(),
188                });
189            }
190            Event::HardBreak => {
191                // Finalize current block
192                if !current_spans.is_empty() || in_block {
193                    blocks.push(ParsedBlock {
194                        spans: std::mem::take(&mut current_spans),
195                        heading_level: current_heading.take(),
196                        list_style: current_list_style.clone(),
197                        is_code_block,
198                    });
199                }
200            }
201            _ => {}
202        }
203    }
204
205    // Flush any remaining content
206    if !current_spans.is_empty() {
207        blocks.push(ParsedBlock {
208            spans: std::mem::take(&mut current_spans),
209            heading_level: current_heading,
210            list_style: current_list_style,
211            is_code_block,
212        });
213    }
214
215    // If no blocks were parsed, create a single empty paragraph
216    if blocks.is_empty() {
217        blocks.push(ParsedBlock {
218            spans: vec![ParsedSpan {
219                text: String::new(),
220                ..Default::default()
221            }],
222            heading_level: None,
223            list_style: None,
224            is_code_block: false,
225        });
226    }
227
228    blocks
229}
230
231fn heading_level_to_i64(level: pulldown_cmark::HeadingLevel) -> i64 {
232    use pulldown_cmark::HeadingLevel;
233    match level {
234        HeadingLevel::H1 => 1,
235        HeadingLevel::H2 => 2,
236        HeadingLevel::H3 => 3,
237        HeadingLevel::H4 => 4,
238        HeadingLevel::H5 => 5,
239        HeadingLevel::H6 => 6,
240    }
241}
242
243// ─── HTML parsing ────────────────────────────────────────────────────
244
245use scraper::Node;
246
247pub fn parse_html(html: &str) -> Vec<ParsedBlock> {
248    use scraper::Html;
249
250    let fragment = Html::parse_fragment(html);
251    let mut blocks: Vec<ParsedBlock> = Vec::new();
252
253    // Walk the DOM tree starting from the root
254    let root = fragment.root_element();
255
256    #[derive(Clone, Default)]
257    struct FmtState {
258        bold: bool,
259        italic: bool,
260        underline: bool,
261        strikeout: bool,
262        code: bool,
263        link_href: Option<String>,
264    }
265
266    const MAX_RECURSION_DEPTH: usize = 256;
267
268    fn walk_node(
269        node: ego_tree::NodeRef<Node>,
270        state: &FmtState,
271        blocks: &mut Vec<ParsedBlock>,
272        current_list_style: &Option<ListStyle>,
273        depth: usize,
274    ) {
275        if depth > MAX_RECURSION_DEPTH {
276            return;
277        }
278        match node.value() {
279            Node::Element(el) => {
280                let tag = el.name();
281                let mut new_state = state.clone();
282                let mut new_list_style = current_list_style.clone();
283
284                // Determine if this is a block-level element
285                let is_block_tag = matches!(
286                    tag,
287                    "p" | "div"
288                        | "h1"
289                        | "h2"
290                        | "h3"
291                        | "h4"
292                        | "h5"
293                        | "h6"
294                        | "li"
295                        | "pre"
296                        | "br"
297                        | "blockquote"
298                );
299
300                // Update formatting state
301                match tag {
302                    "b" | "strong" => new_state.bold = true,
303                    "i" | "em" => new_state.italic = true,
304                    "u" | "ins" => new_state.underline = true,
305                    "s" | "del" | "strike" => new_state.strikeout = true,
306                    "code" => new_state.code = true,
307                    "a" => {
308                        if let Some(href) = el.attr("href") {
309                            new_state.link_href = Some(href.to_string());
310                        }
311                    }
312                    "ul" => {
313                        new_list_style = Some(ListStyle::Disc);
314                    }
315                    "ol" => {
316                        new_list_style = Some(ListStyle::Decimal);
317                    }
318                    _ => {}
319                }
320
321                // Determine heading level
322                let heading_level = match tag {
323                    "h1" => Some(1),
324                    "h2" => Some(2),
325                    "h3" => Some(3),
326                    "h4" => Some(4),
327                    "h5" => Some(5),
328                    "h6" => Some(6),
329                    _ => None,
330                };
331
332                let is_code_block = tag == "pre";
333
334                if tag == "br" {
335                    // <br> creates a new block
336                    blocks.push(ParsedBlock {
337                        spans: vec![ParsedSpan {
338                            text: String::new(),
339                            ..Default::default()
340                        }],
341                        heading_level: None,
342                        list_style: None,
343                        is_code_block: false,
344                    });
345                    return;
346                }
347
348                if is_block_tag && tag != "br" {
349                    // Start collecting spans for a new block
350                    let mut spans: Vec<ParsedSpan> = Vec::new();
351                    collect_inline_spans(node, &new_state, &mut spans, &new_list_style, blocks, depth + 1);
352
353                    let list_style_for_block = if tag == "li" {
354                        new_list_style.clone()
355                    } else {
356                        None
357                    };
358
359                    if !spans.is_empty() || heading_level.is_some() {
360                        blocks.push(ParsedBlock {
361                            spans,
362                            heading_level,
363                            list_style: list_style_for_block,
364                            is_code_block,
365                        });
366                    }
367                } else if matches!(tag, "ul" | "ol" | "table" | "thead" | "tbody" | "tr") {
368                    // Container elements: recurse into children
369                    for child in node.children() {
370                        walk_node(child, &new_state, blocks, &new_list_style, depth + 1);
371                    }
372                } else {
373                    // Inline element or unknown: recurse
374                    for child in node.children() {
375                        walk_node(child, &new_state, blocks, current_list_style, depth + 1);
376                    }
377                }
378            }
379            Node::Text(text) => {
380                let t = text.text.to_string();
381                let trimmed = t.trim();
382                if !trimmed.is_empty() {
383                    // Bare text not in a block — create a paragraph
384                    blocks.push(ParsedBlock {
385                        spans: vec![ParsedSpan {
386                            text: trimmed.to_string(),
387                            bold: state.bold,
388                            italic: state.italic,
389                            underline: state.underline,
390                            strikeout: state.strikeout,
391                            code: state.code,
392                            link_href: state.link_href.clone(),
393                        }],
394                        heading_level: None,
395                        list_style: None,
396                        is_code_block: false,
397                    });
398                }
399            }
400            _ => {
401                // Document, Comment, etc. — recurse children
402                for child in node.children() {
403                    walk_node(child, state, blocks, current_list_style, depth + 1);
404                }
405            }
406        }
407    }
408
409    /// Collect inline spans from a block-level element's children.
410    /// If a nested block-level element is encountered, it is flushed as a
411    /// separate block.
412    fn collect_inline_spans(
413        node: ego_tree::NodeRef<Node>,
414        state: &FmtState,
415        spans: &mut Vec<ParsedSpan>,
416        current_list_style: &Option<ListStyle>,
417        blocks: &mut Vec<ParsedBlock>,
418        depth: usize,
419    ) {
420        if depth > MAX_RECURSION_DEPTH {
421            return;
422        }
423        for child in node.children() {
424            match child.value() {
425                Node::Text(text) => {
426                    let t = text.text.to_string();
427                    if !t.is_empty() {
428                        spans.push(ParsedSpan {
429                            text: t,
430                            bold: state.bold,
431                            italic: state.italic,
432                            underline: state.underline,
433                            strikeout: state.strikeout,
434                            code: state.code,
435                            link_href: state.link_href.clone(),
436                        });
437                    }
438                }
439                Node::Element(el) => {
440                    let tag = el.name();
441                    let mut new_state = state.clone();
442
443                    match tag {
444                        "b" | "strong" => new_state.bold = true,
445                        "i" | "em" => new_state.italic = true,
446                        "u" | "ins" => new_state.underline = true,
447                        "s" | "del" | "strike" => new_state.strikeout = true,
448                        "code" => new_state.code = true,
449                        "a" => {
450                            if let Some(href) = el.attr("href") {
451                                new_state.link_href = Some(href.to_string());
452                            }
453                        }
454                        _ => {}
455                    }
456
457                    // Check for nested block elements
458                    let nested_block = matches!(
459                        tag,
460                        "p" | "div"
461                            | "h1"
462                            | "h2"
463                            | "h3"
464                            | "h4"
465                            | "h5"
466                            | "h6"
467                            | "li"
468                            | "pre"
469                            | "blockquote"
470                            | "ul"
471                            | "ol"
472                    );
473
474                    if tag == "br" {
475                        // br within a block: treat as splitting into new block
476                        // For simplicity, just add a newline to current span
477                        spans.push(ParsedSpan {
478                            text: String::new(),
479                            ..Default::default()
480                        });
481                    } else if nested_block {
482                        // Flush as separate block
483                        walk_node(child, &new_state, blocks, current_list_style, depth + 1);
484                    } else {
485                        // Inline element: recurse
486                        collect_inline_spans(child, &new_state, spans, current_list_style, blocks, depth + 1);
487                    }
488                }
489                _ => {}
490            }
491        }
492    }
493
494    let initial_state = FmtState::default();
495    for child in root.children() {
496        walk_node(child, &initial_state, &mut blocks, &None, 0);
497    }
498
499    // If no blocks were parsed, create a single empty paragraph
500    if blocks.is_empty() {
501        blocks.push(ParsedBlock {
502            spans: vec![ParsedSpan {
503                text: String::new(),
504                ..Default::default()
505            }],
506            heading_level: None,
507            list_style: None,
508            is_code_block: false,
509        });
510    }
511
512    blocks
513}
514
#[cfg(test)]
mod tests {
    use super::*;

    // ── Markdown ─────────────────────────────────────────────────────

    /// Plain and bold text in one paragraph end up as separate spans of one block.
    #[test]
    fn test_parse_markdown_simple_paragraph() {
        let parsed = parse_markdown("Hello **world**");
        assert_eq!(parsed.len(), 1);

        let spans = &parsed[0].spans;
        assert!(spans.len() >= 2);

        // "Hello " is plain, "world" is bold
        let plain = spans
            .iter()
            .find(|span| span.text.contains("Hello"))
            .unwrap();
        assert!(!plain.bold);

        let strong = spans.iter().find(|span| span.text == "world").unwrap();
        assert!(strong.bold);
    }

    /// An ATX heading yields a single block with its level and text.
    #[test]
    fn test_parse_markdown_heading() {
        let parsed = parse_markdown("# Title");
        assert_eq!(parsed.len(), 1);

        let heading = &parsed[0];
        assert_eq!(heading.heading_level, Some(1));
        assert_eq!(heading.spans[0].text, "Title");
    }

    /// Bullet items become blocks with the disc style.
    #[test]
    fn test_parse_markdown_list() {
        let parsed = parse_markdown("- item1\n- item2");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Disc));
        assert_eq!(parsed[1].list_style, Some(ListStyle::Disc));
    }

    /// Numbered items become blocks with the decimal style.
    #[test]
    fn test_parse_markdown_ordered_list() {
        let parsed = parse_markdown("1. first\n2. second");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Decimal));
    }

    /// A fenced code block is flagged on both the block and its span.
    #[test]
    fn test_parse_markdown_code_block() {
        let parsed = parse_markdown("```\nfn main() {}\n```");
        assert_eq!(parsed.len(), 1);

        let code = &parsed[0];
        assert!(code.is_code_block);
        assert!(code.spans[0].code);
    }

    /// Triple asterisks produce a span that is both bold and italic.
    #[test]
    fn test_parse_markdown_nested_formatting() {
        let parsed = parse_markdown("***bold italic***");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    /// Link text carries the destination URL.
    #[test]
    fn test_parse_markdown_link() {
        let parsed = parse_markdown("[click](http://example.com)");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href.as_deref(), Some("http://example.com"));
    }

    /// Empty input still yields one block with one empty span.
    #[test]
    fn test_parse_markdown_empty() {
        let parsed = parse_markdown("");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans[0].text.is_empty());
    }

    // ── HTML ─────────────────────────────────────────────────────────

    /// Inline bold inside a paragraph is kept in the same block.
    #[test]
    fn test_parse_html_simple() {
        let parsed = parse_html("<p>Hello <b>world</b></p>");
        assert_eq!(parsed.len(), 1);

        let spans = &parsed[0].spans;
        assert!(spans.len() >= 2);

        let strong = spans.iter().find(|span| span.text == "world").unwrap();
        assert!(strong.bold);
    }

    /// Sibling paragraphs become separate blocks.
    #[test]
    fn test_parse_html_multiple_paragraphs() {
        let parsed = parse_html("<p>A</p><p>B</p>");
        assert_eq!(parsed.len(), 2);
    }

    /// `<h2>` maps to heading level 2.
    #[test]
    fn test_parse_html_heading() {
        let parsed = parse_html("<h2>Subtitle</h2>");
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].heading_level, Some(2));
    }

    /// `<ul>` items get the disc style.
    #[test]
    fn test_parse_html_list() {
        let parsed = parse_html("<ul><li>one</li><li>two</li></ul>");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Disc));
    }

    /// `<ol>` items get the decimal style.
    #[test]
    fn test_parse_html_ordered_list() {
        let parsed = parse_html("<ol><li>first</li><li>second</li></ol>");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Decimal));
    }

    /// Empty input still yields one block with one empty span.
    #[test]
    fn test_parse_html_empty() {
        let parsed = parse_html("");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans[0].text.is_empty());
    }

    /// Nested `<b><i>` accumulates both flags on the span.
    #[test]
    fn test_parse_html_nested_formatting() {
        let parsed = parse_html("<p><b><i>bold italic</i></b></p>");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    /// Anchor text carries the `href` value.
    #[test]
    fn test_parse_html_link() {
        let parsed = parse_html("<p><a href=\"http://example.com\">click</a></p>");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href.as_deref(), Some("http://example.com"));
    }

    /// A blockquote between paragraphs contributes its own block.
    #[test]
    fn test_parse_html_blockquote_nested() {
        let parsed = parse_html("<p>before</p><blockquote>quoted</blockquote><p>after</p>");
        assert!(parsed.len() >= 3);
    }
}