Skip to main content

text_document_common/parser_tools/
content_parser.rs

1use crate::entities::{ListStyle, TextDirection};
2
/// A parsed inline span: a run of text together with the inline formatting
/// that applies to all of it.
///
/// Spans compare by value (`PartialEq`/`Eq`), which lets callers and tests
/// compare parser output directly instead of field by field.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ParsedSpan {
    /// The text content of the span (may be empty).
    pub text: String,
    /// Text is bold (`**x**`, `<b>`, `<strong>`).
    pub bold: bool,
    /// Text is italic (`*x*`, `<i>`, `<em>`).
    pub italic: bool,
    /// Text is underlined (`<u>`, `<ins>`); the Markdown parser never sets this.
    pub underline: bool,
    /// Text is struck through (`~~x~~`, `<s>`, `<del>`, `<strike>`).
    pub strikeout: bool,
    /// Text is code (inline code, or text inside a code block / `<code>`).
    pub code: bool,
    /// Link target when the span is part of a hyperlink.
    pub link_href: Option<String>,
}
14
/// A parsed block (paragraph, heading, list item, code block).
///
/// Produced by both `parse_markdown` and `parse_html`. The CSS-derived
/// fields (`line_height`, `non_breakable_lines`, `direction`,
/// `background_color`) are only ever filled in by the HTML parser; the
/// Markdown parser always leaves them as `None`.
#[derive(Debug, Clone)]
pub struct ParsedBlock {
    /// Inline content of the block, in document order.
    pub spans: Vec<ParsedSpan>,
    /// Heading level (1 to 6 as produced by the parsers in this file), or
    /// `None` when the block is not a heading.
    pub heading_level: Option<i64>,
    /// List marker style when the block is a list item.
    pub list_style: Option<ListStyle>,
    /// `true` when the block is a code block (fenced/indented Markdown code
    /// or an HTML `<pre>`).
    pub is_code_block: bool,
    /// Language of a code block, taken from the Markdown fence info string or
    /// from a `language-xxx` class on `<pre><code>`.
    pub code_language: Option<String>,
    /// Number of enclosing blockquotes; `0` means the block is not quoted.
    pub blockquote_depth: u32,
    /// Line height; the HTML parser stores the unitless CSS `line-height`
    /// multiplier scaled by 1000.
    pub line_height: Option<i64>,
    /// `Some(true)` when CSS `white-space` is `pre`, `nowrap` or `pre-wrap`.
    pub non_breakable_lines: Option<bool>,
    /// Text direction from the CSS `direction` property.
    pub direction: Option<TextDirection>,
    /// Raw CSS value of `background-color` / `background`.
    pub background_color: Option<String>,
}
29
30impl ParsedBlock {
31    /// Returns `true` when this block carries no block-level formatting,
32    /// meaning its content is purely inline.
33    pub fn is_inline_only(&self) -> bool {
34        self.heading_level.is_none()
35            && self.list_style.is_none()
36            && !self.is_code_block
37            && self.blockquote_depth == 0
38            && self.line_height.is_none()
39            && self.non_breakable_lines.is_none()
40            && self.direction.is_none()
41            && self.background_color.is_none()
42    }
43}
44
45// ─── Markdown parsing ────────────────────────────────────────────────
46
47pub fn parse_markdown(markdown: &str) -> Vec<ParsedBlock> {
48    use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
49
50    let options =
51        Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_TASKLISTS;
52    let parser = Parser::new_ext(markdown, options);
53
54    let mut blocks: Vec<ParsedBlock> = Vec::new();
55    let mut current_spans: Vec<ParsedSpan> = Vec::new();
56    let mut current_heading: Option<i64> = None;
57    let mut current_list_style: Option<ListStyle> = None;
58    let mut is_code_block = false;
59    let mut code_language: Option<String> = None;
60    let mut blockquote_depth: u32 = 0;
61    let mut in_block = false;
62
63    // Formatting state stack
64    let mut bold = false;
65    let mut italic = false;
66    let mut strikeout = false;
67    let mut link_href: Option<String> = None;
68
69    // List style stack for nested lists
70    let mut list_stack: Vec<Option<ListStyle>> = Vec::new();
71
72    for event in parser {
73        match event {
74            Event::Start(Tag::Paragraph) => {
75                in_block = true;
76                current_heading = None;
77                is_code_block = false;
78            }
79            Event::End(TagEnd::Paragraph) => {
80                if !current_spans.is_empty() || in_block {
81                    blocks.push(ParsedBlock {
82                        spans: std::mem::take(&mut current_spans),
83                        heading_level: current_heading.take(),
84                        list_style: current_list_style.clone(),
85                        is_code_block: false,
86                        code_language: None,
87                        blockquote_depth,
88                        line_height: None,
89                        non_breakable_lines: None,
90                        direction: None,
91                        background_color: None,
92                    });
93                }
94                in_block = false;
95                current_list_style = None;
96            }
97            Event::Start(Tag::Heading { level, .. }) => {
98                in_block = true;
99                current_heading = Some(heading_level_to_i64(level));
100                is_code_block = false;
101            }
102            Event::End(TagEnd::Heading(_)) => {
103                blocks.push(ParsedBlock {
104                    spans: std::mem::take(&mut current_spans),
105                    heading_level: current_heading.take(),
106                    list_style: None,
107                    is_code_block: false,
108                    code_language: None,
109                    blockquote_depth,
110                    line_height: None,
111                    non_breakable_lines: None,
112                    direction: None,
113                    background_color: None,
114                });
115                in_block = false;
116            }
117            Event::Start(Tag::List(ordered)) => {
118                let style = if ordered.is_some() {
119                    Some(ListStyle::Decimal)
120                } else {
121                    Some(ListStyle::Disc)
122                };
123                list_stack.push(style);
124            }
125            Event::End(TagEnd::List(_)) => {
126                list_stack.pop();
127            }
128            Event::Start(Tag::Item) => {
129                in_block = true;
130                current_list_style = list_stack.last().cloned().flatten();
131            }
132            Event::End(TagEnd::Item) => {
133                // The paragraph inside the item will have already been flushed,
134                // but if there was no inner paragraph (tight list), flush now.
135                if !current_spans.is_empty() {
136                    blocks.push(ParsedBlock {
137                        spans: std::mem::take(&mut current_spans),
138                        heading_level: None,
139                        list_style: current_list_style.clone(),
140                        is_code_block: false,
141                        code_language: None,
142                        blockquote_depth,
143                        line_height: None,
144                        non_breakable_lines: None,
145                        direction: None,
146                        background_color: None,
147                    });
148                }
149                in_block = false;
150                current_list_style = None;
151            }
152            Event::Start(Tag::CodeBlock(kind)) => {
153                in_block = true;
154                is_code_block = true;
155                code_language = match &kind {
156                    pulldown_cmark::CodeBlockKind::Fenced(lang) if !lang.is_empty() => {
157                        Some(lang.to_string())
158                    }
159                    _ => None,
160                };
161            }
162            Event::End(TagEnd::CodeBlock) => {
163                blocks.push(ParsedBlock {
164                    spans: std::mem::take(&mut current_spans),
165                    heading_level: None,
166                    list_style: None,
167                    is_code_block: true,
168                    code_language: code_language.take(),
169                    blockquote_depth,
170                    line_height: None,
171                    non_breakable_lines: None,
172                    direction: None,
173                    background_color: None,
174                });
175                in_block = false;
176                is_code_block = false;
177            }
178            Event::Start(Tag::Emphasis) => {
179                italic = true;
180            }
181            Event::End(TagEnd::Emphasis) => {
182                italic = false;
183            }
184            Event::Start(Tag::Strong) => {
185                bold = true;
186            }
187            Event::End(TagEnd::Strong) => {
188                bold = false;
189            }
190            Event::Start(Tag::Strikethrough) => {
191                strikeout = true;
192            }
193            Event::End(TagEnd::Strikethrough) => {
194                strikeout = false;
195            }
196            Event::Start(Tag::Link { dest_url, .. }) => {
197                link_href = Some(dest_url.to_string());
198            }
199            Event::End(TagEnd::Link) => {
200                link_href = None;
201            }
202            Event::Text(text) => {
203                if !in_block {
204                    // Bare text outside any block — create an implicit paragraph
205                    in_block = true;
206                }
207                current_spans.push(ParsedSpan {
208                    text: text.to_string(),
209                    bold,
210                    italic,
211                    underline: false,
212                    strikeout,
213                    code: is_code_block,
214                    link_href: link_href.clone(),
215                });
216            }
217            Event::Code(text) => {
218                if !in_block {
219                    in_block = true;
220                }
221                current_spans.push(ParsedSpan {
222                    text: text.to_string(),
223                    bold,
224                    italic,
225                    underline: false,
226                    strikeout,
227                    code: true,
228                    link_href: link_href.clone(),
229                });
230            }
231            Event::SoftBreak => {
232                // Add a space
233                current_spans.push(ParsedSpan {
234                    text: " ".to_string(),
235                    bold,
236                    italic,
237                    underline: false,
238                    strikeout,
239                    code: false,
240                    link_href: link_href.clone(),
241                });
242            }
243            Event::HardBreak => {
244                // Finalize current block
245                if !current_spans.is_empty() || in_block {
246                    blocks.push(ParsedBlock {
247                        spans: std::mem::take(&mut current_spans),
248                        heading_level: current_heading.take(),
249                        list_style: current_list_style.clone(),
250                        is_code_block,
251                        code_language: code_language.clone(),
252                        blockquote_depth,
253                        line_height: None,
254                        non_breakable_lines: None,
255                        direction: None,
256                        background_color: None,
257                    });
258                }
259            }
260            Event::Start(Tag::BlockQuote(_)) => {
261                blockquote_depth += 1;
262            }
263            Event::End(TagEnd::BlockQuote(_)) => {
264                blockquote_depth = blockquote_depth.saturating_sub(1);
265            }
266            _ => {}
267        }
268    }
269
270    // Flush any remaining content
271    if !current_spans.is_empty() {
272        blocks.push(ParsedBlock {
273            spans: std::mem::take(&mut current_spans),
274            heading_level: current_heading,
275            list_style: current_list_style,
276            is_code_block,
277            code_language: code_language.take(),
278            blockquote_depth,
279            line_height: None,
280            non_breakable_lines: None,
281            direction: None,
282            background_color: None,
283        });
284    }
285
286    // If no blocks were parsed, create a single empty paragraph
287    if blocks.is_empty() {
288        blocks.push(ParsedBlock {
289            spans: vec![ParsedSpan {
290                text: String::new(),
291                ..Default::default()
292            }],
293            heading_level: None,
294            list_style: None,
295            is_code_block: false,
296            code_language: None,
297            blockquote_depth: 0,
298            line_height: None,
299            non_breakable_lines: None,
300            direction: None,
301            background_color: None,
302        });
303    }
304
305    blocks
306}
307
308fn heading_level_to_i64(level: pulldown_cmark::HeadingLevel) -> i64 {
309    use pulldown_cmark::HeadingLevel;
310    match level {
311        HeadingLevel::H1 => 1,
312        HeadingLevel::H2 => 2,
313        HeadingLevel::H3 => 3,
314        HeadingLevel::H4 => 4,
315        HeadingLevel::H5 => 5,
316        HeadingLevel::H6 => 6,
317    }
318}
319
320// ─── HTML parsing ────────────────────────────────────────────────────
321
322use scraper::Node;
323
/// Parsed CSS block-level styles from an inline `style` attribute.
///
/// Every field is `None` unless the corresponding property was present and
/// recognised by `parse_block_styles`.
#[derive(Debug, Clone, Default)]
struct BlockStyles {
    /// Unitless `line-height` multiplier scaled by 1000.
    line_height: Option<i64>,
    /// `Some(true)` when `white-space` is `pre`, `nowrap` or `pre-wrap`.
    non_breakable_lines: Option<bool>,
    /// Text direction from the `direction` property (`rtl` / `ltr`).
    direction: Option<TextDirection>,
    /// Raw value of `background-color` or `background`.
    background_color: Option<String>,
}
332
333/// Parse relevant CSS properties from an inline style string.
334/// Handles: line-height, white-space, direction, background-color.
335fn parse_block_styles(style: &str) -> BlockStyles {
336    let mut result = BlockStyles::default();
337    for part in style.split(';') {
338        let part = part.trim();
339        if let Some((prop, val)) = part.split_once(':') {
340            let prop = prop.trim().to_ascii_lowercase();
341            let val = val.trim();
342            match prop.as_str() {
343                "line-height" => {
344                    // Try parsing as a plain number (multiplier)
345                    if let Ok(v) = val.parse::<f64>() {
346                        result.line_height = Some((v * 1000.0) as i64);
347                    }
348                }
349                "white-space" => {
350                    if val == "pre" || val == "nowrap" || val == "pre-wrap" {
351                        result.non_breakable_lines = Some(true);
352                    }
353                }
354                "direction" => {
355                    if val.eq_ignore_ascii_case("rtl") {
356                        result.direction = Some(TextDirection::RightToLeft);
357                    } else if val.eq_ignore_ascii_case("ltr") {
358                        result.direction = Some(TextDirection::LeftToRight);
359                    }
360                }
361                "background-color" | "background" => {
362                    result.background_color = Some(val.to_string());
363                }
364                _ => {}
365            }
366        }
367    }
368    result
369}
370
371pub fn parse_html(html: &str) -> Vec<ParsedBlock> {
372    use scraper::Html;
373
374    let fragment = Html::parse_fragment(html);
375    let mut blocks: Vec<ParsedBlock> = Vec::new();
376
377    // Walk the DOM tree starting from the root
378    let root = fragment.root_element();
379
380    #[derive(Clone, Default)]
381    struct FmtState {
382        bold: bool,
383        italic: bool,
384        underline: bool,
385        strikeout: bool,
386        code: bool,
387        link_href: Option<String>,
388    }
389
390    const MAX_RECURSION_DEPTH: usize = 256;
391
392    fn walk_node(
393        node: ego_tree::NodeRef<Node>,
394        state: &FmtState,
395        blocks: &mut Vec<ParsedBlock>,
396        current_list_style: &Option<ListStyle>,
397        blockquote_depth: u32,
398        depth: usize,
399    ) {
400        if depth > MAX_RECURSION_DEPTH {
401            return;
402        }
403        match node.value() {
404            Node::Element(el) => {
405                let tag = el.name();
406                let mut new_state = state.clone();
407                let mut new_list_style = current_list_style.clone();
408                let mut bq_depth = blockquote_depth;
409
410                // Determine if this is a block-level element
411                let is_block_tag = matches!(
412                    tag,
413                    "p" | "div"
414                        | "h1"
415                        | "h2"
416                        | "h3"
417                        | "h4"
418                        | "h5"
419                        | "h6"
420                        | "li"
421                        | "pre"
422                        | "br"
423                        | "blockquote"
424                );
425
426                // Update formatting state
427                match tag {
428                    "b" | "strong" => new_state.bold = true,
429                    "i" | "em" => new_state.italic = true,
430                    "u" | "ins" => new_state.underline = true,
431                    "s" | "del" | "strike" => new_state.strikeout = true,
432                    "code" => new_state.code = true,
433                    "a" => {
434                        if let Some(href) = el.attr("href") {
435                            new_state.link_href = Some(href.to_string());
436                        }
437                    }
438                    "ul" => {
439                        new_list_style = Some(ListStyle::Disc);
440                    }
441                    "ol" => {
442                        new_list_style = Some(ListStyle::Decimal);
443                    }
444                    "blockquote" => {
445                        bq_depth += 1;
446                    }
447                    _ => {}
448                }
449
450                // Determine heading level
451                let heading_level = match tag {
452                    "h1" => Some(1),
453                    "h2" => Some(2),
454                    "h3" => Some(3),
455                    "h4" => Some(4),
456                    "h5" => Some(5),
457                    "h6" => Some(6),
458                    _ => None,
459                };
460
461                let is_code_block = tag == "pre";
462
463                // Extract code language from <pre><code class="language-xxx">
464                let code_language = if is_code_block {
465                    node.children().find_map(|child| {
466                        if let Node::Element(cel) = child.value()
467                            && cel.name() == "code"
468                            && let Some(cls) = cel.attr("class")
469                        {
470                            return cls
471                                .split_whitespace()
472                                .find_map(|c| c.strip_prefix("language-"))
473                                .map(|l| l.to_string());
474                        }
475                        None
476                    })
477                } else {
478                    None
479                };
480
481                // Extract CSS styles from block-level elements
482                let css = if is_block_tag {
483                    el.attr("style").map(parse_block_styles).unwrap_or_default()
484                } else {
485                    BlockStyles::default()
486                };
487
488                if tag == "br" {
489                    // <br> creates a new block
490                    blocks.push(ParsedBlock {
491                        spans: vec![ParsedSpan {
492                            text: String::new(),
493                            ..Default::default()
494                        }],
495                        heading_level: None,
496                        list_style: None,
497                        is_code_block: false,
498                        code_language: None,
499                        blockquote_depth: bq_depth,
500                        line_height: None,
501                        non_breakable_lines: None,
502                        direction: None,
503                        background_color: None,
504                    });
505                    return;
506                }
507
508                if tag == "blockquote" {
509                    // Blockquote is a container — recurse into children with increased depth
510                    for child in node.children() {
511                        walk_node(
512                            child,
513                            &new_state,
514                            blocks,
515                            &new_list_style,
516                            bq_depth,
517                            depth + 1,
518                        );
519                    }
520                } else if is_block_tag && tag != "br" {
521                    // Start collecting spans for a new block
522                    let mut spans: Vec<ParsedSpan> = Vec::new();
523                    collect_inline_spans(
524                        node,
525                        &new_state,
526                        &mut spans,
527                        &new_list_style,
528                        blocks,
529                        bq_depth,
530                        depth + 1,
531                    );
532
533                    let list_style_for_block = if tag == "li" {
534                        new_list_style.clone()
535                    } else {
536                        None
537                    };
538
539                    if !spans.is_empty() || heading_level.is_some() {
540                        blocks.push(ParsedBlock {
541                            spans,
542                            heading_level,
543                            list_style: list_style_for_block,
544                            is_code_block,
545                            code_language,
546                            blockquote_depth: bq_depth,
547                            line_height: css.line_height,
548                            non_breakable_lines: css.non_breakable_lines,
549                            direction: css.direction,
550                            background_color: css.background_color,
551                        });
552                    }
553                } else if matches!(tag, "ul" | "ol" | "table" | "thead" | "tbody" | "tr") {
554                    // Container elements: recurse into children
555                    for child in node.children() {
556                        walk_node(
557                            child,
558                            &new_state,
559                            blocks,
560                            &new_list_style,
561                            bq_depth,
562                            depth + 1,
563                        );
564                    }
565                } else {
566                    // Inline element or unknown: recurse
567                    for child in node.children() {
568                        walk_node(
569                            child,
570                            &new_state,
571                            blocks,
572                            current_list_style,
573                            bq_depth,
574                            depth + 1,
575                        );
576                    }
577                }
578            }
579            Node::Text(text) => {
580                let t = text.text.to_string();
581                let trimmed = t.trim();
582                if !trimmed.is_empty() {
583                    // Bare text not in a block — create a paragraph
584                    blocks.push(ParsedBlock {
585                        spans: vec![ParsedSpan {
586                            text: trimmed.to_string(),
587                            bold: state.bold,
588                            italic: state.italic,
589                            underline: state.underline,
590                            strikeout: state.strikeout,
591                            code: state.code,
592                            link_href: state.link_href.clone(),
593                        }],
594                        heading_level: None,
595                        list_style: None,
596                        is_code_block: false,
597                        code_language: None,
598                        blockquote_depth,
599                        line_height: None,
600                        non_breakable_lines: None,
601                        direction: None,
602                        background_color: None,
603                    });
604                }
605            }
606            _ => {
607                // Document, Comment, etc. — recurse children
608                for child in node.children() {
609                    walk_node(
610                        child,
611                        state,
612                        blocks,
613                        current_list_style,
614                        blockquote_depth,
615                        depth + 1,
616                    );
617                }
618            }
619        }
620    }
621
622    /// Collect inline spans from a block-level element's children.
623    /// If a nested block-level element is encountered, it is flushed as a
624    /// separate block.
625    fn collect_inline_spans(
626        node: ego_tree::NodeRef<Node>,
627        state: &FmtState,
628        spans: &mut Vec<ParsedSpan>,
629        current_list_style: &Option<ListStyle>,
630        blocks: &mut Vec<ParsedBlock>,
631        blockquote_depth: u32,
632        depth: usize,
633    ) {
634        if depth > MAX_RECURSION_DEPTH {
635            return;
636        }
637        for child in node.children() {
638            match child.value() {
639                Node::Text(text) => {
640                    let t = text.text.to_string();
641                    if !t.is_empty() {
642                        spans.push(ParsedSpan {
643                            text: t,
644                            bold: state.bold,
645                            italic: state.italic,
646                            underline: state.underline,
647                            strikeout: state.strikeout,
648                            code: state.code,
649                            link_href: state.link_href.clone(),
650                        });
651                    }
652                }
653                Node::Element(el) => {
654                    let tag = el.name();
655                    let mut new_state = state.clone();
656
657                    match tag {
658                        "b" | "strong" => new_state.bold = true,
659                        "i" | "em" => new_state.italic = true,
660                        "u" | "ins" => new_state.underline = true,
661                        "s" | "del" | "strike" => new_state.strikeout = true,
662                        "code" => new_state.code = true,
663                        "a" => {
664                            if let Some(href) = el.attr("href") {
665                                new_state.link_href = Some(href.to_string());
666                            }
667                        }
668                        _ => {}
669                    }
670
671                    // Check for nested block elements
672                    let nested_block = matches!(
673                        tag,
674                        "p" | "div"
675                            | "h1"
676                            | "h2"
677                            | "h3"
678                            | "h4"
679                            | "h5"
680                            | "h6"
681                            | "li"
682                            | "pre"
683                            | "blockquote"
684                            | "ul"
685                            | "ol"
686                    );
687
688                    if tag == "br" {
689                        // br within a block: treat as splitting into new block
690                        // For simplicity, just add a newline to current span
691                        spans.push(ParsedSpan {
692                            text: String::new(),
693                            ..Default::default()
694                        });
695                    } else if nested_block {
696                        // Flush as separate block
697                        walk_node(
698                            child,
699                            &new_state,
700                            blocks,
701                            current_list_style,
702                            blockquote_depth,
703                            depth + 1,
704                        );
705                    } else {
706                        // Inline element: recurse
707                        collect_inline_spans(
708                            child,
709                            &new_state,
710                            spans,
711                            current_list_style,
712                            blocks,
713                            blockquote_depth,
714                            depth + 1,
715                        );
716                    }
717                }
718                _ => {}
719            }
720        }
721    }
722
723    let initial_state = FmtState::default();
724    for child in root.children() {
725        walk_node(child, &initial_state, &mut blocks, &None, 0, 0);
726    }
727
728    // If no blocks were parsed, create a single empty paragraph
729    if blocks.is_empty() {
730        blocks.push(ParsedBlock {
731            spans: vec![ParsedSpan {
732                text: String::new(),
733                ..Default::default()
734            }],
735            heading_level: None,
736            list_style: None,
737            is_code_block: false,
738            code_language: None,
739            blockquote_depth: 0,
740            line_height: None,
741            non_breakable_lines: None,
742            direction: None,
743            background_color: None,
744        });
745    }
746
747    blocks
748}
749
750#[cfg(test)]
751mod tests {
752    use super::*;
753
754    #[test]
755    fn test_parse_markdown_simple_paragraph() {
756        let blocks = parse_markdown("Hello **world**");
757        assert_eq!(blocks.len(), 1);
758        assert!(blocks[0].spans.len() >= 2);
759        // "Hello " is plain, "world" is bold
760        let plain_span = blocks[0]
761            .spans
762            .iter()
763            .find(|s| s.text.contains("Hello"))
764            .unwrap();
765        assert!(!plain_span.bold);
766        let bold_span = blocks[0].spans.iter().find(|s| s.text == "world").unwrap();
767        assert!(bold_span.bold);
768    }
769
770    #[test]
771    fn test_parse_markdown_heading() {
772        let blocks = parse_markdown("# Title");
773        assert_eq!(blocks.len(), 1);
774        assert_eq!(blocks[0].heading_level, Some(1));
775        assert_eq!(blocks[0].spans[0].text, "Title");
776    }
777
778    #[test]
779    fn test_parse_markdown_list() {
780        let blocks = parse_markdown("- item1\n- item2");
781        assert!(blocks.len() >= 2);
782        assert_eq!(blocks[0].list_style, Some(ListStyle::Disc));
783        assert_eq!(blocks[1].list_style, Some(ListStyle::Disc));
784    }
785
786    #[test]
787    fn test_parse_html_simple() {
788        let blocks = parse_html("<p>Hello <b>world</b></p>");
789        assert_eq!(blocks.len(), 1);
790        assert!(blocks[0].spans.len() >= 2);
791        let bold_span = blocks[0].spans.iter().find(|s| s.text == "world").unwrap();
792        assert!(bold_span.bold);
793    }
794
795    #[test]
796    fn test_parse_html_multiple_paragraphs() {
797        let blocks = parse_html("<p>A</p><p>B</p>");
798        assert_eq!(blocks.len(), 2);
799    }
800
801    #[test]
802    fn test_parse_html_heading() {
803        let blocks = parse_html("<h2>Subtitle</h2>");
804        assert_eq!(blocks.len(), 1);
805        assert_eq!(blocks[0].heading_level, Some(2));
806    }
807
808    #[test]
809    fn test_parse_html_list() {
810        let blocks = parse_html("<ul><li>one</li><li>two</li></ul>");
811        assert!(blocks.len() >= 2);
812        assert_eq!(blocks[0].list_style, Some(ListStyle::Disc));
813    }
814
815    #[test]
816    fn test_parse_markdown_code_block() {
817        let blocks = parse_markdown("```\nfn main() {}\n```");
818        assert_eq!(blocks.len(), 1);
819        assert!(blocks[0].is_code_block);
820        assert!(blocks[0].spans[0].code);
821    }
822
823    #[test]
824    fn test_parse_markdown_nested_formatting() {
825        let blocks = parse_markdown("***bold italic***");
826        assert_eq!(blocks.len(), 1);
827        let span = &blocks[0].spans[0];
828        assert!(span.bold);
829        assert!(span.italic);
830    }
831
832    #[test]
833    fn test_parse_markdown_link() {
834        let blocks = parse_markdown("[click](http://example.com)");
835        assert_eq!(blocks.len(), 1);
836        let span = &blocks[0].spans[0];
837        assert_eq!(span.text, "click");
838        assert_eq!(span.link_href, Some("http://example.com".to_string()));
839    }
840
841    #[test]
842    fn test_parse_markdown_empty() {
843        let blocks = parse_markdown("");
844        assert_eq!(blocks.len(), 1);
845        assert!(blocks[0].spans[0].text.is_empty());
846    }
847
848    #[test]
849    fn test_parse_html_empty() {
850        let blocks = parse_html("");
851        assert_eq!(blocks.len(), 1);
852        assert!(blocks[0].spans[0].text.is_empty());
853    }
854
855    #[test]
856    fn test_parse_html_nested_formatting() {
857        let blocks = parse_html("<p><b><i>bold italic</i></b></p>");
858        assert_eq!(blocks.len(), 1);
859        let span = &blocks[0].spans[0];
860        assert!(span.bold);
861        assert!(span.italic);
862    }
863
864    #[test]
865    fn test_parse_html_link() {
866        let blocks = parse_html("<p><a href=\"http://example.com\">click</a></p>");
867        assert_eq!(blocks.len(), 1);
868        let span = &blocks[0].spans[0];
869        assert_eq!(span.text, "click");
870        assert_eq!(span.link_href, Some("http://example.com".to_string()));
871    }
872
873    #[test]
874    fn test_parse_html_ordered_list() {
875        let blocks = parse_html("<ol><li>first</li><li>second</li></ol>");
876        assert!(blocks.len() >= 2);
877        assert_eq!(blocks[0].list_style, Some(ListStyle::Decimal));
878    }
879
880    #[test]
881    fn test_parse_markdown_ordered_list() {
882        let blocks = parse_markdown("1. first\n2. second");
883        assert!(blocks.len() >= 2);
884        assert_eq!(blocks[0].list_style, Some(ListStyle::Decimal));
885    }
886
887    #[test]
888    fn test_parse_html_blockquote_nested() {
889        let blocks = parse_html("<p>before</p><blockquote>quoted</blockquote><p>after</p>");
890        assert!(blocks.len() >= 3);
891    }
892
893    #[test]
894    fn test_parse_block_styles_line_height() {
895        let styles = parse_block_styles("line-height: 1.5");
896        assert_eq!(styles.line_height, Some(1500));
897    }
898
899    #[test]
900    fn test_parse_block_styles_direction_rtl() {
901        let styles = parse_block_styles("direction: rtl");
902        assert_eq!(styles.direction, Some(TextDirection::RightToLeft));
903    }
904
905    #[test]
906    fn test_parse_block_styles_background_color() {
907        let styles = parse_block_styles("background-color: #ff0000");
908        assert_eq!(styles.background_color, Some("#ff0000".to_string()));
909    }
910
911    #[test]
912    fn test_parse_block_styles_white_space_pre() {
913        let styles = parse_block_styles("white-space: pre");
914        assert_eq!(styles.non_breakable_lines, Some(true));
915    }
916
917    #[test]
918    fn test_parse_block_styles_multiple() {
919        let styles = parse_block_styles("line-height: 2.0; direction: rtl; background-color: blue");
920        assert_eq!(styles.line_height, Some(2000));
921        assert_eq!(styles.direction, Some(TextDirection::RightToLeft));
922        assert_eq!(styles.background_color, Some("blue".to_string()));
923    }
924
925    #[test]
926    fn test_parse_html_block_styles_extracted() {
927        let blocks = parse_html(
928            r#"<p style="line-height: 1.5; direction: rtl; background-color: #ccc">text</p>"#,
929        );
930        assert_eq!(blocks.len(), 1);
931        assert_eq!(blocks[0].line_height, Some(1500));
932        assert_eq!(blocks[0].direction, Some(TextDirection::RightToLeft));
933        assert_eq!(blocks[0].background_color, Some("#ccc".to_string()));
934    }
935
936    #[test]
937    fn test_parse_html_white_space_pre() {
938        let blocks = parse_html(r#"<p style="white-space: pre">code</p>"#);
939        assert_eq!(blocks.len(), 1);
940        assert_eq!(blocks[0].non_breakable_lines, Some(true));
941    }
942
943    #[test]
944    fn test_parse_html_no_styles_returns_none() {
945        let blocks = parse_html("<p>plain</p>");
946        assert_eq!(blocks.len(), 1);
947        assert_eq!(blocks[0].line_height, None);
948        assert_eq!(blocks[0].direction, None);
949        assert_eq!(blocks[0].background_color, None);
950        assert_eq!(blocks[0].non_breakable_lines, None);
951    }
952}