Skip to main content

text_document_common/parser_tools/
content_parser.rs

1use crate::entities::{ListStyle, TextDirection};
2
/// A run of inline text plus the character formatting shared by the whole run.
///
/// Both [`parse_markdown`] and [`parse_html`] emit these; a block's visible
/// text is the concatenation of its spans' `text` in order.
#[derive(Debug, Clone, Default)]
pub struct ParsedSpan {
    /// Literal text of the run (may be empty).
    pub text: String,
    /// Bold / strong emphasis.
    pub bold: bool,
    /// Italic / emphasis.
    pub italic: bool,
    /// Underlined. Only the HTML parser sets this; the Markdown parser
    /// always writes `false`.
    pub underline: bool,
    /// Struck through.
    pub strikeout: bool,
    /// Inline code, or text that belongs to a Markdown code block.
    pub code: bool,
    /// Link destination when the run sits inside a hyperlink.
    pub link_href: Option<String>,
}
14
15/// A parsed block (paragraph, heading, list item, code block)
16#[derive(Debug, Clone)]
17pub struct ParsedBlock {
18    pub spans: Vec<ParsedSpan>,
19    pub heading_level: Option<i64>,
20    pub list_style: Option<ListStyle>,
21    pub is_code_block: bool,
22    pub line_height: Option<i64>,
23    pub non_breakable_lines: Option<bool>,
24    pub direction: Option<TextDirection>,
25    pub background_color: Option<String>,
26}
27
28// ─── Markdown parsing ────────────────────────────────────────────────
29
30pub fn parse_markdown(markdown: &str) -> Vec<ParsedBlock> {
31    use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
32
33    let options =
34        Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_TASKLISTS;
35    let parser = Parser::new_ext(markdown, options);
36
37    let mut blocks: Vec<ParsedBlock> = Vec::new();
38    let mut current_spans: Vec<ParsedSpan> = Vec::new();
39    let mut current_heading: Option<i64> = None;
40    let mut current_list_style: Option<ListStyle> = None;
41    let mut is_code_block = false;
42    let mut in_block = false;
43
44    // Formatting state stack
45    let mut bold = false;
46    let mut italic = false;
47    let mut strikeout = false;
48    let mut link_href: Option<String> = None;
49
50    // List style stack for nested lists
51    let mut list_stack: Vec<Option<ListStyle>> = Vec::new();
52
53    for event in parser {
54        match event {
55            Event::Start(Tag::Paragraph) => {
56                in_block = true;
57                current_heading = None;
58                is_code_block = false;
59            }
60            Event::End(TagEnd::Paragraph) => {
61                if !current_spans.is_empty() || in_block {
62                    blocks.push(ParsedBlock {
63                        spans: std::mem::take(&mut current_spans),
64                        heading_level: current_heading.take(),
65                        list_style: current_list_style.clone(),
66                        is_code_block: false,
67                        line_height: None,
68                        non_breakable_lines: None,
69                        direction: None,
70                        background_color: None,
71                    });
72                }
73                in_block = false;
74                current_list_style = None;
75            }
76            Event::Start(Tag::Heading { level, .. }) => {
77                in_block = true;
78                current_heading = Some(heading_level_to_i64(level));
79                is_code_block = false;
80            }
81            Event::End(TagEnd::Heading(_)) => {
82                blocks.push(ParsedBlock {
83                    spans: std::mem::take(&mut current_spans),
84                    heading_level: current_heading.take(),
85                    list_style: None,
86                    is_code_block: false,
87                    line_height: None,
88                    non_breakable_lines: None,
89                    direction: None,
90                    background_color: None,
91                });
92                in_block = false;
93            }
94            Event::Start(Tag::List(ordered)) => {
95                let style = if ordered.is_some() {
96                    Some(ListStyle::Decimal)
97                } else {
98                    Some(ListStyle::Disc)
99                };
100                list_stack.push(style);
101            }
102            Event::End(TagEnd::List(_)) => {
103                list_stack.pop();
104            }
105            Event::Start(Tag::Item) => {
106                in_block = true;
107                current_list_style = list_stack.last().cloned().flatten();
108            }
109            Event::End(TagEnd::Item) => {
110                // The paragraph inside the item will have already been flushed,
111                // but if there was no inner paragraph (tight list), flush now.
112                if !current_spans.is_empty() {
113                    blocks.push(ParsedBlock {
114                        spans: std::mem::take(&mut current_spans),
115                        heading_level: None,
116                        list_style: current_list_style.clone(),
117                        is_code_block: false,
118                        line_height: None,
119                        non_breakable_lines: None,
120                        direction: None,
121                        background_color: None,
122                    });
123                }
124                in_block = false;
125                current_list_style = None;
126            }
127            Event::Start(Tag::CodeBlock(_)) => {
128                in_block = true;
129                is_code_block = true;
130            }
131            Event::End(TagEnd::CodeBlock) => {
132                blocks.push(ParsedBlock {
133                    spans: std::mem::take(&mut current_spans),
134                    heading_level: None,
135                    list_style: None,
136                    is_code_block: true,
137                    line_height: None,
138                    non_breakable_lines: None,
139                    direction: None,
140                    background_color: None,
141                });
142                in_block = false;
143                is_code_block = false;
144            }
145            Event::Start(Tag::Emphasis) => {
146                italic = true;
147            }
148            Event::End(TagEnd::Emphasis) => {
149                italic = false;
150            }
151            Event::Start(Tag::Strong) => {
152                bold = true;
153            }
154            Event::End(TagEnd::Strong) => {
155                bold = false;
156            }
157            Event::Start(Tag::Strikethrough) => {
158                strikeout = true;
159            }
160            Event::End(TagEnd::Strikethrough) => {
161                strikeout = false;
162            }
163            Event::Start(Tag::Link { dest_url, .. }) => {
164                link_href = Some(dest_url.to_string());
165            }
166            Event::End(TagEnd::Link) => {
167                link_href = None;
168            }
169            Event::Text(text) => {
170                if !in_block {
171                    // Bare text outside any block — create an implicit paragraph
172                    in_block = true;
173                }
174                current_spans.push(ParsedSpan {
175                    text: text.to_string(),
176                    bold,
177                    italic,
178                    underline: false,
179                    strikeout,
180                    code: is_code_block,
181                    link_href: link_href.clone(),
182                });
183            }
184            Event::Code(text) => {
185                if !in_block {
186                    in_block = true;
187                }
188                current_spans.push(ParsedSpan {
189                    text: text.to_string(),
190                    bold,
191                    italic,
192                    underline: false,
193                    strikeout,
194                    code: true,
195                    link_href: link_href.clone(),
196                });
197            }
198            Event::SoftBreak => {
199                // Add a space
200                current_spans.push(ParsedSpan {
201                    text: " ".to_string(),
202                    bold,
203                    italic,
204                    underline: false,
205                    strikeout,
206                    code: false,
207                    link_href: link_href.clone(),
208                });
209            }
210            Event::HardBreak => {
211                // Finalize current block
212                if !current_spans.is_empty() || in_block {
213                    blocks.push(ParsedBlock {
214                        spans: std::mem::take(&mut current_spans),
215                        heading_level: current_heading.take(),
216                        list_style: current_list_style.clone(),
217                        is_code_block,
218                        line_height: None,
219                        non_breakable_lines: None,
220                        direction: None,
221                        background_color: None,
222                    });
223                }
224            }
225            _ => {}
226        }
227    }
228
229    // Flush any remaining content
230    if !current_spans.is_empty() {
231        blocks.push(ParsedBlock {
232            spans: std::mem::take(&mut current_spans),
233            heading_level: current_heading,
234            list_style: current_list_style,
235            is_code_block,
236            line_height: None,
237            non_breakable_lines: None,
238            direction: None,
239            background_color: None,
240        });
241    }
242
243    // If no blocks were parsed, create a single empty paragraph
244    if blocks.is_empty() {
245        blocks.push(ParsedBlock {
246            spans: vec![ParsedSpan {
247                text: String::new(),
248                ..Default::default()
249            }],
250            heading_level: None,
251            list_style: None,
252            is_code_block: false,
253            line_height: None,
254            non_breakable_lines: None,
255            direction: None,
256            background_color: None,
257        });
258    }
259
260    blocks
261}
262
263fn heading_level_to_i64(level: pulldown_cmark::HeadingLevel) -> i64 {
264    use pulldown_cmark::HeadingLevel;
265    match level {
266        HeadingLevel::H1 => 1,
267        HeadingLevel::H2 => 2,
268        HeadingLevel::H3 => 3,
269        HeadingLevel::H4 => 4,
270        HeadingLevel::H5 => 5,
271        HeadingLevel::H6 => 6,
272    }
273}
274
275// ─── HTML parsing ────────────────────────────────────────────────────
276
277use scraper::Node;
278
279/// Parsed CSS block-level styles from an inline `style` attribute.
280#[derive(Debug, Clone, Default)]
281struct BlockStyles {
282    line_height: Option<i64>,
283    non_breakable_lines: Option<bool>,
284    direction: Option<TextDirection>,
285    background_color: Option<String>,
286}
287
288/// Parse relevant CSS properties from an inline style string.
289/// Handles: line-height, white-space, direction, background-color.
290fn parse_block_styles(style: &str) -> BlockStyles {
291    let mut result = BlockStyles::default();
292    for part in style.split(';') {
293        let part = part.trim();
294        if let Some((prop, val)) = part.split_once(':') {
295            let prop = prop.trim().to_ascii_lowercase();
296            let val = val.trim();
297            match prop.as_str() {
298                "line-height" => {
299                    // Try parsing as a plain number (multiplier)
300                    if let Ok(v) = val.parse::<f64>() {
301                        result.line_height = Some((v * 1000.0) as i64);
302                    }
303                }
304                "white-space" => {
305                    if val == "pre" || val == "nowrap" || val == "pre-wrap" {
306                        result.non_breakable_lines = Some(true);
307                    }
308                }
309                "direction" => {
310                    if val.eq_ignore_ascii_case("rtl") {
311                        result.direction = Some(TextDirection::RightToLeft);
312                    } else if val.eq_ignore_ascii_case("ltr") {
313                        result.direction = Some(TextDirection::LeftToRight);
314                    }
315                }
316                "background-color" | "background" => {
317                    result.background_color = Some(val.to_string());
318                }
319                _ => {}
320            }
321        }
322    }
323    result
324}
325
326pub fn parse_html(html: &str) -> Vec<ParsedBlock> {
327    use scraper::Html;
328
329    let fragment = Html::parse_fragment(html);
330    let mut blocks: Vec<ParsedBlock> = Vec::new();
331
332    // Walk the DOM tree starting from the root
333    let root = fragment.root_element();
334
335    #[derive(Clone, Default)]
336    struct FmtState {
337        bold: bool,
338        italic: bool,
339        underline: bool,
340        strikeout: bool,
341        code: bool,
342        link_href: Option<String>,
343    }
344
345    const MAX_RECURSION_DEPTH: usize = 256;
346
347    fn walk_node(
348        node: ego_tree::NodeRef<Node>,
349        state: &FmtState,
350        blocks: &mut Vec<ParsedBlock>,
351        current_list_style: &Option<ListStyle>,
352        depth: usize,
353    ) {
354        if depth > MAX_RECURSION_DEPTH {
355            return;
356        }
357        match node.value() {
358            Node::Element(el) => {
359                let tag = el.name();
360                let mut new_state = state.clone();
361                let mut new_list_style = current_list_style.clone();
362
363                // Determine if this is a block-level element
364                let is_block_tag = matches!(
365                    tag,
366                    "p" | "div"
367                        | "h1"
368                        | "h2"
369                        | "h3"
370                        | "h4"
371                        | "h5"
372                        | "h6"
373                        | "li"
374                        | "pre"
375                        | "br"
376                        | "blockquote"
377                );
378
379                // Update formatting state
380                match tag {
381                    "b" | "strong" => new_state.bold = true,
382                    "i" | "em" => new_state.italic = true,
383                    "u" | "ins" => new_state.underline = true,
384                    "s" | "del" | "strike" => new_state.strikeout = true,
385                    "code" => new_state.code = true,
386                    "a" => {
387                        if let Some(href) = el.attr("href") {
388                            new_state.link_href = Some(href.to_string());
389                        }
390                    }
391                    "ul" => {
392                        new_list_style = Some(ListStyle::Disc);
393                    }
394                    "ol" => {
395                        new_list_style = Some(ListStyle::Decimal);
396                    }
397                    _ => {}
398                }
399
400                // Determine heading level
401                let heading_level = match tag {
402                    "h1" => Some(1),
403                    "h2" => Some(2),
404                    "h3" => Some(3),
405                    "h4" => Some(4),
406                    "h5" => Some(5),
407                    "h6" => Some(6),
408                    _ => None,
409                };
410
411                let is_code_block = tag == "pre";
412
413                // Extract CSS styles from block-level elements
414                let css = if is_block_tag {
415                    el.attr("style").map(parse_block_styles).unwrap_or_default()
416                } else {
417                    BlockStyles::default()
418                };
419
420                if tag == "br" {
421                    // <br> creates a new block
422                    blocks.push(ParsedBlock {
423                        spans: vec![ParsedSpan {
424                            text: String::new(),
425                            ..Default::default()
426                        }],
427                        heading_level: None,
428                        list_style: None,
429                        is_code_block: false,
430                        line_height: None,
431                        non_breakable_lines: None,
432                        direction: None,
433                        background_color: None,
434                    });
435                    return;
436                }
437
438                if is_block_tag && tag != "br" {
439                    // Start collecting spans for a new block
440                    let mut spans: Vec<ParsedSpan> = Vec::new();
441                    collect_inline_spans(
442                        node,
443                        &new_state,
444                        &mut spans,
445                        &new_list_style,
446                        blocks,
447                        depth + 1,
448                    );
449
450                    let list_style_for_block = if tag == "li" {
451                        new_list_style.clone()
452                    } else {
453                        None
454                    };
455
456                    if !spans.is_empty() || heading_level.is_some() {
457                        blocks.push(ParsedBlock {
458                            spans,
459                            heading_level,
460                            list_style: list_style_for_block,
461                            is_code_block,
462                            line_height: css.line_height,
463                            non_breakable_lines: css.non_breakable_lines,
464                            direction: css.direction,
465                            background_color: css.background_color,
466                        });
467                    }
468                } else if matches!(tag, "ul" | "ol" | "table" | "thead" | "tbody" | "tr") {
469                    // Container elements: recurse into children
470                    for child in node.children() {
471                        walk_node(child, &new_state, blocks, &new_list_style, depth + 1);
472                    }
473                } else {
474                    // Inline element or unknown: recurse
475                    for child in node.children() {
476                        walk_node(child, &new_state, blocks, current_list_style, depth + 1);
477                    }
478                }
479            }
480            Node::Text(text) => {
481                let t = text.text.to_string();
482                let trimmed = t.trim();
483                if !trimmed.is_empty() {
484                    // Bare text not in a block — create a paragraph
485                    blocks.push(ParsedBlock {
486                        spans: vec![ParsedSpan {
487                            text: trimmed.to_string(),
488                            bold: state.bold,
489                            italic: state.italic,
490                            underline: state.underline,
491                            strikeout: state.strikeout,
492                            code: state.code,
493                            link_href: state.link_href.clone(),
494                        }],
495                        heading_level: None,
496                        list_style: None,
497                        is_code_block: false,
498                        line_height: None,
499                        non_breakable_lines: None,
500                        direction: None,
501                        background_color: None,
502                    });
503                }
504            }
505            _ => {
506                // Document, Comment, etc. — recurse children
507                for child in node.children() {
508                    walk_node(child, state, blocks, current_list_style, depth + 1);
509                }
510            }
511        }
512    }
513
514    /// Collect inline spans from a block-level element's children.
515    /// If a nested block-level element is encountered, it is flushed as a
516    /// separate block.
517    fn collect_inline_spans(
518        node: ego_tree::NodeRef<Node>,
519        state: &FmtState,
520        spans: &mut Vec<ParsedSpan>,
521        current_list_style: &Option<ListStyle>,
522        blocks: &mut Vec<ParsedBlock>,
523        depth: usize,
524    ) {
525        if depth > MAX_RECURSION_DEPTH {
526            return;
527        }
528        for child in node.children() {
529            match child.value() {
530                Node::Text(text) => {
531                    let t = text.text.to_string();
532                    if !t.is_empty() {
533                        spans.push(ParsedSpan {
534                            text: t,
535                            bold: state.bold,
536                            italic: state.italic,
537                            underline: state.underline,
538                            strikeout: state.strikeout,
539                            code: state.code,
540                            link_href: state.link_href.clone(),
541                        });
542                    }
543                }
544                Node::Element(el) => {
545                    let tag = el.name();
546                    let mut new_state = state.clone();
547
548                    match tag {
549                        "b" | "strong" => new_state.bold = true,
550                        "i" | "em" => new_state.italic = true,
551                        "u" | "ins" => new_state.underline = true,
552                        "s" | "del" | "strike" => new_state.strikeout = true,
553                        "code" => new_state.code = true,
554                        "a" => {
555                            if let Some(href) = el.attr("href") {
556                                new_state.link_href = Some(href.to_string());
557                            }
558                        }
559                        _ => {}
560                    }
561
562                    // Check for nested block elements
563                    let nested_block = matches!(
564                        tag,
565                        "p" | "div"
566                            | "h1"
567                            | "h2"
568                            | "h3"
569                            | "h4"
570                            | "h5"
571                            | "h6"
572                            | "li"
573                            | "pre"
574                            | "blockquote"
575                            | "ul"
576                            | "ol"
577                    );
578
579                    if tag == "br" {
580                        // br within a block: treat as splitting into new block
581                        // For simplicity, just add a newline to current span
582                        spans.push(ParsedSpan {
583                            text: String::new(),
584                            ..Default::default()
585                        });
586                    } else if nested_block {
587                        // Flush as separate block
588                        walk_node(child, &new_state, blocks, current_list_style, depth + 1);
589                    } else {
590                        // Inline element: recurse
591                        collect_inline_spans(
592                            child,
593                            &new_state,
594                            spans,
595                            current_list_style,
596                            blocks,
597                            depth + 1,
598                        );
599                    }
600                }
601                _ => {}
602            }
603        }
604    }
605
606    let initial_state = FmtState::default();
607    for child in root.children() {
608        walk_node(child, &initial_state, &mut blocks, &None, 0);
609    }
610
611    // If no blocks were parsed, create a single empty paragraph
612    if blocks.is_empty() {
613        blocks.push(ParsedBlock {
614            spans: vec![ParsedSpan {
615                text: String::new(),
616                ..Default::default()
617            }],
618            heading_level: None,
619            list_style: None,
620            is_code_block: false,
621            line_height: None,
622            non_breakable_lines: None,
623            direction: None,
624            background_color: None,
625        });
626    }
627
628    blocks
629}
630
#[cfg(test)]
mod tests {
    use super::*;

    // ── Markdown ─────────────────────────────────────────────────────

    /// Plain and bold text in one paragraph end up in separate spans.
    #[test]
    fn test_parse_markdown_simple_paragraph() {
        let parsed = parse_markdown("Hello **world**");
        assert_eq!(parsed.len(), 1);
        let spans = &parsed[0].spans;
        assert!(spans.len() >= 2);
        // "Hello " is plain, "world" is bold
        let plain = spans.iter().find(|s| s.text.contains("Hello")).unwrap();
        assert!(!plain.bold);
        let strong = spans.iter().find(|s| s.text == "world").unwrap();
        assert!(strong.bold);
    }

    /// `# Title` yields one level-1 heading block.
    #[test]
    fn test_parse_markdown_heading() {
        let parsed = parse_markdown("# Title");
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].heading_level, Some(1));
        assert_eq!(parsed[0].spans[0].text, "Title");
    }

    /// Each bullet of an unordered list is a `Disc` block.
    #[test]
    fn test_parse_markdown_list() {
        let parsed = parse_markdown("- item1\n- item2");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Disc));
        assert_eq!(parsed[1].list_style, Some(ListStyle::Disc));
    }

    /// Numbered items are `Decimal` blocks.
    #[test]
    fn test_parse_markdown_ordered_list() {
        let parsed = parse_markdown("1. first\n2. second");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Decimal));
    }

    /// A fenced block is flagged on both the block and its spans.
    #[test]
    fn test_parse_markdown_code_block() {
        let parsed = parse_markdown("```\nfn main() {}\n```");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].is_code_block);
        assert!(parsed[0].spans[0].code);
    }

    /// `***x***` is both bold and italic.
    #[test]
    fn test_parse_markdown_nested_formatting() {
        let parsed = parse_markdown("***bold italic***");
        assert_eq!(parsed.len(), 1);
        let first = &parsed[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    /// Link text carries the destination URL.
    #[test]
    fn test_parse_markdown_link() {
        let parsed = parse_markdown("[click](http://example.com)");
        assert_eq!(parsed.len(), 1);
        let first = &parsed[0].spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href, Some("http://example.com".to_string()));
    }

    /// Empty input still produces one block with one empty span.
    #[test]
    fn test_parse_markdown_empty() {
        let parsed = parse_markdown("");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans[0].text.is_empty());
    }

    // ── HTML ─────────────────────────────────────────────────────────

    /// `<b>` inside `<p>` yields a bold span in the same block.
    #[test]
    fn test_parse_html_simple() {
        let parsed = parse_html("<p>Hello <b>world</b></p>");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans.len() >= 2);
        let strong = parsed[0].spans.iter().find(|s| s.text == "world").unwrap();
        assert!(strong.bold);
    }

    /// Sibling paragraphs become separate blocks.
    #[test]
    fn test_parse_html_multiple_paragraphs() {
        let parsed = parse_html("<p>A</p><p>B</p>");
        assert_eq!(parsed.len(), 2);
    }

    /// `<h2>` maps to heading level 2.
    #[test]
    fn test_parse_html_heading() {
        let parsed = parse_html("<h2>Subtitle</h2>");
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].heading_level, Some(2));
    }

    /// `<ul>` items are `Disc` blocks.
    #[test]
    fn test_parse_html_list() {
        let parsed = parse_html("<ul><li>one</li><li>two</li></ul>");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Disc));
    }

    /// `<ol>` items are `Decimal` blocks.
    #[test]
    fn test_parse_html_ordered_list() {
        let parsed = parse_html("<ol><li>first</li><li>second</li></ol>");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Decimal));
    }

    /// Empty input still produces one block with one empty span.
    #[test]
    fn test_parse_html_empty() {
        let parsed = parse_html("");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans[0].text.is_empty());
    }

    /// Nested inline tags accumulate their formatting.
    #[test]
    fn test_parse_html_nested_formatting() {
        let parsed = parse_html("<p><b><i>bold italic</i></b></p>");
        assert_eq!(parsed.len(), 1);
        let first = &parsed[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    /// Anchor text carries its `href`.
    #[test]
    fn test_parse_html_link() {
        let parsed = parse_html("<p><a href=\"http://example.com\">click</a></p>");
        assert_eq!(parsed.len(), 1);
        let first = &parsed[0].spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href, Some("http://example.com".to_string()));
    }

    /// A blockquote between paragraphs is a block of its own.
    #[test]
    fn test_parse_html_blockquote_nested() {
        let parsed = parse_html("<p>before</p><blockquote>quoted</blockquote><p>after</p>");
        assert!(parsed.len() >= 3);
    }

    // ── Inline CSS ───────────────────────────────────────────────────

    /// A unitless line-height is stored scaled by 1000.
    #[test]
    fn test_parse_block_styles_line_height() {
        let css = parse_block_styles("line-height: 1.5");
        assert_eq!(css.line_height, Some(1500));
    }

    #[test]
    fn test_parse_block_styles_direction_rtl() {
        let css = parse_block_styles("direction: rtl");
        assert_eq!(css.direction, Some(TextDirection::RightToLeft));
    }

    #[test]
    fn test_parse_block_styles_background_color() {
        let css = parse_block_styles("background-color: #ff0000");
        assert_eq!(css.background_color, Some("#ff0000".to_string()));
    }

    #[test]
    fn test_parse_block_styles_white_space_pre() {
        let css = parse_block_styles("white-space: pre");
        assert_eq!(css.non_breakable_lines, Some(true));
    }

    /// Several declarations in one attribute are all picked up.
    #[test]
    fn test_parse_block_styles_multiple() {
        let css = parse_block_styles("line-height: 2.0; direction: rtl; background-color: blue");
        assert_eq!(css.line_height, Some(2000));
        assert_eq!(css.direction, Some(TextDirection::RightToLeft));
        assert_eq!(css.background_color, Some("blue".to_string()));
    }

    /// Inline styles on a block element reach the resulting block.
    #[test]
    fn test_parse_html_block_styles_extracted() {
        let parsed = parse_html(
            r#"<p style="line-height: 1.5; direction: rtl; background-color: #ccc">text</p>"#,
        );
        assert_eq!(parsed.len(), 1);
        let block = &parsed[0];
        assert_eq!(block.line_height, Some(1500));
        assert_eq!(block.direction, Some(TextDirection::RightToLeft));
        assert_eq!(block.background_color, Some("#ccc".to_string()));
    }

    #[test]
    fn test_parse_html_white_space_pre() {
        let parsed = parse_html(r#"<p style="white-space: pre">code</p>"#);
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].non_breakable_lines, Some(true));
    }

    /// Without a `style` attribute every CSS-derived field stays `None`.
    #[test]
    fn test_parse_html_no_styles_returns_none() {
        let parsed = parse_html("<p>plain</p>");
        assert_eq!(parsed.len(), 1);
        let block = &parsed[0];
        assert_eq!(block.line_height, None);
        assert_eq!(block.direction, None);
        assert_eq!(block.background_color, None);
        assert_eq!(block.non_breakable_lines, None);
    }
}