Skip to main content

carta_readers/html/
mod.rs

1//! HTML reader.
2//!
3//! Parsing runs in three stages: a tokenizer (`tokenize`) turns the source into a flat stream of
4//! start tags, end tags and text; a tree builder (`tree::build_tree`) assembles that stream into a
5//! node tree, applying void-element and implied-end-tag rules; and a `convert::Converter` walks the
6//! tree into a [`Document`]. Document metadata is read from a `<head>` element when present.
7
8mod classify;
9mod convert;
10mod notes;
11mod table;
12mod tokenize;
13mod tree;
14
15use std::borrow::Cow;
16
17use carta_ast::Document;
18use carta_core::{Extensions, Reader, ReaderOptions, Result};
19
20#[cfg(feature = "opml")]
21use carta_ast::Inline;
22
23#[cfg(feature = "opml")]
24use convert::inlines_from_nodes;
25use convert::{Converter, extract_meta};
26use tokenize::tokenize;
27use tree::{build_tree, locate};
28
/// Parses HTML text into the document model.
///
/// A stateless unit type: parsing is driven through its [`Reader`] implementation, which takes
/// the enabled extensions from the supplied [`ReaderOptions`].
#[derive(Debug, Default, Clone, Copy)]
pub struct HtmlReader;
32
33impl Reader for HtmlReader {
34    fn read(&self, input: &str, options: &ReaderOptions) -> Result<Document> {
35        Ok(parse(input, options.extensions))
36    }
37}
38
39fn parse(input: &str, ext: Extensions) -> Document {
40    let normalized = normalize(input);
41    let chars: Vec<char> = normalized.chars().collect();
42    let tokens = tokenize(&chars);
43    let roots = build_tree(tokens);
44    let (head, body) = locate(&roots);
45
46    let mut converter = Converter::new(ext);
47    converter.index_notes(notes::collect_note_defs(&body));
48    let meta = head.map(extract_meta).unwrap_or_default();
49    let blocks = converter.blocks(&body, false);
50    Document {
51        meta: meta.into_iter().map(|(k, v)| (k.into(), v)).collect(),
52        blocks,
53        ..Document::default()
54    }
55}
56
/// Convert a single string of inline HTML markup into inlines, without wrapping it in a block.
/// The input goes through the same normalize, tokenize and tree-building steps as a full
/// document, and the resulting root nodes are handed to `inlines_from_nodes`. Recognized inline
/// tags (`<em>`, `<strong>`, `<code>`, `<a>`, …) become their corresponding constructs, character
/// references are resolved, and leading and trailing whitespace is trimmed. Meant for callers
/// that hold inline content in one string, such as an outline heading.
#[cfg(feature = "opml")]
pub(crate) fn parse_inline_fragment(input: &str) -> Vec<Inline> {
    let source: Vec<char> = normalize(input).chars().collect();
    let roots = build_tree(tokenize(&source));
    inlines_from_nodes(&roots)
}
69
/// Strip one leading byte-order mark and rewrite every `\r\n` or lone `\r` as `\n`.
///
/// Input without any `\r` is returned borrowed; otherwise a new string is built.
fn normalize(input: &str) -> Cow<'_, str> {
    let text = input.strip_prefix('\u{feff}').unwrap_or(input);
    if text.contains('\r') {
        let mut normalized = String::with_capacity(text.len());
        // Every segment after the first was preceded by a `\r`; a `\n` directly after that `\r`
        // belongs to the same line ending and is folded into the single `\n` emitted here.
        for (index, segment) in text.split('\r').enumerate() {
            if index == 0 {
                normalized.push_str(segment);
            } else {
                normalized.push('\n');
                normalized.push_str(segment.strip_prefix('\n').unwrap_or(segment));
            }
        }
        Cow::Owned(normalized)
    } else {
        Cow::Borrowed(text)
    }
}
91
92#[cfg(test)]
93mod tests {
94    use super::HtmlReader;
95    use carta_ast::{Block, Inline, MathType};
96    use carta_core::{Extension, Extensions, Reader, ReaderOptions};
97
98    /// The structural extensions enabled by default for the `html` format. The unit tests exercise
99    /// this default dialect; `+`/`-` toggle behavior is covered by the golden corpus.
100    fn html_defaults() -> Extensions {
101        Extensions::from_list(&[
102            Extension::AutoIdentifiers,
103            Extension::LineBlocks,
104            Extension::NativeDivs,
105            Extension::NativeSpans,
106        ])
107    }
108
109    fn read_with(input: &str, extensions: Extensions) -> Vec<Block> {
110        let mut options = ReaderOptions::default();
111        options.extensions = extensions;
112        HtmlReader
113            .read(input, &options)
114            .expect("reader should not fail")
115            .blocks
116    }
117
118    fn blocks(input: &str) -> Vec<Block> {
119        read_with(input, html_defaults())
120    }
121
122    #[test]
123    fn paragraph_with_emphasis() {
124        let result = blocks("<p>a <em>b</em></p>");
125        assert!(matches!(result.as_slice(), [Block::Para(_)]));
126    }
127
128    #[test]
129    fn loose_text_is_plain() {
130        assert!(matches!(blocks("hello").as_slice(), [Block::Plain(_)]));
131    }
132
133    #[test]
134    fn paragraph_sibling_promotes_loose_text() {
135        let result = blocks("loose<p>para</p>");
136        assert!(matches!(
137            result.as_slice(),
138            [Block::Para(_), Block::Para(_)]
139        ));
140    }
141
142    #[test]
143    fn horizontal_rule_does_not_promote() {
144        let result = blocks("loose<hr>");
145        assert!(matches!(
146            result.as_slice(),
147            [Block::Plain(_), Block::HorizontalRule]
148        ));
149    }
150
151    #[test]
152    fn nested_list_inside_item_stays_tight() {
153        let result = blocks("<ul><li>a<ul><li>b</li></ul></li></ul>");
154        let Some(Block::BulletList(items)) = result.first() else {
155            panic!("expected bullet list");
156        };
157        let Some(item) = items.first() else {
158            panic!("expected one item");
159        };
160        assert!(matches!(item.first(), Some(Block::Plain(_))));
161    }
162
163    #[test]
164    fn heading_generates_identifier() {
165        let result = blocks("<h1>Hello World</h1>");
166        let Some(Block::Header(level, attr, _)) = result.first() else {
167            panic!("expected header");
168        };
169        assert_eq!(*level, 1);
170        assert_eq!(attr.id, "hello-world");
171    }
172
173    #[test]
174    fn duplicate_identifiers_are_disambiguated() {
175        let result = blocks("<h1>Sec</h1><h2>Sec</h2>");
176        let ids: Vec<&str> = result
177            .iter()
178            .filter_map(|block| match block {
179                Block::Header(_, attr, _) => Some(attr.id.as_str()),
180                _ => None,
181            })
182            .collect();
183        assert_eq!(ids, vec!["sec", "sec-1"]);
184    }
185
186    #[test]
187    fn entities_are_decoded() {
188        let result = blocks("<p>a &amp; b &copy; c</p>");
189        let Some(Block::Para(inlines)) = result.first() else {
190            panic!("expected paragraph");
191        };
192        assert!(inlines.contains(&Inline::Str("&".to_string().into())));
193        assert!(inlines.contains(&Inline::Str("\u{a9}".to_string().into())));
194    }
195
196    #[test]
197    fn comment_joins_surrounding_text() {
198        let result = blocks("<p>a<!-- c -->b</p>");
199        let Some(Block::Para(inlines)) = result.first() else {
200            panic!("expected paragraph");
201        };
202        assert_eq!(inlines.as_slice(), [Inline::Str("ab".to_string().into())]);
203    }
204
205    #[test]
206    fn script_content_is_dropped() {
207        assert!(blocks("<script>var x = 1;</script><p>p</p>").len() == 1);
208    }
209
210    #[test]
211    fn head_metadata_is_extracted() {
212        let document = HtmlReader
213            .read(
214                "<head><title>T</title><meta name=\"author\" content=\"A\"></head><body><p>b</p></body>",
215                &ReaderOptions::default(),
216            )
217            .expect("reader should not fail");
218        assert!(document.meta.contains_key("title"));
219        assert!(document.meta.contains_key("author"));
220    }
221
222    use carta_ast::{Alignment, ColWidth, ListNumberStyle, Target};
223
224    fn first_block(input: &str) -> Block {
225        blocks(input).into_iter().next().expect("a block")
226    }
227
228    fn para_inlines(input: &str) -> Vec<Inline> {
229        match first_block(input) {
230            Block::Para(inlines) | Block::Plain(inlines) => inlines,
231            other => panic!("expected a paragraph, got {other:?}"),
232        }
233    }
234
235    #[test]
236    fn normalizes_crlf_and_strips_bom() {
237        let inlines = para_inlines("\u{feff}<p>a\r\nb</p>");
238        assert_eq!(
239            inlines.as_slice(),
240            [
241                Inline::Str("a".to_string().into()),
242                Inline::SoftBreak,
243                Inline::Str("b".to_string().into())
244            ]
245        );
246    }
247
248    #[test]
249    fn ordered_list_reads_type_and_start() {
250        let Block::OrderedList(attrs, items) =
251            first_block(r#"<ol type="A" start="3"><li>x</li><li>y</li></ol>"#)
252        else {
253            panic!("expected ordered list");
254        };
255        assert_eq!(attrs.start, 3);
256        assert_eq!(attrs.style, ListNumberStyle::UpperAlpha);
257        assert_eq!(items.len(), 2);
258    }
259
260    #[test]
261    fn menu_is_a_bullet_list() {
262        assert!(matches!(
263            first_block("<menu><li>a</li></menu>"),
264            Block::BulletList(_)
265        ));
266    }
267
268    #[test]
269    fn implied_li_close_splits_items() {
270        let Block::BulletList(items) = first_block("<ul><li>a<li>b</ul>") else {
271            panic!("expected bullet list");
272        };
273        assert_eq!(items.len(), 2);
274    }
275
276    #[test]
277    fn pre_with_code_language_class_becomes_code_block() {
278        let Block::CodeBlock(attr, text) = first_block(
279            r#"<pre><code class="language-rust">let x = 1;
280</code></pre>"#,
281        ) else {
282            panic!("expected code block");
283        };
284        assert_eq!(attr.classes, vec!["rust".to_string()]);
285        assert_eq!(text, "let x = 1;");
286    }
287
288    #[test]
289    fn definition_list_pairs_terms_and_definitions() {
290        let Block::DefinitionList(items) =
291            first_block("<dl><dt>term</dt><dd>one</dd><dd>two</dd></dl>")
292        else {
293            panic!("expected definition list");
294        };
295        let (term, defs) = items.into_iter().next().expect("an item");
296        assert_eq!(term, vec![Inline::Str("term".to_string().into())]);
297        assert_eq!(defs.len(), 2);
298    }
299
300    #[test]
301    fn blockquote_wraps_child_blocks() {
302        assert!(matches!(
303            first_block("<blockquote><p>q</p></blockquote>"),
304            Block::BlockQuote(_)
305        ));
306    }
307
308    #[test]
309    fn sectioning_div_gets_a_class() {
310        let Block::Div(attr, _) = first_block("<section><p>x</p></section>") else {
311            panic!("expected div");
312        };
313        assert!(attr.classes.contains(&"section".into()));
314    }
315
316    #[test]
317    fn figure_separates_caption_from_content() {
318        let Block::Figure(_, caption, content) =
319            first_block("<figure><img src=\"a.png\"><figcaption>cap</figcaption></figure>")
320        else {
321            panic!("expected figure");
322        };
323        assert_eq!(caption.short, None);
324        assert!(!caption.long.is_empty());
325        assert!(!content.is_empty());
326    }
327
328    #[test]
329    fn table_reads_sections_alignment_and_spans() {
330        let input = r#"<table>
331            <caption>cap</caption>
332            <colgroup><col style="width: 25%"><col></colgroup>
333            <thead><tr><th align="right">H1</th><th>H2</th></tr></thead>
334            <tbody><tr><td colspan="2">wide</td></tr></tbody>
335            <tfoot><tr><td>f1</td><td>f2</td></tr></tfoot>
336        </table>"#;
337        let Block::Table(table) = first_block(input) else {
338            panic!("expected table");
339        };
340        assert_eq!(table.col_specs.len(), 2);
341        assert_eq!(
342            table.col_specs.first().map(|spec| spec.width.clone()),
343            Some(ColWidth::ColWidth(0.25))
344        );
345        assert_eq!(
346            table
347                .head
348                .rows
349                .first()
350                .and_then(|row| row.cells.first())
351                .map(|cell| cell.align.clone()),
352            Some(Alignment::AlignRight)
353        );
354        let body_cell_span = table
355            .bodies
356            .first()
357            .and_then(|body| body.body.first())
358            .and_then(|row| row.cells.first())
359            .map(|cell| cell.col_span);
360        assert_eq!(body_cell_span, Some(2));
361        assert_eq!(table.foot.rows.len(), 1);
362    }
363
364    #[test]
365    fn oversized_cell_spans_are_clamped() {
366        // A cell span materialises one grid slot per spanned column (and a carry per spanned row),
367        // so an unbounded `colspan="90000000"` once forced a multi-gigabyte allocation that a
368        // nightly fuzz run hit as an out-of-memory crash. Spans are now clamped to the HTML spec's
369        // limits, keeping the input parseable in bounded memory.
370        let input = r#"<table><tr><td colspan="90000000" rowspan="2">x</td></tr><tr><td>y</td></tr></table>"#;
371        let Block::Table(table) = first_block(input) else {
372            panic!("expected table");
373        };
374        let cell_span = table
375            .bodies
376            .first()
377            .and_then(|body| body.body.first())
378            .and_then(|row| row.cells.first())
379            .map(|cell| (cell.col_span, cell.row_span));
380        assert_eq!(cell_span, Some((1000, 2)));
381    }
382
383    #[test]
384    fn cell_alignment_reads_text_align_style() {
385        let Block::Table(table) =
386            first_block(r#"<table><tr><td style="text-align: center">c</td></tr></table>"#)
387        else {
388            panic!("expected table");
389        };
390        let align = table
391            .bodies
392            .first()
393            .and_then(|body| body.body.first())
394            .and_then(|row| row.cells.first())
395            .map(|cell| cell.align.clone());
396        assert_eq!(align, Some(Alignment::AlignCenter));
397    }
398
399    #[test]
400    fn every_inline_emphasis_kind_is_mapped() {
401        let inlines = para_inlines(
402            "<p><em>a</em><b>b</b><del>c</del><u>d</u><sup>e</sup><sub>f</sub><q>g</q></p>",
403        );
404        assert!(matches!(
405            inlines.as_slice(),
406            [
407                Inline::Emph(_),
408                Inline::Strong(_),
409                Inline::Strikeout(_),
410                Inline::Underline(_),
411                Inline::Superscript(_),
412                Inline::Subscript(_),
413                Inline::Quoted(_, _),
414            ]
415        ));
416    }
417
418    #[test]
419    fn class_carrying_inlines_become_spans() {
420        let inlines = para_inlines("<p><mark>m</mark><kbd>k</kbd></p>");
421        let classes: Vec<&str> = inlines
422            .iter()
423            .filter_map(|inline| match inline {
424                Inline::Span(attr, _) => attr.classes.first().map(carta_ast::Text::as_str),
425                _ => None,
426            })
427            .collect();
428        assert_eq!(classes, vec!["mark", "kbd"]);
429    }
430
431    #[test]
432    fn code_variants_force_classes() {
433        let inlines = para_inlines("<p><code>c</code><samp>s</samp><var>v</var></p>");
434        let classes: Vec<Vec<String>> = inlines
435            .iter()
436            .filter_map(|inline| match inline {
437                Inline::Code(attr, _) => {
438                    Some(attr.classes.iter().map(ToString::to_string).collect())
439                }
440                _ => None,
441            })
442            .collect();
443        assert_eq!(
444            classes,
445            vec![
446                Vec::<String>::new(),
447                vec!["sample".to_string()],
448                vec!["variable".to_string()],
449            ]
450        );
451    }
452
453    #[test]
454    fn line_break_element_becomes_line_break() {
455        let inlines = para_inlines("<p>a<br>b</p>");
456        assert!(inlines.contains(&Inline::LineBreak));
457    }
458
459    #[test]
460    fn anchor_with_href_is_a_link() {
461        let inlines = para_inlines(r#"<p><a href="/u" title="T" class="x">t</a></p>"#);
462        let Some(Inline::Link(attr, _, target)) = inlines.first() else {
463            panic!("expected link");
464        };
465        assert_eq!(
466            *target,
467            Box::new(Target {
468                url: "/u".to_string().into(),
469                title: "T".to_string().into()
470            })
471        );
472        assert!(attr.classes.contains(&"x".into()));
473    }
474
475    #[test]
476    fn anchor_with_name_is_a_span_with_id() {
477        let inlines = para_inlines(r#"<p><a name="anchor">t</a></p>"#);
478        let Some(Inline::Span(attr, _)) = inlines.first() else {
479            panic!("expected span");
480        };
481        assert_eq!(attr.id, "anchor");
482    }
483
484    #[test]
485    fn image_reads_src_title_and_alt() {
486        let inlines = para_inlines(r#"<p><img src="a.png" title="T" alt="alt text"></p>"#);
487        let Some(Inline::Image(_, alt, target)) = inlines.first() else {
488            panic!("expected image");
489        };
490        assert_eq!(target.url, "a.png");
491        assert_eq!(target.title, "T");
492        assert_eq!(
493            alt.as_slice(),
494            [
495                Inline::Str("alt".to_string().into()),
496                Inline::Space,
497                Inline::Str("text".to_string().into())
498            ]
499        );
500    }
501
502    #[test]
503    fn unknown_inline_element_is_transparent() {
504        let inlines = para_inlines("<p>a<bogus>b</bogus>c</p>");
505        assert_eq!(inlines.as_slice(), [Inline::Str("abc".to_string().into())]);
506    }
507
508    #[test]
509    fn data_attributes_drop_their_prefix() {
510        let Block::Div(attr, _) = first_block(r#"<div id="d" data-role="note">x</div>"#) else {
511            panic!("expected div");
512        };
513        assert_eq!(attr.id, "d");
514        assert!(
515            attr.attributes
516                .contains(&("role".to_string().into(), "note".to_string().into()))
517        );
518    }
519
520    #[test]
521    fn boolean_and_unquoted_attributes_parse() {
522        let Block::OrderedList(attrs, _) = first_block("<ol reversed start=5><li>a</li></ol>")
523        else {
524            panic!("expected ordered list");
525        };
526        assert_eq!(attrs.start, 5);
527    }
528
529    #[test]
530    fn numeric_and_named_references_decode() {
531        let inlines = para_inlines("<p>&#65;&#x42;&#X43;&copy</p>");
532        assert_eq!(
533            inlines.as_slice(),
534            [Inline::Str("ABC\u{a9}".to_string().into())]
535        );
536    }
537
538    #[test]
539    fn unknown_entity_is_left_verbatim() {
540        let inlines = para_inlines("<p>&notreal;</p>");
541        assert_eq!(
542            inlines.as_slice(),
543            [Inline::Str("&notreal;".to_string().into())]
544        );
545    }
546
547    #[test]
548    fn style_block_is_dropped() {
549        assert!(blocks("<style>p { color: red }</style><p>x</p>").len() == 1);
550    }
551
552    #[test]
553    fn textarea_content_is_read_as_text() {
554        let inlines = para_inlines("<p><textarea>typed &amp; ok</textarea></p>");
555        assert!(
556            inlines
557                .iter()
558                .any(|inline| matches!(inline, Inline::Str(s) if s.contains('&')))
559        );
560    }
561
562    #[test]
563    fn cdata_and_processing_instructions_are_skipped() {
564        let inlines = para_inlines("<p>a<![CDATA[ junk ]]><?pi here?>b</p>");
565        assert_eq!(inlines.as_slice(), [Inline::Str("ab".to_string().into())]);
566    }
567
568    #[test]
569    fn doctype_declaration_is_skipped() {
570        assert!(matches!(
571            first_block("<!DOCTYPE html><p>x</p>"),
572            Block::Para(_)
573        ));
574    }
575
576    #[test]
577    fn stray_less_than_is_literal_text() {
578        let inlines = para_inlines("<p>a < b</p>");
579        assert!(
580            inlines
581                .iter()
582                .any(|inline| matches!(inline, Inline::Str(s) if s.contains('<')))
583        );
584    }
585
586    #[test]
587    fn self_closing_span_has_no_children() {
588        let inlines = para_inlines("<p>a<span/>b</p>");
589        assert!(
590            inlines
591                .iter()
592                .any(|inline| matches!(inline, Inline::Span(_, children) if children.is_empty()))
593        );
594    }
595
596    #[test]
597    fn explicit_id_on_heading_is_preserved() {
598        let Block::Header(_, attr, _) = first_block(r#"<h2 id="custom">Title</h2>"#) else {
599            panic!("expected header");
600        };
601        assert_eq!(attr.id, "custom");
602    }
603
604    #[test]
605    fn line_block_div_becomes_line_block() {
606        let Block::LineBlock(lines) = first_block(r#"<div class="line-block">a<br>b</div>"#) else {
607            panic!("expected line block");
608        };
609        assert_eq!(lines.len(), 2);
610    }
611
612    #[test]
613    fn line_block_div_with_id_stays_div() {
614        assert!(matches!(
615            first_block(r#"<div class="line-block" id="x">a</div>"#),
616            Block::Div(..)
617        ));
618    }
619
620    #[test]
621    fn inline_style_becomes_raw_html() {
622        let inlines = para_inlines("<p>a<style>.x{}</style>b</p>");
623        assert!(inlines.iter().any(|inline| matches!(
624            inline,
625            Inline::RawInline(format, text)
626                if format.0 == "html" && text == "<style>.x{}</style>"
627        )));
628    }
629
630    #[test]
631    fn leading_style_block_is_dropped() {
632        assert!(matches!(
633            blocks("<style>.x{}</style><p>x</p>").as_slice(),
634            [Block::Para(_)]
635        ));
636    }
637
638    #[test]
639    fn style_after_a_block_is_kept_as_a_raw_paragraph() {
640        let result = blocks("<p>a</p>\n<style>.x{}</style>\n<p>b</p>");
641        let [Block::Para(_), Block::Para(mid), Block::Para(_)] = result.as_slice() else {
642            panic!("expected three paragraphs");
643        };
644        assert!(matches!(
645            mid.as_slice(),
646            [Inline::RawInline(format, text)]
647                if format.0 == "html" && text == "<style>.x{}</style>"
648        ));
649    }
650
651    #[test]
652    fn style_directly_adjacent_to_a_block_is_dropped() {
653        assert!(matches!(
654            blocks("<p>a</p><style>.x{}</style><p>b</p>").as_slice(),
655            [Block::Para(_), Block::Para(_)]
656        ));
657    }
658
659    #[test]
660    fn adjacent_styles_share_one_raw_paragraph() {
661        let result = blocks("<p>a</p>\n<style>s1{}</style>\n<style>s2{}</style>\n<p>b</p>");
662        let [_, Block::Para(mid), _] = result.as_slice() else {
663            panic!("expected three paragraphs");
664        };
665        assert!(matches!(
666            mid.as_slice(),
667            [
668                Inline::RawInline(f1, t1),
669                Inline::SoftBreak,
670                Inline::RawInline(f2, t2),
671            ] if f1.0 == "html" && t1 == "<style>s1{}</style>"
672                && f2.0 == "html" && t2 == "<style>s2{}</style>"
673        ));
674    }
675
676    #[test]
677    fn math_script_becomes_inline_math() {
678        let inlines = para_inlines(r#"<p><script type="math/tex">\D</script></p>"#);
679        assert!(matches!(
680            inlines.as_slice(),
681            [Inline::Math(MathType::InlineMath, text)] if text == "\\D"
682        ));
683    }
684
685    #[test]
686    fn display_math_script_becomes_display_math() {
687        let inlines = para_inlines(r#"<p><script type="math/tex; mode=display">\D</script></p>"#);
688        assert!(matches!(
689            inlines.as_slice(),
690            [Inline::Math(MathType::DisplayMath, _)]
691        ));
692    }
693
694    #[test]
695    fn non_math_script_is_dropped() {
696        assert!(blocks("<p><script>run()</script></p>").is_empty());
697    }
698
699    #[test]
700    fn checkbox_in_item_renders_ballot_box() {
701        let Block::BulletList(items) =
702            first_block(r#"<ul><li><input type="checkbox" checked/>do it</li></ul>"#)
703        else {
704            panic!("expected bullet list");
705        };
706        let Some([Block::Plain(inlines)]) = items.first().map(Vec::as_slice) else {
707            panic!("expected one plain block");
708        };
709        assert!(matches!(inlines.first(), Some(Inline::Str(s)) if s == "\u{2612}"));
710    }
711
712    #[test]
713    fn checkbox_outside_item_is_dropped() {
714        let inlines = para_inlines(r#"<p><input type="checkbox"/>text</p>"#);
715        assert_eq!(inlines.as_slice(), [Inline::Str("text".to_string().into())]);
716    }
717
718    #[test]
719    fn paragraph_with_checkbox_demotes_to_plain() {
720        assert!(matches!(
721            first_block(r#"<p><input type="checkbox"/>x</p>"#),
722            Block::Plain(_)
723        ));
724    }
725
726    #[test]
727    fn empty_paragraph_is_dropped() {
728        assert!(blocks("<p>hi</p><p></p><p>lo</p>").len() == 2);
729    }
730
731    #[test]
732    fn consecutive_terms_merge_with_line_break() {
733        let Block::DefinitionList(items) = first_block("<dl><dt>a</dt><dt>b</dt><dd>x</dd></dl>")
734        else {
735            panic!("expected definition list");
736        };
737        let Some((term, _)) = items.first() else {
738            panic!("expected one item");
739        };
740        assert!(term.contains(&Inline::LineBreak));
741    }
742
743    #[test]
744    fn stray_paragraph_in_list_attaches_to_item() {
745        let Block::BulletList(items) = first_block("<ul><li>a</li><p>b</p></ul>") else {
746            panic!("expected bullet list");
747        };
748        assert_eq!(items.len(), 1);
749        assert_eq!(items.first().map(Vec::len), Some(2));
750    }
751
752    #[test]
753    fn native_divs_off_splices_div_children() {
754        let result = read_with("<div class=\"c\"><p>x</p></div>", Extensions::empty());
755        assert!(matches!(result.as_slice(), [Block::Para(_)]));
756    }
757
758    #[test]
759    fn native_divs_off_drops_sectioning_wrapper() {
760        let result = read_with("<section><p>x</p></section>", Extensions::empty());
761        assert!(matches!(result.as_slice(), [Block::Para(_)]));
762    }
763
764    #[test]
765    fn native_spans_off_unwraps_span_and_small_caps() {
766        let plain = read_with("<p><span class=\"c\">x</span></p>", Extensions::empty());
767        let Some(Block::Para(inlines)) = plain.first() else {
768            panic!("expected paragraph");
769        };
770        assert_eq!(inlines.as_slice(), [Inline::Str("x".to_string().into())]);
771
772        let caps = read_with(
773            "<p><span style=\"font-variant: small-caps\">x</span></p>",
774            Extensions::empty(),
775        );
776        let Some(Block::Para(inlines)) = caps.first() else {
777            panic!("expected paragraph");
778        };
779        assert_eq!(inlines.as_slice(), [Inline::Str("x".to_string().into())]);
780    }
781
782    #[test]
783    fn native_spans_off_keeps_class_carrying_inlines() {
784        // `<mark>`/`<kbd>` and friends are their own constructs, not `<span>` elements, so the
785        // toggle leaves them as spans.
786        let result = read_with("<p><mark>m</mark></p>", Extensions::empty());
787        let Some(Block::Para(inlines)) = result.first() else {
788            panic!("expected paragraph");
789        };
790        assert!(matches!(inlines.first(), Some(Inline::Span(_, _))));
791    }
792
793    #[test]
794    fn auto_identifiers_off_leaves_id_empty_but_keeps_explicit() {
795        let generated = read_with("<h1>Hello World</h1>", Extensions::empty());
796        let Some(Block::Header(_, attr, _)) = generated.first() else {
797            panic!("expected header");
798        };
799        assert_eq!(attr.id, "");
800
801        let explicit = read_with("<h2 id=\"keep\">T</h2>", Extensions::empty());
802        let Some(Block::Header(_, attr, _)) = explicit.first() else {
803            panic!("expected header");
804        };
805        assert_eq!(attr.id, "keep");
806    }
807
808    #[test]
809    fn line_blocks_off_keeps_a_plain_div() {
810        let result = read_with(
811            "<div class=\"line-block\">a<br>b</div>",
812            Extensions::from_list(&[Extension::NativeDivs]),
813        );
814        let Some(Block::Div(attr, children)) = result.first() else {
815            panic!("expected div");
816        };
817        assert_eq!(attr.classes, vec!["line-block".to_string()]);
818        assert!(matches!(children.as_slice(), [Block::Plain(_)]));
819    }
820
821    /// Read with the `html` default set plus the given text extensions, which is what `html+smart`
822    /// and the `html+tex_math_*` corpus specs resolve to.
823    fn read_with_text_ext(input: &str, added: &[Extension]) -> Vec<Block> {
824        read_with(input, html_defaults().union(Extensions::from_list(added)))
825    }
826
827    fn para_inlines_ext(input: &str, added: &[Extension]) -> Vec<Inline> {
828        match read_with_text_ext(input, added).into_iter().next() {
829            Some(Block::Para(inlines) | Block::Plain(inlines)) => inlines,
830            other => panic!("expected a paragraph, got {other:?}"),
831        }
832    }
833
834    #[test]
835    fn smart_off_keeps_literal_punctuation() {
836        let inlines = para_inlines("<p>\"a\" -- ... ---</p>");
837        assert_eq!(
838            inlines.as_slice(),
839            [
840                Inline::Str("\"a\"".to_string().into()),
841                Inline::Space,
842                Inline::Str("--".to_string().into()),
843                Inline::Space,
844                Inline::Str("...".to_string().into()),
845                Inline::Space,
846                Inline::Str("---".to_string().into()),
847            ]
848        );
849    }
850
851    #[test]
852    fn smart_on_curls_quotes_and_folds_dashes() {
853        let inlines = para_inlines_ext("<p>\"a\" -- ... ---</p>", &[Extension::Smart]);
854        assert_eq!(
855            inlines.as_slice(),
856            [
857                Inline::Quoted(
858                    carta_ast::QuoteType::DoubleQuote,
859                    vec![Inline::Str("a".to_string().into())]
860                ),
861                Inline::Space,
862                Inline::Str("\u{2013}".to_string().into()),
863                Inline::Space,
864                Inline::Str("\u{2026}".to_string().into()),
865                Inline::Space,
866                Inline::Str("\u{2014}".to_string().into()),
867            ]
868        );
869    }
870
871    #[test]
872    fn tex_math_dollars_off_keeps_literal_text() {
873        let inlines = para_inlines("<p>$x^2$ and $$y$$</p>");
874        assert_eq!(
875            inlines.as_slice(),
876            [
877                Inline::Str("$x^2$".to_string().into()),
878                Inline::Space,
879                Inline::Str("and".to_string().into()),
880                Inline::Space,
881                Inline::Str("$$y$$".to_string().into()),
882            ]
883        );
884    }
885
886    #[test]
887    fn tex_math_dollars_on_splits_inline_and_display() {
888        let inlines = para_inlines_ext("<p>$x^2$ and $$y$$</p>", &[Extension::TexMathDollars]);
889        assert_eq!(
890            inlines.as_slice(),
891            [
892                Inline::Math(MathType::InlineMath, "x^2".to_string().into()),
893                Inline::Space,
894                Inline::Str("and".to_string().into()),
895                Inline::Space,
896                Inline::Math(MathType::DisplayMath, "y".to_string().into()),
897            ]
898        );
899    }
900
901    #[test]
902    fn tex_math_single_backslash_on_splits_inline_and_display() {
903        let inlines = para_inlines_ext(
904            "<p>\\(x\\) and \\[y\\]</p>",
905            &[Extension::TexMathSingleBackslash],
906        );
907        assert_eq!(
908            inlines.as_slice(),
909            [
910                Inline::Math(MathType::InlineMath, "x".to_string().into()),
911                Inline::Space,
912                Inline::Str("and".to_string().into()),
913                Inline::Space,
914                Inline::Math(MathType::DisplayMath, "y".to_string().into()),
915            ]
916        );
917    }
918
919    #[test]
920    fn tex_math_double_backslash_on_splits_inline_and_display() {
921        let inlines = para_inlines_ext(
922            "<p>\\\\(x\\\\) and \\\\[y\\\\]</p>",
923            &[Extension::TexMathDoubleBackslash],
924        );
925        assert_eq!(
926            inlines.as_slice(),
927            [
928                Inline::Math(MathType::InlineMath, "x".to_string().into()),
929                Inline::Space,
930                Inline::Str("and".to_string().into()),
931                Inline::Space,
932                Inline::Math(MathType::DisplayMath, "y".to_string().into()),
933            ]
934        );
935    }
936
937    #[test]
938    fn note_reference_reconstructs_body_and_drops_container() {
939        let result = blocks(concat!(
940            "text<a href=\"#fn1\" class=\"footnote-ref\" role=\"doc-noteref\"><sup>1</sup></a>\n",
941            "<section class=\"footnotes\" role=\"doc-endnotes\"><hr /><ol>",
942            "<li id=\"fn1\"><p>the note",
943            "<a href=\"#fnref1\" class=\"footnote-back\" role=\"doc-backlink\">\u{21a9}</a></p></li>",
944            "</ol></section>",
945        ));
946        assert_eq!(
947            result.as_slice(),
948            [Block::Plain(vec![
949                Inline::Str("text".to_string().into()),
950                Inline::Note(vec![Block::Para(vec![
951                    Inline::Str("the".to_string().into()),
952                    Inline::Space,
953                    Inline::Str("note".to_string().into()),
954                ])]),
955            ])]
956        );
957    }
958
959    #[test]
960    fn unmatched_note_reference_becomes_an_empty_note() {
961        let result = blocks("text<a href=\"#missing\" role=\"doc-noteref\"><sup>1</sup></a>");
962        assert_eq!(
963            result.as_slice(),
964            [Block::Plain(vec![
965                Inline::Str("text".to_string().into()),
966                Inline::Note(Vec::new()),
967            ])]
968        );
969    }
970
971    fn header_ids(input: &str, added: &[Extension]) -> Vec<String> {
972        read_with_text_ext(input, added)
973            .into_iter()
974            .filter_map(|block| match block {
975                Block::Header(_, attr, _) => Some(attr.id.to_string()),
976                _ => None,
977            })
978            .collect()
979    }
980
981    #[test]
982    fn gfm_auto_identifiers_drops_dots_keeps_digits_and_does_not_collapse() {
983        // The `gfm_auto_identifiers` slug differs from the default: dots are dropped, leading digits
984        // survive, and removed punctuation leaves its surrounding separators (no run collapsing).
985        let ids = header_ids(
986            "<h2>1.2 Section A.B</h2><h2>Tools &amp; Tips</h2>",
987            &[Extension::GfmAutoIdentifiers],
988        );
989        assert_eq!(ids, vec!["12-section-ab", "tools--tips"]);
990    }
991
992    #[test]
993    fn gfm_auto_identifiers_keep_the_section_fallback_and_increment_on_collision() {
994        let ids = header_ids(
995            "<h2>Repeat</h2><h2>Repeat</h2><h3>!!!</h3>",
996            &[Extension::GfmAutoIdentifiers],
997        );
998        assert_eq!(ids, vec!["repeat", "repeat-1", "section"]);
999    }
1000
1001    #[test]
1002    fn gfm_auto_identifiers_need_auto_identifiers_to_take_effect() {
1003        let ids = read_with(
1004            "<h2>1.2 Section A.B</h2>",
1005            Extensions::from_list(&[Extension::GfmAutoIdentifiers]),
1006        )
1007        .into_iter()
1008        .filter_map(|block| match block {
1009            Block::Header(_, attr, _) => Some(attr.id.to_string()),
1010            _ => None,
1011        })
1012        .collect::<Vec<_>>();
1013        assert_eq!(ids, vec![String::new()]);
1014    }
1015
1016    #[test]
1017    fn repeated_headings_resume_probing_from_the_last_issued_suffix() {
1018        let ids = header_ids("<h2>Same</h2><h2>Same</h2><h2>Same</h2><h2>Same</h2>", &[]);
1019        assert_eq!(ids, vec!["same", "same-1", "same-2", "same-3"]);
1020    }
1021
1022    #[test]
1023    fn repeated_headings_skip_an_id_reserved_by_an_explicit_heading() {
1024        let ids = header_ids(
1025            "<h2 id=\"same-2\">Explicit</h2><h2>Same</h2><h2>Same</h2><h2>Same</h2>",
1026            &[],
1027        );
1028        assert_eq!(ids, vec!["same-2", "same", "same-1", "same-3"]);
1029    }
1030
1031    #[cfg(feature = "opml")]
1032    #[test]
1033    fn inline_fragment_parses_markup_and_trims_edges() {
1034        let inlines = super::parse_inline_fragment("  <strong>a</strong> b <code>c</code>  ");
1035        assert_eq!(
1036            inlines,
1037            vec![
1038                Inline::Strong(vec![Inline::Str("a".to_string().into())]),
1039                Inline::Space,
1040                Inline::Str("b".to_string().into()),
1041                Inline::Space,
1042                Inline::Code(Box::default(), "c".to_string().into()),
1043            ]
1044        );
1045    }
1046
1047    #[cfg(feature = "opml")]
1048    #[test]
1049    fn inline_fragment_resolves_character_references() {
1050        let inlines = super::parse_inline_fragment("a &amp; b");
1051        assert_eq!(
1052            inlines,
1053            vec![
1054                Inline::Str("a".to_string().into()),
1055                Inline::Space,
1056                Inline::Str("&".to_string().into()),
1057                Inline::Space,
1058                Inline::Str("b".to_string().into()),
1059            ]
1060        );
1061    }
1062
1063    #[cfg(feature = "opml")]
1064    fn raw(tag: &str) -> Inline {
1065        Inline::RawInline(
1066            carta_ast::Format("html".to_string().into()),
1067            tag.to_string().into(),
1068        )
1069    }
1070
1071    #[cfg(feature = "opml")]
1072    #[test]
1073    fn inline_fragment_preserves_an_unrecognized_tag_verbatim() {
1074        let inlines = super::parse_inline_fragment("<cite>Book</cite>");
1075        assert_eq!(
1076            inlines,
1077            vec![
1078                raw("<cite>"),
1079                Inline::Str("Book".to_string().into()),
1080                raw("</cite>")
1081            ]
1082        );
1083    }
1084
1085    #[cfg(feature = "opml")]
1086    #[test]
1087    fn inline_fragment_keeps_unknown_tag_attributes() {
1088        let inlines = super::parse_inline_fragment("<time datetime=\"2020\">y</time>");
1089        assert_eq!(
1090            inlines,
1091            vec![
1092                raw("<time datetime=\"2020\">"),
1093                Inline::Str("y".to_string().into()),
1094                raw("</time>"),
1095            ]
1096        );
1097    }
1098
1099    #[cfg(feature = "opml")]
1100    #[test]
1101    fn inline_fragment_escapes_attribute_values_and_emits_bare_boolean() {
1102        let inlines = super::parse_inline_fragment("<x-foo a=\"1<2&3\" hidden>z</x-foo>");
1103        assert_eq!(
1104            inlines,
1105            vec![
1106                raw("<x-foo a=\"1&lt;2&amp;3\" hidden>"),
1107                Inline::Str("z".to_string().into()),
1108                raw("</x-foo>"),
1109            ]
1110        );
1111    }
1112
1113    #[cfg(feature = "opml")]
1114    #[test]
1115    fn inline_fragment_lowercases_an_unknown_tag_name() {
1116        let inlines = super::parse_inline_fragment("<CITE>b</CITE>");
1117        assert_eq!(
1118            inlines,
1119            vec![
1120                raw("<cite>"),
1121                Inline::Str("b".to_string().into()),
1122                raw("</cite>")
1123            ]
1124        );
1125    }
1126
1127    #[cfg(feature = "opml")]
1128    #[test]
1129    fn inline_fragment_void_unknown_tag_is_a_single_raw_inline() {
1130        let inlines = super::parse_inline_fragment("a <wbr> b");
1131        assert_eq!(
1132            inlines,
1133            vec![
1134                Inline::Str("a".to_string().into()),
1135                Inline::Space,
1136                raw("<wbr>"),
1137                Inline::Space,
1138                Inline::Str("b".to_string().into()),
1139            ]
1140        );
1141    }
1142
1143    #[cfg(feature = "opml")]
1144    #[test]
1145    fn inline_fragment_self_closing_unknown_tag_pairs_open_and_close() {
1146        let inlines = super::parse_inline_fragment("<custom-tag/>");
1147        assert_eq!(inlines, vec![raw("<custom-tag>"), raw("</custom-tag>")]);
1148    }
1149
1150    #[cfg(feature = "opml")]
1151    #[test]
1152    fn inline_fragment_unclosed_unknown_tag_omits_the_close() {
1153        let inlines = super::parse_inline_fragment("a <cite>open-only");
1154        assert_eq!(
1155            inlines,
1156            vec![
1157                Inline::Str("a".to_string().into()),
1158                Inline::Space,
1159                raw("<cite>"),
1160                Inline::Str("open-only".to_string().into()),
1161            ]
1162        );
1163    }
1164
1165    #[cfg(feature = "opml")]
1166    #[test]
1167    fn inline_fragment_stray_unknown_end_tag_is_preserved() {
1168        let inlines = super::parse_inline_fragment("</cite> tail");
1169        assert_eq!(
1170            inlines,
1171            vec![
1172                raw("</cite>"),
1173                Inline::Space,
1174                Inline::Str("tail".to_string().into()),
1175            ]
1176        );
1177    }
1178
1179    #[cfg(feature = "opml")]
1180    #[test]
1181    fn inline_fragment_unknown_tag_wraps_recognized_inner_markup() {
1182        let inlines = super::parse_inline_fragment("<cite><em>x</em></cite>");
1183        assert_eq!(
1184            inlines,
1185            vec![
1186                raw("<cite>"),
1187                Inline::Emph(vec![Inline::Str("x".to_string().into())]),
1188                raw("</cite>"),
1189            ]
1190        );
1191    }
1192
1193    #[cfg(feature = "opml")]
1194    #[test]
1195    fn inline_fragment_recognized_tags_keep_structural_mapping() {
1196        let inlines = super::parse_inline_fragment("<em>e</em> <strong>s</strong> <sup>2</sup>");
1197        assert_eq!(
1198            inlines,
1199            vec![
1200                Inline::Emph(vec![Inline::Str("e".to_string().into())]),
1201                Inline::Space,
1202                Inline::Strong(vec![Inline::Str("s".to_string().into())]),
1203                Inline::Space,
1204                Inline::Superscript(vec![Inline::Str("2".to_string().into())]),
1205            ]
1206        );
1207    }
1208}