1mod classify;
9mod convert;
10mod notes;
11mod table;
12mod tokenize;
13mod tree;
14
15use std::borrow::Cow;
16
17use carta_ast::Document;
18use carta_core::{Extensions, Reader, ReaderOptions, Result};
19
20#[cfg(feature = "opml")]
21use carta_ast::Inline;
22
23#[cfg(feature = "opml")]
24use convert::inlines_from_nodes;
25use convert::{Converter, extract_meta};
26use tokenize::tokenize;
27use tree::{build_tree, locate};
28
/// Stateless reader that turns HTML text into a [`Document`].
///
/// The type carries no data; all per-call configuration comes from the
/// `ReaderOptions` handed to [`Reader::read`].
#[derive(Clone, Copy, Debug, Default)]
pub struct HtmlReader;
32
33impl Reader for HtmlReader {
34 fn read(&self, input: &str, options: &ReaderOptions) -> Result<Document> {
35 Ok(parse(input, options.extensions))
36 }
37}
38
39fn parse(input: &str, ext: Extensions) -> Document {
40 let normalized = normalize(input);
41 let chars: Vec<char> = normalized.chars().collect();
42 let tokens = tokenize(&chars);
43 let roots = build_tree(tokens);
44 let (head, body) = locate(&roots);
45
46 let mut converter = Converter::new(ext);
47 converter.index_notes(notes::collect_note_defs(&body));
48 let meta = head.map(extract_meta).unwrap_or_default();
49 let blocks = converter.blocks(&body, false);
50 Document {
51 meta: meta.into_iter().map(|(k, v)| (k.into(), v)).collect(),
52 blocks,
53 ..Document::default()
54 }
55}
56
/// Parses `input` as a fragment of inline HTML and returns its inlines.
///
/// Uses the same normalize / tokenize / tree-building front end as `parse`,
/// but hands the root nodes directly to `inlines_from_nodes` instead of
/// locating a head and body.
#[cfg(feature = "opml")]
pub(crate) fn parse_inline_fragment(input: &str) -> Vec<Inline> {
    let source = normalize(input);
    let chars = source.chars().collect::<Vec<char>>();
    let roots = build_tree(tokenize(&chars));
    inlines_from_nodes(&roots)
}
69
/// Strips one leading byte-order mark and rewrites every `\r\n` pair and every
/// lone `\r` as a single `\n`.
///
/// Input that contains no `\r` is returned borrowed (minus the BOM); a new
/// string is allocated only when a line ending actually has to be rewritten.
fn normalize(input: &str) -> Cow<'_, str> {
    let text = input.strip_prefix('\u{feff}').unwrap_or(input);
    if !text.contains('\r') {
        return Cow::Borrowed(text);
    }

    let mut segments = text.split('\r');
    let mut out = String::with_capacity(text.len());

    // `split` always yields at least one piece: the text before the first `\r`.
    if let Some(head) = segments.next() {
        out.push_str(head);
    }
    // Every further piece started right after a `\r`. That `\r` becomes `\n`,
    // and a `\n` directly following it belonged to the same CRLF pair, so it
    // is dropped.
    for segment in segments {
        out.push('\n');
        out.push_str(segment.strip_prefix('\n').unwrap_or(segment));
    }
    Cow::Owned(out)
}
91
92#[cfg(test)]
93mod tests {
94 use super::HtmlReader;
95 use carta_ast::{Block, Inline, MathType};
96 use carta_core::{Extension, Extensions, Reader, ReaderOptions};
97
98 fn html_defaults() -> Extensions {
101 Extensions::from_list(&[
102 Extension::AutoIdentifiers,
103 Extension::LineBlocks,
104 Extension::NativeDivs,
105 Extension::NativeSpans,
106 ])
107 }
108
109 fn read_with(input: &str, extensions: Extensions) -> Vec<Block> {
110 let mut options = ReaderOptions::default();
111 options.extensions = extensions;
112 HtmlReader
113 .read(input, &options)
114 .expect("reader should not fail")
115 .blocks
116 }
117
118 fn blocks(input: &str) -> Vec<Block> {
119 read_with(input, html_defaults())
120 }
121
122 #[test]
123 fn paragraph_with_emphasis() {
124 let result = blocks("<p>a <em>b</em></p>");
125 assert!(matches!(result.as_slice(), [Block::Para(_)]));
126 }
127
128 #[test]
129 fn loose_text_is_plain() {
130 assert!(matches!(blocks("hello").as_slice(), [Block::Plain(_)]));
131 }
132
133 #[test]
134 fn paragraph_sibling_promotes_loose_text() {
135 let result = blocks("loose<p>para</p>");
136 assert!(matches!(
137 result.as_slice(),
138 [Block::Para(_), Block::Para(_)]
139 ));
140 }
141
142 #[test]
143 fn horizontal_rule_does_not_promote() {
144 let result = blocks("loose<hr>");
145 assert!(matches!(
146 result.as_slice(),
147 [Block::Plain(_), Block::HorizontalRule]
148 ));
149 }
150
151 #[test]
152 fn nested_list_inside_item_stays_tight() {
153 let result = blocks("<ul><li>a<ul><li>b</li></ul></li></ul>");
154 let Some(Block::BulletList(items)) = result.first() else {
155 panic!("expected bullet list");
156 };
157 let Some(item) = items.first() else {
158 panic!("expected one item");
159 };
160 assert!(matches!(item.first(), Some(Block::Plain(_))));
161 }
162
163 #[test]
164 fn heading_generates_identifier() {
165 let result = blocks("<h1>Hello World</h1>");
166 let Some(Block::Header(level, attr, _)) = result.first() else {
167 panic!("expected header");
168 };
169 assert_eq!(*level, 1);
170 assert_eq!(attr.id, "hello-world");
171 }
172
173 #[test]
174 fn duplicate_identifiers_are_disambiguated() {
175 let result = blocks("<h1>Sec</h1><h2>Sec</h2>");
176 let ids: Vec<&str> = result
177 .iter()
178 .filter_map(|block| match block {
179 Block::Header(_, attr, _) => Some(attr.id.as_str()),
180 _ => None,
181 })
182 .collect();
183 assert_eq!(ids, vec!["sec", "sec-1"]);
184 }
185
186 #[test]
187 fn entities_are_decoded() {
188 let result = blocks("<p>a & b © c</p>");
189 let Some(Block::Para(inlines)) = result.first() else {
190 panic!("expected paragraph");
191 };
192 assert!(inlines.contains(&Inline::Str("&".to_string().into())));
193 assert!(inlines.contains(&Inline::Str("\u{a9}".to_string().into())));
194 }
195
196 #[test]
197 fn comment_joins_surrounding_text() {
198 let result = blocks("<p>a<!-- c -->b</p>");
199 let Some(Block::Para(inlines)) = result.first() else {
200 panic!("expected paragraph");
201 };
202 assert_eq!(inlines.as_slice(), [Inline::Str("ab".to_string().into())]);
203 }
204
205 #[test]
206 fn script_content_is_dropped() {
207 assert!(blocks("<script>var x = 1;</script><p>p</p>").len() == 1);
208 }
209
210 #[test]
211 fn head_metadata_is_extracted() {
212 let document = HtmlReader
213 .read(
214 "<head><title>T</title><meta name=\"author\" content=\"A\"></head><body><p>b</p></body>",
215 &ReaderOptions::default(),
216 )
217 .expect("reader should not fail");
218 assert!(document.meta.contains_key("title"));
219 assert!(document.meta.contains_key("author"));
220 }
221
222 use carta_ast::{Alignment, ColWidth, ListNumberStyle, Target};
223
224 fn first_block(input: &str) -> Block {
225 blocks(input).into_iter().next().expect("a block")
226 }
227
228 fn para_inlines(input: &str) -> Vec<Inline> {
229 match first_block(input) {
230 Block::Para(inlines) | Block::Plain(inlines) => inlines,
231 other => panic!("expected a paragraph, got {other:?}"),
232 }
233 }
234
235 #[test]
236 fn normalizes_crlf_and_strips_bom() {
237 let inlines = para_inlines("\u{feff}<p>a\r\nb</p>");
238 assert_eq!(
239 inlines.as_slice(),
240 [
241 Inline::Str("a".to_string().into()),
242 Inline::SoftBreak,
243 Inline::Str("b".to_string().into())
244 ]
245 );
246 }
247
248 #[test]
249 fn ordered_list_reads_type_and_start() {
250 let Block::OrderedList(attrs, items) =
251 first_block(r#"<ol type="A" start="3"><li>x</li><li>y</li></ol>"#)
252 else {
253 panic!("expected ordered list");
254 };
255 assert_eq!(attrs.start, 3);
256 assert_eq!(attrs.style, ListNumberStyle::UpperAlpha);
257 assert_eq!(items.len(), 2);
258 }
259
260 #[test]
261 fn menu_is_a_bullet_list() {
262 assert!(matches!(
263 first_block("<menu><li>a</li></menu>"),
264 Block::BulletList(_)
265 ));
266 }
267
268 #[test]
269 fn implied_li_close_splits_items() {
270 let Block::BulletList(items) = first_block("<ul><li>a<li>b</ul>") else {
271 panic!("expected bullet list");
272 };
273 assert_eq!(items.len(), 2);
274 }
275
276 #[test]
277 fn pre_with_code_language_class_becomes_code_block() {
278 let Block::CodeBlock(attr, text) = first_block(
279 r#"<pre><code class="language-rust">let x = 1;
280</code></pre>"#,
281 ) else {
282 panic!("expected code block");
283 };
284 assert_eq!(attr.classes, vec!["rust".to_string()]);
285 assert_eq!(text, "let x = 1;");
286 }
287
288 #[test]
289 fn definition_list_pairs_terms_and_definitions() {
290 let Block::DefinitionList(items) =
291 first_block("<dl><dt>term</dt><dd>one</dd><dd>two</dd></dl>")
292 else {
293 panic!("expected definition list");
294 };
295 let (term, defs) = items.into_iter().next().expect("an item");
296 assert_eq!(term, vec![Inline::Str("term".to_string().into())]);
297 assert_eq!(defs.len(), 2);
298 }
299
300 #[test]
301 fn blockquote_wraps_child_blocks() {
302 assert!(matches!(
303 first_block("<blockquote><p>q</p></blockquote>"),
304 Block::BlockQuote(_)
305 ));
306 }
307
308 #[test]
309 fn sectioning_div_gets_a_class() {
310 let Block::Div(attr, _) = first_block("<section><p>x</p></section>") else {
311 panic!("expected div");
312 };
313 assert!(attr.classes.contains(&"section".into()));
314 }
315
316 #[test]
317 fn figure_separates_caption_from_content() {
318 let Block::Figure(_, caption, content) =
319 first_block("<figure><img src=\"a.png\"><figcaption>cap</figcaption></figure>")
320 else {
321 panic!("expected figure");
322 };
323 assert_eq!(caption.short, None);
324 assert!(!caption.long.is_empty());
325 assert!(!content.is_empty());
326 }
327
328 #[test]
329 fn table_reads_sections_alignment_and_spans() {
330 let input = r#"<table>
331 <caption>cap</caption>
332 <colgroup><col style="width: 25%"><col></colgroup>
333 <thead><tr><th align="right">H1</th><th>H2</th></tr></thead>
334 <tbody><tr><td colspan="2">wide</td></tr></tbody>
335 <tfoot><tr><td>f1</td><td>f2</td></tr></tfoot>
336 </table>"#;
337 let Block::Table(table) = first_block(input) else {
338 panic!("expected table");
339 };
340 assert_eq!(table.col_specs.len(), 2);
341 assert_eq!(
342 table.col_specs.first().map(|spec| spec.width.clone()),
343 Some(ColWidth::ColWidth(0.25))
344 );
345 assert_eq!(
346 table
347 .head
348 .rows
349 .first()
350 .and_then(|row| row.cells.first())
351 .map(|cell| cell.align.clone()),
352 Some(Alignment::AlignRight)
353 );
354 let body_cell_span = table
355 .bodies
356 .first()
357 .and_then(|body| body.body.first())
358 .and_then(|row| row.cells.first())
359 .map(|cell| cell.col_span);
360 assert_eq!(body_cell_span, Some(2));
361 assert_eq!(table.foot.rows.len(), 1);
362 }
363
364 #[test]
365 fn oversized_cell_spans_are_clamped() {
366 let input = r#"<table><tr><td colspan="90000000" rowspan="2">x</td></tr><tr><td>y</td></tr></table>"#;
371 let Block::Table(table) = first_block(input) else {
372 panic!("expected table");
373 };
374 let cell_span = table
375 .bodies
376 .first()
377 .and_then(|body| body.body.first())
378 .and_then(|row| row.cells.first())
379 .map(|cell| (cell.col_span, cell.row_span));
380 assert_eq!(cell_span, Some((1000, 2)));
381 }
382
383 #[test]
384 fn cell_alignment_reads_text_align_style() {
385 let Block::Table(table) =
386 first_block(r#"<table><tr><td style="text-align: center">c</td></tr></table>"#)
387 else {
388 panic!("expected table");
389 };
390 let align = table
391 .bodies
392 .first()
393 .and_then(|body| body.body.first())
394 .and_then(|row| row.cells.first())
395 .map(|cell| cell.align.clone());
396 assert_eq!(align, Some(Alignment::AlignCenter));
397 }
398
399 #[test]
400 fn every_inline_emphasis_kind_is_mapped() {
401 let inlines = para_inlines(
402 "<p><em>a</em><b>b</b><del>c</del><u>d</u><sup>e</sup><sub>f</sub><q>g</q></p>",
403 );
404 assert!(matches!(
405 inlines.as_slice(),
406 [
407 Inline::Emph(_),
408 Inline::Strong(_),
409 Inline::Strikeout(_),
410 Inline::Underline(_),
411 Inline::Superscript(_),
412 Inline::Subscript(_),
413 Inline::Quoted(_, _),
414 ]
415 ));
416 }
417
418 #[test]
419 fn class_carrying_inlines_become_spans() {
420 let inlines = para_inlines("<p><mark>m</mark><kbd>k</kbd></p>");
421 let classes: Vec<&str> = inlines
422 .iter()
423 .filter_map(|inline| match inline {
424 Inline::Span(attr, _) => attr.classes.first().map(carta_ast::Text::as_str),
425 _ => None,
426 })
427 .collect();
428 assert_eq!(classes, vec!["mark", "kbd"]);
429 }
430
431 #[test]
432 fn code_variants_force_classes() {
433 let inlines = para_inlines("<p><code>c</code><samp>s</samp><var>v</var></p>");
434 let classes: Vec<Vec<String>> = inlines
435 .iter()
436 .filter_map(|inline| match inline {
437 Inline::Code(attr, _) => {
438 Some(attr.classes.iter().map(ToString::to_string).collect())
439 }
440 _ => None,
441 })
442 .collect();
443 assert_eq!(
444 classes,
445 vec![
446 Vec::<String>::new(),
447 vec!["sample".to_string()],
448 vec!["variable".to_string()],
449 ]
450 );
451 }
452
453 #[test]
454 fn line_break_element_becomes_line_break() {
455 let inlines = para_inlines("<p>a<br>b</p>");
456 assert!(inlines.contains(&Inline::LineBreak));
457 }
458
459 #[test]
460 fn anchor_with_href_is_a_link() {
461 let inlines = para_inlines(r#"<p><a href="/u" title="T" class="x">t</a></p>"#);
462 let Some(Inline::Link(attr, _, target)) = inlines.first() else {
463 panic!("expected link");
464 };
465 assert_eq!(
466 *target,
467 Box::new(Target {
468 url: "/u".to_string().into(),
469 title: "T".to_string().into()
470 })
471 );
472 assert!(attr.classes.contains(&"x".into()));
473 }
474
475 #[test]
476 fn anchor_with_name_is_a_span_with_id() {
477 let inlines = para_inlines(r#"<p><a name="anchor">t</a></p>"#);
478 let Some(Inline::Span(attr, _)) = inlines.first() else {
479 panic!("expected span");
480 };
481 assert_eq!(attr.id, "anchor");
482 }
483
484 #[test]
485 fn image_reads_src_title_and_alt() {
486 let inlines = para_inlines(r#"<p><img src="a.png" title="T" alt="alt text"></p>"#);
487 let Some(Inline::Image(_, alt, target)) = inlines.first() else {
488 panic!("expected image");
489 };
490 assert_eq!(target.url, "a.png");
491 assert_eq!(target.title, "T");
492 assert_eq!(
493 alt.as_slice(),
494 [
495 Inline::Str("alt".to_string().into()),
496 Inline::Space,
497 Inline::Str("text".to_string().into())
498 ]
499 );
500 }
501
502 #[test]
503 fn unknown_inline_element_is_transparent() {
504 let inlines = para_inlines("<p>a<bogus>b</bogus>c</p>");
505 assert_eq!(inlines.as_slice(), [Inline::Str("abc".to_string().into())]);
506 }
507
508 #[test]
509 fn data_attributes_drop_their_prefix() {
510 let Block::Div(attr, _) = first_block(r#"<div id="d" data-role="note">x</div>"#) else {
511 panic!("expected div");
512 };
513 assert_eq!(attr.id, "d");
514 assert!(
515 attr.attributes
516 .contains(&("role".to_string().into(), "note".to_string().into()))
517 );
518 }
519
520 #[test]
521 fn boolean_and_unquoted_attributes_parse() {
522 let Block::OrderedList(attrs, _) = first_block("<ol reversed start=5><li>a</li></ol>")
523 else {
524 panic!("expected ordered list");
525 };
526 assert_eq!(attrs.start, 5);
527 }
528
529 #[test]
530 fn numeric_and_named_references_decode() {
531 let inlines = para_inlines("<p>ABC©</p>");
532 assert_eq!(
533 inlines.as_slice(),
534 [Inline::Str("ABC\u{a9}".to_string().into())]
535 );
536 }
537
538 #[test]
539 fn unknown_entity_is_left_verbatim() {
540 let inlines = para_inlines("<p>¬real;</p>");
541 assert_eq!(
542 inlines.as_slice(),
543 [Inline::Str("¬real;".to_string().into())]
544 );
545 }
546
547 #[test]
548 fn style_block_is_dropped() {
549 assert!(blocks("<style>p { color: red }</style><p>x</p>").len() == 1);
550 }
551
552 #[test]
553 fn textarea_content_is_read_as_text() {
554 let inlines = para_inlines("<p><textarea>typed & ok</textarea></p>");
555 assert!(
556 inlines
557 .iter()
558 .any(|inline| matches!(inline, Inline::Str(s) if s.contains('&')))
559 );
560 }
561
562 #[test]
563 fn cdata_and_processing_instructions_are_skipped() {
564 let inlines = para_inlines("<p>a<![CDATA[ junk ]]><?pi here?>b</p>");
565 assert_eq!(inlines.as_slice(), [Inline::Str("ab".to_string().into())]);
566 }
567
568 #[test]
569 fn doctype_declaration_is_skipped() {
570 assert!(matches!(
571 first_block("<!DOCTYPE html><p>x</p>"),
572 Block::Para(_)
573 ));
574 }
575
576 #[test]
577 fn stray_less_than_is_literal_text() {
578 let inlines = para_inlines("<p>a < b</p>");
579 assert!(
580 inlines
581 .iter()
582 .any(|inline| matches!(inline, Inline::Str(s) if s.contains('<')))
583 );
584 }
585
586 #[test]
587 fn self_closing_span_has_no_children() {
588 let inlines = para_inlines("<p>a<span/>b</p>");
589 assert!(
590 inlines
591 .iter()
592 .any(|inline| matches!(inline, Inline::Span(_, children) if children.is_empty()))
593 );
594 }
595
596 #[test]
597 fn explicit_id_on_heading_is_preserved() {
598 let Block::Header(_, attr, _) = first_block(r#"<h2 id="custom">Title</h2>"#) else {
599 panic!("expected header");
600 };
601 assert_eq!(attr.id, "custom");
602 }
603
604 #[test]
605 fn line_block_div_becomes_line_block() {
606 let Block::LineBlock(lines) = first_block(r#"<div class="line-block">a<br>b</div>"#) else {
607 panic!("expected line block");
608 };
609 assert_eq!(lines.len(), 2);
610 }
611
612 #[test]
613 fn line_block_div_with_id_stays_div() {
614 assert!(matches!(
615 first_block(r#"<div class="line-block" id="x">a</div>"#),
616 Block::Div(..)
617 ));
618 }
619
620 #[test]
621 fn inline_style_becomes_raw_html() {
622 let inlines = para_inlines("<p>a<style>.x{}</style>b</p>");
623 assert!(inlines.iter().any(|inline| matches!(
624 inline,
625 Inline::RawInline(format, text)
626 if format.0 == "html" && text == "<style>.x{}</style>"
627 )));
628 }
629
630 #[test]
631 fn leading_style_block_is_dropped() {
632 assert!(matches!(
633 blocks("<style>.x{}</style><p>x</p>").as_slice(),
634 [Block::Para(_)]
635 ));
636 }
637
638 #[test]
639 fn style_after_a_block_is_kept_as_a_raw_paragraph() {
640 let result = blocks("<p>a</p>\n<style>.x{}</style>\n<p>b</p>");
641 let [Block::Para(_), Block::Para(mid), Block::Para(_)] = result.as_slice() else {
642 panic!("expected three paragraphs");
643 };
644 assert!(matches!(
645 mid.as_slice(),
646 [Inline::RawInline(format, text)]
647 if format.0 == "html" && text == "<style>.x{}</style>"
648 ));
649 }
650
651 #[test]
652 fn style_directly_adjacent_to_a_block_is_dropped() {
653 assert!(matches!(
654 blocks("<p>a</p><style>.x{}</style><p>b</p>").as_slice(),
655 [Block::Para(_), Block::Para(_)]
656 ));
657 }
658
659 #[test]
660 fn adjacent_styles_share_one_raw_paragraph() {
661 let result = blocks("<p>a</p>\n<style>s1{}</style>\n<style>s2{}</style>\n<p>b</p>");
662 let [_, Block::Para(mid), _] = result.as_slice() else {
663 panic!("expected three paragraphs");
664 };
665 assert!(matches!(
666 mid.as_slice(),
667 [
668 Inline::RawInline(f1, t1),
669 Inline::SoftBreak,
670 Inline::RawInline(f2, t2),
671 ] if f1.0 == "html" && t1 == "<style>s1{}</style>"
672 && f2.0 == "html" && t2 == "<style>s2{}</style>"
673 ));
674 }
675
676 #[test]
677 fn math_script_becomes_inline_math() {
678 let inlines = para_inlines(r#"<p><script type="math/tex">\D</script></p>"#);
679 assert!(matches!(
680 inlines.as_slice(),
681 [Inline::Math(MathType::InlineMath, text)] if text == "\\D"
682 ));
683 }
684
685 #[test]
686 fn display_math_script_becomes_display_math() {
687 let inlines = para_inlines(r#"<p><script type="math/tex; mode=display">\D</script></p>"#);
688 assert!(matches!(
689 inlines.as_slice(),
690 [Inline::Math(MathType::DisplayMath, _)]
691 ));
692 }
693
694 #[test]
695 fn non_math_script_is_dropped() {
696 assert!(blocks("<p><script>run()</script></p>").is_empty());
697 }
698
699 #[test]
700 fn checkbox_in_item_renders_ballot_box() {
701 let Block::BulletList(items) =
702 first_block(r#"<ul><li><input type="checkbox" checked/>do it</li></ul>"#)
703 else {
704 panic!("expected bullet list");
705 };
706 let Some([Block::Plain(inlines)]) = items.first().map(Vec::as_slice) else {
707 panic!("expected one plain block");
708 };
709 assert!(matches!(inlines.first(), Some(Inline::Str(s)) if s == "\u{2612}"));
710 }
711
712 #[test]
713 fn checkbox_outside_item_is_dropped() {
714 let inlines = para_inlines(r#"<p><input type="checkbox"/>text</p>"#);
715 assert_eq!(inlines.as_slice(), [Inline::Str("text".to_string().into())]);
716 }
717
718 #[test]
719 fn paragraph_with_checkbox_demotes_to_plain() {
720 assert!(matches!(
721 first_block(r#"<p><input type="checkbox"/>x</p>"#),
722 Block::Plain(_)
723 ));
724 }
725
726 #[test]
727 fn empty_paragraph_is_dropped() {
728 assert!(blocks("<p>hi</p><p></p><p>lo</p>").len() == 2);
729 }
730
731 #[test]
732 fn consecutive_terms_merge_with_line_break() {
733 let Block::DefinitionList(items) = first_block("<dl><dt>a</dt><dt>b</dt><dd>x</dd></dl>")
734 else {
735 panic!("expected definition list");
736 };
737 let Some((term, _)) = items.first() else {
738 panic!("expected one item");
739 };
740 assert!(term.contains(&Inline::LineBreak));
741 }
742
743 #[test]
744 fn stray_paragraph_in_list_attaches_to_item() {
745 let Block::BulletList(items) = first_block("<ul><li>a</li><p>b</p></ul>") else {
746 panic!("expected bullet list");
747 };
748 assert_eq!(items.len(), 1);
749 assert_eq!(items.first().map(Vec::len), Some(2));
750 }
751
752 #[test]
753 fn native_divs_off_splices_div_children() {
754 let result = read_with("<div class=\"c\"><p>x</p></div>", Extensions::empty());
755 assert!(matches!(result.as_slice(), [Block::Para(_)]));
756 }
757
758 #[test]
759 fn native_divs_off_drops_sectioning_wrapper() {
760 let result = read_with("<section><p>x</p></section>", Extensions::empty());
761 assert!(matches!(result.as_slice(), [Block::Para(_)]));
762 }
763
764 #[test]
765 fn native_spans_off_unwraps_span_and_small_caps() {
766 let plain = read_with("<p><span class=\"c\">x</span></p>", Extensions::empty());
767 let Some(Block::Para(inlines)) = plain.first() else {
768 panic!("expected paragraph");
769 };
770 assert_eq!(inlines.as_slice(), [Inline::Str("x".to_string().into())]);
771
772 let caps = read_with(
773 "<p><span style=\"font-variant: small-caps\">x</span></p>",
774 Extensions::empty(),
775 );
776 let Some(Block::Para(inlines)) = caps.first() else {
777 panic!("expected paragraph");
778 };
779 assert_eq!(inlines.as_slice(), [Inline::Str("x".to_string().into())]);
780 }
781
782 #[test]
783 fn native_spans_off_keeps_class_carrying_inlines() {
784 let result = read_with("<p><mark>m</mark></p>", Extensions::empty());
787 let Some(Block::Para(inlines)) = result.first() else {
788 panic!("expected paragraph");
789 };
790 assert!(matches!(inlines.first(), Some(Inline::Span(_, _))));
791 }
792
793 #[test]
794 fn auto_identifiers_off_leaves_id_empty_but_keeps_explicit() {
795 let generated = read_with("<h1>Hello World</h1>", Extensions::empty());
796 let Some(Block::Header(_, attr, _)) = generated.first() else {
797 panic!("expected header");
798 };
799 assert_eq!(attr.id, "");
800
801 let explicit = read_with("<h2 id=\"keep\">T</h2>", Extensions::empty());
802 let Some(Block::Header(_, attr, _)) = explicit.first() else {
803 panic!("expected header");
804 };
805 assert_eq!(attr.id, "keep");
806 }
807
808 #[test]
809 fn line_blocks_off_keeps_a_plain_div() {
810 let result = read_with(
811 "<div class=\"line-block\">a<br>b</div>",
812 Extensions::from_list(&[Extension::NativeDivs]),
813 );
814 let Some(Block::Div(attr, children)) = result.first() else {
815 panic!("expected div");
816 };
817 assert_eq!(attr.classes, vec!["line-block".to_string()]);
818 assert!(matches!(children.as_slice(), [Block::Plain(_)]));
819 }
820
821 fn read_with_text_ext(input: &str, added: &[Extension]) -> Vec<Block> {
824 read_with(input, html_defaults().union(Extensions::from_list(added)))
825 }
826
827 fn para_inlines_ext(input: &str, added: &[Extension]) -> Vec<Inline> {
828 match read_with_text_ext(input, added).into_iter().next() {
829 Some(Block::Para(inlines) | Block::Plain(inlines)) => inlines,
830 other => panic!("expected a paragraph, got {other:?}"),
831 }
832 }
833
834 #[test]
835 fn smart_off_keeps_literal_punctuation() {
836 let inlines = para_inlines("<p>\"a\" -- ... ---</p>");
837 assert_eq!(
838 inlines.as_slice(),
839 [
840 Inline::Str("\"a\"".to_string().into()),
841 Inline::Space,
842 Inline::Str("--".to_string().into()),
843 Inline::Space,
844 Inline::Str("...".to_string().into()),
845 Inline::Space,
846 Inline::Str("---".to_string().into()),
847 ]
848 );
849 }
850
851 #[test]
852 fn smart_on_curls_quotes_and_folds_dashes() {
853 let inlines = para_inlines_ext("<p>\"a\" -- ... ---</p>", &[Extension::Smart]);
854 assert_eq!(
855 inlines.as_slice(),
856 [
857 Inline::Quoted(
858 carta_ast::QuoteType::DoubleQuote,
859 vec![Inline::Str("a".to_string().into())]
860 ),
861 Inline::Space,
862 Inline::Str("\u{2013}".to_string().into()),
863 Inline::Space,
864 Inline::Str("\u{2026}".to_string().into()),
865 Inline::Space,
866 Inline::Str("\u{2014}".to_string().into()),
867 ]
868 );
869 }
870
871 #[test]
872 fn tex_math_dollars_off_keeps_literal_text() {
873 let inlines = para_inlines("<p>$x^2$ and $$y$$</p>");
874 assert_eq!(
875 inlines.as_slice(),
876 [
877 Inline::Str("$x^2$".to_string().into()),
878 Inline::Space,
879 Inline::Str("and".to_string().into()),
880 Inline::Space,
881 Inline::Str("$$y$$".to_string().into()),
882 ]
883 );
884 }
885
886 #[test]
887 fn tex_math_dollars_on_splits_inline_and_display() {
888 let inlines = para_inlines_ext("<p>$x^2$ and $$y$$</p>", &[Extension::TexMathDollars]);
889 assert_eq!(
890 inlines.as_slice(),
891 [
892 Inline::Math(MathType::InlineMath, "x^2".to_string().into()),
893 Inline::Space,
894 Inline::Str("and".to_string().into()),
895 Inline::Space,
896 Inline::Math(MathType::DisplayMath, "y".to_string().into()),
897 ]
898 );
899 }
900
901 #[test]
902 fn tex_math_single_backslash_on_splits_inline_and_display() {
903 let inlines = para_inlines_ext(
904 "<p>\\(x\\) and \\[y\\]</p>",
905 &[Extension::TexMathSingleBackslash],
906 );
907 assert_eq!(
908 inlines.as_slice(),
909 [
910 Inline::Math(MathType::InlineMath, "x".to_string().into()),
911 Inline::Space,
912 Inline::Str("and".to_string().into()),
913 Inline::Space,
914 Inline::Math(MathType::DisplayMath, "y".to_string().into()),
915 ]
916 );
917 }
918
919 #[test]
920 fn tex_math_double_backslash_on_splits_inline_and_display() {
921 let inlines = para_inlines_ext(
922 "<p>\\\\(x\\\\) and \\\\[y\\\\]</p>",
923 &[Extension::TexMathDoubleBackslash],
924 );
925 assert_eq!(
926 inlines.as_slice(),
927 [
928 Inline::Math(MathType::InlineMath, "x".to_string().into()),
929 Inline::Space,
930 Inline::Str("and".to_string().into()),
931 Inline::Space,
932 Inline::Math(MathType::DisplayMath, "y".to_string().into()),
933 ]
934 );
935 }
936
937 #[test]
938 fn note_reference_reconstructs_body_and_drops_container() {
939 let result = blocks(concat!(
940 "text<a href=\"#fn1\" class=\"footnote-ref\" role=\"doc-noteref\"><sup>1</sup></a>\n",
941 "<section class=\"footnotes\" role=\"doc-endnotes\"><hr /><ol>",
942 "<li id=\"fn1\"><p>the note",
943 "<a href=\"#fnref1\" class=\"footnote-back\" role=\"doc-backlink\">\u{21a9}</a></p></li>",
944 "</ol></section>",
945 ));
946 assert_eq!(
947 result.as_slice(),
948 [Block::Plain(vec![
949 Inline::Str("text".to_string().into()),
950 Inline::Note(vec![Block::Para(vec![
951 Inline::Str("the".to_string().into()),
952 Inline::Space,
953 Inline::Str("note".to_string().into()),
954 ])]),
955 ])]
956 );
957 }
958
959 #[test]
960 fn unmatched_note_reference_becomes_an_empty_note() {
961 let result = blocks("text<a href=\"#missing\" role=\"doc-noteref\"><sup>1</sup></a>");
962 assert_eq!(
963 result.as_slice(),
964 [Block::Plain(vec![
965 Inline::Str("text".to_string().into()),
966 Inline::Note(Vec::new()),
967 ])]
968 );
969 }
970
971 fn header_ids(input: &str, added: &[Extension]) -> Vec<String> {
972 read_with_text_ext(input, added)
973 .into_iter()
974 .filter_map(|block| match block {
975 Block::Header(_, attr, _) => Some(attr.id.to_string()),
976 _ => None,
977 })
978 .collect()
979 }
980
981 #[test]
982 fn gfm_auto_identifiers_drops_dots_keeps_digits_and_does_not_collapse() {
983 let ids = header_ids(
986 "<h2>1.2 Section A.B</h2><h2>Tools & Tips</h2>",
987 &[Extension::GfmAutoIdentifiers],
988 );
989 assert_eq!(ids, vec!["12-section-ab", "tools--tips"]);
990 }
991
992 #[test]
993 fn gfm_auto_identifiers_keep_the_section_fallback_and_increment_on_collision() {
994 let ids = header_ids(
995 "<h2>Repeat</h2><h2>Repeat</h2><h3>!!!</h3>",
996 &[Extension::GfmAutoIdentifiers],
997 );
998 assert_eq!(ids, vec!["repeat", "repeat-1", "section"]);
999 }
1000
1001 #[test]
1002 fn gfm_auto_identifiers_need_auto_identifiers_to_take_effect() {
1003 let ids = read_with(
1004 "<h2>1.2 Section A.B</h2>",
1005 Extensions::from_list(&[Extension::GfmAutoIdentifiers]),
1006 )
1007 .into_iter()
1008 .filter_map(|block| match block {
1009 Block::Header(_, attr, _) => Some(attr.id.to_string()),
1010 _ => None,
1011 })
1012 .collect::<Vec<_>>();
1013 assert_eq!(ids, vec![String::new()]);
1014 }
1015
1016 #[test]
1017 fn repeated_headings_resume_probing_from_the_last_issued_suffix() {
1018 let ids = header_ids("<h2>Same</h2><h2>Same</h2><h2>Same</h2><h2>Same</h2>", &[]);
1019 assert_eq!(ids, vec!["same", "same-1", "same-2", "same-3"]);
1020 }
1021
1022 #[test]
1023 fn repeated_headings_skip_an_id_reserved_by_an_explicit_heading() {
1024 let ids = header_ids(
1025 "<h2 id=\"same-2\">Explicit</h2><h2>Same</h2><h2>Same</h2><h2>Same</h2>",
1026 &[],
1027 );
1028 assert_eq!(ids, vec!["same-2", "same", "same-1", "same-3"]);
1029 }
1030
1031 #[cfg(feature = "opml")]
1032 #[test]
1033 fn inline_fragment_parses_markup_and_trims_edges() {
1034 let inlines = super::parse_inline_fragment(" <strong>a</strong> b <code>c</code> ");
1035 assert_eq!(
1036 inlines,
1037 vec![
1038 Inline::Strong(vec![Inline::Str("a".to_string().into())]),
1039 Inline::Space,
1040 Inline::Str("b".to_string().into()),
1041 Inline::Space,
1042 Inline::Code(Box::default(), "c".to_string().into()),
1043 ]
1044 );
1045 }
1046
1047 #[cfg(feature = "opml")]
1048 #[test]
1049 fn inline_fragment_resolves_character_references() {
1050 let inlines = super::parse_inline_fragment("a & b");
1051 assert_eq!(
1052 inlines,
1053 vec![
1054 Inline::Str("a".to_string().into()),
1055 Inline::Space,
1056 Inline::Str("&".to_string().into()),
1057 Inline::Space,
1058 Inline::Str("b".to_string().into()),
1059 ]
1060 );
1061 }
1062
1063 #[cfg(feature = "opml")]
1064 fn raw(tag: &str) -> Inline {
1065 Inline::RawInline(
1066 carta_ast::Format("html".to_string().into()),
1067 tag.to_string().into(),
1068 )
1069 }
1070
1071 #[cfg(feature = "opml")]
1072 #[test]
1073 fn inline_fragment_preserves_an_unrecognized_tag_verbatim() {
1074 let inlines = super::parse_inline_fragment("<cite>Book</cite>");
1075 assert_eq!(
1076 inlines,
1077 vec![
1078 raw("<cite>"),
1079 Inline::Str("Book".to_string().into()),
1080 raw("</cite>")
1081 ]
1082 );
1083 }
1084
1085 #[cfg(feature = "opml")]
1086 #[test]
1087 fn inline_fragment_keeps_unknown_tag_attributes() {
1088 let inlines = super::parse_inline_fragment("<time datetime=\"2020\">y</time>");
1089 assert_eq!(
1090 inlines,
1091 vec![
1092 raw("<time datetime=\"2020\">"),
1093 Inline::Str("y".to_string().into()),
1094 raw("</time>"),
1095 ]
1096 );
1097 }
1098
1099 #[cfg(feature = "opml")]
1100 #[test]
1101 fn inline_fragment_escapes_attribute_values_and_emits_bare_boolean() {
1102 let inlines = super::parse_inline_fragment("<x-foo a=\"1<2&3\" hidden>z</x-foo>");
1103 assert_eq!(
1104 inlines,
1105 vec![
1106 raw("<x-foo a=\"1<2&3\" hidden>"),
1107 Inline::Str("z".to_string().into()),
1108 raw("</x-foo>"),
1109 ]
1110 );
1111 }
1112
1113 #[cfg(feature = "opml")]
1114 #[test]
1115 fn inline_fragment_lowercases_an_unknown_tag_name() {
1116 let inlines = super::parse_inline_fragment("<CITE>b</CITE>");
1117 assert_eq!(
1118 inlines,
1119 vec![
1120 raw("<cite>"),
1121 Inline::Str("b".to_string().into()),
1122 raw("</cite>")
1123 ]
1124 );
1125 }
1126
1127 #[cfg(feature = "opml")]
1128 #[test]
1129 fn inline_fragment_void_unknown_tag_is_a_single_raw_inline() {
1130 let inlines = super::parse_inline_fragment("a <wbr> b");
1131 assert_eq!(
1132 inlines,
1133 vec![
1134 Inline::Str("a".to_string().into()),
1135 Inline::Space,
1136 raw("<wbr>"),
1137 Inline::Space,
1138 Inline::Str("b".to_string().into()),
1139 ]
1140 );
1141 }
1142
1143 #[cfg(feature = "opml")]
1144 #[test]
1145 fn inline_fragment_self_closing_unknown_tag_pairs_open_and_close() {
1146 let inlines = super::parse_inline_fragment("<custom-tag/>");
1147 assert_eq!(inlines, vec![raw("<custom-tag>"), raw("</custom-tag>")]);
1148 }
1149
1150 #[cfg(feature = "opml")]
1151 #[test]
1152 fn inline_fragment_unclosed_unknown_tag_omits_the_close() {
1153 let inlines = super::parse_inline_fragment("a <cite>open-only");
1154 assert_eq!(
1155 inlines,
1156 vec![
1157 Inline::Str("a".to_string().into()),
1158 Inline::Space,
1159 raw("<cite>"),
1160 Inline::Str("open-only".to_string().into()),
1161 ]
1162 );
1163 }
1164
1165 #[cfg(feature = "opml")]
1166 #[test]
1167 fn inline_fragment_stray_unknown_end_tag_is_preserved() {
1168 let inlines = super::parse_inline_fragment("</cite> tail");
1169 assert_eq!(
1170 inlines,
1171 vec![
1172 raw("</cite>"),
1173 Inline::Space,
1174 Inline::Str("tail".to_string().into()),
1175 ]
1176 );
1177 }
1178
1179 #[cfg(feature = "opml")]
1180 #[test]
1181 fn inline_fragment_unknown_tag_wraps_recognized_inner_markup() {
1182 let inlines = super::parse_inline_fragment("<cite><em>x</em></cite>");
1183 assert_eq!(
1184 inlines,
1185 vec![
1186 raw("<cite>"),
1187 Inline::Emph(vec![Inline::Str("x".to_string().into())]),
1188 raw("</cite>"),
1189 ]
1190 );
1191 }
1192
1193 #[cfg(feature = "opml")]
1194 #[test]
1195 fn inline_fragment_recognized_tags_keep_structural_mapping() {
1196 let inlines = super::parse_inline_fragment("<em>e</em> <strong>s</strong> <sup>2</sup>");
1197 assert_eq!(
1198 inlines,
1199 vec![
1200 Inline::Emph(vec![Inline::Str("e".to_string().into())]),
1201 Inline::Space,
1202 Inline::Strong(vec![Inline::Str("s".to_string().into())]),
1203 Inline::Space,
1204 Inline::Superscript(vec![Inline::Str("2".to_string().into())]),
1205 ]
1206 );
1207 }
1208}