use super::{parse, ElementKind, NodeId, Tree};
/// One example record of the spec fixture, as deserialized by `load_spec`.
#[derive(serde::Deserialize)]
struct SpecExample {
/// Markdown input that the runners hand to `parse`.
markdown: String,
/// HTML the runners derive the expected result from.
html: String,
/// Example number; compared against the deviation tables to skip known differences.
example: u32,
/// Reported as "spec line" in failure messages.
start_line: u32,
/// Section name that `section_examples` filters on.
section: String,
}
/// Block-level outline of a document.
///
/// Both the parsed tree (`tree_shapes`) and the reference HTML (`html_shapes`)
/// are reduced to a list of these so they can be compared with `==`.
#[derive(Debug, Clone, PartialEq)]
enum Shape {
/// Heading together with its level (`<h1>`..`<h6>` on the HTML side).
Heading(u8),
/// Thematic break (`<hr>` on the HTML side).
ThematicBreak,
/// Paragraph (`<p>` on the HTML side).
Paragraph,
/// Code block (`<pre><code` on the HTML side; code and math nodes on the tree side).
CodeBlock,
/// Any other block: unrecognized markup on the HTML side; HTML blocks,
/// containers, tables and similar nodes on the tree side.
HtmlBlock,
/// Block quote with the shapes of its children.
BlockQuote(Vec<Self>),
/// List with one inner `Vec` of child shapes per item.
List {
/// `true` for ordered lists (`<ol>` on the HTML side).
ordered: bool,
/// Start number; the HTML side reports 0 for unordered lists and 1 when
/// an ordered list has no parsable `start` attribute.
start: u32,
/// On the HTML side, `true` when no item contains a paragraph directly.
tight: bool,
/// Child shapes of each list item, in document order.
items: Vec<Vec<Self>>,
},
}
/// Deserializes every example from the spec fixture embedded at compile time.
///
/// # Panics
///
/// Panics if the fixture cannot be deserialized into a list of `SpecExample`s.
fn load_spec() -> Vec<SpecExample> {
    const SPEC_JSON: &str = include_str!("../tests/fixtures/commonmark_spec.json");
    serde_json::from_str::<Vec<SpecExample>>(SPEC_JSON)
        .expect("spec.json should parse as valid JSON")
}
/// Returns references to the examples whose `section` equals `section`,
/// preserving their order in `all`.
fn section_examples<'a>(all: &'a [SpecExample], section: &str) -> Vec<&'a SpecExample> {
    let mut matching = Vec::new();
    for example in all {
        if example.section == section {
            matching.push(example);
        }
    }
    matching
}
/// Reduces a parsed tree to the shapes of the root node's children.
fn tree_shapes(tree: &Tree) -> Vec<Shape> {
    let root = tree.root();
    // Top-level content is not inside a tight list, so paragraphs are kept.
    children_shapes(tree, root, false)
}
/// Collects the shapes of `parent`'s children, skipping children for which
/// `node_shape` yields nothing.
///
/// `tight` is forwarded to `node_shape` unchanged.
fn children_shapes(tree: &Tree, parent: NodeId, tight: bool) -> Vec<Shape> {
    let mut shapes = Vec::new();
    for &child in tree.children(parent).iter() {
        if let Some(shape) = node_shape(tree, child, tight) {
            shapes.push(shape);
        }
    }
    shapes
}
/// Maps one tree node to its `Shape`, or `None` when the node has no
/// block-level counterpart in the comparison.
///
/// When `tight` is set, paragraphs are dropped; this matches `try_list`, which
/// calls a list tight exactly when its items contain no paragraph shape.
fn node_shape(tree: &Tree, id: NodeId, tight: bool) -> Option<Shape> {
    let node = tree.node(id);
    let shape = match &node.kind {
        ElementKind::Heading { level } => Shape::Heading(*level),
        ElementKind::Rules => Shape::ThematicBreak,
        ElementKind::Paragraph if tight => return None,
        ElementKind::Paragraph => Shape::Paragraph,
        ElementKind::CodeBlock | ElementKind::Math => Shape::CodeBlock,
        ElementKind::QuoteBlock | ElementKind::Admonition { .. } => {
            // Quote contents always keep their paragraphs.
            Shape::BlockQuote(children_shapes(tree, id, false))
        }
        ElementKind::List {
            ordered,
            start,
            tight: list_tight,
        } => {
            // Only list-item children contribute; each item is walked with
            // the list's own tightness.
            let mut items = Vec::new();
            for &child in tree.children(id).iter() {
                if matches!(tree.node(child).kind, ElementKind::ListItem { .. }) {
                    items.push(children_shapes(tree, child, *list_tight));
                }
            }
            Shape::List {
                ordered: *ordered,
                start: *start,
                tight: *list_tight,
                items,
            }
        }
        ElementKind::HtmlBlock
        | ElementKind::Container
        | ElementKind::Details
        | ElementKind::DetailsSummary
        | ElementKind::FormControl
        | ElementKind::DefinitionList
        | ElementKind::DefinitionTerm
        | ElementKind::DefinitionDesc
        | ElementKind::Table { .. }
        | ElementKind::TableRow { .. }
        | ElementKind::TableCell => Shape::HtmlBlock,
        _ => return None,
    };
    Some(shape)
}
/// Derives the expected block outline from an HTML string.
///
/// Thin entry point over `parse_html_blocks`, which does the scanning.
fn html_shapes(html: &str) -> Vec<Shape> {
parse_html_blocks(html)
}
/// Scans `html` and returns one `Shape` per block-level element found.
///
/// Text outside tags is skipped. Scanning stops at the first closing tag that
/// is not consumed by one of the element recognizers. Markup that no
/// recognizer accepts is reported as `Shape::HtmlBlock`.
fn parse_html_blocks(html: &str) -> Vec<Shape> {
    let bytes = html.as_bytes();
    let mut shapes = Vec::new();
    let mut cursor = 0;
    while cursor < html.len() {
        cursor = skip_ws(html, cursor);
        if cursor >= html.len() {
            break;
        }
        // Anything that is not a tag is skipped up to the next `<`.
        if bytes[cursor] != b'<' {
            cursor = next_lt(html, cursor);
            continue;
        }
        // A stray closing tag ends this run of blocks.
        if html[cursor..].starts_with("</") {
            break;
        }
        let recognized = try_hr(html, cursor)
            .or_else(|| try_heading(html, cursor))
            .or_else(|| try_pre_code(html, cursor))
            .or_else(|| try_paragraph(html, cursor))
            .or_else(|| try_blockquote(html, cursor))
            .or_else(|| try_list(html, cursor));
        match recognized {
            Some((shape, end)) => {
                shapes.push(shape);
                cursor = end;
            }
            None => match skip_unknown_html(html, cursor) {
                Some(end) => {
                    shapes.push(Shape::HtmlBlock);
                    cursor = end;
                }
                // Nothing matched at all: step past the `<` and keep scanning.
                None => cursor += 1,
            },
        }
    }
    shapes
}
/// Returns the byte index of the first character at or after `start` that is
/// not ASCII whitespace, or `s.len()` when only whitespace remains.
///
/// # Panics
///
/// Panics if `start` is past the end of `s` or not on a char boundary.
fn skip_ws(s: &str, start: usize) -> usize {
    let tail = &s[start..];
    let trimmed = tail.trim_start_matches(|c: char| c.is_ascii_whitespace());
    s.len() - trimmed.len()
}
/// Returns the byte index of the first `<` at or after `start`, or `s.len()`
/// when there is none.
///
/// # Panics
///
/// Panics if `start` is past the end of `s` or not on a char boundary.
fn next_lt(s: &str, start: usize) -> usize {
    match s[start..].find('<') {
        Some(offset) => start + offset,
        None => s.len(),
    }
}
/// Recognizes `<hr />`, `<hr/>` or `<hr>` at `pos`.
///
/// Returns the thematic-break shape and the index just past the tag.
fn try_hr(html: &str, pos: usize) -> Option<(Shape, usize)> {
    let rest = &html[pos..];
    ["<hr />", "<hr/>", "<hr>"]
        .iter()
        .find(|pat| rest.starts_with(**pat))
        .map(|pat| (Shape::ThematicBreak, pos + pat.len()))
}
/// Recognizes an attribute-less `<h1>`..`<h6>` element at `pos`.
///
/// Returns the heading shape and the index just past the first matching
/// closing tag, or `None` when the opening tag does not match or the closing
/// tag is missing.
fn try_heading(html: &str, pos: usize) -> Option<(Shape, usize)> {
    let rest = &html[pos..];
    // Needs `<hN>` plus at least one more byte, with N in 1..=6.
    let [b'<', b'h', digit @ b'1'..=b'6', b'>', _, ..] = rest.as_bytes() else {
        return None;
    };
    let level = *digit - b'0';
    let close = format!("</h{level}>");
    let close_at = rest.find(&close)?;
    Some((Shape::Heading(level), pos + close_at + close.len()))
}
/// Recognizes a `<pre><code…` block at `pos`.
///
/// Returns the code-block shape and the index just past the first `</pre>`.
fn try_pre_code(html: &str, pos: usize) -> Option<(Shape, usize)> {
    const CLOSE: &str = "</pre>";
    let rest = &html[pos..];
    if !rest.starts_with("<pre><code") {
        return None;
    }
    rest.find(CLOSE)
        .map(|close_at| (Shape::CodeBlock, pos + close_at + CLOSE.len()))
}
/// Recognizes a `<p>` element (with or without attributes) at `pos`.
///
/// Returns the paragraph shape and the index just past the first `</p>`.
fn try_paragraph(html: &str, pos: usize) -> Option<(Shape, usize)> {
    let rest = &html[pos..];
    let opens_paragraph = ["<p>", "<p "].iter().any(|open| rest.starts_with(*open));
    if !opens_paragraph {
        return None;
    }
    let close_at = rest.find("</p>")?;
    Some((Shape::Paragraph, pos + close_at + "</p>".len()))
}
/// Recognizes a `<blockquote>` element at `pos` and recursively parses its
/// contents.
///
/// Returns the quote shape and the index just past the matching
/// `</blockquote>`, as located by `find_close`.
fn try_blockquote(html: &str, pos: usize) -> Option<(Shape, usize)> {
    const TAG: &str = "blockquote";
    if !tag_opens_at(html, pos, TAG) {
        return None;
    }
    let gt = html[pos..].find('>')?;
    let body_start = pos + gt + 1;
    let body_end = find_close(html, body_start, TAG)?;
    let quoted = parse_html_blocks(&html[body_start..body_end]);
    let after_close = body_end + "</blockquote>".len();
    Some((Shape::BlockQuote(quoted), after_close))
}
/// Recognizes a `<ul>` or `<ol>` element at `pos` and parses each `<li>` body.
///
/// The start number is read from the opening tag for ordered lists and is 0
/// for unordered ones. The list is reported as tight when none of its items
/// contains a paragraph shape directly. Returns the list shape and the index
/// just past the matching closing tag.
fn try_list(html: &str, pos: usize) -> Option<(Shape, usize)> {
    let tag = ["ul", "ol"]
        .into_iter()
        .find(|name| tag_opens_at(html, pos, *name))?;
    let ordered = tag == "ol";

    // `gt` is relative to `pos`; the slice below excludes the `>` itself.
    let gt = html[pos..].find('>')?;
    let start = if ordered {
        parse_start_attr(&html[pos..pos + gt])
    } else {
        0
    };

    let body_start = pos + gt + 1;
    let body_end = find_close(html, body_start, tag)?;
    let items: Vec<Vec<Shape>> = extract_li_contents(&html[body_start..body_end])
        .iter()
        .map(|item| parse_html_blocks(item))
        .collect();

    let has_paragraph = items
        .iter()
        .flatten()
        .any(|shape| matches!(shape, Shape::Paragraph));

    // Length of `</` + tag + `>`.
    let after_close = body_end + "</".len() + tag.len() + ">".len();
    Some((
        Shape::List {
            ordered,
            start,
            tight: !has_paragraph,
            items,
        },
        after_close,
    ))
}
/// Extracts the numeric value of a `start="…"` attribute from `tag_text`.
///
/// Returns 1 when the attribute is absent, unterminated, or not a valid `u32`.
fn parse_start_attr(tag_text: &str) -> u32 {
    const ATTR: &str = "start=\"";
    let Some(at) = tag_text.find(ATTR) else {
        return 1;
    };
    let value = &tag_text[at + ATTR.len()..];
    match value.find('"') {
        Some(quote) => value[..quote].parse().unwrap_or(1),
        None => 1,
    }
}
/// Splits the inside of a list element into the inner HTML of each `<li>`.
///
/// Content between items that is not an `<li>` opening tag is skipped.
/// Scanning stops at the first `<li>` whose opening tag is unterminated or
/// whose closing tag `find_close` cannot locate.
fn extract_li_contents(inner: &str) -> Vec<String> {
    let mut items = Vec::new();
    let mut pos = 0;
    while pos < inner.len() {
        pos = skip_ws(inner, pos);
        if pos >= inner.len() {
            break;
        }
        if !tag_opens_at(inner, pos, "li") {
            // Step over one whole character: advancing by a single byte would
            // land inside a multi-byte character and make the next slice panic.
            pos += inner[pos..].chars().next().map_or(1, char::len_utf8);
            continue;
        }
        let Some(gt) = inner[pos..].find('>') else {
            break;
        };
        let content_start = pos + gt + 1;
        let Some(content_end) = find_close(inner, content_start, "li") else {
            break;
        };
        items.push(inner[content_start..content_end].to_string());
        pos = content_end + "</li>".len();
    }
    items
}
/// Reports whether an opening `<tag` starts at byte `pos` of `html`.
///
/// The tag name must be followed by the end of input or by one of `>`, space,
/// newline, tab or `/`, so `<ul` does not match `<ulx>`.
///
/// # Panics
///
/// Panics if `pos` is past the end of `html` or not on a char boundary.
fn tag_opens_at(html: &str, pos: usize, tag: &str) -> bool {
    let Some(after_lt) = html[pos..].strip_prefix('<') else {
        return false;
    };
    let Some(after_name) = after_lt.strip_prefix(tag) else {
        return false;
    };
    match after_name.bytes().next() {
        None => true,
        Some(next) => matches!(next, b'>' | b' ' | b'\n' | b'\t' | b'/'),
    }
}
/// Finds the closing `</tag>` that balances an element whose opening tag ends
/// just before `start`, counting nested elements of the same name.
///
/// Returns the byte index where the closing tag begins, or `None` when the
/// element is never closed.
///
/// The scan jumps from one `<` to the next instead of advancing byte by byte,
/// so every index it slices at is a char boundary even when the content
/// contains multi-byte characters.
fn find_close(html: &str, start: usize, tag: &str) -> Option<usize> {
    let close = format!("</{tag}>");
    let mut depth = 1u32;
    let mut pos = start;
    while pos < html.len() {
        // Both tags of interest begin with `<`; nothing else can match.
        let Some(offset) = html[pos..].find('<') else {
            break;
        };
        pos += offset;
        let rest = &html[pos..];
        if rest.starts_with(&close) {
            depth -= 1;
            if depth == 0 {
                return Some(pos);
            }
            pos += close.len();
        } else if tag_opens_at(html, pos, tag) {
            depth += 1;
            // Continue after the nested opening tag; an unterminated tag
            // ends the scan.
            pos = rest.find('>').map_or(html.len(), |i| pos + i + 1);
        } else {
            // Some other tag: step past its `<` (a single ASCII byte).
            pos += 1;
        }
    }
    None
}
/// Returns the index just past the markup construct that starts at `pos`.
///
/// Handles comments, processing instructions, CDATA sections, declarations
/// and ordinary elements. For an element, the whole span through its closing
/// tag is skipped when `find_close` locates one; otherwise only the opening
/// tag is skipped. A `<` that does not start a tag name is skipped on its
/// own. Returns `None` when the construct's terminator is missing.
fn skip_unknown_html(html: &str, pos: usize) -> Option<usize> {
    let rest = &html[pos..];

    // (opening marker, terminator); the generic `<!` must be checked last.
    let declarations = [
        ("<!--", "-->"),
        ("<?", "?>"),
        ("<![CDATA[", "]]>"),
        ("<!", ">"),
    ];
    for (open, terminator) in declarations {
        if rest.starts_with(open) {
            return rest.find(terminator).map(|i| pos + i + terminator.len());
        }
    }

    let starts_name = rest
        .as_bytes()
        .get(1)
        .is_some_and(u8::is_ascii_alphabetic);
    if !starts_name {
        return Some(pos + 1);
    }

    let after_lt = &rest[1..];
    let name_len = after_lt
        .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
        .unwrap_or(after_lt.len());
    let tag = &after_lt[..name_len];
    if tag.is_empty() {
        return Some(pos + 1);
    }

    let gt = rest.find('>')?;
    let open_end = pos + gt + 1;
    if rest[..=gt].ends_with("/>") {
        // Self-closing: there is no separate closing tag to look for.
        return Some(open_end);
    }
    match find_close(html, open_end, tag) {
        // Skip `</` + tag + `>` as well.
        Some(close_at) => Some(close_at + "</".len() + tag.len() + ">".len()),
        None => Some(open_end),
    }
}
/// Bit set of the inline styles in effect for one character.
type Mask = u8;
/// Set inside `<strong>` on the HTML side and `ElementKind::Strong` on the tree side.
const BOLD: Mask = 0b001;
/// Set inside `<em>` on the HTML side and `ElementKind::Emphasis` on the tree side.
const ITALIC: Mask = 0b010;
/// Set inside `<del>` on the HTML side and `ElementKind::Strikethrough` on the tree side.
const STRIKE: Mask = 0b100;
/// Text as a sequence of characters, each paired with its style mask.
type MaskedText = Vec<(char, Mask)>;
fn html_emphasis_mask(html: &str) -> MaskedText {
let bytes = html.as_bytes();
let mut out: MaskedText = Vec::new();
let mut bold = 0u32;
let mut italic = 0u32;
let mut strike = 0u32;
let mut in_para = false;
let mut i = 0;
while i < html.len() {
if bytes[i] == b'<' {
let rest = &html[i..];
if rest.starts_with("<p>") || rest.starts_with("<p ") {
in_para = true;
} else if rest.starts_with("</p>") {
in_para = false;
}
i = consume_emphasis_tag(html, i, &mut bold, &mut italic, &mut strike);
continue;
}
if !in_para {
i += 1;
continue;
}
if bytes[i] == b'&'
&& let Some((ch, end)) = decode_entity(html, i)
{
out.push((ch, current_mask(bold, italic, strike)));
i = end;
continue;
}
let ch = html[i..]
.chars()
.next()
.expect("byte index is on a char boundary");
out.push((ch, current_mask(bold, italic, strike)));
i += ch.len_utf8();
}
out
}
/// Combines the three nesting counters into a `Mask`: a style's bit is set
/// when its counter is non-zero.
const fn current_mask(bold: u32, italic: u32, strike: u32) -> Mask {
    let bold_bit = if bold > 0 { BOLD } else { 0 };
    let italic_bit = if italic > 0 { ITALIC } else { 0 };
    let strike_bit = if strike > 0 { STRIKE } else { 0 };
    bold_bit | italic_bit | strike_bit
}
/// Consumes the tag starting at `pos` and updates the nesting counter of the
/// style it opens or closes.
///
/// Only attribute-less `strong`, `em` and `del` tags affect the counters;
/// closing tags never take a counter below zero. Returns the index just past
/// the tag's `>`, or `pos + 1` when the tag is unterminated.
fn consume_emphasis_tag(
    html: &str,
    pos: usize,
    bold: &mut u32,
    italic: &mut u32,
    strike: &mut u32,
) -> usize {
    let rest = &html[pos..];
    let gt = match rest.find('>') {
        Some(index) => index,
        None => return pos + 1,
    };
    let after_tag = pos + gt + 1;

    // Text between `<` and `>`, split into "is closing" and the element name.
    let name = &rest[1..gt];
    let (closing, element) = match name.strip_prefix('/') {
        Some(element) => (true, element),
        None => (false, name),
    };
    let counter = match element {
        "strong" => bold,
        "em" => italic,
        "del" => strike,
        _ => return after_tag,
    };
    if closing {
        *counter = counter.saturating_sub(1);
    } else {
        *counter += 1;
    }
    after_tag
}
/// Decodes the character entity that starts at byte `pos` of `html`.
///
/// Recognizes `&amp;`, `&lt;`, `&gt;`, `&quot;` and the apostrophe forms
/// `&#39;` / `&apos;`. Returns the decoded character and the index just past
/// the entity, or `None` when no known entity starts at `pos`.
///
/// NOTE(review): the entity names in this table had been replaced by their
/// decoded characters, which no longer compiled. The first four are restored
/// from the characters they map to. The original spelling of the apostrophe
/// entry is not recoverable, so both common forms are accepted.
fn decode_entity(html: &str, pos: usize) -> Option<(char, usize)> {
    const ENTITIES: [(&str, char); 6] = [
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&#39;", '\''),
        ("&apos;", '\''),
    ];
    let rest = &html[pos..];
    ENTITIES
        .iter()
        .find(|(name, _)| rest.starts_with(name))
        .map(|&(name, ch)| (ch, pos + name.len()))
}
fn tree_emphasis_mask(tree: &Tree) -> MaskedText {
let source = tree.source();
let mut mask = vec![0u8; source.len()];
let mut is_markup = vec![false; source.len()];
for node in tree.nodes() {
let (start, end) = (node.span.start, node.span.end);
let (bit, open_len) = match node.kind {
ElementKind::Strong => (BOLD, 2),
ElementKind::Emphasis => (ITALIC, 1),
ElementKind::Strikethrough => (
STRIKE,
source[start..end].chars().take_while(|&c| c == '~').count(),
),
_ => continue,
};
for slot in &mut mask[start..end] {
*slot |= bit;
}
for off in 0..open_len {
is_markup[start + off] = true;
is_markup[end - 1 - off] = true;
}
}
let mut out: MaskedText = Vec::new();
for node in tree.nodes() {
if !matches!(
node.kind,
ElementKind::Paragraph | ElementKind::Heading { .. }
) {
continue;
}
let (start, end) = (node.span.start, node.span.end);
let mut host: MaskedText = Vec::new();
let mut i = start;
while i < end {
if is_markup[i] {
i += 1;
continue;
}
let ch = source[i..]
.chars()
.next()
.expect("byte index is on a char boundary");
host.push((ch, mask[i]));
i += ch.len_utf8();
}
out.extend(normalize_masked(&host));
}
out
}
fn normalize_masked(text: &MaskedText) -> MaskedText {
let is_ws = |&(c, _): &(char, Mask)| c.is_whitespace();
let start = text.iter().position(|e| !is_ws(e)).unwrap_or(text.len());
let end = text
.iter()
.rposition(|e| !is_ws(e))
.map_or(start, |p| p + 1);
text[start..end].to_vec()
}
/// Spec examples that `run_section` skips, as `(example number, reason)`.
///
/// Only the number is consulted (see `is_deviation`); the reason documents
/// why the example is excluded from the block-shape comparison.
const DEVIATIONS: &[(u32, &str)] = &[
(148, "HTML container modeling (<table>)"),
(149, "HTML container modeling (<div>)"),
(150, "HTML container modeling (<div>)"),
(151, "HTML container modeling (<div>)"),
(155, "HTML container modeling (<div>, blank line boundary)"),
(156, "HTML container modeling (<div>)"),
(157, "HTML container modeling (<div>)"),
(160, "HTML container modeling (<table>)"),
(161, "HTML container modeling (<div>)"),
(162, "HTML container modeling (<div>)"),
(165, "HTML container modeling (<div>, self-closing)"),
(167, "HTML container modeling (<div>)"),
(168, "HTML container modeling (<nav>)"),
(175, "HTML container modeling (<div> in list)"),
(184, "HTML container modeling (<div>, indented)"),
(186, "HTML container modeling (<div>, blockquote)"),
(190, "HTML container modeling (<table>, blank lines)"),
(191, "HTML container modeling (<div>, blank line split)"),
];
/// Reports whether `example` is listed in `DEVIATIONS`.
fn is_deviation(example: u32) -> bool {
    for &(number, _) in DEVIATIONS {
        if number == example {
            return true;
        }
    }
    false
}
/// Runs the block-shape comparison for every example of one spec section.
///
/// Examples listed in `DEVIATIONS` are skipped. For the rest, the shapes of
/// the parsed markdown must equal the shapes derived from the example's HTML.
///
/// # Panics
///
/// Panics when the section has no examples, or when any example mismatches;
/// the message lists every failing example.
fn run_section(section: &str) {
    let spec = load_spec();
    let examples = section_examples(&spec, section);
    assert!(
        !examples.is_empty(),
        "no spec examples found for section {section:?}"
    );

    let (known_deviations, checked): (Vec<_>, Vec<_>) = examples
        .iter()
        .partition(|ex| is_deviation(ex.example));
    let skipped = known_deviations.len();

    let failures: Vec<String> = checked
        .into_iter()
        .filter_map(|ex| {
            let tree = parse(&ex.markdown);
            let actual = tree_shapes(&tree);
            let expected = html_shapes(&ex.html);
            (actual != expected).then(|| {
                format!(
                    "Example {} (spec line {}):\n\
                     \x20 expected: {expected:?}\n\
                     \x20 actual: {actual:?}\n\
                     \x20 markdown: {:?}\n\
                     \x20 html: {:?}",
                    ex.example, ex.start_line, ex.markdown, ex.html,
                )
            })
        })
        .collect();

    let total = examples.len();
    let passed = total - failures.len() - skipped;
    assert!(
        failures.is_empty(),
        "{section}: {passed}/{total} passed, {skipped} skipped, {} failed:\n\n{}",
        failures.len(),
        failures.join("\n\n")
    );
}
/// Spec examples that `run_emphasis_section` skips, as `(example number, reason)`.
///
/// Only the number is consulted (see `is_emphasis_deviation`); the reason
/// documents why the example is excluded from the emphasis-mask comparison.
const EMPHASIS_DEVIATIONS: &[(u32, &str)] = &[
(404, "link inside emphasis (`[bar](/url)` -> `bar`)"),
(419, "link with nested emphasis inside emphasis"),
(422, "link inside strong"),
(433, "link with nested emphasis inside strong"),
(473, "emphasis delimiter inside a link label"),
(474, "emphasis delimiter inside a link label"),
(480, "autolink absorbs the trailing `**`"),
(481, "autolink absorbs the trailing `__`"),
(478, "code span inside emphasis (`` `*` `` -> `*`)"),
(479, "code span inside emphasis (`` `_` `` -> `_`)"),
(475, "raw inline HTML (`<img .. title=\"*\"/>`)"),
(476, "raw inline HTML (`<a href=\"**\">`)"),
(477, "raw inline HTML (`<a href=\"__\">`)"),
(437, "backslash-escaped delimiter (`\\*`)"),
(440, "backslash-escaped delimiter (`\\*`)"),
(449, "backslash-escaped delimiter (`\\_`)"),
(452, "backslash-escaped delimiter (`\\_`)"),
];
/// Reports whether `example` is listed in `EMPHASIS_DEVIATIONS`.
fn is_emphasis_deviation(example: u32) -> bool {
    for &(number, _) in EMPHASIS_DEVIATIONS {
        if number == example {
            return true;
        }
    }
    false
}
/// Runs the emphasis-mask comparison for every example of the
/// "Emphasis and strong emphasis" section.
///
/// Examples listed in `EMPHASIS_DEVIATIONS` are skipped. For the rest, the
/// normalized masked text of the parsed markdown must equal the normalized
/// masked text derived from the example's HTML.
///
/// # Panics
///
/// Panics when the section has no examples, or when any example mismatches;
/// the message lists every failing example.
fn run_emphasis_section() {
    let spec = load_spec();
    let examples = section_examples(&spec, "Emphasis and strong emphasis");
    assert!(
        !examples.is_empty(),
        "no spec examples found for the emphasis section"
    );

    let (known_deviations, checked): (Vec<_>, Vec<_>) = examples
        .iter()
        .partition(|ex| is_emphasis_deviation(ex.example));
    let skipped = known_deviations.len();

    let failures: Vec<String> = checked
        .into_iter()
        .filter_map(|ex| {
            let tree = parse(&ex.markdown);
            let actual = normalize_masked(&tree_emphasis_mask(&tree));
            let expected = normalize_masked(&html_emphasis_mask(&ex.html));
            (actual != expected).then(|| {
                format!(
                    "Example {} (spec line {}):\n\
                     \x20 expected: {expected:?}\n\
                     \x20 actual: {actual:?}\n\
                     \x20 markdown: {:?}\n\
                     \x20 html: {:?}",
                    ex.example, ex.start_line, ex.markdown, ex.html,
                )
            })
        })
        .collect();

    let total = examples.len();
    let passed = total - failures.len() - skipped;
    assert!(
        failures.is_empty(),
        "Emphasis and strong emphasis: {passed}/{total} passed, {skipped} skipped, {} failed:\n\n{}",
        failures.len(),
        failures.join("\n\n")
    );
}
// One test per spec section, so a failure names the section it belongs to.
// Each of the following tests passes its section name to `run_section`, which
// compares block shapes for every example whose `section` field matches.
#[test]
fn spec_tabs() {
run_section("Tabs");
}
#[test]
fn spec_precedence() {
run_section("Precedence");
}
#[test]
fn spec_thematic_breaks() {
run_section("Thematic breaks");
}
#[test]
fn spec_atx_headings() {
run_section("ATX headings");
}
#[test]
fn spec_setext_headings() {
run_section("Setext headings");
}
#[test]
fn spec_indented_code_blocks() {
run_section("Indented code blocks");
}
#[test]
fn spec_fenced_code_blocks() {
run_section("Fenced code blocks");
}
#[test]
fn spec_html_blocks() {
run_section("HTML blocks");
}
#[test]
fn spec_link_reference_definitions() {
run_section("Link reference definitions");
}
#[test]
fn spec_paragraphs() {
run_section("Paragraphs");
}
#[test]
fn spec_blank_lines() {
run_section("Blank lines");
}
#[test]
fn spec_block_quotes() {
run_section("Block quotes");
}
#[test]
fn spec_list_items() {
run_section("List items");
}
#[test]
fn spec_lists() {
run_section("Lists");
}
// The emphasis section is checked with the per-character style-mask
// comparison instead of the block-shape comparison.
#[test]
fn spec_emphasis() {
run_emphasis_section();
}
/// Checks strikethrough handling against hand-written `(markdown, html)`
/// pairs, using the same emphasis-mask comparison as the spec emphasis test.
#[test]
fn spec_gfm_strikethrough() {
    const EXAMPLES: [(&str, &str); 3] = [
        (
            "~~Hi~~ Hello, ~there~ world!\n",
            "<p><del>Hi</del> Hello, <del>there</del> world!</p>\n",
        ),
        (
            "This ~~has a\n\nnew paragraph~~.\n",
            "<p>This ~~has a</p>\n<p>new paragraph~~.</p>\n",
        ),
        (
            "This will ~~~not~~~ strike.\n",
            "<p>This will ~~~not~~~ strike.</p>\n",
        ),
    ];
    for (markdown, html) in EXAMPLES {
        let tree = parse(markdown);
        let from_tree = tree_emphasis_mask(&tree);
        let from_html = html_emphasis_mask(html);
        let actual = normalize_masked(&from_tree);
        let expected = normalize_masked(&from_html);
        assert_eq!(
            actual, expected,
            "GFM strikethrough mismatch for {markdown:?}: expected {expected:?}, got {actual:?}"
        );
    }
}
/// Guards against spec sections silently going untested: every section name
/// present in the fixture must appear in either the `tested` or the `inline`
/// list below.
#[test]
fn all_sections_covered() {
    let spec = load_spec();
    // Distinct section names present in the fixture.
    let mut sections: Vec<&str> = spec.iter().map(|e| e.section.as_str()).collect();
    sections.sort_unstable();
    sections.dedup();
    // Sections that have a dedicated `spec_*` test in this file.
    let tested: &[&str] = &[
        "ATX headings",
        "Blank lines",
        "Block quotes",
        "Emphasis and strong emphasis",
        "Fenced code blocks",
        "HTML blocks",
        "Indented code blocks",
        "Link reference definitions",
        "List items",
        "Lists",
        "Paragraphs",
        "Precedence",
        "Setext headings",
        "Tabs",
        "Thematic breaks",
    ];
    // Sections accepted as accounted for without a test in this file.
    let inline: &[&str] = &[
        "Autolinks",
        "Backslash escapes",
        "Code spans",
        "Entity and numeric character references",
        "Hard line breaks",
        "Images",
        "Inlines",
        "Links",
        "Raw HTML",
        "Soft line breaks",
        "Textual content",
    ];
    for section in &sections {
        assert!(
            tested.contains(section) || inline.contains(section),
            "unaccounted spec section: {section:?} — add it to tested or inline list"
        );
    }
}