use crate::model::{ParsedSection, SectionType};
use anyhow::Result;
use htmd::HtmlToMarkdown;
#[cfg(test)]
use scraper::{Html, Selector};
/// Renders the content that follows `heading` as Markdown.
///
/// Element siblings after `heading` are gathered in document order until one
/// of them is a heading whose depth is `current_depth` or shallower, or a
/// `<dfn>` that carries an `id`. Non-element siblings (such as bare text
/// nodes) are not gathered.
///
/// Returns `None` when nothing was gathered or the rendered Markdown is blank.
fn extract_heading_content(
    heading: &scraper::ElementRef,
    current_depth: u8,
    converter: &HtmlToMarkdown,
) -> Option<String> {
    use super::markdown;

    let body_html: String = heading
        .next_siblings()
        .filter_map(scraper::ElementRef::wrap)
        .take_while(|sibling| {
            let tag = sibling.value().name();
            let closes_section =
                matches!(heading_depth(tag), Some(depth) if depth <= current_depth);
            let anchored_dfn = tag == "dfn" && sibling.value().attr("id").is_some();
            !closes_section && !anchored_dfn
        })
        .map(|sibling| sibling.html())
        .collect();

    if body_html.trim().is_empty() {
        return None;
    }

    let rendered = markdown::element_to_markdown_from_html(&body_html, converter);
    match rendered.trim() {
        "" => None,
        text => Some(text.to_string()),
    }
}
/// Builds the visible title of a heading element.
///
/// Direct children are concatenated in order: text nodes verbatim, child
/// elements via their text content. Child elements carrying the class
/// `secno`, `secnum` or `self-link` are left out.
///
/// Returns `None` when the trimmed result is empty.
fn extract_heading_title(element: &scraper::ElementRef) -> Option<String> {
    let mut title = String::new();

    for child in element.children() {
        match scraper::ElementRef::wrap(child) {
            Some(child_elem) => {
                let is_decoration = child_elem
                    .value()
                    .classes()
                    .any(|class| matches!(class, "secno" | "secnum" | "self-link"));
                if !is_decoration {
                    title.extend(child_elem.text());
                }
            }
            None => {
                if let Some(text) = child.value().as_text() {
                    title.push_str(text);
                }
            }
        }
    }

    match title.trim() {
        "" => None,
        trimmed => Some(trimmed.to_string()),
    }
}
/// Maps the tag names `h2` through `h6` to their numeric level.
///
/// Any other input (including `h1`, upper-case names and longer strings)
/// yields `None`.
fn heading_depth(tag: &str) -> Option<u8> {
    match tag.as_bytes() {
        [b'h', level @ b'2'..=b'6'] => Some(level - b'0'),
        _ => None,
    }
}
/// Converts a heading element into a [`ParsedSection`] of type `Heading`.
///
/// Returns `Ok(None)` when the element has no `id` attribute.
///
/// # Errors
///
/// Fails when the element's tag name is not one `heading_depth` recognises.
pub fn parse_heading_element(
    element: &scraper::ElementRef,
    converter: &HtmlToMarkdown,
) -> Result<Option<ParsedSection>> {
    let node = element.value();

    let anchor = if let Some(id) = node.attr("id") {
        id.to_owned()
    } else {
        return Ok(None);
    };

    let title = extract_heading_title(element);

    let tag = node.name();
    let depth = match heading_depth(tag) {
        Some(level) => level,
        None => return Err(anyhow::anyhow!("Invalid heading tag: {}", tag)),
    };

    let content_text = extract_heading_content(element, depth, converter);

    let section = ParsedSection {
        anchor,
        title,
        content_text,
        section_type: SectionType::Heading,
        parent_anchor: None,
        prev_anchor: None,
        next_anchor: None,
        depth: Some(depth),
    };
    Ok(Some(section))
}
/// Converts a `<dfn>` element into a [`ParsedSection`].
///
/// Returns `Ok(None)` (the dfn is skipped) when any of the following holds:
/// - it has no `id` attribute;
/// - `is_inside_algorithm_content` reports it as part of algorithm steps;
/// - it has `data-dfn-for` but no `data-dfn-type`;
/// - one of its direct children is a `<var>` element;
/// - its `data-dfn-type` is `"argument"`.
///
/// Otherwise the section is classified as `Algorithm`, `Idl` or `Definition`
/// (checked in that order) and its content is extracted with the matching
/// helper. Tree links and depth are left unset.
pub fn parse_dfn_element(
    element: &scraper::ElementRef,
    converter: &HtmlToMarkdown,
) -> Result<Option<ParsedSection>> {
    let attrs = element.value();

    let anchor = if let Some(id) = attrs.attr("id") {
        id.to_owned()
    } else {
        return Ok(None);
    };

    if is_inside_algorithm_content(element) {
        return Ok(None);
    }

    let dfn_type = attrs.attr("data-dfn-type");
    let scoped_without_type = attrs.attr("data-dfn-for").is_some() && dfn_type.is_none();
    let wraps_var = element
        .children()
        .filter_map(scraper::ElementRef::wrap)
        .any(|child| child.value().name() == "var");
    if scoped_without_type || wraps_var || dfn_type == Some("argument") {
        return Ok(None);
    }

    let raw_title: String = element.text().collect();
    let title = Some(raw_title.trim())
        .filter(|text| !text.is_empty())
        .map(str::to_string);

    // Classification and content extraction go hand in hand, so pick both at once.
    let (section_type, content_text) = if is_inside_algorithm_div(element) {
        (
            SectionType::Algorithm,
            extract_algorithm_content(element, converter),
        )
    } else if is_idl_type(element) {
        (SectionType::Idl, extract_idl_content(element))
    } else {
        (
            SectionType::Definition,
            extract_definition_content(element, converter),
        )
    };

    Ok(Some(ParsedSection {
        anchor,
        title,
        content_text,
        section_type,
        parent_anchor: None,
        prev_anchor: None,
        next_anchor: None,
        depth: None,
    }))
}
/// Produces the content for a plain definition.
///
/// The nearest ancestor that is a `p`, `div`, `dd`, `dt`, `li` or `section`
/// is rendered to Markdown. When no such ancestor exists, the trimmed text of
/// the `<dfn>` itself is used. The result is always `Some`.
fn extract_definition_content(
    element: &scraper::ElementRef,
    converter: &HtmlToMarkdown,
) -> Option<String> {
    use super::markdown;

    let container = element
        .ancestors()
        .filter_map(scraper::ElementRef::wrap)
        .find(|ancestor| {
            matches!(
                ancestor.value().name(),
                "p" | "div" | "dd" | "dt" | "li" | "section"
            )
        });

    Some(match container {
        Some(block) => markdown::element_to_markdown(&block, converter),
        None => element.text().collect::<String>().trim().to_string(),
    })
}
/// Produces the content (intro plus rendered steps) for an algorithm dfn.
///
/// Ancestors of `element` are visited from nearest to farthest:
/// - a `<div>` with class `algorithm` or a `data-algorithm` attribute hands
///   off to `extract_from_algorithm_div`, whose result is returned as-is;
/// - for a `p`, `dd` or `li` ancestor, its following element siblings are
///   scanned; the first `<ol>` found is rendered as the steps and joined to
///   the ancestor's Markdown (trimmed) with a blank line. The scan of that
///   ancestor's siblings stops at the first `p`, `div` or `h2`–`h6`.
///
/// Returns `None` when no ancestor yields a result.
fn extract_algorithm_content(
    element: &scraper::ElementRef,
    converter: &HtmlToMarkdown,
) -> Option<String> {
    use super::{algorithms, markdown};

    for ancestor in element.ancestors().filter_map(scraper::ElementRef::wrap) {
        let tag = ancestor.value().name();

        if tag == "div" {
            let is_algo_div = ancestor.value().classes().any(|class| class == "algorithm")
                || ancestor.value().attr("data-algorithm").is_some();
            if is_algo_div {
                return extract_from_algorithm_div(&ancestor, converter);
            }
        }

        if !matches!(tag, "p" | "dd" | "li") {
            continue;
        }

        for sibling in ancestor
            .next_siblings()
            .filter_map(scraper::ElementRef::wrap)
        {
            match sibling.value().name() {
                "ol" => {
                    // The intro is rendered only once the steps list is known
                    // to exist, so ancestors without a following <ol> cost no
                    // Markdown conversion.
                    let intro = markdown::element_to_markdown(&ancestor, converter);
                    let steps = algorithms::render_algorithm_ol(&sibling, converter);
                    return Some(format!("{}\n\n{}", intro.trim(), steps));
                }
                "p" | "div" | "h2" | "h3" | "h4" | "h5" | "h6" => break,
                _ => {}
            }
        }
    }

    None
}
/// Renders an algorithm `<div>` as "intro, blank line, steps".
///
/// The steps come from the first `<ol>` matched by an `ol` selector on `div`;
/// without one the function returns `None`. The intro is the HTML of the
/// div's direct children (elements and text nodes) up to, but not including,
/// the first direct-child `<ol>`, converted with `converter`. A failed
/// conversion is treated as an empty intro.
fn extract_from_algorithm_div(
    div: &scraper::ElementRef,
    converter: &HtmlToMarkdown,
) -> Option<String> {
    use super::algorithms;

    let ol_selector = scraper::Selector::parse("ol").ok()?;
    let steps_list = div.select(&ol_selector).next()?;

    let mut intro_html = String::new();
    for child in div.children() {
        match scraper::ElementRef::wrap(child) {
            Some(child_elem) if child_elem.value().name() == "ol" => break,
            Some(child_elem) => intro_html.push_str(&child_elem.html()),
            None => {
                if let Some(text) = child.value().as_text() {
                    intro_html.push_str(text);
                }
            }
        }
    }

    let intro_md = converter.convert(&intro_html).unwrap_or_default();
    let intro = intro_md.trim().to_string();
    let steps = algorithms::render_algorithm_ol(&steps_list, converter);

    Some(format!("{}\n\n{}", intro, steps))
}
/// Returns the IDL text of the nearest enclosing `<pre>` element, or `None`
/// when `element` has no `<pre>` ancestor.
fn extract_idl_content(element: &scraper::ElementRef) -> Option<String> {
    use super::idl;

    element
        .ancestors()
        .filter_map(scraper::ElementRef::wrap)
        .find(|ancestor| ancestor.value().name() == "pre")
        .map(|pre| idl::extract_idl_text(&pre))
}
/// Converts an `<emu-clause>` element into a [`ParsedSection`].
///
/// Returns `Ok(None)` when the element has no `id`. Title and depth are taken
/// from the first direct `<h1>` child (both `None` when there is none). The
/// section is an `Algorithm` when the clause has a `type` attribute and a
/// `Heading` otherwise. Tree links are left unset.
pub fn parse_emu_clause_element(
    element: &scraper::ElementRef,
    converter: &HtmlToMarkdown,
) -> Result<Option<ParsedSection>> {
    let anchor = if let Some(id) = element.value().attr("id") {
        id.to_owned()
    } else {
        return Ok(None);
    };

    let heading = element
        .children()
        .filter_map(scraper::ElementRef::wrap)
        .find(|child| child.value().name() == "h1");
    let title = heading.as_ref().and_then(|h1| extract_heading_title(h1));
    let depth = heading.as_ref().and_then(|h1| extract_secnum_depth(h1));

    let section_type = match element.value().attr("type") {
        Some(_) => SectionType::Algorithm,
        None => SectionType::Heading,
    };

    let content_text = extract_emu_clause_content(element, converter);

    let section = ParsedSection {
        anchor,
        title,
        content_text,
        section_type,
        parent_anchor: None,
        prev_anchor: None,
        next_anchor: None,
        depth,
    };
    Ok(Some(section))
}
/// Derives a depth from the section number shown in a heading.
///
/// Looks at the first direct child element with class `secnum`. Its trimmed
/// text is split on `.`; the depth is the number of pieces plus one, capped
/// at 255 (so `"4"` gives 2 and `"7.1.17"` gives 4).
///
/// Returns `None` when there is no such child or its text is blank.
fn extract_secnum_depth(heading: &scraper::ElementRef) -> Option<u8> {
    let secnum = heading
        .children()
        .filter_map(scraper::ElementRef::wrap)
        .find(|child| child.value().classes().any(|class| class == "secnum"))?;

    let label: String = secnum.text().collect();
    let label = label.trim();
    if label.is_empty() {
        return None;
    }

    let components = label.split('.').count();
    Some((components + 1).min(255) as u8)
}
/// Produces the content of an `<emu-clause>`: intro prose followed by
/// algorithm steps, separated by a blank line when both are present.
///
/// Direct child elements are handled as follows:
/// - `h1`, `emu-clause`, `emu-annex`, `emu-import`: ignored;
/// - `emu-alg`: its first direct `<ol>` child, if any, is rendered as the
///   steps (a later `emu-alg` with an `<ol>` replaces an earlier one);
/// - `span` with an `id` and only whitespace text: ignored;
/// - anything else: its HTML is appended to the intro.
///
/// Returns `None` when both the intro and the steps are absent.
fn extract_emu_clause_content(
    element: &scraper::ElementRef,
    converter: &HtmlToMarkdown,
) -> Option<String> {
    use super::{algorithms, markdown};

    let mut prose_html = String::new();
    let mut steps: Option<String> = None;

    for child in element.children().filter_map(scraper::ElementRef::wrap) {
        match child.value().name() {
            "h1" | "emu-clause" | "emu-annex" | "emu-import" => continue,
            "emu-alg" => {
                let list = child
                    .children()
                    .filter_map(scraper::ElementRef::wrap)
                    .find(|inner| inner.value().name() == "ol");
                if let Some(list) = list {
                    steps = Some(algorithms::render_algorithm_ol(&list, converter));
                }
                continue;
            }
            "span" if child.value().attr("id").is_some() => {
                let span_text: String = child.text().collect();
                if span_text.trim().is_empty() {
                    continue;
                }
            }
            _ => {}
        }
        prose_html.push_str(&child.html());
    }

    let prose = markdown::element_to_markdown_from_html(&prose_html, converter);
    let prose = prose.trim();

    match steps {
        Some(steps) if prose.is_empty() => Some(steps),
        Some(steps) => Some(format!("{}\n\n{}", prose, steps)),
        None if prose.is_empty() => None,
        None => Some(prose.to_string()),
    }
}
/// Test helper: parses every `h2`–`h6` element that has an `id` in `html`
/// and returns the resulting sections with `content_text` cleared.
#[cfg(test)]
pub fn collect_headings(html: &str) -> Result<Vec<ParsedSection>> {
    let selector = Selector::parse("h2[id], h3[id], h4[id], h5[id], h6[id]")
        .map_err(|e| anyhow::anyhow!("Invalid selector: {:?}", e))?;
    let converter = crate::parse::markdown::build_converter("https://test.example.com");
    let document = Html::parse_document(html);

    let mut headings = Vec::new();
    for element in document.select(&selector) {
        if let Some(mut section) = parse_heading_element(&element, &converter)? {
            section.content_text = None;
            headings.push(section);
        }
    }
    Ok(headings)
}
/// Reports whether `element` sits inside the step list of an algorithm.
///
/// Only the nearest enclosing `<ol>` is examined. The answer is `true` when
/// either:
/// - that `<ol>` has a `<div>` ancestor with class `algorithm` or a
///   `data-algorithm` attribute; or
/// - walking backwards over the `<ol>`'s preceding element siblings, a `p`,
///   `dd` or `li` containing a `dfn[id]` is found before the walk is stopped
///   by a `p`, `div` or `h2`–`h6`.
///
/// Returns `false` when there is no enclosing `<ol>` or neither test passes.
fn is_inside_algorithm_content(element: &scraper::ElementRef) -> bool {
    let enclosing_list = element
        .ancestors()
        .filter_map(scraper::ElementRef::wrap)
        .find(|ancestor| ancestor.value().name() == "ol");
    let list = match enclosing_list {
        Some(list) => list,
        None => return false,
    };

    let within_algorithm_div = list
        .ancestors()
        .filter_map(scraper::ElementRef::wrap)
        .any(|ancestor| {
            ancestor.value().name() == "div"
                && (ancestor.value().classes().any(|class| class == "algorithm")
                    || ancestor.value().attr("data-algorithm").is_some())
        });
    if within_algorithm_div {
        return true;
    }

    for earlier in list.prev_siblings().filter_map(scraper::ElementRef::wrap) {
        let tag = earlier.value().name();

        if matches!(tag, "p" | "dd" | "li") {
            if let Ok(dfn_selector) = scraper::Selector::parse("dfn[id]") {
                if earlier.select(&dfn_selector).next().is_some() {
                    return true;
                }
            }
        }

        if matches!(tag, "p" | "div" | "h2" | "h3" | "h4" | "h5" | "h6") {
            break;
        }
    }

    false
}
/// Reports whether `element` should be treated as defining an algorithm.
///
/// Ancestors are visited from nearest to farthest. The answer is `true` when
/// an ancestor is a `<div>` with class `algorithm`, or when a `p`, `div`,
/// `dd` or `li` ancestor is followed by an `<ol>` element sibling before any
/// `p`, `div` or `h2`–`h6` sibling.
fn is_inside_algorithm_div(element: &scraper::ElementRef) -> bool {
    for ancestor in element.ancestors().filter_map(scraper::ElementRef::wrap) {
        let tag = ancestor.value().name();

        if tag == "div" && ancestor.value().classes().any(|class| class == "algorithm") {
            return true;
        }

        if !matches!(tag, "p" | "div" | "dd" | "li") {
            continue;
        }

        for sibling in ancestor
            .next_siblings()
            .filter_map(scraper::ElementRef::wrap)
        {
            match sibling.value().name() {
                "ol" => return true,
                "p" | "div" | "h2" | "h3" | "h4" | "h5" | "h6" => break,
                _ => {}
            }
        }
    }

    false
}
/// Reports whether the element's `data-dfn-type` names a top-level IDL
/// construct: `interface`, `dictionary`, `enum`, `callback`,
/// `callback interface` or `typedef`. A missing attribute yields `false`.
fn is_idl_type(element: &scraper::ElementRef) -> bool {
    element
        .value()
        .attr("data-dfn-type")
        .map_or(false, |dfn_type| {
            matches!(
                dfn_type,
                "interface" | "dictionary" | "enum" | "callback" | "callback interface" | "typedef"
            )
        })
}
/// Test helper: returns one `Idl` section (anchor and title only) for every
/// `dfn[id][data-dfn-type]` in `html` that `is_idl_type` accepts.
#[cfg(test)]
pub fn collect_idl(html: &str) -> Result<Vec<ParsedSection>> {
    let selector = Selector::parse("dfn[id][data-dfn-type]")
        .map_err(|e| anyhow::anyhow!("Invalid selector: {:?}", e))?;
    let document = Html::parse_document(html);

    let mut found = Vec::new();
    for dfn in document.select(&selector) {
        if !is_idl_type(&dfn) {
            continue;
        }

        let anchor = match dfn.value().attr("id") {
            Some(id) => id.to_string(),
            None => return Err(anyhow::anyhow!("IDL type missing id")),
        };
        let label: String = dfn.text().collect();
        let title = Some(label.trim())
            .filter(|text| !text.is_empty())
            .map(str::to_string);

        found.push(ParsedSection {
            anchor,
            title,
            content_text: None,
            section_type: SectionType::Idl,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: None,
        });
    }
    Ok(found)
}
/// Test helper: returns one `Algorithm` section (anchor and title only) for
/// every `dfn[id]` found under a `div.algorithm` in `html`.
#[cfg(test)]
pub fn collect_algorithms(html: &str) -> Result<Vec<ParsedSection>> {
    let selector = Selector::parse("div.algorithm dfn[id]")
        .map_err(|e| anyhow::anyhow!("Invalid selector: {:?}", e))?;
    let document = Html::parse_document(html);

    let mut found = Vec::new();
    for dfn in document.select(&selector) {
        let anchor = match dfn.value().attr("id") {
            Some(id) => id.to_string(),
            None => return Err(anyhow::anyhow!("Algorithm missing id")),
        };
        let label: String = dfn.text().collect();
        let title = Some(label.trim())
            .filter(|text| !text.is_empty())
            .map(str::to_string);

        found.push(ParsedSection {
            anchor,
            title,
            content_text: None,
            section_type: SectionType::Algorithm,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: None,
        });
    }
    Ok(found)
}
/// Test helper: returns one `Definition` section (anchor and title only) for
/// every `dfn[id]` in `html` that is neither reported by
/// `is_inside_algorithm_div` nor accepted by `is_idl_type`.
#[cfg(test)]
pub fn collect_definitions(html: &str) -> Result<Vec<ParsedSection>> {
    let selector =
        Selector::parse("dfn[id]").map_err(|e| anyhow::anyhow!("Invalid selector: {:?}", e))?;
    let document = Html::parse_document(html);

    let mut found = Vec::new();
    for dfn in document.select(&selector) {
        if is_inside_algorithm_div(&dfn) || is_idl_type(&dfn) {
            continue;
        }

        let anchor = match dfn.value().attr("id") {
            Some(id) => id.to_string(),
            None => return Err(anyhow::anyhow!("Definition missing id")),
        };
        let label: String = dfn.text().collect();
        let title = Some(label.trim())
            .filter(|text| !text.is_empty())
            .map(str::to_string);

        found.push(ParsedSection {
            anchor,
            title,
            content_text: None,
            section_type: SectionType::Definition,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: None,
        });
    }
    Ok(found)
}
/// Fills in `parent_anchor`, `prev_anchor` and `next_anchor` for a flat,
/// document-ordered list of sections and returns the list.
///
/// Linking rules:
/// - A section with a depth gets, as parent, the nearest earlier section
///   whose depth is strictly smaller.
/// - A section without a depth gets, as parent, the nearest earlier section
///   that has any depth.
/// - `prev_anchor` / `next_anchor` point to the nearest earlier / later
///   section with the same `depth` and the same `parent_anchor`.
///
/// A link for which no candidate exists keeps whatever value it had on input.
///
/// Both passes are linear in the number of sections: parents are tracked with
/// a stack of open headings and siblings with a last-seen map, instead of
/// rescanning the list for every section.
pub fn build_section_tree(mut sections: Vec<ParsedSection>) -> Vec<ParsedSection> {
    use std::collections::HashMap;

    // Pass 1: parents. `open_headings` holds (index, depth) of the headings
    // that can still adopt later sections; depths are strictly increasing from
    // bottom to top, and the top entry is always the most recent heading.
    let mut open_headings: Vec<(usize, u8)> = Vec::new();
    for i in 0..sections.len() {
        let depth = sections[i].depth;

        if let Some(depth) = depth {
            // Headings at the same or a deeper level can no longer be the
            // parent of this section or of anything after it.
            while matches!(open_headings.last(), Some(&(_, open)) if open >= depth) {
                open_headings.pop();
            }
        }

        if let Some(&(parent, _)) = open_headings.last() {
            let parent_anchor = sections[parent].anchor.clone();
            sections[i].parent_anchor = Some(parent_anchor);
        }

        if let Some(depth) = depth {
            open_headings.push((i, depth));
        }
    }

    // Pass 2: siblings. Two sections are siblings when (depth, parent_anchor)
    // match; consecutive sections sharing a key are each other's prev/next.
    let mut links: Vec<(usize, usize)> = Vec::new();
    {
        let mut last_seen: HashMap<(Option<u8>, Option<&str>), usize> = HashMap::new();
        for (i, section) in sections.iter().enumerate() {
            let key = (section.depth, section.parent_anchor.as_deref());
            if let Some(earlier) = last_seen.insert(key, i) {
                links.push((earlier, i));
            }
        }
    }
    for (earlier, later) in links {
        let earlier_anchor = sections[earlier].anchor.clone();
        let later_anchor = sections[later].anchor.clone();
        sections[later].prev_anchor = Some(earlier_anchor);
        sections[earlier].next_anchor = Some(later_anchor);
    }

    sections
}
#[cfg(test)]
mod tests {
use super::*;
// The `bikeshed_heading.html` fixture must yield exactly one depth-3 heading
// anchored at `trees`.
#[test]
fn test_bikeshed_heading_parsing() {
    let html = include_str!("../../tests/fixtures/headings/bikeshed_heading.html");
    let sections = collect_headings(html).unwrap();
    assert_eq!(sections.len(), 1);
    let section = &sections[0];
    assert_eq!(section.anchor, "trees");
    assert_eq!(section.title, Some("Trees".to_string()));
    assert_eq!(section.section_type, SectionType::Heading);
    assert_eq!(section.depth, Some(3));
}
// The `wattsi_heading.html` fixture must yield exactly one depth-3 heading
// anchored at `abstract`.
#[test]
fn test_wattsi_heading_parsing() {
    let html = include_str!("../../tests/fixtures/headings/wattsi_heading.html");
    let sections = collect_headings(html).unwrap();
    assert_eq!(sections.len(), 1);
    let section = &sections[0];
    assert_eq!(section.anchor, "abstract");
    assert_eq!(
        section.title,
        Some("Where does this specification fit?".to_string())
    );
    assert_eq!(section.section_type, SectionType::Heading);
    assert_eq!(section.depth, Some(3));
}
#[test]
fn test_multiple_heading_levels() {
let html = r#"
<h2 id="section-1">Section 1</h2>
<h3 id="section-1-1">Section 1.1</h3>
<h4 id="section-1-1-1">Section 1.1.1</h4>
<h2 id="section-2">Section 2</h2>
"#;
let sections = collect_headings(html).unwrap();
assert_eq!(sections.len(), 4);
assert_eq!(sections[0].anchor, "section-1");
assert_eq!(sections[0].depth, Some(2));
assert_eq!(sections[1].anchor, "section-1-1");
assert_eq!(sections[1].depth, Some(3));
assert_eq!(sections[2].anchor, "section-1-1-1");
assert_eq!(sections[2].depth, Some(4));
assert_eq!(sections[3].anchor, "section-2");
assert_eq!(sections[3].depth, Some(2));
}
#[test]
fn test_heading_without_id_ignored() {
let html = r#"
<h2 id="has-id">With ID</h2>
<h2>Without ID</h2>
"#;
let sections = collect_headings(html).unwrap();
assert_eq!(sections.len(), 1);
assert_eq!(sections[0].anchor, "has-id");
}
#[test]
fn test_build_section_tree_simple_nesting() {
let html = r#"
<h2 id="s1">Section 1</h2>
<h3 id="s1-1">Section 1.1</h3>
<h3 id="s1-2">Section 1.2</h3>
<h4 id="s1-2-1">Section 1.2.1</h4>
<h2 id="s2">Section 2</h2>
"#;
let sections = collect_headings(html).unwrap();
let tree = build_section_tree(sections);
assert_eq!(tree[0].parent_anchor, None);
assert_eq!(tree[0].prev_anchor, None);
assert_eq!(tree[0].next_anchor, Some("s2".to_string()));
assert_eq!(tree[1].parent_anchor, Some("s1".to_string()));
assert_eq!(tree[1].prev_anchor, None);
assert_eq!(tree[1].next_anchor, Some("s1-2".to_string()));
assert_eq!(tree[2].parent_anchor, Some("s1".to_string()));
assert_eq!(tree[2].prev_anchor, Some("s1-1".to_string()));
assert_eq!(tree[2].next_anchor, None);
assert_eq!(tree[3].parent_anchor, Some("s1-2".to_string()));
assert_eq!(tree[3].prev_anchor, None);
assert_eq!(tree[3].next_anchor, None);
assert_eq!(tree[4].parent_anchor, None);
assert_eq!(tree[4].prev_anchor, Some("s1".to_string()));
assert_eq!(tree[4].next_anchor, None);
}
#[test]
fn test_build_section_tree_flat_structure() {
let html = r#"
<h2 id="a">A</h2>
<h2 id="b">B</h2>
<h2 id="c">C</h2>
"#;
let sections = collect_headings(html).unwrap();
let tree = build_section_tree(sections);
assert_eq!(tree[0].parent_anchor, None);
assert_eq!(tree[0].prev_anchor, None);
assert_eq!(tree[0].next_anchor, Some("b".to_string()));
assert_eq!(tree[1].parent_anchor, None);
assert_eq!(tree[1].prev_anchor, Some("a".to_string()));
assert_eq!(tree[1].next_anchor, Some("c".to_string()));
assert_eq!(tree[2].parent_anchor, None);
assert_eq!(tree[2].prev_anchor, Some("b".to_string()));
assert_eq!(tree[2].next_anchor, None);
}
#[test]
fn test_build_section_tree_single_heading() {
let html = r#"<h2 id="only">Only Section</h2>"#;
let sections = collect_headings(html).unwrap();
let tree = build_section_tree(sections);
assert_eq!(tree.len(), 1);
assert_eq!(tree[0].parent_anchor, None);
assert_eq!(tree[0].prev_anchor, None);
assert_eq!(tree[0].next_anchor, None);
}
#[test]
fn test_build_section_tree_skip_levels() {
let html = r#"
<h2 id="top">Top</h2>
<h4 id="nested">Nested (skipped h3)</h4>
<h2 id="next">Next Top</h2>
"#;
let sections = collect_headings(html).unwrap();
let tree = build_section_tree(sections);
assert_eq!(tree[1].parent_anchor, Some("top".to_string()));
assert_eq!(tree[1].prev_anchor, None); assert_eq!(tree[1].next_anchor, None);
}
// The `bikeshed_definition.html` fixture must yield exactly one definition
// anchored at `concept-tree`, with no depth.
#[test]
fn test_bikeshed_definition_parsing() {
    let html = include_str!("../../tests/fixtures/definitions/bikeshed_definition.html");
    let sections = collect_definitions(html).unwrap();
    assert_eq!(sections.len(), 1);
    let section = &sections[0];
    assert_eq!(section.anchor, "concept-tree");
    assert_eq!(section.title, Some("tree".to_string()));
    assert_eq!(section.section_type, SectionType::Definition);
    assert_eq!(section.depth, None);
}
// The `wattsi_definition.html` fixture must yield exactly one definition
// anchored at `in-parallel`, with no depth.
#[test]
fn test_wattsi_definition_parsing() {
    let html = include_str!("../../tests/fixtures/definitions/wattsi_definition.html");
    let sections = collect_definitions(html).unwrap();
    assert_eq!(sections.len(), 1);
    let section = &sections[0];
    assert_eq!(section.anchor, "in-parallel");
    assert_eq!(section.title, Some("in parallel".to_string()));
    assert_eq!(section.section_type, SectionType::Definition);
    assert_eq!(section.depth, None);
}
// The `definition_with_code.html` fixture must yield exactly one definition
// anchored at `x-that` whose title is the plain text `createElement`.
#[test]
fn test_definition_with_code() {
    let html = include_str!("../../tests/fixtures/definitions/definition_with_code.html");
    let sections = collect_definitions(html).unwrap();
    assert_eq!(sections.len(), 1);
    let section = &sections[0];
    assert_eq!(section.anchor, "x-that");
    assert_eq!(section.title, Some("createElement".to_string()));
    assert_eq!(section.section_type, SectionType::Definition);
}
#[test]
fn test_definition_without_id_ignored() {
let html = r#"
<dfn id="has-id">With ID</dfn>
<dfn>Without ID</dfn>
"#;
let sections = collect_definitions(html).unwrap();
assert_eq!(sections.len(), 1);
assert_eq!(sections[0].anchor, "has-id");
}
#[test]
fn test_multiple_definitions() {
let html = r#"
<p>A <dfn id="def-1">first term</dfn> and a <dfn id="def-2">second term</dfn>.</p>
<p>Also a <dfn id="def-3">third term</dfn>.</p>
"#;
let sections = collect_definitions(html).unwrap();
assert_eq!(sections.len(), 3);
assert_eq!(sections[0].anchor, "def-1");
assert_eq!(sections[1].anchor, "def-2");
assert_eq!(sections[2].anchor, "def-3");
}
// The `bikeshed_algorithm.html` fixture must yield exactly one algorithm
// anchored at `concept-ordered-set-parser`, with no depth.
#[test]
fn test_bikeshed_algorithm_parsing() {
    let html = include_str!("../../tests/fixtures/algorithms/bikeshed_algorithm.html");
    let sections = collect_algorithms(html).unwrap();
    assert_eq!(sections.len(), 1);
    let section = &sections[0];
    assert_eq!(section.anchor, "concept-ordered-set-parser");
    assert_eq!(section.title, Some("ordered set parser".to_string()));
    assert_eq!(section.section_type, SectionType::Algorithm);
    assert_eq!(section.depth, None);
}
#[test]
fn test_algorithm_vs_definition_distinction() {
let html =
include_str!("../../tests/fixtures/algorithms/mixed_definitions_algorithms.html");
let algorithms = collect_algorithms(html).unwrap();
assert_eq!(algorithms.len(), 1);
assert_eq!(algorithms[0].anchor, "algorithm-def");
assert_eq!(algorithms[0].section_type, SectionType::Algorithm);
let definitions = collect_definitions(html).unwrap();
assert_eq!(definitions.len(), 2);
assert_eq!(definitions[0].anchor, "standalone-def");
assert_eq!(definitions[0].section_type, SectionType::Definition);
assert_eq!(definitions[1].anchor, "another-standalone");
assert_eq!(definitions[1].section_type, SectionType::Definition);
let def_anchors: Vec<_> = definitions.iter().map(|d| &d.anchor).collect();
assert!(!def_anchors.contains(&&"algorithm-def".to_string()));
}
#[test]
fn test_algorithm_without_dfn() {
let html = r#"
<div class="algorithm" data-algorithm="no dfn">
<p>This algorithm has no dfn element.</p>
<ol><li>Step 1</li></ol>
</div>
"#;
let sections = collect_algorithms(html).unwrap();
assert_eq!(sections.len(), 0); }
// The `interface.html` fixture must yield exactly one IDL section anchored at
// `event`, with no depth.
#[test]
fn test_idl_interface_parsing() {
    let html = include_str!("../../tests/fixtures/idl/interface.html");
    let sections = collect_idl(html).unwrap();
    assert_eq!(sections.len(), 1);
    let section = &sections[0];
    assert_eq!(section.anchor, "event");
    assert_eq!(section.title, Some("Event".to_string()));
    assert_eq!(section.section_type, SectionType::Idl);
    assert_eq!(section.depth, None);
}
// The `dictionary.html` fixture must yield exactly one IDL section anchored
// at `eventinit`, with no depth.
#[test]
fn test_idl_dictionary_parsing() {
    let html = include_str!("../../tests/fixtures/idl/dictionary.html");
    let sections = collect_idl(html).unwrap();
    assert_eq!(sections.len(), 1);
    let section = &sections[0];
    assert_eq!(section.anchor, "eventinit");
    assert_eq!(section.title, Some("EventInit".to_string()));
    assert_eq!(section.section_type, SectionType::Idl);
    assert_eq!(section.depth, None);
}
#[test]
fn test_idl_vs_definition_distinction() {
let html = include_str!("../../tests/fixtures/idl/mixed_idl_definitions.html");
let idl = collect_idl(html).unwrap();
assert_eq!(idl.len(), 2);
assert_eq!(idl[0].anchor, "myinterface");
assert_eq!(idl[0].section_type, SectionType::Idl);
assert_eq!(idl[1].anchor, "mydict");
assert_eq!(idl[1].section_type, SectionType::Idl);
let definitions = collect_definitions(html).unwrap();
assert_eq!(definitions.len(), 2);
assert_eq!(definitions[0].anchor, "regular-term");
assert_eq!(definitions[0].section_type, SectionType::Definition);
assert_eq!(definitions[1].anchor, "another-term");
assert_eq!(definitions[1].section_type, SectionType::Definition);
let def_anchors: Vec<_> = definitions.iter().map(|d| &d.anchor).collect();
assert!(!def_anchors.contains(&&"myinterface".to_string()));
assert!(!def_anchors.contains(&&"mydict".to_string()));
}
#[test]
fn test_idl_without_data_dfn_type_ignored() {
let html = r#"
<pre class="idl">
<dfn id="has-type" data-dfn-type="interface">WithType</dfn>
<dfn id="no-type">WithoutType</dfn>
</pre>
"#;
let sections = collect_idl(html).unwrap();
assert_eq!(sections.len(), 1);
assert_eq!(sections[0].anchor, "has-type");
}
#[test]
fn test_wattsi_algorithm_pattern() {
let html = include_str!("../../tests/fixtures/algorithms/wattsi_navigate.html");
let converter = crate::parse::markdown::build_converter("https://html.spec.whatwg.org");
let document = Html::parse_document(html);
let selector = Selector::parse("dfn[id]").unwrap();
let mut algorithms = Vec::new();
for element in document.select(&selector) {
if let Some(section) = parse_dfn_element(&element, &converter).unwrap() {
algorithms.push(section);
}
}
assert_eq!(algorithms.len(), 1, "Should detect one algorithm");
let algo = &algorithms[0];
assert_eq!(algo.anchor, "navigate");
assert_eq!(algo.title, Some("navigate".to_string()));
assert_eq!(
algo.section_type,
SectionType::Algorithm,
"Should be classified as Algorithm, not Definition"
);
let content = algo.content_text.as_ref().unwrap();
assert!(content.contains("navigate"), "Should include intro text");
assert!(content.contains("1. "), "Should include first step");
assert!(content.contains("2. "), "Should include second step");
assert!(
content.contains(" 1. "),
"Should include nested step with indentation"
);
}
#[test]
fn test_dfn_inside_algorithm_content_skipped() {
let html = r#"
<h2 id="algorithms">Algorithms</h2>
<p>To <dfn id="do-something">do something</dfn> with <var>input</var>:</p>
<ol>
<li><p>Let <var>result</var> be the result of calling <dfn id="helper">helper</dfn>.</p></li>
<li><p>Return <var>result</var>.</p></li>
</ol>
<p>The <dfn id="outside-def">outside definition</dfn> is separate.</p>
"#;
let converter = crate::parse::markdown::build_converter("https://test.example.com");
let document = Html::parse_document(html);
let selector = Selector::parse("dfn[id]").unwrap();
let mut sections = Vec::new();
for element in document.select(&selector) {
if let Some(section) = parse_dfn_element(&element, &converter).unwrap() {
sections.push(section);
}
}
assert_eq!(
sections.len(),
2,
"Should collect 2 sections (algorithm + outside def), not the helper inside <ol>"
);
let anchors: Vec<_> = sections.iter().map(|s| s.anchor.as_str()).collect();
assert!(
anchors.contains(&"do-something"),
"Should include the algorithm-defining dfn"
);
assert!(
anchors.contains(&"outside-def"),
"Should include the outside definition"
);
assert!(
!anchors.contains(&"helper"),
"Should NOT include dfn inside algorithm <ol>"
);
}
#[test]
fn test_dfn_inside_bikeshed_algorithm_content_skipped() {
let html = r#"
<h2 id="algorithms">Algorithms</h2>
<div class="algorithm">
<p>To <dfn id="process">process</dfn> the <var>data</var>:</p>
<ol>
<li><p>Let <var>x</var> be a new <dfn id="internal-thing">internal thing</dfn>.</p></li>
<li><p>Return <var>x</var>.</p></li>
</ol>
</div>
<p>A <dfn id="external-term">external term</dfn> here.</p>
"#;
let converter = crate::parse::markdown::build_converter("https://test.example.com");
let document = Html::parse_document(html);
let selector = Selector::parse("dfn[id]").unwrap();
let mut sections = Vec::new();
for element in document.select(&selector) {
if let Some(section) = parse_dfn_element(&element, &converter).unwrap() {
sections.push(section);
}
}
assert_eq!(
sections.len(),
2,
"Should collect 2 sections, not the internal-thing inside <ol>"
);
let anchors: Vec<_> = sections.iter().map(|s| s.anchor.as_str()).collect();
assert!(anchors.contains(&"process"));
assert!(anchors.contains(&"external-term"));
assert!(
!anchors.contains(&"internal-thing"),
"Should NOT include dfn inside algorithm <ol>"
);
}
#[test]
fn test_parameter_dfns_skipped() {
let html = r#"
<h2 id="algorithms">Algorithms</h2>
<p>To <dfn id="navigate">navigate</dfn> with <dfn data-dfn-for="navigate" id="param1"><var>url</var></dfn>
and <dfn id="param2"><var>options</var></dfn>:</p>
<ol>
<li><p>Do something.</p></li>
</ol>
<p>A standalone <dfn id="regular-def">definition</dfn>.</p>
"#;
let converter = crate::parse::markdown::build_converter("https://test.example.com");
let document = Html::parse_document(html);
let selector = Selector::parse("dfn[id]").unwrap();
let mut sections = Vec::new();
for element in document.select(&selector) {
if let Some(section) = parse_dfn_element(&element, &converter).unwrap() {
sections.push(section);
}
}
assert_eq!(
sections.len(),
2,
"Should collect 2 sections (algorithm + regular def)"
);
let anchors: Vec<_> = sections.iter().map(|s| s.anchor.as_str()).collect();
assert!(
anchors.contains(&"navigate"),
"Should include the algorithm"
);
assert!(
anchors.contains(&"regular-def"),
"Should include standalone definition"
);
assert!(
!anchors.contains(&"param1"),
"Should NOT include parameter dfn with data-dfn-for"
);
assert!(
!anchors.contains(&"param2"),
"Should NOT include parameter dfn containing <var>"
);
}
#[test]
fn test_property_dfns_with_dfn_for_and_dfn_type_kept() {
let html = r#"
<h2 id="trees">Trees</h2>
<p>An object that <dfn class="dfn-paneled" data-dfn-type="dfn" data-export id="concept-tree">participates</dfn>
in a tree has a <dfn class="dfn-paneled" data-dfn-for="tree" data-dfn-type="dfn" data-export id="concept-tree-parent">parent</dfn>,
which is either null or an object, and has
<dfn class="dfn-paneled" data-dfn-for="tree" data-dfn-type="dfn" data-export id="concept-tree-child">children</dfn>,
which is an ordered set of objects.</p>
"#;
let converter = crate::parse::markdown::build_converter("https://test.example.com");
let document = Html::parse_document(html);
let selector = Selector::parse("dfn[id]").unwrap();
let mut sections = Vec::new();
for element in document.select(&selector) {
if let Some(section) = parse_dfn_element(&element, &converter).unwrap() {
sections.push(section);
}
}
let anchors: Vec<_> = sections.iter().map(|s| s.anchor.as_str()).collect();
assert!(
anchors.contains(&"concept-tree"),
"Should include dfn without data-dfn-for"
);
assert!(
anchors.contains(&"concept-tree-parent"),
"Should include property dfn with data-dfn-for + data-dfn-type"
);
assert!(
anchors.contains(&"concept-tree-child"),
"Should include property dfn with data-dfn-for + data-dfn-type"
);
}
#[test]
/// Checks that `dfn` elements typed as `argument` produce no section, while the
/// interface, constructor, method and attribute dfns in the same IDL block do.
fn test_argument_dfns_skipped() {
    let html = r#"
<h2 id="api">API</h2>
<pre class="idl">
<dfn data-dfn-type="interface" id="audiodecoder"><code>AudioDecoder</code></dfn>
<dfn data-dfn-for="AudioDecoder" data-dfn-type="constructor" id="dom-audiodecoder-ctor"><code>AudioDecoder(init)</code></dfn>
<dfn data-dfn-for="AudioDecoder/AudioDecoder(init)" data-dfn-type="argument" id="dom-audiodecoder-ctor-init"><code>init</code></dfn>
<dfn data-dfn-for="AudioDecoder" data-dfn-type="method" id="dom-audiodecoder-configure"><code>configure(config)</code></dfn>
<dfn data-dfn-for="AudioDecoder/configure(config)" data-dfn-type="argument" id="dom-audiodecoder-configure-config"><code>config</code></dfn>
<dfn data-dfn-for="AudioDecoder" data-dfn-type="attribute" id="dom-audiodecoder-state"><code>state</code></dfn>
</pre>
"#;
    let converter = crate::parse::markdown::build_converter("https://test.example.com");
    let document = Html::parse_document(html);
    let dfn_selector = Selector::parse("dfn[id]").unwrap();

    // Keep only the dfns the parser turns into a section; a parse error fails the test.
    let parsed: Vec<_> = document
        .select(&dfn_selector)
        .filter_map(|dfn| parse_dfn_element(&dfn, &converter).unwrap())
        .collect();
    let has_anchor = |id: &str| parsed.iter().any(|section| section.anchor.as_str() == id);

    // Members of the interface are kept.
    assert!(has_anchor("audiodecoder"), "Interface should be kept");
    assert!(has_anchor("dom-audiodecoder-ctor"), "Constructor should be kept");
    assert!(has_anchor("dom-audiodecoder-configure"), "Method should be kept");
    assert!(has_anchor("dom-audiodecoder-state"), "Attribute should be kept");

    // Per-argument definitions are dropped.
    assert!(
        !has_anchor("dom-audiodecoder-ctor-init"),
        "Argument should be skipped"
    );
    assert!(
        !has_anchor("dom-audiodecoder-configure-config"),
        "Argument should be skipped"
    );
}
#[test]
/// A plain `emu-clause` with a single-component secnum and only prose is parsed
/// as a depth-2 `Heading` section whose content carries the paragraph text.
fn test_emu_clause_prose_section() {
    let html = r#"
<emu-clause id="sec-overview">
<h1><span class="secnum">4</span> Overview</h1>
<p>This section contains a non-normative overview of the ECMAScript language.</p>
</emu-clause>
"#;
    let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
    let document = Html::parse_document(html);
    let clause_selector = Selector::parse("emu-clause[id]").unwrap();
    let clause = document.select(&clause_selector).next().unwrap();
    let parsed = parse_emu_clause_element(&clause, &converter)
        .unwrap()
        .unwrap();

    assert_eq!(parsed.anchor, "sec-overview");
    // The secnum span must not leak into the title.
    assert_eq!(parsed.title.as_deref(), Some("Overview"));
    assert_eq!(parsed.depth, Some(2));
    assert_eq!(parsed.section_type, SectionType::Heading);

    let body = parsed.content_text.as_deref();
    assert!(body.is_some());
    assert!(body.unwrap().contains("non-normative overview"));
}
#[test]
/// An `emu-clause` containing an `emu-alg` list is classified as an `Algorithm`
/// section, and its content keeps both the intro prose and the numbered steps.
fn test_emu_clause_algorithm_section() {
    let html = r#"
<emu-clause id="sec-tostring" type="abstract operation" aoid="ToString">
<h1><span class="secnum">7.1.17</span> ToString ( <var>argument</var> )</h1>
<p>The abstract operation ToString converts argument to a String.</p>
<emu-alg>
<ol>
<li>If <var>argument</var> is a String, return <var>argument</var>.</li>
<li>If <var>argument</var> is <emu-val>undefined</emu-val>, return "undefined".</li>
<li>If <var>argument</var> is <emu-val>null</emu-val>, return "null".</li>
</ol>
</emu-alg>
</emu-clause>
"#;
    let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
    let document = Html::parse_document(html);
    let clause_selector = Selector::parse("emu-clause[id]").unwrap();
    let clause = document.select(&clause_selector).next().unwrap();
    let parsed = parse_emu_clause_element(&clause, &converter)
        .unwrap()
        .unwrap();

    assert_eq!(parsed.anchor, "sec-tostring");
    assert_eq!(parsed.title.as_deref(), Some("ToString ( argument )"));
    // Three-component secnum "7.1.17" is expected at depth 4.
    assert_eq!(parsed.depth, Some(4));
    assert_eq!(parsed.section_type, SectionType::Algorithm);

    let body = parsed.content_text.as_deref().unwrap();
    assert!(body.contains("converts argument"), "Should have intro prose");
    assert!(body.contains("1."), "Should have algorithm steps");
}
#[test]
/// A nested `emu-clause` is emitted as its own section, and its text must not be
/// duplicated inside the content of the enclosing clause.
fn test_emu_clause_nested_sections_excluded_from_content() {
    let html = r#"
<emu-clause id="sec-parent">
<h1><span class="secnum">23</span> Parent Section</h1>
<p>Intro text for the parent.</p>
<emu-clause id="sec-child">
<h1><span class="secnum">23.1</span> Child Section</h1>
<p>This should NOT appear in parent content.</p>
</emu-clause>
</emu-clause>
"#;
    let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
    let document = Html::parse_document(html);
    let clause_selector = Selector::parse("emu-clause[id]").unwrap();

    // Both the outer and the inner clause match the selector, in that order.
    let parsed: Vec<_> = document
        .select(&clause_selector)
        .filter_map(|clause| parse_emu_clause_element(&clause, &converter).unwrap())
        .collect();

    assert_eq!(parsed.len(), 2);
    let anchors: Vec<&str> = parsed.iter().map(|section| section.anchor.as_str()).collect();
    assert_eq!(anchors, ["sec-parent", "sec-child"]);

    let parent_body = parsed[0].content_text.as_deref().unwrap();
    assert!(parent_body.contains("Intro text"));
    assert!(!parent_body.contains("should NOT appear"));
}
#[test]
/// Table-driven check of the depth reported by `extract_secnum_depth` for numeric
/// and annex-style (letter-prefixed) section numbers.
fn test_secnum_depth_derivation() {
    // Wraps a section number in a minimal `<h1>` and runs depth extraction on it.
    fn depth_from_html(secnum: &str) -> Option<u8> {
        let markup = format!(r#"<h1><span class="secnum">{}</span> Title</h1>"#, secnum);
        let document = Html::parse_document(&markup);
        let h1_selector = Selector::parse("h1").unwrap();
        let heading = document.select(&h1_selector).next().unwrap();
        extract_secnum_depth(&heading)
    }

    // (section number, expected depth): each extra dotted component adds one level.
    let cases: [(&str, u8); 7] = [
        ("4", 2),
        ("4.3", 3),
        ("7.1.17", 4),
        ("23.1.3.30", 5),
        ("A", 2),
        ("A.1", 3),
        ("A.1.2", 4),
    ];
    for &(secnum, expected) in cases.iter() {
        assert_eq!(depth_from_html(secnum), Some(expected));
    }
}
#[test]
/// The `secnum` span inside the clause heading must be removed from the parsed
/// title, leaving the operation name intact.
fn test_emu_clause_secnum_stripped_from_title() {
    let html = r#"
<emu-clause id="sec-test">
<h1><span class="secnum">7.1.17</span> ToString ( <var>argument</var> )</h1>
</emu-clause>
"#;
    let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
    let document = Html::parse_document(html);
    let clause_selector = Selector::parse("emu-clause[id]").unwrap();
    let clause = document.select(&clause_selector).next().unwrap();
    let parsed = parse_emu_clause_element(&clause, &converter)
        .unwrap()
        .unwrap();

    let title = parsed.title.unwrap();
    let leaks_secnum = title.contains("7.1.17");
    let keeps_name = title.contains("ToString");
    assert!(!leaks_secnum, "secnum should be stripped: {}", title);
    assert!(keeps_name, "Title should have function name: {}", title);
}
#[test]
/// `emu-annex` elements go through the same parser as `emu-clause`; a
/// single-letter secnum ("B") is expected to yield depth 2.
fn test_emu_annex_parsed() {
    let html = r#"
<emu-annex id="sec-additional-built-in-properties">
<h1><span class="secnum">B</span> Additional Built-in Properties</h1>
<p>Annex content here.</p>
</emu-annex>
"#;
    let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
    let document = Html::parse_document(html);
    let annex_selector = Selector::parse("emu-annex[id]").unwrap();
    let annex = document.select(&annex_selector).next().unwrap();
    let parsed = parse_emu_clause_element(&annex, &converter)
        .unwrap()
        .unwrap();

    assert_eq!(parsed.anchor, "sec-additional-built-in-properties");
    assert_eq!(
        parsed.title.as_deref(),
        Some("Additional Built-in Properties")
    );
    assert_eq!(parsed.depth, Some(2));
}
#[test]
/// Parses the real ecmarkup `ToString` fixture and checks the section metadata
/// plus the markdown rendering of its intro prose and algorithm steps
/// (`var` as italics, `emu-xref` as inline links, `emu-val` as plain text).
fn test_ecmarkup_fixture_tostring_algorithm() {
    let html = include_str!("../../tests/fixtures/ecmarkup/tostring.html");
    let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
    let document = Html::parse_document(html);
    let selector = Selector::parse("emu-clause[id]").unwrap();
    let element = document.select(&selector).next().unwrap();
    let section = parse_emu_clause_element(&element, &converter)
        .unwrap()
        .unwrap();

    assert_eq!(section.anchor, "sec-tostring");
    assert_eq!(section.title, Some("ToString ( argument )".to_string()));
    assert_eq!(section.depth, Some(4));
    assert_eq!(section.section_type, SectionType::Algorithm);

    let content = section.content_text.as_ref().unwrap();

    // Preview for the failure message. This is taken by chars rather than with
    // `&content[..200]`: the byte slice panics when the content is shorter than
    // 200 bytes or byte 200 falls inside a multi-byte character, and that panic
    // would replace the assertion message exactly when it is needed.
    let preview: String = content.chars().take(200).collect();
    assert!(
        content.contains("The abstract operation ToString takes argument *argument*"),
        "Intro should have italic var: {}",
        preview
    );
    assert!(
        content.contains("[ECMAScript language value](https://tc39.es/ecma262#sec-ecmascript-language-types)"),
        "emu-xref links should be inline markdown links"
    );
    assert!(
        content.contains("1. If *argument* [is a String]("),
        "Step 1 should be on a single line with inline link"
    );
    assert!(
        content.contains("2. If *argument* [is a Symbol]("),
        "Step 2 should follow immediately"
    );
    assert!(
        content.contains("3. If *argument* is undefined, return \"undefined\"."),
        "Step 3: emu-val should render inline"
    );
    assert!(
        content.contains("10. Let *primValue* be ?"),
        "Step 10 should have var and link inline"
    );
    assert!(
        content.contains("10. Let *primValue*") && content.contains("[ToPrimitive]("),
        "Step 10 should have ToPrimitive link"
    );
    assert!(
        content.contains("12. Return ?") && content.contains("[ToString]("),
        "Step 12 should have recursive call"
    );

    // Every step number 1..=12 must start some line (ignoring leading indentation).
    for i in 1..=12 {
        let spaced = format!("{}. ", i);
        let bare = format!("{}.", i);
        let on_own_line = content.lines().any(|line| {
            let trimmed = line.trim_start();
            // Two-digit step numbers are also accepted without the trailing space.
            trimmed.starts_with(&spaced) || (i >= 10 && trimmed.starts_with(&bare))
        });
        assert!(on_own_line, "Step {} should appear on its own line", i);
    }
}
#[test]
/// Parses the "Undefined Type" ecmarkup fixture and checks that `emu-val`
/// elements are rendered inline, keeping the short prose to at most two lines.
fn test_ecmarkup_fixture_undefined_type_prose() {
    let html = include_str!("../../tests/fixtures/ecmarkup/undefined_type.html");
    let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
    let document = Html::parse_document(html);
    let clause_selector = Selector::parse("emu-clause[id]").unwrap();
    let clause = document.select(&clause_selector).next().unwrap();
    let parsed = parse_emu_clause_element(&clause, &converter)
        .unwrap()
        .unwrap();

    assert_eq!(
        parsed.anchor,
        "sec-ecmascript-language-types-undefined-type"
    );
    assert_eq!(parsed.title.as_deref(), Some("The Undefined Type"));
    assert_eq!(parsed.depth, Some(4));
    assert_eq!(parsed.section_type, SectionType::Heading);

    let body = parsed.content_text.as_deref().unwrap();
    assert!(
        body.contains("The Undefined type has exactly one value, called undefined."),
        "emu-val should render inline as plain text: {}",
        body
    );
    assert!(
        body.contains("the value undefined."),
        "Second emu-val should also be inline"
    );

    // Inline rendering means no extra line breaks around the emu-val elements.
    let line_count = body.lines().count();
    assert!(
        line_count <= 2,
        "Simple prose should be 1-2 lines, got {}: {}",
        line_count,
        body
    );
}
}