use crate::model::{ParsedReference, ParsedSection, SectionType};
use crate::spec_registry::SpecRegistry;
use scraper::Html;
pub fn extract_references(
html: &str,
spec_name: &str,
sections: &[ParsedSection],
registry: &SpecRegistry,
) -> Vec<ParsedReference> {
let document = Html::parse_document(html);
let scope_anchors: std::collections::HashSet<&str> = sections
.iter()
.filter(|s| {
matches!(
s.section_type,
SectionType::Heading | SectionType::Algorithm
)
})
.map(|s| s.anchor.as_str())
.collect();
let mut seen = std::collections::HashSet::new();
let mut references = Vec::new();
let mut current_section: Option<String> = None;
for node_ref in document.root_element().descendants() {
let Some(elem) = scraper::ElementRef::wrap(node_ref) else {
continue;
};
if let Some(id) = elem.value().attr("id") {
if scope_anchors.contains(id) {
current_section = Some(id.to_string());
}
}
if elem.value().name() == "a" {
if let Some(href) = elem.value().attr("href") {
if is_self_link(&elem) || is_biblio_ref(&elem) {
continue;
}
if let Some(ref section) = current_section {
if let Some(mut parsed_ref) = parse_href(href, section, registry) {
if parsed_ref.to_spec == "self" {
parsed_ref.to_spec = spec_name.to_string();
}
let key = (
parsed_ref.from_anchor.clone(),
parsed_ref.to_spec.clone(),
parsed_ref.to_anchor.clone(),
);
if seen.insert(key) {
references.push(parsed_ref);
}
}
}
}
}
}
references
}
/// Returns `true` when `link` carries the `self-link` class.
///
/// `extract_references` skips such links instead of recording them as
/// references.
fn is_self_link(link: &scraper::ElementRef) -> bool {
    // Scan the class list lazily; there is no need to materialise it in a Vec.
    link.value().classes().any(|class| class == "self-link")
}
/// Returns `true` when `link` has the attribute `data-link-type="biblio"`.
///
/// A missing attribute, or any other value, yields `false`.
fn is_biblio_ref(link: &scraper::ElementRef) -> bool {
    matches!(link.value().attr("data-link-type"), Some("biblio"))
}
/// Turns a raw `href` value into a reference originating at `from_anchor`.
///
/// * An `href` starting with `#` is a same-document link: `to_spec` is the
///   placeholder `"self"` and `to_anchor` is the `href` with all leading `#`
///   characters removed.
/// * An `http://` or `https://` `href` is resolved through
///   `registry.resolve_url`; the reference is produced only when the registry
///   returns a spec name and anchor for it.
/// * Anything else yields `None`.
fn parse_href(href: &str, from_anchor: &str, registry: &SpecRegistry) -> Option<ParsedReference> {
    let (to_spec, to_anchor) = if href.starts_with('#') {
        (
            "self".to_string(),
            href.trim_start_matches('#').to_string(),
        )
    } else if href.starts_with("http://") || href.starts_with("https://") {
        // URLs the registry does not recognise are not references.
        registry.resolve_url(href)?
    } else {
        return None;
    };

    Some(ParsedReference {
        from_anchor: from_anchor.to_string(),
        to_spec,
        to_anchor,
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    // Every test feeds a small HTML fixture plus a hand-built section list to
    // `extract_references` and checks the returned references. The raw-string
    // fixtures are intentionally left unindented so their contents are exact.

    // A `#fragment` link is reported against the spec name passed in.
    #[test]
    fn test_intra_spec_reference() {
        let html = r##"
<h2 id="section1">Section 1</h2>
<p>See <a href="#section2">Section 2</a> for details.</p>
<h2 id="section2">Section 2</h2>
<p>Content here.</p>
"##;
        let sections = vec![
            ParsedSection {
                anchor: "section1".to_string(),
                title: Some("Section 1".to_string()),
                content_text: None,
                section_type: SectionType::Heading,
                parent_anchor: None,
                prev_anchor: None,
                next_anchor: None,
                depth: Some(2),
            },
            ParsedSection {
                anchor: "section2".to_string(),
                title: Some("Section 2".to_string()),
                content_text: None,
                section_type: SectionType::Heading,
                parent_anchor: None,
                prev_anchor: None,
                next_anchor: None,
                depth: Some(2),
            },
        ];
        let registry = SpecRegistry::new();
        let refs = extract_references(html, "TEST", &sections, &registry);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].from_anchor, "section1");
        assert_eq!(refs[0].to_spec, "TEST");
        assert_eq!(refs[0].to_anchor, "section2");
    }

    // Links with the `self-link` class are not references.
    #[test]
    fn test_skip_self_links() {
        let html = r##"
<h2 id="section1">Section 1<a class="self-link" href="#section1"></a></h2>
<p>Content here.</p>
"##;
        let sections = vec![ParsedSection {
            anchor: "section1".to_string(),
            title: Some("Section 1".to_string()),
            content_text: None,
            section_type: SectionType::Heading,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: Some(2),
        }];
        let registry = SpecRegistry::new();
        let refs = extract_references(html, "TEST", &sections, &registry);
        assert_eq!(refs.len(), 0);
    }

    // Links marked `data-link-type="biblio"` are not references.
    #[test]
    fn test_skip_biblio_refs() {
        let html = r##"
<h2 id="section1">Section 1</h2>
<p>See <a data-link-type="biblio" href="#biblio-infra">[INFRA]</a>.</p>
"##;
        let sections = vec![ParsedSection {
            anchor: "section1".to_string(),
            title: Some("Section 1".to_string()),
            content_text: None,
            section_type: SectionType::Heading,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: Some(2),
        }];
        let registry = SpecRegistry::new();
        let refs = extract_references(html, "TEST", &sections, &registry);
        assert_eq!(refs.len(), 0);
    }

    // An absolute URL known to the registry resolves to that spec and anchor.
    #[test]
    fn test_cross_spec_reference() {
        let html = r##"
<h2 id="section1">Section 1</h2>
<p>See <a href="https://dom.spec.whatwg.org/#concept-tree">tree</a>.</p>
"##;
        let sections = vec![ParsedSection {
            anchor: "section1".to_string(),
            title: Some("Section 1".to_string()),
            content_text: None,
            section_type: SectionType::Heading,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: Some(2),
        }];
        let registry = SpecRegistry::new();
        let refs = extract_references(html, "TEST", &sections, &registry);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].from_anchor, "section1");
        assert_eq!(refs[0].to_spec, "DOM");
        assert_eq!(refs[0].to_anchor, "concept-tree");
    }

    // An absolute URL the registry cannot resolve is dropped.
    #[test]
    fn test_unknown_url_skipped() {
        let html = r##"
<h2 id="section1">Section 1</h2>
<p>See <a href="https://example.com/foo">external link</a>.</p>
"##;
        let sections = vec![ParsedSection {
            anchor: "section1".to_string(),
            title: Some("Section 1".to_string()),
            content_text: None,
            section_type: SectionType::Heading,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: Some(2),
        }];
        let registry = SpecRegistry::new();
        let refs = extract_references(html, "TEST", &sections, &registry);
        assert_eq!(refs.len(), 0);
    }

    // A link is attributed to the most recently seen in-scope anchor.
    #[test]
    fn test_nested_sections() {
        let html = r##"
<h2 id="parent">Parent</h2>
<div>
<h3 id="child">Child</h3>
<p>See <a href="#parent">parent section</a>.</p>
</div>
"##;
        let sections = vec![
            ParsedSection {
                anchor: "parent".to_string(),
                title: Some("Parent".to_string()),
                content_text: None,
                section_type: SectionType::Heading,
                parent_anchor: None,
                prev_anchor: None,
                next_anchor: None,
                depth: Some(2),
            },
            ParsedSection {
                anchor: "child".to_string(),
                title: Some("Child".to_string()),
                content_text: None,
                section_type: SectionType::Heading,
                parent_anchor: Some("parent".to_string()),
                prev_anchor: None,
                next_anchor: None,
                depth: Some(3),
            },
        ];
        let registry = SpecRegistry::new();
        let refs = extract_references(html, "TEST", &sections, &registry);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].from_anchor, "child");
        assert_eq!(refs[0].to_anchor, "parent");
    }

    // Links following an algorithm's `<dfn>` are attributed to that algorithm,
    // for both same-document and cross-spec targets.
    #[test]
    fn test_wattsi_algorithm_references() {
        let html = r##"
<p>To <dfn id="navigate">navigate</dfn> a <a href="#navigable">navigable</a> to a
<a href="https://url.spec.whatwg.org/#concept-url">URL</a>, with optional <a href="#post-resource">POST resource</a>:</p>
<ol>
<li><p>Let x be <a href="#snapshotting-params">snapshotting params</a>.</p></li>
<li><p><a href="https://infra.spec.whatwg.org/#assert">Assert</a>: foo.</p></li>
</ol>
"##;
        let sections = vec![ParsedSection {
            anchor: "navigate".to_string(),
            title: Some("navigate".to_string()),
            content_text: None,
            section_type: SectionType::Algorithm,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: None,
        }];
        let registry = SpecRegistry::new();
        let refs = extract_references(html, "TEST", &sections, &registry);
        assert_eq!(refs.len(), 5, "Expected 5 references, got {}", refs.len());
        for ref_item in &refs {
            assert_eq!(ref_item.from_anchor, "navigate");
        }
        let intra: Vec<_> = refs
            .iter()
            .filter(|r| r.to_spec == "TEST")
            .map(|r| r.to_anchor.as_str())
            .collect();
        assert!(intra.contains(&"navigable"));
        assert!(intra.contains(&"post-resource"));
        assert!(intra.contains(&"snapshotting-params"));
        assert!(refs
            .iter()
            .any(|r| r.to_spec == "URL" && r.to_anchor == "concept-url"));
        assert!(refs
            .iter()
            .any(|r| r.to_spec == "INFRA" && r.to_anchor == "assert"));
    }

    // `Definition` sections are not in scope, so parameter `<dfn>`s inside an
    // algorithm must not take over attribution from the algorithm itself.
    #[test]
    fn test_algorithm_with_parameter_dfns() {
        let html = r##"
<div data-algorithm="">
<p>To <dfn id="navigate">navigate</dfn> a <a href="#navigable">navigable</a>
using <dfn id="navigation-resource">documentResource</dfn> and
<dfn id="navigation-response">response</dfn>:</p>
<ol>
<li><p><a href="#assert">Assert</a>: stuff.</p></li>
<li><p>Let x be <a href="#snapshot">snapshot</a>.</p></li>
</ol>
</div>
"##;
        let sections = vec![
            ParsedSection {
                anchor: "navigate".to_string(),
                title: Some("navigate".to_string()),
                content_text: None,
                section_type: SectionType::Algorithm,
                parent_anchor: None,
                prev_anchor: None,
                next_anchor: None,
                depth: None,
            },
            ParsedSection {
                anchor: "navigation-resource".to_string(),
                title: Some("documentResource".to_string()),
                content_text: None,
                section_type: SectionType::Definition,
                parent_anchor: Some("navigate".to_string()),
                prev_anchor: None,
                next_anchor: None,
                depth: None,
            },
            ParsedSection {
                anchor: "navigation-response".to_string(),
                title: Some("response".to_string()),
                content_text: None,
                section_type: SectionType::Definition,
                parent_anchor: Some("navigate".to_string()),
                prev_anchor: None,
                next_anchor: None,
                depth: None,
            },
        ];
        let registry = SpecRegistry::new();
        let refs = extract_references(html, "TEST", &sections, &registry);
        assert_eq!(refs.len(), 3, "Expected 3 references, got {}", refs.len());
        for ref_item in &refs {
            assert_eq!(
                ref_item.from_anchor, "navigate",
                "Link to {} should be from navigate, not {}",
                ref_item.to_anchor, ref_item.from_anchor
            );
        }
    }

    // CSSWG draft and W3C GitHub URLs resolve through the registry.
    #[test]
    fn test_cross_spec_reference_to_w3c() {
        let html = r##"
<h2 id="section1">Section 1</h2>
<p>See <a href="https://drafts.csswg.org/selectors-4/#specificity">specificity</a>.</p>
<p>Also <a href="https://w3c.github.io/ServiceWorker/#service-worker-concept">SW</a>.</p>
"##;
        let sections = vec![ParsedSection {
            anchor: "section1".to_string(),
            title: Some("Section 1".to_string()),
            content_text: None,
            section_type: SectionType::Heading,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: Some(2),
        }];
        let registry = SpecRegistry::new();
        let refs = extract_references(html, "TEST", &sections, &registry);
        assert_eq!(refs.len(), 2);
        assert!(refs
            .iter()
            .any(|r| r.to_spec == "CSS-SELECTORS" && r.to_anchor == "specificity"));
        assert!(refs
            .iter()
            .any(|r| r.to_spec == "SERVICE-WORKERS" && r.to_anchor == "service-worker-concept"));
    }

    // A tc39.es URL resolves through the registry.
    #[test]
    fn test_cross_spec_reference_to_tc39() {
        let html = r##"
<h2 id="section1">Section 1</h2>
<p>Call <a href="https://tc39.es/ecma262/#sec-tostring">ToString</a>.</p>
"##;
        let sections = vec![ParsedSection {
            anchor: "section1".to_string(),
            title: Some("Section 1".to_string()),
            content_text: None,
            section_type: SectionType::Heading,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: Some(2),
        }];
        let registry = SpecRegistry::new();
        let refs = extract_references(html, "TEST", &sections, &registry);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].to_spec, "ECMA-262");
        assert_eq!(refs[0].to_anchor, "sec-tostring");
    }

    // Two links with the same source and target yield a single reference.
    #[test]
    fn test_duplicate_refs_deduplicated() {
        let html = r##"
<h2 id="section1">Section 1</h2>
<p>See <a href="#target">target</a> and also <a href="#target">target again</a>.</p>
"##;
        let sections = vec![ParsedSection {
            anchor: "section1".to_string(),
            title: Some("Section 1".to_string()),
            content_text: None,
            section_type: SectionType::Heading,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: Some(2),
        }];
        let registry = SpecRegistry::new();
        let refs = extract_references(html, "TEST", &sections, &registry);
        assert_eq!(refs.len(), 1, "Duplicate ref should be deduplicated");
        assert_eq!(refs[0].to_anchor, "target");
    }
}