pub mod algorithms;
pub mod idl;
pub mod idl_defs;
pub mod markdown;
pub mod references;
pub mod sections;
use crate::model::{ParsedSection, ParsedSpec, SectionType};
use anyhow::Result;
use htmd::HtmlToMarkdown;
use scraper::{Html, Selector};
/// Reports whether `document` contains an `<html>` element carrying the `RFC` class.
///
/// A selector that fails to parse is treated the same as "no match".
fn is_ietf_html(document: &Html) -> bool {
    match Selector::parse("html.RFC") {
        Ok(rfc_root) => document.select(&rfc_root).next().is_some(),
        Err(_) => false,
    }
}
/// Derives a section title from an RFC heading element.
///
/// The trimmed text of the first `a.section-name` descendant is preferred; if
/// there is none, or it is blank, the trimmed text of the whole heading is used
/// instead. Returns `None` when both are blank, or when the selector itself
/// cannot be parsed.
fn ietf_extract_title(heading: &scraper::ElementRef) -> Option<String> {
    let name_sel = Selector::parse("a.section-name").ok()?;

    // Preferred source: the dedicated section-name anchor.
    let from_anchor = heading
        .select(&name_sel)
        .next()
        .map(|anchor| anchor.text().collect::<String>().trim().to_string())
        .filter(|name| !name.is_empty());

    // Fallback: every text node of the heading, concatenated and trimmed.
    from_anchor.or_else(|| {
        let whole = heading.text().collect::<String>().trim().to_string();
        (!whole.is_empty()).then_some(whole)
    })
}
/// Converts the body of a `<section>` to Markdown, leaving out its heading and
/// any nested sections.
///
/// Only direct element children are considered; children named `section` or
/// `h2`–`h6` are dropped. Returns `None` when nothing remains, or when the
/// converted Markdown is blank after trimming.
fn extract_ietf_prose(section: &scraper::ElementRef, converter: &HtmlToMarkdown) -> Option<String> {
    // Serialise the surviving children back to HTML, in document order.
    let content_html: String = section
        .children()
        .filter_map(scraper::ElementRef::wrap)
        .filter(|child| {
            !matches!(
                child.value().name(),
                "section" | "h2" | "h3" | "h4" | "h5" | "h6"
            )
        })
        .map(|child| child.html())
        .collect();

    if content_html.trim().is_empty() {
        return None;
    }

    let md = markdown::element_to_markdown_from_html(&content_html, converter);
    match md.trim() {
        "" => None,
        body => Some(body.to_string()),
    }
}
/// Parses an RFC-style HTML document into a flat, document-ordered list of
/// heading sections.
///
/// Every `<section id="…">` whose id starts with `section-` or `appendix-` is
/// considered, except ids starting with `section-boilerplate` or `section-toc`.
/// A section without any `h2`–`h6` heading is skipped. The section id becomes
/// the anchor, the heading level becomes the depth, and the section's own prose
/// (see `extract_ietf_prose`) becomes the content.
///
/// `parent_anchor`, `prev_anchor` and `next_anchor` are left as `None` here.
///
/// # Errors
///
/// Returns an error if either CSS selector fails to parse.
fn parse_ietf_html(document: &Html, converter: &HtmlToMarkdown) -> Result<Vec<ParsedSection>> {
    let section_sel =
        Selector::parse("section[id]").map_err(|e| anyhow::anyhow!("Selector error: {:?}", e))?;
    let heading_sel = Selector::parse("h2, h3, h4, h5, h6")
        .map_err(|e| anyhow::anyhow!("Selector error: {:?}", e))?;

    let mut parsed = Vec::new();
    for section_elem in document.select(&section_sel) {
        let Some(section_id) = section_elem.value().attr("id") else {
            continue;
        };

        // Keep only numbered sections and appendices.
        if !section_id.starts_with("section-") && !section_id.starts_with("appendix-") {
            continue;
        }
        // Boilerplate and table-of-contents sections share the `section-`
        // prefix but are not wanted in the output.
        if section_id.starts_with("section-boilerplate") || section_id.starts_with("section-toc") {
            continue;
        }

        // NOTE(review): this takes the first matching heading anywhere below
        // the section, so a section lacking its own heading would pick up a
        // nested subsection's heading — confirm that cannot occur in the input.
        let Some(heading) = section_elem.select(&heading_sel).next() else {
            continue;
        };

        let depth = match heading.value().name() {
            "h2" => 2u8,
            "h3" => 3,
            "h4" => 4,
            "h5" => 5,
            "h6" => 6,
            // The selector only yields h2–h6; fall back to the top level.
            _ => 2,
        };

        let title = ietf_extract_title(&heading);
        let content_text = extract_ietf_prose(&section_elem, converter);

        parsed.push(ParsedSection {
            anchor: section_id.to_string(),
            title,
            content_text,
            section_type: SectionType::Heading,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: Some(depth),
        });
    }

    Ok(parsed)
}
/// Collects sections from a non-RFC spec by visiting every id-bearing element
/// of interest and dispatching on its tag name.
///
/// Headings, `dfn`s, `emu-clause`/`emu-annex` elements and generic anchor
/// carriers (`tr`, `dt`, `section`, `li`) each go to their own parser in the
/// `sections` module. A `dfn` nested inside an `emu-clause` or `emu-annex` is
/// skipped. Elements for which a parser returns `None` contribute nothing.
///
/// # Errors
///
/// Returns an error if the selector fails to parse or any element parser fails.
fn parse_generic_html(document: &Html, converter: &HtmlToMarkdown) -> Result<Vec<ParsedSection>> {
    let anchored = Selector::parse(
        "h2[id], h3[id], h4[id], h5[id], h6[id], dfn[id], emu-clause[id], emu-annex[id], tr[id], dt[id], section[id], li[id]",
    )
    .map_err(|e| anyhow::anyhow!("Invalid selector: {:?}", e))?;

    let mut collected = Vec::new();
    for node in document.select(&anchored) {
        let maybe_section = match node.value().name() {
            "h2" | "h3" | "h4" | "h5" | "h6" => sections::parse_heading_element(&node, converter)?,
            "dfn" if is_inside_emu_clause(&node) => None,
            "dfn" => sections::parse_dfn_element(&node, converter)?,
            "emu-clause" | "emu-annex" => sections::parse_emu_clause_element(&node, converter)?,
            "tr" | "dt" | "section" | "li" => sections::parse_anchor_element(&node, converter)?,
            _ => None,
        };
        // Pushes the section when there is one; a `None` adds nothing.
        collected.extend(maybe_section);
    }

    Ok(collected)
}
/// Parses a specification's HTML into its sections, references and IDL
/// definitions.
///
/// Documents recognised by `is_ietf_html` are handled by `parse_ietf_html`;
/// everything else goes through `parse_generic_html`. The resulting flat
/// section list is then passed through `sections::build_section_tree`.
///
/// * `html` – the raw HTML of the specification.
/// * `spec_name` – forwarded to `references::extract_references`.
/// * `base_url` – used to build the HTML-to-Markdown converter.
///
/// # Errors
///
/// Returns an error if section parsing fails.
pub fn parse_spec(html: &str, spec_name: &str, base_url: &str) -> Result<ParsedSpec> {
    let document = Html::parse_document(html);
    let converter = markdown::build_converter(base_url);

    let sections = if is_ietf_html(&document) {
        parse_ietf_html(&document, &converter)?
    } else {
        parse_generic_html(&document, &converter)?
    };
    let sections = sections::build_section_tree(sections);

    let registry = crate::spec_registry::SpecRegistry::new();
    let references = references::extract_references(html, spec_name, &sections, &registry);
    let idl_definitions = idl_defs::extract_idl_definitions(html);

    Ok(ParsedSpec {
        sections,
        references,
        idl_definitions,
    })
}
fn is_inside_emu_clause(element: &scraper::ElementRef) -> bool {
let mut current = element.parent();
while let Some(node) = current {
if let Some(parent_elem) = scraper::ElementRef::wrap(node) {
let tag = parent_elem.value().name();
if tag == "emu-clause" || tag == "emu-annex" {
return true;
}
}
current = node.parent();
}
false
}
// End-to-end tests for `parse_spec`, one per supported document flavour
// (generic headings/dfns, empty input, ecmarkup clauses, RFC-style sections).
#[cfg(test)]
mod tests {
use super::*;
use crate::model::SectionType;
// Generic (non-RFC) document: headings plus `dfn`s appearing in prose, inside a
// `pre.idl` block, and inside a `div.algorithm` block.
#[test]
fn test_parse_spec_full_pipeline() {
let html = r#"
<h2 id="intro">Introduction</h2>
<p>This spec defines <dfn id="concept-widget">widgets</dfn>.</p>
<h3 id="types">Widget Types</h3>
<pre class="idl">
<c- b>interface</c-> <dfn data-dfn-type="interface" id="widget"><code>Widget</code></dfn> {
<c- g>constructor</c->();
};
</pre>
<div class="algorithm" data-algorithm="create widget">
<p>To <dfn id="create-widget">create a widget</dfn>:</p>
<ol>
<li>Let w be a new Widget.</li>
<li>Return w.</li>
</ol>
</div>
<h3 id="examples">Examples</h3>
<p>See the <dfn id="widget-example">widget example</dfn>.</p>
"#;
let parsed = parse_spec(html, "TEST", "https://test.example.com").unwrap();
// Three headings and four dfns carry an id; the <li> elements do not.
assert_eq!(parsed.sections.len(), 7);
assert!(!parsed.idl_definitions.is_empty());
// Sections are expected in document order, each with its classified type.
assert_eq!(parsed.sections[0].anchor, "intro");
assert_eq!(parsed.sections[0].section_type, SectionType::Heading);
assert_eq!(parsed.sections[1].anchor, "concept-widget");
assert_eq!(parsed.sections[1].section_type, SectionType::Definition);
assert_eq!(parsed.sections[2].anchor, "types");
assert_eq!(parsed.sections[2].section_type, SectionType::Heading);
assert_eq!(parsed.sections[3].anchor, "widget");
assert_eq!(parsed.sections[3].section_type, SectionType::Idl);
assert_eq!(parsed.sections[4].anchor, "create-widget");
assert_eq!(parsed.sections[4].section_type, SectionType::Algorithm);
assert_eq!(parsed.sections[5].anchor, "examples");
assert_eq!(parsed.sections[5].section_type, SectionType::Heading);
assert_eq!(parsed.sections[6].anchor, "widget-example");
assert_eq!(parsed.sections[6].section_type, SectionType::Definition);
// Parent links: each dfn hangs off the nearest preceding heading, and both
// h3 headings hang off the h2.
assert_eq!(parsed.sections[0].parent_anchor, None);
assert_eq!(parsed.sections[1].parent_anchor, Some("intro".to_string()));
assert_eq!(parsed.sections[2].parent_anchor, Some("intro".to_string()));
assert_eq!(parsed.sections[3].parent_anchor, Some("types".to_string()));
assert_eq!(parsed.sections[4].parent_anchor, Some("types".to_string()));
assert_eq!(parsed.sections[5].parent_anchor, Some("intro".to_string()));
// The previous link of the second h3 points at the first h3, not at the
// dfns that sit between them.
assert_eq!(parsed.sections[5].prev_anchor, Some("types".to_string()));
assert_eq!(
parsed.sections[6].parent_anchor,
Some("examples".to_string())
);
}
// A document with an empty body yields no sections, references or IDL.
#[test]
fn test_parse_spec_empty() {
let html = "<html><body></body></html>";
let parsed = parse_spec(html, "TEST", "https://test.example.com").unwrap();
assert_eq!(parsed.sections.len(), 0);
assert_eq!(parsed.references.len(), 0);
assert_eq!(parsed.idl_definitions.len(), 0);
}
// ecmarkup document: one outer `emu-clause` containing two nested clauses, the
// second of which holds an `emu-alg` step list.
#[test]
fn test_parse_spec_ecmarkup_pipeline() {
let html = r#"
<emu-clause id="sec-types">
<h1><span class="secnum">6</span> ECMAScript Data Types</h1>
<p>An ECMAScript language type corresponds to values.</p>
<emu-clause id="sec-undefined-type">
<h1><span class="secnum">6.1</span> The Undefined Type</h1>
<p>The Undefined type has exactly one value, called <emu-val>undefined</emu-val>.</p>
</emu-clause>
<emu-clause id="sec-tostring" type="abstract operation" aoid="ToString">
<h1><span class="secnum">6.2</span> ToString ( <var>argument</var> )</h1>
<p>Converts argument to a String.</p>
<emu-alg>
<ol>
<li>If <var>argument</var> is a String, return <var>argument</var>.</li>
<li>Return "default".</li>
</ol>
</emu-alg>
</emu-clause>
</emu-clause>
"#;
let parsed = parse_spec(html, "ECMA-262", "https://tc39.es/ecma262").unwrap();
assert_eq!(parsed.sections.len(), 3);
// Outer clause: the expected title omits the `secnum` span text, and the
// top-level clause is expected at depth 2.
assert_eq!(parsed.sections[0].anchor, "sec-types");
assert_eq!(
parsed.sections[0].title,
Some("ECMAScript Data Types".to_string())
);
assert_eq!(parsed.sections[0].section_type, SectionType::Heading);
assert_eq!(parsed.sections[0].depth, Some(2));
assert_eq!(parsed.sections[0].parent_anchor, None);
// Nested clauses sit one level deeper and point back at the outer clause.
assert_eq!(parsed.sections[1].anchor, "sec-undefined-type");
assert_eq!(parsed.sections[1].depth, Some(3));
assert_eq!(
parsed.sections[1].parent_anchor,
Some("sec-types".to_string())
);
assert_eq!(parsed.sections[2].anchor, "sec-tostring");
assert_eq!(parsed.sections[2].section_type, SectionType::Algorithm);
assert_eq!(parsed.sections[2].depth, Some(3));
assert_eq!(
parsed.sections[2].parent_anchor,
Some("sec-types".to_string())
);
// The two nested clauses are linked to each other as siblings.
assert_eq!(
parsed.sections[1].next_anchor,
Some("sec-tostring".to_string())
);
assert_eq!(
parsed.sections[2].prev_anchor,
Some("sec-undefined-type".to_string())
);
}
// RFC-style document (`<html class="RFC">`): numbered sections, one nested
// subsection, an appendix, plus boilerplate and table-of-contents sections that
// must not appear in the output.
#[test]
fn test_parse_spec_ietf_xml2rfc() {
let html = r##"<!DOCTYPE html>
<html class="RFC">
<head><title>Test RFC</title></head>
<body>
<section id="section-1">
<h2 id="name-introduction">
<a class="section-number selfRef" href="#section-1">1. </a>
<a class="section-name selfRef" href="#name-introduction">Introduction</a>
</h2>
<p>This document defines something useful.</p>
<section id="section-1.1">
<h3 id="name-overview">
<a class="section-number selfRef" href="#section-1.1">1.1. </a>
<a class="section-name selfRef" href="#name-overview">Overview</a>
</h3>
<p>An overview of the protocol.</p>
</section>
</section>
<section id="section-2">
<h2 id="name-protocol">
<a class="section-number selfRef" href="#section-2">2. </a>
<a class="section-name selfRef" href="#name-protocol">Protocol</a>
</h2>
<p>The protocol works as follows.</p>
</section>
<section id="appendix-A">
<h2 id="name-appendix-a">
<a class="section-number selfRef" href="#appendix-A">A. </a>
<a class="section-name selfRef" href="#name-appendix-a">Appendix A</a>
</h2>
<p>Additional notes.</p>
</section>
<section id="section-boilerplate.1">
<h2 id="name-status">Status of This Memo</h2>
<p>This is an Internet Standards Track document.</p>
</section>
<section id="section-toc">
<h2 id="name-toc">Table of Contents</h2>
</section>
</body>
</html>"##;
let parsed = parse_spec(
html,
"RFC9999",
"https://www.rfc-editor.org/rfc/rfc9999.html",
)
.unwrap();
// Six <section> elements in the fixture; boilerplate and toc are excluded.
assert_eq!(parsed.sections.len(), 4);
// Anchors are the section ids, not the heading ids.
assert_eq!(parsed.sections[0].anchor, "section-1");
assert_eq!(parsed.sections[1].anchor, "section-1.1");
assert_eq!(parsed.sections[2].anchor, "section-2");
assert_eq!(parsed.sections[3].anchor, "appendix-A");
// Titles come from the `section-name` link, without the section number.
assert_eq!(parsed.sections[0].title, Some("Introduction".to_string()));
assert_eq!(parsed.sections[1].title, Some("Overview".to_string()));
assert_eq!(parsed.sections[2].title, Some("Protocol".to_string()));
assert_eq!(parsed.sections[3].title, Some("Appendix A".to_string()));
// Depth follows the heading level (h2 -> 2, h3 -> 3).
assert_eq!(parsed.sections[0].depth, Some(2));
assert_eq!(parsed.sections[1].depth, Some(3));
assert_eq!(parsed.sections[2].depth, Some(2));
assert_eq!(parsed.sections[3].depth, Some(2));
// Tree links: the subsection's parent is section 1, and section 2's previous
// link skips over that subsection.
assert_eq!(
parsed.sections[1].parent_anchor,
Some("section-1".to_string())
);
assert_eq!(parsed.sections[2].parent_anchor, None);
assert_eq!(
parsed.sections[2].prev_anchor,
Some("section-1".to_string())
);
}
}