1pub mod algorithms;
2pub mod idl;
3pub mod idl_defs;
4pub mod markdown;
5pub mod references;
6pub mod sections;
7
8use crate::model::{ParsedSection, ParsedSpec, SectionType};
9use anyhow::Result;
10use htmd::HtmlToMarkdown;
11use scraper::{Html, Selector};
12
13fn is_ietf_html(document: &Html) -> bool {
17 let Ok(sel) = Selector::parse("html.RFC") else {
18 return false;
19 };
20 document.select(&sel).next().is_some()
21}
22
23fn ietf_extract_title(heading: &scraper::ElementRef) -> Option<String> {
29 let Ok(sel) = Selector::parse("a.section-name") else {
30 return None;
31 };
32 if let Some(name_a) = heading.select(&sel).next() {
33 let text = name_a.text().collect::<String>().trim().to_string();
34 if !text.is_empty() {
35 return Some(text);
36 }
37 }
38 let text = heading.text().collect::<String>().trim().to_string();
40 if text.is_empty() {
41 None
42 } else {
43 Some(text)
44 }
45}
46
47fn extract_ietf_prose(section: &scraper::ElementRef, converter: &HtmlToMarkdown) -> Option<String> {
55 let mut content_html = String::new();
56 for node in section.children() {
57 if let Some(child) = scraper::ElementRef::wrap(node) {
58 let tag = child.value().name();
59 if tag == "section" || matches!(tag, "h2" | "h3" | "h4" | "h5" | "h6") {
60 continue;
61 }
62 content_html.push_str(&child.html());
63 }
64 }
65 if content_html.trim().is_empty() {
66 return None;
67 }
68 let md = markdown::element_to_markdown_from_html(&content_html, converter);
69 let trimmed = md.trim();
70 if trimmed.is_empty() {
71 None
72 } else {
73 Some(trimmed.to_string())
74 }
75}
76
77fn parse_ietf_html(document: &Html, converter: &HtmlToMarkdown) -> Result<Vec<ParsedSection>> {
88 let section_sel =
89 Selector::parse("section[id]").map_err(|e| anyhow::anyhow!("Selector error: {:?}", e))?;
90 let heading_sel = Selector::parse("h2, h3, h4, h5, h6")
91 .map_err(|e| anyhow::anyhow!("Selector error: {:?}", e))?;
92
93 let mut parsed = Vec::new();
94
95 for section_elem in document.select(§ion_sel) {
96 let section_id = match section_elem.value().attr("id") {
97 Some(id) => id,
98 None => continue,
99 };
100
101 if !section_id.starts_with("section-") && !section_id.starts_with("appendix-") {
103 continue;
104 }
105 if section_id.starts_with("section-boilerplate") || section_id.starts_with("section-toc") {
107 continue;
108 }
109
110 let heading = match section_elem.select(&heading_sel).next() {
112 Some(h) => h,
113 None => continue,
114 };
115
116 let depth = match heading.value().name() {
117 "h2" => 2u8,
118 "h3" => 3,
119 "h4" => 4,
120 "h5" => 5,
121 "h6" => 6,
122 _ => 2,
123 };
124
125 let title = ietf_extract_title(&heading);
126 let content_text = extract_ietf_prose(§ion_elem, converter);
127
128 parsed.push(ParsedSection {
129 anchor: section_id.to_string(),
130 title,
131 content_text,
132 section_type: SectionType::Heading,
133 parent_anchor: None,
134 prev_anchor: None,
135 next_anchor: None,
136 depth: Some(depth),
137 });
138 }
139
140 Ok(parsed)
141}
142
143fn parse_generic_html(document: &Html, converter: &HtmlToMarkdown) -> Result<Vec<ParsedSection>> {
150 let mut sections = Vec::new();
151
152 let selector = Selector::parse(
159 "h2[id], h3[id], h4[id], h5[id], h6[id], dfn[id], emu-clause[id], emu-annex[id], tr[id], dt[id], section[id], li[id]",
160 )
161 .map_err(|e| anyhow::anyhow!("Invalid selector: {:?}", e))?;
162
163 for element in document.select(&selector) {
164 let tag_name = element.value().name();
165
166 match tag_name {
167 "h2" | "h3" | "h4" | "h5" | "h6" => {
168 if let Some(section) = sections::parse_heading_element(&element, converter)? {
169 sections.push(section);
170 }
171 }
172 "dfn" => {
173 if is_inside_emu_clause(&element) {
174 continue;
175 }
176 if let Some(section) = sections::parse_dfn_element(&element, converter)? {
177 sections.push(section);
178 }
179 }
180 "emu-clause" | "emu-annex" => {
181 if let Some(section) = sections::parse_emu_clause_element(&element, converter)? {
182 sections.push(section);
183 }
184 }
185 "tr" | "dt" | "section" | "li" => {
186 if let Some(section) = sections::parse_anchor_element(&element, converter)? {
187 sections.push(section);
188 }
189 }
190 _ => {}
191 }
192 }
193
194 Ok(sections)
195}
196
197pub fn parse_spec(html: &str, spec_name: &str, base_url: &str) -> Result<ParsedSpec> {
200 let document = Html::parse_document(html);
201 let converter = markdown::build_converter(base_url);
202
203 let sections = if is_ietf_html(&document) {
206 parse_ietf_html(&document, &converter)?
207 } else {
208 parse_generic_html(&document, &converter)?
209 };
210
211 let sections = sections::build_section_tree(sections);
213
214 let registry = crate::spec_registry::SpecRegistry::new();
218 let references = references::extract_references(html, spec_name, §ions, ®istry);
219 let idl_definitions = idl_defs::extract_idl_definitions(html);
220
221 Ok(ParsedSpec {
222 sections,
223 references,
224 idl_definitions,
225 })
226}
227
228fn is_inside_emu_clause(element: &scraper::ElementRef) -> bool {
232 let mut current = element.parent();
233 while let Some(node) = current {
234 if let Some(parent_elem) = scraper::ElementRef::wrap(node) {
235 let tag = parent_elem.value().name();
236 if tag == "emu-clause" || tag == "emu-annex" {
237 return true;
238 }
239 }
240 current = node.parent();
241 }
242 false
243}
244
#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::SectionType;

    /// Runs the generic pipeline over headings, definitions, an IDL block and
    /// an algorithm, then checks order, section types and parent links.
    #[test]
    fn test_parse_spec_full_pipeline() {
        let html = r#"
            <h2 id="intro">Introduction</h2>
            <p>This spec defines <dfn id="concept-widget">widgets</dfn>.</p>

            <h3 id="types">Widget Types</h3>
            <pre class="idl">
            <c- b>interface</c-> <dfn data-dfn-type="interface" id="widget"><code>Widget</code></dfn> {
                <c- g>constructor</c->();
            };
            </pre>

            <div class="algorithm" data-algorithm="create widget">
                <p>To <dfn id="create-widget">create a widget</dfn>:</p>
                <ol>
                    <li>Let w be a new Widget.</li>
                    <li>Return w.</li>
                </ol>
            </div>

            <h3 id="examples">Examples</h3>
            <p>See the <dfn id="widget-example">widget example</dfn>.</p>
        "#;

        let parsed = parse_spec(html, "TEST", "https://test.example.com").unwrap();

        assert_eq!(parsed.sections.len(), 7);
        assert!(!parsed.idl_definitions.is_empty());

        // (anchor, section type, parent anchor), in document order.
        let expected = [
            ("intro", SectionType::Heading, None),
            ("concept-widget", SectionType::Definition, Some("intro")),
            ("types", SectionType::Heading, Some("intro")),
            ("widget", SectionType::Idl, Some("types")),
            ("create-widget", SectionType::Algorithm, Some("types")),
            ("examples", SectionType::Heading, Some("intro")),
            ("widget-example", SectionType::Definition, Some("examples")),
        ];

        for (section, (anchor, kind, parent)) in parsed.sections.iter().zip(&expected) {
            assert_eq!(section.anchor, *anchor);
            assert_eq!(section.section_type, *kind);
            assert_eq!(section.parent_anchor.as_deref(), *parent);
        }

        assert_eq!(parsed.sections[5].prev_anchor.as_deref(), Some("types"));
    }

    /// A document without any anchored elements yields empty results.
    #[test]
    fn test_parse_spec_empty() {
        let parsed =
            parse_spec("<html><body></body></html>", "TEST", "https://test.example.com").unwrap();

        assert_eq!(parsed.sections.len(), 0);
        assert_eq!(parsed.references.len(), 0);
        assert_eq!(parsed.idl_definitions.len(), 0);
    }

    /// Nested `emu-clause` elements become a heading with two children, the
    /// second of which is classified as an algorithm.
    #[test]
    fn test_parse_spec_ecmarkup_pipeline() {
        let html = r#"
            <emu-clause id="sec-types">
                <h1><span class="secnum">6</span> ECMAScript Data Types</h1>
                <p>An ECMAScript language type corresponds to values.</p>

                <emu-clause id="sec-undefined-type">
                    <h1><span class="secnum">6.1</span> The Undefined Type</h1>
                    <p>The Undefined type has exactly one value, called <emu-val>undefined</emu-val>.</p>
                </emu-clause>

                <emu-clause id="sec-tostring" type="abstract operation" aoid="ToString">
                    <h1><span class="secnum">6.2</span> ToString ( <var>argument</var> )</h1>
                    <p>Converts argument to a String.</p>
                    <emu-alg>
                        <ol>
                            <li>If <var>argument</var> is a String, return <var>argument</var>.</li>
                            <li>Return "default".</li>
                        </ol>
                    </emu-alg>
                </emu-clause>
            </emu-clause>
        "#;

        let parsed = parse_spec(html, "ECMA-262", "https://tc39.es/ecma262").unwrap();

        assert_eq!(parsed.sections.len(), 3);
        let (root, undefined_type, to_string) = (
            &parsed.sections[0],
            &parsed.sections[1],
            &parsed.sections[2],
        );

        assert_eq!(root.anchor, "sec-types");
        assert_eq!(root.title.as_deref(), Some("ECMAScript Data Types"));
        assert_eq!(root.section_type, SectionType::Heading);
        assert_eq!(root.depth, Some(2));
        assert_eq!(root.parent_anchor, None);

        assert_eq!(undefined_type.anchor, "sec-undefined-type");
        assert_eq!(undefined_type.depth, Some(3));
        assert_eq!(undefined_type.parent_anchor.as_deref(), Some("sec-types"));

        assert_eq!(to_string.anchor, "sec-tostring");
        assert_eq!(to_string.section_type, SectionType::Algorithm);
        assert_eq!(to_string.depth, Some(3));
        assert_eq!(to_string.parent_anchor.as_deref(), Some("sec-types"));

        // The two nested clauses are siblings of each other.
        assert_eq!(undefined_type.next_anchor.as_deref(), Some("sec-tostring"));
        assert_eq!(to_string.prev_anchor.as_deref(), Some("sec-undefined-type"));
    }

    /// An `html.RFC` document goes through the IETF path: numbered sections
    /// and appendices are kept, boilerplate and table of contents are dropped.
    #[test]
    fn test_parse_spec_ietf_xml2rfc() {
        let html = r##"<!DOCTYPE html>
<html class="RFC">
<head><title>Test RFC</title></head>
<body>
<section id="section-1">
  <h2 id="name-introduction">
    <a class="section-number selfRef" href="#section-1">1. </a>
    <a class="section-name selfRef" href="#name-introduction">Introduction</a>
  </h2>
  <p>This document defines something useful.</p>

  <section id="section-1.1">
    <h3 id="name-overview">
      <a class="section-number selfRef" href="#section-1.1">1.1. </a>
      <a class="section-name selfRef" href="#name-overview">Overview</a>
    </h3>
    <p>An overview of the protocol.</p>
  </section>
</section>

<section id="section-2">
  <h2 id="name-protocol">
    <a class="section-number selfRef" href="#section-2">2. </a>
    <a class="section-name selfRef" href="#name-protocol">Protocol</a>
  </h2>
  <p>The protocol works as follows.</p>
</section>

<section id="appendix-A">
  <h2 id="name-appendix-a">
    <a class="section-number selfRef" href="#appendix-A">A. </a>
    <a class="section-name selfRef" href="#name-appendix-a">Appendix A</a>
  </h2>
  <p>Additional notes.</p>
</section>

<section id="section-boilerplate.1">
  <h2 id="name-status">Status of This Memo</h2>
  <p>This is an Internet Standards Track document.</p>
</section>

<section id="section-toc">
  <h2 id="name-toc">Table of Contents</h2>
</section>
</body>
</html>"##;

        let parsed = parse_spec(
            html,
            "RFC9999",
            "https://www.rfc-editor.org/rfc/rfc9999.html",
        )
        .unwrap();

        assert_eq!(parsed.sections.len(), 4);

        // (anchor, title, depth), in document order.
        let expected = [
            ("section-1", "Introduction", 2u8),
            ("section-1.1", "Overview", 3),
            ("section-2", "Protocol", 2),
            ("appendix-A", "Appendix A", 2),
        ];

        for (section, (anchor, title, depth)) in parsed.sections.iter().zip(&expected) {
            assert_eq!(section.anchor, *anchor);
            assert_eq!(section.title.as_deref(), Some(*title));
            assert_eq!(section.depth, Some(*depth));
        }

        assert_eq!(parsed.sections[1].parent_anchor.as_deref(), Some("section-1"));

        assert_eq!(parsed.sections[2].parent_anchor, None);
        assert_eq!(parsed.sections[2].prev_anchor.as_deref(), Some("section-1"));
    }
}