webspec_index/parse/
mod.rs

1pub mod algorithms;
2pub mod idl;
3pub mod idl_defs;
4pub mod markdown;
5pub mod references;
6pub mod sections;
7
8use crate::model::{ParsedSection, ParsedSpec, SectionType};
9use anyhow::Result;
10use htmd::HtmlToMarkdown;
11use scraper::{Html, Selector};
12
13// ── IETF RFC HTML parsing (xml2rfc format) ───────────────────────────────────
14
15/// Detect IETF RFC HTML generated by xml2rfc (`<html class="RFC">`).
16fn is_ietf_html(document: &Html) -> bool {
17    let Ok(sel) = Selector::parse("html.RFC") else {
18        return false;
19    };
20    document.select(&sel).next().is_some()
21}
22
23/// Extract the human-readable title from an IETF heading element.
24///
25/// xml2rfc headings contain `<a class="section-number selfRef">N. </a>` (the
26/// numeric prefix) and `<a class="section-name selfRef">Title</a>` (the title).
27/// We extract only the `section-name` text to get a clean, prefix-free title.
28fn ietf_extract_title(heading: &scraper::ElementRef) -> Option<String> {
29    let Ok(sel) = Selector::parse("a.section-name") else {
30        return None;
31    };
32    if let Some(name_a) = heading.select(&sel).next() {
33        let text = name_a.text().collect::<String>().trim().to_string();
34        if !text.is_empty() {
35            return Some(text);
36        }
37    }
38    // Fallback: all text in the heading (older xml2rfc versions or edge cases)
39    let text = heading.text().collect::<String>().trim().to_string();
40    if text.is_empty() {
41        None
42    } else {
43        Some(text)
44    }
45}
46
47/// Extract prose content from an IETF `<section>` element.
48///
49/// Collects the direct children of the section, skipping:
50/// - The heading element itself (h2–h6)
51/// - Nested `<section>` sub-elements (each gets its own indexed entry)
52///
53/// This ensures each section's content is its own prose, not its children's.
54fn extract_ietf_prose(section: &scraper::ElementRef, converter: &HtmlToMarkdown) -> Option<String> {
55    let mut content_html = String::new();
56    for node in section.children() {
57        if let Some(child) = scraper::ElementRef::wrap(node) {
58            let tag = child.value().name();
59            if tag == "section" || matches!(tag, "h2" | "h3" | "h4" | "h5" | "h6") {
60                continue;
61            }
62            content_html.push_str(&child.html());
63        }
64    }
65    if content_html.trim().is_empty() {
66        return None;
67    }
68    let md = markdown::element_to_markdown_from_html(&content_html, converter);
69    let trimmed = md.trim();
70    if trimmed.is_empty() {
71        None
72    } else {
73        Some(trimmed.to_string())
74    }
75}
76
77/// Parse an IETF RFC HTML document (xml2rfc format) into structured sections.
78///
79/// IETF HTML uses `<section id="section-N">` / `<section id="appendix-A">` as
80/// the canonical anchor, with the heading inside the section carrying a
81/// `name-*` id that is NOT what users reference.  We index the `section-*` /
82/// `appendix-*` id as the anchor so that links like `#section-1` resolve.
83///
84/// Skipped section id prefixes:
85/// - `section-boilerplate` — Status of This Memo, Copyright Notice
86/// - `section-toc`         — Table of Contents
87fn parse_ietf_html(document: &Html, converter: &HtmlToMarkdown) -> Result<Vec<ParsedSection>> {
88    let section_sel =
89        Selector::parse("section[id]").map_err(|e| anyhow::anyhow!("Selector error: {:?}", e))?;
90    let heading_sel = Selector::parse("h2, h3, h4, h5, h6")
91        .map_err(|e| anyhow::anyhow!("Selector error: {:?}", e))?;
92
93    let mut parsed = Vec::new();
94
95    for section_elem in document.select(&section_sel) {
96        let section_id = match section_elem.value().attr("id") {
97            Some(id) => id,
98            None => continue,
99        };
100
101        // Only process section-* and appendix-* IDs
102        if !section_id.starts_with("section-") && !section_id.starts_with("appendix-") {
103            continue;
104        }
105        // Skip non-content sections
106        if section_id.starts_with("section-boilerplate") || section_id.starts_with("section-toc") {
107            continue;
108        }
109
110        // Find the first heading child — it is always the section title in xml2rfc
111        let heading = match section_elem.select(&heading_sel).next() {
112            Some(h) => h,
113            None => continue,
114        };
115
116        let depth = match heading.value().name() {
117            "h2" => 2u8,
118            "h3" => 3,
119            "h4" => 4,
120            "h5" => 5,
121            "h6" => 6,
122            _ => 2,
123        };
124
125        let title = ietf_extract_title(&heading);
126        let content_text = extract_ietf_prose(&section_elem, converter);
127
128        parsed.push(ParsedSection {
129            anchor: section_id.to_string(),
130            title,
131            content_text,
132            section_type: SectionType::Heading,
133            parent_anchor: None,
134            prev_anchor: None,
135            next_anchor: None,
136            depth: Some(depth),
137        });
138    }
139
140    Ok(parsed)
141}
142
143// ── Generic spec parsing ──────────────────────────────────────────────────────
144
145/// Parse generic (non-IETF) spec HTML into structured sections.
146///
147/// Selects headings (h2–h6 with id), definitions (dfn with id), and
148/// TC39/ecmarkup clause elements (emu-clause, emu-annex with id).
149fn parse_generic_html(document: &Html, converter: &HtmlToMarkdown) -> Result<Vec<ParsedSection>> {
150    let mut sections = Vec::new();
151
152    // Collect all potential section elements in a single pass to preserve document order.
153    // This includes:
154    // - headings (h2-h6 with id) — WHATWG/W3C specs
155    // - definitions (dfn with id) — all specs
156    // - emu-clause/emu-annex (with id) — TC39/ecmarkup specs
157    // - tr/dt/section/li with id — W3C specs use these as named anchor targets
158    let selector = Selector::parse(
159        "h2[id], h3[id], h4[id], h5[id], h6[id], dfn[id], emu-clause[id], emu-annex[id], tr[id], dt[id], section[id], li[id]",
160    )
161    .map_err(|e| anyhow::anyhow!("Invalid selector: {:?}", e))?;
162
163    for element in document.select(&selector) {
164        let tag_name = element.value().name();
165
166        match tag_name {
167            "h2" | "h3" | "h4" | "h5" | "h6" => {
168                if let Some(section) = sections::parse_heading_element(&element, converter)? {
169                    sections.push(section);
170                }
171            }
172            "dfn" => {
173                if is_inside_emu_clause(&element) {
174                    continue;
175                }
176                if let Some(section) = sections::parse_dfn_element(&element, converter)? {
177                    sections.push(section);
178                }
179            }
180            "emu-clause" | "emu-annex" => {
181                if let Some(section) = sections::parse_emu_clause_element(&element, converter)? {
182                    sections.push(section);
183                }
184            }
185            "tr" | "dt" | "section" | "li" => {
186                if let Some(section) = sections::parse_anchor_element(&element, converter)? {
187                    sections.push(section);
188                }
189            }
190            _ => {}
191        }
192    }
193
194    Ok(sections)
195}
196
197/// Parse a complete spec HTML document into structured sections and references.
198/// `base_url` is used to absolutize relative links in content markdown.
199pub fn parse_spec(html: &str, spec_name: &str, base_url: &str) -> Result<ParsedSpec> {
200    let document = Html::parse_document(html);
201    let converter = markdown::build_converter(base_url);
202
203    // IETF RFC HTML (xml2rfc format) uses a fundamentally different structure:
204    // canonical anchors are on `<section id="section-N">` elements, not headings.
205    let sections = if is_ietf_html(&document) {
206        parse_ietf_html(&document, &converter)?
207    } else {
208        parse_generic_html(&document, &converter)?
209    };
210
211    // Build tree relationships (parent, prev, next)
212    let sections = sections::build_section_tree(sections);
213
214    // Extract references
215    // Note: We need a SpecRegistry to resolve cross-spec URLs
216    // For now, create an empty one (will be passed in later for full functionality)
217    let registry = crate::spec_registry::SpecRegistry::new();
218    let references = references::extract_references(html, spec_name, &sections, &registry);
219    let idl_definitions = idl_defs::extract_idl_definitions(html);
220
221    Ok(ParsedSpec {
222        sections,
223        references,
224        idl_definitions,
225    })
226}
227
228/// Check if a dfn element is inside an emu-clause (TC39/ecmarkup spec).
229/// In ecmarkup specs, dfns are inline term definitions inside emu-clause content.
230/// We skip them as standalone sections since the emu-clause itself is the section.
231fn is_inside_emu_clause(element: &scraper::ElementRef) -> bool {
232    let mut current = element.parent();
233    while let Some(node) = current {
234        if let Some(parent_elem) = scraper::ElementRef::wrap(node) {
235            let tag = parent_elem.value().name();
236            if tag == "emu-clause" || tag == "emu-annex" {
237                return true;
238            }
239        }
240        current = node.parent();
241    }
242    false
243}
244
#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::SectionType;

    /// Generic pipeline: headings, dfns, IDL and an algorithm, plus tree links.
    #[test]
    fn test_parse_spec_full_pipeline() {
        let html = r#"
            <h2 id="intro">Introduction</h2>
            <p>This spec defines <dfn id="concept-widget">widgets</dfn>.</p>

            <h3 id="types">Widget Types</h3>
            <pre class="idl">
                <c- b>interface</c-> <dfn data-dfn-type="interface" id="widget"><code>Widget</code></dfn> {
                    <c- g>constructor</c->();
                };
            </pre>

            <div class="algorithm" data-algorithm="create widget">
                <p>To <dfn id="create-widget">create a widget</dfn>:</p>
                <ol>
                    <li>Let w be a new Widget.</li>
                    <li>Return w.</li>
                </ol>
            </div>

            <h3 id="examples">Examples</h3>
            <p>See the <dfn id="widget-example">widget example</dfn>.</p>
        "#;

        let spec = parse_spec(html, "TEST", "https://test.example.com").unwrap();
        let sections = &spec.sections;

        assert_eq!(sections.len(), 7);
        assert!(!spec.idl_definitions.is_empty());

        // (anchor, section type, parent anchor), in document order.
        let expected = [
            ("intro", SectionType::Heading, None),
            ("concept-widget", SectionType::Definition, Some("intro")),
            ("types", SectionType::Heading, Some("intro")),
            ("widget", SectionType::Idl, Some("types")),
            ("create-widget", SectionType::Algorithm, Some("types")),
            ("examples", SectionType::Heading, Some("intro")),
            ("widget-example", SectionType::Definition, Some("examples")),
        ];
        for (section, (anchor, section_type, parent)) in sections.iter().zip(expected) {
            assert_eq!(section.anchor, anchor);
            assert_eq!(section.section_type, section_type);
            assert_eq!(section.parent_anchor.as_deref(), parent);
        }

        // examples (h3) follows types (h3) under the same parent.
        assert_eq!(sections[5].prev_anchor.as_deref(), Some("types"));
    }

    /// A document without any matching element yields nothing at all.
    #[test]
    fn test_parse_spec_empty() {
        let spec = parse_spec(
            "<html><body></body></html>",
            "TEST",
            "https://test.example.com",
        )
        .unwrap();

        let counts = (
            spec.sections.len(),
            spec.references.len(),
            spec.idl_definitions.len(),
        );
        assert_eq!(counts, (0, 0, 0));
    }

    /// ecmarkup pipeline: nested emu-clauses become sections with tree links.
    #[test]
    fn test_parse_spec_ecmarkup_pipeline() {
        let html = r#"
            <emu-clause id="sec-types">
                <h1><span class="secnum">6</span> ECMAScript Data Types</h1>
                <p>An ECMAScript language type corresponds to values.</p>

                <emu-clause id="sec-undefined-type">
                    <h1><span class="secnum">6.1</span> The Undefined Type</h1>
                    <p>The Undefined type has exactly one value, called <emu-val>undefined</emu-val>.</p>
                </emu-clause>

                <emu-clause id="sec-tostring" type="abstract operation" aoid="ToString">
                    <h1><span class="secnum">6.2</span> ToString ( <var>argument</var> )</h1>
                    <p>Converts argument to a String.</p>
                    <emu-alg>
                        <ol>
                            <li>If <var>argument</var> is a String, return <var>argument</var>.</li>
                            <li>Return "default".</li>
                        </ol>
                    </emu-alg>
                </emu-clause>
            </emu-clause>
        "#;

        let spec = parse_spec(html, "ECMA-262", "https://tc39.es/ecma262").unwrap();
        let sections = &spec.sections;

        // Exactly the three emu-clauses; nothing else is indexed.
        assert_eq!(sections.len(), 3);
        let (root, undefined_type, to_string) = (&sections[0], &sections[1], &sections[2]);

        // Outer clause.
        assert_eq!(root.anchor, "sec-types");
        assert_eq!(root.title.as_deref(), Some("ECMAScript Data Types"));
        assert_eq!(root.section_type, SectionType::Heading);
        assert_eq!(root.depth, Some(2));
        assert_eq!(root.parent_anchor, None);

        // First nested clause.
        assert_eq!(undefined_type.anchor, "sec-undefined-type");
        assert_eq!(undefined_type.depth, Some(3));
        assert_eq!(undefined_type.parent_anchor.as_deref(), Some("sec-types"));

        // Second nested clause is an abstract operation with an algorithm.
        assert_eq!(to_string.anchor, "sec-tostring");
        assert_eq!(to_string.section_type, SectionType::Algorithm);
        assert_eq!(to_string.depth, Some(3));
        assert_eq!(to_string.parent_anchor.as_deref(), Some("sec-types"));

        // The two nested clauses are siblings of each other.
        assert_eq!(undefined_type.next_anchor.as_deref(), Some("sec-tostring"));
        assert_eq!(
            to_string.prev_anchor.as_deref(),
            Some("sec-undefined-type")
        );
    }

    /// IETF pipeline on minimal xml2rfc-style HTML.
    #[test]
    fn test_parse_spec_ietf_xml2rfc() {
        // Shape of the fixture:
        // - <html class="RFC"> selects the IETF path
        // - <section id="section-N"> carries the anchor users reference
        // - headings carry id="name-..." (NOT what users reference)
        // - the number sits in <a class="section-number selfRef">
        // - the title sits in <a class="section-name selfRef">
        let html = r##"<!DOCTYPE html>
<html class="RFC">
<head><title>Test RFC</title></head>
<body>
<section id="section-1">
  <h2 id="name-introduction">
    <a class="section-number selfRef" href="#section-1">1. </a>
    <a class="section-name selfRef" href="#name-introduction">Introduction</a>
  </h2>
  <p>This document defines something useful.</p>

  <section id="section-1.1">
    <h3 id="name-overview">
      <a class="section-number selfRef" href="#section-1.1">1.1. </a>
      <a class="section-name selfRef" href="#name-overview">Overview</a>
    </h3>
    <p>An overview of the protocol.</p>
  </section>
</section>

<section id="section-2">
  <h2 id="name-protocol">
    <a class="section-number selfRef" href="#section-2">2. </a>
    <a class="section-name selfRef" href="#name-protocol">Protocol</a>
  </h2>
  <p>The protocol works as follows.</p>
</section>

<section id="appendix-A">
  <h2 id="name-appendix-a">
    <a class="section-number selfRef" href="#appendix-A">A. </a>
    <a class="section-name selfRef" href="#name-appendix-a">Appendix A</a>
  </h2>
  <p>Additional notes.</p>
</section>

<section id="section-boilerplate.1">
  <h2 id="name-status">Status of This Memo</h2>
  <p>This is an Internet Standards Track document.</p>
</section>

<section id="section-toc">
  <h2 id="name-toc">Table of Contents</h2>
</section>
</body>
</html>"##;

        let spec = parse_spec(
            html,
            "RFC9999",
            "https://www.rfc-editor.org/rfc/rfc9999.html",
        )
        .unwrap();
        let sections = &spec.sections;

        // Boilerplate and toc are dropped, leaving four entries.
        assert_eq!(sections.len(), 4);

        // (section id used as anchor, prefix-free title, depth from heading tag)
        let expected = [
            ("section-1", "Introduction", 2u8),
            ("section-1.1", "Overview", 3),
            ("section-2", "Protocol", 2),
            ("appendix-A", "Appendix A", 2),
        ];
        for (section, (anchor, title, depth)) in sections.iter().zip(expected) {
            assert_eq!(section.anchor, anchor);
            assert_eq!(section.title.as_deref(), Some(title));
            assert_eq!(section.depth, Some(depth));
        }

        // section-1.1 hangs under section-1.
        assert_eq!(sections[1].parent_anchor.as_deref(), Some("section-1"));

        // section-2 is top-level and comes right after section-1.
        assert_eq!(sections[2].parent_anchor, None);
        assert_eq!(sections[2].prev_anchor.as_deref(), Some("section-1"));
    }
}
webspec_index/parse/mod.rs

webspec_index/parse/
mod.rs