webspec-index 0.8.0

pub mod algorithms;
pub mod idl;
pub mod idl_defs;
pub mod markdown;
pub mod references;
pub mod sections;

use crate::model::{ParsedSection, ParsedSpec, SectionType};
use anyhow::Result;
use htmd::HtmlToMarkdown;
use scraper::{Html, Selector};

// ── IETF RFC HTML parsing (xml2rfc format) ───────────────────────────────────

/// Report whether `document` is IETF RFC HTML as produced by xml2rfc.
///
/// The check is a single CSS query for an `<html>` element carrying the
/// `RFC` class. If the selector itself fails to parse, the document is
/// treated as non-IETF.
fn is_ietf_html(document: &Html) -> bool {
    match Selector::parse("html.RFC") {
        Ok(rfc_root) => document.select(&rfc_root).next().is_some(),
        Err(_) => false,
    }
}

/// Pull the display title out of an IETF heading element.
///
/// xml2rfc headings hold two links: `<a class="section-number selfRef">N. </a>`
/// for the numeric prefix and `<a class="section-name selfRef">Title</a>` for
/// the title proper. The trimmed text of the first `a.section-name` is
/// preferred so the result carries no numeric prefix.
///
/// When that link is absent or empty, the trimmed text of the whole heading
/// is used instead (this may include the section number). Returns `None` when
/// the selector cannot be built or no non-empty text is found.
fn ietf_extract_title(heading: &scraper::ElementRef) -> Option<String> {
    let name_sel = Selector::parse("a.section-name").ok()?;

    let from_name_link = heading
        .select(&name_sel)
        .next()
        .map(|link| link.text().collect::<String>().trim().to_string())
        .filter(|title| !title.is_empty());

    // Fallback: all text in the heading (older xml2rfc versions or edge cases)
    from_name_link.or_else(|| {
        let whole = heading.text().collect::<String>().trim().to_string();
        (!whole.is_empty()).then_some(whole)
    })
}

/// Return `true` when `div` is nothing but a wrapper around nested sections.
///
/// A wrapper has at least one element child, every element child is a
/// `<section>`, and any direct text is whitespace only.
///
/// NOTE(review): xml2rfc output is understood to wrap a section that has an
/// author-supplied anchor as `<div id="anchor"><section id="section-N">…` —
/// confirm against RFC 7992 / real rfc-editor.org HTML.
fn is_ietf_section_wrapper(div: &scraper::ElementRef) -> bool {
    let mut wraps_section = false;
    for node in div.children() {
        if let Some(child) = scraper::ElementRef::wrap(node) {
            if child.value().name() != "section" {
                return false;
            }
            wraps_section = true;
        } else if let Some(text) = node.value().as_text() {
            // Real text next to the section means this div carries prose of its own.
            if !text.trim().is_empty() {
                return false;
            }
        }
    }
    wraps_section
}

/// Extract prose content from an IETF `<section>` element.
///
/// Collects the direct element children of the section, skipping:
/// - The heading element itself (h2–h6)
/// - Nested `<section>` sub-elements (each gets its own indexed entry)
/// - `<div>` wrappers whose only content is nested `<section>` elements, so a
///   wrapped subsection is not folded into its parent's prose
///
/// This ensures each section's content is its own prose, not its children's.
///
/// The surviving children are serialized back to HTML, converted to markdown
/// with `converter`, and trimmed. Returns `None` when nothing remains.
fn extract_ietf_prose(section: &scraper::ElementRef, converter: &HtmlToMarkdown) -> Option<String> {
    let mut content_html = String::new();
    for child in section.children().filter_map(scraper::ElementRef::wrap) {
        let tag = child.value().name();
        if tag == "section" || matches!(tag, "h2" | "h3" | "h4" | "h5" | "h6") {
            continue;
        }
        if tag == "div" && is_ietf_section_wrapper(&child) {
            continue;
        }
        content_html.push_str(&child.html());
    }
    if content_html.trim().is_empty() {
        return None;
    }

    let md = markdown::element_to_markdown_from_html(&content_html, converter);
    let trimmed = md.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}

/// Parse an IETF RFC HTML document (xml2rfc format) into structured sections.
///
/// IETF HTML uses `<section id="section-N">` / `<section id="appendix-A">` as
/// the canonical anchor, with the heading inside the section carrying a
/// `name-*` id that is NOT what users reference.  We index the `section-*` /
/// `appendix-*` id as the anchor so that links like `#section-1` resolve.
///
/// Skipped section id prefixes:
/// - `section-boilerplate` — Status of This Memo, Copyright Notice
/// - `section-toc`         — Table of Contents
fn parse_ietf_html(document: &Html, converter: &HtmlToMarkdown) -> Result<Vec<ParsedSection>> {
    let section_sel =
        Selector::parse("section[id]").map_err(|e| anyhow::anyhow!("Selector error: {:?}", e))?;
    let heading_sel = Selector::parse("h2, h3, h4, h5, h6")
        .map_err(|e| anyhow::anyhow!("Selector error: {:?}", e))?;

    let mut parsed = Vec::new();

    for section_elem in document.select(&section_sel) {
        let section_id = match section_elem.value().attr("id") {
            Some(id) => id,
            None => continue,
        };

        // Only process section-* and appendix-* IDs
        if !section_id.starts_with("section-") && !section_id.starts_with("appendix-") {
            continue;
        }
        // Skip non-content sections
        if section_id.starts_with("section-boilerplate") || section_id.starts_with("section-toc") {
            continue;
        }

        // Find the first heading child — it is always the section title in xml2rfc
        let heading = match section_elem.select(&heading_sel).next() {
            Some(h) => h,
            None => continue,
        };

        let depth = match heading.value().name() {
            "h2" => 2u8,
            "h3" => 3,
            "h4" => 4,
            "h5" => 5,
            "h6" => 6,
            _ => 2,
        };

        let title = ietf_extract_title(&heading);
        let content_text = extract_ietf_prose(&section_elem, converter);

        parsed.push(ParsedSection {
            anchor: section_id.to_string(),
            title,
            content_text,
            section_type: SectionType::Heading,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: Some(depth),
        });
    }

    Ok(parsed)
}

// ── Generic spec parsing ──────────────────────────────────────────────────────

/// Parse generic (non-IETF) spec HTML into a flat, document-ordered list of
/// sections.
///
/// A single selector query gathers every candidate element so that results
/// keep document order. Candidates and their handlers:
/// - `h2`–`h6` with an id (WHATWG/W3C headings) → `sections::parse_heading_element`
/// - `dfn` with an id → `sections::parse_dfn_element`, unless the `dfn` sits
///   inside an `emu-clause`/`emu-annex`, in which case it is ignored
/// - `emu-clause` / `emu-annex` with an id (TC39/ecmarkup) →
///   `sections::parse_emu_clause_element`
/// - `tr` / `dt` / `section` / `li` with an id (named anchor targets in W3C
///   specs) → `sections::parse_anchor_element`
///
/// A handler returning `None` contributes nothing; a handler error aborts the
/// parse, as does a selector that fails to compile.
fn parse_generic_html(document: &Html, converter: &HtmlToMarkdown) -> Result<Vec<ParsedSection>> {
    let selector = Selector::parse(
        "h2[id], h3[id], h4[id], h5[id], h6[id], dfn[id], emu-clause[id], emu-annex[id], tr[id], dt[id], section[id], li[id]",
    )
    .map_err(|e| anyhow::anyhow!("Invalid selector: {:?}", e))?;

    let mut sections = Vec::new();

    for element in document.select(&selector) {
        let parsed = match element.value().name() {
            "h2" | "h3" | "h4" | "h5" | "h6" => {
                sections::parse_heading_element(&element, converter)?
            }
            // In ecmarkup specs the enclosing clause is the section; inline dfns are skipped.
            "dfn" if is_inside_emu_clause(&element) => None,
            "dfn" => sections::parse_dfn_element(&element, converter)?,
            "emu-clause" | "emu-annex" => {
                sections::parse_emu_clause_element(&element, converter)?
            }
            "tr" | "dt" | "section" | "li" => {
                sections::parse_anchor_element(&element, converter)?
            }
            _ => None,
        };
        sections.extend(parsed);
    }

    Ok(sections)
}

/// Parse a complete spec HTML document into structured sections, references
/// and IDL definitions.
///
/// - `html`: the raw document; parsed once for sections and also handed
///   verbatim to the reference and IDL extractors.
/// - `spec_name`: forwarded to `references::extract_references`.
/// - `base_url`: passed to `markdown::build_converter`; used to absolutize
///   relative links in content markdown.
///
/// # Errors
///
/// Propagates any error from the IETF or generic section parser.
pub fn parse_spec(html: &str, spec_name: &str, base_url: &str) -> Result<ParsedSpec> {
    let document = Html::parse_document(html);
    let converter = markdown::build_converter(base_url);

    // IETF RFC HTML (xml2rfc format) keeps its canonical anchors on
    // `<section id="section-N">` elements rather than on headings, so it gets
    // a dedicated parser.
    let flat_sections = if is_ietf_html(&document) {
        parse_ietf_html(&document, &converter)?
    } else {
        parse_generic_html(&document, &converter)?
    };

    // Fill in parent / prev / next relationships.
    let sections = sections::build_section_tree(flat_sections);

    // Resolving cross-spec URLs needs a SpecRegistry. For now an empty one is
    // created here (to be passed in later for full functionality).
    let registry = crate::spec_registry::SpecRegistry::new();

    Ok(ParsedSpec {
        references: references::extract_references(html, spec_name, &sections, &registry),
        idl_definitions: idl_defs::extract_idl_definitions(html),
        sections,
    })
}

/// Report whether `element` has an `emu-clause` or `emu-annex` ancestor
/// (TC39/ecmarkup markup).
///
/// In ecmarkup specs a `dfn` is an inline term definition within clause
/// content. The clause itself is the section, so such a `dfn` is not indexed
/// as a standalone entry. Only ancestors are inspected, never `element`
/// itself.
fn is_inside_emu_clause(element: &scraper::ElementRef) -> bool {
    element
        .ancestors()
        .filter_map(scraper::ElementRef::wrap)
        .any(|ancestor| matches!(ancestor.value().name(), "emu-clause" | "emu-annex"))
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::SectionType;

    /// End-to-end check of the generic pipeline: headings, plain definitions,
    /// an IDL definition and an algorithm definition, plus tree links.
    #[test]
    fn test_parse_spec_full_pipeline() {
        let html = r#"
            <h2 id="intro">Introduction</h2>
            <p>This spec defines <dfn id="concept-widget">widgets</dfn>.</p>

            <h3 id="types">Widget Types</h3>
            <pre class="idl">
                <c- b>interface</c-> <dfn data-dfn-type="interface" id="widget"><code>Widget</code></dfn> {
                    <c- g>constructor</c->();
                };
            </pre>

            <div class="algorithm" data-algorithm="create widget">
                <p>To <dfn id="create-widget">create a widget</dfn>:</p>
                <ol>
                    <li>Let w be a new Widget.</li>
                    <li>Return w.</li>
                </ol>
            </div>

            <h3 id="examples">Examples</h3>
            <p>See the <dfn id="widget-example">widget example</dfn>.</p>
        "#;

        let parsed = parse_spec(html, "TEST", "https://test.example.com").unwrap();

        // Expected (anchor, section type, parent anchor), in document order.
        let expected = [
            ("intro", SectionType::Heading, None),
            ("concept-widget", SectionType::Definition, Some("intro")),
            ("types", SectionType::Heading, Some("intro")),
            ("widget", SectionType::Idl, Some("types")),
            ("create-widget", SectionType::Algorithm, Some("types")),
            ("examples", SectionType::Heading, Some("intro")),
            ("widget-example", SectionType::Definition, Some("examples")),
        ];

        // Seven sections in total, and the IDL block must have been picked up.
        assert_eq!(parsed.sections.len(), expected.len());
        assert!(!parsed.idl_definitions.is_empty());

        for (section, (anchor, kind, parent)) in parsed.sections.iter().zip(expected.iter()) {
            assert_eq!(section.anchor, *anchor);
            assert_eq!(&section.section_type, kind);
            assert_eq!(section.parent_anchor.as_deref(), *parent);
        }

        // `examples` (h3) follows `types` (h3) under the same parent.
        assert_eq!(parsed.sections[5].prev_anchor.as_deref(), Some("types"));
    }

    /// A document with no candidate elements yields an entirely empty result.
    #[test]
    fn test_parse_spec_empty() {
        let parsed = parse_spec(
            "<html><body></body></html>",
            "TEST",
            "https://test.example.com",
        )
        .unwrap();

        assert_eq!(parsed.sections.len(), 0);
        assert_eq!(parsed.references.len(), 0);
        assert_eq!(parsed.idl_definitions.len(), 0);
    }

    /// TC39/ecmarkup input: every `emu-clause` becomes a section, nesting
    /// determines depth and parentage, and sibling links are populated.
    #[test]
    fn test_parse_spec_ecmarkup_pipeline() {
        let html = r#"
            <emu-clause id="sec-types">
                <h1><span class="secnum">6</span> ECMAScript Data Types</h1>
                <p>An ECMAScript language type corresponds to values.</p>

                <emu-clause id="sec-undefined-type">
                    <h1><span class="secnum">6.1</span> The Undefined Type</h1>
                    <p>The Undefined type has exactly one value, called <emu-val>undefined</emu-val>.</p>
                </emu-clause>

                <emu-clause id="sec-tostring" type="abstract operation" aoid="ToString">
                    <h1><span class="secnum">6.2</span> ToString ( <var>argument</var> )</h1>
                    <p>Converts argument to a String.</p>
                    <emu-alg>
                        <ol>
                            <li>If <var>argument</var> is a String, return <var>argument</var>.</li>
                            <li>Return "default".</li>
                        </ol>
                    </emu-alg>
                </emu-clause>
            </emu-clause>
        "#;

        let parsed = parse_spec(html, "ECMA-262", "https://tc39.es/ecma262").unwrap();

        // Exactly three sections (the emu-clauses); dfns inside clauses are skipped.
        let [root, undefined_type, to_string] = parsed.sections.as_slice() else {
            panic!("expected exactly 3 sections, got {}", parsed.sections.len());
        };

        // Outer clause
        assert_eq!(root.anchor, "sec-types");
        assert_eq!(root.title.as_deref(), Some("ECMAScript Data Types"));
        assert_eq!(root.section_type, SectionType::Heading);
        assert_eq!(root.depth, Some(2));
        assert_eq!(root.parent_anchor, None);

        // First nested clause
        assert_eq!(undefined_type.anchor, "sec-undefined-type");
        assert_eq!(undefined_type.depth, Some(3));
        assert_eq!(undefined_type.parent_anchor.as_deref(), Some("sec-types"));

        // Abstract operation clause
        assert_eq!(to_string.anchor, "sec-tostring");
        assert_eq!(to_string.section_type, SectionType::Algorithm);
        assert_eq!(to_string.depth, Some(3));
        assert_eq!(to_string.parent_anchor.as_deref(), Some("sec-types"));

        // The two nested clauses are linked as siblings.
        assert_eq!(undefined_type.next_anchor.as_deref(), Some("sec-tostring"));
        assert_eq!(to_string.prev_anchor.as_deref(), Some("sec-undefined-type"));
    }

    /// xml2rfc input: anchors come from `<section>` ids, titles from
    /// `a.section-name`, and boilerplate / table of contents are dropped.
    #[test]
    fn test_parse_spec_ietf_xml2rfc() {
        // Minimal xml2rfc-style HTML with the canonical IETF structure:
        // - <html class="RFC"> selects the IETF path
        // - <section id="section-N"> carries the anchor users reference
        // - headings have id="name-..." (NOT what users reference)
        // - the section number lives in <a class="section-number selfRef">
        // - the section title lives in <a class="section-name selfRef">
        let html = r##"<!DOCTYPE html>
<html class="RFC">
<head><title>Test RFC</title></head>
<body>
<section id="section-1">
  <h2 id="name-introduction">
    <a class="section-number selfRef" href="#section-1">1. </a>
    <a class="section-name selfRef" href="#name-introduction">Introduction</a>
  </h2>
  <p>This document defines something useful.</p>

  <section id="section-1.1">
    <h3 id="name-overview">
      <a class="section-number selfRef" href="#section-1.1">1.1. </a>
      <a class="section-name selfRef" href="#name-overview">Overview</a>
    </h3>
    <p>An overview of the protocol.</p>
  </section>
</section>

<section id="section-2">
  <h2 id="name-protocol">
    <a class="section-number selfRef" href="#section-2">2. </a>
    <a class="section-name selfRef" href="#name-protocol">Protocol</a>
  </h2>
  <p>The protocol works as follows.</p>
</section>

<section id="appendix-A">
  <h2 id="name-appendix-a">
    <a class="section-number selfRef" href="#appendix-A">A. </a>
    <a class="section-name selfRef" href="#name-appendix-a">Appendix A</a>
  </h2>
  <p>Additional notes.</p>
</section>

<section id="section-boilerplate.1">
  <h2 id="name-status">Status of This Memo</h2>
  <p>This is an Internet Standards Track document.</p>
</section>

<section id="section-toc">
  <h2 id="name-toc">Table of Contents</h2>
</section>
</body>
</html>"##;

        let parsed = parse_spec(
            html,
            "RFC9999",
            "https://www.rfc-editor.org/rfc/rfc9999.html",
        )
        .unwrap();

        // Expected (anchor, title, depth). Anchors are the section ids rather
        // than the heading name-* ids; titles carry no numeric prefix; depth
        // follows the heading tag. Boilerplate and TOC do not appear.
        let expected = [
            ("section-1", "Introduction", 2u8),
            ("section-1.1", "Overview", 3),
            ("section-2", "Protocol", 2),
            ("appendix-A", "Appendix A", 2),
        ];
        assert_eq!(parsed.sections.len(), expected.len());

        for (section, (anchor, title, depth)) in parsed.sections.iter().zip(expected.iter()) {
            assert_eq!(section.anchor, *anchor);
            assert_eq!(section.title.as_deref(), Some(*title));
            assert_eq!(section.depth, Some(*depth));
        }

        // Tree: section-1.1 hangs off section-1.
        assert_eq!(
            parsed.sections[1].parent_anchor.as_deref(),
            Some("section-1")
        );

        // section-2 is top-level (h2) and its previous sibling is section-1.
        assert_eq!(parsed.sections[2].parent_anchor, None);
        assert_eq!(parsed.sections[2].prev_anchor.as_deref(), Some("section-1"));
    }
}