Skip to main content

webspec_index/parse/
sections.rs

1use crate::model::{ParsedSection, SectionType};
2use anyhow::Result;
3use htmd::HtmlToMarkdown;
4#[cfg(test)]
5use scraper::{Html, Selector};
6
7/// Extract content between a heading and the next section (heading or dfn)
8/// Returns the markdown-converted prose content
9fn extract_heading_content(
10    heading: &scraper::ElementRef,
11    current_depth: u8,
12    converter: &HtmlToMarkdown,
13) -> Option<String> {
14    use super::markdown;
15
16    let mut content_html = String::new();
17    let mut current = heading.next_sibling();
18
19    while let Some(node) = current {
20        if let Some(sibling_elem) = scraper::ElementRef::wrap(node) {
21            let tag_name = sibling_elem.value().name();
22
23            // Stop at next heading of same or higher level
24            if let Some(sibling_depth) = heading_depth(tag_name) {
25                if sibling_depth <= current_depth {
26                    break;
27                }
28            }
29
30            // Stop at definitions (they're separate sections)
31            if tag_name == "dfn" && sibling_elem.value().attr("id").is_some() {
32                break;
33            }
34
35            // Collect this element's HTML
36            content_html.push_str(&sibling_elem.html());
37        }
38
39        current = node.next_sibling();
40    }
41
42    if content_html.trim().is_empty() {
43        return None;
44    }
45
46    let markdown = markdown::element_to_markdown_from_html(&content_html, converter);
47    let trimmed = markdown.trim();
48
49    if trimmed.is_empty() {
50        None
51    } else {
52        Some(trimmed.to_string())
53    }
54}
55
56/// Extract title text from a heading element, stripping secno and self-link
57fn extract_heading_title(element: &scraper::ElementRef) -> Option<String> {
58    // Clone the element to manipulate it
59    let mut text_parts = Vec::new();
60
61    for node in element.children() {
62        if let Some(elem) = scraper::ElementRef::wrap(node) {
63            // Skip section number spans and self-links:
64            // - "secno" (Bikeshed/Wattsi), "secnum" (ecmarkup/TC39), "self-link"
65            let classes = elem.value().classes().collect::<Vec<_>>();
66            if classes.contains(&"secno")
67                || classes.contains(&"secnum")
68                || classes.contains(&"self-link")
69            {
70                continue;
71            }
72            // Get text from other elements (like <span class="content">)
73            text_parts.push(elem.text().collect::<String>());
74        } else if let Some(text) = node.value().as_text() {
75            text_parts.push(text.to_string());
76        }
77    }
78
79    let result = text_parts.join("").trim().to_string();
80    if result.is_empty() {
81        None
82    } else {
83        Some(result)
84    }
85}
86
/// Map a heading tag name to its numeric depth.
///
/// Only the exact lowercase names `h2` through `h6` are recognised and yield
/// `Some(2)` through `Some(6)`; every other input (including `h1`) is `None`.
fn heading_depth(tag: &str) -> Option<u8> {
    match tag.as_bytes() {
        [b'h', level @ b'2'..=b'6'] => Some(*level - b'0'),
        _ => None,
    }
}
98
99/// Parse a single heading element into a ParsedSection
100pub fn parse_heading_element(
101    element: &scraper::ElementRef,
102    converter: &HtmlToMarkdown,
103) -> Result<Option<ParsedSection>> {
104    let anchor = match element.value().attr("id") {
105        Some(id) => id.to_string(),
106        None => return Ok(None), // No id, skip this heading
107    };
108
109    let title = extract_heading_title(element);
110    let depth = heading_depth(element.value().name())
111        .ok_or_else(|| anyhow::anyhow!("Invalid heading tag: {}", element.value().name()))?;
112
113    // Extract content between this heading and the next heading/definition
114    let content_text = extract_heading_content(element, depth, converter);
115
116    Ok(Some(ParsedSection {
117        anchor,
118        title,
119        content_text,
120        section_type: SectionType::Heading,
121        parent_anchor: None,
122        prev_anchor: None,
123        next_anchor: None,
124        depth: Some(depth),
125    }))
126}
127
128/// Parse a single dfn element into a ParsedSection
129/// Determines whether it's a Definition, Algorithm, or IDL based on context
130pub fn parse_dfn_element(
131    element: &scraper::ElementRef,
132    converter: &HtmlToMarkdown,
133) -> Result<Option<ParsedSection>> {
134    let anchor = match element.value().attr("id") {
135        Some(id) => id.to_string(),
136        None => return Ok(None), // No id, skip this dfn
137    };
138
139    // Skip dfns that are inside algorithm content (e.g., inside <ol> steps)
140    // These are part of the algorithm's markdown content, not separate sections
141    if is_inside_algorithm_content(element) {
142        return Ok(None);
143    }
144
145    // Skip parameter dfns:
146    // 1. Those with data-dfn-for but WITHOUT data-dfn-type (e.g., <dfn data-dfn-for="navigate">url</dfn>)
147    // 2. Those with <var> as direct child (e.g., <dfn><var>options</var></dfn>)
148    // BUT keep method/attribute dfns which have BOTH data-dfn-for AND data-dfn-type
149    // Example of PARAMETER (skip): <dfn data-dfn-for="navigate"><var>url</var></dfn>
150    // Example of PARAMETER (skip): <dfn><var>options</var></dfn>
151    // Example of METHOD (keep): <dfn data-dfn-for="HTMLSlotElement" data-dfn-type="method">assign(...)</dfn>
152    let has_dfn_for = element.value().attr("data-dfn-for").is_some();
153    let has_dfn_type = element.value().attr("data-dfn-type").is_some();
154    let has_direct_var_child = element
155        .children()
156        .filter_map(scraper::ElementRef::wrap)
157        .any(|c| c.value().name() == "var");
158
159    // Skip if it's a parameter dfn
160    if (has_dfn_for && !has_dfn_type) || has_direct_var_child {
161        return Ok(None);
162    }
163
164    // Skip argument dfns (data-dfn-type="argument" in Bikeshed-generated specs)
165    // These are WebIDL function parameters, not standalone queryable concepts
166    if element.value().attr("data-dfn-type") == Some("argument") {
167        return Ok(None);
168    }
169
170    // Extract text content (including nested elements like <code>)
171    let title = element.text().collect::<String>().trim().to_string();
172    let title = if title.is_empty() { None } else { Some(title) };
173
174    // Determine section type based on context
175    // (parameter dfns already skipped above)
176    let section_type = if is_inside_algorithm_div(element) {
177        SectionType::Algorithm
178    } else if is_idl_type(element) {
179        SectionType::Idl
180    } else {
181        SectionType::Definition
182    };
183
184    // Extract content based on section type
185    let content_text = match section_type {
186        SectionType::Definition => extract_definition_content(element, converter),
187        SectionType::Algorithm => extract_algorithm_content(element, converter),
188        SectionType::Idl => extract_idl_content(element),
189        _ => None,
190    };
191
192    Ok(Some(ParsedSection {
193        anchor,
194        title,
195        content_text,
196        section_type,
197        parent_anchor: None,
198        prev_anchor: None,
199        next_anchor: None,
200        depth: None,
201    }))
202}
203
204/// Extract content for a definition (dfn not in algorithm, not IDL)
205/// Finds the enclosing block-level element and converts to markdown
206fn extract_definition_content(
207    element: &scraper::ElementRef,
208    converter: &HtmlToMarkdown,
209) -> Option<String> {
210    use super::markdown;
211
212    // Find the enclosing block-level element (p, div, dd, etc.)
213    let mut current = element.parent();
214    while let Some(node) = current {
215        if let Some(parent_elem) = scraper::ElementRef::wrap(node) {
216            let tag_name = parent_elem.value().name();
217            // Block-level elements that can contain definitions
218            if matches!(tag_name, "p" | "div" | "dd" | "dt" | "li" | "section") {
219                return Some(markdown::element_to_markdown(&parent_elem, converter));
220            }
221        }
222        current = node.parent();
223    }
224
225    // Fallback: just use the dfn's text
226    Some(element.text().collect::<String>().trim().to_string())
227}
228
229/// Extract content for an algorithm (dfn inside div.algorithm or with sibling <ol>)
230/// Handles both Bikeshed (div.algorithm) and Wattsi (sibling ol) patterns
231fn extract_algorithm_content(
232    element: &scraper::ElementRef,
233    converter: &HtmlToMarkdown,
234) -> Option<String> {
235    use super::{algorithms, markdown};
236
237    let mut current = element.parent();
238    while let Some(node) = current {
239        if let Some(parent_elem) = scraper::ElementRef::wrap(node) {
240            // Bikeshed/Wattsi div pattern: div.algorithm or div[data-algorithm]
241            if parent_elem.value().name() == "div" {
242                let classes: Vec<_> = parent_elem.value().classes().collect();
243                let is_algo_div = classes.contains(&"algorithm")
244                    || parent_elem.value().attr("data-algorithm").is_some();
245                if is_algo_div {
246                    return extract_from_algorithm_div(&parent_elem, converter);
247                }
248            }
249
250            // Wattsi sibling pattern: <p>To <dfn>foo</dfn>:</p><ol>...</ol>
251            if matches!(parent_elem.value().name(), "p" | "dd" | "li") {
252                let intro = markdown::element_to_markdown(&parent_elem, converter);
253
254                let mut sibling = node.next_sibling();
255                while let Some(sib_node) = sibling {
256                    if let Some(sib_elem) = scraper::ElementRef::wrap(sib_node) {
257                        if sib_elem.value().name() == "ol" {
258                            let steps = algorithms::render_algorithm_ol(&sib_elem, converter);
259                            return Some(format!("{}\n\n{}", intro.trim(), steps));
260                        }
261                        if matches!(
262                            sib_elem.value().name(),
263                            "p" | "div" | "h2" | "h3" | "h4" | "h5" | "h6"
264                        ) {
265                            break;
266                        }
267                    }
268                    sibling = sib_node.next_sibling();
269                }
270            }
271        }
272        current = node.parent();
273    }
274
275    None
276}
277
278/// Extract algorithm content from a div.algorithm or div[data-algorithm] container.
279/// Properly separates the intro paragraph(s) from the steps <ol>.
280fn extract_from_algorithm_div(
281    div: &scraper::ElementRef,
282    converter: &HtmlToMarkdown,
283) -> Option<String> {
284    use super::algorithms;
285
286    let ol_selector = scraper::Selector::parse("ol").ok()?;
287    let ol_elem = div.select(&ol_selector).next()?;
288
289    // Build intro HTML from children before the first <ol>
290    let mut intro_html = String::new();
291    for child in div.children() {
292        if let Some(child_elem) = scraper::ElementRef::wrap(child) {
293            if child_elem.value().name() == "ol" {
294                break;
295            }
296            intro_html.push_str(&child_elem.html());
297        } else if let Some(text) = child.value().as_text() {
298            intro_html.push_str(text);
299        }
300    }
301
302    let intro = converter
303        .convert(&intro_html)
304        .unwrap_or_default()
305        .trim()
306        .to_string();
307    let steps = algorithms::render_algorithm_ol(&ol_elem, converter);
308    Some(format!("{}\n\n{}", intro, steps))
309}
310
311/// Extract content for an IDL type (dfn with data-dfn-type)
312/// Finds the parent <pre> block and extracts IDL
313fn extract_idl_content(element: &scraper::ElementRef) -> Option<String> {
314    use super::idl;
315
316    // Find the parent <pre> element
317    let mut current = element.parent();
318    while let Some(node) = current {
319        if let Some(parent_elem) = scraper::ElementRef::wrap(node) {
320            if parent_elem.value().name() == "pre" {
321                let idl_text = idl::extract_idl_text(&parent_elem);
322                return Some(idl_text);
323            }
324        }
325        current = node.parent();
326    }
327
328    None
329}
330
331/// Parse a generic anchor-bearing element (tr, dt, section, li) into a ParsedSection.
332/// W3C specs use these as named targets that don't fit the dfn/heading pattern.
333pub fn parse_anchor_element(
334    element: &scraper::ElementRef,
335    converter: &HtmlToMarkdown,
336) -> Result<Option<ParsedSection>> {
337    use super::markdown;
338
339    let anchor = match element.value().attr("id") {
340        Some(id) => id.to_string(),
341        None => return Ok(None),
342    };
343
344    let title_text = element.text().collect::<String>();
345    let title_text = title_text.trim();
346    let title = if title_text.is_empty() {
347        None
348    } else {
349        let truncated = if title_text.len() > 120 {
350            let boundary = title_text
351                .char_indices()
352                .map(|(i, _)| i)
353                .take_while(|&i| i <= 120)
354                .last()
355                .unwrap_or(0);
356            format!("{}…", &title_text[..boundary])
357        } else {
358            title_text.to_string()
359        };
360        Some(truncated)
361    };
362
363    let content_text = {
364        let html = element.html();
365        let md = markdown::element_to_markdown_from_html(&html, converter);
366        let trimmed = md.trim().to_string();
367        if trimmed.is_empty() {
368            None
369        } else {
370            Some(trimmed)
371        }
372    };
373
374    Ok(Some(ParsedSection {
375        anchor,
376        title,
377        content_text,
378        section_type: crate::model::SectionType::Definition,
379        parent_anchor: None,
380        prev_anchor: None,
381        next_anchor: None,
382        depth: None,
383    }))
384}
385
386/// Parse an ecmarkup `<emu-clause>` or `<emu-annex>` element into a ParsedSection.
387/// TC39 specs use these custom elements instead of standard headings.
388/// The section ID is on the emu-clause, with an `<h1>` child containing the title.
389pub fn parse_emu_clause_element(
390    element: &scraper::ElementRef,
391    converter: &HtmlToMarkdown,
392) -> Result<Option<ParsedSection>> {
393    let anchor = match element.value().attr("id") {
394        Some(id) => id.to_string(),
395        None => return Ok(None),
396    };
397
398    // Find the direct <h1> child to extract title and depth
399    let h1 = element
400        .children()
401        .filter_map(scraper::ElementRef::wrap)
402        .find(|c| c.value().name() == "h1");
403
404    let (title, depth) = match h1 {
405        Some(h1_elem) => {
406            let title = extract_heading_title(&h1_elem);
407            let depth = extract_secnum_depth(&h1_elem);
408            (title, depth)
409        }
410        None => (None, None),
411    };
412
413    // Classify: if the emu-clause has a type attribute, it's an algorithm-like operation
414    let section_type = if element.value().attr("type").is_some() {
415        SectionType::Algorithm
416    } else {
417        SectionType::Heading
418    };
419
420    let content_text = extract_emu_clause_content(element, converter);
421
422    Ok(Some(ParsedSection {
423        anchor,
424        title,
425        content_text,
426        section_type,
427        parent_anchor: None,
428        prev_anchor: None,
429        next_anchor: None,
430        depth,
431    }))
432}
433
434/// Extract the depth from a secnum span inside a heading.
435/// Parses `<span class="secnum">7.1.17</span>` → count parts → depth = parts + 1.
436/// Returns None if no secnum is found.
437fn extract_secnum_depth(heading: &scraper::ElementRef) -> Option<u8> {
438    for child in heading.children() {
439        if let Some(elem) = scraper::ElementRef::wrap(child) {
440            let classes: Vec<_> = elem.value().classes().collect();
441            if classes.contains(&"secnum") {
442                let text = elem.text().collect::<String>();
443                let text = text.trim();
444                if text.is_empty() {
445                    return None;
446                }
447                // Count parts: "7" → 1 part, "7.1" → 2 parts, "7.1.17" → 3 parts
448                let parts = text.split('.').count();
449                // Depth = parts + 1 to match h2=2 convention (top-level = depth 2)
450                return Some((parts + 1).min(255) as u8);
451            }
452        }
453    }
454    None
455}
456
457/// Extract content from an ecmarkup emu-clause element.
458/// Unlike standard headings where content is between siblings, emu-clause content
459/// is nested inside the element as children. We skip the h1 (title) and child
460/// emu-clause/emu-annex elements (sub-sections).
461fn extract_emu_clause_content(
462    element: &scraper::ElementRef,
463    converter: &HtmlToMarkdown,
464) -> Option<String> {
465    use super::{algorithms, markdown};
466
467    let mut intro_html = String::new();
468    let mut algo_steps: Option<String> = None;
469
470    for child in element.children() {
471        if let Some(child_elem) = scraper::ElementRef::wrap(child) {
472            let tag = child_elem.value().name();
473
474            // Skip title heading and sub-sections
475            if tag == "h1" || tag == "emu-clause" || tag == "emu-annex" || tag == "emu-import" {
476                continue;
477            }
478
479            // For emu-alg, use the dedicated algorithm renderer on its inner <ol>
480            if tag == "emu-alg" {
481                if let Some(ol) = child_elem
482                    .children()
483                    .filter_map(scraper::ElementRef::wrap)
484                    .find(|c| c.value().name() == "ol")
485                {
486                    algo_steps = Some(algorithms::render_algorithm_ol(&ol, converter));
487                }
488                continue;
489            }
490
491            // Skip legacy ID spans (empty <span id="...">)
492            if tag == "span" && child_elem.value().attr("id").is_some() {
493                let text = child_elem.text().collect::<String>();
494                if text.trim().is_empty() {
495                    continue;
496                }
497            }
498
499            intro_html.push_str(&child_elem.html());
500        }
501    }
502
503    let intro = markdown::element_to_markdown_from_html(&intro_html, converter);
504    let intro = intro.trim();
505
506    match (intro.is_empty(), algo_steps) {
507        (true, None) => None,
508        (true, Some(steps)) => Some(steps),
509        (false, None) => Some(intro.to_string()),
510        (false, Some(steps)) => Some(format!("{}\n\n{}", intro, steps)),
511    }
512}
513
/// Test helper: parse every `h2`-`h6` element that has an `id`.
///
/// Each heading goes through `parse_heading_element`, after which its
/// `content_text` is reset to `None` so tree-building tests can compare
/// sections without depending on extracted prose.
#[cfg(test)]
pub fn collect_headings(html: &str) -> Result<Vec<ParsedSection>> {
    let selector = Selector::parse("h2[id], h3[id], h4[id], h5[id], h6[id]")
        .map_err(|e| anyhow::anyhow!("Invalid selector: {:?}", e))?;
    let converter = crate::parse::markdown::build_converter("https://test.example.com");
    let document = Html::parse_document(html);

    let mut headings = Vec::new();
    for element in document.select(&selector) {
        if let Some(mut section) = parse_heading_element(&element, &converter)? {
            section.content_text = None;
            headings.push(section);
        }
    }

    Ok(headings)
}
538
539/// Check if a dfn is inside an algorithm's <ol> content (i.e., part of the algorithm steps)
540/// These dfns should not be collected as separate sections - they're part of algorithm content
541fn is_inside_algorithm_content(element: &scraper::ElementRef) -> bool {
542    // Check if this element is inside an <ol>
543    let mut current = element.parent();
544    while let Some(node) = current {
545        if let Some(parent_elem) = scraper::ElementRef::wrap(node) {
546            if parent_elem.value().name() == "ol" {
547                // Found an <ol> ancestor. Now check if this <ol> is part of an algorithm.
548                // Two patterns:
549                // 1. Bikeshed: <div class="algorithm">...<ol>...</ol></div>
550                // 2. Wattsi: <p>To <dfn>foo</dfn>:</p><ol>...</ol> (sibling pattern)
551
552                // Check if <ol> is inside div.algorithm or div[data-algorithm]
553                let mut ol_ancestor = parent_elem.parent();
554                while let Some(anc_node) = ol_ancestor {
555                    if let Some(anc_elem) = scraper::ElementRef::wrap(anc_node) {
556                        if anc_elem.value().name() == "div" {
557                            let classes: Vec<_> = anc_elem.value().classes().collect();
558                            if classes.contains(&"algorithm")
559                                || anc_elem.value().attr("data-algorithm").is_some()
560                            {
561                                return true; // Inside Bikeshed/Wattsi div.algorithm pattern
562                            }
563                        }
564                    }
565                    ol_ancestor = anc_node.parent();
566                }
567
568                // Check Wattsi sibling pattern: preceding <p> contains algorithm-defining dfn
569                let mut prev_sibling = node.prev_sibling();
570                while let Some(prev_node) = prev_sibling {
571                    if let Some(prev_elem) = scraper::ElementRef::wrap(prev_node) {
572                        if matches!(prev_elem.value().name(), "p" | "dd" | "li") {
573                            // Check if this block contains a dfn (algorithm-defining)
574                            if let Ok(dfn_selector) = scraper::Selector::parse("dfn[id]") {
575                                if prev_elem.select(&dfn_selector).next().is_some() {
576                                    return true; // Wattsi sibling pattern detected
577                                }
578                            }
579                        }
580                        // Stop at block elements
581                        if matches!(
582                            prev_elem.value().name(),
583                            "p" | "div" | "h2" | "h3" | "h4" | "h5" | "h6"
584                        ) {
585                            break;
586                        }
587                    }
588                    prev_sibling = prev_node.prev_sibling();
589                }
590
591                // <ol> is not part of an algorithm, so this dfn is not in algorithm content
592                return false;
593            }
594        }
595        current = node.parent();
596    }
597    false
598}
599
600/// Check if an element is inside a <div class="algorithm"> or followed by sibling <ol>
601/// Detects both Bikeshed style (div.algorithm wrapping) and Wattsi style (sibling ol)
602fn is_inside_algorithm_div(element: &scraper::ElementRef) -> bool {
603    // First check Bikeshed pattern: parent div.algorithm
604    let mut current = element.parent();
605    while let Some(node) = current {
606        if let Some(parent_elem) = scraper::ElementRef::wrap(node) {
607            if parent_elem.value().name() == "div" {
608                let classes: Vec<_> = parent_elem.value().classes().collect();
609                if classes.contains(&"algorithm") {
610                    return true;
611                }
612            }
613
614            // Also check Wattsi pattern: if this block element has a sibling <ol>
615            // (e.g., <p>To <dfn>foo</dfn>:</p><ol>...</ol>)
616            if matches!(parent_elem.value().name(), "p" | "div" | "dd" | "li") {
617                // Check if there's a following <ol> sibling
618                let mut sibling = node.next_sibling();
619                while let Some(sib_node) = sibling {
620                    if let Some(sib_elem) = scraper::ElementRef::wrap(sib_node) {
621                        if sib_elem.value().name() == "ol" {
622                            return true;
623                        }
624                        // Stop if we hit another block element (not whitespace)
625                        if matches!(
626                            sib_elem.value().name(),
627                            "p" | "div" | "h2" | "h3" | "h4" | "h5" | "h6"
628                        ) {
629                            break;
630                        }
631                    }
632                    sibling = sib_node.next_sibling();
633                }
634            }
635        }
636        current = node.parent();
637    }
638    false
639}
640
641/// Check if a dfn element is an IDL type definition
642fn is_idl_type(element: &scraper::ElementRef) -> bool {
643    if let Some(dfn_type) = element.value().attr("data-dfn-type") {
644        matches!(
645            dfn_type,
646            "interface" | "dictionary" | "enum" | "callback" | "callback interface" | "typedef"
647        )
648    } else {
649        false
650    }
651}
652
/// Test helper: list every `dfn[id][data-dfn-type]` that is an IDL type.
///
/// Only anchor and title are filled in; content, tree links and depth are
/// all left as `None`.
#[cfg(test)]
pub fn collect_idl(html: &str) -> Result<Vec<ParsedSection>> {
    let selector = Selector::parse("dfn[id][data-dfn-type]")
        .map_err(|e| anyhow::anyhow!("Invalid selector: {:?}", e))?;
    let document = Html::parse_document(html);

    let mut idl_sections = Vec::new();
    // Non-IDL kinds (methods, attributes, ...) are filtered out up front.
    for dfn in document.select(&selector).filter(|dfn| is_idl_type(dfn)) {
        let id = dfn
            .value()
            .attr("id")
            .ok_or_else(|| anyhow::anyhow!("IDL type missing id"))?;

        // Full text of the dfn, nested elements such as <code> included.
        let label: String = dfn.text().collect();
        let label = label.trim();

        idl_sections.push(ParsedSection {
            anchor: id.to_string(),
            title: if label.is_empty() {
                None
            } else {
                Some(label.to_string())
            },
            content_text: None,
            section_type: SectionType::Idl,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: None,
        });
    }

    Ok(idl_sections)
}
693
/// Test helper: list every `dfn[id]` found inside a `div.algorithm`.
///
/// Only anchor and title are filled in; content, tree links and depth are
/// all left as `None`.
#[cfg(test)]
pub fn collect_algorithms(html: &str) -> Result<Vec<ParsedSection>> {
    let selector = Selector::parse("div.algorithm dfn[id]")
        .map_err(|e| anyhow::anyhow!("Invalid selector: {:?}", e))?;
    let document = Html::parse_document(html);

    let mut algorithm_sections = Vec::new();
    for dfn in document.select(&selector) {
        let anchor = match dfn.value().attr("id") {
            Some(id) => id.to_string(),
            None => return Err(anyhow::anyhow!("Algorithm missing id")),
        };

        // Full text of the dfn, nested elements such as <code> included.
        let title = Some(dfn.text().collect::<String>().trim().to_string())
            .filter(|text| !text.is_empty());

        algorithm_sections.push(ParsedSection {
            anchor,
            title,
            content_text: None,
            section_type: SectionType::Algorithm,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: None,
        });
    }

    Ok(algorithm_sections)
}
729
/// Test helper: list every `dfn[id]` that is a plain definition.
///
/// A dfn is left out when `is_inside_algorithm_div` reports it as an
/// algorithm or `is_idl_type` reports it as an IDL type. Only anchor and
/// title are filled in; content, tree links and depth are all left as `None`.
#[cfg(test)]
pub fn collect_definitions(html: &str) -> Result<Vec<ParsedSection>> {
    let selector =
        Selector::parse("dfn[id]").map_err(|e| anyhow::anyhow!("Invalid selector: {:?}", e))?;
    let document = Html::parse_document(html);

    let plain_dfns = document
        .select(&selector)
        .filter(|dfn| !is_inside_algorithm_div(dfn) && !is_idl_type(dfn));

    let mut definition_sections = Vec::new();
    for dfn in plain_dfns {
        let id = dfn
            .value()
            .attr("id")
            .ok_or_else(|| anyhow::anyhow!("Definition missing id"))?;

        // Full text of the dfn, nested elements such as <code> included.
        let label: String = dfn.text().collect();
        let label = label.trim();

        definition_sections.push(ParsedSection {
            anchor: id.to_string(),
            title: if label.is_empty() {
                None
            } else {
                Some(label.to_string())
            },
            content_text: None,
            section_type: SectionType::Definition,
            parent_anchor: None,
            prev_anchor: None,
            next_anchor: None,
            depth: None,
        });
    }

    Ok(definition_sections)
}
775
776/// Build parent/child/sibling relationships for a flat list of sections
777pub fn build_section_tree(mut sections: Vec<ParsedSection>) -> Vec<ParsedSection> {
778    // First pass: compute parent relationships
779    for i in 0..sections.len() {
780        if let Some(current_depth) = sections[i].depth {
781            // This is a heading - find parent heading with depth < current
782            for j in (0..i).rev() {
783                if let Some(parent_depth) = sections[j].depth {
784                    if parent_depth < current_depth {
785                        sections[i].parent_anchor = Some(sections[j].anchor.clone());
786                        break;
787                    }
788                }
789            }
790        } else {
791            // This is a non-heading (definition, algorithm, IDL)
792            // Parent is the most recent heading (any heading)
793            for j in (0..i).rev() {
794                if sections[j].depth.is_some() {
795                    sections[i].parent_anchor = Some(sections[j].anchor.clone());
796                    break;
797                }
798            }
799        }
800    }
801
802    // Second pass: compute prev/next sibling relationships
803    for i in 0..sections.len() {
804        let current_depth = sections[i].depth;
805        let current_parent = sections[i].parent_anchor.clone();
806
807        // Look backwards for prev sibling (same depth, same parent)
808        for j in (0..i).rev() {
809            if sections[j].depth == current_depth && sections[j].parent_anchor == current_parent {
810                sections[i].prev_anchor = Some(sections[j].anchor.clone());
811                break;
812            }
813        }
814
815        // Look forwards for next sibling (same depth, same parent)
816        for j in (i + 1)..sections.len() {
817            if sections[j].depth == current_depth && sections[j].parent_anchor == current_parent {
818                sections[i].next_anchor = Some(sections[j].anchor.clone());
819                break;
820            }
821        }
822    }
823
824    sections
825}
826
827#[cfg(test)]
828mod tests {
829    use super::*;
830
831    #[test]
832    fn test_bikeshed_heading_parsing() {
833        let html = include_str!("../../tests/fixtures/headings/bikeshed_heading.html");
834        let sections = collect_headings(html).unwrap();
835
836        assert_eq!(sections.len(), 1);
837        let section = &sections[0];
838
839        assert_eq!(section.anchor, "trees");
840        assert_eq!(section.title, Some("Trees".to_string()));
841        assert_eq!(section.section_type, SectionType::Heading);
842        assert_eq!(section.depth, Some(3));
843    }
844
845    #[test]
846    fn test_wattsi_heading_parsing() {
847        let html = include_str!("../../tests/fixtures/headings/wattsi_heading.html");
848        let sections = collect_headings(html).unwrap();
849
850        assert_eq!(sections.len(), 1);
851        let section = &sections[0];
852
853        assert_eq!(section.anchor, "abstract");
854        assert_eq!(
855            section.title,
856            Some("Where does this specification fit?".to_string())
857        );
858        assert_eq!(section.section_type, SectionType::Heading);
859        assert_eq!(section.depth, Some(3));
860    }
861
862    #[test]
863    fn test_multiple_heading_levels() {
864        let html = r#"
865            <h2 id="section-1">Section 1</h2>
866            <h3 id="section-1-1">Section 1.1</h3>
867            <h4 id="section-1-1-1">Section 1.1.1</h4>
868            <h2 id="section-2">Section 2</h2>
869        "#;
870
871        let sections = collect_headings(html).unwrap();
872        assert_eq!(sections.len(), 4);
873
874        assert_eq!(sections[0].anchor, "section-1");
875        assert_eq!(sections[0].depth, Some(2));
876
877        assert_eq!(sections[1].anchor, "section-1-1");
878        assert_eq!(sections[1].depth, Some(3));
879
880        assert_eq!(sections[2].anchor, "section-1-1-1");
881        assert_eq!(sections[2].depth, Some(4));
882
883        assert_eq!(sections[3].anchor, "section-2");
884        assert_eq!(sections[3].depth, Some(2));
885    }
886
887    #[test]
888    fn test_heading_without_id_ignored() {
889        let html = r#"
890            <h2 id="has-id">With ID</h2>
891            <h2>Without ID</h2>
892        "#;
893
894        let sections = collect_headings(html).unwrap();
895        assert_eq!(sections.len(), 1);
896        assert_eq!(sections[0].anchor, "has-id");
897    }
898
899    #[test]
900    fn test_build_section_tree_simple_nesting() {
901        let html = r#"
902            <h2 id="s1">Section 1</h2>
903            <h3 id="s1-1">Section 1.1</h3>
904            <h3 id="s1-2">Section 1.2</h3>
905            <h4 id="s1-2-1">Section 1.2.1</h4>
906            <h2 id="s2">Section 2</h2>
907        "#;
908
909        let sections = collect_headings(html).unwrap();
910        let tree = build_section_tree(sections);
911
912        // s1: no parent, no prev, next=s2
913        assert_eq!(tree[0].parent_anchor, None);
914        assert_eq!(tree[0].prev_anchor, None);
915        assert_eq!(tree[0].next_anchor, Some("s2".to_string()));
916
917        // s1-1: parent=s1, no prev, next=s1-2
918        assert_eq!(tree[1].parent_anchor, Some("s1".to_string()));
919        assert_eq!(tree[1].prev_anchor, None);
920        assert_eq!(tree[1].next_anchor, Some("s1-2".to_string()));
921
922        // s1-2: parent=s1, prev=s1-1, no next
923        assert_eq!(tree[2].parent_anchor, Some("s1".to_string()));
924        assert_eq!(tree[2].prev_anchor, Some("s1-1".to_string()));
925        assert_eq!(tree[2].next_anchor, None);
926
927        // s1-2-1: parent=s1-2, no prev, no next
928        assert_eq!(tree[3].parent_anchor, Some("s1-2".to_string()));
929        assert_eq!(tree[3].prev_anchor, None);
930        assert_eq!(tree[3].next_anchor, None);
931
932        // s2: no parent, prev=s1, no next
933        assert_eq!(tree[4].parent_anchor, None);
934        assert_eq!(tree[4].prev_anchor, Some("s1".to_string()));
935        assert_eq!(tree[4].next_anchor, None);
936    }
937
938    #[test]
939    fn test_build_section_tree_flat_structure() {
940        let html = r#"
941            <h2 id="a">A</h2>
942            <h2 id="b">B</h2>
943            <h2 id="c">C</h2>
944        "#;
945
946        let sections = collect_headings(html).unwrap();
947        let tree = build_section_tree(sections);
948
949        // a: no parent, no prev, next=b
950        assert_eq!(tree[0].parent_anchor, None);
951        assert_eq!(tree[0].prev_anchor, None);
952        assert_eq!(tree[0].next_anchor, Some("b".to_string()));
953
954        // b: no parent, prev=a, next=c
955        assert_eq!(tree[1].parent_anchor, None);
956        assert_eq!(tree[1].prev_anchor, Some("a".to_string()));
957        assert_eq!(tree[1].next_anchor, Some("c".to_string()));
958
959        // c: no parent, prev=b, no next
960        assert_eq!(tree[2].parent_anchor, None);
961        assert_eq!(tree[2].prev_anchor, Some("b".to_string()));
962        assert_eq!(tree[2].next_anchor, None);
963    }
964
965    #[test]
966    fn test_build_section_tree_single_heading() {
967        let html = r#"<h2 id="only">Only Section</h2>"#;
968
969        let sections = collect_headings(html).unwrap();
970        let tree = build_section_tree(sections);
971
972        assert_eq!(tree.len(), 1);
973        assert_eq!(tree[0].parent_anchor, None);
974        assert_eq!(tree[0].prev_anchor, None);
975        assert_eq!(tree[0].next_anchor, None);
976    }
977
978    #[test]
979    fn test_build_section_tree_skip_levels() {
980        // Test when heading levels are skipped (h2 -> h4, skipping h3)
981        let html = r#"
982            <h2 id="top">Top</h2>
983            <h4 id="nested">Nested (skipped h3)</h4>
984            <h2 id="next">Next Top</h2>
985        "#;
986
987        let sections = collect_headings(html).unwrap();
988        let tree = build_section_tree(sections);
989
990        // nested: parent should still be 'top' (nearest lower depth)
991        assert_eq!(tree[1].parent_anchor, Some("top".to_string()));
992        assert_eq!(tree[1].prev_anchor, None); // no siblings at depth 4
993        assert_eq!(tree[1].next_anchor, None);
994    }
995
996    #[test]
997    fn test_bikeshed_definition_parsing() {
998        let html = include_str!("../../tests/fixtures/definitions/bikeshed_definition.html");
999        let sections = collect_definitions(html).unwrap();
1000
1001        assert_eq!(sections.len(), 1);
1002        let section = &sections[0];
1003
1004        assert_eq!(section.anchor, "concept-tree");
1005        assert_eq!(section.title, Some("tree".to_string()));
1006        assert_eq!(section.section_type, SectionType::Definition);
1007        assert_eq!(section.depth, None);
1008    }
1009
1010    #[test]
1011    fn test_wattsi_definition_parsing() {
1012        let html = include_str!("../../tests/fixtures/definitions/wattsi_definition.html");
1013        let sections = collect_definitions(html).unwrap();
1014
1015        assert_eq!(sections.len(), 1);
1016        let section = &sections[0];
1017
1018        assert_eq!(section.anchor, "in-parallel");
1019        assert_eq!(section.title, Some("in parallel".to_string()));
1020        assert_eq!(section.section_type, SectionType::Definition);
1021        assert_eq!(section.depth, None);
1022    }
1023
1024    #[test]
1025    fn test_definition_with_code() {
1026        let html = include_str!("../../tests/fixtures/definitions/definition_with_code.html");
1027        let sections = collect_definitions(html).unwrap();
1028
1029        assert_eq!(sections.len(), 1);
1030        let section = &sections[0];
1031
1032        assert_eq!(section.anchor, "x-that");
1033        assert_eq!(section.title, Some("createElement".to_string()));
1034        assert_eq!(section.section_type, SectionType::Definition);
1035    }
1036
1037    #[test]
1038    fn test_definition_without_id_ignored() {
1039        let html = r#"
1040            <dfn id="has-id">With ID</dfn>
1041            <dfn>Without ID</dfn>
1042        "#;
1043
1044        let sections = collect_definitions(html).unwrap();
1045        assert_eq!(sections.len(), 1);
1046        assert_eq!(sections[0].anchor, "has-id");
1047    }
1048
1049    #[test]
1050    fn test_multiple_definitions() {
1051        let html = r#"
1052            <p>A <dfn id="def-1">first term</dfn> and a <dfn id="def-2">second term</dfn>.</p>
1053            <p>Also a <dfn id="def-3">third term</dfn>.</p>
1054        "#;
1055
1056        let sections = collect_definitions(html).unwrap();
1057        assert_eq!(sections.len(), 3);
1058        assert_eq!(sections[0].anchor, "def-1");
1059        assert_eq!(sections[1].anchor, "def-2");
1060        assert_eq!(sections[2].anchor, "def-3");
1061    }
1062
1063    #[test]
1064    fn test_bikeshed_algorithm_parsing() {
1065        let html = include_str!("../../tests/fixtures/algorithms/bikeshed_algorithm.html");
1066        let sections = collect_algorithms(html).unwrap();
1067
1068        assert_eq!(sections.len(), 1);
1069        let section = &sections[0];
1070
1071        assert_eq!(section.anchor, "concept-ordered-set-parser");
1072        assert_eq!(section.title, Some("ordered set parser".to_string()));
1073        assert_eq!(section.section_type, SectionType::Algorithm);
1074        assert_eq!(section.depth, None);
1075    }
1076
1077    #[test]
1078    fn test_algorithm_vs_definition_distinction() {
1079        let html =
1080            include_str!("../../tests/fixtures/algorithms/mixed_definitions_algorithms.html");
1081
1082        // Collect algorithms (dfn inside div.algorithm)
1083        let algorithms = collect_algorithms(html).unwrap();
1084        assert_eq!(algorithms.len(), 1);
1085        assert_eq!(algorithms[0].anchor, "algorithm-def");
1086        assert_eq!(algorithms[0].section_type, SectionType::Algorithm);
1087
1088        // Collect definitions (dfn NOT inside div.algorithm)
1089        let definitions = collect_definitions(html).unwrap();
1090        assert_eq!(definitions.len(), 2);
1091        assert_eq!(definitions[0].anchor, "standalone-def");
1092        assert_eq!(definitions[0].section_type, SectionType::Definition);
1093        assert_eq!(definitions[1].anchor, "another-standalone");
1094        assert_eq!(definitions[1].section_type, SectionType::Definition);
1095
1096        // No overlap: the dfn inside algorithm div should not appear in definitions
1097        let def_anchors: Vec<_> = definitions.iter().map(|d| &d.anchor).collect();
1098        assert!(!def_anchors.contains(&&"algorithm-def".to_string()));
1099    }
1100
1101    #[test]
1102    fn test_algorithm_without_dfn() {
1103        // Some algorithms might not have a dfn, just the algorithm div
1104        let html = r#"
1105            <div class="algorithm" data-algorithm="no dfn">
1106                <p>This algorithm has no dfn element.</p>
1107                <ol><li>Step 1</li></ol>
1108            </div>
1109        "#;
1110
1111        let sections = collect_algorithms(html).unwrap();
1112        assert_eq!(sections.len(), 0); // No dfn[id], so nothing to index
1113    }
1114
1115    #[test]
1116    fn test_idl_interface_parsing() {
1117        let html = include_str!("../../tests/fixtures/idl/interface.html");
1118        let sections = collect_idl(html).unwrap();
1119
1120        assert_eq!(sections.len(), 1);
1121        let section = &sections[0];
1122
1123        assert_eq!(section.anchor, "event");
1124        assert_eq!(section.title, Some("Event".to_string()));
1125        assert_eq!(section.section_type, SectionType::Idl);
1126        assert_eq!(section.depth, None);
1127    }
1128
1129    #[test]
1130    fn test_idl_dictionary_parsing() {
1131        let html = include_str!("../../tests/fixtures/idl/dictionary.html");
1132        let sections = collect_idl(html).unwrap();
1133
1134        assert_eq!(sections.len(), 1);
1135        let section = &sections[0];
1136
1137        assert_eq!(section.anchor, "eventinit");
1138        assert_eq!(section.title, Some("EventInit".to_string()));
1139        assert_eq!(section.section_type, SectionType::Idl);
1140        assert_eq!(section.depth, None);
1141    }
1142
1143    #[test]
1144    fn test_idl_vs_definition_distinction() {
1145        let html = include_str!("../../tests/fixtures/idl/mixed_idl_definitions.html");
1146
1147        // Collect IDL types (dfn with data-dfn-type="interface", "dictionary", etc.)
1148        let idl = collect_idl(html).unwrap();
1149        assert_eq!(idl.len(), 2);
1150        assert_eq!(idl[0].anchor, "myinterface");
1151        assert_eq!(idl[0].section_type, SectionType::Idl);
1152        assert_eq!(idl[1].anchor, "mydict");
1153        assert_eq!(idl[1].section_type, SectionType::Idl);
1154
1155        // Collect definitions (dfn NOT IDL types and NOT in algorithm divs)
1156        let definitions = collect_definitions(html).unwrap();
1157        assert_eq!(definitions.len(), 2);
1158        assert_eq!(definitions[0].anchor, "regular-term");
1159        assert_eq!(definitions[0].section_type, SectionType::Definition);
1160        assert_eq!(definitions[1].anchor, "another-term");
1161        assert_eq!(definitions[1].section_type, SectionType::Definition);
1162
1163        // No overlap: IDL types should not appear in definitions
1164        let def_anchors: Vec<_> = definitions.iter().map(|d| &d.anchor).collect();
1165        assert!(!def_anchors.contains(&&"myinterface".to_string()));
1166        assert!(!def_anchors.contains(&&"mydict".to_string()));
1167    }
1168
1169    #[test]
1170    fn test_idl_without_data_dfn_type_ignored() {
1171        let html = r#"
1172            <pre class="idl">
1173                <dfn id="has-type" data-dfn-type="interface">WithType</dfn>
1174                <dfn id="no-type">WithoutType</dfn>
1175            </pre>
1176        "#;
1177
1178        let sections = collect_idl(html).unwrap();
1179        assert_eq!(sections.len(), 1);
1180        assert_eq!(sections[0].anchor, "has-type");
1181    }
1182
1183    #[test]
1184    fn test_wattsi_algorithm_pattern() {
1185        // Test Wattsi-style algorithm: <p>To <dfn>foo</dfn>:</p><ol>...</ol>
1186        // (as opposed to Bikeshed's <div class="algorithm"><p>To <dfn>foo</dfn>:</p><ol>...</ol></div>)
1187        let html = include_str!("../../tests/fixtures/algorithms/wattsi_navigate.html");
1188        let converter = crate::parse::markdown::build_converter("https://html.spec.whatwg.org");
1189
1190        let document = Html::parse_document(html);
1191        let selector = Selector::parse("dfn[id]").unwrap();
1192
1193        let mut algorithms = Vec::new();
1194        for element in document.select(&selector) {
1195            if let Some(section) = parse_dfn_element(&element, &converter).unwrap() {
1196                algorithms.push(section);
1197            }
1198        }
1199
1200        assert_eq!(algorithms.len(), 1, "Should detect one algorithm");
1201        let algo = &algorithms[0];
1202
1203        assert_eq!(algo.anchor, "navigate");
1204        assert_eq!(algo.title, Some("navigate".to_string()));
1205        assert_eq!(
1206            algo.section_type,
1207            SectionType::Algorithm,
1208            "Should be classified as Algorithm, not Definition"
1209        );
1210
1211        // Check that content includes both intro and steps (now markdown)
1212        let content = algo.content_text.as_ref().unwrap();
1213        assert!(content.contains("navigate"), "Should include intro text");
1214        assert!(content.contains("1. "), "Should include first step");
1215        assert!(content.contains("2. "), "Should include second step");
1216        // Check for nested step (step 4 has sub-steps in the fixture)
1217        assert!(
1218            content.contains("    1. "),
1219            "Should include nested step with indentation"
1220        );
1221    }
1222
1223    #[test]
1224    fn test_dfn_inside_algorithm_content_skipped() {
1225        // Dfns that appear inside algorithm <ol> content should NOT be collected as separate sections
1226        // They're part of the algorithm's markdown content
1227        let html = r#"
1228            <h2 id="algorithms">Algorithms</h2>
1229            <p>To <dfn id="do-something">do something</dfn> with <var>input</var>:</p>
1230            <ol>
1231                <li><p>Let <var>result</var> be the result of calling <dfn id="helper">helper</dfn>.</p></li>
1232                <li><p>Return <var>result</var>.</p></li>
1233            </ol>
1234            <p>The <dfn id="outside-def">outside definition</dfn> is separate.</p>
1235        "#;
1236
1237        let converter = crate::parse::markdown::build_converter("https://test.example.com");
1238        let document = Html::parse_document(html);
1239        let selector = Selector::parse("dfn[id]").unwrap();
1240
1241        let mut sections = Vec::new();
1242        for element in document.select(&selector) {
1243            if let Some(section) = parse_dfn_element(&element, &converter).unwrap() {
1244                sections.push(section);
1245            }
1246        }
1247
1248        // Should only collect "do-something" (the algorithm) and "outside-def"
1249        // "helper" inside the <ol> should be skipped
1250        assert_eq!(
1251            sections.len(),
1252            2,
1253            "Should collect 2 sections (algorithm + outside def), not the helper inside <ol>"
1254        );
1255
1256        let anchors: Vec<_> = sections.iter().map(|s| s.anchor.as_str()).collect();
1257        assert!(
1258            anchors.contains(&"do-something"),
1259            "Should include the algorithm-defining dfn"
1260        );
1261        assert!(
1262            anchors.contains(&"outside-def"),
1263            "Should include the outside definition"
1264        );
1265        assert!(
1266            !anchors.contains(&"helper"),
1267            "Should NOT include dfn inside algorithm <ol>"
1268        );
1269    }
1270
1271    #[test]
1272    fn test_dfn_inside_bikeshed_algorithm_content_skipped() {
1273        // Same test but for Bikeshed div.algorithm pattern
1274        let html = r#"
1275            <h2 id="algorithms">Algorithms</h2>
1276            <div class="algorithm">
1277                <p>To <dfn id="process">process</dfn> the <var>data</var>:</p>
1278                <ol>
1279                    <li><p>Let <var>x</var> be a new <dfn id="internal-thing">internal thing</dfn>.</p></li>
1280                    <li><p>Return <var>x</var>.</p></li>
1281                </ol>
1282            </div>
1283            <p>A <dfn id="external-term">external term</dfn> here.</p>
1284        "#;
1285
1286        let converter = crate::parse::markdown::build_converter("https://test.example.com");
1287        let document = Html::parse_document(html);
1288        let selector = Selector::parse("dfn[id]").unwrap();
1289
1290        let mut sections = Vec::new();
1291        for element in document.select(&selector) {
1292            if let Some(section) = parse_dfn_element(&element, &converter).unwrap() {
1293                sections.push(section);
1294            }
1295        }
1296
1297        // Should only collect "process" (the algorithm) and "external-term"
1298        assert_eq!(
1299            sections.len(),
1300            2,
1301            "Should collect 2 sections, not the internal-thing inside <ol>"
1302        );
1303
1304        let anchors: Vec<_> = sections.iter().map(|s| s.anchor.as_str()).collect();
1305        assert!(anchors.contains(&"process"));
1306        assert!(anchors.contains(&"external-term"));
1307        assert!(
1308            !anchors.contains(&"internal-thing"),
1309            "Should NOT include dfn inside algorithm <ol>"
1310        );
1311    }
1312
1313    #[test]
1314    fn test_parameter_dfns_skipped() {
1315        // Parameter dfns (with data-dfn-for or containing <var>) should NOT be collected as sections
1316        // They're part of the parent definition/algorithm signature, not standalone sections
1317        let html = r#"
1318            <h2 id="algorithms">Algorithms</h2>
1319            <p>To <dfn id="navigate">navigate</dfn> with <dfn data-dfn-for="navigate" id="param1"><var>url</var></dfn>
1320            and <dfn id="param2"><var>options</var></dfn>:</p>
1321            <ol>
1322                <li><p>Do something.</p></li>
1323            </ol>
1324            <p>A standalone <dfn id="regular-def">definition</dfn>.</p>
1325        "#;
1326
1327        let converter = crate::parse::markdown::build_converter("https://test.example.com");
1328        let document = Html::parse_document(html);
1329        let selector = Selector::parse("dfn[id]").unwrap();
1330
1331        let mut sections = Vec::new();
1332        for element in document.select(&selector) {
1333            if let Some(section) = parse_dfn_element(&element, &converter).unwrap() {
1334                sections.push(section);
1335            }
1336        }
1337
1338        // Should only collect "navigate" (algorithm) and "regular-def" (standalone definition)
1339        // Parameter dfns "param1" (has data-dfn-for) and "param2" (contains <var>) should be skipped
1340        assert_eq!(
1341            sections.len(),
1342            2,
1343            "Should collect 2 sections (algorithm + regular def)"
1344        );
1345
1346        let anchors: Vec<_> = sections.iter().map(|s| s.anchor.as_str()).collect();
1347        assert!(
1348            anchors.contains(&"navigate"),
1349            "Should include the algorithm"
1350        );
1351        assert!(
1352            anchors.contains(&"regular-def"),
1353            "Should include standalone definition"
1354        );
1355        assert!(
1356            !anchors.contains(&"param1"),
1357            "Should NOT include parameter dfn with data-dfn-for"
1358        );
1359        assert!(
1360            !anchors.contains(&"param2"),
1361            "Should NOT include parameter dfn containing <var>"
1362        );
1363    }
1364
1365    #[test]
1366    fn test_property_dfns_with_dfn_for_and_dfn_type_kept() {
1367        // dfns with data-dfn-for AND data-dfn-type="dfn" are property definitions,
1368        // not parameters. They should be indexed.
1369        // Real example from DOM spec: <dfn data-dfn-for="tree" data-dfn-type="dfn" id="concept-tree-parent">parent</dfn>
1370        let html = r#"
1371            <h2 id="trees">Trees</h2>
1372            <p>An object that <dfn class="dfn-paneled" data-dfn-type="dfn" data-export id="concept-tree">participates</dfn>
1373            in a tree has a <dfn class="dfn-paneled" data-dfn-for="tree" data-dfn-type="dfn" data-export id="concept-tree-parent">parent</dfn>,
1374            which is either null or an object, and has
1375            <dfn class="dfn-paneled" data-dfn-for="tree" data-dfn-type="dfn" data-export id="concept-tree-child">children</dfn>,
1376            which is an ordered set of objects.</p>
1377        "#;
1378
1379        let converter = crate::parse::markdown::build_converter("https://test.example.com");
1380        let document = Html::parse_document(html);
1381        let selector = Selector::parse("dfn[id]").unwrap();
1382
1383        let mut sections = Vec::new();
1384        for element in document.select(&selector) {
1385            if let Some(section) = parse_dfn_element(&element, &converter).unwrap() {
1386                sections.push(section);
1387            }
1388        }
1389
1390        let anchors: Vec<_> = sections.iter().map(|s| s.anchor.as_str()).collect();
1391        assert!(
1392            anchors.contains(&"concept-tree"),
1393            "Should include dfn without data-dfn-for"
1394        );
1395        assert!(
1396            anchors.contains(&"concept-tree-parent"),
1397            "Should include property dfn with data-dfn-for + data-dfn-type"
1398        );
1399        assert!(
1400            anchors.contains(&"concept-tree-child"),
1401            "Should include property dfn with data-dfn-for + data-dfn-type"
1402        );
1403    }
1404
1405    #[test]
1406    fn test_argument_dfns_skipped() {
1407        // Bikeshed-generated W3C specs use data-dfn-type="argument" for function parameters.
1408        // These should be skipped, while method/attribute/interface/constructor dfns are kept.
1409        let html = r#"
1410            <h2 id="api">API</h2>
1411            <pre class="idl">
1412                <dfn data-dfn-type="interface" id="audiodecoder"><code>AudioDecoder</code></dfn>
1413                <dfn data-dfn-for="AudioDecoder" data-dfn-type="constructor" id="dom-audiodecoder-ctor"><code>AudioDecoder(init)</code></dfn>
1414                <dfn data-dfn-for="AudioDecoder/AudioDecoder(init)" data-dfn-type="argument" id="dom-audiodecoder-ctor-init"><code>init</code></dfn>
1415                <dfn data-dfn-for="AudioDecoder" data-dfn-type="method" id="dom-audiodecoder-configure"><code>configure(config)</code></dfn>
1416                <dfn data-dfn-for="AudioDecoder/configure(config)" data-dfn-type="argument" id="dom-audiodecoder-configure-config"><code>config</code></dfn>
1417                <dfn data-dfn-for="AudioDecoder" data-dfn-type="attribute" id="dom-audiodecoder-state"><code>state</code></dfn>
1418            </pre>
1419        "#;
1420
1421        let converter = crate::parse::markdown::build_converter("https://test.example.com");
1422        let document = Html::parse_document(html);
1423        let selector = Selector::parse("dfn[id]").unwrap();
1424
1425        let mut sections = Vec::new();
1426        for element in document.select(&selector) {
1427            if let Some(section) = parse_dfn_element(&element, &converter).unwrap() {
1428                sections.push(section);
1429            }
1430        }
1431
1432        let anchors: Vec<_> = sections.iter().map(|s| s.anchor.as_str()).collect();
1433
1434        // Interface, constructor, method, attribute should be kept
1435        assert!(
1436            anchors.contains(&"audiodecoder"),
1437            "Interface should be kept"
1438        );
1439        assert!(
1440            anchors.contains(&"dom-audiodecoder-ctor"),
1441            "Constructor should be kept"
1442        );
1443        assert!(
1444            anchors.contains(&"dom-audiodecoder-configure"),
1445            "Method should be kept"
1446        );
1447        assert!(
1448            anchors.contains(&"dom-audiodecoder-state"),
1449            "Attribute should be kept"
1450        );
1451
1452        // Arguments should be skipped
1453        assert!(
1454            !anchors.contains(&"dom-audiodecoder-ctor-init"),
1455            "Argument should be skipped"
1456        );
1457        assert!(
1458            !anchors.contains(&"dom-audiodecoder-configure-config"),
1459            "Argument should be skipped"
1460        );
1461    }
1462
1463    // -- TC39/ecmarkup emu-clause tests --
1464
1465    #[test]
1466    fn test_emu_clause_prose_section() {
1467        let html = r#"
1468            <emu-clause id="sec-overview">
1469                <h1><span class="secnum">4</span> Overview</h1>
1470                <p>This section contains a non-normative overview of the ECMAScript language.</p>
1471            </emu-clause>
1472        "#;
1473
1474        let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
1475        let document = Html::parse_document(html);
1476        let selector = Selector::parse("emu-clause[id]").unwrap();
1477        let element = document.select(&selector).next().unwrap();
1478
1479        let section = parse_emu_clause_element(&element, &converter)
1480            .unwrap()
1481            .unwrap();
1482
1483        assert_eq!(section.anchor, "sec-overview");
1484        assert_eq!(section.title, Some("Overview".to_string()));
1485        assert_eq!(section.depth, Some(2)); // "4" = 1 part → depth 2
1486        assert_eq!(section.section_type, SectionType::Heading);
1487        assert!(section.content_text.is_some());
1488        assert!(section
1489            .content_text
1490            .as_ref()
1491            .unwrap()
1492            .contains("non-normative overview"));
1493    }
1494
1495    #[test]
1496    fn test_emu_clause_algorithm_section() {
1497        let html = r#"
1498            <emu-clause id="sec-tostring" type="abstract operation" aoid="ToString">
1499                <h1><span class="secnum">7.1.17</span> ToString ( <var>argument</var> )</h1>
1500                <p>The abstract operation ToString converts argument to a String.</p>
1501                <emu-alg>
1502                    <ol>
1503                        <li>If <var>argument</var> is a String, return <var>argument</var>.</li>
1504                        <li>If <var>argument</var> is <emu-val>undefined</emu-val>, return "undefined".</li>
1505                        <li>If <var>argument</var> is <emu-val>null</emu-val>, return "null".</li>
1506                    </ol>
1507                </emu-alg>
1508            </emu-clause>
1509        "#;
1510
1511        let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
1512        let document = Html::parse_document(html);
1513        let selector = Selector::parse("emu-clause[id]").unwrap();
1514        let element = document.select(&selector).next().unwrap();
1515
1516        let section = parse_emu_clause_element(&element, &converter)
1517            .unwrap()
1518            .unwrap();
1519
1520        assert_eq!(section.anchor, "sec-tostring");
1521        assert_eq!(section.title, Some("ToString ( argument )".to_string()));
1522        assert_eq!(section.depth, Some(4)); // "7.1.17" = 3 parts → depth 4
1523        assert_eq!(section.section_type, SectionType::Algorithm);
1524
1525        let content = section.content_text.unwrap();
1526        assert!(
1527            content.contains("converts argument"),
1528            "Should have intro prose"
1529        );
1530        assert!(content.contains("1."), "Should have algorithm steps");
1531    }
1532
1533    #[test]
1534    fn test_emu_clause_nested_sections_excluded_from_content() {
1535        let html = r#"
1536            <emu-clause id="sec-parent">
1537                <h1><span class="secnum">23</span> Parent Section</h1>
1538                <p>Intro text for the parent.</p>
1539                <emu-clause id="sec-child">
1540                    <h1><span class="secnum">23.1</span> Child Section</h1>
1541                    <p>This should NOT appear in parent content.</p>
1542                </emu-clause>
1543            </emu-clause>
1544        "#;
1545
1546        let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
1547        let document = Html::parse_document(html);
1548        let selector = Selector::parse("emu-clause[id]").unwrap();
1549
1550        let mut sections = Vec::new();
1551        for element in document.select(&selector) {
1552            if let Some(section) = parse_emu_clause_element(&element, &converter).unwrap() {
1553                sections.push(section);
1554            }
1555        }
1556
1557        assert_eq!(sections.len(), 2);
1558        assert_eq!(sections[0].anchor, "sec-parent");
1559        assert_eq!(sections[1].anchor, "sec-child");
1560
1561        // Parent content should NOT include child section content
1562        let parent_content = sections[0].content_text.as_ref().unwrap();
1563        assert!(parent_content.contains("Intro text"));
1564        assert!(!parent_content.contains("should NOT appear"));
1565    }
1566
1567    #[test]
1568    fn test_secnum_depth_derivation() {
1569        // Helper to quickly test depth extraction
1570        fn depth_from_html(secnum: &str) -> Option<u8> {
1571            let html = format!(r#"<h1><span class="secnum">{}</span> Title</h1>"#, secnum);
1572            let document = Html::parse_document(&html);
1573            let selector = Selector::parse("h1").unwrap();
1574            let h1 = document.select(&selector).next().unwrap();
1575            extract_secnum_depth(&h1)
1576        }
1577
1578        assert_eq!(depth_from_html("4"), Some(2)); // 1 part → depth 2
1579        assert_eq!(depth_from_html("4.3"), Some(3)); // 2 parts → depth 3
1580        assert_eq!(depth_from_html("7.1.17"), Some(4)); // 3 parts → depth 4
1581        assert_eq!(depth_from_html("23.1.3.30"), Some(5)); // 4 parts → depth 5
1582        assert_eq!(depth_from_html("A"), Some(2)); // annex, 1 part
1583        assert_eq!(depth_from_html("A.1"), Some(3)); // annex sub
1584        assert_eq!(depth_from_html("A.1.2"), Some(4)); // annex deep
1585    }
1586
1587    #[test]
1588    fn test_emu_clause_secnum_stripped_from_title() {
1589        let html = r#"
1590            <emu-clause id="sec-test">
1591                <h1><span class="secnum">7.1.17</span> ToString ( <var>argument</var> )</h1>
1592            </emu-clause>
1593        "#;
1594
1595        let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
1596        let document = Html::parse_document(html);
1597        let selector = Selector::parse("emu-clause[id]").unwrap();
1598        let element = document.select(&selector).next().unwrap();
1599
1600        let section = parse_emu_clause_element(&element, &converter)
1601            .unwrap()
1602            .unwrap();
1603
1604        // Title should not contain "7.1.17"
1605        let title = section.title.unwrap();
1606        assert!(
1607            !title.contains("7.1.17"),
1608            "secnum should be stripped: {}",
1609            title
1610        );
1611        assert!(
1612            title.contains("ToString"),
1613            "Title should have function name: {}",
1614            title
1615        );
1616    }
1617
1618    #[test]
1619    fn test_emu_annex_parsed() {
1620        let html = r#"
1621            <emu-annex id="sec-additional-built-in-properties">
1622                <h1><span class="secnum">B</span> Additional Built-in Properties</h1>
1623                <p>Annex content here.</p>
1624            </emu-annex>
1625        "#;
1626
1627        let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
1628        let document = Html::parse_document(html);
1629        let selector = Selector::parse("emu-annex[id]").unwrap();
1630        let element = document.select(&selector).next().unwrap();
1631
1632        let section = parse_emu_clause_element(&element, &converter)
1633            .unwrap()
1634            .unwrap();
1635
1636        assert_eq!(section.anchor, "sec-additional-built-in-properties");
1637        assert_eq!(
1638            section.title,
1639            Some("Additional Built-in Properties".to_string())
1640        );
1641        assert_eq!(section.depth, Some(2)); // "B" = 1 part → depth 2
1642    }
1643
1644    // -- Integration tests using real TC39 HTML fixtures --
1645
1646    #[test]
1647    fn test_ecmarkup_fixture_tostring_algorithm() {
1648        let html = include_str!("../../tests/fixtures/ecmarkup/tostring.html");
1649        let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
1650        let document = Html::parse_document(html);
1651        let selector = Selector::parse("emu-clause[id]").unwrap();
1652        let element = document.select(&selector).next().unwrap();
1653
1654        let section = parse_emu_clause_element(&element, &converter)
1655            .unwrap()
1656            .unwrap();
1657
1658        assert_eq!(section.anchor, "sec-tostring");
1659        assert_eq!(section.title, Some("ToString ( argument )".to_string()));
1660        assert_eq!(section.depth, Some(4)); // "7.1.17" = 3 parts → depth 4
1661        assert_eq!(section.section_type, SectionType::Algorithm);
1662
1663        let content = section.content_text.as_ref().unwrap();
1664
1665        // Intro prose should be a single flowing paragraph
1666        assert!(
1667            content.contains("The abstract operation ToString takes argument *argument*"),
1668            "Intro should have italic var: {}",
1669            &content[..200]
1670        );
1671        assert!(
1672            content.contains("[ECMAScript language value](https://tc39.es/ecma262#sec-ecmascript-language-types)"),
1673            "emu-xref links should be inline markdown links"
1674        );
1675
1676        // Algorithm steps should be numbered, one per line, no broken lines
1677        assert!(
1678            content.contains("1. If *argument* [is a String]("),
1679            "Step 1 should be on a single line with inline link"
1680        );
1681        assert!(
1682            content.contains("2. If *argument* [is a Symbol]("),
1683            "Step 2 should follow immediately"
1684        );
1685        assert!(
1686            content.contains("3. If *argument* is undefined, return \"undefined\"."),
1687            "Step 3: emu-val should render inline"
1688        );
1689        assert!(
1690            content.contains("10. Let *primValue* be ?"),
1691            "Step 10 should have var and link inline"
1692        );
1693        assert!(
1694            content.contains("10. Let *primValue*") && content.contains("[ToPrimitive]("),
1695            "Step 10 should have ToPrimitive link"
1696        );
1697        assert!(
1698            content.contains("12. Return ?") && content.contains("[ToString]("),
1699            "Step 12 should have recursive call"
1700        );
1701
1702        // Steps should be on individual lines, not broken across multiple lines
1703        for i in 1..=12 {
1704            let prefix = format!("{}. ", i);
1705            let matches: Vec<_> = content
1706                .lines()
1707                .filter(|l| {
1708                    let trimmed = l.trim_start();
1709                    trimmed.starts_with(&prefix)
1710                        || (i >= 10 && trimmed.starts_with(&format!("{}.", i)))
1711                })
1712                .collect();
1713            assert!(
1714                !matches.is_empty(),
1715                "Step {} should appear on its own line",
1716                i
1717            );
1718        }
1719    }
1720
1721    #[test]
1722    fn test_ecmarkup_fixture_undefined_type_prose() {
1723        let html = include_str!("../../tests/fixtures/ecmarkup/undefined_type.html");
1724        let converter = crate::parse::markdown::build_converter("https://tc39.es/ecma262");
1725        let document = Html::parse_document(html);
1726        let selector = Selector::parse("emu-clause[id]").unwrap();
1727        let element = document.select(&selector).next().unwrap();
1728
1729        let section = parse_emu_clause_element(&element, &converter)
1730            .unwrap()
1731            .unwrap();
1732
1733        assert_eq!(
1734            section.anchor,
1735            "sec-ecmascript-language-types-undefined-type"
1736        );
1737        assert_eq!(section.title, Some("The Undefined Type".to_string()));
1738        assert_eq!(section.depth, Some(4)); // "6.1.1" = 3 parts → depth 4
1739        assert_eq!(section.section_type, SectionType::Heading);
1740
1741        let content = section.content_text.as_ref().unwrap();
1742
1743        // Should be a single paragraph, no spurious newlines from emu-val
1744        assert!(
1745            content.contains("The Undefined type has exactly one value, called undefined."),
1746            "emu-val should render inline as plain text: {}",
1747            content
1748        );
1749        assert!(
1750            content.contains("the value undefined."),
1751            "Second emu-val should also be inline"
1752        );
1753        // Should be a single line (one paragraph)
1754        let line_count = content.lines().count();
1755        assert!(
1756            line_count <= 2,
1757            "Simple prose should be 1-2 lines, got {}: {}",
1758            line_count,
1759            content
1760        );
1761    }
1762}