html_bindgen/scrape/
elements.rs

1use crate::Result;
2use scraper::ElementRef;
3use std::collections::HashMap;
4
5/// The raw values extracted from the HTML spec
6#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
7pub struct ScrapedElement {
8    pub tag_name: String,
9    pub categories: Vec<String>,
10    pub submodule_name: String,
11    pub contexts: Vec<String>,
12    pub content_model: Vec<String>,
13    pub tag_omission: Vec<String>,
14    pub content_attributes: Vec<String>,
15    pub dom_interface: Vec<String>,
16}
17
18/// Parse the WhatWG HTML standards document.
19///
20/// # Design
21///
22/// The entire HTML spec is a flat document with little hierarchy. we first need to find
23/// the metadata section labeled by `.element`. Then we need to track back through the
24/// siblings to find the first `h4` node. That will contain the title of the elements.
25///
26/// Once we have the title, we can inspect the `.element` node properly. This is a nested
27/// table containing strings. We then parse these strings into a structured representation.
28pub fn scrape_elements(spec: String) -> Result<Vec<ScrapedElement>> {
29    let document = scraper::Html::parse_document(&spec);
30    let selector = scraper::Selector::parse(".element").unwrap();
31
32    let extract_text = |child| {
33        let categories = ElementRef::wrap(child).unwrap();
34        categories.text().collect::<String>()
35    };
36
37    let mut specs = vec![];
38
39    for element in document.select(&selector).into_iter() {
40        let tag_names = match extract_tag_names(element) {
41            Some(tag_names) => tag_names,
42            None => continue,
43        };
44
45        let element_kind = extract_element_kind(element);
46
47        // Iterate over the table and extract the raw values
48        let mut current: Option<(String, Vec<String>)> = None;
49        let mut outputs: HashMap<String, Vec<String>> = HashMap::new();
50        for child in element.children() {
51            let el = child.value().as_element();
52            let tag_name = el.as_deref().unwrap().name();
53            match tag_name {
54                "dt" => {
55                    if current.is_some() {
56                        let current = current.take().unwrap();
57                        outputs.insert(current.0, current.1);
58                    }
59                    current = Some((extract_text(child), vec![]));
60                }
61                "dd" => {
62                    let current = current.as_mut().unwrap();
63                    current.1.push(extract_text(child));
64                }
65                other => panic!("unexpected tag name {other}"),
66            }
67        }
68        if current.is_some() {
69            let current = current.take().unwrap();
70            outputs.insert(current.0, current.1);
71        }
72
73        // Construct a raw spec item from the parsed data.
74        for tag_name in tag_names {
75            let tag_omission = match outputs.get("Tag omission in text/html:").as_deref() {
76                Some(vec) => vec.clone(),
77                None => vec![],
78            };
79
80            specs.push(ScrapedElement {
81                tag_name,
82                submodule_name: element_kind.clone(),
83                categories: outputs.get("Categories:").as_deref().unwrap().clone(),
84                contexts: outputs
85                    .get("Contexts in which this element can be used:")
86                    .as_deref()
87                    .unwrap()
88                    .clone(),
89                content_model: outputs.get("Content model:").as_deref().unwrap().clone(),
90                content_attributes: outputs
91                    .get("Content attributes:")
92                    .as_deref()
93                    .unwrap()
94                    .clone(),
95                tag_omission,
96                dom_interface: outputs.get("DOM interface:").as_deref().unwrap().clone(),
97            });
98        }
99    }
100    Ok(specs)
101}
102
103/// Extract the tag names from the document.
104fn extract_tag_names(element: scraper::ElementRef) -> Option<Vec<String>> {
105    // Find the name of the element we're inspecting.
106    let mut sibling = element.prev_sibling().unwrap();
107    loop {
108        if let scraper::node::Node::Element(element) = sibling.value() {
109            if element.name() == "h4" {
110                let s = element.id.as_ref().expect("could not parse h4 element id");
111
112                // Skip over `h4` elements which aren't nodes.
113                if s.contains("the") && s.contains("element") {
114                    return Some(parse_tag_names(s));
115                } else {
116                    return None;
117                }
118            }
119        }
120
121        sibling = sibling.prev_sibling().unwrap();
122    }
123}
124
125/// Extract the tag names from the document.
126fn extract_element_kind(element: scraper::ElementRef) -> String {
127    // Find the name of the element we're inspecting.
128    let mut sibling = element.prev_sibling().unwrap();
129    loop {
130        if let scraper::node::Node::Element(element) = sibling.value() {
131            if element.name() == "h3" {
132                let s = element.id.as_ref().expect("could not parse h3 element id");
133                return s.to_string();
134            }
135        }
136
137        sibling = sibling.prev_sibling().unwrap();
138    }
139}
140
/// Parse the HTML tag names out of a heading id.
///
/// A single HTML heading can correspond to several HTML nodes:
/// - `the-html-element` yields `["html"]`
/// - `the-sub-and-sup-elements` yields `["sub", "sup"]`
/// - `the-h1,-h2,-h3,-h4,-h5,-and-h6-elements` yields `["h1", …, "h6"]`
///
/// In the plural form the id is split on `-`, commas are removed, and the joining
/// word `and` is dropped. Only a token that is exactly `and` is dropped, so a tag
/// name that merely begins with those letters is left intact.
///
/// # Panics
///
/// Panics if `s` does not start with `the-`, or does not end with `-element` or
/// `-elements`.
fn parse_tag_names(s: &str) -> Vec<String> {
    let inner = match s.strip_prefix("the-") {
        Some(inner) => inner,
        None => panic!("heading id `{s}` does not start with `the-`"),
    };

    if let Some(list) = inner.strip_suffix("-elements") {
        list.split('-')
            .map(|token| token.replace(',', ""))
            .filter(|token| !token.is_empty() && token != "and")
            .collect()
    } else {
        match inner.strip_suffix("-element") {
            Some(name) => vec![name.to_owned()],
            None => panic!("heading id `{s}` does not end with `-element` or `-elements`"),
        }
    }
}
html_bindgen/scrape/elements.rs

html_bindgen/scrape/
elements.rs