html_bindgen/scrape/
elements.rs1use crate::Result;
2use scraper::ElementRef;
3use std::collections::HashMap;
4
5#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
7pub struct ScrapedElement {
8 pub tag_name: String,
9 pub categories: Vec<String>,
10 pub submodule_name: String,
11 pub contexts: Vec<String>,
12 pub content_model: Vec<String>,
13 pub tag_omission: Vec<String>,
14 pub content_attributes: Vec<String>,
15 pub dom_interface: Vec<String>,
16}
17
18pub fn scrape_elements(spec: String) -> Result<Vec<ScrapedElement>> {
29 let document = scraper::Html::parse_document(&spec);
30 let selector = scraper::Selector::parse(".element").unwrap();
31
32 let extract_text = |child| {
33 let categories = ElementRef::wrap(child).unwrap();
34 categories.text().collect::<String>()
35 };
36
37 let mut specs = vec![];
38
39 for element in document.select(&selector).into_iter() {
40 let tag_names = match extract_tag_names(element) {
41 Some(tag_names) => tag_names,
42 None => continue,
43 };
44
45 let element_kind = extract_element_kind(element);
46
47 let mut current: Option<(String, Vec<String>)> = None;
49 let mut outputs: HashMap<String, Vec<String>> = HashMap::new();
50 for child in element.children() {
51 let el = child.value().as_element();
52 let tag_name = el.as_deref().unwrap().name();
53 match tag_name {
54 "dt" => {
55 if current.is_some() {
56 let current = current.take().unwrap();
57 outputs.insert(current.0, current.1);
58 }
59 current = Some((extract_text(child), vec![]));
60 }
61 "dd" => {
62 let current = current.as_mut().unwrap();
63 current.1.push(extract_text(child));
64 }
65 other => panic!("unexpected tag name {other}"),
66 }
67 }
68 if current.is_some() {
69 let current = current.take().unwrap();
70 outputs.insert(current.0, current.1);
71 }
72
73 for tag_name in tag_names {
75 let tag_omission = match outputs.get("Tag omission in text/html:").as_deref() {
76 Some(vec) => vec.clone(),
77 None => vec![],
78 };
79
80 specs.push(ScrapedElement {
81 tag_name,
82 submodule_name: element_kind.clone(),
83 categories: outputs.get("Categories:").as_deref().unwrap().clone(),
84 contexts: outputs
85 .get("Contexts in which this element can be used:")
86 .as_deref()
87 .unwrap()
88 .clone(),
89 content_model: outputs.get("Content model:").as_deref().unwrap().clone(),
90 content_attributes: outputs
91 .get("Content attributes:")
92 .as_deref()
93 .unwrap()
94 .clone(),
95 tag_omission,
96 dom_interface: outputs.get("DOM interface:").as_deref().unwrap().clone(),
97 });
98 }
99 }
100 Ok(specs)
101}
102
103fn extract_tag_names(element: scraper::ElementRef) -> Option<Vec<String>> {
105 let mut sibling = element.prev_sibling().unwrap();
107 loop {
108 if let scraper::node::Node::Element(element) = sibling.value() {
109 if element.name() == "h4" {
110 let s = element.id.as_ref().expect("could not parse h4 element id");
111
112 if s.contains("the") && s.contains("element") {
114 return Some(parse_tag_names(s));
115 } else {
116 return None;
117 }
118 }
119 }
120
121 sibling = sibling.prev_sibling().unwrap();
122 }
123}
124
125fn extract_element_kind(element: scraper::ElementRef) -> String {
127 let mut sibling = element.prev_sibling().unwrap();
129 loop {
130 if let scraper::node::Node::Element(element) = sibling.value() {
131 if element.name() == "h3" {
132 let s = element.id.as_ref().expect("could not parse h3 element id");
133 return s.to_string();
134 }
135 }
136
137 sibling = sibling.prev_sibling().unwrap();
138 }
139}
140
/// Extracts the tag names encoded in a heading id.
///
/// Two shapes are understood:
///
/// * `the-NAME-element` yields the single name `NAME` (hyphens inside `NAME`
///   are kept).
/// * `the-A,-B,-and-C-elements` yields one name per hyphen-separated token,
///   with commas removed and the connective `and` dropped.
///
/// Only whole `and` tokens are dropped, so a name that merely starts with
/// `and` is left intact.
///
/// # Panics
///
/// Panics if `s` does not start with `the-` or does not end with `-element`
/// or `-elements`.
fn parse_tag_names(s: &str) -> Vec<String> {
    let rest = s
        .strip_prefix("the-")
        .expect("heading id must start with `the-`");

    if let Some(list) = rest.strip_suffix("-elements") {
        list.split('-')
            .map(|token| token.replace(',', ""))
            .filter(|token| token.as_str() != "and")
            .collect()
    } else {
        let name = rest
            .strip_suffix("-element")
            .expect("heading id must end with `-element` or `-elements`");
        vec![name.to_owned()]
    }
}