// html_bindgen/scrape/webidls.rs

use crate::{utils::extract_webidl_name, Result};
use serde::{Deserialize, Serialize};

4/// The raw values extracted from the HTML spec
5#[derive(Debug, Clone, Serialize, Deserialize)]
6pub struct ScrapedInterface {
7    pub name: String,
8    pub idl: String,
9}
10
11/// Parse the WhatWG HTML standards document.
12///
13/// # Design
14///
15/// The entire HTML spec is a flat document with little hierarchy. we first need to find
16/// the metadata section labeled by `.element`. Then we need to track back through the
17/// siblings to find the first `h4` node. That will contain the title of the elements.
18///
19/// Once we have the title, we can inspect the `.element` node properly. This is a nested
20/// table containing strings. We then parse these strings into a structured representation.
21pub fn scrape_webidls(spec: String) -> Result<Vec<ScrapedInterface>> {
22    let document = scraper::Html::parse_document(&spec);
23    let selector = scraper::Selector::parse(".idl").unwrap();
24
25    let mut specs = vec![];
26    for element in document.select(&selector).into_iter() {
27        let idl = element.text().map(|t| t.to_owned()).collect::<String>();
28        let name = match extract_webidl_name(&idl) {
29            Some(name) => name,
30            None => continue,
31        };
32        specs.push(ScrapedInterface { name, idl });
33    }
34    Ok(specs)
35}