html-tags 0.0.4

An automated list of HTML tags with their attributes and values
Documentation
#!/usr/bin/env rust-script
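//! Generator script for the `html-tags` crate: it scrapes MDN's HTML element
//! and global-attribute reference pages and writes the generated types to
//! `src/lib.rs`, then formats that file with `rustfmt`.
//!
//! This is a regular crate doc comment, but it also contains a partial
//! Cargo manifest.  Note the use of a *fenced* code block, and the
//! `cargo` "language".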
//!
//! ```cargo
//! [dependencies]
//! heck = "0.4.1"
//! scraper = "0.16.0"
//! ureq = "2.6.2"
//! itertools = "0.10.5"
//! ```

use heck::{AsKebabCase, ToSnakeCase, ToUpperCamelCase};
use itertools::Itertools;
use scraper::{Element, ElementRef, Html, Selector};
use std::{collections::BTreeMap, io::Write};

/// Scrapes MDN's HTML element index and regenerates `src/lib.rs`.
///
/// For every element linked from the index page this fetches the element's
/// own page and emits one struct per element (global attributes plus the
/// attributes listed on that page). It then emits an `Element` enum wrapping
/// all structs, a `from_tag` lookup, and one accessor per global attribute.
/// The result is written to `src/lib.rs` and formatted with `rustfmt`.
///
/// # Panics
///
/// Panics if a request fails, a response body cannot be read, an expected
/// node is missing from a page, `src/lib.rs` cannot be written, or `rustfmt`
/// cannot be spawned.
fn main() {
    let agent = ureq::agent();

    let resp = agent
        .get("https://developer.mozilla.org/en-US/docs/Web/HTML/Element")
        .call()
        .unwrap();
    let html = resp.into_string().unwrap();
    let document = Html::parse_document(&html);
    let selector =
        Selector::parse("td:first-child > a[href^='/en-US/docs/Web/HTML/Element/']:only-child")
            .unwrap();
    // Parsed once up front: the selector is the same for every element page.
    let deprecated_selector =
        Selector::parse(".main-page-content > .section-content > .notecard.deprecated").unwrap();

    // (struct name, deprecated?) for every element, in page order.
    let mut elems = Vec::new();

    let global_attrs = BTreeMap::from_iter(get_global_attrs());

    let mut buf = String::from(
        "// generated by gen.rs + rustfmt - not in a build.rs because HTML tags don't change too often
        #![no_std]
        #[cfg(feature = \"alloc\")]
        extern crate alloc;",
    )
    .into_bytes();
    for e in document.select(&selector) {
        let url = format!(
            "https://developer.mozilla.org{}",
            e.value().attr("href").unwrap()
        );
        // The link text is the tag wrapped in angle brackets; strip them when
        // both are present and otherwise keep the text as-is instead of
        // blindly slicing off the first and last byte.
        let name = e.text().next().unwrap();
        let name = name
            .strip_prefix('<')
            .and_then(|inner| inner.strip_suffix('>'))
            .unwrap_or(name);
        let name = name.to_upper_camel_case();

        let resp = agent.get(&url).call().unwrap();
        let html = resp.into_string().unwrap();
        let document = Html::parse_document(&html);

        // An element counts as deprecated when its page carries a
        // "deprecated" notecard directly in the main section content.
        let deprecated = document.select(&deprecated_selector).next().is_some();

        // Element-specific attributes override global ones of the same name.
        let mut attrs = global_attrs.clone();
        attrs.extend(get_attrs(&document));

        writeln!(
            buf,
            "{}
            {}
            #[derive(Debug, Clone, Default, PartialEq, Eq, PartialOrd, Ord)]
            pub struct {name}<'life> {{
                {}
            }}",
            get_mdn_doc(&document, &url),
            if deprecated { "#[deprecated]" } else { "" },
            attrs
                .iter()
                .format_with(",\n/// ", |(name, (desc, ty, alloc)), f| f(&format_args!(
                    "{desc}
                    {}
                    pub {name}: core::option::Option<{ty}>",
                    if *alloc {
                        "#[cfg(feature = \"alloc\")]"
                    } else {
                        ""
                    },
                ))),
        )
        .unwrap();

        // Recorded after the struct is written so `name` can be moved rather
        // than cloned.
        elems.push((name, deprecated));
    }
    // Enum with one variant per element struct.
    writeln!(
        buf,
        "#[allow(deprecated)]
        pub enum Element<'life> {{
            {}
        }}",
        elems
            .iter()
            .format_with(",\n", |(e, dep), f| f(&format_args!(
                "{} {e}({e}<'life>)",
                if *dep { "#[deprecated]" } else { "" },
            )))
    )
    .unwrap();
    // Lookup from a kebab-cased tag name to a default-initialised variant.
    writeln!(
        buf,
        "#[allow(deprecated)]
impl<'life> Element<'life> {{
    /// Gets an element from a lowercase tag name.
    pub fn from_tag(tag: &str) -> core::option::Option<Self> {{
        match tag {{
            {},
            _ => None,
        }}
    }}
}}",
        elems.iter().format_with(",", |(e, _), f| f(&format_args!(
            "\"{}\" => Some(Self::{e}({e}::default()))",
            AsKebabCase(e)
        )))
    )
    .unwrap();
    // One accessor per global attribute, dispatching over every variant.
    // Attributes flagged `alloc` are returned by reference via `.as_ref()`.
    writeln!(
        buf,
        "#[allow(deprecated)]
impl<'life> Element<'life> {{
    {}
}}",
        global_attrs
            .iter()
            .format_with("\n", |(name, (desc, ty, alloc)), f| f(&format_args!(
                "{desc}
                {}
                pub fn {name}(&self) -> core::option::Option<{}{ty}> {{
                    match self {{
                        {}
                    }}
                }}",
                if *alloc {
                    "#[cfg(feature = \"alloc\")]"
                } else {
                    ""
                },
                if *alloc { "&" } else { "" },
                elems.iter().format_with(",", |(e, _), f| f(&format_args!(
                    "Self::{e}(e) => e.{name}{}",
                    if *alloc { ".as_ref()" } else { "" }
                )))
            )))
    )
    .unwrap();
    std::fs::write("src/lib.rs", buf).unwrap();

    let status = std::process::Command::new("rustfmt")
        .arg("src/lib.rs")
        .status()
        .unwrap();
    // Surface a formatting failure instead of ignoring it; the generated file
    // has already been written, so this is reported rather than made fatal.
    if !status.success() {
        eprintln!("warning: rustfmt exited with {status}; src/lib.rs may be unformatted");
    }
}

/// Downloads MDN's "Global attributes" page and converts the first `<dl>`
/// found in it into attribute entries via `dl_to_attrs`.
///
/// # Panics
///
/// Panics if the request fails, the body cannot be read as a string, or the
/// page contains no `<dl>` element.
fn get_global_attrs() -> Vec<(String, (String, String, bool))> {
    let page = ureq::agent()
        .get("https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes")
        .call()
        .unwrap()
        .into_string()
        .unwrap();
    let parsed = Html::parse_document(&page);

    let dl_selector = Selector::parse("dl").unwrap();
    let first_dl = parsed.select(&dl_selector).next().unwrap();
    dl_to_attrs(first_dl)
}

fn get_attrs(document: &Html) -> Vec<(String, (String, String, bool))> {
    let selector = Selector::parse(".section-content > dl").unwrap();

    if let Some(dl) = document.select(&selector).next() {
        dl_to_attrs(dl)
    } else {
        Vec::new()
    }
}

// fn get_aria_attrs() -> Vec<(String, String)> {
//     let agent = ureq::agent();
//     // scrape MDN for all the elements
//     let resp = agent
//         .get("https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Attributes")
//         .call()
//         .unwrap();
//     let html = resp.into_string().unwrap();
//     let document = Html::parse_document(&html);
//     let selector = Selector::parse(
//         "td:first-child > a[href^='/en-US/docs/Web/Accessibility/ARIA/Attributes/']:only-child",
//     )
//     .unwrap();
// }

/// Builds the `///` doc-comment text for an element from its MDN page.
///
/// The summary is taken from the `<p>` children of the main section content.
/// If there are none, it falls back to the content of the section labelled
/// `summary`. Every line is prefixed with `/// `, and a trailing
/// "More information" link to `url` is appended.
fn get_mdn_doc(document: &Html, url: &str) -> String {
    // Inner HTML of every node matching `css`, in document order.
    let inner_html_of = |css: &str| -> Vec<String> {
        let sel = Selector::parse(css).unwrap();
        document.select(&sel).map(|node| node.inner_html()).collect()
    };

    let mut fragments = inner_html_of(".main-page-content > .section-content > p");
    if fragments.is_empty() {
        fragments = inner_html_of(
            ".main-page-content > section[aria-labelledby='summary'] > .section-content",
        );
    }

    // Turn `<br>` into paragraph breaks, then make every line a doc comment line.
    let joined = fragments.join("\n\n");
    let with_breaks = joined.replace("<br>", "\n\n");
    let commented = with_breaks.replace('\n', "\n/// ");

    format!("/// {}\n///\n/// More information: <{url}>", commented)
}

/// Converts a `<dl>` definition list into attribute entries.
///
/// Each `<dt>` child yields one entry `(field_name, (doc, type, alloc))`:
/// - `field_name` is the first text node of the `<dt>` in snake_case, with a
///   trailing `_` appended when it is one of `type`, `loop`, `async`, `for`
///   or `as`;
/// - `doc` is the inner HTML of the element following the `<dt>`, with `<br>`
///   turned into blank lines and every newline followed by `/// `;
/// - `type` is the Rust type to generate for the field;
/// - `alloc` is `true` only for `data`, whose type lives in the `alloc` crate.
///
/// # Panics
///
/// Panics if a `<dt>` has no text node or no following sibling element.
fn dl_to_attrs(dl: ElementRef) -> Vec<(String, (String, String, bool))> {
    dl.children()
        .filter_map(ElementRef::wrap)
        .filter(|child| child.value().name() == "dt")
        .map(|term| {
            let raw_name = term.text().next().unwrap();
            let doc = term
                .next_sibling_element()
                .unwrap()
                .inner_html()
                .replace("<br>", "\n\n")
                .replace('\n', "\n/// ");
            let field = raw_name.to_snake_case();

            let (ty, alloc) = match field.as_str() {
                "data" => ("alloc::collections::BTreeMap<&'life str, &'life str>", true),
                "autofocus" | "checked" | "disabled" | "multiple" | "readonly" | "required"
                | "selected" | "novalidate" | "formnovalidate" | "hidden" => ("bool", false),
                _ => ("&'life str", false),
            };

            // These names cannot be used verbatim as field names, so suffix them.
            let field = if matches!(field.as_str(), "type" | "loop" | "async" | "for" | "as") {
                format!("{field}_")
            } else {
                field
            };

            (field, (doc, ty.to_string(), alloc))
        })
        .collect()
}