/// Build-script entry point.
///
/// Loads the entity list from `entities.json` and, depending on the enabled
/// Cargo features, generates `matcher.rs` (feature `unescape_fast`) and/or
/// `entities.rs` (feature `entities`). With neither feature enabled nothing is
/// loaded or generated.
fn main() {
    // `entities.json` is the only file this script reads. Declaring it keeps
    // Cargo from rerunning the script every time any other file in the package
    // changes (the default when no `rerun-if-changed` directive is emitted).
    println!("cargo:rerun-if-changed=entities.json");

    #[cfg(any(feature = "unescape_fast", feature = "entities"))]
    let entities = load_entities("entities.json");

    #[cfg(feature = "unescape_fast")]
    generate_matcher_rs(&entities);

    #[cfg(feature = "entities")]
    generate_entities_rs(&entities);
}
/// Generates `entities.rs` in `OUT_DIR`.
///
/// The generated file contains:
///
///   * `ENTITIES`, a `phf::Map` from entity names (as bytes) to the bytes of
///     their expansions, preceded by a doc comment that includes a Markdown
///     table listing every entity, its codepoints, and its glyph.
///   * `ENTITY_MAX_LENGTH` and `ENTITY_MIN_LENGTH`, the byte lengths of the
///     longest and shortest names in `entities`. (If `entities` is empty these
///     stay at their initial values, `0` and `usize::MAX`.)
///
/// `entities` is a list of `(name, expansion)` pairs.
///
/// # Panics
///
/// Panics if `OUT_DIR` is not set, or if the output file cannot be created,
/// written, or flushed.
#[cfg(feature = "entities")]
fn generate_entities_rs(entities: &[(String, String)]) {
    use std::cmp::{max, min};
    use std::env;
    use std::fs::File;
    use std::io::{BufWriter, Write};
    use std::path::Path;

    let out_dir = env::var("OUT_DIR").expect("OUT_DIR should be set for build scripts");
    let out_path = Path::new(&out_dir).join("entities.rs");
    let mut out = BufWriter::new(
        File::create(out_path).expect("failed to create entities.rs in OUT_DIR"),
    );

    // Doc comment header for `ENTITIES`, ending with the table header. The
    // table rows are appended one per entity in the loop below.
    writeln!(
        out,
        "\
        #[allow(clippy::doc_markdown)] // Doesn’t work correctly here.\n\
        /// A map of all valid HTML entities to their expansions.\n\
        ///\n\
        /// The keys of the map are full entity byte strings, e.g. `b\"&copy;\"`, and the\n\
        /// values are their expansions, e.g. `b\"©\"`.\n\
        ///\n\
        /// See the [WHATWG HTML spec][spec] for the canonical list of entities with\n\
        /// their codepoints and glyphs. The [entities.json][] file linked there is\n\
        /// used to generate this constant.\n\
        ///\n\
        /// [spec]: https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references\n\
        /// [entities.json]: https://html.spec.whatwg.org/entities.json\n\
        ///\n\
        /// Entity | Codepoints | Glyph\n\
        /// -------------------------------|--------------------|------"
    )
    .unwrap();

    let mut map_builder = phf_codegen::Map::<&[u8]>::new();
    let mut max_len: usize = 0;
    let mut min_len: usize = usize::MAX;

    for (name, glyph) in entities {
        // The map value is Rust source text: a reference to a byte-array
        // literal, e.g. `&[194, 169]`.
        map_builder.entry(name.as_bytes(), &format!("&{:?}", glyph.as_bytes()));
        max_len = max(max_len, name.len());
        min_len = min(min_len, name.len());

        // Wrap the name in backticks before padding so the padding lands
        // outside the code span.
        let name = format!("`{name}`");
        let codepoints = glyph
            .chars()
            .map(|c| format!("U+{:06X}", u32::from(c)))
            .collect::<Vec<_>>()
            .join(", ");

        // Keep glyphs from breaking the Markdown table: newline and tab are
        // dropped, while backtick and pipe are backslash-escaped (an unescaped
        // pipe would be read as a cell separator).
        let glyph = match glyph.as_str() {
            "\n" | "\t" => "",
            "`" => "\\`",
            "|" => "\\|",
            v => v,
        };

        writeln!(out, "/// {name:30} | {codepoints:18} | {glyph}",).unwrap();
    }

    let map = map_builder.build();
    writeln!(
        out,
        "\
        #[allow(clippy::unreadable_literal)]\n\
        pub static ENTITIES: phf::Map<&[u8], &[u8]> = {map};\n\
        \n\
        /// Length of longest entity including ‘&’ and possibly ‘;’.\n\
        pub const ENTITY_MAX_LENGTH: usize = {max_len};\n\
        \n\
        /// Length of shortest entity including ‘&’ and possibly ‘;’.\n\
        pub const ENTITY_MIN_LENGTH: usize = {min_len};"
    )
    .unwrap();

    // Dropping a `BufWriter` ignores flush errors; flush explicitly so a
    // failed write cannot silently leave a truncated `entities.rs` behind.
    out.flush().expect("failed to flush entities.rs");
}
/// Generates `matcher.rs` in `OUT_DIR`.
///
/// Builds a `matchgen::TreeMatcher` configured with the function header
/// `fn entity_matcher` and the value type `(bool, &'static [u8])`, and renders
/// it to the output file. Each entity name (as bytes) is added with a value
/// made of whether the name ends with `;` and the bytes of its expansion.
///
/// `entities` is a list of `(name, expansion)` pairs.
///
/// # Panics
///
/// Panics if `OUT_DIR` is not set, or if the output file cannot be created,
/// written, or flushed.
#[cfg(feature = "unescape_fast")]
fn generate_matcher_rs(entities: &[(String, String)]) {
    use std::env;
    use std::fs::File;
    use std::io::{BufWriter, Write};
    use std::path::Path;

    let out_dir = env::var("OUT_DIR").expect("OUT_DIR should be set for build scripts");
    let out_path = Path::new(&out_dir).join("matcher.rs");
    let mut out = BufWriter::new(
        File::create(out_path).expect("failed to create matcher.rs in OUT_DIR"),
    );

    let mut matcher = matchgen::TreeMatcher::new(
        "fn entity_matcher",
        "(bool, &'static [u8])",
    );

    for (name, glyph) in entities {
        // The value is Rust source text for the tuple, e.g.
        // `(true, &[194, 169])`.
        matcher.add(
            name.as_bytes(),
            format!("({:?}, &{:?})", name.ends_with(';'), glyph.as_bytes()),
        );
    }

    matcher
        .doc("Used in `match_entity()`.")
        .disable_clippy(true)
        .input_type(matchgen::Input::Iterator)
        .render(&mut out)
        .unwrap();
    writeln!(out).unwrap();

    // Dropping a `BufWriter` ignores flush errors; flush explicitly so a
    // failed write cannot silently leave a truncated `matcher.rs` behind.
    out.flush().expect("failed to flush matcher.rs");
}
/// Loads entity names and their expansions from a JSON file.
///
/// The file at `path` must contain a JSON object whose keys are entity names
/// and whose values each have a string `"characters"` member. Returns one
/// `(name, characters)` pair per key, in the map's iteration order.
///
/// # Panics
///
/// Panics, naming the file (and the entity where applicable), if the file
/// cannot be read, is not a JSON object, or an entry has no string
/// `"characters"` member.
#[cfg(any(feature = "unescape_fast", feature = "entities"))]
fn load_entities<P: AsRef<std::path::Path>>(path: P) -> Vec<(String, String)> {
    let path = path.as_ref();

    let raw = std::fs::read(path)
        .unwrap_or_else(|error| panic!("failed to read {}: {error}", path.display()));
    let parsed: serde_json::Map<String, serde_json::Value> = serde_json::from_slice(&raw)
        .unwrap_or_else(|error| {
            panic!("failed to parse {} as a JSON object: {error}", path.display())
        });

    // Consume the map so the keys can be moved out instead of cloned.
    parsed
        .into_iter()
        .map(|(name, info)| {
            let characters = info
                .get("characters")
                .and_then(serde_json::Value::as_str)
                .unwrap_or_else(|| {
                    panic!(
                        "entity {name:?} in {} has no string \"characters\" member",
                        path.display()
                    )
                })
                .to_owned();
            (name, characters)
        })
        .collect()
}