use std::{
collections::BTreeMap,
error::Error as StdError,
fmt::{Display, Formatter, Result as FmtResult},
};
use scraper::{ElementRef, Html, Selector};
/// Failures reported while turning the languages page into generated source.
#[derive(Debug)]
pub enum CodegenError {
    /// The input could not be turned into language rows; carries the detail message.
    Parse(String),
    /// A tag taken from the input was rejected; carries the detail message.
    Validation(String),
}

impl Display for CodegenError {
    /// Writes a category prefix followed by `": "` and the carried message.
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        let (prefix, detail) = match self {
            Self::Parse(detail) => ("Parse error", detail),
            Self::Validation(detail) => ("Validation error", detail),
        };
        write!(f, "{prefix}: {detail}")
    }
}

impl StdError for CodegenError {}
/// One language entry extracted from the source table, ready to be rendered.
#[derive(Debug)]
struct TagSpec {
/// Identifier derived from `tag` by `tag_to_ident`; rendered as the enum variant name.
ident: String,
/// Language tag as it appeared in the first table cell (after whitespace normalization).
tag: String,
/// English display name: the third table cell when non-empty, otherwise the second.
english: String,
/// Name of the language in the language itself, taken from the fourth table cell.
autonym: String,
}
/// Produces the generated source text for the given languages HTML page.
///
/// # Errors
///
/// Propagates any [`CodegenError`] raised while loading the language rows.
pub fn generate(languages_html: &str) -> Result<String, CodegenError> {
    load_languages(languages_html).map(|specs| render(&specs))
}
/// Parses the languages table in `languages_html` into a list of [`TagSpec`]s ordered by tag.
///
/// Rows are selected with `#languages-table tbody tr`; the first four `td` cells of each row are
/// read as tag, name, region and native name. A row is skipped when it has fewer than four cells,
/// when its tag contains `system` (case-insensitively), or when the tag, the English name or the
/// native name is empty. When the same tag appears more than once, the first occurrence wins.
///
/// # Errors
///
/// * [`CodegenError::Parse`] if a selector fails to parse or no usable row is found.
/// * [`CodegenError::Validation`] if a tag fails `validate_tag`, no identifier can be derived
///   from it, or two different tags map to the same identifier (which would otherwise produce
///   duplicate enum variants in the generated code).
fn load_languages(languages_html: &str) -> Result<Vec<TagSpec>, CodegenError> {
    let document = Html::parse_document(languages_html);
    let row_selector = Selector::parse("#languages-table tbody tr")
        .map_err(|err| CodegenError::Parse(format!("Invalid selector: {err}")))?;
    let cell_selector = Selector::parse("td")
        .map_err(|err| CodegenError::Parse(format!("Invalid selector: {err}")))?;

    // Keyed by tag so the output is sorted and exact duplicates collapse to the first row seen.
    let mut merged: BTreeMap<String, (String, String)> = BTreeMap::new();
    for row in document.select(&row_selector) {
        let mut cells = row.select(&cell_selector).map(extract_text);
        let (Some(tag), Some(name), Some(region), Some(native)) =
            (cells.next(), cells.next(), cells.next(), cells.next())
        else {
            // Fewer than four cells: not a language row.
            continue;
        };

        if tag.to_ascii_lowercase().contains("system") {
            continue;
        }

        // The third cell takes precedence over the second as the English name.
        let english = if region.is_empty() { name } else { region };
        if tag.is_empty() || english.is_empty() || native.is_empty() {
            continue;
        }

        merged.entry(tag).or_insert((english, native));
    }

    if merged.is_empty() {
        return Err(CodegenError::Parse("No languages found in source file.".into()));
    }

    let mut specs: Vec<TagSpec> = Vec::with_capacity(merged.len());
    for (tag, (english, native)) in merged {
        validate_tag(&tag).map_err(|msg| CodegenError::Validation(format!("{msg} (tag {tag})")))?;
        let ident = tag_to_ident(&tag)
            .map_err(|msg| CodegenError::Validation(format!("{msg} (tag {tag})")))?;

        // Tags that differ only by case are distinct map keys but yield the same identifier;
        // fail here instead of emitting an enum with duplicate variants.
        if let Some(previous) = specs.iter().find(|spec| spec.ident == ident) {
            return Err(CodegenError::Validation(format!(
                "Identifier `{ident}` is derived from both `{}` and `{tag}`.",
                previous.tag
            )));
        }

        specs.push(TagSpec { ident, tag, english, autonym: native });
    }

    Ok(specs)
}
/// Concatenates every text fragment under `cell` and normalizes the whitespace of the result.
fn extract_text(cell: ElementRef<'_>) -> String {
    let mut raw = String::new();
    raw.extend(cell.text());
    normalize_whitespace(&raw)
}
fn validate_tag(tag: &str) -> Result<(), String> {
if tag.is_empty() {
return Err("Tag is empty.".into());
}
if !tag.is_ascii() {
return Err("Tag contains non-ASCII characters.".into());
}
if tag.starts_with('-') || tag.ends_with('-') {
return Err("Tag must not start or end with a hyphen.".into());
}
let mut parts = tag.split('-').peekable();
let Some(language) = parts.next() else {
return Err("Tag is missing a language subtag.".into());
};
if !(2..=3).contains(&language.len()) || !language.chars().all(|c| c.is_ascii_alphabetic()) {
return Err(format!("Invalid language subtag `{language}`."));
}
if let Some(script) = parts.peek().copied()
&& script.len() == 4
&& script.chars().all(|c| c.is_ascii_alphabetic())
{
parts.next();
}
if let Some(region) = parts.peek().copied() {
let is_alpha_region = region.len() == 2 && region.chars().all(|c| c.is_ascii_alphabetic());
let is_numeric_region = region.len() == 3 && region.chars().all(|c| c.is_ascii_digit());
if is_alpha_region || is_numeric_region {
parts.next();
}
}
for variant in parts {
if variant.len() < 4 || variant.len() > 8 {
return Err(format!("Variant subtag `{variant}` has invalid length."));
}
if !variant.chars().all(|c| c.is_ascii_alphanumeric()) {
return Err(format!("Variant subtag `{variant}` contains invalid characters."));
}
}
Ok(())
}
/// Collapses every run of whitespace in `input` into a single space, drops soft hyphens
/// (U+00AD) and non-whitespace control characters, and trims both ends.
///
/// Whitespace is tested before the control-character filter: `\n`, `\t` and `\r` are both
/// control characters and whitespace, and must act as word separators rather than vanish
/// (otherwise `"a\nb"` would become `"ab"`).
fn normalize_whitespace(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut last_space = false;
    for ch in input.chars() {
        if ch.is_whitespace() {
            if !last_space {
                out.push(' ');
            }
            last_space = true;
        } else if ch == '\u{00ad}' || ch.is_control() {
            // Invisible characters are removed without interrupting a whitespace run.
            continue;
        } else {
            out.push(ch);
            last_space = false;
        }
    }
    out.trim().to_owned()
}
/// Derives a CamelCase identifier from a hyphen-separated tag.
///
/// Each component contributes its first character ASCII-uppercased and the remaining
/// characters ASCII-lowercased; the hyphens are dropped (`en-US` becomes `EnUs`).
///
/// # Errors
///
/// Returns a message if any component is empty or the resulting identifier is empty.
fn tag_to_ident(tag: &str) -> Result<String, String> {
    let mut ident = String::new();
    for component in tag.split('-') {
        if component.is_empty() {
            return Err("Tag contains an empty component.".into());
        }
        for (position, ch) in component.chars().enumerate() {
            let cased = if position == 0 {
                ch.to_ascii_uppercase()
            } else {
                ch.to_ascii_lowercase()
            };
            ident.push(cased);
        }
    }
    // Defensive guard kept from the original contract.
    if ident.is_empty() {
        return Err("Failed to derive identifier from tag.".into());
    }
    Ok(ident)
}
/// Prefixes every backslash and double quote in `s` with a backslash so the text can be placed
/// between double quotes in the generated source.
fn escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        if matches!(ch, '\\' | '"') {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
/// Renders the generated Rust source for the `Language` enum from `specs`.
///
/// The output is assembled by appending, in order: a fixed header that opens the enum, one
/// variant per spec, an `all()` function listing every variant, the `tag()`, `name()` and
/// `local_name()` lookup matches, a `TryFrom<&str>` match keyed by tag, and a fixed tail holding
/// the `TryFrom<String>`, serde and utoipa impls.
///
/// Only the strings emitted by `name()` and `local_name()` go through `escape`; the tag and the
/// doc-comment text are inserted as-is.
fn render(specs: &[TagSpec]) -> String {
let mut out = String::new();
// Fixed header: imports, the enum's doc comment and derives, and the opening of the enum.
out.push_str(
"\
// std
use std::convert::TryFrom;
// self
use crate::prelude::*;
use Language::*;
/// Generated from the translation.io languages-with-plural-cases page.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Language {
",
);
// One enum variant per spec, documented with its English name.
for spec in specs {
out.push_str(&format!(
" /// {}
{},
",
spec.english, spec.ident
));
}
// Close the enum and open `all()`; the array length is filled in from `specs.len()`.
out.push_str(
"}
impl Language {
/// Get all languages.
#[rustfmt::skip]
pub const fn all() -> [Self; ",
);
out.push_str(&specs.len().to_string());
out.push_str(
"] {
[
",
);
// Array elements of `all()`, in the order of `specs`.
for spec in specs {
out.push_str(&format!(
" {},
",
spec.ident
));
}
// Close `all()` and open `tag()`.
out.push_str(
" ]
}
/// Get the language tag.
pub fn tag(&self) -> &'static str {
match self {
",
);
// `tag()` arms: variant => tag (inserted without escaping).
for spec in specs {
out.push_str(&format!(
" {} => \"{}\",
",
spec.ident, spec.tag
));
}
// Close `tag()` and open `name()`.
// NOTE(review): the generated `name()` returns `&str` while `tag()` and `local_name()` return
// `&'static str` — confirm whether that difference is intended.
out.push_str(
" }
}
/// Get the language name.
pub fn name(&self) -> &str {
match self {
",
);
// `name()` arms: variant => escaped English name.
for spec in specs {
out.push_str(&format!(
" {} => \"{}\",
",
spec.ident,
escape(&spec.english)
));
}
// Close `name()` and open `local_name()`.
out.push_str(
" }
}
/// Get the language name in the language itself.
pub fn local_name(&self) -> &'static str {
match self {
",
);
// `local_name()` arms: variant => escaped autonym.
for spec in specs {
out.push_str(&format!(
" {} => \"{}\",
",
spec.ident,
escape(&spec.autonym)
));
}
// Close the inherent impl and open the `TryFrom<&str>` match.
out.push_str(
" }
}
}
impl TryFrom<&str> for Language {
type Error = Error;
fn try_from(tag: &str) -> Result<Self, Self::Error> {
let this = match tag {
",
);
// `TryFrom<&str>` arms: tag => variant.
for spec in specs {
out.push_str(&format!(
" \"{}\" => {},
",
spec.tag, spec.ident
));
}
// Fixed tail: fallback arm, then the `TryFrom<String>`, serde and utoipa impls.
out.push_str(
" _ => return Err(Error::UnsupportedLanguageTag(tag.into())),
};
Ok(this)
}
}
impl TryFrom<String> for Language {
type Error = Error;
fn try_from(value: String) -> Result<Self, Self::Error> {
Language::try_from(value.as_str())
}
}
#[cfg(feature = \"serde\")]
impl serde::Serialize for Language {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::ser::Serializer,
{
serializer.serialize_str(self.tag())
}
}
#[cfg(feature = \"serde\")]
impl<'de> serde::Deserialize<'de> for Language {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let tag = String::deserialize(deserializer)?;
Language::try_from(tag.as_str()).map_err(|_| serde::de::Error::unknown_variant(&tag, &[]))
}
}
#[cfg(feature = \"utoipa\")]
impl utoipa::PartialSchema for Language {
fn schema() -> utoipa::openapi::RefOr<utoipa::openapi::schema::Schema> {
let enum_values =
Language::all().iter().map(|language| language.tag().to_string()).collect::<Vec<_>>();
let object = utoipa::openapi::schema::ObjectBuilder::new()
.schema_type(utoipa::openapi::schema::SchemaType::Type(
utoipa::openapi::schema::Type::String,
))
.enum_values(Some(enum_values))
.build();
utoipa::openapi::RefOr::T(utoipa::openapi::schema::Schema::Object(object))
}
}
#[cfg(feature = \"utoipa\")]
impl utoipa::ToSchema for Language {}
",
);
out
}