#![warn(missing_docs)]
#![deny(unsafe_code)]
use icu_locale::{LanguageIdentifier, LocaleExpander};
use serde::Deserialize;
use std::collections::{HashMap, HashSet};
/// A matching rule that can be tested against a tag of type `T`.
///
/// `vars` supplies the named variable sets that a rule may refer to.
trait Rule<T> {
/// Returns `true` when `tag` satisfies this rule. Takes `self` by value,
/// so the trait is implemented on reference (or `Option` of reference) types.
fn matches(self, tag: T, vars: &Variables) -> bool;
}
/// A pattern for a single subtag (language, script, or region) of a rule.
#[derive(Debug, PartialEq)]
enum SubTagRule {
    /// A literal subtag; matches only that exact text.
    Str(String),
    /// A reference to a named variable; matches any value in that variable's set.
    Var(String),
    /// A negated variable reference; matches any value *not* in that variable's set.
    VarExclude(String),
    /// The wildcard; matches anything.
    All,
}

impl From<&'_ str> for SubTagRule {
    /// Parses the textual form of a subtag pattern.
    ///
    /// * `$!name` becomes [`SubTagRule::VarExclude`] holding `name`
    /// * `$name` becomes [`SubTagRule::Var`] holding `name`
    /// * `*` becomes [`SubTagRule::All`]
    /// * anything else becomes [`SubTagRule::Str`] holding the text unchanged
    fn from(s: &'_ str) -> Self {
        // `$!` has to be tried before `$`: every `$!name` also starts with `$`.
        if let Some(excluded) = s.strip_prefix("$!") {
            return Self::VarExclude(excluded.to_owned());
        }
        if let Some(variable) = s.strip_prefix('$') {
            return Self::Var(variable.to_owned());
        }
        match s {
            "*" => Self::All,
            literal => Self::Str(literal.to_owned()),
        }
    }
}
impl Rule<&'_ str> for &'_ SubTagRule {
    /// Tests a single subtag against this pattern.
    ///
    /// # Panics
    ///
    /// Panics if the pattern names a variable that is not present in `vars`.
    fn matches(self, tag: &str, vars: &Variables) -> bool {
        // Indexing (rather than `get`) is deliberate: an unknown variable name panics.
        let in_variable = |name: &String| vars[name].contains(tag);
        match self {
            SubTagRule::All => true,
            SubTagRule::Str(literal) => literal.as_str() == tag,
            SubTagRule::Var(name) => in_variable(name),
            SubTagRule::VarExclude(name) => !in_variable(name),
        }
    }
}
impl Rule<Option<&'_ str>> for Option<&'_ SubTagRule> {
    /// Tests an optional subtag against an optional pattern.
    ///
    /// * A wildcard pattern matches whether or not the subtag is present.
    /// * Any other pattern requires the subtag to be present and to match it.
    /// * An absent pattern matches only an absent subtag.
    fn matches(self, tag: Option<&str>, vars: &Variables) -> bool {
        match self {
            Some(SubTagRule::All) => true,
            Some(rule) => match tag {
                Some(tag) => rule.matches(tag, vars),
                None => false,
            },
            None => tag.is_none(),
        }
    }
}
/// A pattern over a language identifier, with one sub-pattern per subtag.
///
/// Deserialized by way of `String` (`#[serde(from = "String")]`), so the
/// textual form is parsed by the `From<String>` conversion of this type.
#[derive(Debug, PartialEq, Deserialize)]
#[serde(from = "String")]
struct LanguageIdentifierRule {
/// Pattern for the language subtag; always present.
pub language: SubTagRule,
/// Pattern for the script subtag; `None` when the rule text has no second part.
pub script: Option<SubTagRule>,
/// Pattern for the region subtag; `None` when the rule text has no third part.
pub region: Option<SubTagRule>,
}
impl From<&'_ str> for LanguageIdentifierRule {
    /// Parses a rule written as up to three `_`-separated subtag patterns,
    /// in the order language, script, region. Parts after the third are ignored.
    fn from(s: &'_ str) -> Self {
        let mut subtags = s.split('_').map(Into::into);
        // `split` always yields at least one item (possibly empty), so this cannot fail.
        let language = subtags.next().unwrap();
        Self {
            language,
            script: subtags.next(),
            region: subtags.next(),
        }
    }
}
impl From<String> for LanguageIdentifierRule {
    /// Owned-string entry point (used by serde); delegates to the `&str` conversion.
    fn from(s: String) -> Self {
        Self::from(s.as_str())
    }
}
impl Rule<&'_ LanguageIdentifier> for &'_ LanguageIdentifierRule {
    /// Tests a whole language identifier against this rule.
    ///
    /// The language, script and region patterns must all match. They are
    /// checked in that order and checking stops at the first failure.
    fn matches(self, lang: &LanguageIdentifier, vars: &Variables) -> bool {
        if !self.language.matches(lang.language.as_str(), vars) {
            return false;
        }

        let script = lang.script.as_ref().map(|s| s.as_str());
        if !self.script.as_ref().matches(script, vars) {
            return false;
        }

        let region = lang.region.as_ref().map(|r| r.as_str());
        self.region.as_ref().matches(region, vars)
    }
}
/// Deserialized form of the `paradigmLocales` element.
#[derive(Debug, Deserialize, PartialEq)]
struct ParadigmLocales {
/// Raw `locales` attribute (the `@` prefix is quick-xml's naming for XML
/// attributes): a space-separated list of locale identifiers.
#[serde(rename = "@locales")]
pub locales: String,
}
/// Deserialized form of one `matchVariable` element: a named set of subtag values.
#[derive(Debug, Deserialize, PartialEq)]
struct MatchVariable {
/// Variable name from the `id` attribute; expected to start with `$`.
#[serde(rename = "@id")]
pub id: String,
/// Raw `value` attribute: the member values joined with `+`.
#[serde(rename = "@value")]
pub value: String,
}
/// Deserialized form of one `languageMatch` element: a distance rule between
/// a desired and a supported language pattern.
#[derive(Debug, Deserialize, PartialEq)]
struct LanguageMatch {
/// Pattern the desired identifier must match.
#[serde(rename = "@desired")]
pub desired: LanguageIdentifierRule,
/// Pattern the supported identifier must match.
#[serde(rename = "@supported")]
pub supported: LanguageIdentifierRule,
/// Distance assigned when the rule applies (scaled by 10 by the matcher).
#[serde(rename = "@distance")]
pub distance: u16,
/// When `false` (the default if the attribute is absent), the rule is also
/// tried with the desired and supported roles swapped.
#[serde(default, rename = "@oneway")]
pub oneway: bool,
}
/// Deserialized form of the `languageMatches` element; child names are camelCase.
#[derive(Debug, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase")]
struct LanguageMatches {
/// The `paradigmLocales` child element.
pub paradigm_locales: ParadigmLocales,
/// All `matchVariable` child elements.
pub match_variable: Vec<MatchVariable>,
/// All `languageMatch` child elements; the matcher tries them in this order
/// and uses the first one that applies.
pub language_match: Vec<LanguageMatch>,
}
/// Deserialized form of the `languageMatching` element, which wraps `languageMatches`.
#[derive(Debug, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase")]
struct LanguageMatching {
/// The `languageMatches` child element.
pub language_matches: LanguageMatches,
}
/// Root of the deserialized document; only the `languageMatching` child is read.
#[derive(Debug, Deserialize, PartialEq)]
#[serde(rename_all = "camelCase")]
struct SupplementalData {
/// The `languageMatching` child element.
pub language_matching: LanguageMatching,
}
/// Contents of `data/languageInfo.xml` (relative to the crate manifest
/// directory), embedded into the binary at compile time.
const LANGUAGE_INFO: &str = include_str!(concat!(
env!("CARGO_MANIFEST_DIR"),
"/data/languageInfo.xml"
));
/// Computes distances between language identifiers and picks the closest
/// entry from a list of supported identifiers.
///
/// Create one with [`LanguageMatcher::new`], which loads the bundled rule data.
pub struct LanguageMatcher {
// Paradigm locales, stored after maximization by `expander`.
paradigm: HashSet<LanguageIdentifier>,
// Named value sets that `$name` / `$!name` rule patterns refer to.
vars: Variables,
// Distance rules, tried in order; the first applicable rule is used.
rules: Vec<LanguageMatch>,
// Used to maximize identifiers before they are compared.
expander: LocaleExpander,
}
/// Maps a variable name (stored without its leading `$`) to the set of
/// values that variable contains.
type Variables = HashMap<String, HashSet<String>>;
impl From<SupplementalData> for LanguageMatcher {
    /// Builds a matcher from deserialized language-matching data.
    ///
    /// * Every paradigm locale is parsed and maximized before being stored.
    /// * Every match variable is stored under its name without the leading
    ///   `$`, with its `+`-separated value split into a set.
    /// * The match rules are kept in document order.
    ///
    /// # Panics
    ///
    /// Panics if a paradigm locale is not a valid language identifier. In
    /// debug builds it also panics if a variable id does not start with `$`.
    fn from(data: SupplementalData) -> Self {
        let expander = LocaleExpander::new_extended();
        let matches = data.language_matching.language_matches;

        // `split_whitespace` (rather than `split(' ')`) never yields empty
        // tokens, so repeated, leading or trailing whitespace — including
        // line breaks inside the attribute — cannot reach the parser.
        let paradigm = matches
            .paradigm_locales
            .locales
            .split_whitespace()
            .map(|s| {
                let mut lang = s
                    .parse()
                    .expect("paradigm locale must be a valid language identifier");
                expander.maximize(&mut lang);
                lang
            })
            .collect::<HashSet<_>>();

        let vars = matches
            .match_variable
            .into_iter()
            .map(|MatchVariable { id, value }| {
                debug_assert!(id.starts_with('$'));
                // Strip the sigil only when it is really there; blindly slicing
                // `id[1..]` would drop a meaningful first character in release
                // builds, or panic on a multi-byte one.
                let name = id.strip_prefix('$').unwrap_or(&id).to_string();
                let values: HashSet<String> = value.split('+').map(String::from).collect();
                (name, values)
            })
            .collect::<HashMap<_, _>>();

        Self {
            paradigm,
            vars,
            rules: matches.language_match,
            expander,
        }
    }
}
impl LanguageMatcher {
    /// Distance at or above which [`LanguageMatcher::matches`] reports no match.
    const NO_MATCH_DISTANCE: u16 = 1000;

    /// Creates a matcher from the `languageInfo.xml` data bundled with the crate.
    ///
    /// # Panics
    ///
    /// Panics if the bundled data cannot be deserialized or contains an
    /// invalid paradigm locale.
    pub fn new() -> Self {
        let data: SupplementalData = quick_xml::de::from_str(LANGUAGE_INFO)
            .expect("bundled languageInfo.xml must deserialize");
        data.into()
    }

    /// Picks the entry of `supported` that is closest to `desired`.
    ///
    /// `desired` and every candidate are maximized before being compared.
    /// The returned reference is the candidate exactly as it was passed in
    /// (not its maximized form), paired with its distance.
    ///
    /// Returns `None` when `supported` is empty or when even the best
    /// candidate has a distance of 1000 or more. If several candidates share
    /// the smallest distance, the first one in iteration order wins.
    ///
    /// # Panics
    ///
    /// Panics under the same conditions as [`LanguageMatcher::distance`].
    pub fn matches<'a>(
        &self,
        mut desired: LanguageIdentifier,
        supported: impl IntoIterator<Item = &'a LanguageIdentifier>,
    ) -> Option<(&'a LanguageIdentifier, u16)> {
        self.expander.maximize(&mut desired);
        supported
            .into_iter()
            .map(|candidate| {
                let mut maximized = candidate.clone();
                self.expander.maximize(&mut maximized);
                (candidate, self.distance_impl(desired.clone(), maximized))
            })
            .min_by_key(|&(_, distance)| distance)
            .filter(|&(_, distance)| distance < Self::NO_MATCH_DISTANCE)
    }

    /// Returns the distance between `desired` and `supported`.
    ///
    /// Both identifiers are maximized first. The result is the sum of the
    /// (scaled) rule distances for each level — region, script, language —
    /// at which the two differ; identical identifiers have distance `0`.
    ///
    /// # Panics
    ///
    /// Panics if no rule applies to a differing level, or if an applicable
    /// rule refers to a variable that is not defined in the rule data.
    pub fn distance(
        &self,
        mut desired: LanguageIdentifier,
        mut supported: LanguageIdentifier,
    ) -> u16 {
        self.expander.maximize(&mut desired);
        self.expander.maximize(&mut supported);
        self.distance_impl(desired, supported)
    }

    /// Sums the per-level distances of two already-maximized identifiers.
    ///
    /// Levels are compared from most to least specific. After each level the
    /// corresponding subtag is cleared on both sides, so the next lookup is
    /// made against rules written for the shorter form.
    fn distance_impl(
        &self,
        mut desired: LanguageIdentifier,
        mut supported: LanguageIdentifier,
    ) -> u16 {
        debug_assert!(desired.region.is_some());
        debug_assert!(desired.script.is_some());
        debug_assert!(supported.region.is_some());
        debug_assert!(supported.script.is_some());

        // Saturating arithmetic: the operands come from data, so never wrap.
        let mut distance: u16 = 0;

        if desired.region != supported.region {
            distance = distance.saturating_add(self.distance_match(&desired, &supported));
        }
        desired.region = None;
        supported.region = None;

        if desired.script != supported.script {
            distance = distance.saturating_add(self.distance_match(&desired, &supported));
        }
        desired.script = None;
        supported.script = None;

        if desired.language != supported.language {
            distance = distance.saturating_add(self.distance_match(&desired, &supported));
        }

        distance
    }

    /// Looks up the distance of the first rule that applies to the pair.
    ///
    /// A rule applies when its patterns match `desired` and `supported`, or —
    /// unless the rule is one-way — when they match with the roles swapped.
    /// The rule's distance is multiplied by 10; if exactly one of the two
    /// identifiers is a paradigm locale, the result is reduced by one.
    ///
    /// # Panics
    ///
    /// Panics if no rule applies.
    fn distance_match(&self, desired: &LanguageIdentifier, supported: &LanguageIdentifier) -> u16 {
        for rule in &self.rules {
            let forward = rule.desired.matches(desired, &self.vars)
                && rule.supported.matches(supported, &self.vars);
            let applies = forward
                || (!rule.oneway
                    && rule.supported.matches(desired, &self.vars)
                    && rule.desired.matches(supported, &self.vars));
            if !applies {
                continue;
            }

            let scaled = rule.distance.saturating_mul(10);
            // `saturating_sub` keeps a zero-distance rule at zero instead of
            // underflowing the unsigned value.
            return if self.is_paradigm(desired) != self.is_paradigm(supported) {
                scaled.saturating_sub(1)
            } else {
                scaled
            };
        }
        unreachable!("no language match rule applied; the rule data is expected to contain a catch-all")
    }

    /// Returns whether `lang` is exactly equal to one of the stored
    /// (maximized) paradigm locales.
    fn is_paradigm(&self, lang: &LanguageIdentifier) -> bool {
        self.paradigm.contains(lang)
    }
}
impl Default for LanguageMatcher {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod test {
    use crate::LanguageMatcher;
    use icu_locale::langid;

    /// Pairwise distances for a few Chinese variants, using the bundled data.
    #[test]
    fn distance() {
        let matcher = LanguageMatcher::new();
        let cases = [
            (langid!("zh-CN"), langid!("zh-Hans"), 0_u16),
            (langid!("zh-TW"), langid!("zh-Hant"), 0_u16),
            (langid!("zh-HK"), langid!("zh-MO"), 40_u16),
            (langid!("zh-HK"), langid!("zh-Hant"), 50_u16),
        ];
        for (desired, supported, expected) in &cases {
            assert_eq!(
                matcher.distance(desired.clone(), supported.clone()),
                *expected
            );
        }
    }

    /// The best candidate is returned as passed in, together with its distance.
    #[test]
    fn matcher() {
        let matcher = LanguageMatcher::new();
        let accepts = [
            langid!("en"),
            langid!("ja"),
            langid!("zh-Hans"),
            langid!("zh-Hant"),
        ];
        let simplified = langid!("zh-Hans");
        let traditional = langid!("zh-Hant");

        let for_cn = matcher.matches(langid!("zh-CN"), &accepts);
        assert_eq!(for_cn, Some((&simplified, 0)));

        let for_tw = matcher.matches(langid!("zh-TW"), &accepts);
        assert_eq!(for_tw, Some((&traditional, 0)));
    }
}