use std::fmt;
use std::str::FromStr;
use serde::{Deserialize, Serialize};
use thiserror::Error;
/// Errors reported while constructing or parsing a [`LanguageTag`].
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute.
#[derive(Error, Debug)]
pub enum LanguageTagError {
/// The input string was empty; returned by `LanguageTag::parse` before any parsing is attempted.
#[error("Empty language tag")]
Empty,
/// A language subtag was rejected; the payload is interpolated into the message.
// NOTE(review): not constructed anywhere in the visible code — confirm whether callers use it.
#[error("Invalid language code: {0}")]
InvalidLanguage(String),
/// A script subtag was rejected; the payload is interpolated into the message.
// NOTE(review): not constructed anywhere in the visible code — confirm whether callers use it.
#[error("Invalid script code: {0}")]
InvalidScript(String),
/// A region subtag was rejected; the payload is interpolated into the message.
// NOTE(review): not constructed anywhere in the visible code — confirm whether callers use it.
#[error("Invalid region code: {0}")]
InvalidRegion(String),
/// The underlying `unic_langid` parser failed; the payload is that error rendered as a string.
#[error("Language tag parse error: {0}")]
Parse(String),
}
/// A language tag split into its language, script, region and variant subtags.
///
/// Equality and hashing are derived, so they compare all four fields exactly as stored.
/// The `Display` impl joins the present subtags with `-` in the order language, script,
/// region, variant.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct LanguageTag {
// Primary language subtag; the `new`/`with_*` constructors store it lowercased.
language: String,
// Optional script subtag; `with_script` stores it with the first character uppercased
// and the remainder lowercased (e.g. `Hans`).
script: Option<String>,
// Optional region subtag; `with_region` stores it uppercased (e.g. `US`).
region: Option<String>,
// Optional variant subtag, appended last by `Display` when present.
variant: Option<String>,
}
impl LanguageTag {
    /// Creates a tag that carries only a language subtag.
    ///
    /// The language is lowercased; no validation is performed.
    pub fn new(language: impl Into<String>) -> Self {
        Self {
            language: language.into().to_lowercase(),
            script: None,
            region: None,
            variant: None,
        }
    }

    /// Creates a `language-REGION` tag.
    ///
    /// The language is lowercased and the region uppercased; no validation is performed.
    /// An empty `region` is treated as "no region" so the tag never renders with a
    /// dangling hyphen (`"en-"`).
    pub fn with_region(language: impl Into<String>, region: impl Into<String>) -> Self {
        let region = region.into();
        Self {
            region: (!region.is_empty()).then(|| region.to_uppercase()),
            ..Self::new(language)
        }
    }

    /// Creates a `language-Script` tag.
    ///
    /// The language is lowercased and the script is title-cased (first character
    /// uppercased, remainder lowercased); no validation is performed. An empty `script`
    /// is treated as "no script" so the tag never renders with a dangling hyphen.
    pub fn with_script(language: impl Into<String>, script: impl Into<String>) -> Self {
        let script = script.into();
        let mut chars = script.chars();
        // `next()` yields `None` for an empty string, which maps straight to "no script".
        let script = chars.next().map(|first| {
            let mut titled: String = first.to_uppercase().collect();
            titled.push_str(&chars.as_str().to_lowercase());
            titled
        });
        Self {
            script,
            ..Self::new(language)
        }
    }

    /// Parses `tag` with `unic_langid` and copies its subtags into a `LanguageTag`.
    ///
    /// Variant subtags reported by the parser are preserved: they are joined with `-`
    /// (in the order the parser yields them) and stored in the single `variant` field,
    /// so that formatting a parsed tag does not silently drop them.
    ///
    /// # Errors
    ///
    /// * [`LanguageTagError::Empty`] when `tag` is the empty string.
    /// * [`LanguageTagError::Parse`] carrying the parser's message when `unic_langid`
    ///   rejects the input.
    pub fn parse(tag: &str) -> Result<Self, LanguageTagError> {
        if tag.is_empty() {
            return Err(LanguageTagError::Empty);
        }
        let langid: unic_langid::LanguageIdentifier =
            tag.parse()
                .map_err(|e: unic_langid::LanguageIdentifierError| {
                    LanguageTagError::Parse(e.to_string())
                })?;
        let variants: Vec<String> = langid.variants().map(|v| v.to_string()).collect();
        Ok(Self {
            language: langid.language.to_string(),
            script: langid.script.map(|s| s.to_string()),
            region: langid.region.map(|r| r.to_string()),
            variant: (!variants.is_empty()).then(|| variants.join("-")),
        })
    }

    /// Returns the language subtag.
    pub fn language(&self) -> &str {
        &self.language
    }

    /// Returns the script subtag, if any.
    pub fn script(&self) -> Option<&str> {
        self.script.as_deref()
    }

    /// Returns the region subtag, if any.
    pub fn region(&self) -> Option<&str> {
        self.region.as_deref()
    }

    /// Returns the variant subtag(s), if any.
    pub fn variant(&self) -> Option<&str> {
        self.variant.as_deref()
    }

    /// Returns a copy of this tag reduced to its language subtag.
    pub fn base(&self) -> LanguageTag {
        LanguageTag {
            // Copied verbatim (not re-normalised) so the stored spelling is kept.
            language: self.language.clone(),
            script: None,
            region: None,
            variant: None,
        }
    }

    /// Reports whether this tag falls under the pattern `other`.
    ///
    /// The languages must be equal, and every script or region that `other` specifies
    /// must be equal in `self`. Subtags that `other` leaves unset act as wildcards, so
    /// `en-US` matches `en` but `en-US` does not match `en-GB`. Variants are not compared.
    pub fn matches(&self, other: &LanguageTag) -> bool {
        if self.language != other.language {
            return false;
        }
        let script_ok = other.script.is_none() || self.script == other.script;
        let region_ok = other.region.is_none() || self.region == other.region;
        script_ok && region_ok
    }

    /// Builds a `/`-separated relative path for this tag.
    ///
    /// A tag without script and region maps to just the language (`en`); otherwise the
    /// full formatted tag is nested under the language (`en/en-US`).
    pub fn to_path(&self) -> String {
        if self.script.is_none() && self.region.is_none() {
            self.language.clone()
        } else {
            format!("{}/{}", self.language, self)
        }
    }
}
/// Formats the tag as its subtags joined by `-`, in the order
/// language, script, region, variant; absent subtags are skipped.
impl fmt::Display for LanguageTag {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.language)?;
        // Walk the optional subtags in their canonical order, visiting only those present.
        let optional_subtags = self.script.iter().chain(&self.region).chain(&self.variant);
        for subtag in optional_subtags {
            write!(f, "-{}", subtag)?;
        }
        Ok(())
    }
}
impl FromStr for LanguageTag {
type Err = LanguageTagError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
LanguageTag::parse(s)
}
}
/// Expands a Wikipedia edition code into a `(code, url)` pair, where `url` is the
/// location of that edition's latest `pages-articles` dump. The URL is assembled at
/// compile time, so each entry is still a plain `&'static str` literal.
macro_rules! wiki_dump_entry {
    ($code:literal) => {
        (
            $code,
            concat!(
                "https://dumps.wikimedia.org/",
                $code,
                "wiki/latest/",
                $code,
                "wiki-latest-pages-articles.xml.bz2"
            ),
        )
    };
}

/// Known Wikipedia editions paired with the URL of their latest `pages-articles` dump.
///
/// Entries are `(edition code, dump URL)` and keep a fixed order.
pub const WIKIPEDIA_URLS: &[(&str, &str)] = &[
    wiki_dump_entry!("en"),
    wiki_dump_entry!("simple"),
    wiki_dump_entry!("de"),
    wiki_dump_entry!("fr"),
    wiki_dump_entry!("es"),
    wiki_dump_entry!("pt"),
    wiki_dump_entry!("it"),
    wiki_dump_entry!("ru"),
    wiki_dump_entry!("zh"),
    wiki_dump_entry!("ja"),
    wiki_dump_entry!("ko"),
    wiki_dump_entry!("ar"),
    wiki_dump_entry!("nl"),
    wiki_dump_entry!("pl"),
    wiki_dump_entry!("sv"),
];
/// Returns the URL of the latest `pages-articles` dump for the Wikipedia edition `lang`.
///
/// Entries listed in [`WIKIPEDIA_URLS`] are returned verbatim. Any other code falls back
/// to the standard dump naming pattern `<db>wiki/latest/<db>wiki-latest-pages-articles.xml.bz2`.
///
/// Wikimedia database names use underscores where the edition code has hyphens
/// (`zh-min-nan` is published as `zh_min_nanwiki`), so hyphens in `lang` are translated
/// to underscores when the fallback URL is built. Codes without hyphens are unaffected.
pub fn wikipedia_dump_url(lang: &str) -> String {
    WIKIPEDIA_URLS
        .iter()
        .find(|(code, _)| *code == lang)
        .map(|(_, url)| (*url).to_string())
        .unwrap_or_else(|| {
            let db = lang.replace('-', "_");
            format!(
                "https://dumps.wikimedia.org/{0}wiki/latest/{0}wiki-latest-pages-articles.xml.bz2",
                db
            )
        })
}
#[cfg(test)]
mod tests {
    use super::*;

    // A bare language code parses to a tag with no script and no region.
    #[test]
    fn test_parse_simple() {
        let parsed = "en".parse::<LanguageTag>().unwrap();
        assert_eq!(parsed.language(), "en");
        assert_eq!(parsed.script(), None);
        assert_eq!(parsed.region(), None);
    }

    // `language-REGION` input exposes both subtags.
    #[test]
    fn test_parse_with_region() {
        let parsed = "en-US".parse::<LanguageTag>().unwrap();
        assert_eq!(parsed.language(), "en");
        assert_eq!(parsed.region(), Some("US"));
    }

    // `language-Script` input exposes both subtags.
    #[test]
    fn test_parse_with_script() {
        let parsed = "zh-Hans".parse::<LanguageTag>().unwrap();
        assert_eq!(parsed.language(), "zh");
        assert_eq!(parsed.script(), Some("Hans"));
    }

    // Formatting joins the present subtags with hyphens.
    #[test]
    fn test_display() {
        let american_english = LanguageTag::with_region("en", "US");
        assert_eq!(american_english.to_string(), "en-US");

        let simplified_chinese = LanguageTag::with_script("zh", "Hans");
        assert_eq!(simplified_chinese.to_string(), "zh-Hans");
    }

    // Regional tags match their bare language, but neither a different region
    // nor a different language.
    #[test]
    fn test_matches() {
        let english = LanguageTag::new("en");
        let american = LanguageTag::with_region("en", "US");
        let british = LanguageTag::with_region("en", "GB");
        let german = LanguageTag::new("de");

        assert!(american.matches(&english));
        assert!(british.matches(&english));
        assert!(!american.matches(&british));
        assert!(!german.matches(&english));
    }

    // Bare languages map to a single path segment; regional tags nest under the language.
    #[test]
    fn test_to_path() {
        let english = LanguageTag::new("en");
        assert_eq!(english.to_path(), "en");

        let american = LanguageTag::with_region("en", "US");
        assert_eq!(american.to_path(), "en/en-US");
    }
}