#![deny(missing_docs)]
#![cfg_attr(test, deny(warnings))]
mod iana_registry;
#[cfg(feature = "serde")]
mod serde;
use crate::iana_registry::*;
use std::error::Error;
use std::fmt;
use std::iter::once;
use std::ops::Deref;
use std::str::FromStr;
use std::str::Split;
/// A language tag (in the style of BCP 47 / RFC 5646) kept as a single string.
///
/// The individual components are not stored separately. Instead, each `*_end`
/// field is the byte offset inside `serialization` at which the corresponding
/// component ends; a component that is absent has the same end offset as the
/// component before it. The accessor methods slice `serialization` using
/// these offsets.
#[derive(Eq, PartialEq, Debug, Clone, Hash)]
pub struct LanguageTag {
// The complete tag text; every accessor returns a slice of this string.
serialization: String,
// Offset one past the primary language subtag.
language_end: usize,
// Offset one past the extended language subtags (equals `language_end` when there are none).
extlang_end: usize,
// Offset one past the script subtag (equals `extlang_end` when there is none).
script_end: usize,
// Offset one past the region subtag (equals `script_end` when there is none).
region_end: usize,
// Offset one past the variant subtags (equals `region_end` when there are none).
variant_end: usize,
// Offset one past the extension subtags (equals `variant_end` when there are none).
// Anything after this offset is the private use part.
extension_end: usize,
}
impl LanguageTag {
/// Returns the whole tag as a string slice, exactly as it is stored.
#[inline]
pub fn as_str(&self) -> &str {
    self.serialization.as_str()
}
#[inline]
pub fn into_string(self) -> String {
self.serialization
}
/// Returns the primary language subtag, i.e. the text before `language_end`.
///
/// For tags built by `tag_from_primary_language` every offset equals the
/// length of the tag, so the whole tag is returned.
#[inline]
pub fn primary_language(&self) -> &str {
    let (language, _rest) = self.serialization.split_at(self.language_end);
    language
}
/// Returns the extended language part, or `None` when the tag has none.
///
/// When several extended language subtags are present they are returned
/// together, still separated by `-`.
#[inline]
pub fn extended_language(&self) -> Option<&str> {
    if self.extlang_end == self.language_end {
        return None;
    }
    // Skip the `-` that separates the primary language from the extlang.
    let start = self.language_end + 1;
    Some(&self.serialization[start..self.extlang_end])
}
/// Iterates over the extended language subtags one by one.
///
/// The iterator is empty when the tag has no extended language.
#[inline]
pub fn extended_language_subtags(&self) -> impl Iterator<Item = &str> {
    let extlangs = match self.extended_language() {
        Some(extlangs) => extlangs,
        None => "",
    };
    extlangs.split_terminator('-')
}
/// Returns the primary language together with any extended language
/// subtags, i.e. the text before `extlang_end`.
#[inline]
pub fn full_language(&self) -> &str {
    self.serialization.split_at(self.extlang_end).0
}
/// Returns the script subtag, or `None` when the tag has none.
#[inline]
pub fn script(&self) -> Option<&str> {
    if self.script_end == self.extlang_end {
        return None;
    }
    // Skip the `-` in front of the script.
    let start = self.extlang_end + 1;
    Some(&self.serialization[start..self.script_end])
}
/// Returns the region subtag, or `None` when the tag has none.
#[inline]
pub fn region(&self) -> Option<&str> {
    if self.region_end == self.script_end {
        return None;
    }
    // Skip the `-` in front of the region.
    let start = self.script_end + 1;
    Some(&self.serialization[start..self.region_end])
}
/// Returns the variant part, or `None` when the tag has none.
///
/// When several variant subtags are present they are returned together,
/// still separated by `-`.
#[inline]
pub fn variant(&self) -> Option<&str> {
    if self.variant_end == self.region_end {
        return None;
    }
    // Skip the `-` in front of the first variant.
    let start = self.region_end + 1;
    Some(&self.serialization[start..self.variant_end])
}
/// Iterates over the variant subtags one by one.
///
/// The iterator is empty when the tag has no variant.
#[inline]
pub fn variant_subtags(&self) -> impl Iterator<Item = &str> {
    let variants = match self.variant() {
        Some(variants) => variants,
        None => "",
    };
    variants.split_terminator('-')
}
/// Returns the extension part, or `None` when the tag has none.
///
/// All extension sequences (singleton plus its subtags) are returned
/// together, still separated by `-`.
#[inline]
pub fn extension(&self) -> Option<&str> {
    if self.extension_end == self.variant_end {
        return None;
    }
    // Skip the `-` in front of the first singleton.
    let start = self.variant_end + 1;
    Some(&self.serialization[start..self.extension_end])
}
/// Iterates over the extensions as `(singleton, content)` pairs.
///
/// The content of one extension keeps its inner `-` separators. The
/// iterator is empty when the tag has no extension.
#[inline]
pub fn extension_subtags(&self) -> impl Iterator<Item = (char, &str)> {
    ExtensionsIterator::new(self.extension().unwrap_or(""))
}
/// Returns the private use part including its leading `x-`, or `None`
/// when the tag has none.
///
/// A tag that starts with `x-` is returned whole; otherwise the private
/// use part is whatever follows `extension_end`.
#[inline]
pub fn private_use(&self) -> Option<&str> {
    let tag = self.serialization.as_str();
    if tag.starts_with("x-") {
        return Some(tag);
    }
    if self.extension_end == tag.len() {
        return None;
    }
    // Skip the `-` that precedes the `x` singleton.
    Some(&tag[self.extension_end + 1..])
}
/// Iterates over the private use subtags, without the leading `x`.
///
/// The iterator is empty when the tag has no private use part.
#[inline]
pub fn private_use_subtags(&self) -> impl Iterator<Item = &str> {
    let subtags = match self.private_use() {
        // Drop the leading `x-`.
        Some(private_use) => &private_use[2..],
        None => "",
    };
    subtags.split_terminator('-')
}
/// Parses `input` into a [`LanguageTag`], checking only that it is well-formed.
///
/// Three shapes are recognised, in this order:
///
/// 1. a grandfathered tag (an entry of `GRANDFATHEREDS`, compared ASCII
///    case-insensitively), stored with the spelling used in that table;
/// 2. a tag made only of private use subtags (`x-...`), stored lowercased;
/// 3. any other tag, which is handed to `parse_language_tag`.
///
/// Subtags are not looked up in the registry here; use
/// [`LanguageTag::validate`] for that.
///
/// # Errors
///
/// For a private-use-only tag:
/// [`ParseError::ForbiddenChar`] if a character is neither ASCII
/// alphanumeric nor `-`, [`ParseError::EmptyPrivateUse`] if nothing follows
/// `x-`, [`ParseError::EmptySubtag`] if a subtag is empty and
/// [`ParseError::SubtagTooLong`] if a subtag is longer than 8 characters.
/// Other tags return whatever `parse_language_tag` reports.
pub fn parse(input: &str) -> Result<Self, ParseError> {
    if let Some(tag) = GRANDFATHEREDS
        .iter()
        .find(|record| record.eq_ignore_ascii_case(input))
    {
        return Ok(tag_from_primary_language(*tag));
    }
    if !(input.starts_with("x-") || input.starts_with("X-")) {
        return parse_language_tag(input);
    }

    // Private-use-only tag.
    if !is_alphanumeric_or_dash(input) {
        return Err(ParseError::ForbiddenChar);
    }
    if input.len() == 2 {
        return Err(ParseError::EmptyPrivateUse);
    }
    // Each private use subtag must be 1 to 8 alphanumeric characters, like
    // the private use subtags accepted at the end of a regular tag.
    // `x-` is ASCII, so byte offset 2 is a character boundary.
    for subtag in input[2..].split('-') {
        if subtag.is_empty() {
            return Err(ParseError::EmptySubtag);
        }
        if subtag.len() > 8 {
            return Err(ParseError::SubtagTooLong);
        }
    }
    Ok(tag_from_primary_language(input.to_ascii_lowercase()))
}
pub fn validate(&self) -> Result<(), ValidationError> {
if self.serialization.starts_with("x-") {
return Ok(());
}
if is_in_str_slice_set(&GRANDFATHEREDS, &self.serialization) {
return Ok(());
}
if let Some(extended_language) = self.extended_language() {
if extended_language.contains('-') {
return Err(ValidationError::MultipleExtendedLanguageSubtags);
}
}
let primary_language = self.primary_language();
if !between(primary_language, "qaa", "qtz")
&& !is_in_from_str_slice_set(&LANGUAGES, primary_language)
{
return Err(ValidationError::PrimaryLanguageNotInRegistry);
}
if let Some(extended_language) = self.extended_language() {
if let Some(extended_language_prefix) =
find_in_from_str_slice_map(&EXTLANGS, extended_language)
{
if !self.serialization.starts_with(extended_language_prefix) {
return Err(ValidationError::WrongExtendedLanguagePrefix);
}
} else {
return Err(ValidationError::ExtendedLanguageNotInRegistry);
}
}
if let Some(script) = self.script() {
if !between(script, "Qaaa", "Qabx") && !is_in_from_str_slice_set(&SCRIPTS, script) {
return Err(ValidationError::ScriptNotInRegistry);
}
}
if let Some(region) = self.region() {
if !between(region, "QM", "QZ")
&& !between(region, "XA", "XZ")
&& !is_in_from_str_slice_set(®IONS, region)
{
return Err(ValidationError::RegionNotInRegistry);
}
}
for variant in self.variant_subtags() {
if let Some(variant_prefixes) = find_in_str_slice_map(&VARIANTS, variant) {
if !variant_prefixes
.split(' ')
.any(|prefix| self.serialization.starts_with(prefix))
{
return Err(ValidationError::WrongVariantPrefix);
}
} else {
return Err(ValidationError::VariantNotInRegistry);
}
}
let with_duplicate_variant = self.variant_subtags().enumerate().any(|(id1, variant1)| {
self.variant_subtags()
.enumerate()
.any(|(id2, variant2)| id1 != id2 && variant1 == variant2)
});
if with_duplicate_variant {
return Err(ValidationError::DuplicateVariant);
}
if let Some(extension) = self.extension() {
let mut seen_extensions = AlphanumericLowerCharSet::new();
let with_duplicate_extension = extension.split('-').any(|subtag| {
if subtag.len() == 1 {
let extension = subtag.chars().next().unwrap();
if seen_extensions.contains(extension) {
true
} else {
seen_extensions.insert(extension);
false
}
} else {
false
}
});
if with_duplicate_extension {
return Err(ValidationError::DuplicateExtension);
}
}
Ok(())
}
/// Returns `true` when [`LanguageTag::validate`] reports no error.
pub fn is_valid(&self) -> bool {
    match self.validate() {
        Ok(()) => true,
        Err(_) => false,
    }
}
pub fn canonicalize(&self) -> Result<LanguageTag, ValidationError> {
if self.serialization.starts_with("x-") {
return Ok(self.clone());
}
if is_in_str_slice_set(&GRANDFATHEREDS, &self.serialization) {
return Ok(
if let Some(preferred_value) =
find_in_str_slice_map(&GRANDFATHEREDS_PREFERRED_VALUE, &self.serialization)
{
Self::parse(preferred_value).unwrap()
} else {
self.clone()
},
);
}
if let Some(preferred_value) =
find_in_str_slice_map(&REDUNDANTS_PREFERRED_VALUE, &self.serialization)
{
return Ok(Self::parse(preferred_value).unwrap());
}
let mut primary_language = self.primary_language();
if let Some(preferred_value) =
find_in_from_str_slice_map(&LANGUAGES_PREFERRED_VALUE, primary_language)
{
primary_language = preferred_value;
}
let mut extended_language = None;
if let Some(extlang) = self.extended_language() {
if extlang.contains('-') {
return Err(ValidationError::MultipleExtendedLanguageSubtags);
}
if let Some(preferred_value) =
find_in_from_str_slice_map(&EXTLANGS_PREFERRED_VALUE, extlang)
{
primary_language = preferred_value;
} else {
extended_language = Some(extlang);
}
}
let mut serialization = String::with_capacity(self.serialization.len());
serialization.push_str(primary_language);
let language_end = serialization.len();
if let Some(extended_language) = extended_language {
serialization.push('-');
serialization.push_str(extended_language);
}
let extlang_end = serialization.len();
if let Some(script) = self.script() {
let script =
find_in_from_str_slice_map(&SCRIPTS_PREFERRED_VALUE, script).unwrap_or(script);
let match_suppress_script =
find_in_from_str_slice_map(&LANGUAGES_SUPPRESS_SCRIPT, primary_language)
.filter(|suppress_script| *suppress_script == script)
.is_some();
if !match_suppress_script {
serialization.push('-');
serialization.push_str(script);
}
}
let script_end = serialization.len();
if let Some(region) = self.region() {
serialization.push('-');
serialization.push_str(
find_in_from_str_slice_map(®IONS_PREFERRED_VALUE, region).unwrap_or(region),
);
}
let region_end = serialization.len();
for variant in self.variant_subtags() {
let variant =
*find_in_str_slice_map(&VARIANTS_PREFERRED_VALUE, variant).unwrap_or(&variant);
let variant_already_exists = serialization.split('-').any(|subtag| subtag == variant);
if !variant_already_exists {
serialization.push('-');
serialization.push_str(variant);
}
}
let variant_end = serialization.len();
if self.extension().is_some() {
let mut extensions: Vec<_> = self.extension_subtags().collect();
extensions.sort_unstable();
for (k, v) in extensions {
serialization.push('-');
serialization.push(k);
serialization.push('-');
serialization.push_str(v);
}
}
let extension_end = serialization.len();
if let Some(private_use) = self.private_use() {
serialization.push('-');
serialization.push_str(private_use);
}
Ok(LanguageTag {
serialization,
language_end,
extlang_end,
script_end,
region_end,
variant_end,
extension_end,
})
}
/// Tells whether `other` is matched by `self`, where `self` acts as a
/// language range.
///
/// The full language (primary plus extended language) must be equal.
/// A script or region that the range does not specify is ignored; one that
/// the range specifies must be present and equal in `other`. Variants are
/// compared position by position: every variant of the range must be
/// present at the same position in `other`, while additional trailing
/// variants of `other` are ignored.
///
/// # Panics
///
/// Panics if `self` is not a language range, i.e. if it has an extension or
/// a private use part (see [`LanguageTag::is_language_range`]).
pub fn matches(&self, other: &LanguageTag) -> bool {
    // A component missing from the range matches anything; a component the
    // range requires must exist in the tag and be equal.
    fn matches_option(range: Option<&str>, tag: Option<&str>) -> bool {
        match (range, tag) {
            (None, _) => true,
            (Some(range), Some(tag)) => range == tag,
            (Some(_), None) => false,
        }
    }
    // Every subtag of the range must be matched by the tag's subtag at the
    // same position. Pulling from `tag` (instead of zipping) makes a range
    // that is longer than the tag fail rather than being silently truncated.
    fn matches_iter<'a>(
        mut range: impl Iterator<Item = &'a str>,
        mut tag: impl Iterator<Item = &'a str>,
    ) -> bool {
        range.all(|expected| tag.next() == Some(expected))
    }
    assert!(self.is_language_range());
    self.full_language() == other.full_language()
        && matches_option(self.script(), other.script())
        && matches_option(self.region(), other.region())
        && matches_iter(self.variant_subtags(), other.variant_subtags())
}
/// Returns `true` when the tag has neither an extension nor a private use
/// part, which is what [`LanguageTag::matches`] requires of its receiver.
pub fn is_language_range(&self) -> bool {
    !(self.extension().is_some() || self.private_use().is_some())
}
}
impl FromStr for LanguageTag {
type Err = ParseError;
#[inline]
fn from_str(input: &str) -> Result<Self, ParseError> {
Self::parse(input)
}
}
// Displays the tag as its stored serialization; formatter flags such as
// width or padding are not applied.
impl fmt::Display for LanguageTag {
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.serialization)
    }
}
/// Wraps `tag` in a `LanguageTag` whose whole text counts as the primary
/// language: every component offset is set to the length of the tag, so all
/// optional components read as absent.
fn tag_from_primary_language(tag: impl Into<String>) -> LanguageTag {
    let serialization: String = tag.into();
    let whole = serialization.len();
    LanguageTag {
        language_end: whole,
        extlang_end: whole,
        script_end: whole,
        region_end: whole,
        variant_end: whole,
        extension_end: whole,
        serialization,
    }
}
/// Parses a regular language tag (neither grandfathered nor private-use-only).
///
/// The input is split on `-` and fed, subtag by subtag, through a small state
/// machine. Each accepted subtag is appended to `serialization` in normalized
/// case (lowercase, except: script with an uppercase first letter, region in
/// uppercase) and the end offset of the component it belongs to is recorded.
/// Because every accepted subtag is ASCII and keeps its length, offsets in
/// `input` are also valid offsets in `serialization`.
///
/// Returns a `ParseError` as soon as a subtag is empty, longer than 8 bytes,
/// or does not fit any component allowed in the current state, and at the end
/// if an extension singleton or the `x` singleton has nothing after it.
fn parse_language_tag(input: &str) -> Result<LanguageTag, ParseError> {
// Position reached in the tag; it decides which components may still follow.
#[derive(PartialEq, Eq)]
enum State {
// Nothing read yet: the primary language is expected.
Start,
// After a primary language of 2 or 3 letters (extlangs may follow).
AfterLanguage,
// After a primary language of 4 letters or more (no extlang may follow).
AfterExtLang,
AfterScript,
// After the region or after a variant (only variants, extensions or private use may follow).
AfterRegion,
// Inside an extension; `expected` is true while the singleton has no subtag yet.
InExtension { expected: bool },
// Inside the private use part; `expected` is true while `x` has no subtag yet.
InPrivateUse { expected: bool },
}
let mut serialization = String::with_capacity(input.len());
let mut state = State::Start;
// End offsets of the components; 0 means "not seen" and is fixed up at the end.
let mut language_end = 0;
let mut extlang_end = 0;
let mut script_end = 0;
let mut region_end = 0;
let mut variant_end = 0;
let mut extension_end = 0;
let mut extlangs_count = 0;
for (subtag, end) in SubTagIterator::new(input) {
if subtag.is_empty() {
return Err(ParseError::EmptySubtag);
}
// Every subtag is at most eight characters long.
if subtag.len() > 8 {
return Err(ParseError::SubtagTooLong);
}
if state == State::Start {
// Primary language: at least two ASCII letters.
if subtag.len() < 2 || !is_alphabetic(subtag) {
return Err(ParseError::InvalidLanguage);
}
language_end = end;
serialization.extend(to_lowercase(subtag));
if subtag.len() < 4 {
// Extlangs are only allowed after a short (2 or 3 letter) language.
state = State::AfterLanguage;
} else {
state = State::AfterExtLang;
}
} else if let State::InPrivateUse { .. } = state {
// Once in the private use part, everything that follows belongs to it.
if !is_alphanumeric(subtag) {
return Err(ParseError::InvalidSubtag);
}
serialization.push('-');
serialization.extend(to_lowercase(subtag));
state = State::InPrivateUse { expected: false };
} else if subtag == "x" || subtag == "X" {
// Start of the private use part; a pending extension must not be left empty.
if let State::InExtension { expected: true } = state {
return Err(ParseError::EmptyExtension);
}
serialization.push('-');
serialization.push('x');
state = State::InPrivateUse { expected: true };
} else if subtag.len() == 1 && is_alphanumeric(subtag) {
// Any other one-character subtag is an extension singleton.
if let State::InExtension { expected: true } = state {
return Err(ParseError::EmptyExtension);
}
let extension_tag = subtag.chars().next().unwrap().to_ascii_lowercase();
serialization.push('-');
serialization.push(extension_tag);
state = State::InExtension { expected: true };
} else if let State::InExtension { .. } = state {
// Subtag belonging to the current extension.
if !is_alphanumeric(subtag) {
return Err(ParseError::InvalidSubtag);
}
extension_end = end;
serialization.push('-');
serialization.extend(to_lowercase(subtag));
state = State::InExtension { expected: false };
} else if state == State::AfterLanguage && subtag.len() == 3 && is_alphabetic(subtag) {
// Extended language: three letters, at most three of them.
// The state is left unchanged so that further extlangs can follow.
extlangs_count += 1;
if extlangs_count > 3 {
return Err(ParseError::TooManyExtlangs);
}
extlang_end = end;
serialization.push('-');
serialization.extend(to_lowercase(subtag));
} else if (state == State::AfterLanguage || state == State::AfterExtLang)
&& subtag.len() == 4
&& is_alphabetic(subtag)
{
// Script: four letters.
script_end = end;
serialization.push('-');
serialization.extend(to_uppercase_first(subtag));
state = State::AfterScript;
} else if (state == State::AfterLanguage
|| state == State::AfterExtLang
|| state == State::AfterScript)
&& (subtag.len() == 2 && is_alphabetic(subtag)
|| subtag.len() == 3 && is_numeric(subtag))
{
// Region: two letters or three digits.
region_end = end;
serialization.push('-');
serialization.extend(to_uppercase(subtag));
state = State::AfterRegion;
} else if (state == State::AfterLanguage
|| state == State::AfterExtLang
|| state == State::AfterScript
|| state == State::AfterRegion)
&& is_alphanumeric(subtag)
&& (subtag.len() >= 5 && is_alphabetic(&subtag[0..1])
|| subtag.len() >= 4 && is_numeric(&subtag[0..1]))
{
// Variant: alphanumeric, at least 5 characters when it starts with a letter,
// at least 4 when it starts with a digit. The `is_alphanumeric` check comes
// first so that slicing the first byte is safe.
variant_end = end;
serialization.push('-');
serialization.extend(to_lowercase(subtag));
state = State::AfterRegion;
} else {
return Err(ParseError::InvalidSubtag);
}
}
// The tag must not stop right after an extension singleton or after `x`.
if let State::InExtension { expected: true } = state {
return Err(ParseError::EmptyExtension);
}
if let State::InPrivateUse { expected: true } = state {
return Err(ParseError::EmptyPrivateUse);
}
// A component that was not seen ends where the previous one ends.
if extlang_end < language_end {
extlang_end = language_end;
}
if script_end < extlang_end {
script_end = extlang_end;
}
if region_end < script_end {
region_end = script_end;
}
if variant_end < region_end {
variant_end = region_end;
}
if extension_end < variant_end {
extension_end = variant_end;
}
Ok(LanguageTag {
serialization,
language_end,
extlang_end,
script_end,
region_end,
variant_end,
extension_end,
})
}
/// Iterator over the extensions of a tag, yielding `(singleton, content)`.
///
/// `input` is the not yet consumed part of the extension text, for example
/// `a-foo-bar-b-baz`; it always starts at a singleton.
struct ExtensionsIterator<'a> {
    input: &'a str,
}

impl<'a> ExtensionsIterator<'a> {
    /// Starts iterating over `input`, the extension part of a tag.
    fn new(input: &'a str) -> Self {
        ExtensionsIterator { input }
    }
}

impl<'a> Iterator for ExtensionsIterator<'a> {
    type Item = (char, &'a str);

    fn next(&mut self) -> Option<Self::Item> {
        let mut subtags = self.input.split_terminator('-');
        let key = subtags.next()?.chars().next().unwrap();
        // Number of bytes scanned so far, counting the singleton, every
        // content subtag seen and the `-` after each of them.
        let mut consumed: usize = 2;
        for subtag in subtags {
            if subtag.len() != 1 {
                consumed += subtag.len() + 1;
                continue;
            }
            // A one-character subtag is the next singleton: the content of
            // the current extension stops just before the `-` preceding it.
            let value = &self.input[2..consumed - 1];
            self.input = &self.input[consumed..];
            return Some((key, value));
        }
        // Last extension: everything after the singleton and its `-`.
        let rest = std::mem::replace(&mut self.input, "");
        rest.get(2..).map(|value| (key, value))
    }
}
/// Iterator over the `-`-separated subtags of a string, yielding each subtag
/// together with the byte offset at which it ends in the input.
struct SubTagIterator<'a> {
    split: Split<'a, char>,
    // Byte offset in the input at which the next subtag starts.
    position: usize,
}

impl<'a> SubTagIterator<'a> {
    /// Starts iterating over the subtags of `input`.
    fn new(input: &'a str) -> Self {
        SubTagIterator {
            position: 0,
            split: input.split('-'),
        }
    }
}

impl<'a> Iterator for SubTagIterator<'a> {
    type Item = (&'a str, usize);

    fn next(&mut self) -> Option<Self::Item> {
        match self.split.next() {
            None => None,
            Some(subtag) => {
                let end = self.position + subtag.len();
                // The next subtag starts after the `-` that follows this one.
                self.position = end + 1;
                Some((subtag, end))
            }
        }
    }
}
/// Set of ASCII alphanumeric characters in which letters are tracked
/// case-insensitively (`a` and `A` share one slot).
///
/// Characters that are not ASCII alphanumeric are never part of the set.
struct AlphanumericLowerCharSet {
    // One flag per letter `a`..=`z`.
    alphabetic_set: [bool; 26],
    // One flag per digit `0`..=`9`.
    numeric_set: [bool; 10],
}

impl AlphanumericLowerCharSet {
    /// Creates an empty set.
    fn new() -> Self {
        Self {
            alphabetic_set: [false; 26],
            numeric_set: [false; 10],
        }
    }

    /// Tells whether `c` (in either case, for a letter) has been inserted.
    ///
    /// This is a read-only query, so it only needs a shared borrow.
    fn contains(&self, c: char) -> bool {
        if c.is_ascii_digit() {
            self.numeric_set[char_sub(c, '0')]
        } else if c.is_ascii_lowercase() {
            self.alphabetic_set[char_sub(c, 'a')]
        } else if c.is_ascii_uppercase() {
            self.alphabetic_set[char_sub(c, 'A')]
        } else {
            false
        }
    }

    /// Adds `c` to the set; characters that are not ASCII alphanumeric are
    /// ignored.
    fn insert(&mut self, c: char) {
        if c.is_ascii_digit() {
            self.numeric_set[char_sub(c, '0')] = true
        } else if c.is_ascii_lowercase() {
            self.alphabetic_set[char_sub(c, 'a')] = true
        } else if c.is_ascii_uppercase() {
            self.alphabetic_set[char_sub(c, 'A')] = true
        }
    }
}
/// Distance between the code points of `c1` and `c2`.
///
/// The caller must make sure that `c1` is not smaller than `c2`.
fn char_sub(c1: char, c2: char) -> usize {
    let upper = c1 as usize;
    let lower = c2 as usize;
    upper - lower
}
/// Tells whether `s` consists only of ASCII letters (true for an empty string).
fn is_alphabetic(s: &str) -> bool {
    // Bytes of non-ASCII characters are all >= 0x80 and thus never ASCII letters.
    s.bytes().all(|byte| byte.is_ascii_alphabetic())
}
/// Tells whether `s` consists only of ASCII digits (true for an empty string).
fn is_numeric(s: &str) -> bool {
    // Bytes of non-ASCII characters are all >= 0x80 and thus never ASCII digits.
    s.bytes().all(|byte| byte.is_ascii_digit())
}
/// Tells whether `s` consists only of ASCII letters and digits (true for an
/// empty string).
fn is_alphanumeric(s: &str) -> bool {
    // Bytes of non-ASCII characters are all >= 0x80 and thus never match.
    s.bytes().all(|byte| byte.is_ascii_alphanumeric())
}
/// Tells whether `s` consists only of ASCII letters, ASCII digits and `-`
/// (true for an empty string).
fn is_alphanumeric_or_dash(s: &str) -> bool {
    // Bytes of non-ASCII characters are all >= 0x80 and thus never match.
    s.bytes()
        .all(|byte| byte == b'-' || byte.is_ascii_alphanumeric())
}
/// Lazily yields the characters of `s` with ASCII letters uppercased; other
/// characters are passed through unchanged.
fn to_uppercase(s: &'_ str) -> impl Iterator<Item = char> + '_ {
    s.chars().map(|ch| char::to_ascii_uppercase(&ch))
}
/// Yields the characters of `s` with the first one ASCII-uppercased and all
/// following ones ASCII-lowercased.
///
/// Panics when `s` is empty, at the time of the call.
fn to_uppercase_first(s: &'_ str) -> impl Iterator<Item = char> + '_ {
    let mut rest = s.chars();
    let head = rest.next().unwrap().to_ascii_uppercase();
    let tail = rest.map(|ch| ch.to_ascii_lowercase());
    once(head).chain(tail)
}
/// Lazily yields the characters of `s` with ASCII letters lowercased; other
/// characters are passed through unchanged.
fn to_lowercase(s: &'_ str) -> impl Iterator<Item = char> + '_ {
    s.chars().map(|ch| char::to_ascii_lowercase(&ch))
}
/// Reasons why a string is not a well-formed language tag.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ParseError {
    /// An extension singleton is not followed by any subtag.
    EmptyExtension,
    /// The `x` singleton is not followed by any subtag.
    EmptyPrivateUse,
    /// The tag contains a character that is not allowed.
    ForbiddenChar,
    /// A subtag does not fit any component allowed at its position.
    InvalidSubtag,
    /// The primary language subtag is invalid.
    InvalidLanguage,
    /// A subtag is longer than eight characters.
    SubtagTooLong,
    /// A subtag is empty.
    EmptySubtag,
    /// More than three extended language subtags are present.
    TooManyExtlangs,
}

impl Error for ParseError {}

impl fmt::Display for ParseError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let message = match *self {
            ParseError::EmptyExtension => {
                "if an extension subtag is present, it must not be empty"
            }
            ParseError::EmptyPrivateUse => "if the `x` subtag is present, it must not be empty",
            ParseError::ForbiddenChar => "the langtag contains a char not allowed",
            ParseError::InvalidSubtag => {
                "a subtag fails to parse, it does not match any other subtags"
            }
            ParseError::InvalidLanguage => "the given language subtag is invalid",
            ParseError::SubtagTooLong => "a subtag may be eight characters in length at maximum",
            ParseError::EmptySubtag => "a subtag should not be empty",
            ParseError::TooManyExtlangs => "at maximum three extlangs are allowed",
        };
        f.write_str(message)
    }
}
/// Reasons why a well-formed language tag is not valid.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ValidationError {
    /// The same variant subtag appears more than once.
    DuplicateVariant,
    /// The same extension singleton appears more than once.
    DuplicateExtension,
    /// More than one extended language subtag is present.
    MultipleExtendedLanguageSubtags,
    /// The primary language is not in the IANA Language Subtag Registry.
    PrimaryLanguageNotInRegistry,
    /// The extended language is not in the IANA Language Subtag Registry.
    ExtendedLanguageNotInRegistry,
    /// The script is not in the IANA Language Subtag Registry.
    ScriptNotInRegistry,
    /// The region is not in the IANA Language Subtag Registry.
    RegionNotInRegistry,
    /// A variant is not in the IANA Language Subtag Registry.
    VariantNotInRegistry,
    /// The tag does not start with the prefix registered for its extended language.
    WrongExtendedLanguagePrefix,
    /// The tag starts with none of the prefixes registered for one of its variants.
    WrongVariantPrefix,
}

impl Error for ValidationError {}

impl fmt::Display for ValidationError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let message = match *self {
            ValidationError::DuplicateVariant => {
                "the same variant subtag is only allowed once in a tag"
            }
            ValidationError::DuplicateExtension => {
                "the same extension subtag is only allowed once in a tag"
            }
            ValidationError::MultipleExtendedLanguageSubtags => {
                "only one extended language subtag is allowed"
            }
            ValidationError::PrimaryLanguageNotInRegistry => {
                "the primary language is not in the IANA Language Subtag Registry"
            }
            ValidationError::ExtendedLanguageNotInRegistry => {
                "the extended language is not in the IANA Language Subtag Registry"
            }
            ValidationError::ScriptNotInRegistry => {
                "the script is not in the IANA Language Subtag Registry"
            }
            ValidationError::RegionNotInRegistry => {
                "the region is not in the IANA Language Subtag Registry"
            }
            ValidationError::VariantNotInRegistry => {
                "a variant is not in the IANA Language Subtag Registry"
            }
            ValidationError::WrongExtendedLanguagePrefix => {
                "the primary language is not the expected extended language prefix from the IANA Language Subtag Registry"
            }
            ValidationError::WrongVariantPrefix => {
                "the language tag has not one of the expected variant prefix from the IANA Language Subtag Registry"
            }
        };
        f.write_str(message)
    }
}
/// Tells whether `value` lies in the inclusive range from `start` to `end`.
fn between<T: Ord>(value: T, start: T, end: T) -> bool {
    (start..=end).contains(&value)
}
/// Tells whether `value` is an element of `slice`, using a binary search.
///
/// `slice` must be sorted for the answer to be meaningful.
fn is_in_str_slice_set(slice: &[&'static str], value: &str) -> bool {
    match slice.binary_search(&value) {
        Ok(_) => true,
        Err(_) => false,
    }
}
/// Tells whether `value`, once parsed as a `T`, is an element of `slice`,
/// using a binary search.
///
/// Returns `false` when `value` cannot be parsed as a `T`. `slice` must be
/// sorted for the answer to be meaningful.
fn is_in_from_str_slice_set<T: Copy + Ord + FromStr>(slice: &[T], value: &str) -> bool {
    if let Ok(key) = value.parse::<T>() {
        slice.binary_search(&key).is_ok()
    } else {
        false
    }
}
/// Looks up `value` among the keys of `slice` with a binary search and
/// returns a reference to the associated value, if any.
///
/// `slice` must be sorted by key for the answer to be meaningful.
fn find_in_str_slice_map<'a, V>(slice: &'a [(&'static str, V)], value: &str) -> Option<&'a V> {
    slice
        .binary_search_by_key(&value, |entry| entry.0)
        .ok()
        .map(|index| &slice[index].1)
}
/// Parses `value` as a key of type `K`, looks it up among the keys of `slice`
/// with a binary search and returns the associated value as a string slice.
///
/// Returns `None` when `value` cannot be parsed as a `K` or is not a key of
/// `slice`. `slice` must be sorted by key for the answer to be meaningful.
fn find_in_from_str_slice_map<'a, K: Copy + Ord + FromStr, V: Deref<Target = str>>(
    slice: &'a [(K, V)],
    value: &str,
) -> Option<&'a str> {
    let key = K::from_str(value).ok()?;
    slice
        .binary_search_by_key(&key, |entry| entry.0)
        .ok()
        .map(|index| slice[index].1.deref())
}