icu_locid 1.1.0

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::ordering::SubtagOrderingResult;
use crate::parser::{
    parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension,
    ParserError, ParserMode, SubtagIterator,
};
use crate::{extensions, subtags, LanguageIdentifier};
use alloc::string::String;
use core::cmp::Ordering;
use core::str::FromStr;
use tinystr::TinyAsciiStr;
use writeable::Writeable;

/// A core struct representing a [`Unicode Locale Identifier`].
///
/// A locale is made of two parts:
///  * Unicode Language Identifier
///  * A set of Unicode Extensions
///
/// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and
/// on top of that is able to parse, manipulate and serialize unicode extension fields.
///
///
/// # Examples
///
/// ```
/// use icu_locid::{
///     extensions_unicode_key as key, extensions_unicode_value as value,
///     locale, subtags_language as language, subtags_region as region,
/// };
///
/// let loc = locale!("en-US-u-ca-buddhist");
///
/// assert_eq!(loc.id.language, language!("en"));
/// assert_eq!(loc.id.script, None);
/// assert_eq!(loc.id.region, Some(region!("US")));
/// assert_eq!(loc.id.variants.len(), 0);
/// assert_eq!(
///     loc.extensions.unicode.keywords.get(&key!("ca")),
///     Some(&value!("buddhist"))
/// );
/// ```
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for a locale:
///
///  * *well-formed* - syntactically correct
///  * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
///  * *canonical* - valid and no deprecated codes or structure.
///
/// At the moment parsing normalizes a well-formed locale identifier converting
/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
///
/// Any bogus subtags will cause the parsing to fail with an error.
/// No subtag validation or canonicalization is performed.
///
/// # Examples
///
/// ```
/// use icu::locid::{subtags::*, Locale};
///
/// let loc: Locale = "eN_latn_Us-Valencia_u-hC-H12"
///     .parse()
///     .expect("Failed to parse.");
///
/// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap());
/// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok());
/// assert_eq!(loc.id.region, "US".parse::<Region>().ok());
/// assert_eq!(
///     loc.id.variants.get(0),
///     "valencia".parse::<Variant>().ok().as_ref()
/// );
/// ```
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier
// `Debug` is deliberately absent from the derive list: it is implemented by hand
// later in this file by forwarding to `Writeable::write_to`.
#[derive(Default, PartialEq, Eq, Clone, Hash)]
#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
pub struct Locale {
    /// The basic language/script/region components in the locale identifier along with any variants.
    pub id: LanguageIdentifier,
    /// Any extensions present in the locale identifier.
    pub extensions: extensions::Extensions,
}

#[test]
fn test_sizes() {
    use core::mem::size_of;

    // Remove when we upgrade to a compiler where the new sizes are default
    let forced_nightly = std::env::var("ICU4X_BUILDING_WITH_FORCED_NIGHTLY").is_ok();
    // Selects the expected size for the toolchain in use: the first argument applies
    // when the forced-nightly environment variable is set, the second otherwise.
    let expected = |nightly: usize, otherwise: usize| {
        if forced_nightly {
            nightly
        } else {
            otherwise
        }
    };

    // Language identifier and its subtags.
    assert_eq!(size_of::<subtags::Language>(), 3);
    assert_eq!(size_of::<subtags::Script>(), 4);
    assert_eq!(size_of::<subtags::Region>(), 3);
    assert_eq!(size_of::<subtags::Variant>(), 8);
    assert_eq!(size_of::<subtags::Variants>(), 32);
    assert_eq!(size_of::<LanguageIdentifier>(), 48);

    // Transform extension.
    assert_eq!(size_of::<extensions::transform::Transform>(), 72);
    assert_eq!(size_of::<Option<LanguageIdentifier>>(), 48);
    assert_eq!(size_of::<extensions::transform::Fields>(), 24);

    // Unicode, other and private extensions, then the aggregate.
    assert_eq!(size_of::<extensions::unicode::Attributes>(), 24);
    assert_eq!(
        size_of::<extensions::unicode::Keywords>(),
        expected(40, 48)
    );
    assert_eq!(size_of::<Vec<extensions::other::Other>>(), 24);
    assert_eq!(size_of::<extensions::private::Private>(), 24);
    assert_eq!(size_of::<extensions::Extensions>(), expected(184, 192));

    // The full locale.
    assert_eq!(size_of::<Locale>(), expected(232, 240));
}

impl Locale {
    /// A constructor which takes a utf8 slice, parses it and
    /// produces a well-formed [`Locale`].
    ///
    /// This forwards directly to the crate's locale parser.
    ///
    /// # Errors
    ///
    /// Returns the [`ParserError`] reported by the parser when `v` cannot be
    /// parsed as a locale identifier.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locid::Locale;
    ///
    /// Locale::try_from_bytes(b"en-US-u-hc-h12").unwrap();
    /// ```
    pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> {
        parse_locale(v)
    }

    /// The default undefined locale "und". Same as [`default()`](Default::default()).
    ///
    /// The value is assembled from [`LanguageIdentifier::UND`] and a freshly
    /// constructed, empty set of extensions, so it can be used in `const` contexts.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locid::Locale;
    ///
    /// assert_eq!(Locale::default(), Locale::UND);
    /// ```
    pub const UND: Self = Self {
        id: LanguageIdentifier::UND,
        extensions: extensions::Extensions::new(),
    };

    /// Parses `input` as a locale and returns its serialized string form.
    ///
    /// This is a best-effort operation that applies every level of canonicalization
    /// currently available. Today that means casing and separators are normalized;
    /// in the future it may also validate subtags and replace deprecated ones with
    /// their canonical equivalents.
    ///
    /// # Errors
    ///
    /// Returns a [`ParserError`] if `input` cannot be parsed as a locale.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locid::Locale;
    ///
    /// assert_eq!(
    ///     Locale::canonicalize("pL_latn_pl-U-HC-H12").as_deref(),
    ///     Ok("pl-Latn-PL-u-hc-h12")
    /// );
    /// ```
    pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> {
        Self::try_from_bytes(input.as_ref()).map(|parsed| parsed.write_to_string().into_owned())
    }

    /// Orders this [`Locale`] relative to a BCP-47 byte string.
    ///
    /// The result matches what a plain byte comparison would give after serializing
    /// this [`Locale`] to its BCP-47 string.
    ///
    /// The comparison is case-sensitive and forms a *total order*, which makes it
    /// suitable for binary search. [`Ordering::Equal`] is produced only when `other`
    /// is exactly `self.to_string()`.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locid::Locale;
    /// use std::cmp::Ordering;
    ///
    /// let bcp47_strings: &[&str] = &[
    ///     "pl-Latn-PL",
    ///     "und",
    ///     "und-fonipa",
    ///     "und-t-m0-true",
    ///     "und-u-ca-hebrew",
    ///     "und-u-ca-japanese",
    ///     "zh",
    /// ];
    ///
    /// for ab in bcp47_strings.windows(2) {
    ///     let a = ab[0];
    ///     let b = ab[1];
    ///     assert!(a.cmp(b) == Ordering::Less);
    ///     let a_loc = a.parse::<Locale>().unwrap();
    ///     assert!(a_loc.strict_cmp(a.as_bytes()) == Ordering::Equal);
    ///     assert!(a_loc.strict_cmp(b.as_bytes()) == Ordering::Less);
    /// }
    /// ```
    pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
        // Break the input into subtags at every hyphen and compare subtag by subtag.
        let other_subtags = other.split(|&byte| byte == b'-');
        self.strict_cmp_iter(other_subtags).end()
    }

    /// Orders this [`Locale`] relative to a sequence of BCP-47 subtags.
    ///
    /// Equality semantics are the same as for [`Locale::strict_cmp`]. This variant is
    /// the more modular one: because the iterator is handed back when all of this
    /// locale's subtags matched, several subtag iterators can be chained together.
    ///
    /// For an additional example, see [`SubtagOrderingResult`].
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locid::locale;
    /// use std::cmp::Ordering;
    ///
    /// let subtags: &[&[u8]] =
    ///     &[b"ca", b"ES", b"valencia", b"u", b"ca", b"hebrew"];
    ///
    /// let loc = locale!("ca-ES-valencia-u-ca-hebrew");
    /// assert_eq!(
    ///     Ordering::Equal,
    ///     loc.strict_cmp_iter(subtags.iter().copied()).end()
    /// );
    ///
    /// let loc = locale!("ca-ES-valencia");
    /// assert_eq!(
    ///     Ordering::Less,
    ///     loc.strict_cmp_iter(subtags.iter().copied()).end()
    /// );
    ///
    /// let loc = locale!("ca-ES-valencia-u-nu-arab");
    /// assert_eq!(
    ///     Ordering::Greater,
    ///     loc.strict_cmp_iter(subtags.iter().copied()).end()
    /// );
    /// ```
    pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
    where
        I: Iterator<Item = &'l [u8]>,
    {
        // Walk this locale's subtags in order, pulling one subtag from `subtags` for
        // each; the first mismatch aborts the walk with the resulting ordering.
        let outcome = self.for_each_subtag_str(&mut |subtag| match subtags.next() {
            // The other side ran out of subtags first, so this locale sorts after it.
            None => Err(Ordering::Greater),
            Some(other) => match subtag.as_bytes().cmp(other) {
                Ordering::Equal => Ok(()),
                unequal => Err(unequal),
            },
        });
        match outcome {
            // Every subtag of this locale matched; return the rest of the iterator.
            Ok(()) => SubtagOrderingResult::Subtags(subtags),
            Err(ordering) => SubtagOrderingResult::Ordering(ordering),
        }
    }

    /// Checks whether a potentially unnormalized BCP-47 string denotes this `Locale`.
    ///
    /// The return value is intended to be equivalent to what would happen if you
    /// first parsed the BCP-47 string to a `Locale` and then performed a structural
    /// comparison.
    ///
    /// The subtags of `other` are consumed in order and matched positionally:
    /// first the language, then the script and region if this locale has them, then
    /// each of this locale's variants in its stored order. If this locale has
    /// extensions, the remaining subtags are parsed as extensions and compared with
    /// `self.extensions`. Any subtag of `other` left over afterwards makes the
    /// result `false`.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locid::Locale;
    /// use std::cmp::Ordering;
    ///
    /// let bcp47_strings: &[&str] = &[
    ///     "pl-LaTn-pL",
    ///     "uNd",
    ///     "UND-FONIPA",
    ///     "UnD-t-m0-TrUe",
    ///     "uNd-u-CA-Japanese",
    ///     "ZH",
    /// ];
    ///
    /// for a in bcp47_strings {
    ///     assert!(a.parse::<Locale>().unwrap().normalizing_eq(a));
    /// }
    /// ```
    pub fn normalizing_eq(&self, other: &str) -> bool {
        // Pulls the next subtag from `$iter`, parses it as `$T`, and reports whether
        // it equals `$expected`. A missing or unparsable subtag counts as a mismatch.
        macro_rules! subtag_matches {
            ($T:ty, $iter:ident, $expected:expr) => {
                $iter
                    .next()
                    .map_or(false, |b| <$T>::try_from_bytes(b) == Ok($expected))
            };
        }

        let mut iter = SubtagIterator::new(other.as_bytes());
        if !subtag_matches!(subtags::Language, iter, self.id.language) {
            return false;
        }
        if let Some(script) = &self.id.script {
            if !subtag_matches!(subtags::Script, iter, *script) {
                return false;
            }
        }
        if let Some(region) = &self.id.region {
            if !subtag_matches!(subtags::Region, iter, *region) {
                return false;
            }
        }
        for variant in self.id.variants.iter() {
            if !subtag_matches!(subtags::Variant, iter, *variant) {
                return false;
            }
        }
        // Extensions are only parsed when this locale has some; otherwise any
        // extension subtags in `other` are caught by the leftover check below.
        if !self.extensions.is_empty() {
            match extensions::Extensions::try_from_iter(&mut iter) {
                Ok(exts) if exts == self.extensions => {}
                // Either the extensions failed to parse or they differ.
                _ => return false,
            }
        }
        // `other` must be fully consumed for the two to be equal.
        iter.next().is_none()
    }

    /// `const` parsing entry point that returns the parsed pieces as a tuple
    /// instead of a [`Locale`].
    ///
    /// Forwards `v` to the crate's single-variant / single-keyword parser in
    /// [`ParserMode::Locale`]. On success the tuple holds, in order: the language,
    /// an optional script, an optional region, at most one variant, and at most one
    /// Unicode extension keyword (its key plus an optional value).
    ///
    /// NOTE(review): hidden from the public docs; presumably this exists to back
    /// the `locale!` macro in `const` contexts — confirm against the macro.
    #[doc(hidden)]
    #[allow(clippy::type_complexity)]
    pub const fn try_from_bytes_with_single_variant_single_keyword_unicode_extension(
        v: &[u8],
    ) -> Result<
        (
            subtags::Language,
            Option<subtags::Script>,
            Option<subtags::Region>,
            Option<subtags::Variant>,
            Option<(extensions::unicode::Key, Option<TinyAsciiStr<8>>)>,
        ),
        ParserError,
    > {
        parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
            v,
            ParserMode::Locale,
        )
    }

    /// Calls `f` on the subtags of the language identifier and then on the subtags
    /// of the extensions, stopping at and returning the first error `f` produces.
    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
    where
        F: FnMut(&str) -> Result<(), E>,
    {
        // The identifier's subtags are visited before the extensions' subtags.
        self.id.for_each_subtag_str(f)?;
        self.extensions.for_each_subtag_str(f)
    }
}

/// Enables `"…".parse::<Locale>()` by forwarding the string's bytes to
/// [`Locale::try_from_bytes`].
impl FromStr for Locale {
    type Err = ParserError;

    fn from_str(source: &str) -> Result<Self, Self::Err> {
        Self::try_from_bytes(source.as_bytes())
    }
}

/// Wraps a [`LanguageIdentifier`] in a [`Locale`], pairing it with default extensions.
impl From<LanguageIdentifier> for Locale {
    fn from(id: LanguageIdentifier) -> Self {
        Self {
            id,
            extensions: extensions::Extensions::default(),
        }
    }
}

/// Extracts the [`LanguageIdentifier`] from a [`Locale`]; the locale's extensions
/// are dropped.
impl From<Locale> for LanguageIdentifier {
    fn from(loc: Locale) -> Self {
        loc.id
    }
}

/// Borrows the [`LanguageIdentifier`] stored in the `id` field.
impl AsRef<LanguageIdentifier> for Locale {
    fn as_ref(&self) -> &LanguageIdentifier {
        &self.id
    }
}

/// Mutably borrows the [`LanguageIdentifier`] stored in the `id` field.
impl AsMut<LanguageIdentifier> for Locale {
    fn as_mut(&mut self) -> &mut LanguageIdentifier {
        &mut self.id
    }
}

/// `Debug` output is produced by the [`Writeable`] implementation rather than by
/// a field-by-field dump of the struct.
impl core::fmt::Debug for Locale {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Fully qualified call: the formatter is used directly as the write sink.
        writeable::Writeable::write_to(self, f)
    }
}

// NOTE(review): the macro is defined elsewhere in the crate. Judging by its name and
// arguments it presumably implements `Writeable` for `Locale` on top of
// `for_each_subtag_str`, takes the `selff.id.write_to_string()` path when
// `selff.extensions.is_empty()`, and generates no test of its own (a hand-written
// `test_writeable` follows) — confirm against the macro definition.
impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.write_to_string());

#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    // The undefined locale comes from the constant rather than from parsing.
    assert_writeable_eq!(Locale::UND, "und");

    // Each of these identifiers must serialize back to exactly the text it was
    // parsed from.
    let round_trips: &[&str] = &[
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
        "my-t-my-d0-zawgyi",
        "ar-SA-u-ca-islamic-civil",
        "en-001-x-foo-bar",
        "und-t-m0-true",
    ];
    for &tag in round_trips {
        assert_writeable_eq!(tag.parse::<Locale>().unwrap(), tag);
    }
}

/// # Examples
///
/// ```
/// use icu::locid::Locale;
/// use icu::locid::{locale, subtags_language as language};
///
/// assert_eq!(Locale::from(language!("en")), locale!("en"));
/// ```
impl From<subtags::Language> for Locale {
    fn from(language: subtags::Language) -> Self {
        Self {
            id: language.into(),
            ..Default::default()
        }
    }
}

/// # Examples
///
/// ```
/// use icu::locid::Locale;
/// use icu::locid::{locale, subtags_script as script};
///
/// assert_eq!(Locale::from(Some(script!("latn"))), locale!("und-Latn"));
/// ```
impl From<Option<subtags::Script>> for Locale {
    fn from(script: Option<subtags::Script>) -> Self {
        Self {
            id: script.into(),
            ..Default::default()
        }
    }
}

/// # Examples
///
/// ```
/// use icu::locid::Locale;
/// use icu::locid::{locale, subtags_region as region};
///
/// assert_eq!(Locale::from(Some(region!("US"))), locale!("und-US"));
/// ```
impl From<Option<subtags::Region>> for Locale {
    fn from(region: Option<subtags::Region>) -> Self {
        Self {
            id: region.into(),
            ..Default::default()
        }
    }
}

/// # Examples
///
/// ```
/// use icu::locid::Locale;
/// use icu::locid::{
///     locale, subtags_language as language, subtags_region as region,
///     subtags_script as script,
/// };
///
/// assert_eq!(
///     Locale::from((
///         language!("en"),
///         Some(script!("Latn")),
///         Some(region!("US"))
///     )),
///     locale!("en-Latn-US")
/// );
/// ```
impl
    From<(
        subtags::Language,
        Option<subtags::Script>,
        Option<subtags::Region>,
    )> for Locale
{
    fn from(
        lsr: (
            subtags::Language,
            Option<subtags::Script>,
            Option<subtags::Region>,
        ),
    ) -> Self {
        Self {
            id: lsr.into(),
            ..Default::default()
        }
    }
}