unic_locale_impl/lib.rs
1pub(crate) mod errors;
2pub mod extensions;
3pub mod parser;
4
5use errors::LocaleError;
6pub use extensions::{ExtensionType, ExtensionsMap};
7use std::str::FromStr;
8pub use unic_langid_impl::CharacterDirection;
9pub use unic_langid_impl::{subtags, LanguageIdentifier};
10
11/// `Locale` is a core struct representing a Unicode Locale Identifier.
12///
13/// A locale is made of two parts:
14/// * `id` - Unicode Language Identifier
15/// * `extensions` - A set of Unicode Extensions
16///
17/// `Locale` exposes all of the same methods as `LanguageIdentifier`, and
18/// on top of that is able to parse, manipulate and serialize unicode extension
19/// fields.
20///
21/// # Examples
22///
23/// ```
24/// use unic_locale_impl::Locale;
25///
26/// let loc: Locale = "en-US-u-ca-buddhist".parse()
27/// .expect("Failed to parse.");
28///
29/// assert_eq!(loc.id.language, "en");
30/// assert_eq!(loc.id.script, None);
31/// assert_eq!(loc.id.region, Some("US".parse().unwrap()));
32/// assert_eq!(loc.id.variants().len(), 0);
33/// assert_eq!(loc.extensions.unicode.keyword("ca")
34/// .expect("Getting keyword failed.")
35/// .collect::<Vec<_>>(),
36/// &["buddhist"]);
37/// ```
38///
39/// # Parsing
40///
41/// Unicode recognizes three levels of standard conformance for a locale:
42///
43/// * *well-formed* - syntactically correct
44/// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
45/// * *canonical* - valid and no deprecated codes or structure.
46///
47/// At the moment parsing normalizes a well-formed language identifier converting
48/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
49///
50/// Any bogus subtags will cause the parsing to fail with an error.
51/// No subtag validation is performed.
52///
53/// # Examples:
54///
55/// ```
56/// use unic_locale_impl::Locale;
57///
58/// let loc: Locale = "eN_latn_Us-Valencia_u-hC-H12".parse()
59/// .expect("Failed to parse.");
60///
61/// assert_eq!(loc.id.language, "en");
62/// assert_eq!(loc.id.script, Some("Latn".parse().unwrap()));
63/// assert_eq!(loc.id.region, Some("US".parse().unwrap()));
64/// assert_eq!(loc.id.variants().collect::<Vec<_>>(), &["valencia"]);
65/// ```
66#[derive(Debug, Default, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
67pub struct Locale {
68 pub id: LanguageIdentifier,
69 pub extensions: extensions::ExtensionsMap,
70}
71
72type PartsTuple = (
73 subtags::Language,
74 Option<subtags::Script>,
75 Option<subtags::Region>,
76 Vec<subtags::Variant>,
77 String,
78);
79
80impl Locale {
81 /// A constructor which takes a utf8 slice, parses it and
82 /// produces a well-formed `Locale`.
83 ///
84 /// # Examples
85 ///
86 /// ```
87 /// use unic_locale_impl::Locale;
88 ///
89 /// let loc = Locale::from_bytes("en-US-u-hc-h12".as_bytes())
90 /// .expect("Parsing failed.");
91 ///
92 /// assert_eq!(loc.to_string(), "en-US-u-hc-h12");
93 /// ```
94 pub fn from_bytes(v: &[u8]) -> Result<Self, LocaleError> {
95 Ok(parser::parse_locale(v)?)
96 }
97
98 /// A constructor which takes optional subtags as `AsRef<[u8]>`, parses them and
99 /// produces a well-formed `Locale`.
100 ///
101 /// # Examples
102 ///
103 /// ```
104 /// use unic_locale_impl::Locale;
105 ///
106 /// let loc = Locale::from_parts("fr".parse().unwrap(), None, Some("CA".parse().unwrap()), &[], None);
107 ///
108 /// assert_eq!(loc.to_string(), "fr-CA");
109 /// ```
110 pub fn from_parts(
111 language: subtags::Language,
112 script: Option<subtags::Script>,
113 region: Option<subtags::Region>,
114 variants: &[subtags::Variant],
115 extensions: Option<extensions::ExtensionsMap>,
116 ) -> Self {
117 let id = LanguageIdentifier::from_parts(language, script, region, variants);
118 Locale {
119 id,
120 extensions: extensions.unwrap_or_default(),
121 }
122 }
123
124 /// # Safety
125 ///
126 /// This function accepts subtags expecting variants
127 /// to be deduplicated and ordered.
128 pub const unsafe fn from_raw_parts_unchecked(
129 language: subtags::Language,
130 script: Option<subtags::Script>,
131 region: Option<subtags::Region>,
132 variants: Option<Box<[subtags::Variant]>>,
133 extensions: extensions::ExtensionsMap,
134 ) -> Self {
135 let id = LanguageIdentifier::from_raw_parts_unchecked(language, script, region, variants);
136 Self { id, extensions }
137 }
138
139 /// Consumes `Locale` and produces raw internal representations
140 /// of all subtags in form of `u64`/`u32`.
141 ///
142 /// Primarily used for storing internal representation and restoring via
143 /// `from_raw_parts_unchecked`.
144 ///
145 /// # Examples
146 ///
147 /// ```
148 /// use unic_locale_impl::Locale;
149 /// use tinystr::{TinyStr8, TinyStr4};
150 ///
151 /// let loc: Locale = "en-US".parse()
152 /// .expect("Parsing failed.");
153 ///
154 /// let (lang, script, region, variants, extensions) = loc.into_parts();
155 ///
156 /// let loc2 = Locale::from_parts(
157 /// lang,
158 /// script,
159 /// region,
160 /// &variants,
161 /// Some(extensions.parse().unwrap())
162 /// );
163 ///
164 /// assert_eq!(loc2.to_string(), "en-US");
165 /// ```
166 pub fn into_parts(self) -> PartsTuple {
167 let (lang, region, script, variants) = self.id.into_parts();
168 (lang, region, script, variants, self.extensions.to_string())
169 }
170
171 /// Compares a `Locale` to another `AsRef<Locale`
172 /// allowing for either side to use the missing fields as wildcards.
173 ///
174 /// This allows for matching between `en` (treated as `en-*-*-*`) and `en-US`.
175 ///
176 /// # Examples
177 ///
178 /// ```
179 /// use unic_locale_impl::Locale;
180 ///
181 /// let loc1: Locale = "en".parse()
182 /// .expect("Parsing failed.");
183 ///
184 /// let loc2: Locale = "en-US".parse()
185 /// .expect("Parsing failed.");
186 ///
187 /// assert_ne!(loc1, loc2); // "en" != "en-US"
188 /// assert_ne!(loc1.to_string(), loc2.to_string()); // "en" != "en-US"
189 ///
190 /// assert_eq!(loc1.matches(&loc2, false, false), false); // "en" != "en-US"
191 /// assert_eq!(loc1.matches(&loc2, true, false), true); // "en-*-*-*" == "en-US"
192 /// assert_eq!(loc1.matches(&loc2, false, true), false); // "en" != "en-*-US-*"
193 /// assert_eq!(loc1.matches(&loc2, true, true), true); // "en-*-*-*" == "en-*-US-*"
194 /// ```
195 pub fn matches<O: AsRef<Self>>(
196 &self,
197 other: &O,
198 self_as_range: bool,
199 other_as_range: bool,
200 ) -> bool {
201 let other = other.as_ref();
202 if !self.extensions.private.is_empty() || !other.extensions.private.is_empty() {
203 return false;
204 }
205 self.id.matches(&other.id, self_as_range, other_as_range)
206 }
207}
208
209impl FromStr for Locale {
210 type Err = LocaleError;
211
212 fn from_str(source: &str) -> Result<Self, Self::Err> {
213 Ok(parser::parse_locale(source)?)
214 }
215}
216
217impl From<LanguageIdentifier> for Locale {
218 fn from(id: LanguageIdentifier) -> Self {
219 Locale {
220 id,
221 extensions: ExtensionsMap::default(),
222 }
223 }
224}
225
226impl From<Locale> for LanguageIdentifier {
227 fn from(value: Locale) -> Self {
228 value.id
229 }
230}
231
232impl AsRef<LanguageIdentifier> for Locale {
233 fn as_ref(&self) -> &LanguageIdentifier {
234 &self.id
235 }
236}
237
238impl AsRef<Locale> for Locale {
239 #[inline(always)]
240 fn as_ref(&self) -> &Locale {
241 self
242 }
243}
244
245impl std::fmt::Display for Locale {
246 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
247 write!(f, "{}{}", self.id, self.extensions)
248 }
249}
250
251/// This is a best-effort operation that performs all available levels of canonicalization.
252///
253/// At the moment the operation will normalize casing and the separator, but in the future
254/// it may also validate and update from deprecated subtags to canonical ones.
255///
256/// # Examples
257///
258/// ```
259/// use unic_locale_impl::canonicalize;
260///
261/// assert_eq!(canonicalize("pL_latn_pl-U-HC-H12"), Ok("pl-Latn-PL-u-hc-h12".to_string()));
262/// ```
263pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, LocaleError> {
264 let locale = Locale::from_bytes(input.as_ref())?;
265 Ok(locale.to_string())
266}