unic_locale_impl/extensions/unicode.rs
1use crate::errors::LocaleError;
2use crate::parser::ParserError;
3
4use std::collections::BTreeMap;
5use std::iter::Peekable;
6use std::ops::RangeInclusive;
7
8use tinystr::{TinyStr4, TinyStr8};
9
/// Exact byte length of a Unicode extension key subtag (checked by `parse_key`).
const KEY_LENGTH: usize = 2;
/// Accepted byte lengths of a keyword type subtag (checked by `parse_type` and `is_type`).
const TYPE_LENGTH: RangeInclusive<usize> = 3..=8;
/// Accepted byte lengths of an attribute subtag (checked by `parse_attribute` and `is_attribute`).
const ATTR_LENGTH: RangeInclusive<usize> = 3..=8;
14
/// A list of [`Unicode BCP47 U Extensions`] as defined in the
/// [`Unicode Locale Identifier`] specification.
///
/// Unicode extensions provide subtags that specify language and/or locale-based behavior
/// or refinements to language tags, according to work done by the Unicode Consortium.
/// (See [`RFC 6067`] for details).
///
/// The list holds two kinds of data: *keywords* (a two-character key followed by
/// zero or more type subtags) and *attributes* (stand-alone subtags).
///
/// # Examples
///
/// ```
/// use unic_locale_impl::Locale;
///
/// let loc: Locale = "de-u-hc-h12-ca-buddhist".parse()
///     .expect("Parsing failed.");
///
/// assert_eq!(loc.extensions.unicode.keyword("ca")
///                .expect("Getting keyword failed.")
///                .collect::<Vec<_>>(),
///            &["buddhist"]);
/// ```
///
/// [`Unicode BCP47 U Extensions`]: https://unicode.org/reports/tr35/#u_Extension
/// [`RFC 6067`]: https://www.ietf.org/rfc/rfc6067.txt
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
pub struct UnicodeExtensionList {
    // Keyword key -> type subtags, in the order they were given.
    // Canonical form: sorted by key (the BTreeMap keeps keys ordered) and the
    // type value "true" is never stored (it is dropped while parsing).
    keywords: BTreeMap<TinyStr4, Vec<TinyStr8>>,

    // Attribute subtags.
    // Canonical form: kept sorted and de-duplicated, which is what allows the
    // attribute methods to use binary search.
    attributes: Vec<TinyStr8>,
}
46
47fn parse_key(key: &[u8]) -> Result<TinyStr4, ParserError> {
48 if key.len() != KEY_LENGTH || !key[0].is_ascii_alphanumeric() || !key[1].is_ascii_alphabetic() {
49 return Err(ParserError::InvalidSubtag);
50 }
51 let key = TinyStr4::try_from_utf8(key).map_err(|_| ParserError::InvalidSubtag)?;
52 Ok(key.to_ascii_lowercase())
53}
54
/// The type value `true`; `parse_type` reports it as `None` so that it is never stored.
const TRUE_TYPE: TinyStr8 = tinystr::tinystr!(8, "true");
56
57fn parse_type(t: &[u8]) -> Result<Option<TinyStr8>, ParserError> {
58 let s = TinyStr8::try_from_utf8(t).map_err(|_| ParserError::InvalidSubtag)?;
59 if !TYPE_LENGTH.contains(&t.len()) || !s.is_ascii_alphanumeric() {
60 return Err(ParserError::InvalidSubtag);
61 }
62
63 let s = s.to_ascii_lowercase();
64
65 if s == TRUE_TYPE {
66 Ok(None)
67 } else {
68 Ok(Some(s))
69 }
70}
71
72fn parse_attribute(t: &[u8]) -> Result<TinyStr8, ParserError> {
73 let s = TinyStr8::try_from_utf8(t).map_err(|_| ParserError::InvalidSubtag)?;
74 if !ATTR_LENGTH.contains(&t.len()) || !s.is_ascii_alphanumeric() {
75 return Err(ParserError::InvalidSubtag);
76 }
77
78 Ok(s.to_ascii_lowercase())
79}
80
81fn is_type(t: &[u8]) -> bool {
82 let slen = t.len();
83 TYPE_LENGTH.contains(&slen) && !t.iter().any(|c: &u8| !c.is_ascii_alphanumeric())
84}
85
86fn is_attribute(t: &[u8]) -> bool {
87 let slen = t.len();
88 ATTR_LENGTH.contains(&slen) && !t.iter().any(|c: &u8| !c.is_ascii_alphanumeric())
89}
90
91impl UnicodeExtensionList {
92 /// Returns `true` if there are no keywords and no attributes in
93 /// the `UnicodeExtensionList`.
94 ///
95 /// # Examples
96 ///
97 /// ```
98 /// use unic_locale_impl::Locale;
99 ///
100 /// let mut loc: Locale = "en-US-u-foo".parse()
101 /// .expect("Parsing failed.");
102 ///
103 /// assert_eq!(loc.extensions.unicode.is_empty(), false);
104 /// ```
105 pub fn is_empty(&self) -> bool {
106 self.keywords.is_empty() && self.attributes.is_empty()
107 }
108
109 /// Returns the value of keyword in the `UnicodeExtensionList`.
110 ///
111 /// # Examples
112 ///
113 /// ```
114 /// use unic_locale_impl::Locale;
115 ///
116 /// let mut loc: Locale = "en-US-u-ca-buddhist".parse()
117 /// .expect("Parsing failed.");
118 ///
119 /// assert_eq!(loc.extensions.unicode.keyword("ca")
120 /// .expect("Getting keyword failed.")
121 /// .collect::<Vec<_>>(),
122 /// &["buddhist"]);
123 ///
124 /// // Here keyword with key "aa" is not available
125 /// assert_eq!(loc.extensions.unicode.keyword("aa")
126 /// .expect("Getting keyword failed.")
127 /// .len(),
128 /// 0);
129 /// ```
130 pub fn keyword<S: AsRef<[u8]>>(
131 &self,
132 key: S,
133 ) -> Result<impl ExactSizeIterator<Item = &str>, LocaleError> {
134 let keywords: &[_] = match self.keywords.get(&parse_key(key.as_ref())?) {
135 Some(v) => v,
136 None => &[],
137 };
138
139 Ok(keywords.iter().map(|s| s.as_ref()))
140 }
141
142 /// Returns an iterator over all keys in the `UnicodeExtensionList`.
143 ///
144 /// # Examples
145 ///
146 /// ```
147 /// use unic_locale_impl::Locale;
148 ///
149 /// let mut loc: Locale = "en-US-u-ca-buddhist-nu-thai".parse()
150 /// .expect("Parsing failed.");
151 ///
152 /// assert_eq!(loc.extensions.unicode.keyword_keys().collect::<Vec<_>>(),
153 /// &["ca", "nu"]);
154 /// ```
155 pub fn keyword_keys(&self) -> impl ExactSizeIterator<Item = &str> {
156 self.keywords.keys().map(|s| s.as_ref())
157 }
158
159 /// Adds a keyword to the `UnicodeExtensionList` or sets value for key if
160 /// keyword is already included in the `UnicodeExtensionList`.
161 ///
162 /// # Examples
163 ///
164 /// ```
165 /// use unic_locale_impl::Locale;
166 ///
167 /// let mut loc: Locale = "en-US".parse()
168 /// .expect("Parsing failed.");
169 ///
170 /// loc.extensions.unicode.set_keyword("ca", &["buddhist"])
171 /// .expect("Setting keyword failed.");
172 ///
173 /// assert_eq!(loc.to_string(), "en-US-u-ca-buddhist");
174 ///
175 /// loc.extensions.unicode.set_keyword("ca", &["chinese"])
176 /// .expect("Setting keyword failed.");
177 ///
178 /// assert_eq!(loc.to_string(), "en-US-u-ca-chinese");
179 /// ```
180 pub fn set_keyword<S: AsRef<[u8]>>(&mut self, key: S, value: &[S]) -> Result<(), LocaleError> {
181 let key = parse_key(key.as_ref())?;
182
183 let t = value
184 .iter()
185 .filter_map(|t| parse_type(t.as_ref()).transpose())
186 .collect::<Result<Vec<_>, _>>()?;
187
188 self.keywords.insert(key, t);
189 Ok(())
190 }
191
192 /// Removes a keyword from the `UnicodeExtensionList`.
193 ///
194 /// Returns `true` if keyword was included in the `UnicodeExtensionList`
195 /// before removal.
196 ///
197 /// # Examples
198 ///
199 /// ```
200 /// use unic_locale_impl::Locale;
201 ///
202 /// let mut loc: Locale = "en-US-u-ca-buddhist".parse()
203 /// .expect("Parsing failed.");
204 ///
205 /// assert_eq!(loc.extensions.unicode.remove_keyword("ca")
206 /// .expect("Removing tag failed."),
207 /// true);
208 ///
209 /// assert_eq!(loc.to_string(), "en-US");
210 /// ```
211 pub fn remove_keyword<S: AsRef<[u8]>>(&mut self, key: S) -> Result<bool, LocaleError> {
212 Ok(self.keywords.remove(&parse_key(key.as_ref())?).is_some())
213 }
214
215 /// Clears all keywords from the `UnicodeExtensionList`.
216 ///
217 /// # Examples
218 ///
219 /// ```
220 /// use unic_locale_impl::Locale;
221 ///
222 /// let mut loc: Locale = "en-US-u-ca-buddhist".parse()
223 /// .expect("Parsing failed.");
224 ///
225 /// loc.extensions.unicode.clear_keywords();
226 /// assert_eq!(loc.to_string(), "en-US");
227 /// ```
228 pub fn clear_keywords(&mut self) {
229 self.keywords.clear();
230 }
231
232 /// Returns `true` if attribute is included in the `UnicodeExtensionList`.
233 ///
234 /// # Examples
235 ///
236 /// ```
237 /// use unic_locale_impl::Locale;
238 ///
239 /// let mut loc: Locale = "en-US-u-foo".parse()
240 /// .expect("Parsing failed.");
241 ///
242 /// assert_eq!(loc.extensions.unicode.has_attribute("foo")
243 /// .expect("Getting attribute failed."),
244 /// true);
245 /// ```
246 pub fn has_attribute<S: AsRef<[u8]>>(&self, attribute: S) -> Result<bool, LocaleError> {
247 Ok(self
248 .attributes
249 .contains(&parse_attribute(attribute.as_ref())?))
250 }
251
252 /// Returns an iterator over all attributes in the `UnicodeExtensionList`.
253 ///
254 /// # Examples
255 ///
256 /// ```
257 /// use unic_locale_impl::Locale;
258 ///
259 /// let mut loc: Locale = "en-US-u-foo-bar".parse()
260 /// .expect("Parsing failed.");
261 ///
262 /// assert_eq!(loc.extensions.unicode.attributes().collect::<Vec<_>>(),
263 /// &["bar", "foo"]);
264 /// ```
265 pub fn attributes(&self) -> impl ExactSizeIterator<Item = &str> {
266 self.attributes.iter().map(|s| s.as_ref())
267 }
268
269 /// Sets an attribute on the `UnicodeExtensionList`.
270 ///
271 /// # Examples
272 ///
273 /// ```
274 /// use unic_locale_impl::Locale;
275 ///
276 /// let mut loc: Locale = "en-US".parse()
277 /// .expect("Parsing failed.");
278 ///
279 /// loc.extensions.unicode.set_attribute("foo")
280 /// .expect("Setting attribute failed.");
281 ///
282 /// assert_eq!(loc.to_string(), "en-US-u-foo");
283 /// ```
284 pub fn set_attribute<S: AsRef<[u8]>>(&mut self, attribute: S) -> Result<(), LocaleError> {
285 let attribute = parse_attribute(attribute.as_ref())?;
286 if let Err(idx) = self.attributes.binary_search(&attribute) {
287 self.attributes.insert(idx, attribute);
288 }
289 Ok(())
290 }
291
292 /// Removes an attribute from the `UnicodeExtensionList`.
293 ///
294 /// Returns `true` if attribute was included in the `UnicodeExtensionList`
295 /// before removal.
296 ///
297 /// # Examples
298 ///
299 /// ```
300 /// use unic_locale_impl::Locale;
301 ///
302 /// let mut loc: Locale = "en-US-u-foo".parse()
303 /// .expect("Parsing failed.");
304 ///
305 /// assert_eq!(loc.extensions.unicode.remove_attribute("foo")
306 /// .expect("Removing attribute failed."),
307 /// true);
308 ///
309 /// assert_eq!(loc.to_string(), "en-US");
310 /// ```
311 pub fn remove_attribute<S: AsRef<[u8]>>(&mut self, attribute: S) -> Result<bool, LocaleError> {
312 let attribute = parse_attribute(attribute.as_ref())?;
313 match self.attributes.binary_search(&attribute) {
314 Ok(idx) => {
315 self.attributes.remove(idx);
316 Ok(true)
317 }
318 Err(_) => Ok(false),
319 }
320 }
321
322 /// Clears all attributes from the `UnicodeExtensionList`.
323 ///
324 /// # Examples
325 ///
326 /// ```
327 /// use unic_locale_impl::Locale;
328 ///
329 /// let mut loc: Locale = "en-US-u-foo".parse()
330 /// .expect("Parsing failed.");
331 ///
332 /// loc.extensions.unicode.clear_attributes();
333 /// assert_eq!(loc.to_string(), "en-US");
334 /// ```
335 pub fn clear_attributes(&mut self) {
336 self.attributes.clear();
337 }
338
339 pub(crate) fn try_from_iter<'a>(
340 iter: &mut Peekable<impl Iterator<Item = &'a [u8]>>,
341 ) -> Result<Self, ParserError> {
342 let mut uext = Self::default();
343
344 let mut st_peek = iter.peek();
345
346 let mut current_keyword = None;
347 let mut current_types = vec![];
348
349 while let Some(subtag) = st_peek {
350 let slen = subtag.len();
351 if slen == 2 {
352 if let Some(current_keyword) = current_keyword {
353 uext.keywords.insert(current_keyword, current_types);
354 current_types = vec![];
355 }
356 current_keyword = Some(parse_key(subtag)?);
357 iter.next();
358 } else if current_keyword.is_some() && is_type(subtag) {
359 if let Some(ty) = parse_type(subtag)? {
360 current_types.push(ty);
361 }
362 iter.next();
363 } else if is_attribute(subtag) {
364 uext.attributes.push(parse_attribute(subtag)?);
365 iter.next();
366 } else {
367 break;
368 }
369 st_peek = iter.peek();
370 }
371
372 if let Some(current_keyword) = current_keyword {
373 uext.keywords.insert(current_keyword, current_types);
374 }
375
376 uext.attributes.sort_unstable();
377 uext.attributes.dedup();
378
379 Ok(uext)
380 }
381}
382
383impl std::fmt::Display for UnicodeExtensionList {
384 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
385 if self.is_empty() {
386 return Ok(());
387 }
388
389 f.write_str("-u")?;
390
391 for attr in &self.attributes {
392 write!(f, "-{}", attr)?;
393 }
394
395 for (k, t) in &self.keywords {
396 write!(f, "-{}", k)?;
397 for v in t {
398 write!(f, "-{}", v)?;
399 }
400 }
401 Ok(())
402 }
403}