// unic_locale_impl/extensions/unicode.rs

1use crate::errors::LocaleError;
2use crate::parser::ParserError;
3
4use std::collections::BTreeMap;
5use std::iter::Peekable;
6use std::ops::RangeInclusive;
7
8use tinystr::{TinyStr4, TinyStr8};
9
/// Exact byte length of a Unicode extension keyword key (checked by `parse_key`).
const KEY_LENGTH: usize = 2;
/// Accepted byte lengths of a keyword type subtag (checked by `parse_type` and `is_type`).
const TYPE_LENGTH: RangeInclusive<usize> = 3..=8;
/// Accepted byte lengths of an attribute subtag (checked by `parse_attribute` and
/// `is_attribute`).
const ATTR_LENGTH: RangeInclusive<usize> = 3..=8;
14
15/// A list of [`Unicode BCP47 U Extensions`] as defined in [`Unicode Locale
16/// Identifier`] specification.
17///
18/// Unicode extensions provide subtags that specify language and/or locale-based behavior
19/// or refinements to language tags, according to work done by the Unicode Consortium.
20/// (See [`RFC 6067`] for details).
21///
22/// # Examples
23///
24/// ```
25/// use unic_locale_impl::Locale;
26///
27/// let mut loc: Locale = "de-u-hc-h12-ca-buddhist".parse()
28///     .expect("Parsing failed.");
29///
30/// assert_eq!(loc.extensions.unicode.keyword("ca")
31///               .expect("Getting keyword failed.")
32///               .collect::<Vec<_>>(),
33///            &["buddhist"]);
34/// ```
35/// [`Unicode BCP47 U Extensions`]: https://unicode.org/reports/tr35/#u_Extension
36/// [`RFC 6067`]: https://www.ietf.org/rfc/rfc6067.txt
37/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier
38#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
39pub struct UnicodeExtensionList {
40    // Canonical: sort by key (BTreeMap is already sorted) / remove value 'true'
41    keywords: BTreeMap<TinyStr4, Vec<TinyStr8>>,
42
43    // Canonical: sort / de-dup
44    attributes: Vec<TinyStr8>,
45}
46
47fn parse_key(key: &[u8]) -> Result<TinyStr4, ParserError> {
48    if key.len() != KEY_LENGTH || !key[0].is_ascii_alphanumeric() || !key[1].is_ascii_alphabetic() {
49        return Err(ParserError::InvalidSubtag);
50    }
51    let key = TinyStr4::try_from_utf8(key).map_err(|_| ParserError::InvalidSubtag)?;
52    Ok(key.to_ascii_lowercase())
53}
54
55const TRUE_TYPE: TinyStr8 = tinystr::tinystr!(8, "true"); // "true"
56
57fn parse_type(t: &[u8]) -> Result<Option<TinyStr8>, ParserError> {
58    let s = TinyStr8::try_from_utf8(t).map_err(|_| ParserError::InvalidSubtag)?;
59    if !TYPE_LENGTH.contains(&t.len()) || !s.is_ascii_alphanumeric() {
60        return Err(ParserError::InvalidSubtag);
61    }
62
63    let s = s.to_ascii_lowercase();
64
65    if s == TRUE_TYPE {
66        Ok(None)
67    } else {
68        Ok(Some(s))
69    }
70}
71
72fn parse_attribute(t: &[u8]) -> Result<TinyStr8, ParserError> {
73    let s = TinyStr8::try_from_utf8(t).map_err(|_| ParserError::InvalidSubtag)?;
74    if !ATTR_LENGTH.contains(&t.len()) || !s.is_ascii_alphanumeric() {
75        return Err(ParserError::InvalidSubtag);
76    }
77
78    Ok(s.to_ascii_lowercase())
79}
80
81fn is_type(t: &[u8]) -> bool {
82    let slen = t.len();
83    TYPE_LENGTH.contains(&slen) && !t.iter().any(|c: &u8| !c.is_ascii_alphanumeric())
84}
85
86fn is_attribute(t: &[u8]) -> bool {
87    let slen = t.len();
88    ATTR_LENGTH.contains(&slen) && !t.iter().any(|c: &u8| !c.is_ascii_alphanumeric())
89}
90
91impl UnicodeExtensionList {
92    /// Returns `true` if there are no keywords and no attributes in
93    /// the `UnicodeExtensionList`.
94    ///
95    /// # Examples
96    ///
97    /// ```
98    /// use unic_locale_impl::Locale;
99    ///
100    /// let mut loc: Locale = "en-US-u-foo".parse()
101    ///     .expect("Parsing failed.");
102    ///
103    /// assert_eq!(loc.extensions.unicode.is_empty(), false);
104    /// ```
105    pub fn is_empty(&self) -> bool {
106        self.keywords.is_empty() && self.attributes.is_empty()
107    }
108
109    /// Returns the value of keyword in the `UnicodeExtensionList`.
110    ///
111    /// # Examples
112    ///
113    /// ```
114    /// use unic_locale_impl::Locale;
115    ///
116    /// let mut loc: Locale = "en-US-u-ca-buddhist".parse()
117    ///     .expect("Parsing failed.");
118    ///
119    /// assert_eq!(loc.extensions.unicode.keyword("ca")
120    ///                .expect("Getting keyword failed.")
121    ///                .collect::<Vec<_>>(),
122    ///            &["buddhist"]);
123    ///
124    /// // Here keyword with key "aa" is not available
125    /// assert_eq!(loc.extensions.unicode.keyword("aa")
126    ///               .expect("Getting keyword failed.")
127    ///               .len(),
128    ///            0);
129    /// ```
130    pub fn keyword<S: AsRef<[u8]>>(
131        &self,
132        key: S,
133    ) -> Result<impl ExactSizeIterator<Item = &str>, LocaleError> {
134        let keywords: &[_] = match self.keywords.get(&parse_key(key.as_ref())?) {
135            Some(v) => v,
136            None => &[],
137        };
138
139        Ok(keywords.iter().map(|s| s.as_ref()))
140    }
141
142    /// Returns an iterator over all keys in the `UnicodeExtensionList`.
143    ///
144    /// # Examples
145    ///
146    /// ```
147    /// use unic_locale_impl::Locale;
148    ///
149    /// let mut loc: Locale = "en-US-u-ca-buddhist-nu-thai".parse()
150    ///     .expect("Parsing failed.");
151    ///
152    /// assert_eq!(loc.extensions.unicode.keyword_keys().collect::<Vec<_>>(),
153    ///            &["ca", "nu"]);
154    /// ```
155    pub fn keyword_keys(&self) -> impl ExactSizeIterator<Item = &str> {
156        self.keywords.keys().map(|s| s.as_ref())
157    }
158
159    /// Adds a keyword to the `UnicodeExtensionList` or sets value for key if
160    /// keyword is already included in the `UnicodeExtensionList`.
161    ///
162    /// # Examples
163    ///
164    /// ```
165    /// use unic_locale_impl::Locale;
166    ///
167    /// let mut loc: Locale = "en-US".parse()
168    ///     .expect("Parsing failed.");
169    ///
170    /// loc.extensions.unicode.set_keyword("ca", &["buddhist"])
171    ///     .expect("Setting keyword failed.");
172    ///
173    /// assert_eq!(loc.to_string(), "en-US-u-ca-buddhist");
174    ///
175    /// loc.extensions.unicode.set_keyword("ca", &["chinese"])
176    ///     .expect("Setting keyword failed.");
177    ///
178    /// assert_eq!(loc.to_string(), "en-US-u-ca-chinese");
179    /// ```
180    pub fn set_keyword<S: AsRef<[u8]>>(&mut self, key: S, value: &[S]) -> Result<(), LocaleError> {
181        let key = parse_key(key.as_ref())?;
182
183        let t = value
184            .iter()
185            .filter_map(|t| parse_type(t.as_ref()).transpose())
186            .collect::<Result<Vec<_>, _>>()?;
187
188        self.keywords.insert(key, t);
189        Ok(())
190    }
191
192    /// Removes a keyword from the `UnicodeExtensionList`.
193    ///
194    /// Returns `true` if keyword was included in the `UnicodeExtensionList`
195    /// before removal.
196    ///
197    /// # Examples
198    ///
199    /// ```
200    /// use unic_locale_impl::Locale;
201    ///
202    /// let mut loc: Locale = "en-US-u-ca-buddhist".parse()
203    ///     .expect("Parsing failed.");
204    ///
205    /// assert_eq!(loc.extensions.unicode.remove_keyword("ca")
206    ///               .expect("Removing tag failed."),
207    ///            true);
208    ///
209    /// assert_eq!(loc.to_string(), "en-US");
210    /// ```
211    pub fn remove_keyword<S: AsRef<[u8]>>(&mut self, key: S) -> Result<bool, LocaleError> {
212        Ok(self.keywords.remove(&parse_key(key.as_ref())?).is_some())
213    }
214
215    /// Clears all keywords from the `UnicodeExtensionList`.
216    ///
217    /// # Examples
218    ///
219    /// ```
220    /// use unic_locale_impl::Locale;
221    ///
222    /// let mut loc: Locale = "en-US-u-ca-buddhist".parse()
223    ///     .expect("Parsing failed.");
224    ///
225    /// loc.extensions.unicode.clear_keywords();
226    /// assert_eq!(loc.to_string(), "en-US");
227    /// ```
228    pub fn clear_keywords(&mut self) {
229        self.keywords.clear();
230    }
231
232    /// Returns `true` if attribute is included in the `UnicodeExtensionList`.
233    ///
234    /// # Examples
235    ///
236    /// ```
237    /// use unic_locale_impl::Locale;
238    ///
239    /// let mut loc: Locale = "en-US-u-foo".parse()
240    ///     .expect("Parsing failed.");
241    ///
242    /// assert_eq!(loc.extensions.unicode.has_attribute("foo")
243    ///               .expect("Getting attribute failed."),
244    ///            true);
245    /// ```
246    pub fn has_attribute<S: AsRef<[u8]>>(&self, attribute: S) -> Result<bool, LocaleError> {
247        Ok(self
248            .attributes
249            .contains(&parse_attribute(attribute.as_ref())?))
250    }
251
252    /// Returns an iterator over all attributes in the `UnicodeExtensionList`.
253    ///
254    /// # Examples
255    ///
256    /// ```
257    /// use unic_locale_impl::Locale;
258    ///
259    /// let mut loc: Locale = "en-US-u-foo-bar".parse()
260    ///     .expect("Parsing failed.");
261    ///
262    /// assert_eq!(loc.extensions.unicode.attributes().collect::<Vec<_>>(),
263    ///            &["bar", "foo"]);
264    /// ```
265    pub fn attributes(&self) -> impl ExactSizeIterator<Item = &str> {
266        self.attributes.iter().map(|s| s.as_ref())
267    }
268
269    /// Sets an attribute on the `UnicodeExtensionList`.
270    ///
271    /// # Examples
272    ///
273    /// ```
274    /// use unic_locale_impl::Locale;
275    ///
276    /// let mut loc: Locale = "en-US".parse()
277    ///     .expect("Parsing failed.");
278    ///
279    /// loc.extensions.unicode.set_attribute("foo")
280    ///     .expect("Setting attribute failed.");
281    ///
282    /// assert_eq!(loc.to_string(), "en-US-u-foo");
283    /// ```
284    pub fn set_attribute<S: AsRef<[u8]>>(&mut self, attribute: S) -> Result<(), LocaleError> {
285        let attribute = parse_attribute(attribute.as_ref())?;
286        if let Err(idx) = self.attributes.binary_search(&attribute) {
287            self.attributes.insert(idx, attribute);
288        }
289        Ok(())
290    }
291
292    /// Removes an attribute from the `UnicodeExtensionList`.
293    ///
294    /// Returns `true` if attribute was included in the `UnicodeExtensionList`
295    /// before removal.
296    ///
297    /// # Examples
298    ///
299    /// ```
300    /// use unic_locale_impl::Locale;
301    ///
302    /// let mut loc: Locale = "en-US-u-foo".parse()
303    ///     .expect("Parsing failed.");
304    ///
305    /// assert_eq!(loc.extensions.unicode.remove_attribute("foo")
306    ///               .expect("Removing attribute failed."),
307    ///            true);
308    ///
309    /// assert_eq!(loc.to_string(), "en-US");
310    /// ```
311    pub fn remove_attribute<S: AsRef<[u8]>>(&mut self, attribute: S) -> Result<bool, LocaleError> {
312        let attribute = parse_attribute(attribute.as_ref())?;
313        match self.attributes.binary_search(&attribute) {
314            Ok(idx) => {
315                self.attributes.remove(idx);
316                Ok(true)
317            }
318            Err(_) => Ok(false),
319        }
320    }
321
322    /// Clears all attributes from the `UnicodeExtensionList`.
323    ///
324    /// # Examples
325    ///
326    /// ```
327    /// use unic_locale_impl::Locale;
328    ///
329    /// let mut loc: Locale = "en-US-u-foo".parse()
330    ///     .expect("Parsing failed.");
331    ///
332    /// loc.extensions.unicode.clear_attributes();
333    /// assert_eq!(loc.to_string(), "en-US");
334    /// ```
335    pub fn clear_attributes(&mut self) {
336        self.attributes.clear();
337    }
338
339    pub(crate) fn try_from_iter<'a>(
340        iter: &mut Peekable<impl Iterator<Item = &'a [u8]>>,
341    ) -> Result<Self, ParserError> {
342        let mut uext = Self::default();
343
344        let mut st_peek = iter.peek();
345
346        let mut current_keyword = None;
347        let mut current_types = vec![];
348
349        while let Some(subtag) = st_peek {
350            let slen = subtag.len();
351            if slen == 2 {
352                if let Some(current_keyword) = current_keyword {
353                    uext.keywords.insert(current_keyword, current_types);
354                    current_types = vec![];
355                }
356                current_keyword = Some(parse_key(subtag)?);
357                iter.next();
358            } else if current_keyword.is_some() && is_type(subtag) {
359                if let Some(ty) = parse_type(subtag)? {
360                    current_types.push(ty);
361                }
362                iter.next();
363            } else if is_attribute(subtag) {
364                uext.attributes.push(parse_attribute(subtag)?);
365                iter.next();
366            } else {
367                break;
368            }
369            st_peek = iter.peek();
370        }
371
372        if let Some(current_keyword) = current_keyword {
373            uext.keywords.insert(current_keyword, current_types);
374        }
375
376        uext.attributes.sort_unstable();
377        uext.attributes.dedup();
378
379        Ok(uext)
380    }
381}
382
383impl std::fmt::Display for UnicodeExtensionList {
384    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
385        if self.is_empty() {
386            return Ok(());
387        }
388
389        f.write_str("-u")?;
390
391        for attr in &self.attributes {
392            write!(f, "-{}", attr)?;
393        }
394
395        for (k, t) in &self.keywords {
396            write!(f, "-{}", k)?;
397            for v in t {
398                write!(f, "-{}", v)?;
399            }
400        }
401        Ok(())
402    }
403}