unicode_locale_parser/
lang.rs

1use crate::constants::{LANG_UND, SEP};
2use crate::errors::ParserError;
3use crate::shared::split_str;
4use crate::subtags::{language_subtag, region_subtag, script_subtag, variant_subtag};
5
6use std::fmt::{self, Write};
7use std::iter::Peekable;
8use std::str::FromStr;
9
/// A parsed Unicode Language Identifier, split into its subtags.
///
/// All fields are owned strings, so the value can be cloned, compared and used as
/// a key in hashed collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct UnicodeLanguageIdentifier {
    /// Language subtag. The parser stores an empty string for `und` / `root`;
    /// the `Display` implementation writes an empty language back as `LANG_UND`.
    pub language: String,
    /// Script subtag, if one was present.
    pub script: Option<String>,
    /// Region subtag, if one was present.
    pub region: Option<String>,
    /// Variant subtags in the order they were parsed. The parser produces `None`
    /// (never an empty list) when the identifier has no variants.
    pub variants: Option<Vec<String>>,
}
17
18/// Parse the given string as an Unicode Language Identifier.
19///
20/// This function parses according to [`unicode_language_id` EBNF defined in UTS #35](https://unicode.org/reports/tr35/#unicode_language_id).
21///
22/// # Examples
23///
24/// ```
25/// use unicode_locale_parser::parse_language_id;
26///
27/// let res = parse_language_id("en-US").unwrap();
28/// assert_eq!("en", res.language);
29/// assert_eq!(None, res.script);
30/// assert_eq!(Some("US".to_string()), res.region);
31/// assert_eq!(None, res.variants);
32/// ```
33///
34/// # Errors
35///
36/// This function returns an error in the following cases:
37///
38/// - [`ParserError::Missing`] if the given language id is empty.
39/// - [`ParserError::InvalidLanguage`] if the given language id is not a valid language identifier.
40/// - [`ParserError::InvalidSubtag`] if the given language id is not a valid subtag.
41///
42pub fn parse_unicode_language_id(lang_id: &str) -> Result<UnicodeLanguageIdentifier, ParserError> {
43    // check empty
44    if lang_id.is_empty() {
45        return Err(ParserError::Missing);
46    }
47
48    parse_unicode_language_id_from_iter(&mut split_str(lang_id).peekable())
49}
50
51pub fn parse_unicode_language_id_from_iter<'a>(
52    iter: &mut Peekable<impl Iterator<Item = &'a str>>,
53) -> Result<UnicodeLanguageIdentifier, ParserError> {
54    // language subtag
55    let language = if let Some(lang) = iter.next() {
56        language_subtag(lang)?
57    } else {
58        return Err(ParserError::Unexpected);
59    };
60    let language = String::from(language);
61
62    // other subtags
63    let mut script = None;
64    let mut region = None;
65    let mut variants = vec![];
66    let mut current = 1;
67    while let Some(subtag) = iter.peek() {
68        if current == 1 {
69            if let Ok(script_subtag) = script_subtag(subtag) {
70                script = Some(String::from(script_subtag));
71                current = 2;
72            } else if let Ok(region_subtag) = region_subtag(subtag) {
73                region = Some(String::from(region_subtag));
74                current = 3;
75            } else if let Ok(variant_subtag) = variant_subtag(subtag) {
76                variants.push(String::from(variant_subtag));
77                current = 3;
78            } else {
79                break;
80            }
81        } else if current == 2 {
82            if let Ok(region_subtag) = region_subtag(subtag) {
83                region = Some(String::from(region_subtag));
84                current = 3;
85            } else if let Ok(variant_subtag) = variant_subtag(subtag) {
86                variants.push(String::from(variant_subtag));
87                current = 3;
88            } else {
89                break;
90            }
91        } else if let Ok(variant_subtag) = variant_subtag(subtag) {
92            variants.push(String::from(variant_subtag));
93        } else {
94            break;
95        }
96        iter.next();
97    }
98
99    // normalize variants
100    let variants = if variants.is_empty() {
101        None
102    } else {
103        variants.dedup();
104        Some(variants)
105    };
106
107    Ok(UnicodeLanguageIdentifier {
108        language,
109        script,
110        region,
111        variants,
112    })
113}
114
115impl fmt::Display for UnicodeLanguageIdentifier {
116    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
117        if self.language.is_empty() {
118            f.write_str(LANG_UND)?;
119        } else {
120            self.language.fmt(f)?;
121        }
122        if let Some(ref script) = self.script {
123            f.write_char(SEP)?;
124            script.fmt(f)?;
125        }
126        if let Some(ref region) = self.region {
127            f.write_char(SEP)?;
128            region.fmt(f)?;
129        }
130        if let Some(ref variants) = self.variants {
131            for variant in variants.iter() {
132                f.write_char(SEP)?;
133                variant.fmt(f)?;
134            }
135        }
136        Ok(())
137    }
138}
139
140impl FromStr for UnicodeLanguageIdentifier {
141    type Err = ParserError;
142
143    fn from_str(source: &str) -> Result<Self, Self::Err> {
144        parse_unicode_language_id(source)
145    }
146}
147
148/**
149 * Unit tests
150 */
151
#[test]
fn success_parse_unicode_language_id() {
    // Builders for the expected optional subtag values.
    fn owned(value: &str) -> Option<String> {
        Some(value.to_string())
    }
    fn owned_list(values: &[&str]) -> Option<Vec<String>> {
        Some(values.iter().map(|value| value.to_string()).collect())
    }

    // (input, language, script, region, variants)
    let cases = vec![
        // full case
        (
            "en-Latn-US-macos-windows-linux",
            "en",
            owned("Latn"),
            owned("US"),
            owned_list(&["macos", "windows", "linux"]),
        ),
        // language subtag only
        ("en", "en", None, None, None),
        // language subtag and region subtag
        ("en-US", "en", None, owned("US"), None),
        // language subtag and script subtag
        ("en-Latn", "en", owned("Latn"), None, None),
        // language subtag and variant subtag
        ("en-macos", "en", None, None, owned_list(&["macos"])),
        // language subtag, script subtag and region subtag
        ("en-Latn-US", "en", owned("Latn"), owned("US"), None),
        // language subtag: 'root'
        ("root", "", None, None, None),
        // include language subtag: 'und'
        (
            "und-Latn-AT-macos",
            "",
            owned("Latn"),
            owned("AT"),
            owned_list(&["macos"]),
        ),
    ];
    for (input, language, script, region, variants) in cases {
        let result = parse_unicode_language_id(input).unwrap();
        assert_eq!(result.language, language, "language of {:?}", input);
        assert_eq!(result.script, script, "script of {:?}", input);
        assert_eq!(result.region, region, "region of {:?}", input);
        assert_eq!(result.variants, variants, "variants of {:?}", input);
    }

    // use sep with underscore
    let underscored = parse_unicode_language_id("en_Latn_US").unwrap();
    assert_eq!(underscored.language, "en");
    assert_eq!(underscored.script, owned("Latn"));
    assert_eq!(underscored.region, owned("US"));

    // Display trait implementation: these inputs must format back unchanged
    for input in ["en-Latn-US-macos", "und-Latn-US-macos"].iter() {
        let parsed = parse_unicode_language_id(input).unwrap();
        assert_eq!(*input, format!("{}", parsed));
    }

    // PartialEq trait implementation
    let first = parse_unicode_language_id("en-Latn-US").unwrap();
    let second = parse_unicode_language_id("en-Latn-US").unwrap();
    assert_eq!(first, second);

    // FromStr trait implementation
    let result = "en-Latn-US-macos"
        .parse::<UnicodeLanguageIdentifier>()
        .unwrap();
    assert_eq!("en", result.language);
    assert_eq!(owned("Latn"), result.script);
    assert_eq!(owned("US"), result.region);
    assert_eq!(owned_list(&["macos"]), result.variants);
    let result = "en-Latn-US"
        .parse::<UnicodeLanguageIdentifier>()
        .unwrap();
    assert_eq!("en-Latn-US", format!("{}", result));
}
251
#[test]
fn fail_parse_unicode_language_id() {
    // missing language: an empty identifier is rejected before any subtag is read
    let error = parse_unicode_language_id("").unwrap_err();
    assert_eq!(ParserError::Missing, error);
}