Skip to main content

countries_iso3166/bcp47/
multi_lang_parser.rs

1use std::collections::HashMap;
2
3use crate::{CountriesIso31661Error, CountriesIso31661Result};
4
// `Key` is the type stored as the key of `identifier_index`. It is picked at compile time from
// the `large_keys` / `small_keys` cargo features; enabling both (or neither) outside of a
// documentation build leaves `Key` undefined.
//
// NOTE(review): the two feature-gated aliases only exclude `doc`, while the documentation alias
// also triggers on `docsrs`. Building with `--cfg docsrs` but without `doc` while a key feature
// is enabled would define `Key` twice — confirm `docsrs` is only ever set together with `doc`.

/// 64-bit integer key, used when only the `large_keys` feature is enabled.
#[cfg(all(feature = "large_keys", not(feature = "small_keys"), not(doc)))]
type Key = u64;

/// Owned string key, used when only the `small_keys` feature is enabled.
#[cfg(all(feature = "small_keys", not(feature = "large_keys"), not(doc)))]
type Key = String;

/// Key type used whenever the `doc` or `docsrs` cfg is set.
#[cfg(any(doc, docsrs))]
type Key = String;
19
20// Use a generic parameter for struct in doc-tests:
21// This lets doctests instantiate it with a concrete type without relying on cfg-gated Key.
22#[derive(Debug, PartialEq, Eq, Clone, Default)]
23#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
24#[cfg_attr(feature = "bitcode", derive(bitcode::Encode, bitcode::Decode))]
25pub struct MultiLanguageTranslationMap<
26    K: std::hash::Hash + PartialEq + Eq + core::fmt::Debug + Clone = Key,
27> {
28    identifier_index: HashMap<K, usize>,
29    bcp47_index: HashMap<String, usize>,
30    translations: Vec<Vec<String>>,
31}
32
33impl MultiLanguageTranslationMap {
34    pub fn new(source_path: &str, source_contents: &str) -> CountriesIso31661Result<Self> {
35        let data = Self::parse(source_path, source_contents)?;
36
37        let mut identifier_index = HashMap::<Key, usize>::with_capacity(805);
38        let mut bcp47_index = HashMap::<String, usize>::with_capacity(805);
39        let mut translations = Vec::<Vec<String>>::default();
40
41        data.into_iter()
42            .enumerate()
43            .for_each(|(identifier_index_key, (identifier, languages))| {
44                let mut languages_inner = Vec::<String>::default();
45
46                languages
47                    .into_iter()
48                    .enumerate()
49                    .for_each(|(index, (bcp47_code, translation))| {
50                        languages_inner.push(translation);
51                        bcp47_index.insert(bcp47_code, index);
52                    });
53
54                translations.push(languages_inner);
55
56                #[cfg(all(feature = "large_keys", not(feature = "small_keys")))]
57                let key = rapidhash::v3::rapidhash_v3(identifier.as_bytes());
58                #[cfg(all(feature = "small_keys", not(feature = "large_keys")))]
59                let key = identifier;
60
61                identifier_index.insert(key, identifier_index_key);
62            });
63
64        Ok(Self {
65            identifier_index,
66            bcp47_index,
67            translations,
68        })
69    }
70
71    /// `Identifier` and `BCP-47 Code` are case sensitive
72    pub fn get_translation(
73        &self,
74        identifier: &str,
75        bcp47_code: &str,
76    ) -> CountriesIso31661Result<String> {
77        #[cfg(all(feature = "large_keys", not(feature = "small_keys")))]
78        let identifier = &rapidhash::v3::rapidhash_v3(identifier.as_bytes());
79        #[cfg(all(feature = "small_keys", not(feature = "large_keys")))]
80        let identifier = identifier;
81
82        self.identifier_index
83            .get(identifier)
84            .map(|identifier_index| {
85                let bcp47_index = self.bcp47_index.get(bcp47_code).ok_or(
86                    CountriesIso31661Error::Bcp47EntryNotFound {
87                        identifier: identifier.to_string(),
88                        bcp47_code: bcp47_code.to_string(),
89                    },
90                )?;
91                let outcome = self.translations[*identifier_index][*bcp47_index].clone();
92
93                Ok::<String, CountriesIso31661Error>(outcome)
94            })
95            .transpose()?
96            .ok_or(CountriesIso31661Error::IdentifierNotFound(
97                identifier.to_string(),
98            ))
99    }
100
101    /// If `large_keys` feature is enabled the result of this is a `&HashMap<u64, usize>`
102    pub fn identifier_index(&self) -> &HashMap<Key, usize> {
103        &self.identifier_index
104    }
105
106    pub fn bcp47_index(&self) -> &HashMap<String, usize> {
107        &self.bcp47_index
108    }
109
110    pub fn translations(&self) -> &[Vec<String>] {
111        self.translations.as_slice()
112    }
113
114    /// The start of a sentence must have a space `# `
115    pub fn parse(
116        source_path: &str,
117        input: &str,
118    ) -> CountriesIso31661Result<HashMap<String, Vec<(String, String)>>> {
119        let mut sentences: HashMap<String, Vec<(String, String)>> = HashMap::new();
120
121        let mut current_sentence: Option<String> = None;
122        let mut multiline_lang: Option<String> = None;
123        let mut multiline_buffer = String::new();
124
125        for raw_line in input.lines() {
126            let line = raw_line.trim();
127
128            if line.is_empty() {
129                continue;
130            }
131
132            // multiline continuation
133            if let Some(lang) = &multiline_lang {
134                if line.ends_with('"') {
135                    multiline_buffer.push_str(line.trim_end_matches('"'));
136
137                    if let Some(sentence) = &current_sentence {
138                        sentences
139                            .get_mut(sentence)
140                            .ok_or(CountriesIso31661Error::InvalidLanguageEntryParsed {
141                                source_path: source_path.to_string(),
142                                line: line.to_string(),
143                            })?
144                            .push((lang.clone(), multiline_buffer.clone()));
145                    }
146
147                    multiline_buffer.clear();
148                    multiline_lang = None;
149                } else {
150                    multiline_buffer.push_str(line);
151                    multiline_buffer.push('\n');
152                }
153
154                continue;
155            }
156
157            // new sentence
158            if line.starts_with("# ") {
159                let sentence = line.trim_start_matches("# ").to_string();
160                sentences.entry(sentence.clone()).or_default();
161                current_sentence = Some(sentence);
162                continue;
163            }
164
165            // translation entry
166            if let Some(eq_pos) = line.find('=') {
167                let lang = line[..eq_pos].trim().to_string();
168
169                let check_bc47_is_valid: crate::BC47LanguageInfo = lang.as_str().into();
170
171                if check_bc47_is_valid == crate::BC47LanguageInfo::UnsupportedLanguage {
172                    return Err(CountriesIso31661Error::UnsupportedBcp47Code {
173                        source_path: source_path.to_string(),
174                        invalid_lang: lang,
175                    });
176                }
177
178                let value = line[eq_pos + 1..].trim();
179
180                if value.starts_with('"') {
181                    let content = value.trim_start_matches('"');
182
183                    if content.ends_with('"') {
184                        let final_value = content.trim_end_matches('"');
185
186                        if let Some(sentence) = &current_sentence {
187                            sentences
188                                .get_mut(sentence)
189                                .ok_or(CountriesIso31661Error::InvalidLanguageEntryParsed {
190                                    source_path: source_path.to_string(),
191                                    line: line.to_string(),
192                                })?
193                                .push((lang, final_value.to_string()));
194                        }
195                    } else {
196                        multiline_lang = Some(lang);
197                        multiline_buffer.push_str(content);
198                        multiline_buffer.push('\n');
199                    }
200                } else {
201                    if let Some(sentence) = &current_sentence {
202                        sentences
203                            .get_mut(sentence)
204                            .ok_or(CountriesIso31661Error::InvalidLanguageEntryParsed {
205                                source_path: source_path.to_string(),
206                                line: line.to_string(),
207                            })?
208                            .push((lang, value.to_string()));
209                    }
210                }
211            }
212        }
213
214        Ok(sentences)
215    }
216}
217
/// A text paired with the BCP-47 code it belongs to.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Translation {
    /// BCP-47 language code of the entry.
    bcp47: String,
    // NOTE(review): presumably the text written in that language — confirm against the
    // code that constructs this type (none is visible in this file).
    native: String,
}
223
#[cfg(test)]
mod sanity_checks {
    use crate::{CountriesIso31661Error, MultiLanguageTranslationMap};

    /// The well-formed example file loads without an error.
    #[test]
    fn parse_correct_translations() {
        let source_path = "../../example_data/test-lang.bcp47";
        let source_contents = include_str!("../../example_data/test-lang.bcp47");

        let outcome = MultiLanguageTranslationMap::new(source_path, source_contents);

        assert!(outcome.is_ok());
    }

    /// The invalid example file is rejected because of its `en-USS` language code.
    #[test]
    fn parse_incorrect_translations() {
        let source_path = "../../example_data/test-lang-invalid.bcp47";
        let source_contents = include_str!("../../example_data/test-lang-invalid.bcp47");

        let expected = CountriesIso31661Error::UnsupportedBcp47Code {
            source_path: source_path.to_string(),
            invalid_lang: "en-USS".to_string(),
        };

        let outcome = MultiLanguageTranslationMap::new(source_path, source_contents);

        assert_eq!(outcome.err(), Some(expected));
    }
}