Skip to main content

countries_iso3166/bcp47/
multi_lang_parser.rs

1use std::collections::HashMap;
2
3use crate::{CountriesIso31661Error, CountriesIso31661Result};
4
// `Key` is the type under which identifiers are indexed; which alias is compiled in depends
// on the `large_keys` / `small_keys` cargo features and on whether documentation is built.

// Only `large_keys` enabled (non-doc build): identifiers are indexed by their `u64` hash.
#[cfg(all(feature = "large_keys", not(feature = "small_keys"), not(doc)))]
type Key = u64;

// Only `small_keys` enabled (non-doc build): identifiers are indexed by the text itself.
#[cfg(all(feature = "small_keys", not(feature = "large_keys"), not(doc)))]
type Key = String;

// Documentation builds use `String` keys regardless of the selected features.
#[cfg(any(doc, docsrs))]
type Key = String;
19
20// Use a generic parameter for struct in doc-tests:
21// This lets doctests instantiate it with a concrete type without relying on cfg-gated Key.
22#[derive(Debug, PartialEq, Eq, Clone)]
23pub struct MultiLanguageTranslationMap<
24    K: std::hash::Hash + PartialEq + Eq + core::fmt::Debug + Clone = Key,
25> {
26    identifier_index: HashMap<K, usize>,
27    bcp47_index: HashMap<String, usize>,
28    translations: Vec<Vec<String>>,
29}
30
31impl MultiLanguageTranslationMap {
32    pub fn new(source_path: &str, source_contents: &str) -> CountriesIso31661Result<Self> {
33        let data = Self::parse(source_path, source_contents)?;
34
35        let mut identifier_index = HashMap::<Key, usize>::with_capacity(805);
36        let mut bcp47_index = HashMap::<String, usize>::with_capacity(805);
37        let mut translations = Vec::<Vec<String>>::default();
38
39        data.into_iter()
40            .enumerate()
41            .for_each(|(identifier_index_key, (identifier, languages))| {
42                let mut languages_inner = Vec::<String>::default();
43
44                languages
45                    .into_iter()
46                    .enumerate()
47                    .for_each(|(index, (bcp47_code, translation))| {
48                        languages_inner.push(translation);
49                        bcp47_index.insert(bcp47_code, index);
50                    });
51
52                translations.push(languages_inner);
53
54                #[cfg(all(feature = "large_keys", not(feature = "small_keys")))]
55                let key = rapidhash::v3::rapidhash_v3(identifier.as_bytes());
56                #[cfg(all(feature = "small_keys", not(feature = "large_keys")))]
57                let key = identifier;
58
59                identifier_index.insert(key, identifier_index_key);
60            });
61
62        Ok(Self {
63            identifier_index,
64            bcp47_index,
65            translations,
66        })
67    }
68
69    /// `Identifier` and `BCP-47 Code` are case sensitive
70    pub fn get_translation(
71        &self,
72        identifier: &str,
73        bcp47_code: &str,
74    ) -> CountriesIso31661Result<String> {
75        #[cfg(all(feature = "large_keys", not(feature = "small_keys")))]
76        let identifier = &rapidhash::v3::rapidhash_v3(identifier.as_bytes());
77        #[cfg(all(feature = "small_keys", not(feature = "large_keys")))]
78        let identifier = identifier;
79
80        self.identifier_index
81            .get(identifier)
82            .map(|identifier_index| {
83                let bcp47_index = self.bcp47_index.get(bcp47_code).ok_or(
84                    CountriesIso31661Error::Bcp47EntryNotFound {
85                        identifier: identifier.to_string(),
86                        bcp47_code: bcp47_code.to_string(),
87                    },
88                )?;
89                let outcome = self.translations[*identifier_index][*bcp47_index].clone();
90
91                Ok::<String, CountriesIso31661Error>(outcome)
92            })
93            .transpose()?
94            .ok_or(CountriesIso31661Error::IdentifierNotFound(
95                identifier.to_string(),
96            ))
97    }
98
99    /// If `large_keys` feature is enabled the result of this is a `&HashMap<u64, usize>`
100    pub fn identifier_index(&self) -> &HashMap<Key, usize> {
101        &self.identifier_index
102    }
103
104    pub fn bcp47_index(&self) -> &HashMap<String, usize> {
105        &self.bcp47_index
106    }
107
108    pub fn translations(&self) -> &[Vec<String>] {
109        self.translations.as_slice()
110    }
111
112    /// The start of a sentence must have a space `# `
113    pub fn parse(
114        source_path: &str,
115        input: &str,
116    ) -> CountriesIso31661Result<HashMap<String, Vec<(String, String)>>> {
117        let mut sentences: HashMap<String, Vec<(String, String)>> = HashMap::new();
118
119        let mut current_sentence: Option<String> = None;
120        let mut multiline_lang: Option<String> = None;
121        let mut multiline_buffer = String::new();
122
123        for raw_line in input.lines() {
124            let line = raw_line.trim();
125
126            if line.is_empty() {
127                continue;
128            }
129
130            // multiline continuation
131            if let Some(lang) = &multiline_lang {
132                if line.ends_with('"') {
133                    multiline_buffer.push_str(line.trim_end_matches('"'));
134
135                    if let Some(sentence) = &current_sentence {
136                        sentences
137                            .get_mut(sentence)
138                            .ok_or(CountriesIso31661Error::InvalidLanguageEntryParsed {
139                                source_path: source_path.to_string(),
140                                line: line.to_string(),
141                            })?
142                            .push((lang.clone(), multiline_buffer.clone()));
143                    }
144
145                    multiline_buffer.clear();
146                    multiline_lang = None;
147                } else {
148                    multiline_buffer.push_str(line);
149                    multiline_buffer.push('\n');
150                }
151
152                continue;
153            }
154
155            // new sentence
156            if line.starts_with("# ") {
157                let sentence = line.trim_start_matches("# ").to_string();
158                sentences.entry(sentence.clone()).or_default();
159                current_sentence = Some(sentence);
160                continue;
161            }
162
163            // translation entry
164            if let Some(eq_pos) = line.find('=') {
165                let lang = line[..eq_pos].trim().to_string();
166
167                let check_bc47_is_valid: crate::BC47LanguageInfo = lang.as_str().into();
168
169                if check_bc47_is_valid == crate::BC47LanguageInfo::UnsupportedLanguage {
170                    return Err(CountriesIso31661Error::UnsupportedBcp47Code {
171                        source_path: source_path.to_string(),
172                        invalid_lang: lang,
173                    });
174                }
175
176                let value = line[eq_pos + 1..].trim();
177
178                if value.starts_with('"') {
179                    let content = value.trim_start_matches('"');
180
181                    if content.ends_with('"') {
182                        let final_value = content.trim_end_matches('"');
183
184                        if let Some(sentence) = &current_sentence {
185                            sentences
186                                .get_mut(sentence)
187                                .ok_or(CountriesIso31661Error::InvalidLanguageEntryParsed {
188                                    source_path: source_path.to_string(),
189                                    line: line.to_string(),
190                                })?
191                                .push((lang, final_value.to_string()));
192                        }
193                    } else {
194                        multiline_lang = Some(lang);
195                        multiline_buffer.push_str(content);
196                        multiline_buffer.push('\n');
197                    }
198                } else {
199                    if let Some(sentence) = &current_sentence {
200                        sentences
201                            .get_mut(sentence)
202                            .ok_or(CountriesIso31661Error::InvalidLanguageEntryParsed {
203                                source_path: source_path.to_string(),
204                                line: line.to_string(),
205                            })?
206                            .push((lang, value.to_string()));
207                    }
208                }
209            }
210        }
211
212        Ok(sentences)
213    }
214}
215
/// A pair of strings named `bcp47` and `native`.
///
/// NOTE(review): presumably a BCP-47 code together with text in that language; the type is
/// not used in the visible part of this file, so confirm against its callers.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Translation {
    /// Presumably the BCP-47 language code (to be confirmed).
    bcp47: String,
    /// Presumably the text or name in the native language (to be confirmed).
    native: String,
}
221
#[cfg(test)]
mod sanity_checks {
    use crate::{CountriesIso31661Error, MultiLanguageTranslationMap};

    /// The example translation file must load without an error.
    #[test]
    fn parse_correct_translations() {
        let source_path = "../../example_data/test-lang.bcp47";
        let source_contents = include_str!("../../example_data/test-lang.bcp47");

        let outcome = MultiLanguageTranslationMap::new(source_path, source_contents);

        assert!(outcome.is_ok());
    }

    /// The invalid example file must be rejected because of its `en-USS` language code.
    #[test]
    fn parse_incorrect_translations() {
        let source_path = "../../example_data/test-lang-invalid.bcp47";
        let source_contents = include_str!("../../example_data/test-lang-invalid.bcp47");

        let expected = CountriesIso31661Error::UnsupportedBcp47Code {
            source_path: source_path.to_string(),
            invalid_lang: "en-USS".to_string(),
        };

        let outcome = MultiLanguageTranslationMap::new(source_path, source_contents);

        assert_eq!(outcome.err(), Some(expected));
    }
}