countries_iso3166/bcp47/
multi_lang_parser.rs1use std::collections::HashMap;
2
3use crate::{CountriesIso31661Error, CountriesIso31661Result};
4
// Key type used by the identifier index.
//
// Exactly one of the `large_keys` / `small_keys` features selects the alias for
// regular builds; documentation builds always fall back to `String`.

/// Identifier key when only `large_keys` is enabled: a 64-bit hash of the identifier.
#[cfg(all(feature = "large_keys", not(feature = "small_keys"), not(doc)))]
type Key = u64;

/// Identifier key when only `small_keys` is enabled: the identifier text itself.
#[cfg(all(feature = "small_keys", not(feature = "large_keys"), not(doc)))]
type Key = String;

/// Identifier key used while building documentation.
#[cfg(any(doc, docsrs))]
type Key = String;
19
20#[derive(Debug, PartialEq, Eq, Clone)]
23pub struct MultiLanguageTranslationMap<
24 K: std::hash::Hash + PartialEq + Eq + core::fmt::Debug + Clone = Key,
25> {
26 identifier_index: HashMap<K, usize>,
27 bcp47_index: HashMap<String, usize>,
28 translations: Vec<Vec<String>>,
29}
30
31impl MultiLanguageTranslationMap {
32 pub fn new(source_path: &str, source_contents: &str) -> CountriesIso31661Result<Self> {
33 let data = Self::parse(source_path, source_contents)?;
34
35 let mut identifier_index = HashMap::<Key, usize>::with_capacity(805);
36 let mut bcp47_index = HashMap::<String, usize>::with_capacity(805);
37 let mut translations = Vec::<Vec<String>>::default();
38
39 data.into_iter()
40 .enumerate()
41 .for_each(|(identifier_index_key, (identifier, languages))| {
42 let mut languages_inner = Vec::<String>::default();
43
44 languages
45 .into_iter()
46 .enumerate()
47 .for_each(|(index, (bcp47_code, translation))| {
48 languages_inner.push(translation);
49 bcp47_index.insert(bcp47_code, index);
50 });
51
52 translations.push(languages_inner);
53
54 #[cfg(all(feature = "large_keys", not(feature = "small_keys")))]
55 let key = rapidhash::v3::rapidhash_v3(identifier.as_bytes());
56 #[cfg(all(feature = "small_keys", not(feature = "large_keys")))]
57 let key = identifier;
58
59 identifier_index.insert(key, identifier_index_key);
60 });
61
62 Ok(Self {
63 identifier_index,
64 bcp47_index,
65 translations,
66 })
67 }
68
69 pub fn get_translation(
71 &self,
72 identifier: &str,
73 bcp47_code: &str,
74 ) -> CountriesIso31661Result<String> {
75 #[cfg(all(feature = "large_keys", not(feature = "small_keys")))]
76 let identifier = &rapidhash::v3::rapidhash_v3(identifier.as_bytes());
77 #[cfg(all(feature = "small_keys", not(feature = "large_keys")))]
78 let identifier = identifier;
79
80 self.identifier_index
81 .get(identifier)
82 .map(|identifier_index| {
83 let bcp47_index = self.bcp47_index.get(bcp47_code).ok_or(
84 CountriesIso31661Error::Bcp47EntryNotFound {
85 identifier: identifier.to_string(),
86 bcp47_code: bcp47_code.to_string(),
87 },
88 )?;
89 let outcome = self.translations[*identifier_index][*bcp47_index].clone();
90
91 Ok::<String, CountriesIso31661Error>(outcome)
92 })
93 .transpose()?
94 .ok_or(CountriesIso31661Error::IdentifierNotFound(
95 identifier.to_string(),
96 ))
97 }
98
99 pub fn identifier_index(&self) -> &HashMap<Key, usize> {
101 &self.identifier_index
102 }
103
104 pub fn bcp47_index(&self) -> &HashMap<String, usize> {
105 &self.bcp47_index
106 }
107
108 pub fn translations(&self) -> &[Vec<String>] {
109 self.translations.as_slice()
110 }
111
112 pub fn parse(
114 source_path: &str,
115 input: &str,
116 ) -> CountriesIso31661Result<HashMap<String, Vec<(String, String)>>> {
117 let mut sentences: HashMap<String, Vec<(String, String)>> = HashMap::new();
118
119 let mut current_sentence: Option<String> = None;
120 let mut multiline_lang: Option<String> = None;
121 let mut multiline_buffer = String::new();
122
123 for raw_line in input.lines() {
124 let line = raw_line.trim();
125
126 if line.is_empty() {
127 continue;
128 }
129
130 if let Some(lang) = &multiline_lang {
132 if line.ends_with('"') {
133 multiline_buffer.push_str(line.trim_end_matches('"'));
134
135 if let Some(sentence) = ¤t_sentence {
136 sentences
137 .get_mut(sentence)
138 .ok_or(CountriesIso31661Error::InvalidLanguageEntryParsed {
139 source_path: source_path.to_string(),
140 line: line.to_string(),
141 })?
142 .push((lang.clone(), multiline_buffer.clone()));
143 }
144
145 multiline_buffer.clear();
146 multiline_lang = None;
147 } else {
148 multiline_buffer.push_str(line);
149 multiline_buffer.push('\n');
150 }
151
152 continue;
153 }
154
155 if line.starts_with("# ") {
157 let sentence = line.trim_start_matches("# ").to_string();
158 sentences.entry(sentence.clone()).or_default();
159 current_sentence = Some(sentence);
160 continue;
161 }
162
163 if let Some(eq_pos) = line.find('=') {
165 let lang = line[..eq_pos].trim().to_string();
166
167 let check_bc47_is_valid: crate::BC47LanguageInfo = lang.as_str().into();
168
169 if check_bc47_is_valid == crate::BC47LanguageInfo::UnsupportedLanguage {
170 return Err(CountriesIso31661Error::UnsupportedBcp47Code {
171 source_path: source_path.to_string(),
172 invalid_lang: lang,
173 });
174 }
175
176 let value = line[eq_pos + 1..].trim();
177
178 if value.starts_with('"') {
179 let content = value.trim_start_matches('"');
180
181 if content.ends_with('"') {
182 let final_value = content.trim_end_matches('"');
183
184 if let Some(sentence) = ¤t_sentence {
185 sentences
186 .get_mut(sentence)
187 .ok_or(CountriesIso31661Error::InvalidLanguageEntryParsed {
188 source_path: source_path.to_string(),
189 line: line.to_string(),
190 })?
191 .push((lang, final_value.to_string()));
192 }
193 } else {
194 multiline_lang = Some(lang);
195 multiline_buffer.push_str(content);
196 multiline_buffer.push('\n');
197 }
198 } else {
199 if let Some(sentence) = ¤t_sentence {
200 sentences
201 .get_mut(sentence)
202 .ok_or(CountriesIso31661Error::InvalidLanguageEntryParsed {
203 source_path: source_path.to_string(),
204 line: line.to_string(),
205 })?
206 .push((lang, value.to_string()));
207 }
208 }
209 }
210 }
211
212 Ok(sentences)
213 }
214}
215
/// A single translated text paired with its language tag.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Translation {
    // Presumably the BCP 47 language tag of this translation -- confirm with callers.
    bcp47: String,
    // Presumably the text written in that language -- confirm with callers.
    native: String,
}
221
#[cfg(test)]
mod sanity_checks {
    use crate::{CountriesIso31661Error, MultiLanguageTranslationMap};

    /// The bundled example file must build into a map without error.
    #[test]
    fn parse_correct_translations() {
        let path = "../../example_data/test-lang.bcp47";
        let contents = include_str!("../../example_data/test-lang.bcp47");

        let outcome = MultiLanguageTranslationMap::new(path, contents);

        assert!(outcome.is_ok());
    }

    /// The bundled invalid example must be rejected for its `en-USS` language code.
    #[test]
    fn parse_incorrect_translations() {
        let path = "../../example_data/test-lang-invalid.bcp47";
        let contents = include_str!("../../example_data/test-lang-invalid.bcp47");

        let expected = CountriesIso31661Error::UnsupportedBcp47Code {
            source_path: path.to_string(),
            invalid_lang: "en-USS".to_string(),
        };

        assert_eq!(
            MultiLanguageTranslationMap::new(path, contents).err(),
            Some(expected)
        );
    }
}