countries_iso3166/bcp47/
multi_lang_parser.rs1use std::collections::HashMap;
2
3use crate::{CountriesIso31661Error, CountriesIso31661Result};
4
/// Key type of the identifier lookup table when only `large_keys` is enabled:
/// identifiers are stored as their 64-bit hash instead of as text.
#[cfg(all(feature = "large_keys", not(feature = "small_keys"), not(doc)))]
type Key = u64;

/// Key type of the identifier lookup table when only `small_keys` is enabled:
/// identifiers are stored verbatim.
#[cfg(all(feature = "small_keys", not(feature = "large_keys"), not(doc)))]
type Key = String;

/// Key type used while building documentation, independent of the key features.
#[cfg(any(doc, docsrs))]
type Key = String;
19
20#[derive(Debug, PartialEq, Eq, Clone, Default)]
23#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
24#[cfg_attr(feature = "bitcode", derive(bitcode::Encode, bitcode::Decode))]
25pub struct MultiLanguageTranslationMap<
26 K: std::hash::Hash + PartialEq + Eq + core::fmt::Debug + Clone = Key,
27> {
28 identifier_index: HashMap<K, usize>,
29 bcp47_index: HashMap<String, usize>,
30 translations: Vec<Vec<String>>,
31}
32
33impl MultiLanguageTranslationMap {
34 pub fn new(source_path: &str, source_contents: &str) -> CountriesIso31661Result<Self> {
35 let data = Self::parse(source_path, source_contents)?;
36
37 let mut identifier_index = HashMap::<Key, usize>::with_capacity(805);
38 let mut bcp47_index = HashMap::<String, usize>::with_capacity(805);
39 let mut translations = Vec::<Vec<String>>::default();
40
41 data.into_iter()
42 .enumerate()
43 .for_each(|(identifier_index_key, (identifier, languages))| {
44 let mut languages_inner = Vec::<String>::default();
45
46 languages
47 .into_iter()
48 .enumerate()
49 .for_each(|(index, (bcp47_code, translation))| {
50 languages_inner.push(translation);
51 bcp47_index.insert(bcp47_code, index);
52 });
53
54 translations.push(languages_inner);
55
56 #[cfg(all(feature = "large_keys", not(feature = "small_keys")))]
57 let key = rapidhash::v3::rapidhash_v3(identifier.as_bytes());
58 #[cfg(all(feature = "small_keys", not(feature = "large_keys")))]
59 let key = identifier;
60
61 identifier_index.insert(key, identifier_index_key);
62 });
63
64 Ok(Self {
65 identifier_index,
66 bcp47_index,
67 translations,
68 })
69 }
70
71 pub fn get_translation(
73 &self,
74 identifier: &str,
75 bcp47_code: &str,
76 ) -> CountriesIso31661Result<String> {
77 #[cfg(all(feature = "large_keys", not(feature = "small_keys")))]
78 let identifier = &rapidhash::v3::rapidhash_v3(identifier.as_bytes());
79 #[cfg(all(feature = "small_keys", not(feature = "large_keys")))]
80 let identifier = identifier;
81
82 self.identifier_index
83 .get(identifier)
84 .map(|identifier_index| {
85 let bcp47_index = self.bcp47_index.get(bcp47_code).ok_or(
86 CountriesIso31661Error::Bcp47EntryNotFound {
87 identifier: identifier.to_string(),
88 bcp47_code: bcp47_code.to_string(),
89 },
90 )?;
91 let outcome = self.translations[*identifier_index][*bcp47_index].clone();
92
93 Ok::<String, CountriesIso31661Error>(outcome)
94 })
95 .transpose()?
96 .ok_or(CountriesIso31661Error::IdentifierNotFound(
97 identifier.to_string(),
98 ))
99 }
100
101 pub fn identifier_index(&self) -> &HashMap<Key, usize> {
103 &self.identifier_index
104 }
105
106 pub fn bcp47_index(&self) -> &HashMap<String, usize> {
107 &self.bcp47_index
108 }
109
110 pub fn translations(&self) -> &[Vec<String>] {
111 self.translations.as_slice()
112 }
113
114 pub fn parse(
116 source_path: &str,
117 input: &str,
118 ) -> CountriesIso31661Result<HashMap<String, Vec<(String, String)>>> {
119 let mut sentences: HashMap<String, Vec<(String, String)>> = HashMap::new();
120
121 let mut current_sentence: Option<String> = None;
122 let mut multiline_lang: Option<String> = None;
123 let mut multiline_buffer = String::new();
124
125 for raw_line in input.lines() {
126 let line = raw_line.trim();
127
128 if line.is_empty() {
129 continue;
130 }
131
132 if let Some(lang) = &multiline_lang {
134 if line.ends_with('"') {
135 multiline_buffer.push_str(line.trim_end_matches('"'));
136
137 if let Some(sentence) = ¤t_sentence {
138 sentences
139 .get_mut(sentence)
140 .ok_or(CountriesIso31661Error::InvalidLanguageEntryParsed {
141 source_path: source_path.to_string(),
142 line: line.to_string(),
143 })?
144 .push((lang.clone(), multiline_buffer.clone()));
145 }
146
147 multiline_buffer.clear();
148 multiline_lang = None;
149 } else {
150 multiline_buffer.push_str(line);
151 multiline_buffer.push('\n');
152 }
153
154 continue;
155 }
156
157 if line.starts_with("# ") {
159 let sentence = line.trim_start_matches("# ").to_string();
160 sentences.entry(sentence.clone()).or_default();
161 current_sentence = Some(sentence);
162 continue;
163 }
164
165 if let Some(eq_pos) = line.find('=') {
167 let lang = line[..eq_pos].trim().to_string();
168
169 let check_bc47_is_valid: crate::BC47LanguageInfo = lang.as_str().into();
170
171 if check_bc47_is_valid == crate::BC47LanguageInfo::UnsupportedLanguage {
172 return Err(CountriesIso31661Error::UnsupportedBcp47Code {
173 source_path: source_path.to_string(),
174 invalid_lang: lang,
175 });
176 }
177
178 let value = line[eq_pos + 1..].trim();
179
180 if value.starts_with('"') {
181 let content = value.trim_start_matches('"');
182
183 if content.ends_with('"') {
184 let final_value = content.trim_end_matches('"');
185
186 if let Some(sentence) = ¤t_sentence {
187 sentences
188 .get_mut(sentence)
189 .ok_or(CountriesIso31661Error::InvalidLanguageEntryParsed {
190 source_path: source_path.to_string(),
191 line: line.to_string(),
192 })?
193 .push((lang, final_value.to_string()));
194 }
195 } else {
196 multiline_lang = Some(lang);
197 multiline_buffer.push_str(content);
198 multiline_buffer.push('\n');
199 }
200 } else {
201 if let Some(sentence) = ¤t_sentence {
202 sentences
203 .get_mut(sentence)
204 .ok_or(CountriesIso31661Error::InvalidLanguageEntryParsed {
205 source_path: source_path.to_string(),
206 line: line.to_string(),
207 })?
208 .push((lang, value.to_string()));
209 }
210 }
211 }
212 }
213
214 Ok(sentences)
215 }
216}
217
/// A pair of text fields describing one translation.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Translation {
    // Presumably the BCP 47 language code of the entry; nothing in this file reads it.
    bcp47: String,
    // Presumably the text in that language; nothing in this file reads it.
    native: String,
}
223
#[cfg(test)]
mod sanity_checks {
    use crate::{CountriesIso31661Error, MultiLanguageTranslationMap};

    /// The well-formed fixture must load without an error.
    #[test]
    fn parse_correct_translations() {
        let source_path = "../../example_data/test-lang.bcp47";
        let source_contents = include_str!("../../example_data/test-lang.bcp47");

        let loaded = MultiLanguageTranslationMap::new(source_path, source_contents);

        assert!(loaded.is_ok());
    }

    /// The invalid fixture must be rejected because of the `en-USS` code.
    #[test]
    fn parse_incorrect_translations() {
        let source_path = "../../example_data/test-lang-invalid.bcp47";
        let source_contents = include_str!("../../example_data/test-lang-invalid.bcp47");

        let expected = CountriesIso31661Error::UnsupportedBcp47Code {
            source_path: source_path.to_string(),
            invalid_lang: "en-USS".to_string(),
        };
        let failure = MultiLanguageTranslationMap::new(source_path, source_contents).err();

        assert_eq!(failure, Some(expected));
    }
}