geocoder_abbreviations/lib.rs
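
//! Embedded geocoder abbreviation tokens, keyed by language code.
//!
//! A minimal usage sketch (assuming this file is the root of a crate named
//! `geocoder_abbreviations`, as the path above suggests):
//!
//! ```no_run
//! // Load every embedded language, or restrict to specific codes.
//! let _all = geocoder_abbreviations::config(Vec::new()).unwrap();
//! let en = geocoder_abbreviations::config(vec![String::from("en")]).unwrap();
//! assert!(en.contains_key("en"));
//! ```
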
use rust_embed::RustEmbed;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use fancy_regex::Regex;

#[derive(RustEmbed)]
#[folder = "./tokens/"]
struct Tokens;

impl Tokens {
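    /// List the embedded language codes: the `.json` file stems from the
    /// `tokens/` folder, sorted alphabetically.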
    pub fn codes() -> Vec<String> {
        let mut codes: Vec<String> = Tokens::iter()
            .filter(|lang| lang.contains(".json"))
            .map(|lang| String::from(lang).replace(".json", ""))
            .collect();

        codes.sort();

        codes
    }

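    /// Return the raw JSON for one language code (e.g. `"en"`) from the
    /// embedded folder, or an error if the file is missing or not UTF-8.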
    pub fn import(lc: &str) -> Result<String, Error> {
        match Tokens::get(format!("{}.json", lc).as_str()) {
            Some(tokens) => match std::str::from_utf8(tokens.as_ref()) {
                Ok(tokens) => Ok(String::from(tokens)),
                _ => Err(Error::TokenFileImportNotSupported(lc.to_string())),
            },
            None => Err(Error::TokenFileImportNotSupported(lc.to_string())),
        }
    }
}

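/// Errors that can occur while loading or validating token definitions.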
#[derive(Debug, PartialEq)]
pub enum Error {
    LanguageCodeNotSupported(String),
    TokenFileImportNotSupported(String),
    TokenTypeNotSupported(String),
    FancyRegexError,
}

impl From<fancy_regex::Error> for Error {
    fn from(_error: fancy_regex::Error) -> Self {
        Error::FancyRegexError
    }
}

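/// A token definition exactly as it appears in the JSON files; converted
/// into the public `Token` type by `Token::new`.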
#[derive(Deserialize, Debug, Clone)]
struct InToken {
    tokens: Vec<String>,
    full: String,
    canonical: String,
    note: Option<String>,
    #[serde(rename = "onlyCountries")]
    only_countries: Option<Vec<String>>,
    #[serde(rename = "onlyLayers")]
    only_layers: Option<Vec<String>>,
    #[serde(rename = "preferFull")]
    prefer_full: Option<bool>,
    regex: Option<bool>,
    #[serde(rename = "skipBoundaries")]
    skip_boundaries: Option<bool>,
    #[serde(rename = "skipDiacriticStripping")]
    skip_diacritic_stripping: Option<bool>,
    #[serde(rename = "spanBoundaries")]
    span_boundaries: Option<u8>,
    #[serde(rename = "type")]
    token_type: Option<String>,
}

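/// A validated token definition: the `full` replacement is pre-compiled
/// when it is a regex, and optional boolean flags default to `false`.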
pub struct Token {
    pub tokens: Vec<String>,
    pub full: Replacer,
    pub canonical: String,
    pub note: Option<String>,
    pub only_countries: Option<Vec<String>>,
    pub only_layers: Option<Vec<String>>,
    pub prefer_full: bool,
    pub regex: bool,
    pub skip_boundaries: bool,
    pub skip_diacritic_stripping: bool,
    pub span_boundaries: Option<u8>,
    pub token_type: Option<TokenType>,
}

impl Token {
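    /// Validate an `InToken`, compiling `full` into a `Regex` when the
    /// `regex` flag is set and rejecting unknown `type` values.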
    fn new(input: InToken) -> Result<Self, Error> {
        Ok(Token {
            tokens: input.tokens,
            full: match input.regex {
                Some(true) => Replacer::Regex(Regex::new(&input.full)?),
                Some(false) | None => Replacer::String(input.full),
            },
            canonical: input.canonical,
            note: input.note,
            only_countries: input.only_countries,
            only_layers: input.only_layers,
            prefer_full: input.prefer_full.unwrap_or(false),
            regex: input.regex.unwrap_or(false),
            skip_boundaries: input.skip_boundaries.unwrap_or(false),
            skip_diacritic_stripping: input.skip_diacritic_stripping.unwrap_or(false),
            span_boundaries: input.span_boundaries,
            token_type: match input.token_type {
                Some(t) => Some(TokenType::from_str(&t)?),
                None => None,
            },
        })
    }
}

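/// The full form of a token: either a literal string or a compiled
/// regular expression.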
pub enum Replacer {
    String(String),
    Regex(Regex),
}

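/// The grammatical category of a token, parsed from the JSON `type` field
/// by `TokenType::from_str`.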
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
pub enum TokenType {
    PostalBox,
    Cardinal,
    Number,
    Ordinal,
    Unit,
    Way,
    Determiner,
}

impl TokenType {
    fn from_str(s: &str) -> Result<TokenType, Error> {
        match s {
            "box" => Ok(TokenType::PostalBox),
            "cardinal" => Ok(TokenType::Cardinal),
            "number" => Ok(TokenType::Number),
            "ordinal" => Ok(TokenType::Ordinal),
            "unit" => Ok(TokenType::Unit),
            "way" => Ok(TokenType::Way),
            "determiner" => Ok(TokenType::Determiner),
            _ => Err(Error::TokenTypeNotSupported(s.to_string())),
        }
    }
}

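/// Build a map from language code to validated tokens. An empty `v`
/// selects every embedded language; otherwise each requested code must be
/// one of `Tokens::codes()`.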
pub fn config(v: Vec<String>) -> Result<HashMap<String, Vec<Token>>, Error> {
    if v.is_empty() {
        return prepare(Tokens::codes());
    }
    for lc in &v {
        if !Tokens::codes().contains(lc) {
            return Err(Error::LanguageCodeNotSupported(lc.to_string()));
        }
    }
    prepare(v)
}

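/// Parse the embedded JSON for each language code and validate every
/// token, panicking if a token file is not valid JSON.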
fn prepare(v: Vec<String>) -> Result<HashMap<String, Vec<Token>>, Error> {
    let mut map = HashMap::new();
    for lc in &v {
        let parsed: Vec<InToken> = serde_json::from_str(Tokens::import(lc)?.as_str())
            .expect("unable to parse token JSON");
        let mut tokens = Vec::new();
        for tk in parsed {
            tokens.push(Token::new(tk)?);
        }
        map.insert(lc.clone(), tokens);
    }
    Ok(map)
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    #[test]
    fn test_config() {
        let lcs = config(vec![String::from("de"), String::from("en")]).unwrap();
        assert_eq!(lcs.len(), 2);
        assert!(lcs.contains_key("de"));
        assert!(lcs.contains_key("en"));

        let empty_lc = config(Vec::new()).unwrap();
        let every_lc = prepare(Tokens::codes()).unwrap();
        assert_eq!(empty_lc.len(), every_lc.len());
        for lc in Tokens::codes() {
            assert!(empty_lc.contains_key(&lc));
        }
    }

    #[test]
    #[should_panic(expected = "LanguageCodeNotSupported(\"zz\")")]
    fn fail_config() {
        config(vec![String::from("zz")]).unwrap();
    }

    #[test]
    fn test_all_lcs() {
        let mut fs_lcs = read_files();
        alphanumeric_sort::sort_str_slice(&mut fs_lcs);
        assert_eq!(Tokens::codes(), fs_lcs);
    }

    #[test]
    fn test_prepare() {
        let lcs = prepare(vec![String::from("de"), String::from("en")]).unwrap();
        assert_eq!(lcs.len(), 2);
        assert!(lcs.contains_key("de"));
        assert!(lcs.contains_key("en"));
    }

    #[test]
    #[should_panic(expected = "TokenFileImportNotSupported(\"zz\")")]
    fn fail_import() {
        Tokens::import("zz").unwrap();
    }

    #[test]
    fn test_token_values() {
        let map = config(Vec::new()).unwrap();

        for lc in map.values() {
            for tk in lc {
                assert!(!tk.tokens.is_empty());
                if let Some(l) = &tk.only_layers {
                    assert_eq!(l[0], "address");
                    assert_eq!(l.len(), 1);
                }
            }
        }
    }

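    /// Collect language codes from the `*.json` files in `./tokens` on
    /// disk, to compare against the embedded `Tokens::codes()` list.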
    fn read_files() -> Vec<String> {
        let mut lcs = Vec::new();
        for entry in fs::read_dir("./tokens").unwrap() {
            let file_name = entry.unwrap().file_name().into_string().unwrap();
            let file_components: Vec<&str> = file_name.split('.').collect();
            if file_components[1] == "json" {
                lcs.push(file_components[0].to_owned());
            }
        }
        lcs
    }
}