geocoder_abbreviations/
lib.rs
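
//! Geocoder abbreviation tokens: per-language JSON files under `./tokens/`
//! are embedded at compile time and exposed as parsed [`Token`]s through
//! [`config`].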

use rust_embed::RustEmbed;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use fancy_regex::Regex;

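/// Token definition files embedded into the binary at compile time.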
#[derive(RustEmbed)]
#[folder = "./tokens/"]
struct Tokens;
impl Tokens {
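    /// List every embedded language code, sorted; codes are derived from the
    /// `*.json` file names.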
    pub fn codes() -> Vec<String> {
        let mut codes: Vec<String> = Tokens::iter()
            .filter(|lang| lang.ends_with(".json"))
            .map(|lang| String::from(lang).replace(".json", ""))
            .collect();

        codes.sort();

        codes
    }

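    /// Load the embedded token file for a language code as a UTF-8 string.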
    pub fn import(lc: &str) -> Result<String, Error> {
        match Tokens::get(format!("{}.json", lc).as_str()) {
            Some(tokens) => match std::str::from_utf8(tokens.as_ref()) {
                Ok(tokens) => Ok(String::from(tokens)),
                _ => Err(Error::TokenFileImportNotSupported(lc.to_string()))
            },
            None => Err(Error::TokenFileImportNotSupported(lc.to_string()))
        }
    }
}

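/// Errors that can occur while loading or parsing token files.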
#[derive(Debug, PartialEq)]
pub enum Error {
    LanguageCodeNotSupported(String),
    TokenFileImportNotSupported(String),
    TokenTypeNotSupported(String),
    FancyRegexError
}

impl From<fancy_regex::Error> for Error {
    fn from(_error: fancy_regex::Error) -> Self {
        Error::FancyRegexError
    }
}

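/// Raw deserialization target mirroring the camelCase JSON schema; optional
/// fields stay `Option` here and are resolved to defaults in `Token::new`.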
#[derive(Deserialize, Debug, Clone)]
struct InToken {
    tokens: Vec<String>,
    full: String,
    canonical: String,
    note: Option<String>,
    #[serde(rename = "onlyCountries")]
    only_countries: Option<Vec<String>>,
    #[serde(rename = "onlyLayers")]
    only_layers: Option<Vec<String>>,
    #[serde(rename = "preferFull")]
    prefer_full: Option<bool>,
    regex: Option<bool>,
    #[serde(rename = "skipBoundaries")]
    skip_boundaries: Option<bool>,
    #[serde(rename = "skipDiacriticStripping")]
    skip_diacritic_stripping: Option<bool>,
    #[serde(rename = "spanBoundaries")]
    span_boundaries: Option<u8>,
    #[serde(rename = "type")]
    token_type: Option<String>,
}

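/// Public, fully-resolved form of a token entry: boolean flags default to
/// `false` and `full` is pre-compiled when the entry is a regex.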
pub struct Token {
    pub tokens: Vec<String>,
    pub full: Replacer,
    pub canonical: String,
    pub note: Option<String>,
    pub only_countries: Option<Vec<String>>,
    pub only_layers: Option<Vec<String>>,
    pub prefer_full: bool,
    pub regex: bool,
    pub skip_boundaries: bool,
    pub skip_diacritic_stripping: bool,
    pub span_boundaries: Option<u8>,
    pub token_type: Option<TokenType>,
}

impl Token {
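    /// Convert a raw `InToken`, compiling `full` into a regex when the entry
    /// sets `regex: true`.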
    fn new(input: InToken) -> Result<Self, Error> {
        Ok(Token {
            tokens: input.tokens,
            full: match input.regex {
                Some(true) => Replacer::Regex(Regex::new(&input.full)?),
                Some(false) | None => Replacer::String(input.full),
            },
            canonical: input.canonical,
            note: input.note,
            only_countries: input.only_countries,
            only_layers: input.only_layers,
            prefer_full: input.prefer_full.unwrap_or(false),
            regex: input.regex.unwrap_or(false),
            skip_boundaries: input.skip_boundaries.unwrap_or(false),
            skip_diacritic_stripping: input.skip_diacritic_stripping.unwrap_or(false),
            span_boundaries: input.span_boundaries,
            token_type: match input.token_type {
                None => None,
                Some(t) => Some(TokenType::from_str(&t)?)
            }
        })
    }
}

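/// The `full` side of a token: a plain string, or a compiled regex for
/// entries flagged with `regex: true`.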
pub enum Replacer {
    String(String),
    Regex(Regex)
}

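/// Semantic categories a token may carry via the JSON `type` field.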
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
pub enum TokenType {
    PostalBox,
    Cardinal,
    Number,
    Ordinal,
    Unit,
    Way,
    Determiner
}

impl TokenType {
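    /// Map a JSON `type` tag to its variant; unknown tags are rejected with
    /// `Error::TokenTypeNotSupported`.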
    fn from_str(s: &str) -> Result<TokenType, Error> {
        match s {
            "box" => Ok(TokenType::PostalBox),
            "cardinal" => Ok(TokenType::Cardinal),
            "number" => Ok(TokenType::Number),
            "ordinal" => Ok(TokenType::Ordinal),
            "unit" => Ok(TokenType::Unit),
            "way" => Ok(TokenType::Way),
            "determiner" => Ok(TokenType::Determiner),
            _ => Err(Error::TokenTypeNotSupported(s.to_string()))
        }
    }
}

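/// Build a map from language code to parsed tokens. An empty vector selects
/// every embedded language; an unknown code yields
/// `Error::LanguageCodeNotSupported`.
///
/// A minimal usage sketch (marked `ignore`; it assumes the crate is named
/// `geocoder_abbreviations`):
///
/// ```ignore
/// let tokens = geocoder_abbreviations::config(vec![String::from("en")]).unwrap();
/// assert!(tokens.contains_key("en"));
/// ```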
pub fn config(v: Vec<String>) -> Result<HashMap<String, Vec<Token>>, Error> {
    if v.is_empty() {
        return prepare(Tokens::codes());
    }
    let codes = Tokens::codes();
    for lc in &v {
        if !codes.contains(lc) {
            return Err(Error::LanguageCodeNotSupported(lc.to_string()));
        }
    }
    prepare(v)
}

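/// Deserialize the token file for each requested language code. Malformed
/// embedded JSON panics via `expect`, since that signals a broken build
/// rather than a caller error.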
fn prepare(v: Vec<String>) -> Result<HashMap<String, Vec<Token>>, Error> {
    let mut map = HashMap::new();
    for lc in &v {
        let parsed: Vec<InToken> = serde_json::from_str(Tokens::import(lc)?.as_str())
            .expect("unable to parse token JSON");
        let mut tokens = Vec::new();
        for tk in parsed {
            tokens.push(Token::new(tk)?);
        }
        map.insert(lc.clone(), tokens);
    }
    Ok(map)
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    #[test]
    fn test_config() {
        let lcs = config(vec![String::from("de"), String::from("en")]).unwrap();
        assert_eq!(lcs.len(), 2);
        assert!(lcs.contains_key("de"));
        assert!(lcs.contains_key("en"));

        let empty_lc = config(Vec::new()).unwrap();
        let every_lc = prepare(Tokens::codes()).unwrap();
        assert_eq!(empty_lc.len(), every_lc.len());
        for lc in Tokens::codes() {
            assert!(empty_lc.contains_key(&lc));
        }
    }

    #[test]
    #[should_panic(expected = "LanguageCodeNotSupported(\"zz\")")]
    fn fail_config() {
        config(vec![String::from("zz")]).unwrap();
    }

    #[test]
    fn test_all_lcs() {
        let mut fs_lcs = read_files();
        alphanumeric_sort::sort_str_slice(&mut fs_lcs);
        assert_eq!(Tokens::codes(), fs_lcs);
    }

    #[test]
    fn test_prepare() {
        let lcs = prepare(vec![String::from("de"), String::from("en")]).unwrap();
        assert_eq!(lcs.len(), 2);
        assert!(lcs.contains_key("de"));
        assert!(lcs.contains_key("en"));
    }

    #[test]
    #[should_panic(expected = "TokenFileImportNotSupported(\"zz\")")]
    fn fail_import() {
        Tokens::import("zz").unwrap();
    }

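    // Sketch: exercise TokenType::from_str directly with one supported tag
    // and one unsupported tag ("bogus" is an arbitrary placeholder, not a
    // real type in the token files).
    #[test]
    fn test_token_type_from_str() {
        assert_eq!(TokenType::from_str("way").unwrap(), TokenType::Way);
        assert_eq!(
            TokenType::from_str("bogus").unwrap_err(),
            Error::TokenTypeNotSupported(String::from("bogus"))
        );
    }
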
    #[test]
    fn test_token_values() {
        let map = config(Vec::new()).unwrap();

        for lc in map.values() {
            for tk in lc {
                assert!(!tk.tokens.is_empty());
                if let Some(l) = &tk.only_layers {
                    assert_eq!(l[0], "address");
                    assert_eq!(l.len(), 1);
                }
            }
        }
    }

    fn read_files() -> Vec<String> {
        let mut lcs = Vec::new();
        for entry in fs::read_dir("./tokens").unwrap() {
            let file_name = entry.unwrap().file_name().into_string().unwrap();
            // Match only true `*.json` files, mirroring the suffix check in
            // Tokens::codes(), so the two listings stay comparable.
            if let Some(lc) = file_name.strip_suffix(".json") {
                lcs.push(lc.to_owned());
            }
        }
        lcs
    }
}