Skip to main content

postcode_extractor/
lib.rs

1use std::collections::BTreeMap;
2
3use include_dir::{Dir, include_dir};
4use once_cell::sync::Lazy;
5use regex::{Match, Regex};
6
7mod country;
8mod json_models;
9
10use crate::json_models::{PositionLogic, RegexJson};
11
12pub use country::{
13    Country, FIVE_DIGIT_ADDITIONAL_NATIONS, FIVE_DIGIT_NATIONS, FIVE_DIGIT_WITH_SPACE_NATIONS,
14    FOUR_DIGIT_NATIONS, SIX_DIGIT_NATIONS, UNIQUE_COUNTRIES,
15};
16
17static TEMPLATES_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/postalcode-extractor/regex");
18
19struct RegexWrapper {
20    pub regex: Regex,
21    pub position_logic: PositionLogic,
22}
23
24static ALL: Lazy<BTreeMap<Country, RegexWrapper>> = Lazy::new(|| {
25    let mut m = BTreeMap::new();
26    for file in TEMPLATES_DIR.files() {
27        let text = file.contents_utf8().expect("Must be UTF8");
28        let regex_json: RegexJson = serde_json::from_str(text).unwrap();
29
30        assert!(regex_json.regex.position_logic.position >= 0.0);
31        assert!(regex_json.regex.position_logic.position <= 1.0);
32
33        let regex = Regex::new(&regex_json.regex.engines.rust).unwrap();
34
35        let regex_wrapper = RegexWrapper {
36            regex: regex,
37            position_logic: regex_json.regex.position_logic.clone(),
38        };
39
40        m.insert(regex_json.country.clone(), regex_wrapper);
41    }
42    m
43});
44
/// A postcode extracted from a piece of text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PostcodeHolder {
    /// Text captured by the pattern's `postcode` group.
    pub base: String,
    /// Text captured by the pattern's `postcode_additional` group. Only ever
    /// populated for countries listed in `FIVE_DIGIT_ADDITIONAL_NATIONS`, and
    /// only when that group took part in the match.
    pub additional: Option<String>,
}
50
51#[derive(Debug, Clone)]
52pub struct PostcodeWrapper {
53    pub country: Country,
54    pub postcode: PostcodeHolder,
55}
56
/// Errors reported by the postcode evaluation functions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PostcodeError {
    /// No postcode pattern is loaded for the requested country.
    UnsupportedCountry,
}

impl std::fmt::Display for PostcodeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            PostcodeError::UnsupportedCountry => {
                f.write_str("no postcode pattern is available for the requested country")
            }
        }
    }
}

// Implementing `Error` lets callers propagate this with `?` into
// `Box<dyn std::error::Error>` and similar error containers.
impl std::error::Error for PostcodeError {}
61
62/// Used to parse a postcode from an address of an already known country
63///
64/// Use `check_position` to control whether you are evaluating a whole address
65/// or a single postcode. For example
66///
67/// - `check_position = true`: "15 Main Road, EN35 0RS" will find "EN35 0RS"
68/// - `check_position = false`: "EN35 0RS" will find "EN35 0RS"
69///  
70///
71pub fn evaluate_single_country(
72    haystack: &str,
73    country: Country,
74    check_position: bool,
75) -> Result<Option<PostcodeHolder>, PostcodeError> {
76    let regex = ALL.get(&country).ok_or(PostcodeError::UnsupportedCountry)?;
77
78    let postalcode_captures = regex.regex.captures_iter(haystack);
79
80    // TODO: Sometimes we will need to take the first (e.g. Korea, Japan).
81    // The position logic struct can probably take care of that
82    let captures = match postalcode_captures.last() {
83        Some(x) => x,
84        None => return Ok(None),
85    };
86
87    let best_match = match captures.name("postcode") {
88        Some(x) => x,
89        None => return Ok(None),
90    };
91
92    if check_position {
93        if !check_positions(haystack, &best_match, &regex.position_logic) {
94            return Ok(None);
95        }
96    }
97
98    let additional = if FIVE_DIGIT_ADDITIONAL_NATIONS.contains(&country) {
99        if let Some(add_match) = captures.name("postcode_additional") {
100            Some(add_match.as_str().to_string())
101        } else {
102            None
103        }
104    } else {
105        None
106    };
107
108    let best_match = best_match.as_str().to_string();
109    Ok(Some(PostcodeHolder {
110        base: best_match,
111        additional: additional,
112    }))
113}
114
115fn check_positions(haystack: &str, mat: &Match, position_logic: &PositionLogic) -> bool {
116    let hay_len = haystack.chars().count() as f64;
117    let match_char_idex = haystack[..mat.start()].chars().count() as f64;
118
119    let func = position_logic.operation.as_function();
120
121    return func(match_char_idex / hay_len, position_logic.position);
122}
123
124/// Will attempt to parse a postcode from an address or a single postcode, and
125/// determine which country it comes from.
126///
127/// Use `check_position` to control whether you are evaluating a whole address
128/// or a single postcode. For example
129///
130/// - `check_position = true`: "15 Main Road, EN35 0RS" will find "EN35 0RS" and identify the country as United Kingdom
131/// - `check_position = false`: "EN35 0RS" will find "EN35 0RS" and identify the country as United Kingdom
132///
133/// Note that some countries can not be uniquely identified, for example, if
134/// they use a five digit postcode
135pub fn evaluate_all_countries(
136    haystack: &str,
137    check_position: bool,
138) -> Result<Option<PostcodeWrapper>, PostcodeError> {
139    for country in UNIQUE_COUNTRIES.iter().cloned() {
140        if let Ok(Some(pc)) = evaluate_single_country(haystack, country.clone(), check_position) {
141            return Ok(Some(PostcodeWrapper {
142                country: country,
143                postcode: pc,
144            }));
145        }
146    }
147
148    // 5 Digits but with a space
149    // We MIGHT be able to say it's Czechia if they include the CZ-
150    if let Ok(Some(pc)) = evaluate_single_country(haystack, Country::CZ, check_position) {
151        if pc.base.contains("CZ-") {
152            return Ok(Some(PostcodeWrapper {
153                country: Country::CZ,
154                postcode: pc,
155            }));
156        }
157    }
158
159    // We MIGHT be able to say it's Greece if they include the GR-
160    if let Ok(Some(pc)) = evaluate_single_country(haystack, Country::GR, check_position) {
161        if pc.base.contains("GR-") {
162            return Ok(Some(PostcodeWrapper {
163                country: Country::GR,
164                postcode: pc,
165            }));
166        }
167    }
168
169    // Generic 5 digit with space
170    if let Ok(Some(pc)) =
171        evaluate_single_country(haystack, Country::Unknown5DigitSpace, check_position)
172    {
173        return Ok(Some(PostcodeWrapper {
174            country: Country::Unknown5DigitSpace,
175            postcode: pc,
176        }));
177    }
178
179    
180    // USA and Saudi Arabia Special Case
181    if let Ok(Some(pc)) = evaluate_single_country(haystack, Country::US, check_position) {
182        if pc.additional.is_some() {
183            return Ok(Some(PostcodeWrapper {
184                country: Country::Unknown5DigitAdditional,
185                postcode: pc,
186            }));
187        }
188    }
189
190    // Six digit
191    if let Ok(Some(pc)) = evaluate_single_country(haystack, Country::Unknown6Digit, check_position)
192    {
193        return Ok(Some(PostcodeWrapper {
194            country: Country::Unknown6Digit,
195            postcode: pc,
196        }));
197    }
198
199    // Taiwan HACK
200    // only if we check position
201    if check_position {
202        if let Ok(Some(pc)) = evaluate_single_country(haystack, Country::TW, check_position) {
203            return Ok(Some(PostcodeWrapper {
204                country: Country::TW,
205                postcode: pc,
206            }));
207        }
208    }
209
210    // 5 digit
211
212    if let Ok(Some(pc)) = evaluate_single_country(haystack, Country::Unknown5Digit, check_position)
213    {
214        return Ok(Some(PostcodeWrapper {
215            country: Country::Unknown5Digit,
216            postcode: pc,
217        }));
218    }
219
220    // Cyprus special case
221    if let Ok(Some(pc)) = evaluate_single_country(haystack, Country::CY, check_position) {
222        if pc.base.contains("CY-") {
223            return Ok(Some(PostcodeWrapper {
224                country: Country::CY,
225                postcode: pc,
226            }));
227        }
228    }
229
230    // Luxembourg special case with the L-
231    if let Ok(Some(pc)) = evaluate_single_country(haystack, Country::LU, check_position) {
232        if pc.base.starts_with("L-") {
233            return Ok(Some(PostcodeWrapper { country: Country::LU, postcode: pc }))
234        }
235    }
236
237    // 4 digit
238
239    if let Ok(Some(pc)) = evaluate_single_country(haystack, Country::Unknown4Digit, check_position)
240    {
241        return Ok(Some(PostcodeWrapper {
242            country: Country::Unknown4Digit,
243            postcode: pc,
244        }));
245    }
246    return Ok(None);
247}