address_formatter/
formatter.rs

1use crate::{Component, Place};
2use anyhow::{anyhow, Context, Error};
3use itertools::Itertools;
4use regex::{Regex, RegexBuilder};
5use std::collections::HashMap;
6use std::str::FromStr;
7use strum::IntoEnumIterator;
8
9const MULTILINE_TEMPLATE_NAME: &str = "multi_line";
10const SHORT_ADDR_TEMPLATE_NAME: &str = "short_addr";
11
12/// Represents a Regex and the value to replace the regex matches with
13#[derive(Debug, Clone)]
14pub(crate) struct Replacement {
15    pub regex: regex::Regex,
16    pub replacement_value: String,
17}
18
19/// Replacement rule
20/// a Replacement can be on all fields, or only one of them
21#[derive(Debug, Clone)]
22pub(crate) enum ReplaceRule {
23    All(Replacement),
24    Component((Component, Replacement)),
25}
26
27#[derive(Debug, Hash, Eq, PartialEq, Clone)]
28pub struct CountryCode(String); // TODO small string
29
30impl FromStr for CountryCode {
31    type Err = Error;
32
33    fn from_str(s: &str) -> Result<Self, Self::Err> {
34        if s.len() == 2 {
35            if s == "UK" {
36                Ok(CountryCode("GB".to_owned()))
37            } else {
38                Ok(CountryCode(s.to_uppercase()))
39            }
40        } else {
41            Err(anyhow!(
42                "{} is not a valid ISO3166-1:alpha2 country code",
43                s,
44            ))
45        }
46    }
47}
48
49impl CountryCode {
50    pub fn as_str(&self) -> &str {
51        self.0.as_str()
52    }
53}
54
55impl std::fmt::Display for CountryCode {
56    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
57        write!(f, "{}", self.0)
58    }
59}
60
61/// Represents a new field to add the a place
62#[derive(Debug, Clone)]
63pub(crate) struct NewComponent {
64    pub component: Component,
65    pub new_value: String,
66}
67
68/// The template handle the handlerbar template used to format a [`Place`](struct.Place.html)
69#[derive(Debug, Default)]
70pub(crate) struct Template {
71    /// Moustache template
72    pub handlebar_handler: handlebars::Handlebars<'static>,
73    place_template: String, // used only to clone the template
74}
75
// Extract the formatting rule of a short address (basically only the house
// number and the road) from a full place template.
//
// Not very elegant, but effective: the first line mentioning `house_number`
// is returned (trimmed), or `None` when no line mentions it.
fn compute_short_addr_template(place_template: &str) -> Option<String> {
    for line in place_template.split('\n') {
        if line.contains("house_number") {
            return Some(line.trim().to_owned());
        }
    }
    None
}
85
86impl Template {
87    pub fn new(place_template: &str) -> Self {
88        let mut template_engine = crate::handlebar_helper::new_template_engine();
89        template_engine
90            .register_template_string(MULTILINE_TEMPLATE_NAME, place_template)
91            .expect("impossible to build multi line template");
92
93        if let Some(short_addr_template) = compute_short_addr_template(place_template) {
94            template_engine
95                .register_template_string(SHORT_ADDR_TEMPLATE_NAME, &short_addr_template)
96                .expect("impossible to build short addr template");
97        }
98
99        Template {
100            place_template: place_template.to_owned(),
101            handlebar_handler: template_engine,
102        }
103    }
104}
105
106impl Clone for Template {
107    fn clone(&self) -> Self {
108        Self::new(self.place_template.as_str())
109    }
110}
111
112/// The `Rules` contains all the rules used to cleanup the placees
113/// Some of those rules are used as preformating rules (before changing the [`Place`](struct.Place.html)
114/// to a text with the handlebar template)
115/// And some of those rules are used as postformating rules, on the formatted text
116#[derive(Debug, Default, Clone)]
117pub(crate) struct Rules {
118    pub replace: Vec<ReplaceRule>,
119    pub postformat_replace: Vec<Replacement>,
120    pub change_country: Option<String>,
121    pub change_country_code: Option<String>,
122    /// Override the country
123    pub add_component: Option<NewComponent>,
124}
125
126#[derive(Debug)]
127pub(crate) struct Templates {
128    pub default_template: Template,
129    pub fallback_template: Template,
130    pub templates_by_country: HashMap<CountryCode, Template>,
131    pub rules_by_country: HashMap<CountryCode, Rules>,
132    pub fallback_templates_by_country: HashMap<CountryCode, Template>,
133    pub fallback_rules: Rules,
134}
135
136/// This [`Formatter`](struct.Formatter.html) holds all the configuration needed to format a [`Place`](struct.Place.html)
137/// to a nice text.
138///
139/// The main method is the `format` method, that takes a [`Place`](struct.Place.html)
140/// or something that can be converted to a [`Place`](struct.Place.html) and return a result with the formatted `String`
141///
142/// ```
143/// # #[macro_use] extern crate maplit;
144/// # fn main() {
145///    use address_formatter::Component::*;
146///    let formatter = address_formatter::Formatter::default();
147///
148///    let addr: address_formatter::Place = hashmap!(
149///        City => "Toulouse",
150///        Country => "France",
151///        CountryCode => "FR",
152///        County => "Toulouse",
153///        HouseNumber => "17",
154///        Neighbourhood => "Lafourguette",
155///        Postcode => "31000",
156///        Road => "Rue du Médecin-Colonel Calbairac",
157///        State => "Midi-Pyrénées",
158///        Suburb => "Toulouse Ouest",
159///    ).into();
160///
161///    assert_eq!(
162///        formatter.format(addr).unwrap(),
163///        r#"17 Rue du Médecin-Colonel Calbairac
164///31000 Toulouse
165///France
166///"#
167///        .to_owned()
168///    )
169/// # }
170///
171/// ```
172pub struct Formatter {
173    pub(crate) templates: Templates,
174    pub(crate) county_codes: HashMap<(CountryCode, String), String>,
175    pub(crate) state_codes: HashMap<(CountryCode, String), String>,
176    // country_to_lang: Vec<>,
177    // abbreviations: Vec<>,
178    // valid_replacement_components: Vec<>
179}
180
/// Options changing the behavior of a [`Formatter`](struct.Formatter.html)
#[derive(Debug, Default)]
pub struct Configuration {
    /// forces the use of a given country (the country code of the
    /// [`Place`](struct.Place.html) is then not used)
    pub country_code: Option<String>,
    /// use abbreviations in the formatted text (like "Avenue" to "Av.")
    pub abbreviate: Option<bool>,
}
189
190impl Default for Formatter {
191    /// Default constructor
192    fn default() -> Self {
193        crate::read_configuration::read_configuration()
194    }
195}
196
197impl Formatter {
198    /// make a human readable text from a [`Place`](struct.Place.html)
199    /// ```
200    /// # #[macro_use] extern crate maplit;
201    /// # fn main() {
202    ///    use address_formatter::Component::*;
203    ///    let formatter = address_formatter::Formatter::default();
204    ///
205    ///    let addr: address_formatter::Place = hashmap!(
206    ///        City => "Toulouse",
207    ///        Country => "France",
208    ///        CountryCode => "FR",
209    ///        County => "Toulouse",
210    ///        HouseNumber => "17",
211    ///        Neighbourhood => "Lafourguette",
212    ///        Postcode => "31000",
213    ///        Road => "Rue du Médecin-Colonel Calbairac",
214    ///        State => "Midi-Pyrénées",
215    ///        Suburb => "Toulouse Ouest",
216    ///    ).into();
217    ///
218    ///    assert_eq!(
219    ///        formatter.format(addr).unwrap(),
220    ///        r#"17 Rue du Médecin-Colonel Calbairac
221    ///31000 Toulouse
222    ///France
223    ///"#
224    ///        .to_owned()
225    ///    )
226    /// # }
227    /// ```
228    pub fn format(&self, into_addr: impl Into<Place>) -> Result<String, Error> {
229        self.format_with_config(into_addr.into(), Configuration::default())
230    }
231
232    /// make a human readable text from a [`Place`](struct.Place.html)
233    /// Same as the [`format`](struct.Formatter.html#method.format) method,
234    /// but with a [`Configuration`](address_formatter::formatter::Configuration) object
235    pub fn format_with_config(
236        &self,
237        into_addr: impl Into<Place>,
238        conf: Configuration,
239    ) -> Result<String, Error> {
240        let mut addr = into_addr.into();
241        let country_code = self.find_country_code(&mut addr, conf);
242
243        sanity_clean_place(&mut addr);
244
245        let template = self.find_template(&addr, &country_code);
246        let rules = country_code
247            .as_ref()
248            .and_then(|c| self.templates.rules_by_country.get(c))
249            .unwrap_or(&self.templates.fallback_rules);
250
251        self.preformat(rules, &mut addr);
252
253        let text = template
254            .handlebar_handler
255            .render(MULTILINE_TEMPLATE_NAME, &addr)
256            .context("impossible to render template")?;
257
258        let text = cleanup_rendered(&text, rules);
259
260        Ok(text)
261    }
262
263    /// make a human readable short text on 1 line with only the address [`Place`](struct.Place.html)
264    /// There is basically only the housenumber and the road
265    /// ```
266    /// # #[macro_use] extern crate maplit;
267    /// # fn main() {
268    ///    use address_formatter::Component::*;
269    ///    let formatter = address_formatter::Formatter::default();
270    ///
271    ///    let addr: address_formatter::Place = hashmap!(
272    ///        City => "Toulouse",
273    ///        Country => "France",
274    ///        CountryCode => "FR",
275    ///        County => "Toulouse",
276    ///        HouseNumber => "17",
277    ///        Neighbourhood => "Lafourguette",
278    ///        Postcode => "31000",
279    ///        Road => "Rue du Médecin-Colonel Calbairac",
280    ///        State => "Midi-Pyrénées",
281    ///        Suburb => "Toulouse Ouest",
282    ///    ).into();
283    ///
284    ///    assert_eq!(
285    ///        formatter.short_addr_format(addr).unwrap(),
286    ///        r#"17 Rue du Médecin-Colonel Calbairac"#
287    ///        .to_owned()
288    ///    )
289    /// # }
290    /// ```
291    pub fn short_addr_format(&self, into_addr: impl Into<Place>) -> Result<String, Error> {
292        self.short_addr_format_with_config(into_addr.into(), Configuration::default())
293    }
294
295    /// make a human readable short text on 1 line with only the address [`Place`](struct.Place.html)
296    /// Same as the [`short_addr_format`](struct.Formatter.html#method.short_addr_format) method,
297    /// but with a [`Configuration`](address_formatter::formatter::Configuration) object
298    pub fn short_addr_format_with_config(
299        &self,
300        into_addr: impl Into<Place>,
301        conf: Configuration,
302    ) -> Result<String, Error> {
303        let mut addr = into_addr.into();
304        let country_code = self.find_country_code(&mut addr, conf);
305
306        let template = self.find_template(&addr, &country_code);
307
308        let text = template
309            .handlebar_handler
310            .render(SHORT_ADDR_TEMPLATE_NAME, &addr)
311            .context("impossible to render short address template")?;
312
313        let text = text.trim().to_owned();
314        Ok(text)
315    }
316
317    fn find_country_code(&self, addr: &mut Place, conf: Configuration) -> Option<CountryCode> {
318        let mut country_code = conf
319            .country_code
320            .or_else(|| addr[Component::CountryCode].clone())
321            .and_then(|s| {
322                CountryCode::from_str(&s)
323                    .map_err(|e| log::info!("impossible to find a country: {}", e))
324                    .ok()
325            });
326
327        // we hardcode some country code values
328        if country_code == CountryCode::from_str("NL").ok() {
329            if let Some(state) = addr[Component::State].clone() {
330                if state.as_str() == "Curaçao" {
331                    country_code = CountryCode::from_str("CW").ok();
332                    addr[Component::Country] = Some("Curaçao".to_owned());
333                }
334                let state = state.to_lowercase();
335
336                if state.as_str() == "sint maarten" {
337                    country_code = CountryCode::from_str("SX").ok();
338                    addr[Component::Country] = Some("Sint Maarten".to_owned());
339                } else if state.as_str() == "aruba" {
340                    country_code = CountryCode::from_str("AW").ok();
341                    addr[Component::Country] = Some("Aruba".to_owned());
342                }
343            }
344        }
345
346        country_code
347    }
348
349    fn find_template(&self, addr: &Place, country_code: &Option<CountryCode>) -> &Template {
350        country_code
351            .as_ref()
352            .and_then(|c| {
353                if !has_minimum_place_components(addr) {
354                    // if the place does not have the minimum fields, we get its country fallback template
355                    // if there is a specific one, else we get the default fallback template
356                    self.templates
357                        .fallback_templates_by_country
358                        .get(c)
359                        .or(Some(&self.templates.fallback_template))
360                } else {
361                    self.templates.templates_by_country.get(c)
362                }
363            })
364            .unwrap_or(&self.templates.default_template)
365    }
366
367    fn preformat(&self, rules: &Rules, addr: &mut Place) {
368        for r in &rules.replace {
369            r.replace_fields(addr);
370        }
371
372        // in some cases, we need to add some components
373        if let Some(add_component) = &rules.add_component {
374            addr[add_component.component] = Some(add_component.new_value.clone());
375        }
376        if let Some(change_country) = &rules.change_country {
377            addr[Component::Country] = Some(change_country.clone());
378        }
379        if let Some(change_country_code) = &rules.change_country_code {
380            addr[Component::CountryCode] = Some(change_country_code.clone());
381        }
382
383        // we also try to find the state_code/county_code
384        if let Some(country) = addr[Component::CountryCode]
385            .as_ref()
386            .and_then(|c| CountryCode::from_str(c).ok())
387        {
388            if addr[Component::StateCode].is_none() {
389                // we try to see if we can use the state_code and the reference table 'state_codes.yaml' to find the state
390                if let Some(state) = &addr[Component::State] {
391                    if let Some(new_state) = self
392                        .state_codes
393                        .get(&(country.clone(), state.to_string()))
394                        .cloned()
395                    {
396                        addr[Component::StateCode] = Some(new_state);
397                    }
398                }
399            }
400
401            if addr[Component::CountyCode].is_none() {
402                // same for county
403                if let Some(county) = &addr[Component::County] {
404                    if let Some(new_county) = self
405                        .county_codes
406                        .get(&(country, county.to_string()))
407                        .cloned()
408                    {
409                        addr[Component::County] = Some(new_county);
410                    }
411                }
412            }
413        }
414    }
415}
416
417/// Build [`Place`](struct.Place.html) from a less structured input (like placees from [Nominatim](https://github.com/openstreetmap/Nominatim))
418///
419/// It applies aliases rules to fill the [`Place`](struct.Place.html)'s fields as good as possible.
420pub struct PlaceBuilder {
421    pub(crate) component_aliases: HashMap<Component, Vec<String>>,
422}
423
424impl Default for PlaceBuilder {
425    fn default() -> Self {
426        crate::read_configuration::read_place_builder_configuration()
427    }
428}
429
430impl PlaceBuilder {
431    /// Build a [`Place`](struct.Place.html)(crate::Place) from an unstructed source (like Nominatim output)
432    pub fn build_place<'a>(&self, values: impl IntoIterator<Item = (&'a str, String)>) -> Place {
433        let mut place = Place::default();
434        let mut unknown = HashMap::<String, String>::new();
435        for (k, v) in values.into_iter() {
436            let component = Component::from_str(k).ok();
437            if let Some(component) = component {
438                place[component] = Some(v);
439            } else {
440                unknown.insert(k.to_string(), v);
441            }
442        }
443
444        // all the unknown fields are added in the 'Attention' field
445        if !unknown.is_empty() {
446            for (c, aliases) in &self.component_aliases {
447                // if the place's component has not been already set, we set it to its first found alias
448                for alias in aliases {
449                    if let Some(a) = unknown.remove(alias) {
450                        if place[*c].is_none() {
451                            place[*c] = Some(a);
452                        }
453                    }
454                }
455            }
456            place[Component::Attention] = Some(unknown.values().join(", "));
457        }
458
459        // hardocded cleanup for some bad country data
460        if let (Some(state), Some(country)) = (&place[Component::State], &place[Component::Country])
461        {
462            if country.parse::<usize>().is_ok() {
463                place[Component::Country] = Some(state.clone());
464                place[Component::State] = None;
465            }
466        }
467        place
468    }
469}
470
471fn sanity_clean_place(addr: &mut Place) {
472    lazy_static::lazy_static! {
473        static ref POST_CODE_RANGE: Regex = Regex::new(r#"\d+;\d+"#).unwrap();
474        static ref MATCHABLE_POST_CODE_RANGE: Regex = Regex::new(r#"^(\d{5}),\d{5}"#).unwrap();
475        static ref IS_URL: Regex= Regex::new(r#"https?://"#).unwrap();
476
477    }
478    // cleanup the postcode
479    if let Some(post_code) = &addr[Component::Postcode] {
480        if post_code.len() > 20 || POST_CODE_RANGE.is_match(post_code) {
481            addr[Component::Postcode] = None;
482        } else if let Some(r) = MATCHABLE_POST_CODE_RANGE
483            .captures(post_code)
484            .and_then(|r| r.get(1))
485            .map(|c| c.as_str())
486        {
487            addr[Component::Postcode] = Some(r.to_owned());
488        }
489    }
490
491    // clean values containing URLs
492    for c in Component::iter() {
493        if let Some(v) = &addr[c] {
494            if IS_URL.is_match(v) {
495                addr[c] = None;
496            }
497        }
498    }
499}
500
501fn cleanup_rendered(text: &str, rules: &Rules) -> String {
502    lazy_static::lazy_static! {
503        static ref REPLACEMENTS:  [(Regex, &'static str); 12]= [
504            (RegexBuilder::new(r"[},\s]+$").multi_line(true).build().unwrap(), ""),
505            (RegexBuilder::new(r"^ - ").multi_line(true).build().unwrap(), ""), // line starting with dash due to a parameter missing
506            (RegexBuilder::new(r"^[,\s]+").multi_line(true).build().unwrap(), ""),
507            (RegexBuilder::new(r",\s*,").multi_line(true).build().unwrap(), ", "), //multiple commas to one
508            (RegexBuilder::new(r"[\t\p{Zs}]+,[\t\p{Zs}]+").multi_line(true).build().unwrap(), ", "), //one horiz whitespace behind comma
509            (RegexBuilder::new(r"[\t ][\t ]+").multi_line(true).build().unwrap(), " "), //multiple horiz whitespace to one
510            (RegexBuilder::new(r"[\t\p{Zs}]\n").multi_line(true).build().unwrap(), "\n"), //horiz whitespace, newline to newline
511            (RegexBuilder::new(r"\n,").multi_line(true).build().unwrap(), "\n"), //newline comma to just newline
512            (RegexBuilder::new(r",,+").multi_line(true).build().unwrap(), ","), //multiple commas to one
513            (RegexBuilder::new(r",\n").multi_line(true).build().unwrap(), "\n"), //comma newline to just newline
514            (RegexBuilder::new(r"\n[\t\p{Zs}]+").multi_line(true).build().unwrap(), "\n"), //newline plus space to newline
515            (RegexBuilder::new(r"\n\n+").multi_line(true).build().unwrap(), "\n"), //multiple newline to one
516        ];
517
518        static ref FINAL_CLEANUP:  [(Regex, &'static str); 2]= [
519            (Regex::new(r"^\s+").unwrap(), ""), //remove leading whitespace
520            (Regex::new(r"\s+$").unwrap(), ""), //remove end whitespace
521        ];
522    }
523
524    let mut res = text.to_owned();
525
526    for (rgx, new_val) in REPLACEMENTS.iter() {
527        let rep = rgx.replace_all(&res, *new_val);
528        // to improve performance, we update the string only if it was changed by the replace
529        match rep {
530            std::borrow::Cow::Borrowed(_) => {}
531            std::borrow::Cow::Owned(v) => {
532                res = v;
533            }
534        }
535    }
536
537    for r in &rules.postformat_replace {
538        let rep = r.regex.replace_all(&res, r.replacement_value.as_str());
539        match rep {
540            std::borrow::Cow::Borrowed(_) => {}
541            std::borrow::Cow::Owned(v) => {
542                res = v;
543            }
544        }
545    }
546
547    // we also dedup the string
548    // we dedup and trim and all the same 'token' in a line
549    // and all the same lines too
550    let mut res = res
551        .split('\n')
552        .map(|s| s.split(", ").map(|e| e.trim()).dedup().join(", "))
553        .dedup()
554        .join("\n");
555
556    for (rgx, new_val) in FINAL_CLEANUP.iter() {
557        let rep = rgx.replace(&res, *new_val);
558        match rep {
559            std::borrow::Cow::Borrowed(_) => {}
560            std::borrow::Cow::Owned(v) => {
561                res = v;
562            }
563        }
564    }
565
566    let res = res.trim();
567    format!("{}\n", res) //add final newline
568}
569
570fn has_minimum_place_components(addr: &Place) -> bool {
571    // if there are neither 'road' nor 'postcode', we consider that there are not enough data
572    // and use the fallback template
573    addr[Component::Road].is_some() || addr[Component::Postcode].is_some()
574}
575
576impl ReplaceRule {
577    fn replace_fields(&self, addr: &mut Place) {
578        match self {
579            ReplaceRule::All(replace_rule) => {
580                for c in Component::iter() {
581                    if let Some(v) = &addr[c] {
582                        addr[c] = Some(
583                            replace_rule
584                                .regex
585                                .replace(v, replace_rule.replacement_value.as_str())
586                                .to_string(),
587                        );
588                    }
589                }
590            }
591            ReplaceRule::Component((c, replace_rule)) => {
592                if let Some(v) = &addr[*c] {
593                    addr[*c] = Some(
594                        replace_rule
595                            .regex
596                            .replace(v, replace_rule.replacement_value.as_str())
597                            .to_string(),
598                    );
599                }
600            }
601        }
602    }
603}