libpostal_rust/
lib.rs

1//! `libpostal-rust`: A high-level, threadsafe wrapper around `libpostal`
2//!
3//! The open source C library
4//! [`libpostal`](https://github.com/openvenues/libpostal) provides support for
5//! parsing and normalizing addresses using an external language model trained
6//! on addresses around the world. We provide a high-level Rust wrapper around
7//! that library, in a way that it can be linked into your main Rust binary.
8//!
9//! Note that you will need to use `libpostal_data` (included with `libpostal`)
10//! to download and install about 2GB of language model data:
11//!
12//! ```sh
13//! sudo libpostal_data download all /usr/local/share/libpostal
14//! ```
15//!
16//! Once this is done, you can parse addresses as follows:
17//!
18//! ```no_run
19//! use libpostal_rust::{ParseAddressOptions, parse_address};
20//!
21//! let addr = "781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA";
22//! let opt = ParseAddressOptions::default();
23//! let parsed = parse_address(addr, &opt).unwrap();
24//! assert_eq!(parsed.get("state"), Some(&"ny".to_owned()));
25//! ```
26//!
27//! You can turn `parsed` back into a nicely-formatted address (almost anywhere
28//! in the world) by using
29//! [`address-formatter`](https://crates.io/crates/address-formatter)'s support
30//! for OpenCage address templates.
31
32use std::{
33    collections::HashMap,
34    ffi::{CStr, CString},
35    ops::DerefMut,
36};
37
38use init::{
39    initialize_libpostal, initialize_libpostal_language_classifier,
40    initialize_libpostal_parser,
41};
42use libpostal_sys::{
43    libpostal_address_parser_response_destroy, libpostal_expand_address,
44    libpostal_expansion_array_destroy, libpostal_get_address_parser_default_options,
45    libpostal_get_default_options, libpostal_parse_address, size_t, GLOBAL_LOCK,
46};
47
48mod errors;
49mod init;
50mod probe;
51
52pub use self::errors::Error;
53
/// A `Result` type which defaults to `libpostal_rust::Error`.
///
/// The error parameter `E` defaults to [`Error`], so functions such as
/// `parse_address` can write `Result<T>`; a different error type can still be
/// supplied explicitly as `Result<T, E>`.
pub type Result<T, E = Error> = std::result::Result<T, E>;
56
/// Options for use with `parse_address`.
///
/// Right now, this is just a placeholder and you can't set any options yet.
/// Obtain a value with `ParseAddressOptions::default()`.
///
/// The type is `Clone` so callers can store it and hand out copies.
#[derive(Clone, Debug, Default)]
pub struct ParseAddressOptions {}
62
63/// Parse an address into its component values.
64pub fn parse_address(
65    addr: &str,
66    _opt: &ParseAddressOptions,
67) -> Result<HashMap<String, String>> {
68    // We need to hold onto this lock whenever we're calling libpostal.
69    let mut initialization_state = GLOBAL_LOCK.lock().expect("mutex poisoned");
70    unsafe { initialize_libpostal(initialization_state.deref_mut()) }?;
71    unsafe { initialize_libpostal_parser(initialization_state.deref_mut()) }?;
72
73    // Convert our arguments to work with C.
74    let addr = CString::new(addr).map_err(|_| Error::NullByteInString {
75        string: addr.to_owned(),
76    })?;
77    let parse_options = unsafe { libpostal_get_address_parser_default_options() };
78
79    // Parse the address.
80    let parsed =
81        unsafe { libpostal_parse_address(addr.as_ptr() as *mut _, parse_options) };
82
83    // Convert `parsed` to a reasonable Rust value.
84    let num_components = unsafe { (*parsed).num_components } as usize;
85    let mut result = HashMap::with_capacity(num_components);
86    for i in 0..num_components {
87        let (label, component) = unsafe {
88            (
89                CStr::from_ptr(*(*parsed).labels.add(i))
90                    .to_str()
91                    .expect("label contained invalid UTF-8"),
92                CStr::from_ptr(*(*parsed).components.add(i))
93                    .to_str()
94                    .expect("component contained invalid UTF-8"),
95            )
96        };
97        result.insert(label.to_owned(), component.to_owned());
98    }
99
100    // Clean up our C data structure.
101    unsafe { libpostal_address_parser_response_destroy(parsed) };
102
103    Ok(result)
104}
105
/// Options for use with `expand_address`.
///
/// The struct currently has no fields, so there is nothing to configure yet.
/// Obtain a value with `ExpandAddressOptions::default()`.
///
/// The type is `Clone` so callers can store it and hand out copies.
#[derive(Clone, Debug, Default)]
pub struct ExpandAddressOptions {}
109
110/// Try to expand any abbreviations in an address.
111pub fn expand_address(addr: &str, _opt: &ExpandAddressOptions) -> Result<Vec<String>> {
112    // We need to hold onto this lock whenever we're calling libpostal.
113    let mut initialization_state = GLOBAL_LOCK.lock().expect("mutex poisoned");
114    unsafe { initialize_libpostal(initialization_state.deref_mut()) }?;
115    unsafe {
116        initialize_libpostal_language_classifier(initialization_state.deref_mut())
117    }?;
118
119    // Convert our arguments to work with C.
120    let addr = CString::new(addr).map_err(|_| Error::NullByteInString {
121        string: addr.to_owned(),
122    })?;
123    let expand_options = unsafe { libpostal_get_default_options() };
124
125    // Parse the address.
126    let mut num_expansions: size_t = 0;
127    let expansions = unsafe {
128        libpostal_expand_address(
129            addr.as_ptr() as *mut _,
130            expand_options,
131            &mut num_expansions,
132        )
133    };
134
135    // Convert our results for Rust.
136    let mut result = Vec::with_capacity(num_expansions as usize);
137    for i in 0..num_expansions {
138        let expansion = unsafe {
139            CStr::from_ptr(*expansions.offset(i as isize))
140                .to_str()
141                .expect("expansion contained invalid UTF-8")
142        };
143        result.push(expansion.to_owned());
144    }
145
146    // Clean up our C data structure.
147    unsafe { libpostal_expansion_array_destroy(expansions, num_expansions) };
148
149    Ok(result)
150}
151
#[cfg(test)]
mod tests {
    use super::*;

    /// Parsing a US address should yield a lower-cased `state` component.
    ///
    /// Ignored by default; presumably because it needs the libpostal data
    /// files to be installed (confirm against the project's CI setup).
    #[test]
    #[ignore]
    fn parse_address_returns_components() {
        let components = parse_address(
            "781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA",
            &ParseAddressOptions::default(),
        )
        .unwrap();
        let expected_state = "ny".to_owned();
        assert_eq!(components.get("state"), Some(&expected_state));
    }

    /// Expanding a French address should turn the spelled-out number into
    /// digits in the first candidate.
    ///
    /// Ignored by default; presumably because it needs the libpostal data
    /// files to be installed (confirm against the project's CI setup).
    #[test]
    #[ignore]
    fn expand_address_returns_candidates() {
        let candidates = expand_address(
            "Quatre-vingt-douze Ave des Champs-Élysées",
            &ExpandAddressOptions::default(),
        )
        .unwrap();
        assert!(candidates[0].contains("92"));
    }
}