city_spellcheck/
lib.rs

// Licensed under the MIT license
2// <LICENSE-MIT or http://opensource.org/licenses/MIT>.
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//! This crate provides a library for spell correction of city names
7//! using a fuzzy search scoring system that has optional weighting for
8//! distance.
9//!
//! What that means is that if you supply your current GPS coordinates, the
//! suggested spelling corrections take your current location heavily into
//! account when scoring each potential match.
13//!
14//! Currently only supports USA and Canada, working on expanding to other countries ASAP.
15//!
16//! # Setup
17//!
18//! To use this library just add `city-spellcheck` to your `Cargo.toml` file:
19//!
20//! ```toml
21//! [dependencies]
22//! city-spellcheck = "0.1.0"
23//! ```
24//!
25//! Now you can use it:
26//!
27//! ```rust
28//! use city_spellcheck::*;
29//! ```
30//!
31//! To take a look at a very simple RESTful API (with only one route) that uses this library,
32//! check out the [City-Spellcheck Web Api](https://github.com/PrismaPhonic/city-spellcheck-web-api)
33//!
34//! # Example Use Case
35//!
36//! ```rust
37//! use city_spellcheck::*;
38//!
39//! let mut cities = CityData::new();
40//! cities
41//!     .populate_from_file("data/cities_canada-usa-filtered.csv")
42//!     .unwrap();
43//! let london = Coordinate::new(42.98339, -81.23304);
44//!     
45//! let results = cities.search("London", Some(london));
46//! assert_eq!(
47//!     format!("{:?}", results),
48//!     "[FuzzyResult { city: \"London, ON, CA\", latitude: 42.98339, longitude: -81.23304, score: 1.0 }, FuzzyResult { city: \"London, OH, US\", latitude: 39.88645, longitude: -83.44825, score: 0.6252391 }, FuzzyResult { city: \"London, KY, US\", latitude: 37.12898, longitude: -84.08326, score: 0.6250727 }, FuzzyResult { city: \"Lemont, IL, US\", latitude: 41.67364, longitude: -88.00173, score: 0.52094036 }, FuzzyResult { city: \"Brant, ON, CA\", latitude: 43.1334, longitude: -80.34967, score: 0.5208334 }]");
49//! ```
50//!
51//! Please explore this documentation to learn more. Nearly all useful methods are on the CityData
52//! struct.
53
54#[macro_use]
55extern crate serde_derive;
56extern crate rayon;
57extern crate redis;
58extern crate sift4;
59use rayon::iter::{IntoParallelIterator, ParallelIterator};
60use sift4::*;
61use std::cmp::Ordering;
62use std::error::Error;
63use std::fs;
64
65/// countries holds a country enum with valid countries
66pub mod countries;
67
68/// regions holds enums and implementations to display variants in english friendly string format
69pub mod regions;
70
71use self::countries::*;
72use self::regions::*;
73
74/// CityData holds all city data, each city is stored vertically across fields at a given index
75#[derive(Debug)]
76pub struct CityData {
77    names: Vec<String>,
78    countries: Vec<Country>,
79    regions: Vec<Region>,
80    latitudes: Vec<f32>,
81    longitudes: Vec<f32>,
82}
83
84/// City is an abstraction above CityData holding information on a single city (stored at a
85/// specific index in CityData)
86#[derive(Debug)]
87pub struct City<'a> {
88    name: &'a str,
89    country: Country,
90    region: Region,
91    latitude: f32,
92    longitude: f32,
93}
94
/// A GPS coordinate expressed as a latitude / longitude pair.
///
/// The distance helper on `CityData` converts both values with `to_radians`,
/// so they are expected in decimal degrees.
#[derive(Debug, Copy, Clone, PartialEq)]
pub struct Coordinate {
    latitude: f32,
    longitude: f32,
}

impl Coordinate {
    /// Builds a `Coordinate` from the supplied latitude and longitude.
    pub fn new(latitude: f32, longitude: f32) -> Coordinate {
        Coordinate {
            latitude,
            longitude,
        }
    }

    /// Latitude this coordinate was built with.
    pub fn latitude(&self) -> f32 {
        self.latitude
    }

    /// Longitude this coordinate was built with.
    pub fn longitude(&self) -> f32 {
        self.longitude
    }
}
111
112/// FuzzyResult represents a result from our location weighted fuzzy search. Includes a score where
113/// 0 is worst and 1 is perfect match.
114#[derive(Debug, Serialize, Deserialize)]
115pub struct FuzzyResult {
116    city: String,
117    latitude: f32,
118    longitude: f32,
119    score: f32,
120}
121
122impl FuzzyResult {
123    /// Takes in a City instance and score, and instantiates a new FuzzyResult
124    /// Which includes human readable formatting of city name, region and country
125    pub fn new(city_data: City, score: f32) -> FuzzyResult {
126        let City {
127            name,
128            country,
129            region,
130            latitude,
131            longitude,
132        } = city_data;
133        let city = format!("{}, {}, {}", name, region, country);
134        FuzzyResult {
135            city,
136            latitude,
137            longitude,
138            score,
139        }
140    }
141}
142
143impl CityData {
144    /// Instantiates a new instance of CityData that is empty. Usually populated with
145    /// `populate_from_file` method
146    pub fn new() -> Self {
147        CityData {
148            names: Vec::new(),
149            countries: Vec::new(),
150            regions: Vec::new(),
151            latitudes: Vec::new(),
152            longitudes: Vec::new(),
153        }
154    }
155
156    /// Takes in a geonames file and matches countries and regions to our custom enum variants
157    /// It then adds each city in line by line to our CityData instance.
158    ///
159    /// Note that geonames file comes from: [http://download.geonames.org/export/dump](http://download.geonames.org/export/dump)
160    /// Nicer formatted version is available in the github project for this crate under
161    /// the data folder:
162    ///
163    /// [city-spellcheck github](https://github.com/PrismaPhonic/city-spellcheck)
164    pub fn populate_from_file(&mut self, filename: &str) -> Result<(), Box<dyn Error>> {
165        let buffer = fs::read_to_string(filename)?;
166        let mut lines = buffer.lines();
167
168        // skip header line
169        lines.next();
170
171        for line in lines {
172            if let [name, country, region, latitude, longitude] =
173                line.split(',').collect::<Vec<&str>>()[..]
174            {
175                let latitude: f32 = latitude.parse()?;
176                let longitude: f32 = longitude.parse()?;
177
178                let country = match country {
179                    "US" => Country::US,
180                    "CA" => Country::CA,
181                    _ => continue,
182                };
183
184                let region = match country {
185                    Country::US => CityData::us_match(region),
186                    Country::CA => CityData::ca_match(region),
187                };
188
189                self.add_city(name, country, region, latitude, longitude);
190            };
191        }
192
193        Ok(())
194    }
195
196    // Matches admin1 codes:
197    // http://download.geonames.org/export/dump/admin1CodesASCII.txt
198    fn ca_match(region: &str) -> Region {
199        match region {
200            "01" => Region::Province(CAProvince::AB),
201            "02" => Region::Province(CAProvince::BC),
202            "03" => Region::Province(CAProvince::MB),
203            "04" => Region::Province(CAProvince::NB),
204            "05" => Region::Province(CAProvince::NL),
205            "07" => Region::Province(CAProvince::NS),
206            "08" => Region::Province(CAProvince::ON),
207            "09" => Region::Province(CAProvince::PE),
208            "10" => Region::Province(CAProvince::QC),
209            "11" => Region::Province(CAProvince::SK),
210            "12" => Region::Territory(CATerritory::YT),
211            "13" => Region::Territory(CATerritory::NT),
212            "14" => Region::Territory(CATerritory::NU),
213            _ => Region::None,
214        }
215    }
216
217    fn us_match(region: &str) -> Region {
218        match region {
219            "AL" => Region::State(USState::AL),
220            "AK" => Region::State(USState::AK),
221            "AZ" => Region::State(USState::AZ),
222            "AR" => Region::State(USState::AR),
223            "CA" => Region::State(USState::CA),
224            "CO" => Region::State(USState::CO),
225            "CT" => Region::State(USState::CT),
226            "DE" => Region::State(USState::DE),
227            "FL" => Region::State(USState::FL),
228            "GA" => Region::State(USState::GA),
229            "HI" => Region::State(USState::HI),
230            "ID" => Region::State(USState::ID),
231            "IL" => Region::State(USState::IL),
232            "IN" => Region::State(USState::IN),
233            "IA" => Region::State(USState::IA),
234            "KS" => Region::State(USState::KS),
235            "KY" => Region::State(USState::KY),
236            "LA" => Region::State(USState::LA),
237            "ME" => Region::State(USState::ME),
238            "MD" => Region::State(USState::MD),
239            "MA" => Region::State(USState::MA),
240            "MI" => Region::State(USState::MI),
241            "MN" => Region::State(USState::MN),
242            "MS" => Region::State(USState::MS),
243            "MO" => Region::State(USState::MO),
244            "MT" => Region::State(USState::MT),
245            "NE" => Region::State(USState::NE),
246            "NV" => Region::State(USState::NV),
247            "NH" => Region::State(USState::NH),
248            "NJ" => Region::State(USState::NJ),
249            "NM" => Region::State(USState::NM),
250            "NY" => Region::State(USState::NY),
251            "NC" => Region::State(USState::NC),
252            "ND" => Region::State(USState::ND),
253            "OH" => Region::State(USState::OH),
254            "OK" => Region::State(USState::OK),
255            "OR" => Region::State(USState::OR),
256            "PA" => Region::State(USState::PA),
257            "RI" => Region::State(USState::RI),
258            "SC" => Region::State(USState::SC),
259            "SD" => Region::State(USState::SD),
260            "TN" => Region::State(USState::TN),
261            "TX" => Region::State(USState::TX),
262            "UT" => Region::State(USState::UT),
263            "VT" => Region::State(USState::VT),
264            "VA" => Region::State(USState::VA),
265            "WA" => Region::State(USState::WA),
266            "WV" => Region::State(USState::WV),
267            "WI" => Region::State(USState::WI),
268            "WY" => Region::State(USState::WY),
269            _ => Region::None,
270        }
271    }
272
273    fn add_city(
274        &mut self,
275        name: &str,
276        country: Country,
277        region: Region,
278        latitude: f32,
279        longitude: f32,
280    ) {
281        self.names.push(name.to_string());
282        self.countries.push(country);
283        self.regions.push(region);
284        self.latitudes.push(latitude);
285        self.longitudes.push(longitude);
286    }
287
288    /// `get_city` is a function to get a city back from our `CityData` struct.
289    /// Helps us keep our data stored in a DoD fashion and provide
290    /// a higher level abstraction to retrieve each city based on index
291    ///
292    /// Supply an index and get back the city at that index.
293    ///
294    /// # Example
295    ///
296    /// ```rust
297    /// use city_spellcheck::*;
298    ///
299    /// let mut cities = CityData::new();
300    ///
301    /// 	cities
302    ///     	.populate_from_file("data/cities_canada-usa-filtered.csv")
303    ///     	.unwrap();
304    ///        
305    ///		assert_eq!(
306    ///            format!("{:?}", cities.get_city(0)),
307    ///            "City { name: \"Abbotsford\", country: CA, region: Province(BC), latitude: 49.05798, longitude: -122.25257 }"
308    ///     );
309    /// ```
310    pub fn get_city(&self, idx: usize) -> City {
311        City {
312            name: &self.names[idx],
313            country: self.countries[idx],
314            region: self.regions[idx],
315            latitude: self.latitudes[idx],
316            longitude: self.longitudes[idx],
317        }
318    }
319
320    /// `total_score` takes into account location as well as
321    /// string distance using sift4 algorithm. Supply the index of the city
322    /// and optionally a Coordinate instance representing your current location.
323    /// `total_score` will then return a weighted fuzzy match score.
324    ///
325    /// # Example
326    ///
327    /// ```rust
328    /// use city_spellcheck::*;
329    ///
330    /// let mut cities = CityData::new();
331    /// cities
332    ///     .populate_from_file("data/cities_canada-usa-filtered.csv")
333    ///     .unwrap();
334    /// assert_eq!(cities.total_score("Abbotsfor", 0, None), 0.88888896);
335    /// ```
336    ///
337    pub fn total_score(&self, term: &str, idx: usize, loc: Option<Coordinate>) -> f32 {
338        let city = &self.names[idx];
339        let latitude = self.latitudes[idx];
340        let longitude = self.longitudes[idx];
341        let city_loc = Coordinate {
342            latitude,
343            longitude,
344        };
345
346        let str_dist = sift4(city, term) as f32;
347        let mut str_score = if str_dist >= term.len() as f32 {
348            0.0
349        } else {
350            (term.len() as f32 - str_dist) / term.len() as f32
351        };
352
353        if str_score == 0.0 {
354            return 0.0;
355        };
356
357        // penalty if first letters don't match
358        if city.chars().next().unwrap() != term.chars().next().unwrap() {
359            if str_score < 0.1 {
360                str_score = 0.0;
361            } else {
362                str_score -= 0.1;
363            }
364        }
365
366        let mut dist_score = str_score;
367
368        if let Some(loc2) = loc {
369            let phys_dist = CityData::find_distance_earth(city_loc, loc2);
370            dist_score = CityData::dist_score(phys_dist);
371        };
372
373        (str_score * 5.0 + dist_score * 3.0) / 8.0
374    }
375
376    /// Finds circular distance from two gps coordinates using haversine formula.
377    /// Just supply two locations as Coordinate instances and get back distance
378    /// in **kilometers**.
379    ///
380    /// # Example
381    ///
382    /// ```rust
383    /// use city_spellcheck::*;
384    ///
385    /// let sf = Coordinate::new(37.774929, -122.419416);
386    /// let nyc = Coordinate::new(40.730610, -73.935242);
387    ///
388    /// assert_eq!(CityData::find_distance_earth(sf, nyc), 4135.694);
389    /// ```
390    pub fn find_distance_earth(loc1: Coordinate, loc2: Coordinate) -> f32 {
391        const R: f32 = 6372.8;
392        let Coordinate {
393            latitude: mut lat1,
394            longitude: mut long1,
395        } = loc1;
396        let Coordinate {
397            latitude: mut lat2,
398            longitude: long2,
399        } = loc2;
400        long1 -= long2;
401        long1 = long1.to_radians();
402        lat1 = lat1.to_radians();
403        lat2 = lat2.to_radians();
404        let dz: f32 = lat1.sin() - lat2.sin();
405        let dx: f32 = long1.cos() * lat1.cos() - lat2.cos();
406        let dy: f32 = long1.sin() * lat1.cos();
407        ((dx * dx + dy * dy + dz * dz).sqrt() / 2.0).asin() * 2.0 * R
408    }
409
410    // Largest city in North America by area is NYC which is 8600 square km
411    // or 92 km away - setting a dist of 92 as perfect 1.0
412    fn dist_score(dist: f32) -> f32 {
413        if dist < 92.0 {
414            1.0
415        } else {
416            92.0 / (dist.powf(2.0) - (91.9 as f32).powf(2.0))
417        }
418    }
419
420    /// `search` will search through **all** the cities stored in CityData, optionally
421    /// including a set of coordinates for distance weighting, and return a vector
422    /// of FuzzyResult's (instances that include the city name in a human readable format,
423    /// the lat and long of the city, and the fuzzy score).
424    ///
425    /// `search` sorts results from highest score (1.0 max) to lowest score. `search` only
426    /// shows results with a score greater than 0.5.
427    ///
428    /// # Example
429    ///
430    /// ```rust
431    /// use city_spellcheck::*;
432    ///
433    /// let mut cities = CityData::new();
434    /// cities
435    ///     .populate_from_file("data/cities_canada-usa-filtered.csv")
436    ///     .unwrap();
437    /// let london = Coordinate::new(42.98339, -81.23304);
438    ///     
439    /// let results = cities.search("London", Some(london));
440    /// assert_eq!(
441    ///     format!("{:?}", results),
442    ///     "[FuzzyResult { city: \"London, ON, CA\", latitude: 42.98339, longitude: -81.23304, score: 1.0 }, FuzzyResult { city: \"London, OH, US\", latitude: 39.88645, longitude: -83.44825, score: 0.6252391 }, FuzzyResult { city: \"London, KY, US\", latitude: 37.12898, longitude: -84.08326, score: 0.6250727 }, FuzzyResult { city: \"Lemont, IL, US\", latitude: 41.67364, longitude: -88.00173, score: 0.52094036 }, FuzzyResult { city: \"Brant, ON, CA\", latitude: 43.1334, longitude: -80.34967, score: 0.5208334 }]");
443    /// ```
444    pub fn search(&self, term: &str, loc: Option<Coordinate>) -> Vec<FuzzyResult> {
445        let mut found: Vec<(usize, f32)> = (0..self.names.len())
446            .into_par_iter()
447            .map(|i| (i, self.total_score(term, i, loc)))
448            .filter(|(_, score)| score > &0.5)
449            .collect();
450
451        found.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal));
452
453        found
454            .iter()
455            .map(|result| FuzzyResult::new(self.get_city(result.0), result.1))
456            .collect()
457    }
458}
459
#[cfg(test)]
mod tests {
    use super::*;

    // A city added by hand can be read back through `get_city`.
    // New York City lies west of Greenwich, hence the negative longitude.
    #[test]
    fn test_citydata_struct_nyc() {
        let mut cities = CityData::new();
        cities.add_city(
            "New York City",
            Country::US,
            Region::State(USState::NY),
            40.7128,
            -74.0060,
        );
        assert_eq!(format!("{:?}", cities.get_city(0)), "City { name: \"New York City\", country: US, region: State(NY), latitude: 40.7128, longitude: -74.006 }");
    }

    // Same round trip for San Francisco, again with a western (negative) longitude.
    #[test]
    fn test_citydata_struct_sf() {
        let mut cities = CityData::new();
        cities.add_city(
            "San Francisco",
            Country::US,
            Region::State(USState::CA),
            37.7749,
            -122.4194,
        );
        assert_eq!(format!("{:?}", cities.get_city(0)), "City { name: \"San Francisco\", country: US, region: State(CA), latitude: 37.7749, longitude: -122.4194 }");
    }

    // The first data row of the bundled CSV ends up at index 0.
    #[test]
    fn test_populate_from_file() {
        let mut cities = CityData::new();
        cities
            .populate_from_file("data/cities_canada-usa-filtered.csv")
            .unwrap();
        assert_eq!(
            format!("{:?}", cities.get_city(0)),
            "City { name: \"Abbotsford\", country: CA, region: Province(BC), latitude: 49.05798, longitude: -122.25257 }"
        );
    }

    // Sanity check of the sift4 dependency: one missing letter is distance 1.
    #[test]
    fn test_str_dist() {
        assert_eq!(sift4("Londo", "London"), 1);
    }

    // Distance between San Francisco and New York City, in kilometers.
    #[test]
    fn test_phys_dist() {
        let sf = Coordinate::new(37.774929, -122.419416);
        let nyc = Coordinate::new(40.730610, -73.935242);
        assert_eq!(CityData::find_distance_earth(sf, nyc), 4135.694);
    }

    // A far-away location gets a very small distance score.
    #[test]
    fn test_dist_score() {
        assert_eq!(CityData::dist_score(4135.694), 0.0000053815274);
    }

    // Without a location the total score is driven by spelling alone.
    #[test]
    fn test_total_score_no_gps() {
        let mut cities = CityData::new();
        cities
            .populate_from_file("data/cities_canada-usa-filtered.csv")
            .unwrap();
        assert_eq!(cities.total_score("Abbotsfor", 0, None), 0.88888896);
    }

    // Searching from London, Ontario ranks the local London first.
    #[test]
    fn test_search_with_gps() {
        let mut cities = CityData::new();
        cities
            .populate_from_file("data/cities_canada-usa-filtered.csv")
            .unwrap();
        let london = Coordinate::new(42.98339, -81.23304);
        let results = cities.search("London", Some(london));
        assert_eq!(
            format!("{:?}", results),
            "[FuzzyResult { city: \"London, ON, CA\", latitude: 42.98339, longitude: -81.23304, score: 1.0 }, FuzzyResult { city: \"London, OH, US\", latitude: 39.88645, longitude: -83.44825, score: 0.6252391 }, FuzzyResult { city: \"London, KY, US\", latitude: 37.12898, longitude: -84.08326, score: 0.6250727 }, FuzzyResult { city: \"Lemont, IL, US\", latitude: 41.67364, longitude: -88.00173, score: 0.52094036 }, FuzzyResult { city: \"Brant, ON, CA\", latitude: 43.1334, longitude: -80.34967, score: 0.5208334 }]"
        );
    }
}