spellcheck/
lib.rs

1//! A spell-checker based on the statistical algorithm described by Peter Norvig
2//! in http://norvig.com/spell-correct.html
3//!
4//! This is a port of the JavaScript implementation of the algorithm from
5//! https://github.com/past/speller
6//!
7//! Usage requires a two-step process:
8//! 1) call speller.train() one or more times with a large text to train the language model
9//! 2) call speller.correct(word) to retrieve the correction for the specified word
10
11extern crate regex;
12use regex::Regex;
13use std::collections::HashMap;
14
15pub struct Speller {
16    /// The letters of the alphabet.
17    pub letters: String,
18    /// A frequency map of words to the number of times they were found during
19    /// training.
20    pub n_words: HashMap<String, u32>
21}
22
23impl Speller {
24    /// A function that trains the language model with the words in the supplied text.
25    /// Multiple invocation of this function can extend the training of the model.
26    pub fn train(&mut self, text: &str) {
27        let re = Regex::new(r"[a-z]+").unwrap();
28        let lc_text = text.to_lowercase();
29        for m in re.find_iter(&lc_text) {
30            let count = self.n_words.entry(m.as_str().to_string()).or_insert(0);
31            *count += 1;
32        }
33    }
34
35    /// A function that returns the correction for the specified word.
36    pub fn correct(&mut self, word: &str) -> String {
37        // A word in our word frequency map is already correct.
38        if self.n_words.contains_key(word) {
39            return word.to_string();
40        }
41
42        let mut candidates: HashMap<u32, String> = HashMap::new();
43        let list = self.edits(word);
44
45        // Try to find candidate corrections in the edits of the word.
46        for edit in &list {
47            if let Some(value) = self.n_words.get(edit) {
48                candidates.insert(*value, edit.to_string());
49            }
50        }
51        if let Some(c) = candidates.iter().max_by_key(|&entry| entry.0) {
52            return c.1.to_string();
53        }
54
55        // Try to find candidate corrections in the edits of the edits.
56        for edit in &list {
57            for w in self.edits(&edit) {
58                if let Some(value) = self.n_words.get(&w) {
59                    candidates.insert(*value, w);
60                }
61            }
62        }
63        if let Some(c) = candidates.iter().max_by_key(|&entry| entry.0) {
64            return c.1.to_string();
65        }
66
67        // Can't find a correction, return the input unchanged.
68        word.to_string()
69    }
70
71    /// A function that returns the set of possible corrections of the specified word.
72    /// The edits can be deletions, insertions, alterations or transpositions.
73    fn edits(&mut self, word: &str) -> Vec<String> {
74        let mut results = Vec::new();
75        // deletion
76        for i in 0 .. word.len() {
77            let (first, last) = word.split_at(i);
78            results.push([first, &last[1..]].concat());
79        }
80        // transposition
81        for i in 0 .. word.len() - 1 {
82            let (first, last) = word.split_at(i);
83            results.push([first, &last[1..2], &last[..1], &last[2..]].concat());
84        }
85        // alteration
86        for i in 0 .. word.len() {
87            for c in self.letters.chars() {
88                let (first, last) = word.split_at(i);
89                let mut buffer = [0; 1];
90                let result = c.encode_utf8(&mut buffer);
91                results.push([first, result, &last[1..]].concat());
92            }
93        }
94        // insertion
95        for i in 0 .. word.len() + 1 {
96            for c in self.letters.chars() {
97                let (first, last) = word.split_at(i);
98                let mut buffer = [0; 1];
99                let result = c.encode_utf8(&mut buffer);
100                results.push([first, result, last].concat());
101            }
102        }
103
104        results
105    }
106}