1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
//! A spell-checker based on the statistical algorithm described by Peter Norvig
//! in http://norvig.com/spell-correct.html
//!
//! This is a port of the JavaScript implementation of the algorithm from
//! https://github.com/past/speller
//!
//! Usage requires a two-step process:
//! 1) call speller.train() one or more times with a large text to train the language model
//! 2) call speller.correct(word) to retrieve the correction for the specified word

extern crate regex;
use regex::Regex;
use std::collections::HashMap;

pub struct Speller {
    /// The letters of the alphabet.
    pub letters: String,
    /// A frequency map of words to the number of times they were found during
    /// training.
    pub n_words: HashMap<String, u32>
}

impl Speller {
    /// A function that trains the language model with the words in the supplied text.
    /// Multiple invocation of this function can extend the training of the model.
    pub fn train(&mut self, text: &str) {
        let re = Regex::new(r"[a-z]+").unwrap();
        let lc_text = text.to_lowercase();
        for m in re.find_iter(&lc_text) {
            let count = self.n_words.entry(m.as_str().to_string()).or_insert(0);
            *count += 1;
        }
    }

    /// A function that returns the correction for the specified word.
    pub fn correct(&mut self, word: &str) -> String {
        // A word in our word frequency map is already correct.
        if self.n_words.contains_key(word) {
            return word.to_string();
        }

        let mut candidates: HashMap<u32, String> = HashMap::new();
        let list = self.edits(word);

        // Try to find candidate corrections in the edits of the word.
        for edit in &list {
            if let Some(value) = self.n_words.get(edit) {
                candidates.insert(*value, edit.to_string());
            }
        }
        if let Some(c) = candidates.iter().max_by_key(|&entry| entry.0) {
            return c.1.to_string();
        }

        // Try to find candidate corrections in the edits of the edits.
        for edit in &list {
            for w in self.edits(&edit) {
                if let Some(value) = self.n_words.get(&w) {
                    candidates.insert(*value, w);
                }
            }
        }
        if let Some(c) = candidates.iter().max_by_key(|&entry| entry.0) {
            return c.1.to_string();
        }

        // Can't find a correction, return the input unchanged.
        word.to_string()
    }

    /// A function that returns the set of possible corrections of the specified word.
    /// The edits can be deletions, insertions, alterations or transpositions.
    fn edits(&mut self, word: &str) -> Vec<String> {
        let mut results = Vec::new();
        // deletion
        for i in 0 .. word.len() {
            let (first, last) = word.split_at(i);
            results.push([first, &last[1..]].concat());
        }
        // transposition
        for i in 0 .. word.len() - 1 {
            let (first, last) = word.split_at(i);
            results.push([first, &last[1..2], &last[..1], &last[2..]].concat());
        }
        // alteration
        for i in 0 .. word.len() {
            for c in self.letters.chars() {
                let (first, last) = word.split_at(i);
                let mut buffer = [0; 1];
                let result = c.encode_utf8(&mut buffer);
                results.push([first, result, &last[1..]].concat());
            }
        }
        // insertion
        for i in 0 .. word.len() + 1 {
            for c in self.letters.chars() {
                let (first, last) = word.split_at(i);
                let mut buffer = [0; 1];
                let result = c.encode_utf8(&mut buffer);
                results.push([first, result, last].concat());
            }
        }

        results
    }
}