untanglr/
lib.rs

1extern crate regex;
2use regex::Regex;
3use std::cmp::max;
4use std::collections::HashMap;
5use std::str;
6
7// The LanguageModel holds the path to the dictionary (english
8// by default), the maximum word cost and the maximum word length
9pub struct LanguageModel {
10    word_cost: HashMap<String, f64>,
11    max_wlen: u8,
12}
13
14impl LanguageModel {
15    // Initialize the LanguageModel using the words in the dictionary to
16    // get the maximum word length and calculate word cost uzing Zipf's law
17    pub fn new() -> LanguageModel {
18        let dict = include_str!("dicts/english.txt");
19
20        let word_count: f64 = dict.len() as f64;
21        let mut max_wlen: u8 = 0;
22        let mut word_cost: HashMap<String, f64> = HashMap::new();
23
24        for (index, line) in dict.lines().enumerate() {
25            let word = line; // Ignore errors.
26            let word_len: u8 = word.chars().count() as u8;
27            max_wlen = max(word_len, max_wlen);
28
29            word_cost.insert(
30                String::from(word),
31                ((index + 1) as f64 * word_count.ln()).ln(),
32            );
33        }
34
35        LanguageModel {
36            word_cost,
37            max_wlen,
38        }
39    }
40
41    // Split the input into alphanumerical substrings and feed the results
42    // into the internal splitter
43    pub fn untangle(&self, s: &str) -> Vec<String> {
44        let re = Regex::new(r"[^a-zA-Z0-9']+").unwrap();
45        re.split(s).flat_map(|x| self.split(x.into())).collect()
46    }
47
48    // Takes as input a string and returns a vector of the substrings that
49    // match the frequency dictionary
50    fn split(&self, s: String) -> Vec<String> {
51        let mut cost: Vec<f64> = vec![0.0];
52        for i in 1..s.len() + 1 {
53            let (_, k) = self.best_match(i, &cost, &s);
54            cost.push(k);
55        }
56
57        let mut out: Vec<String> = Vec::new();
58        let mut i = s.len();
59        while i > 0 {
60            let (k, c) = self.best_match(i, &cost, &s);
61            assert!(c == cost[i]);
62            let mut new_token = true;
63            let z: usize = 0;
64            let outlen = out.len();
65            let idx = if (i as i8 - k as i8) < 0 {
66                0
67            } else {
68                i - k as usize
69            };
70            if &s[idx..i] != "'" {
71                if &out.len() > &z {
72                    let o = &out[out.len() - 1];
73                    if out[out.len() - 1] == "'s"
74                        || (s.as_bytes()[i - 1].is_ascii_digit()
75                            && o.as_bytes()[0].is_ascii_digit())
76                    {
77                        out[outlen - 1] =
78                            format!("{}{}", &s[i - k as usize..i], &out[out.len() - 1]);
79                        new_token = false;
80                    }
81                }
82            }
83
84            if new_token {
85                out.push(s[i - k as usize..i].to_string());
86            }
87            i -= k as usize;
88        }
89        out.reverse();
90        out
91    }
92
93    fn best_match(&self, i: usize, cost: &[f64], s: &str) -> (u8, f64) {
94        let candidates = &cost[(max(0, i as i64 - self.max_wlen as i64) as usize)..i];
95        let mut storedmin: (u8, f64) = (0, 9e99);
96
97        for (index, &candidate) in candidates.into_iter().rev().enumerate() {
98            let current_word_cost = match self
99                .word_cost
100                .get(&s[i - index - 1..i].to_ascii_lowercase())
101            {
102                Some(value) => value,
103                _ => &(9e99 as f64),
104            };
105            if candidate + current_word_cost < storedmin.1 {
106                storedmin.1 = candidate + *current_word_cost;
107                storedmin.0 = index as u8 + 1;
108            }
109        }
110
111        storedmin
112    }
113}
114
115
116// Unit tests 
117#[cfg(test)]
118mod tests {
119    use super::*;
120    #[test]
121    fn basic_split() {
122        let no_spaces = "thequickbrownfoxjumpedoverthelazydog";
123        let with_spaces = "the quick brown fox jumped over the lazy dog";
124        let lm = LanguageModel::new();
125        let correct: Vec<&str> = with_spaces.split_whitespace().collect();
126
127        assert_eq!(lm.untangle(no_spaces), correct);
128    }
129
130    #[test]
131    fn split_with_punctuation() {
132        let no_spaces = "thequick!brownfox.jumpedoverthe,lazydog?";
133        let with_spaces = "the quick brown fox jumped over the lazy dog";
134        let lm = LanguageModel::new();
135        let correct: Vec<&str> = with_spaces.split_whitespace().collect();
136
137        assert_eq!(lm.untangle(no_spaces), correct);
138    }
139}