1extern crate regex;
2use regex::Regex;
3use std::cmp::max;
4use std::collections::HashMap;
5use std::str;
6
7pub struct LanguageModel {
10 word_cost: HashMap<String, f64>,
11 max_wlen: u8,
12}
13
14impl LanguageModel {
15 pub fn new() -> LanguageModel {
18 let dict = include_str!("dicts/english.txt");
19
20 let word_count: f64 = dict.len() as f64;
21 let mut max_wlen: u8 = 0;
22 let mut word_cost: HashMap<String, f64> = HashMap::new();
23
24 for (index, line) in dict.lines().enumerate() {
25 let word = line; let word_len: u8 = word.chars().count() as u8;
27 max_wlen = max(word_len, max_wlen);
28
29 word_cost.insert(
30 String::from(word),
31 ((index + 1) as f64 * word_count.ln()).ln(),
32 );
33 }
34
35 LanguageModel {
36 word_cost,
37 max_wlen,
38 }
39 }
40
41 pub fn untangle(&self, s: &str) -> Vec<String> {
44 let re = Regex::new(r"[^a-zA-Z0-9']+").unwrap();
45 re.split(s).flat_map(|x| self.split(x.into())).collect()
46 }
47
48 fn split(&self, s: String) -> Vec<String> {
51 let mut cost: Vec<f64> = vec![0.0];
52 for i in 1..s.len() + 1 {
53 let (_, k) = self.best_match(i, &cost, &s);
54 cost.push(k);
55 }
56
57 let mut out: Vec<String> = Vec::new();
58 let mut i = s.len();
59 while i > 0 {
60 let (k, c) = self.best_match(i, &cost, &s);
61 assert!(c == cost[i]);
62 let mut new_token = true;
63 let z: usize = 0;
64 let outlen = out.len();
65 let idx = if (i as i8 - k as i8) < 0 {
66 0
67 } else {
68 i - k as usize
69 };
70 if &s[idx..i] != "'" {
71 if &out.len() > &z {
72 let o = &out[out.len() - 1];
73 if out[out.len() - 1] == "'s"
74 || (s.as_bytes()[i - 1].is_ascii_digit()
75 && o.as_bytes()[0].is_ascii_digit())
76 {
77 out[outlen - 1] =
78 format!("{}{}", &s[i - k as usize..i], &out[out.len() - 1]);
79 new_token = false;
80 }
81 }
82 }
83
84 if new_token {
85 out.push(s[i - k as usize..i].to_string());
86 }
87 i -= k as usize;
88 }
89 out.reverse();
90 out
91 }
92
93 fn best_match(&self, i: usize, cost: &[f64], s: &str) -> (u8, f64) {
94 let candidates = &cost[(max(0, i as i64 - self.max_wlen as i64) as usize)..i];
95 let mut storedmin: (u8, f64) = (0, 9e99);
96
97 for (index, &candidate) in candidates.into_iter().rev().enumerate() {
98 let current_word_cost = match self
99 .word_cost
100 .get(&s[i - index - 1..i].to_ascii_lowercase())
101 {
102 Some(value) => value,
103 _ => &(9e99 as f64),
104 };
105 if candidate + current_word_cost < storedmin.1 {
106 storedmin.1 = candidate + *current_word_cost;
107 storedmin.0 = index as u8 + 1;
108 }
109 }
110
111 storedmin
112 }
113}
114
115
116#[cfg(test)]
118mod tests {
119 use super::*;
120 #[test]
121 fn basic_split() {
122 let no_spaces = "thequickbrownfoxjumpedoverthelazydog";
123 let with_spaces = "the quick brown fox jumped over the lazy dog";
124 let lm = LanguageModel::new();
125 let correct: Vec<&str> = with_spaces.split_whitespace().collect();
126
127 assert_eq!(lm.untangle(no_spaces), correct);
128 }
129
130 #[test]
131 fn split_with_punctuation() {
132 let no_spaces = "thequick!brownfox.jumpedoverthe,lazydog?";
133 let with_spaces = "the quick brown fox jumped over the lazy dog";
134 let lm = LanguageModel::new();
135 let correct: Vec<&str> = with_spaces.split_whitespace().collect();
136
137 assert_eq!(lm.untangle(no_spaces), correct);
138 }
139}