use std::collections::BTreeMap;
use std::fs::File;
use std::io::{self, BufRead, BufReader};
use std::iter::once;
use std::path::Path;

use indexmap::IndexMap;

use crate::tokenizer::{tokenize, Token};
use crate::truecase::{CaseMap, Model};
use crate::utils::join_with_spaces;

/// Trainer for new truecasing models.
///
/// Use this to create your own models from a set of training sentences.
/// See [crate documentation](index.html) for examples.
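///
/// A minimal usage sketch (the `truecase::` import path is an assumption
/// about how the crate re-exports this type, which is why the example is
/// not compiled as a doc-test):
///
/// ```ignore
/// use truecase::ModelTrainer;
///
/// let mut trainer = ModelTrainer::new();
/// trainer.add_sentence("Enderby lives in Hastings");
/// trainer.add_sentence("Trains from Hastings arrive hourly");
/// let model = trainer.into_model();
/// ```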
#[derive(Debug, Default)]
pub struct ModelTrainer {
    unigram_stats: CaseStats,
    bigram_stats: CaseStats,
    trigram_stats: CaseStats,
}

impl ModelTrainer {
    /// Create a new model trainer.
    pub fn new() -> Self {
        Self::default()
    }

    /// Add sentences to the training set from a file.
    ///
    /// The file is assumed to have one sentence per line.
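    ///
    /// A sketch of the expected call shape (`corpus.txt` is a hypothetical
    /// path):
    ///
    /// ```ignore
    /// let mut trainer = ModelTrainer::new();
    /// trainer.add_sentences_from_file("corpus.txt")?;
    /// let model = trainer.into_model();
    /// ```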
    pub fn add_sentences_from_file<P: AsRef<Path>>(&mut self, path: P) -> io::Result<&mut Self> {
        let file = File::open(path)?;
        let reader = BufReader::new(file);
        for line in reader.lines() {
            self.add_sentence(&line?);
        }

        Ok(self)
    }

    /// Add multiple sentences to the training set from an iterator.
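    ///
    /// For example (a sketch; any iterator over string-like items works):
    ///
    /// ```ignore
    /// let mut trainer = ModelTrainer::new();
    /// let sentences = vec!["First example sentence", "Second example sentence"];
    /// trainer.add_sentences_from_iter(sentences.into_iter());
    /// ```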
    pub fn add_sentences_from_iter<I>(&mut self, iter: I) -> &mut Self
    where
        I: Iterator,
        I::Item: AsRef<str>,
    {
        for sentence in iter {
            self.add_sentence(sentence.as_ref());
        }

        self
    }

    /// Add one sentence to the training set.
    pub fn add_sentence(&mut self, sentence: &str) -> &mut Self {
        if !is_sentence_sane(sentence) {
            return self;
        }

        let tokens: Vec<_> = tokenize(sentence)
            .filter(Token::is_meaningful)
            // Skip the first word of the sentence: it is capitalized only
            // because of its position, so counting it would skew the
            // statistics for words that are lowercase mid-sentence.
            .skip(1)
            .collect();

        for token in &tokens {
            self.unigram_stats.add_token(token);
        }

        for ngram in tokens.windows(2) {
            self.bigram_stats.add_ngram(ngram);
        }

        for ngram in tokens.windows(3) {
            self.trigram_stats.add_ngram(ngram);
        }

        self
    }

    /// Build a model from all gathered statistics.
    ///
    /// The resulting maps are pruned to keep the model small: trigrams fully
    /// implied by their bigrams, bigrams fully implied by their unigrams, and
    /// unigrams identical to their normalized form are all dropped.
    pub fn into_model(self) -> Model {
        // Thresholds: any unigram casing counts, but bigram and trigram
        // casings must occur at least 10 times before they are trusted.
        let mut unigrams = self.unigram_stats.into_most_frequent(1);
        let mut bigrams = self.bigram_stats.into_most_frequent(10);
        let mut trigrams = self.trigram_stats.into_most_frequent(10);

        // Keep a trigram only if it adds information: if every bigram inside
        // it is already truecased identically by the bigram map, the trigram
        // is redundant.
        trigrams.retain(|normalized, truecased| {
            let normalized_words = normalized.split(' ').collect::<Vec<_>>();
            let truecased_words = truecased.split(' ').collect::<Vec<_>>();

            let normalized_bigrams = normalized_words
                .windows(2)
                .map(|pair| join_with_spaces(pair.iter()));
            let truecased_bigrams = truecased_words
                .windows(2)
                .map(|pair| join_with_spaces(pair.iter()));

            normalized_bigrams
                .zip(truecased_bigrams)
                .any(|(normalized, truecased)| bigrams[&normalized] != truecased)
        });

        // Likewise, keep a bigram only if it disagrees with word-by-word
        // unigram truecasing.
        bigrams.retain(|normalized, truecased| {
            let normalized_words = normalized.split(' ');
            let truecased_words = truecased.split(' ');
            normalized_words
                .zip(truecased_words)
                .any(|(normalized, truecased)| unigrams[normalized] != truecased)
        });

        // Unigrams whose most frequent casing is the normalized form itself
        // carry no information.
        unigrams.retain(|normalized, truecased| normalized != truecased);

        Model {
            unigrams,
            bigrams,
            trigrams,
        }
    }
}

/// Accumulates casing statistics: for each normalized form, how often each
/// original casing was observed.
#[derive(Debug, Default)]
struct CaseStats {
    stats: IndexMap<String, CaseCounts>,
}

impl CaseStats {
    fn add_token(&mut self, token: &Token) {
        self.add_string(token.original, &token.normalized)
    }

    fn add_ngram(&mut self, ngram: &[Token]) {
        let original = join_with_spaces(ngram.iter().map(Token::get_original));
        let normalized = join_with_spaces(ngram.iter().map(Token::get_normalized));
        self.add_string(&original, &normalized)
    }

    fn add_string(&mut self, original: &str, normalized: &str) {
        // The .entry() API would force cloning `normalized` on every call,
        // even when the key is already present, so do a manual get-or-insert
        // that clones only on first insertion.
        if let Some(counts) = self.stats.get_mut(normalized) {
            counts.add(original, normalized);
            return;
        }

        let mut counts = CaseCounts::default();
        counts.add(original, normalized);

        self.stats.insert(normalized.to_owned(), counts);
    }

    fn into_most_frequent(self, min_frequency: u32) -> CaseMap {
        // For every normalized form, keep only its single most frequent
        // casing, and drop forms whose best casing appears fewer than
        // `min_frequency` times.
        self.stats
            .into_iter()
            .flat_map(|(normalized, word_case_counts)| {
                word_case_counts
                    .into_most_frequent_kind(min_frequency)
                    .map(|kind| kind.into_to_string_from(&normalized))
                    .map(|truecased| (normalized, truecased))
            })
            .collect()
    }
}

/// Counts how often a form was seen exactly as normalized versus in each
/// differently-cased variant.
#[derive(Debug, Default)]
struct CaseCounts {
    normalized: u32,
    other: BTreeMap<String, u32>,
}

impl CaseCounts {
    fn add(&mut self, string: &str, normalized: &str) {
        if string == normalized {
            self.normalized += 1;
        } else if let Some(other_count) = self.other.get_mut(string) {
            *other_count += 1;
        } else {
            self.other.insert(string.to_owned(), 1);
        }
    }

    fn into_most_frequent_kind(self, min_frequency: u32) -> Option<CaseKind> {
        let normalized = (CaseKind::Normalized, self.normalized);
        let other_options = self
            .other
            .into_iter()
            .map(|(string, count)| (CaseKind::Other(string), count));

        // `max_by_key` returns the last maximal element on ties, so a cased
        // variant wins over the normalized form when counts are equal.
        once(normalized)
            .chain(other_options)
            .filter(|&(_, frequency)| frequency >= min_frequency)
            .max_by_key(|&(_, frequency)| frequency)
            .map(|(option, _)| option)
    }
}

enum CaseKind {
    Normalized,
    Other(String),
}

impl CaseKind {
    fn into_to_string_from(self, normalized: &str) -> String {
        match self {
            CaseKind::Normalized => normalized.to_owned(),
            CaseKind::Other(string) => string,
        }
    }
}

fn is_sentence_sane(sentence: &str) -> bool {
    // Reject training sentences that carry no casing signal: all-caps,
    // all-lowercase, or blank. Only alphabetic characters are considered,
    // since spaces and punctuation are neither uppercase nor lowercase and
    // would otherwise let e.g. "HELLO WORLD" slip through the all-caps check.
    let alphabetic = || sentence.chars().filter(|c| c.is_alphabetic());
    !sentence.trim().is_empty()
        && !alphabetic().all(char::is_uppercase)
        && !alphabetic().all(char::is_lowercase)
}
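
// A small end-to-end sanity check, included as a sketch of how the pieces
// above fit together. It assumes the tokenizer lowercases each token into
// `normalized` while preserving `original`, and that `CaseMap` supports
// lookup by `&str`; the sentences themselves are made up for illustration.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn most_frequent_casing_wins() {
        let mut trainer = ModelTrainer::new();
        // "Hastings" is capitalized twice and lowercase once; the first word
        // of every sentence is skipped, so it never affects the counts.
        trainer.add_sentences_from_iter(
            [
                "He moved to Hastings in spring",
                "She commutes from Hastings daily",
                "Someone wrote hastings without caps",
            ]
            .iter(),
        );

        let model = trainer.into_model();
        assert_eq!(
            model.unigrams.get("hastings").map(String::as_str),
            Some("Hastings")
        );
    }
}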