toipe/
textgen.rs

1//! Utilities for generating/selecting new (random) words for the typing
2//! test.
3
4use std::collections::VecDeque;
5use std::fs::File;
6use std::io::{self, BufRead, BufReader, Cursor, Seek, SeekFrom};
7use std::path::PathBuf;
8
9use rand::seq::SliceRandom;
10use rand::Rng;
11
12use bisection::bisect_right;
13use rand::prelude::ThreadRng;
14
15/// Efficient selector of words from a word list.
16///
17/// The word list is given by a BufReader.
18///
19/// ### Assumptions
20///
21/// The word list is assumed to:
22/// - Have a list of words separated by newline.
23/// - Use only English alphabet and **ASCII**.
24/// - Be **sorted alphabetically**.
25///     - In case-insensitive manner.
26///     - For example, both "Apple" and "apple" must appear before words
27///       started with "b".
28/// - Be a file that is **not modified** while the object is alive.
29/// - Have no empty lines except at the end of the file.
30///
31/// Note: only words between length 2 and 8, inclusive, are considered.
32/// Having no words matching the criteria may lead to an infinite loop.
33///
34/// ### Algorithm
35///
36/// During initialization, the [`RawWordSelector`] iterates through all
37/// the words in the list and builds an index mapping each letter (of
38/// the alphabet) to its byte position in the file and the cumulative
39/// number of words present starting with it.
40///
41/// To select a (pesudo-)random word, a random number between 0
42/// (inclusive) and number of lines (exclusive) is generated. Using
43/// binary search, the index of where this number lies in the cumulative
44/// no. of words list is found. Using this index, the byte offset of the
45/// corresponding letter is found. The file is then read starting from
46/// this byte offset and read line-by-line until the correct word (at
47/// line `number - cumulative num. words` from the starting of this
48/// letter).
49///
50/// ### Time complexity
51///
52/// Initialization: `O(n)`
53///
54/// Selecting a word: `O(1)` (best case) or `O(n)` (worst case)
55///
56/// ### Space complexity
57///
58/// `O(1)` (only needs fixed length arrays).
59#[derive(Debug)]
60pub struct RawWordSelector<T> {
61    reader: BufReader<T>,
62    letter_pos: [u64; 26],
63    letter_lines_sum: [u64; 27],
64}
65
66impl<T: Seek + io::Read> RawWordSelector<T> {
67    /// Create from any arbitrary [`BufReader`].
68    ///
69    /// Please ensure that assumptions defined at
70    /// [`RawWordSelector#assumptions`] are valid for the contents.
71    pub fn new(mut reader: BufReader<T>) -> Result<Self, io::Error> {
72        let mut letter_pos = [0u64; 26];
73        let mut letter_lines = [0u64; 26];
74        let mut num_lines = 0;
75        let mut cur_letter = b'a' - 1;
76        let mut cur_pos = 0;
77        let mut buffer = String::new();
78
79        fn is_letter(char: u8) -> bool {
80            char.is_ascii_lowercase()
81        }
82
83        loop {
84            buffer.clear();
85            let len = reader.read_line(&mut buffer)?;
86
87            if len == 0 {
88                break;
89            }
90
91            let line = buffer.to_ascii_lowercase();
92            num_lines += 1;
93            let first_char = line.bytes().next().unwrap();
94
95            if !is_letter(first_char) {
96                continue;
97            }
98
99            if cur_letter != first_char {
100                letter_pos[cur_pos] = reader.stream_position()?;
101
102                letter_lines[cur_pos] = num_lines;
103                num_lines = 0;
104
105                // println!(
106                //     "{}, {}, {}, {}",
107                //     char::from(first_char),
108                //     cur_pos,
109                //     letter_pos[cur_pos],
110                //     letter_lines[cur_pos],
111                // );
112
113                cur_pos += 1;
114                cur_letter = first_char;
115
116                if cur_pos >= 26 {
117                    break;
118                }
119            }
120        }
121
122        letter_lines.rotate_left(1);
123        letter_lines[25] = num_lines;
124        let letter_lines_sum: [u64; 26] = letter_lines
125            .into_iter()
126            .scan(0, |acc, x| {
127                *acc += x;
128
129                Some(*acc)
130            })
131            .collect::<Vec<u64>>()
132            .try_into()
133            .unwrap();
134        let mut letter_lines_sum_ = [0u64; 27];
135        letter_lines_sum_[1..].copy_from_slice(&letter_lines_sum[..]);
136        let letter_lines_sum = letter_lines_sum_;
137
138        // println!("{:?}", letter_lines);
139        // println!("{:?}", letter_lines_sum);
140
141        let word_selector = Self {
142            reader,
143            letter_pos,
144            letter_lines_sum,
145        };
146
147        Ok(word_selector)
148    }
149
150    fn word_at_letter_offset(
151        &mut self,
152        letter_index: usize,
153        line_offset: u64,
154    ) -> Result<String, io::Error> {
155        self.reader
156            .seek(SeekFrom::Start(self.letter_pos[letter_index]))?;
157
158        let mut buffer = String::new();
159        let mut line_no = 0;
160
161        loop {
162            buffer.clear();
163            self.reader.read_line(&mut buffer)?;
164
165            if line_no == line_offset {
166                break;
167            }
168
169            line_no += 1
170        }
171
172        // remove trailing newline
173        buffer.truncate(buffer.len() - 1);
174
175        Ok(buffer)
176    }
177
178    fn new_word_raw(&mut self, rng: &mut ThreadRng) -> Result<String, io::Error> {
179        let line_index = rng.gen_range(self.letter_lines_sum[0]..self.letter_lines_sum[26]);
180        // let line_index = 0;
181
182        let letter_lines_sum_index = bisect_right(&self.letter_lines_sum, &line_index);
183
184        let line_offset = self.letter_lines_sum[letter_lines_sum_index] - line_index;
185
186        let letter_index = letter_lines_sum_index - 1;
187
188        // println!(
189        //     "{}, {}, {}, {}",
190        //     line_index,
191        //     letter_lines_sum_index,
192        //     self.letter_lines_sum[letter_lines_sum_index],
193        //     line_offset
194        // );
195
196        self.word_at_letter_offset(letter_index, line_offset)
197    }
198}
199
200impl RawWordSelector<File> {
201    /// Create from a file at a path given by a [`PathBuf`].
202    ///
203    /// Please ensure that assumptions defined at
204    /// [`RawWordSelector#assumptions`] are valid for this file.
205    pub fn from_path(word_list_path: PathBuf) -> Result<Self, io::Error> {
206        let file = File::open(word_list_path)?;
207
208        let reader = BufReader::new(file);
209
210        Self::new(reader)
211    }
212}
213
214impl RawWordSelector<Cursor<String>> {
215    /// Create from a String representing the word list file.
216    ///
217    /// Please ensure that assumptions defined at
218    /// [`RawWordSelector#assumptions`] are valid for the contents.
219    pub fn from_string(word_list: String) -> Result<Self, io::Error> {
220        let cursor = Cursor::new(word_list);
221        let reader = BufReader::new(cursor);
222
223        RawWordSelector::new(reader)
224    }
225}
226
227/// Describes a thing that provides new words.
228pub trait WordSelector {
229    /// Returns a new word.
230    fn new_word(&mut self) -> Result<String, io::Error>;
231
232    /// Returns a [`Vec`] containing `num_words` words.
233    fn new_words(&mut self, num_words: usize) -> Result<Vec<String>, io::Error> {
234        (0..num_words).map(|_| self.new_word()).collect()
235    }
236}
237
238impl<T: Seek + io::Read> WordSelector for RawWordSelector<T> {
239    fn new_word(&mut self) -> Result<String, io::Error> {
240        let mut rng = rand::thread_rng();
241
242        let mut word = "-".to_string();
243
244        while word.len() < 2 || word.len() > 8 || !word.chars().all(|c| c.is_ascii_alphabetic()) {
245            word = self.new_word_raw(&mut rng)?;
246        }
247
248        word.make_ascii_lowercase();
249
250        Ok(word)
251    }
252}
253
254/// Wraps another word selector, taking words from it and adding punctuation to the end of or
255/// around words with a configurable chance. Will capitalize the next word when an end-of-sentence
256/// punctuation mark is used.
257pub struct PunctuatedWordSelector {
258    selector: Box<dyn WordSelector>,
259    next_is_capital: bool,
260    punctuation_chance: f64,
261}
262
263enum PunctuationType {
264    Capitaizing(char),
265    Ending(char),
266    Surrounding(char, char),
267}
268
269const PUNCTUATION: [PunctuationType; 12] = [
270    PunctuationType::Capitaizing('!'),
271    PunctuationType::Capitaizing('?'),
272    PunctuationType::Capitaizing('.'),
273    PunctuationType::Ending(','),
274    PunctuationType::Ending(':'),
275    PunctuationType::Ending(';'),
276    PunctuationType::Surrounding('\'', '\''),
277    PunctuationType::Surrounding('"', '"'),
278    PunctuationType::Surrounding('(', ')'),
279    PunctuationType::Surrounding('{', '}'),
280    PunctuationType::Surrounding('<', '>'),
281    PunctuationType::Surrounding('[', ']'),
282];
283
284impl PunctuatedWordSelector {
285    /// Creates a PunctuatedWordSelector from another WordSelector, allowing the selection of the
286    /// chance of punctuation.
287    pub fn from_word_selector(
288        word_selector: Box<dyn WordSelector>,
289        punctuation_chance: f64,
290    ) -> Self {
291        Self {
292            selector: word_selector,
293            next_is_capital: true,
294            punctuation_chance,
295        }
296    }
297}
298
299impl WordSelector for PunctuatedWordSelector {
300    fn new_word(&mut self) -> Result<String, io::Error> {
301        let mut rng = rand::thread_rng();
302
303        let mut word = self.selector.new_word()?;
304
305        let will_punctuate = rng.gen_bool(self.punctuation_chance);
306        if will_punctuate || self.next_is_capital {
307            let mut chars: VecDeque<char> = word.chars().collect();
308            if self.next_is_capital {
309                // some unicode chars map to multiple chars when uppercased.
310                for c in chars
311                    .pop_front()
312                    .expect("got empty word")
313                    .to_uppercase()
314                    .rev()
315                {
316                    chars.push_front(c)
317                }
318                self.next_is_capital = false;
319            }
320            if will_punctuate {
321                match PUNCTUATION
322                    .choose(&mut rng)
323                    .expect("only returns none if the slice is empty")
324                {
325                    PunctuationType::Capitaizing(c) => {
326                        self.next_is_capital = true;
327                        chars.push_back(*c)
328                    }
329                    PunctuationType::Ending(c) => chars.push_back(*c),
330                    PunctuationType::Surrounding(opening, closing) => {
331                        chars.push_front(*opening);
332                        chars.push_back(*closing);
333                    }
334                }
335            }
336            word = chars.into_iter().collect();
337        }
338        Ok(word)
339    }
340}