// scry_learn/text/tokenizer.rs
// SPDX-License-Identifier: MIT OR Apache-2.0
//! Text tokenization utilities.
//!
//! Zero-dependency tokenizer that splits text at whitespace and punctuation
//! boundaries, normalizes to lowercase, and supports n-gram generation.

/// Tokenize text into lowercase words, stripping punctuation.
///
/// Splits on whitespace; each chunk then has non-alphanumeric characters
/// trimmed from both ends (interior punctuation such as apostrophes or
/// hyphens survives), is lowercased, and is dropped if nothing remains.
///
/// # Examples
///
/// ```ignore
/// use scry_learn::text::tokenizer::default_tokenize;
///
/// let tokens = default_tokenize("Hello, World! It's a test.");
/// assert_eq!(tokens, vec!["hello", "world", "it's", "a", "test"]);
/// ```
pub fn default_tokenize(text: &str) -> Vec<String> {
    text.split_whitespace()
        .filter_map(|chunk| {
            // Trim punctuation off the edges only; keep interior marks intact.
            let core = chunk.trim_matches(|c: char| !c.is_alphanumeric());
            (!core.is_empty()).then(|| core.to_lowercase())
        })
        .collect()
}
30
/// Generate n-grams from a list of tokens.
///
/// `range` is `(min_n, max_n)` inclusive on both ends; the bounds are
/// clamped so that `min_n >= 1` and `max_n >= min_n`.
/// For `(1, 1)` this returns unigrams (the original tokens).
/// For `(1, 2)` this returns both unigrams and bigrams.
///
/// # Examples
///
/// ```ignore
/// use scry_learn::text::tokenizer::ngrams;
///
/// let tokens: Vec<String> = vec!["a".into(), "b".into(), "c".into()];
/// let result = ngrams(&tokens, (1, 2));
/// // ["a", "b", "c", "a b", "b c"]
/// ```
pub fn ngrams(tokens: &[String], range: (usize, usize)) -> Vec<String> {
    let (min_n, max_n) = range;
    let min_n = min_n.max(1);
    let max_n = max_n.max(min_n);
    let len = tokens.len();

    // A slice of length `len` has `len - n + 1` windows of size `n`
    // (zero when `n > len`); reserve exactly that many slots up front
    // instead of growing the vector by repeated reallocation.
    let capacity: usize = (min_n..=max_n).map(|n| len.saturating_sub(n - 1)).sum();
    let mut result = Vec::with_capacity(capacity);

    for n in min_n..=max_n {
        if n > len {
            // `n` only grows from here, so no further size can fit either.
            break;
        }
        for window in tokens.windows(n) {
            if n == 1 {
                // Skip the `join` machinery for the single-token case.
                result.push(window[0].clone());
            } else {
                result.push(window.join(" "));
            }
        }
    }

    result
}
68
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an owned token list from a space-separated spec string.
    fn toks(spec: &str) -> Vec<String> {
        spec.split(' ').map(str::to_owned).collect()
    }

    #[test]
    fn basic_tokenization() {
        assert_eq!(default_tokenize("Hello, World!"), ["hello", "world"]);
    }

    #[test]
    fn handles_punctuation() {
        assert_eq!(
            default_tokenize("It's a well-known fact, indeed!"),
            ["it's", "a", "well-known", "fact", "indeed"]
        );
    }

    #[test]
    fn handles_empty_string() {
        assert!(default_tokenize("").is_empty());
    }

    #[test]
    fn handles_only_whitespace() {
        assert!(default_tokenize("   \t\n  ").is_empty());
    }

    #[test]
    fn handles_only_punctuation() {
        assert!(default_tokenize("!!! ??? ...").is_empty());
    }

    #[test]
    fn unigrams() {
        assert_eq!(ngrams(&toks("a b c"), (1, 1)), ["a", "b", "c"]);
    }

    #[test]
    fn bigrams() {
        assert_eq!(ngrams(&toks("a b c"), (2, 2)), ["a b", "b c"]);
    }

    #[test]
    fn unigrams_and_bigrams() {
        assert_eq!(
            ngrams(&toks("a b c"), (1, 2)),
            ["a", "b", "c", "a b", "b c"]
        );
    }

    #[test]
    fn ngrams_larger_than_input() {
        assert!(ngrams(&toks("a b"), (3, 3)).is_empty());
    }

    #[test]
    fn trigrams() {
        assert_eq!(
            ngrams(&toks("the cat sat down"), (3, 3)),
            ["the cat sat", "cat sat down"]
        );
    }
}