//! Tokenization helpers (scry_learn/text/tokenizer.rs).
/// Splits `text` on whitespace and normalizes each word: leading and
/// trailing non-alphanumeric characters are stripped, the remainder is
/// lowercased, and words that end up empty (pure punctuation) are dropped.
///
/// Interior punctuation survives, so "It's" -> "it's" and
/// "well-known" -> "well-known".
pub fn default_tokenize(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    for raw in text.split_whitespace() {
        // Only the edges are trimmed; apostrophes/hyphens inside stay.
        let core = raw.trim_matches(|c: char| !c.is_alphanumeric());
        if !core.is_empty() {
            tokens.push(core.to_lowercase());
        }
    }
    tokens
}
30
/// Generates all n-grams of `tokens` for every size `n` in the inclusive
/// `range` `(min_n, max_n)`, each n-gram joined with single spaces, ordered
/// by size first and then by position.
///
/// The range is sanitized: `min_n` is clamped to at least 1 (a 0-gram is
/// meaningless and `windows(0)` panics), and `max_n` is raised to at least
/// `min_n` so an inverted range degenerates to `(min_n, min_n)`. Sizes
/// larger than `tokens.len()` contribute nothing.
pub fn ngrams(tokens: &[String], range: (usize, usize)) -> Vec<String> {
    let min_n = range.0.max(1);
    // Clamp to the token count as well: larger sizes yield no windows
    // anyway, and clamping keeps a pathological max (e.g. usize::MAX)
    // from spinning through an enormous empty loop.
    let max_n = range.1.max(min_n).min(tokens.len().max(min_n));

    // Exact output size: size n contributes len - n + 1 windows (or 0).
    let capacity: usize = (min_n..=max_n)
        .map(|n| (tokens.len() + 1).saturating_sub(n))
        .sum();
    let mut result = Vec::with_capacity(capacity);

    for n in min_n..=max_n {
        // `windows(n)` yields nothing when n exceeds the slice length, and
        // `join` on a one-element window returns that element unchanged,
        // so no special cases are needed.
        result.extend(tokens.windows(n).map(|w| w.join(" ")));
    }

    result
}
68
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an owned token list from string literals.
    fn toks(words: &[&str]) -> Vec<String> {
        words.iter().map(|w| w.to_string()).collect()
    }

    #[test]
    fn basic_tokenization() {
        assert_eq!(default_tokenize("Hello, World!"), ["hello", "world"]);
    }

    #[test]
    fn handles_punctuation() {
        // Interior apostrophes and hyphens are kept; edges are trimmed.
        assert_eq!(
            default_tokenize("It's a well-known fact, indeed!"),
            ["it's", "a", "well-known", "fact", "indeed"]
        );
    }

    #[test]
    fn handles_empty_string() {
        assert!(default_tokenize("").is_empty());
    }

    #[test]
    fn handles_only_whitespace() {
        assert!(default_tokenize(" \t\n ").is_empty());
    }

    #[test]
    fn handles_only_punctuation() {
        assert!(default_tokenize("!!! ??? ...").is_empty());
    }

    #[test]
    fn unigrams() {
        assert_eq!(ngrams(&toks(&["a", "b", "c"]), (1, 1)), ["a", "b", "c"]);
    }

    #[test]
    fn bigrams() {
        assert_eq!(ngrams(&toks(&["a", "b", "c"]), (2, 2)), ["a b", "b c"]);
    }

    #[test]
    fn unigrams_and_bigrams() {
        // Results are grouped by n-gram size, smallest first.
        assert_eq!(
            ngrams(&toks(&["a", "b", "c"]), (1, 2)),
            ["a", "b", "c", "a b", "b c"]
        );
    }

    #[test]
    fn ngrams_larger_than_input() {
        assert!(ngrams(&toks(&["a", "b"]), (3, 3)).is_empty());
    }

    #[test]
    fn trigrams() {
        assert_eq!(
            ngrams(&toks(&["the", "cat", "sat", "down"]), (3, 3)),
            ["the cat sat", "cat sat down"]
        );
    }
}