/// Splits `s` into lowercase tokens made of ASCII letters and digits.
///
/// Every character that is not an ASCII alphanumeric (punctuation,
/// whitespace, underscores, and any non-ASCII character) acts as a
/// separator. Runs shorter than two characters are discarded, so stray
/// single letters never become tokens. Tokens are returned in the order
/// they appear in the input.
pub fn tokenize(s: &str) -> Vec<String> {
    s.split(|c: char| !c.is_ascii_alphanumeric())
        // Each piece is pure ASCII, so byte length equals character count.
        .filter(|run| run.len() >= 2)
        .map(|run| run.to_ascii_lowercase())
        .collect()
}
26
/// Lowercase words that carry no matching signal and are removed by
/// `content_tokens`.
///
/// Lookup is a linear scan, so the order of entries does not affect results.
const STOPWORDS: &[&str] = &[
    // Articles, prepositions, conjunctions and other function words.
    "the", "an", "of", "to", "for", "and", "or", "in", "on", "at", "is", "it",
    "be", "as", "by", "with", "from", "into",
    // Personal pronouns and possessives.
    "me", "my", "we", "our", "you", "your",
    // Demonstratives.
    "this", "that", "these", "those",
    // Words that describe a request rather than its subject.
    "use", "used", "when", "user", "users", "say", "says", "want", "wants",
    "ask", "asks",
    // Auxiliaries, negation and particles.
    "do", "does", "not", "if", "so", "up", "out", "via", "are", "was", "will",
    "can",
    // Single letters; `tokenize` already drops one-character tokens, so these
    // only matter if the list is consulted directly.
    "a", "i",
];
37
38pub fn content_tokens(s: &str) -> Vec<String> {
41 tokenize(s)
42 .into_iter()
43 .filter(|t| !STOPWORDS.contains(&t.as_str()))
44 .collect()
45}
46
/// Reduces a likely English plural to a singular-looking stem using suffix
/// rules only.
///
/// Rules, applied in order:
/// 1. Tokens of three bytes or fewer, or not ending in `s`, are returned
///    unchanged.
/// 2. Tokens ending in `ss`, `us` or `is` are returned unchanged
///    (e.g. `class`, `status`, `analysis`).
/// 3. Tokens longer than four bytes ending in `ies` get `ies` replaced by `y`
///    (`dependencies` -> `dependency`).
/// 4. Tokens ending in `sses`, `ches`, `shes`, `xes` or `zes` lose the final
///    `es` (`branches` -> `branch`).
/// 5. Anything else loses the final `s`.
///
/// No dictionary is consulted, so some stems are not real words: for example
/// `caches` becomes `cach` and `sizes` becomes `siz`.
pub fn norm_token(t: &str) -> String {
    const NON_PLURAL_ENDINGS: [&str; 3] = ["ss", "us", "is"];
    const ES_PLURAL_ENDINGS: [&str; 5] = ["sses", "ches", "shes", "xes", "zes"];

    let len = t.len();
    let looks_plural = len > 3
        && t.ends_with('s')
        && !NON_PLURAL_ENDINGS.iter().any(|&ending| t.ends_with(ending));
    if !looks_plural {
        return t.to_owned();
    }

    // Four-letter words such as "ties" are excluded here and fall through to
    // the plain "drop the s" rule.
    if len > 4 {
        if let Some(stem) = t.strip_suffix("ies") {
            let mut singular = String::with_capacity(stem.len() + 1);
            singular.push_str(stem);
            singular.push('y');
            return singular;
        }
    }

    let has_es_plural = ES_PLURAL_ENDINGS.iter().any(|&ending| t.ends_with(ending));
    let trim = if has_es_plural { 2 } else { 1 };
    // The trimmed bytes are ASCII (matched above), so this is a char boundary.
    t[..len - trim].to_owned()
}
76
77pub fn match_tokens(s: &str) -> Vec<String> {
80 content_tokens(s).iter().map(|t| norm_token(t)).collect()
81}
82
/// Computes the 32-bit FNV-1a hash of the UTF-8 bytes of `s`.
///
/// Each byte is XORed into the running hash, which is then multiplied by the
/// FNV prime with wrapping arithmetic. The empty string hashes to the offset
/// basis.
pub fn fnv1a_32(s: &str) -> u32 {
    const OFFSET_BASIS: u32 = 0x811c_9dc5;
    const PRIME: u32 = 0x0100_0193;

    s.bytes().fold(OFFSET_BASIS, |hash, byte| {
        (hash ^ u32::from(byte)).wrapping_mul(PRIME)
    })
}
92
/// Computes the 64-bit FNV-1a hash of `bytes`.
///
/// Each byte is XORed into the running hash, which is then multiplied by the
/// FNV prime with wrapping arithmetic. An empty slice hashes to the offset
/// basis.
pub fn fnv1a_64(bytes: &[u8]) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    bytes.iter().fold(OFFSET_BASIS, |hash, &byte| {
        (hash ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}
102
#[cfg(test)]
mod tests {
    //! Unit tests for the tokenization, normalization and hashing helpers.

    use super::*;

    /// Punctuation, hyphens and underscores all separate tokens, and output
    /// is lowercased.
    #[test]
    fn tokenize_splits_and_lowercases() {
        assert_eq!(
            tokenize("Set-up a NEW uv_project!"),
            ["set", "up", "new", "uv", "project"]
        );
    }

    /// One-character runs are never emitted as tokens.
    #[test]
    fn tokenize_drops_single_chars() {
        assert_eq!(tokenize("a b cd e"), ["cd"]);
    }

    /// Stopwords are removed while the remaining tokens keep their order.
    #[test]
    fn content_tokens_drops_stopwords() {
        assert_eq!(
            content_tokens("connect to the Neon database"),
            ["connect", "neon", "database"]
        );
        // "it" and "up" are stopwords and "set" is not, so the result is
        // fully determined; assert it exactly rather than also accepting an
        // empty list.
        assert_eq!(content_tokens("set it up"), ["set"]);
    }

    /// Each supported plural suffix rule produces the expected singular.
    #[test]
    fn norm_token_singularizes_common_plurals() {
        let cases = [
            ("spreadsheets", "spreadsheet"),
            ("charts", "chart"),
            ("dependencies", "dependency"),
            ("branches", "branch"),
            ("boxes", "box"),
            ("classes", "class"),
        ];
        for (plural, singular) in cases {
            assert_eq!(norm_token(plural), singular, "input: {plural}");
        }
    }

    /// Short tokens, non-`s` endings and `ss`/`us`/`is` endings are untouched.
    #[test]
    fn norm_token_leaves_non_plurals_alone() {
        for t in ["uv", "css", "class", "status", "analysis", "chart", "rust"] {
            assert_eq!(norm_token(t), t);
        }
    }

    /// `match_tokens` composes stopword removal with plural normalization.
    #[test]
    fn match_tokens_normalizes_content_tokens() {
        assert_eq!(
            match_tokens("compute the formulas in these spreadsheets"),
            ["compute", "formula", "spreadsheet"]
        );
    }

    /// Equal inputs hash equally and these distinct inputs hash differently.
    #[test]
    fn fnv_is_deterministic() {
        assert_eq!(fnv1a_32("commit"), fnv1a_32("commit"));
        assert_ne!(fnv1a_32("commit"), fnv1a_32("attribution"));
        assert_eq!(fnv1a_64(b"hello"), fnv1a_64(b"hello"));
    }

    /// Pins both hashes to published FNV-1a reference vectors, so a change to
    /// a constant or to the XOR/multiply order is caught.
    #[test]
    fn fnv_matches_reference_vectors() {
        assert_eq!(fnv1a_32(""), 0x811c_9dc5);
        assert_eq!(fnv1a_32("a"), 0xe40c_292c);
        assert_eq!(fnv1a_32("foobar"), 0xbf9c_f968);

        assert_eq!(fnv1a_64(b""), 0xcbf2_9ce4_8422_2325);
        assert_eq!(fnv1a_64(b"a"), 0xaf63_dc4c_8601_ec8c);
        assert_eq!(fnv1a_64(b"foobar"), 0x8594_4171_f739_67e8);
    }
}