/// Splits `s` into lowercase ASCII alphanumeric tokens.
///
/// Any character that is not an ASCII letter or digit acts as a separator;
/// this includes punctuation, `_`, whitespace and every non-ASCII character.
/// Runs shorter than two characters are discarded.
pub fn tokenize(s: &str) -> Vec<String> {
    s.split(|c: char| !c.is_ascii_alphanumeric())
        // Each piece is pure ASCII, so its byte length equals its character count.
        .filter(|piece| piece.len() >= 2)
        .map(|piece| piece.to_ascii_lowercase())
        .collect()
}
/// Lowercase words that `content_tokens` removes from tokenized text.
///
/// Entries are compared for exact equality against tokens produced by
/// `tokenize`, so they must be lowercase ASCII.
///
/// NOTE(review): the single-character entries "a" and "i" can never match,
/// because `tokenize` already discards tokens shorter than two characters.
const STOPWORDS: &[&str] = &[
"the", "an", "of", "to", "for", "and", "or", "in", "on", "at", "is", "it", "be", "as", "by",
"with", "from", "into", "me", "my", "we", "our", "you", "your", "this", "that", "these",
"those", "use", "used", "when", "user", "users", "say", "says", "want", "wants", "ask", "asks",
"do", "does", "not", "if", "so", "up", "out", "via", "are", "was", "will", "can", "a", "i",
];
/// Tokenizes `s` with `tokenize` and drops every token found in `STOPWORDS`.
///
/// The surviving tokens keep their original order.
pub fn content_tokens(s: &str) -> Vec<String> {
    let mut tokens = tokenize(s);
    // Filter in place rather than collecting into a second vector.
    tokens.retain(|tok| !STOPWORDS.contains(&tok.as_str()));
    tokens
}
/// Reduces a likely English plural to a singular-looking form using suffix rules.
///
/// Rules, applied in order:
/// 1. Tokens of three bytes or fewer, or not ending in `s`, are returned unchanged.
/// 2. Tokens ending in `ss`, `us` or `is` are returned unchanged.
/// 3. Tokens longer than four bytes ending in `ies` get `ies` replaced by `y`.
/// 4. Tokens ending in `sses`, `ches`, `shes`, `xes` or `zes` lose the final `es`.
/// 5. Everything else loses the final `s`.
///
/// This is a heuristic, not a stemmer: for example `"sizes"` becomes `"siz"`.
pub fn norm_token(t: &str) -> String {
    // Suffixes whose plural is formed with "es" rather than a bare "s".
    const ES_PLURAL_SUFFIXES: [&str; 5] = ["sses", "ches", "shes", "xes", "zes"];
    // Endings that look plural but usually are not ("class", "status", "analysis").
    const KEEP_SUFFIXES: [&str; 3] = ["ss", "us", "is"];

    let len = t.len();
    let looks_plural = len > 3
        && t.ends_with('s')
        && !KEEP_SUFFIXES.iter().any(|suffix| t.ends_with(*suffix));
    if !looks_plural {
        return t.to_owned();
    }

    // Four-letter words such as "ties" fall through and only lose the "s".
    if len > 4 {
        if let Some(stem) = t.strip_suffix("ies") {
            return format!("{}y", stem);
        }
    }

    let trailing = if ES_PLURAL_SUFFIXES.iter().any(|suffix| t.ends_with(*suffix)) {
        2
    } else {
        1
    };
    // The removed bytes are ASCII ("s" or "es"), so the cut is on a char boundary.
    t[..len - trailing].to_owned()
}
/// Returns the content tokens of `s`, each passed through `norm_token`.
///
/// Order follows the order of tokens in `s`; duplicates are not removed.
pub fn match_tokens(s: &str) -> Vec<String> {
    let tokens = content_tokens(s);
    tokens
        .into_iter()
        .map(|token| norm_token(&token))
        .collect()
}
/// Computes the 32-bit FNV-1a hash of the UTF-8 bytes of `s`.
///
/// Each byte is XORed into the running hash, which is then multiplied by the
/// FNV prime with wrapping arithmetic. The empty string hashes to the offset
/// basis `0x811c_9dc5`.
pub fn fnv1a_32(s: &str) -> u32 {
    const OFFSET_BASIS: u32 = 0x811c_9dc5;
    const PRIME: u32 = 0x0100_0193;

    s.bytes().fold(OFFSET_BASIS, |hash, byte| {
        (hash ^ u32::from(byte)).wrapping_mul(PRIME)
    })
}
/// Computes the 64-bit FNV-1a hash of `bytes`.
///
/// Each byte is XORed into the running hash, which is then multiplied by the
/// FNV prime with wrapping arithmetic. An empty slice hashes to the offset
/// basis `0xcbf2_9ce4_8422_2325`.
pub fn fnv1a_64(bytes: &[u8]) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    bytes.iter().fold(OFFSET_BASIS, |hash, &byte| {
        (hash ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}
/// Unit tests for the tokenizing, normalizing and hashing helpers in this file.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenize_splits_and_lowercases() {
        assert_eq!(
            tokenize("Set-up a NEW uv_project!"),
            ["set", "up", "new", "uv", "project"]
        );
    }

    #[test]
    fn tokenize_drops_single_chars() {
        assert_eq!(tokenize("a b cd e"), ["cd"]);
    }

    #[test]
    fn content_tokens_drops_stopwords() {
        assert_eq!(
            content_tokens("connect to the Neon database"),
            ["connect", "neon", "database"]
        );
        // "it" and "up" are both stopwords, so exactly "set" must remain.
        // The previous either/or assertion would also have accepted an empty result.
        assert_eq!(content_tokens("set it up"), ["set"]);
    }

    #[test]
    fn norm_token_singularizes_common_plurals() {
        assert_eq!(norm_token("spreadsheets"), "spreadsheet");
        assert_eq!(norm_token("charts"), "chart");
        assert_eq!(norm_token("dependencies"), "dependency");
        assert_eq!(norm_token("branches"), "branch");
        assert_eq!(norm_token("boxes"), "box");
        assert_eq!(norm_token("classes"), "class");
    }

    #[test]
    fn norm_token_leaves_non_plurals_alone() {
        for t in ["uv", "css", "class", "status", "analysis", "chart", "rust"] {
            assert_eq!(norm_token(t), t);
        }
    }

    #[test]
    fn match_tokens_normalizes_content_tokens() {
        assert_eq!(
            match_tokens("compute the formulas in these spreadsheets"),
            ["compute", "formula", "spreadsheet"]
        );
    }

    #[test]
    fn fnv_is_deterministic() {
        assert_eq!(fnv1a_32("commit"), fnv1a_32("commit"));
        assert_ne!(fnv1a_32("commit"), fnv1a_32("attribution"));
        assert_eq!(fnv1a_64(b"hello"), fnv1a_64(b"hello"));
    }

    #[test]
    fn fnv_matches_reference_vectors() {
        // Empty input must yield the offset basis; "a" pins one full round
        // (XOR then multiply) against the published FNV-1a test values.
        assert_eq!(fnv1a_32(""), 0x811c_9dc5);
        assert_eq!(fnv1a_32("a"), 0xe40c_292c);
        assert_eq!(fnv1a_64(b""), 0xcbf2_9ce4_8422_2325);
        assert_eq!(fnv1a_64(b"a"), 0xaf63_dc4c_8601_ec8c);
    }
}