use std::collections::HashMap;
use crate::sparse::SparseVector;
/// Reserved key string `"__quiver_text__"`.
///
/// NOTE(review): nothing in the visible part of this file reads this constant;
/// presumably it is the payload/metadata key under which a record's raw text is
/// stored — confirm against the callers before relying on that.
pub const TEXT_KEY: &str = "__quiver_text__";
/// Lowercase English stop words that never become terms.
///
/// `push_term` checks a token against this list twice: once on the raw
/// (already lowercased) token and once more on its stemmed form, so a word
/// that only turns into a stop word after stemming is dropped as well.
const STOP_WORDS: &[&str] = &[
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with",
];
/// Maps a token to a 32-bit term id using the FNV-1a hash over its UTF-8 bytes.
///
/// The result depends only on the token's bytes, so the same token always
/// yields the same id. The empty string returns the FNV offset basis
/// (`0x811c_9dc5`). Because the id space is 32 bits wide, two different tokens
/// can in principle share an id.
pub fn term_id(token: &str) -> u32 {
    const FNV_OFFSET_BASIS: u32 = 0x811c_9dc5;
    const FNV_PRIME: u32 = 0x0100_0193;

    // FNV-1a: XOR the byte in first, then multiply by the prime (wrapping).
    token.bytes().fold(FNV_OFFSET_BASIS, |acc, byte| {
        (acc ^ u32::from(byte)).wrapping_mul(FNV_PRIME)
    })
}
/// Splits `text` into normalized terms.
///
/// The text is scanned character by character. Every maximal run of
/// alphanumeric characters (per [`char::is_alphanumeric`], so non-ASCII
/// letters and digits count) forms one raw token; any other character acts as
/// a separator and is discarded. Characters are lowercased as they are
/// collected. Each raw token is then handed to `push_term`, which decides
/// whether, and in what form, it is appended to the result.
///
/// Returns the surviving terms in the order they appear in `text`; the result
/// is empty when `text` contains no alphanumeric characters.
pub fn tokens(text: &str) -> Vec<String> {
    let mut out = Vec::new();
    let mut current = String::new();
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            // `to_lowercase` yields an iterator because one character may
            // lowercase to several, hence `extend` rather than `push`.
            current.extend(ch.to_lowercase());
        } else if !current.is_empty() {
            push_term(&mut out, &current);
            current.clear();
        }
    }
    // Flush the final token when the text does not end with a separator.
    if !current.is_empty() {
        push_term(&mut out, &current);
    }
    out
}
/// Appends the stemmed form of `raw` to `out`, unless it should be ignored.
///
/// A token is ignored when it is a stop word as given, when its stem is
/// empty, or when its stem is itself a stop word. In every other case the
/// stem (not the raw token) is pushed.
fn push_term(out: &mut Vec<String>, raw: &str) {
    let is_stop_word = |word: &str| STOP_WORDS.iter().any(|&stop| stop == word);

    // Cheap rejection first: no need to stem something we would drop anyway.
    if is_stop_word(raw) {
        return;
    }

    let term = stem(raw);
    if !term.is_empty() && !is_stop_word(term.as_str()) {
        out.push(term);
    }
}
// One English-algorithm stemmer per thread. `thread_local!` initializes the
// value lazily on a thread's first access, so `stem` reuses the same instance
// on later calls from that thread instead of building a new one each time.
thread_local! {
static STEMMER: rust_stemmers::Stemmer =
rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::English);
}
/// Returns the stem of `token` as an owned `String`, computed with the
/// current thread's `STEMMER` (English algorithm).
fn stem(token: &str) -> String {
    STEMMER.with(|stemmer| {
        let stemmed = stemmer.stem(token);
        stemmed.into_owned()
    })
}
/// Converts `text` into a sparse term-frequency vector.
///
/// The text is tokenized with `tokens`, each term is mapped to an id with
/// `term_id`, and occurrences are counted per id. In the returned
/// [`SparseVector`], `indices[i]` is a term id and `values[i]` is the number
/// of times that id occurred, as an `f32`.
///
/// Entries are emitted in ascending term-id order. The counts are gathered in
/// a `HashMap`, whose iteration order is unspecified and can differ between
/// runs, so sorting is what makes the output deterministic for a given input.
///
/// Text that yields no terms produces a vector with empty `indices` and
/// `values`.
pub fn text_to_sparse(text: &str) -> SparseVector {
    let mut tf: HashMap<u32, f32> = HashMap::new();
    for token in tokens(text) {
        *tf.entry(term_id(&token)).or_insert(0.0) += 1.0;
    }

    // Ids are unique map keys, so an unstable sort cannot reorder equal keys.
    let mut entries: Vec<(u32, f32)> = tf.into_iter().collect();
    entries.sort_unstable_by_key(|&(id, _)| id);

    let (indices, values): (Vec<u32>, Vec<f32>) = entries.into_iter().unzip();
    SparseVector { indices, values }
}
/// Returns the distinct term ids found in `text`, in order of first appearance.
///
/// The text goes through `tokens` and each resulting term through `term_id`.
/// An id that has already been emitted is skipped, so every id appears at
/// most once. The result is empty when `text` yields no terms.
pub fn query_term_ids(text: &str) -> Vec<u32> {
    let mut already_emitted = std::collections::HashSet::new();
    let mut ids = Vec::new();
    for token in tokens(text) {
        let id = term_id(&token);
        // `insert` returns true only the first time a given id is seen.
        if already_emitted.insert(id) {
            ids.push(id);
        }
    }
    ids
}
// Unit tests for hashing, tokenization, stop-word removal, stemming and the
// two public conversions built on top of them.
#[cfg(test)]
mod tests {
use super::*;
// The same token always hashes to the same id, different tokens differ here,
// and the empty token returns the hash's initial value unchanged.
#[test]
fn term_id_is_stable_and_distinguishes_tokens() {
assert_eq!(term_id("quiver"), term_id("quiver"));
assert_ne!(term_id("quiver"), term_id("vector"));
assert_eq!(term_id(""), 0x811c_9dc5);
}
// Any non-alphanumeric character separates tokens; letters are lowercased,
// including non-ASCII ones, and digits are kept.
#[test]
fn splits_lowercases_and_strips_punctuation() {
assert_eq!(tokens("Hello, WORLD!"), vec!["hello", "world"]);
assert_eq!(tokens("rust-lang/quiver"), vec!["rust", "lang", "quiver"]);
assert_eq!(tokens("café Über 2026"), vec!["café", "über", "2026"]);
}
// Stop words are removed from the token stream; only content words remain.
#[test]
fn removes_stop_words_before_and_after_stemming() {
assert_eq!(tokens("the cat is on a mat"), vec!["cat", "mat"]);
}
// Inflected forms reduce to a common stem, both when calling `stem` directly
// and when going through the full `tokens` pipeline.
#[test]
fn snowball_stemmer_conflates_morphological_variants() {
assert_eq!(stem("cats"), "cat");
assert_eq!(stem("connecting"), "connect");
assert_eq!(stem("connected"), "connect");
assert_eq!(stem("connection"), "connect");
assert_eq!(stem("cat"), "cat");
// Stemming a stop word on its own still yields a non-empty string.
assert!(!stem("is").is_empty());
assert_eq!(tokens("connections")[0], tokens("connect")[0]);
assert_eq!(tokens("running")[0], tokens("run")[0]);
assert_eq!(tokens("cats")[0], tokens("cat")[0]);
}
// "cat" and "cats" share one term id with a count of 2; "the" is dropped.
// Text made only of stop words gives an empty vector (this relies on
// `SparseVector::is_empty`, which is defined outside this file).
#[test]
fn text_to_sparse_counts_term_frequencies() {
let sv = text_to_sparse("the cat cats");
assert_eq!(sv.indices.len(), 1);
assert_eq!(sv.values, vec![2.0]);
assert_eq!(sv.indices[0], term_id("cat"));
assert!(text_to_sparse("the of and").is_empty());
}
// Repeated terms yield a single id, ids keep first-appearance order, and a
// query made only of stop words yields no ids.
#[test]
fn query_term_ids_are_deduplicated() {
let ids = query_term_ids("cat cat dog");
assert_eq!(ids.len(), 2);
assert_eq!(ids[0], term_id("cat")); assert_eq!(ids[1], term_id("dog"));
assert!(query_term_ids("the a of").is_empty());
}
}