quiver_query/tokenize.rs
1// SPDX-License-Identifier: AGPL-3.0-only
2//! A small, dependency-free tokenizer for the BM25 / full-text path (ADR-0046).
3//!
4//! Turns text into term ids so a tokenized string *is* a [`SparseVector`] whose
5//! values are term frequencies, reusing the sparse machinery of ADR-0043/0045. The
6//! pipeline is deterministic — Unicode-aware splitting on non-alphanumeric
7//! boundaries, lowercasing, a small English stop-word filter, and **Snowball
8//! (Porter2) stemming** — so the same text always produces the same terms, and
9//! ingest and query tokenize identically.
10//!
11//! The stemmer is the Snowball English (Porter2) algorithm via `rust-stemmers`
12//! (ADR-0048), which conflates morphological variants — `connection` /
13//! `connected` / `connecting` → `connect`, `ponies` → `poni` — so a query term
14//! matches inflected document terms. Because ingest and query share the same
15//! stemmer the conflation is consistent on both sides. (It superseded the original
16//! dependency-free plural-only S-stemmer of ADR-0046, behind the same [`tokens`]
17//! seam.)
18//!
19//! One remaining, documented ceiling: term ids are a 32-bit FNV-1a hash of the
20//! token, so distinct tokens can in principle collide. For realistic vocabularies
21//! this is negligible (and learned-sparse vocabularies already collide by
22//! construction).
23
24use std::collections::HashMap;
25
26use crate::sparse::SparseVector;
27
28/// The reserved payload key carrying a point's full-text field (ADR-0046). When a
29/// point has no explicit `__quiver_sparse__` vector but carries a string under this
30/// key, the engine tokenizes it into a term-frequency sparse vector at ingest, so
31/// the point is searchable by BM25 over text alone.
32pub const TEXT_KEY: &str = "__quiver_text__";
33
34/// A compact English stop-word list (closed-class function words). Small on
35/// purpose: aggressive stop-word removal hurts more than it helps for short
36/// queries, and BM25's IDF already down-weights ubiquitous terms.
37const STOP_WORDS: &[&str] = &[
38 "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
39 "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
40 "they", "this", "to", "was", "will", "with",
41];
42
43/// The stable 32-bit dimension id for a token (FNV-1a). Deterministic across runs
44/// and platforms.
45pub fn term_id(token: &str) -> u32 {
46 let mut hash: u32 = 0x811c_9dc5; // FNV offset basis
47 for byte in token.bytes() {
48 hash ^= u32::from(byte);
49 hash = hash.wrapping_mul(0x0100_0193); // FNV prime
50 }
51 hash
52}
53
54/// Tokenize `text` into normalized terms: lowercased, split on non-alphanumeric
55/// boundaries, stop-words removed, and plural-stemmed. The order is preserved and
56/// duplicates are kept (so callers can compute term frequencies).
57pub fn tokens(text: &str) -> Vec<String> {
58 let mut out = Vec::new();
59 let mut current = String::new();
60 for ch in text.chars() {
61 if ch.is_alphanumeric() {
62 // `to_lowercase` is Unicode-correct and may expand one char to several.
63 current.extend(ch.to_lowercase());
64 } else if !current.is_empty() {
65 push_term(&mut out, ¤t);
66 current.clear();
67 }
68 }
69 if !current.is_empty() {
70 push_term(&mut out, ¤t);
71 }
72 out
73}
74
75// Stem and stop-filter one raw (already-lowercased) token, pushing it if kept.
76fn push_term(out: &mut Vec<String>, raw: &str) {
77 if STOP_WORDS.contains(&raw) {
78 return;
79 }
80 let stemmed = stem(raw);
81 // Re-check the stop list after stemming (e.g. a stemmed form could land on one).
82 if stemmed.is_empty() || STOP_WORDS.contains(&stemmed.as_str()) {
83 return;
84 }
85 out.push(stemmed);
86}
87
88thread_local! {
89 // The Snowball (Porter2) English stemmer (ADR-0048). `Stemmer` holds no
90 // per-call state and `stem` is a pure function; we cache one per thread to
91 // avoid re-creating it on every token. Stemming is the seam ADR-0046 reserved.
92 static STEMMER: rust_stemmers::Stemmer =
93 rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::English);
94}
95
96// Reduce a token to its Snowball (Porter2) stem so morphological variants conflate
97// (`connection`/`connected`/`connecting` → `connect`, `ponies` → `poni`). Ingest
98// and query share this, so the conflation is consistent on both sides (ADR-0048).
99fn stem(token: &str) -> String {
100 STEMMER.with(|s| s.stem(token).into_owned())
101}
102
103/// Tokenize `text` into a term-frequency [`SparseVector`]: dimension ids are token
104/// ids ([`term_id`]) and values are within-text term counts. The ingest side of
105/// the BM25 path (ADR-0046).
106pub fn text_to_sparse(text: &str) -> SparseVector {
107 let mut tf: HashMap<u32, f32> = HashMap::new();
108 for token in tokens(text) {
109 *tf.entry(term_id(&token)).or_insert(0.0) += 1.0;
110 }
111 let mut indices = Vec::with_capacity(tf.len());
112 let mut values = Vec::with_capacity(tf.len());
113 for (id, count) in tf {
114 indices.push(id);
115 values.push(count);
116 }
117 SparseVector { indices, values }
118}
119
120/// Tokenize `text` into the de-duplicated query term ids BM25 scores against (a
121/// repeated query term counts once). The query side of the BM25 path (ADR-0046).
122pub fn query_term_ids(text: &str) -> Vec<u32> {
123 let mut seen = std::collections::HashSet::new();
124 tokens(text)
125 .into_iter()
126 .map(|t| term_id(&t))
127 .filter(|id| seen.insert(*id))
128 .collect()
129}
130
131#[cfg(test)]
132mod tests {
133 use super::*;
134
135 #[test]
136 fn term_id_is_stable_and_distinguishes_tokens() {
137 assert_eq!(term_id("quiver"), term_id("quiver"));
138 assert_ne!(term_id("quiver"), term_id("vector"));
139 // Known FNV-1a value pins determinism across platforms.
140 assert_eq!(term_id(""), 0x811c_9dc5);
141 }
142
143 #[test]
144 fn splits_lowercases_and_strips_punctuation() {
145 assert_eq!(tokens("Hello, WORLD!"), vec!["hello", "world"]);
146 assert_eq!(tokens("rust-lang/quiver"), vec!["rust", "lang", "quiver"]);
147 assert_eq!(tokens("café Über 2026"), vec!["café", "über", "2026"]);
148 }
149
150 #[test]
151 fn removes_stop_words_before_and_after_stemming() {
152 // "the", "is", "on", "a" are all stop words; only content words survive.
153 assert_eq!(tokens("the cat is on a mat"), vec!["cat", "mat"]);
154 }
155
156 #[test]
157 fn snowball_stemmer_conflates_morphological_variants() {
158 // Plurals and verb inflections reduce to a shared stem (ADR-0048).
159 assert_eq!(stem("cats"), "cat");
160 assert_eq!(stem("connecting"), "connect");
161 assert_eq!(stem("connected"), "connect");
162 assert_eq!(stem("connection"), "connect");
163 // A root is left at (or near) itself, never emptied.
164 assert_eq!(stem("cat"), "cat");
165 assert!(!stem("is").is_empty());
166 // The point of stemming: query and document forms conflate to one term, so
167 // a search for "connect" matches a document about "connections".
168 assert_eq!(tokens("connections")[0], tokens("connect")[0]);
169 assert_eq!(tokens("running")[0], tokens("run")[0]);
170 assert_eq!(tokens("cats")[0], tokens("cat")[0]);
171 }
172
173 #[test]
174 fn text_to_sparse_counts_term_frequencies() {
175 // "the" is a stop word; "cats"/"cat" conflate to one term seen twice.
176 let sv = text_to_sparse("the cat cats");
177 assert_eq!(sv.indices.len(), 1);
178 assert_eq!(sv.values, vec![2.0]);
179 assert_eq!(sv.indices[0], term_id("cat"));
180 assert!(text_to_sparse("the of and").is_empty());
181 }
182
183 #[test]
184 fn query_term_ids_are_deduplicated() {
185 let ids = query_term_ids("cat cat dog");
186 assert_eq!(ids.len(), 2);
187 assert_eq!(ids[0], term_id("cat")); // order preserved, first occurrence
188 assert_eq!(ids[1], term_id("dog"));
189 assert!(query_term_ids("the a of").is_empty());
190 }
191}