Skip to main content

quiver_query/
tokenize.rs

1// SPDX-License-Identifier: AGPL-3.0-only
2//! A small, dependency-free tokenizer for the BM25 / full-text path (ADR-0046).
3//!
4//! Turns text into term ids so a tokenized string *is* a [`SparseVector`] whose
5//! values are term frequencies, reusing the sparse machinery of ADR-0043/0045. The
6//! pipeline is deterministic — Unicode-aware splitting on non-alphanumeric
7//! boundaries, lowercasing, a small English stop-word filter, and **Snowball
8//! (Porter2) stemming** — so the same text always produces the same terms, and
9//! ingest and query tokenize identically.
10//!
11//! The stemmer is the Snowball English (Porter2) algorithm via `rust-stemmers`
12//! (ADR-0048), which conflates morphological variants — `connection` /
13//! `connected` / `connecting` → `connect`, `ponies` → `poni` — so a query term
14//! matches inflected document terms. Because ingest and query share the same
15//! stemmer the conflation is consistent on both sides. (It superseded the original
16//! dependency-free plural-only S-stemmer of ADR-0046, behind the same [`tokens`]
17//! seam.)
18//!
19//! One remaining, documented ceiling: term ids are a 32-bit FNV-1a hash of the
20//! token, so distinct tokens can in principle collide. For realistic vocabularies
21//! this is negligible (and learned-sparse vocabularies already collide by
22//! construction).
23
24use std::collections::HashMap;
25
26use crate::sparse::SparseVector;
27
28/// The reserved payload key carrying a point's full-text field (ADR-0046). When a
29/// point has no explicit `__quiver_sparse__` vector but carries a string under this
30/// key, the engine tokenizes it into a term-frequency sparse vector at ingest, so
31/// the point is searchable by BM25 over text alone.
32pub const TEXT_KEY: &str = "__quiver_text__";
33
34/// A compact English stop-word list (closed-class function words). Small on
35/// purpose: aggressive stop-word removal hurts more than it helps for short
36/// queries, and BM25's IDF already down-weights ubiquitous terms.
37const STOP_WORDS: &[&str] = &[
38    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
39    "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
40    "they", "this", "to", "was", "will", "with",
41];
42
43/// The stable 32-bit dimension id for a token (FNV-1a). Deterministic across runs
44/// and platforms.
45pub fn term_id(token: &str) -> u32 {
46    let mut hash: u32 = 0x811c_9dc5; // FNV offset basis
47    for byte in token.bytes() {
48        hash ^= u32::from(byte);
49        hash = hash.wrapping_mul(0x0100_0193); // FNV prime
50    }
51    hash
52}
53
54/// Tokenize `text` into normalized terms: lowercased, split on non-alphanumeric
55/// boundaries, stop-words removed, and plural-stemmed. The order is preserved and
56/// duplicates are kept (so callers can compute term frequencies).
57pub fn tokens(text: &str) -> Vec<String> {
58    let mut out = Vec::new();
59    let mut current = String::new();
60    for ch in text.chars() {
61        if ch.is_alphanumeric() {
62            // `to_lowercase` is Unicode-correct and may expand one char to several.
63            current.extend(ch.to_lowercase());
64        } else if !current.is_empty() {
65            push_term(&mut out, &current);
66            current.clear();
67        }
68    }
69    if !current.is_empty() {
70        push_term(&mut out, &current);
71    }
72    out
73}
74
75// Stem and stop-filter one raw (already-lowercased) token, pushing it if kept.
76fn push_term(out: &mut Vec<String>, raw: &str) {
77    if STOP_WORDS.contains(&raw) {
78        return;
79    }
80    let stemmed = stem(raw);
81    // Re-check the stop list after stemming (e.g. a stemmed form could land on one).
82    if stemmed.is_empty() || STOP_WORDS.contains(&stemmed.as_str()) {
83        return;
84    }
85    out.push(stemmed);
86}
87
88thread_local! {
89    // The Snowball (Porter2) English stemmer (ADR-0048). `Stemmer` holds no
90    // per-call state and `stem` is a pure function; we cache one per thread to
91    // avoid re-creating it on every token. Stemming is the seam ADR-0046 reserved.
92    static STEMMER: rust_stemmers::Stemmer =
93        rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::English);
94}
95
96// Reduce a token to its Snowball (Porter2) stem so morphological variants conflate
97// (`connection`/`connected`/`connecting` → `connect`, `ponies` → `poni`). Ingest
98// and query share this, so the conflation is consistent on both sides (ADR-0048).
99fn stem(token: &str) -> String {
100    STEMMER.with(|s| s.stem(token).into_owned())
101}
102
103/// Tokenize `text` into a term-frequency [`SparseVector`]: dimension ids are token
104/// ids ([`term_id`]) and values are within-text term counts. The ingest side of
105/// the BM25 path (ADR-0046).
106pub fn text_to_sparse(text: &str) -> SparseVector {
107    let mut tf: HashMap<u32, f32> = HashMap::new();
108    for token in tokens(text) {
109        *tf.entry(term_id(&token)).or_insert(0.0) += 1.0;
110    }
111    let mut indices = Vec::with_capacity(tf.len());
112    let mut values = Vec::with_capacity(tf.len());
113    for (id, count) in tf {
114        indices.push(id);
115        values.push(count);
116    }
117    SparseVector { indices, values }
118}
119
120/// Tokenize `text` into the de-duplicated query term ids BM25 scores against (a
121/// repeated query term counts once). The query side of the BM25 path (ADR-0046).
122pub fn query_term_ids(text: &str) -> Vec<u32> {
123    let mut seen = std::collections::HashSet::new();
124    tokens(text)
125        .into_iter()
126        .map(|t| term_id(&t))
127        .filter(|id| seen.insert(*id))
128        .collect()
129}
130
131#[cfg(test)]
132mod tests {
133    use super::*;
134
135    #[test]
136    fn term_id_is_stable_and_distinguishes_tokens() {
137        assert_eq!(term_id("quiver"), term_id("quiver"));
138        assert_ne!(term_id("quiver"), term_id("vector"));
139        // Known FNV-1a value pins determinism across platforms.
140        assert_eq!(term_id(""), 0x811c_9dc5);
141    }
142
143    #[test]
144    fn splits_lowercases_and_strips_punctuation() {
145        assert_eq!(tokens("Hello, WORLD!"), vec!["hello", "world"]);
146        assert_eq!(tokens("rust-lang/quiver"), vec!["rust", "lang", "quiver"]);
147        assert_eq!(tokens("café Über 2026"), vec!["café", "über", "2026"]);
148    }
149
150    #[test]
151    fn removes_stop_words_before_and_after_stemming() {
152        // "the", "is", "on", "a" are all stop words; only content words survive.
153        assert_eq!(tokens("the cat is on a mat"), vec!["cat", "mat"]);
154    }
155
156    #[test]
157    fn snowball_stemmer_conflates_morphological_variants() {
158        // Plurals and verb inflections reduce to a shared stem (ADR-0048).
159        assert_eq!(stem("cats"), "cat");
160        assert_eq!(stem("connecting"), "connect");
161        assert_eq!(stem("connected"), "connect");
162        assert_eq!(stem("connection"), "connect");
163        // A root is left at (or near) itself, never emptied.
164        assert_eq!(stem("cat"), "cat");
165        assert!(!stem("is").is_empty());
166        // The point of stemming: query and document forms conflate to one term, so
167        // a search for "connect" matches a document about "connections".
168        assert_eq!(tokens("connections")[0], tokens("connect")[0]);
169        assert_eq!(tokens("running")[0], tokens("run")[0]);
170        assert_eq!(tokens("cats")[0], tokens("cat")[0]);
171    }
172
173    #[test]
174    fn text_to_sparse_counts_term_frequencies() {
175        // "the" is a stop word; "cats"/"cat" conflate to one term seen twice.
176        let sv = text_to_sparse("the cat cats");
177        assert_eq!(sv.indices.len(), 1);
178        assert_eq!(sv.values, vec![2.0]);
179        assert_eq!(sv.indices[0], term_id("cat"));
180        assert!(text_to_sparse("the of and").is_empty());
181    }
182
183    #[test]
184    fn query_term_ids_are_deduplicated() {
185        let ids = query_term_ids("cat cat dog");
186        assert_eq!(ids.len(), 2);
187        assert_eq!(ids[0], term_id("cat")); // order preserved, first occurrence
188        assert_eq!(ids[1], term_id("dog"));
189        assert!(query_term_ids("the a of").is_empty());
190    }
191}