Skip to main content

sqlrite/sql/fts/
tokenizer.rs

1//! ASCII tokenizer for FTS — splits on `[^A-Za-z0-9]+` and lowercases.
2//!
3//! Resolves Phase 8 plan Q3 (ASCII MVP). Unicode-aware tokenization is
4//! deferred to Phase 8.1 behind a `unicode` cargo feature; the limitation
5//! here is intentional. Non-ASCII bytes are treated as separators, which
6//! means accented Latin (`café`), CJK, and other non-ASCII scripts won't
7//! be searchable until that follow-up lands.
8//!
9//! No stemming and no stop-word removal (Q4 + Q5). BM25's IDF naturally
10//! downweights common terms, and modern RAG pipelines rely on exact
11//! lexical matches for technical retrieval.
12
/// Break `text` into lowercase ASCII-alphanumeric terms.
///
/// Every character that is not an ASCII letter or digit (punctuation,
/// whitespace, and all non-ASCII characters) is a separator. Adjacent
/// separators collapse, so no empty term is ever produced; an empty
/// string, or one made up solely of separators, yields an empty `Vec`.
///
/// Terms come back as owned `String`s instead of `&str` slices because
/// the posting list owns its term strings (see
/// [`super::posting_list::PostingList`]); handing back owned values keeps
/// the call site in the same shape the index stores and spares a second
/// allocation downstream.
pub fn tokenize(text: &str) -> Vec<String> {
    text.split(|ch: char| !ch.is_ascii_alphanumeric())
        // Leading, trailing, and back-to-back separators produce empty
        // slices; those are not terms.
        .filter(|run| !run.is_empty())
        .map(str::to_ascii_lowercase)
        .collect()
}
36
#[cfg(test)]
mod tests {
    use super::tokenize;

    /// Assert that tokenizing `input` yields exactly `expected`, in order.
    /// `#[track_caller]` makes a failure point at the calling test line.
    #[track_caller]
    fn check(input: &str, expected: &[&str]) {
        assert_eq!(tokenize(input), expected);
    }

    #[test]
    fn empty_input_returns_empty_vec() {
        // Truly empty, whitespace-only, and punctuation-only inputs all
        // contain no alphanumeric run.
        for input in ["", "   ", "!!!---???"] {
            assert!(tokenize(input).is_empty());
        }
    }

    #[test]
    fn splits_on_punctuation_and_whitespace() {
        check("hello, world!", &["hello", "world"]);
        check("a\tb\nc d", &["a", "b", "c", "d"]);
    }

    #[test]
    fn lowercases_ascii_letters() {
        check("FooBar BAZ", &["foobar", "baz"]);
    }

    #[test]
    fn alphanumeric_runs_stay_together() {
        // Digits count as alphanumeric, so "rust2026" is one token.
        check("rust2026", &["rust2026"]);
        // The hyphen is a separator, so "co-op" becomes two tokens.
        check("co-op", &["co", "op"]);
    }

    #[test]
    fn non_ascii_bytes_act_as_separators_without_panicking() {
        // ASCII MVP per Q3: é is 0xC3 0xA9 in UTF-8, both non-ASCII bytes,
        // so it separates and "café" yields just "caf". Documented
        // limitation.
        check("café", &["caf"]);
        // CJK text contains no ASCII bytes at all, hence no tokens.
        assert!(tokenize("日本語").is_empty());
    }

    #[test]
    fn smoke_module_path_reaches_through_lib() {
        // Verifies `sqlrite::sql::fts::tokenize` is reachable through the
        // public `sql` module path from the crate root. Should 8b ever
        // hide the module behind a feature gate, this fails loudly.
        let via_crate_path = crate::sql::fts::tokenize("Hello, world!");
        assert_eq!(via_crate_path, ["hello", "world"]);
    }
}