sqlrite/sql/fts/tokenizer.rs
1//! ASCII tokenizer for FTS — splits on `[^A-Za-z0-9]+` and lowercases.
2//!
3//! Resolves Phase 8 plan Q3 (ASCII MVP). Unicode-aware tokenization is
4//! deferred to Phase 8.1 behind a `unicode` cargo feature; the limitation
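//! here is intentional. Non-ASCII bytes are treated as separators, which
//! means accented Latin words are cut at the accented character (`café`
//! is indexed as `caf`), while text made only of non-ASCII characters
//! (CJK and other non-ASCII scripts) yields no tokens at all. Neither is
//! properly searchable until that follow-up lands.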
8//!
9//! No stemming and no stop-word removal (Q4 + Q5). BM25's IDF naturally
10//! downweights common terms, and modern RAG pipelines rely on exact
11//! lexical matches for technical retrieval.
12
/// Break `text` into lowercase ASCII-alphanumeric terms.
///
/// Every maximal run of ASCII letters and digits becomes one token, with
/// letters folded to lowercase. Everything else (whitespace, punctuation,
/// and any non-ASCII character) only separates tokens and never appears
/// in one. Input that is empty, or that contains no ASCII alphanumerics,
/// produces an empty `Vec`.
///
/// The terms are handed back as owned `String`s instead of `&str` slices
/// because the posting list owns its term strings (see
/// [`super::posting_list::PostingList`]); matching that shape at the call
/// site avoids a second allocation downstream.
pub fn tokenize(text: &str) -> Vec<String> {
    // Splitting per `char` matches a per-byte scan here: every byte of a
    // multi-byte UTF-8 character is non-ASCII, so the whole character acts
    // as a separator either way.
    text.split(|ch: char| !ch.is_ascii_alphanumeric())
        // Adjacent separators produce empty pieces; those are not tokens.
        .filter(|run| !run.is_empty())
        .map(str::to_ascii_lowercase)
        .collect()
}
36
#[cfg(test)]
mod tests {
    //! Unit tests for [`tokenize`]. Expected token lists are written as
    //! `&str` arrays; `Vec<String>` compares against them directly.

    use super::*;

    #[test]
    fn empty_input_returns_empty_vec() {
        // Empty text, whitespace only, and punctuation only all yield nothing.
        for input in ["", " ", "!!!---???"] {
            assert!(tokenize(input).is_empty());
        }
    }

    #[test]
    fn splits_on_punctuation_and_whitespace() {
        let punctuated = tokenize("hello, world!");
        assert_eq!(punctuated, ["hello", "world"]);

        let whitespace_separated = tokenize("a\tb\nc d");
        assert_eq!(whitespace_separated, ["a", "b", "c", "d"]);
    }

    #[test]
    fn lowercases_ascii_letters() {
        let lowered = tokenize("FooBar BAZ");
        assert_eq!(lowered, ["foobar", "baz"]);
    }

    #[test]
    fn alphanumeric_runs_stay_together() {
        // Digits count as alphanumeric, so "rust2026" stays one token.
        assert_eq!(tokenize("rust2026"), ["rust2026"]);
        // The hyphen is a separator, so "co-op" becomes two tokens.
        assert_eq!(tokenize("co-op"), ["co", "op"]);
    }

    #[test]
    fn non_ascii_bytes_act_as_separators_without_panicking() {
        // ASCII MVP per Q3: the two UTF-8 bytes of `é` (0xC3 0xA9) are
        // separators, so "café" is truncated to "caf". Documented limitation.
        assert_eq!(tokenize("café"), ["caf"]);
        // CJK text has no ASCII bytes at all, so nothing is emitted.
        let cjk = tokenize("日本語");
        assert!(cjk.is_empty());
    }

    #[test]
    fn smoke_module_path_reaches_through_lib() {
        // Calls the tokenizer through `crate::sql::fts::tokenize`, the
        // public `sql` module path from the crate root. If 8b ever moves
        // the module behind a feature gate, this test will fail loudly.
        let via_crate_path = crate::sql::fts::tokenize("Hello, world!");
        assert_eq!(via_crate_path, ["hello", "world"]);
    }
}