aprender_rag/tokenizer.rs
1//! HELIX-IDEA-005 Phase 4 — pluggable tokenizer trait.
2//!
3//! Contract: `contracts/apr-hybrid-retrieval-v1.yaml` (FALSIFY-HYBRID-003).
4//!
5//! Pre-Phase-4, `BM25Index` had a hardcoded internal `tokenize()`
6//! method that split on non-alphanumeric boundaries with built-in
7//! lowercasing + stopword filtering. The §2.5 sketch called this
//! out as a drift risk: a future `apr serve` inference path that uses
9//! a different tokenizer (BPE / SentencePiece) would silently
10//! disagree with BM25 on what counts as a "term", with no
11//! compile-time anchor pinning the two together.
12//!
13//! Phase 4 extracts the contract: a public `Tokenizer` trait that
14//! BM25Index can OPTIONALLY accept via [`BM25Index::with_tokenizer`].
15//! The default behaviour is unchanged (the existing internal
16//! tokenization stays the fallback when no override is set), so
17//! existing callers don't break. Future callers — including a
18//! shared inference tokenizer — implement [`Tokenizer`] and plug
19//! in.
20//!
21//! Note on scope: this PR ships the trait surface plus the
22//! BM25 plumbing to use it. Wiring the actual inference-path
23//! tokenizer (Qwen / Llama BPE) into the same trait is a separate
24//! effort; the gate in §2.5 only asserts the trait is pluggable
25//! today, not that any specific inference tokenizer is wired.
26
/// Strategy for turning a string into the terms BM25 works with.
///
/// The tokenization rule is entirely the implementor's choice
/// (whitespace + lowercase, BPE, sentencepiece, etc.); `BM25Index`
/// indexes and searches using whatever tokens are returned here.
///
/// Supertraits: `Send + Sync` let a `BM25Index` that carries a
/// `dyn Tokenizer` be shared across threads, and `Debug` keeps the
/// index's derived `Debug` impl working without a manual impl.
pub trait Tokenizer: std::fmt::Debug + Send + Sync {
    /// Break `text` into a sequence of indexable terms.
    fn tokenize(&self, text: &str) -> Vec<String>;
}
40
/// Whitespace-and-non-alphanumeric tokenizer with optional
/// lowercasing, stopword filtering, and minimum-length filtering.
/// Matches the rule that lived inside `BM25Index` pre-Phase-4 and
/// is the default when no other tokenizer is supplied.
///
/// All fields are public, so a customised instance can be built with
/// struct-update syntax on top of [`Default`].
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct WhitespaceTokenizer {
    /// If true, all tokens are lowercased before being emitted.
    pub lowercase: bool,
    /// Tokens of length strictly less than this are dropped. The length
    /// is the token's UTF-8 byte length (`String::len`), measured after
    /// any lowercasing, not its character count.
    pub min_token_len: usize,
    /// Tokens contained in this set are dropped. The lookup uses the
    /// token as it stands after the optional lowercasing step; the
    /// entries themselves are not normalised, so with `lowercase`
    /// enabled they need to be stored in lowercase to match.
    pub stopwords: std::collections::BTreeSet<String>,
}
54
55impl Default for WhitespaceTokenizer {
56 fn default() -> Self {
57 Self { lowercase: true, min_token_len: 2, stopwords: std::collections::BTreeSet::new() }
58 }
59}
60
61impl WhitespaceTokenizer {
62 /// Construct with default settings (lowercase, min-length 2,
63 /// no stopwords). Use the public fields to customise.
64 #[must_use]
65 pub fn new() -> Self {
66 Self::default()
67 }
68}
69
70impl Tokenizer for WhitespaceTokenizer {
71 fn tokenize(&self, text: &str) -> Vec<String> {
72 text.split(|c: char| !c.is_alphanumeric())
73 .filter(|s| !s.is_empty())
74 .map(|s| if self.lowercase { s.to_lowercase() } else { s.to_string() })
75 .filter(|s| !self.stopwords.contains(s))
76 .filter(|s| s.len() >= self.min_token_len)
77 .collect()
78 }
79}
80
#[cfg(test)]
mod tests {
    use super::*;

    /// Defaults lowercase every token and drop the one-letter ones.
    #[test]
    fn whitespace_tokenizer_default_lowercases_and_drops_short() {
        let terms = WhitespaceTokenizer::default().tokenize("Hello, World! a b c");
        assert_eq!(terms, vec!["hello", "world"]);
    }

    /// With `lowercase` off, tokens keep their original casing.
    #[test]
    fn whitespace_tokenizer_can_disable_lowercase() {
        let defaults = WhitespaceTokenizer::default();
        let tokenizer = WhitespaceTokenizer { lowercase: false, ..defaults };
        assert_eq!(tokenizer.tokenize("Hello World"), vec!["Hello", "World"]);
    }

    /// A stopword removes the matching (lowercased) token from the output.
    #[test]
    fn whitespace_tokenizer_stopwords_drop_match() {
        let mut tokenizer = WhitespaceTokenizer::default();
        tokenizer.stopwords.insert(String::from("hello"));
        assert_eq!(tokenizer.tokenize("Hello World"), vec!["world"]);
    }
}