Skip to main content

aprender_rag/
tokenizer.rs

//! HELIX-IDEA-005 Phase 4 — pluggable tokenizer trait.
//!
//! Contract: `contracts/apr-hybrid-retrieval-v1.yaml` (FALSIFY-HYBRID-003).
//!
//! Pre-Phase-4, `BM25Index` had a hardcoded internal `tokenize()`
//! method that split on non-alphanumeric boundaries with built-in
//! lowercasing + stopword filtering. The §2.5 sketch called this
//! out as a drift risk: a future `apr serve` inference path that uses
//! a different tokenizer (BPE / SentencePiece) would silently
//! disagree with BM25 on what counts as a "term", with no
//! compile-time anchor pinning the two together.
//!
//! Phase 4 extracts the contract: a public `Tokenizer` trait that
//! BM25Index can OPTIONALLY accept via [`BM25Index::with_tokenizer`].
//! The default behaviour is unchanged (the existing internal
//! tokenization stays the fallback when no override is set), so
//! existing callers don't break. Future callers — including a
//! shared inference tokenizer — implement [`Tokenizer`] and plug
//! in.
//!
//! Note on scope: this PR ships the trait surface plus the
//! BM25 plumbing to use it. Wiring the actual inference-path
//! tokenizer (Qwen / Llama BPE) into the same trait is a separate
//! effort; the gate in §2.5 only asserts the trait is pluggable
//! today, not that any specific inference tokenizer is wired.
26
/// Strategy for turning raw text into the terms BM25 indexes and queries.
///
/// The splitting rule is entirely up to the implementor (whitespace
/// plus lowercasing, BPE, SentencePiece, ...); `BM25Index` indexes
/// and searches with whatever terms the implementation returns.
///
/// Supertrait bounds:
/// - `Send + Sync` — a `BM25Index` holding a `dyn Tokenizer` can be
///   shared across threads.
/// - `Debug` — the index's derived `Debug` keeps working without a
///   hand-written impl.
pub trait Tokenizer: std::fmt::Debug + Send + Sync {
    /// Break `text` into the sequence of terms to index.
    fn tokenize(&self, text: &str) -> Vec<String>;
}
40
41/// Whitespace-and-non-alphanumeric tokenizer with optional
42/// lowercasing, stopword filtering, and minimum-length filtering.
43/// Matches the rule that lived inside `BM25Index` pre-Phase-4 and
44/// is the default when no other tokenizer is supplied.
45#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
46pub struct WhitespaceTokenizer {
47    /// If true, all tokens are lowercased before being emitted.
48    pub lowercase: bool,
49    /// Tokens of length strictly less than this are dropped.
50    pub min_token_len: usize,
51    /// Tokens contained in this set are dropped (after lowercase).
52    pub stopwords: std::collections::BTreeSet<String>,
53}
54
55impl Default for WhitespaceTokenizer {
56    fn default() -> Self {
57        Self { lowercase: true, min_token_len: 2, stopwords: std::collections::BTreeSet::new() }
58    }
59}
60
61impl WhitespaceTokenizer {
62    /// Construct with default settings (lowercase, min-length 2,
63    /// no stopwords). Use the public fields to customise.
64    #[must_use]
65    pub fn new() -> Self {
66        Self::default()
67    }
68}
69
70impl Tokenizer for WhitespaceTokenizer {
71    fn tokenize(&self, text: &str) -> Vec<String> {
72        text.split(|c: char| !c.is_alphanumeric())
73            .filter(|s| !s.is_empty())
74            .map(|s| if self.lowercase { s.to_lowercase() } else { s.to_string() })
75            .filter(|s| !self.stopwords.contains(s))
76            .filter(|s| s.len() >= self.min_token_len)
77            .collect()
78    }
79}
80
#[cfg(test)]
mod tests {
    use super::*;

    // Default settings: tokens are lowercased and the one-letter
    // tokens "a", "b", "c" fall below the minimum length of 2.
    #[test]
    fn whitespace_tokenizer_default_lowercases_and_drops_short() {
        let tokens = WhitespaceTokenizer::default().tokenize("Hello, World! a b c");
        assert_eq!(tokens, ["hello", "world"]);
    }

    // With lowercasing switched off the original casing survives.
    #[test]
    fn whitespace_tokenizer_can_disable_lowercase() {
        let mut tok = WhitespaceTokenizer::default();
        tok.lowercase = false;
        assert_eq!(tok.tokenize("Hello World"), ["Hello", "World"]);
    }

    // "Hello" is lowercased first and then matches the stopword "hello".
    #[test]
    fn whitespace_tokenizer_stopwords_drop_match() {
        let tok = WhitespaceTokenizer {
            stopwords: std::collections::BTreeSet::from([String::from("hello")]),
            ..Default::default()
        };
        assert_eq!(tok.tokenize("Hello World"), ["world"]);
    }
}