// triplets_core/tokenizer.rs

//! Tokenization primitives used across chunking, sampling, and BM25 indexing.
//!
//! # Structural tokenizers vs. model tokenizers
//!
//! The [`Tokenizer`](crate::tokenizer::Tokenizer) trait and its default implementation,
//! [`WhitespaceTokenizer`](crate::tokenizer::WhitespaceTokenizer), are **structural**
//! tokenizers — their token counts drive window sizing, prefix budget arithmetic,
//! and BM25 term-frequency scoring.  They are **not** the subword tokenizers used
//! by embedding or language models, which include:
//!
//! * **BPE** (Byte-Pair Encoding) — GPT-series, RoBERTa, most OpenAI encoders.
//! * **WordPiece** — BERT-family models.
//! * **SentencePiece / Unigram** — T5, LLaMA, Mistral, and most instruction-tuned LLMs.
//!
//! Subword tokenizers operate on a learned vocabulary and routinely split a
//! single word into multiple tokens.  Whitespace token counts are a *structural
//! estimate*, running roughly 0.75–1.3× the equivalent BPE token count depending
//! on vocabulary and language.  Exact model token counts are unnecessary for
//! these structural uses, and computing them requires loading a model-specific
//! tokenizer binary, which is prohibitively expensive here.
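//!
//! # Example
//!
//! A short sketch of the structural estimate in practice:
//!
//! ```
//! use triplets_core::tokenizer::{Tokenizer, WhitespaceTokenizer};
//!
//! // Seven whitespace tokens; per the 0.75–1.3× range above, a BPE encoder
//! // could emit anywhere from roughly five to nine subword tokens here.
//! let n = WhitespaceTokenizer.token_count("budget arithmetic needs only a rough count");
//! assert_eq!(n, 7);
//! ```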

/// Tokenizer over text slices.
///
/// Implementations are expected to be cheap to construct (ideally zero-sized)
/// and stateless.  Methods take `&self` so that future implementations can
/// carry configuration (e.g. vocabulary, normalisation flags).
pub trait Tokenizer {
    /// Split `text` into tokens, returning slices into the original string.
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;

    /// Count the number of tokens in `text`.
    ///
    /// Implementations should override this when a direct count is cheaper
    /// than collecting tokens into a `Vec`.
    fn token_count(&self, text: &str) -> usize {
        self.tokenize(text).len()
    }
}

/// Unicode-scalar whitespace tokenizer.
///
/// Splits on any sequence of Unicode whitespace via [`str::split_whitespace`]
/// and discards empty spans.  Zero-sized and free to copy.
///
/// Token counts produced by this type are a *structural estimate*; see the
/// [module documentation](self) for how they relate to subword model tokenizers.
///
/// # Performance
///
/// Both [`tokenize`](Tokenizer::tokenize) and [`token_count`](Tokenizer::token_count)
/// are O(n) single-pass scans; `tokenize` allocates only the returned `Vec`,
/// and `token_count` allocates nothing.  Caching counts (e.g. behind an LRU)
/// would add memory pressure and synchronisation overhead that outweigh any
/// benefit at these text sizes.
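///
/// # Example
///
/// A brief usage sketch; any run of mixed whitespace collapses to a single split:
///
/// ```
/// use triplets_core::tokenizer::{Tokenizer, WhitespaceTokenizer};
///
/// let tokens = WhitespaceTokenizer.tokenize("  foo\tbar \n baz  ");
/// assert_eq!(tokens, vec!["foo", "bar", "baz"]);
/// assert_eq!(WhitespaceTokenizer.token_count("  foo\tbar \n baz  "), 3);
/// ```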
#[derive(Clone, Copy, Debug, Default)]
pub struct WhitespaceTokenizer;

impl Tokenizer for WhitespaceTokenizer {
    #[inline]
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        text.split_whitespace().collect()
    }

    #[inline]
    fn token_count(&self, text: &str) -> usize {
        text.split_whitespace().count()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // --- Tokenizer::tokenize ---

    #[test]
    fn tokenize_splits_on_spaces() {
        let tokens = WhitespaceTokenizer.tokenize("hello world foo");
        assert_eq!(tokens, vec!["hello", "world", "foo"]);
    }

    #[test]
    fn tokenize_splits_on_tabs_and_newlines() {
        let tokens = WhitespaceTokenizer.tokenize("a\tb\nc");
        assert_eq!(tokens, vec!["a", "b", "c"]);
    }

    #[test]
    fn tokenize_collapses_runs_of_whitespace() {
        let tokens = WhitespaceTokenizer.tokenize("  foo   bar  ");
        assert_eq!(tokens, vec!["foo", "bar"]);
    }

    #[test]
    fn tokenize_empty_string_returns_empty() {
        assert!(WhitespaceTokenizer.tokenize("").is_empty());
    }

    #[test]
    fn tokenize_whitespace_only_returns_empty() {
        assert!(WhitespaceTokenizer.tokenize("   \t\n  ").is_empty());
    }

    #[test]
    fn tokenize_single_token_no_whitespace() {
        let tokens = WhitespaceTokenizer.tokenize("solo");
        assert_eq!(tokens, vec!["solo"]);
    }

    #[test]
    fn tokenize_returns_slices_into_original() {
        let text = String::from("alpha beta gamma");
        let tokens = WhitespaceTokenizer.tokenize(&text);
        // Pointers should point inside the original allocation.
        for token in &tokens {
            let token_ptr = token.as_ptr() as usize;
            let text_start = text.as_ptr() as usize;
            let text_end = text_start + text.len();
            assert!(token_ptr >= text_start && token_ptr < text_end);
        }
    }

    #[test]
    fn tokenize_unicode_whitespace_splits_correctly() {
        // U+3000 IDEOGRAPHIC SPACE is Unicode whitespace.
        let tokens = WhitespaceTokenizer.tokenize("東京\u{3000}大阪");
        assert_eq!(tokens, vec!["東京", "大阪"]);
    }

    // --- Tokenizer::token_count ---

    #[test]
    fn token_count_matches_tokenize_len() {
        let text = "one two three four";
        assert_eq!(
            WhitespaceTokenizer.token_count(text),
            WhitespaceTokenizer.tokenize(text).len()
        );
    }

    #[test]
    fn token_count_empty_is_zero() {
        assert_eq!(WhitespaceTokenizer.token_count(""), 0);
    }

    #[test]
    fn token_count_whitespace_only_is_zero() {
        assert_eq!(WhitespaceTokenizer.token_count("  \t\n "), 0);
    }

    #[test]
    fn token_count_single_word() {
        assert_eq!(WhitespaceTokenizer.token_count("word"), 1);
    }

    // --- Trait default method ---

    #[test]
    fn default_token_count_delegates_to_tokenize() {
        // Tokenizer that always splits on '|' — exercises the default `token_count`.
        struct PipeTokenizer;
        impl Tokenizer for PipeTokenizer {
            fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
                text.split('|').filter(|s| !s.is_empty()).collect()
            }
        }
        // token_count falls back to tokenize().len() since PipeTokenizer doesn't override it.
        assert_eq!(PipeTokenizer.token_count("a|b|c"), 3);
        assert_eq!(PipeTokenizer.token_count(""), 0);
    }

    // --- Derive traits ---

    #[test]
    fn whitespace_tokenizer_is_clone_copy_and_debug() {
        let t = WhitespaceTokenizer;
        #[allow(clippy::clone_on_copy)] // Exercise the `Clone` derive explicitly.
        let cloned = t.clone();
        // Two moves out of `t` compile only because the type is `Copy`.
        let copied = t;
        let copied_again = t;
        assert_eq!(format!("{:?}", cloned), "WhitespaceTokenizer");
        let _ = (copied, copied_again);
    }

    #[test]
    fn whitespace_tokenizer_default_is_usable() {
        let t = WhitespaceTokenizer::default();
        assert_eq!(t.token_count("x y"), 2);
    }
}