triplets_core/tokenizer.rs
//! Tokenization primitives used across chunking, sampling, and BM25 indexing.
//!
//! # Structural tokenizers vs. model tokenizers
//!
//! The [`Tokenizer`](crate::tokenizer::Tokenizer) trait and its default implementation, [`WhitespaceTokenizer`](crate::tokenizer::WhitespaceTokenizer),
//! are **structural** tokenizers — their token counts drive window sizing, prefix
//! budget arithmetic, and BM25 term-frequency scoring. They are **not** the
//! subword tokenizers used by embedding or language models, which include:
//!
//! * **BPE** (Byte-Pair Encoding) — GPT-series, RoBERTa, most OpenAI encoders.
//! * **WordPiece** — BERT-family models.
//! * **SentencePiece / Unigram** — T5, LLaMA, Mistral, and most instruction-tuned LLMs.
//!
//! Subword tokenizers operate on a learned vocabulary and routinely split a
//! single word into multiple tokens. Whitespace token counts are a *structural
//! estimate*, running roughly 0.75–1.3× the equivalent BPE token count depending
//! on vocabulary and language. Exact model token counts are unnecessary and
//! prohibitively expensive to compute without a loaded tokenizer binary.
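//!
//! A minimal usage sketch of treating the structural count as a budget
//! estimate, assuming this module is exported as `triplets_core::tokenizer`
//! (as the doc links above imply); the 1.3 padding factor is illustrative,
//! not a crate constant:
//!
//! ```
//! use triplets_core::tokenizer::{Tokenizer, WhitespaceTokenizer};
//!
//! let text = "chunk me into fixed-size windows";
//! let structural = WhitespaceTokenizer.token_count(text);
//!
//! // Pad the structural estimate when comparing against a model's context
//! // budget instead of invoking a subword tokenizer.
//! let padded_estimate = (structural as f64 * 1.3).ceil() as usize;
//! assert!(padded_estimate >= structural);
//! ```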

/// Tokenizer over text slices.
///
/// Implementations are expected to be cheap to construct — ideally zero-size —
/// and stateless. Methods take `&self` to allow future implementations that
/// carry configuration (e.g. vocabulary, normalisation flags).
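///
/// # Examples
///
/// A sketch of a configured implementation that relies on the default
/// [`token_count`](Tokenizer::token_count); `CharSplitTokenizer` is an
/// illustrative name, not a type in this crate, and the import path follows
/// the module docs:
///
/// ```
/// use triplets_core::tokenizer::Tokenizer;
///
/// // Splits on a single configurable delimiter character.
/// struct CharSplitTokenizer {
///     delimiter: char,
/// }
///
/// impl Tokenizer for CharSplitTokenizer {
///     fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
///         text.split(self.delimiter).filter(|s| !s.is_empty()).collect()
///     }
/// }
///
/// let t = CharSplitTokenizer { delimiter: ',' };
/// assert_eq!(t.tokenize("a,b,,c"), vec!["a", "b", "c"]);
/// assert_eq!(t.token_count("a,b,,c"), 3);
/// ```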
pub trait Tokenizer {
    /// Split `text` into tokens, returning slices into the original string.
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;

    /// Count the number of tokens in `text`.
    ///
    /// Implementations should override this when a direct count is cheaper
    /// than collecting tokens into a `Vec`.
    fn token_count(&self, text: &str) -> usize {
        self.tokenize(text).len()
    }
}

/// Unicode-scalar whitespace tokenizer.
///
/// Splits on any sequence of Unicode whitespace via [`str::split_whitespace`]
/// and discards empty spans. Zero-size; free to copy.
///
/// Token counts produced by this type are a *structural estimate* — see the
/// [module documentation](self) for how they relate to subword model tokenizers.
///
/// # Performance
///
/// Both [`tokenize`](Tokenizer::tokenize) and [`token_count`](Tokenizer::token_count)
/// are O(n) single-pass scans with no internal allocation beyond the returned
/// `Vec`. An LRU cache would add memory pressure and synchronisation overhead
/// that outweighs any benefit at these text sizes.
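///
/// # Examples
///
/// A short usage sketch (import path as in the module docs):
///
/// ```
/// use triplets_core::tokenizer::{Tokenizer, WhitespaceTokenizer};
///
/// let tokens = WhitespaceTokenizer.tokenize("alpha\tbeta  gamma");
/// assert_eq!(tokens, vec!["alpha", "beta", "gamma"]);
/// assert_eq!(WhitespaceTokenizer.token_count("alpha\tbeta  gamma"), 3);
/// ```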
#[derive(Clone, Copy, Debug, Default)]
pub struct WhitespaceTokenizer;

impl Tokenizer for WhitespaceTokenizer {
    #[inline]
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        text.split_whitespace().collect()
    }

    #[inline]
    fn token_count(&self, text: &str) -> usize {
        text.split_whitespace().count()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // --- Tokenizer::tokenize ---

    #[test]
    fn tokenize_splits_on_spaces() {
        let tokens = WhitespaceTokenizer.tokenize("hello world foo");
        assert_eq!(tokens, vec!["hello", "world", "foo"]);
    }

    #[test]
    fn tokenize_splits_on_tabs_and_newlines() {
        let tokens = WhitespaceTokenizer.tokenize("a\tb\nc");
        assert_eq!(tokens, vec!["a", "b", "c"]);
    }

    #[test]
    fn tokenize_collapses_runs_of_whitespace() {
        let tokens = WhitespaceTokenizer.tokenize("  foo   bar  ");
        assert_eq!(tokens, vec!["foo", "bar"]);
    }

    #[test]
    fn tokenize_empty_string_returns_empty() {
        assert!(WhitespaceTokenizer.tokenize("").is_empty());
    }

    #[test]
    fn tokenize_whitespace_only_returns_empty() {
        assert!(WhitespaceTokenizer.tokenize(" \t\n ").is_empty());
    }

    #[test]
    fn tokenize_single_token_no_whitespace() {
        let tokens = WhitespaceTokenizer.tokenize("solo");
        assert_eq!(tokens, vec!["solo"]);
    }

    #[test]
    fn tokenize_returns_slices_into_original() {
        let text = String::from("alpha beta gamma");
        let tokens = WhitespaceTokenizer.tokenize(&text);
        // Pointers should point inside the original allocation.
        for token in &tokens {
            let token_ptr = token.as_ptr() as usize;
            let text_start = text.as_ptr() as usize;
            let text_end = text_start + text.len();
            assert!(token_ptr >= text_start && token_ptr < text_end);
        }
    }

    #[test]
    fn tokenize_unicode_whitespace_splits_correctly() {
        // U+3000 IDEOGRAPHIC SPACE is Unicode whitespace.
        let tokens = WhitespaceTokenizer.tokenize("東京\u{3000}大阪");
        assert_eq!(tokens, vec!["東京", "大阪"]);
    }

    // --- Tokenizer::token_count ---

    #[test]
    fn token_count_matches_tokenize_len() {
        let text = "one two three four";
        assert_eq!(
            WhitespaceTokenizer.token_count(text),
            WhitespaceTokenizer.tokenize(text).len()
        );
    }

    #[test]
    fn token_count_empty_is_zero() {
        assert_eq!(WhitespaceTokenizer.token_count(""), 0);
    }

    #[test]
    fn token_count_whitespace_only_is_zero() {
        assert_eq!(WhitespaceTokenizer.token_count(" \t\n "), 0);
    }

    #[test]
    fn token_count_single_word() {
        assert_eq!(WhitespaceTokenizer.token_count("word"), 1);
    }

    // --- Trait default method ---

    #[test]
    fn default_token_count_delegates_to_tokenize() {
        // Tokenizer that always splits on '|' — exercises the default `token_count`.
        struct PipeTokenizer;
        impl Tokenizer for PipeTokenizer {
            fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
                text.split('|').filter(|s| !s.is_empty()).collect()
            }
        }
        // token_count falls back to tokenize().len() since PipeTokenizer doesn't override it.
        assert_eq!(PipeTokenizer.token_count("a|b|c"), 3);
        assert_eq!(PipeTokenizer.token_count(""), 0);
    }

    // --- Derive traits ---

    #[test]
    fn whitespace_tokenizer_is_clone_copy_and_debug() {
        let t = WhitespaceTokenizer;
        let cloned = t;
        let copied = t;
        assert_eq!(format!("{:?}", cloned), "WhitespaceTokenizer");
        let _ = copied;
    }

    #[test]
    fn whitespace_tokenizer_default_is_usable() {
        let t = WhitespaceTokenizer::default();
        assert_eq!(t.token_count("x y"), 2);
    }
}