ailoop_core/tokenizer.rs
1//! Cross-crate token-counting trait.
2//!
3//! `Tokenizer` is the single contract every ailoop crate measures
4//! against when it needs to know how many tokens a piece of text or a
5//! conversation is worth. It lives in `ailoop-core` because the types
6//! that get tokenized — [`Message`], its blocks, and the `Usage`
7//! reports providers send back — already live here, so neither
8//! `ailoop-history` (history compaction) nor `ailoop-prompts` (system
9//! prompt assembly) has to depend on the other to share a counter.
10//!
11//! Implementations fall in two families:
12//! - Offline tokenizers ship a model-specific BPE table and produce
13//! exact counts (e.g. tiktoken for OpenAI/Azure).
14//! - Online-calibrated tokenizers maintain an EMA over the
15//! tokens-per-char ratio observed in real provider responses
16//! (e.g. `ailoop_anthropic::OnlineCalibratedTokenizer`). Cheap and
17//! "good enough" for compaction budgets.
18//!
//! [`CharTokenizer`] is a deliberately rough fallback (UTF-8 byte
//! length divided by four, i.e. `len() / 4`) used by `ailoop-history`
//! when no real tokenizer has been wired up. It is documented as a
//! fallback rather than a recommended default: production callers
//! should plug in a provider-specific implementation.
23
24use crate::{AssistantBlock, Message, ToolResultBlock, UserBlock};
25
26/// Counts tokens in text and full messages.
27///
28/// Implementations only have to provide [`Self::count_text`]; the
29/// message-level helpers walk every block kind that contributes to
30/// what the model actually sees on the wire (text, tool calls and
31/// their JSON args, tool results, reasoning text and signatures,
32/// redacted reasoning payloads).
33///
34/// Implementations must be `Send + Sync` because consumers wrap them
35/// behind `Arc<dyn Tokenizer>` and use them across `await` boundaries
36/// (the engine, conversation history, and middlewares are all multi-
37/// task by design).
38pub trait Tokenizer: Send + Sync {
39 /// Count tokens in a flat string. The only required method.
40 fn count_text(&self, text: &str) -> usize;
41
42 /// Count tokens in a single [`Message`], walking every block kind
43 /// the provider sees. Defaults to summing block-level
44 /// [`Self::count_text`] calls; override if your tokenizer has a
45 /// cheaper batch path.
46 fn count_message(&self, message: &Message) -> usize {
47 let mut total = 0;
48 match message {
49 Message::User { blocks } => {
50 for block in blocks {
51 match block {
52 UserBlock::Text { text, .. } => total += self.count_text(text),
53 UserBlock::ToolResult {
54 call_id, content, ..
55 } => {
56 total += self.count_text(call_id);
57 for tr_block in &content.blocks {
58 match tr_block {
59 ToolResultBlock::Text { text } => {
60 total += self.count_text(text);
61 }
62 // Image blocks contribute 0 in the
63 // default impl. CharTokenizer cannot
64 // estimate image tokens; calibrated
65 // tokenizers drift toward the truth
66 // via the Usage feedback loop.
67 ToolResultBlock::Image { .. } => {}
68 }
69 }
70 }
71 // Image / Document contribute 0 in the default
72 // impl. See the comment above for the rationale.
73 UserBlock::Image { .. } | UserBlock::Document { .. } => {}
74 }
75 }
76 }
77 Message::Assistant { blocks } => {
78 for block in blocks {
79 match block {
80 AssistantBlock::Text { text, .. } => total += self.count_text(text),
81 AssistantBlock::ToolCall { id, name, args, .. } => {
82 total += self.count_text(id)
83 + self.count_text(name)
84 + self.count_text(&args.to_string());
85 }
86 AssistantBlock::Reasoning { text, signature } => {
87 total += self.count_text(text);
88 if let Some(sig) = signature {
89 total += self.count_text(sig);
90 }
91 }
92 AssistantBlock::RedactedReasoning { data } => {
93 total += self.count_text(data);
94 }
95 }
96 }
97 }
98 }
99 total
100 }
101
102 /// Count tokens in a slice of messages — the budget unit
103 /// `History::compact_if_needed` measures against.
104 fn count_messages(&self, messages: &[Message]) -> usize {
105 messages.iter().map(|m| self.count_message(m)).sum()
106 }
107}
108
/// Fallback `Tokenizer` that approximates tokens as `text.len() / 4`.
///
/// `len()` here is the UTF-8 byte length of the text, and the division
/// rounds down, so inputs shorter than four bytes count as zero tokens.
///
/// This is the rule-of-thumb every provider documentation suggests
/// for back-of-envelope sizing — accurate enough to spot the difference
/// between "10 tokens" and "10k tokens", but **not** a substitute for
/// a real tokenizer when budgets are tight. It is the silent default
/// in `ailoop-history::HistoryBuilder` so dev/test code does not
/// have to wire one up; production callers should pass an explicit
/// provider-specific tokenizer
/// (e.g. `ailoop_anthropic::OnlineCalibratedTokenizer`) via
/// `HistoryBuilder::tokenizer`.
///
/// The type is a stateless unit struct, so it derives the common
/// traits: it can be debug-printed, copied freely, and constructed
/// through `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct CharTokenizer;
121
122impl Tokenizer for CharTokenizer {
123 fn count_text(&self, text: &str) -> usize {
124 text.len() / 4
125 }
126}
127
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Tokenizer stub that treats every whitespace-separated word as
    /// one token. It makes it easy to assert that `count_message`
    /// visits every block kind without reasoning about the byte-based
    /// fallback.
    struct WordTokenizer;

    impl Tokenizer for WordTokenizer {
        fn count_text(&self, text: &str) -> usize {
            text.split_whitespace().count()
        }
    }

    #[test]
    fn char_tokenizer_uses_len_div_four() {
        let tokenizer = CharTokenizer;
        // (input, expected tokens); "hello world" is 11 bytes, 11 / 4 = 2.
        let cases = [("", 0), ("abcd", 1), ("hello world", 2)];
        for &(input, expected) in &cases {
            assert_eq!(tokenizer.count_text(input), expected);
        }
    }

    #[test]
    fn count_message_walks_user_blocks() {
        use crate::ToolResultContent;

        let user_text = UserBlock::text("hello world from user");
        let tool_result = UserBlock::tool_result("call_42", ToolResultContent::text("ok done"));
        let message = Message::User {
            blocks: vec![user_text, tool_result],
        };

        // text(4) + call_id(1) + tool_result text(2) = 7
        let counted = WordTokenizer.count_message(&message);
        assert_eq!(counted, 7);
    }

    #[test]
    fn count_message_image_and_document_blocks_contribute_zero() {
        use crate::Source;

        let image = UserBlock::image(Source::Url {
            url: "https://example.com/x.png".into(),
        });
        let document = UserBlock::document(Source::Url {
            url: "https://example.com/x.pdf".into(),
        });
        let message = Message::User {
            blocks: vec![image, document],
        };

        let counted = WordTokenizer.count_message(&message);
        assert_eq!(counted, 0);
    }

    #[test]
    fn count_message_walks_assistant_blocks_including_reasoning() {
        let text = AssistantBlock::text("two words");
        let tool_call = AssistantBlock::tool_call("id_1", "tool_name", json!({"k": "v"}));
        let reasoning = AssistantBlock::Reasoning {
            text: "thinking aloud".into(),
            signature: Some("sig token".into()),
        };
        let redacted = AssistantBlock::RedactedReasoning {
            data: "redacted_blob".into(),
        };
        let message = Message::Assistant {
            blocks: vec![text, tool_call, reasoning, redacted],
        };

        // text(2)
        // + tool_call: id(1) + name(1) + args.to_string()=`{"k":"v"}`(1)
        // + reasoning(2) + signature(2)
        // + redacted(1)
        // = 2 + 1 + 1 + 1 + 2 + 2 + 1 = 10
        let counted = WordTokenizer.count_message(&message);
        assert_eq!(counted, 10);
    }

    #[test]
    fn count_messages_sums_each_message() {
        let conversation = vec![
            Message::user("one two three"),
            Message::assistant_text("four five"),
        ];

        // user(3) + assistant(2) = 5
        let counted = WordTokenizer.count_messages(&conversation);
        assert_eq!(counted, 5);
    }
}
214}