Skip to main content

ailoop_core/
tokenizer.rs

1//! Cross-crate token-counting trait.
2//!
3//! `Tokenizer` is the single contract every ailoop crate measures
4//! against when it needs to know how many tokens a piece of text or a
5//! conversation is worth. It lives in `ailoop-core` because the types
6//! that get tokenized — [`Message`], its blocks, and the `Usage`
7//! reports providers send back — already live here, so neither
8//! `ailoop-history` (history compaction) nor `ailoop-prompts` (system
9//! prompt assembly) has to depend on the other to share a counter.
10//!
11//! Implementations fall into two families:
12//! - Offline tokenizers ship a model-specific BPE table and produce
13//!   exact counts (e.g. tiktoken for OpenAI/Azure).
14//! - Online-calibrated tokenizers maintain an EMA over the
15//!   tokens-per-char ratio observed in real provider responses
16//!   (e.g. `ailoop_anthropic::OnlineCalibratedTokenizer`). Cheap and
17//!   "good enough" for compaction budgets.
18//!
19//! [`CharTokenizer`] is a deliberately rough fallback (`len() / 4`)
20//! used by `ailoop-history` when no real tokenizer has been wired up.
21//! It is documented as a fallback rather than a recommended default:
22//! production callers should plug in a provider-specific implementation.
23
24use crate::{AssistantBlock, Message, ToolResultBlock, UserBlock};
25
26/// Counts tokens in text and full messages.
27///
28/// Implementations only have to provide [`Self::count_text`]; the
29/// message-level helpers walk every block kind that contributes to
30/// what the model actually sees on the wire (text, tool calls and
31/// their JSON args, tool results, reasoning text and signatures,
32/// redacted reasoning payloads).
33///
34/// Implementations must be `Send + Sync` because consumers wrap them
35/// behind `Arc<dyn Tokenizer>` and use them across `await` boundaries
36/// (the engine, conversation history, and middlewares are all multi-
37/// task by design).
38pub trait Tokenizer: Send + Sync {
39    /// Count tokens in a flat string. The only required method.
40    fn count_text(&self, text: &str) -> usize;
41
42    /// Count tokens in a single [`Message`], walking every block kind
43    /// the provider sees. Defaults to summing block-level
44    /// [`Self::count_text`] calls; override if your tokenizer has a
45    /// cheaper batch path.
46    fn count_message(&self, message: &Message) -> usize {
47        let mut total = 0;
48        match message {
49            Message::User { blocks } => {
50                for block in blocks {
51                    match block {
52                        UserBlock::Text { text, .. } => total += self.count_text(text),
53                        UserBlock::ToolResult {
54                            call_id, content, ..
55                        } => {
56                            total += self.count_text(call_id);
57                            for tr_block in &content.blocks {
58                                match tr_block {
59                                    ToolResultBlock::Text { text } => {
60                                        total += self.count_text(text);
61                                    }
62                                    // Image blocks contribute 0 in the
63                                    // default impl. CharTokenizer cannot
64                                    // estimate image tokens; calibrated
65                                    // tokenizers drift toward the truth
66                                    // via the Usage feedback loop.
67                                    ToolResultBlock::Image { .. } => {}
68                                }
69                            }
70                        }
71                        // Image / Document contribute 0 in the default
72                        // impl. See the comment above for the rationale.
73                        UserBlock::Image { .. } | UserBlock::Document { .. } => {}
74                    }
75                }
76            }
77            Message::Assistant { blocks } => {
78                for block in blocks {
79                    match block {
80                        AssistantBlock::Text { text, .. } => total += self.count_text(text),
81                        AssistantBlock::ToolCall { id, name, args, .. } => {
82                            total += self.count_text(id)
83                                + self.count_text(name)
84                                + self.count_text(&args.to_string());
85                        }
86                        AssistantBlock::Reasoning { text, signature } => {
87                            total += self.count_text(text);
88                            if let Some(sig) = signature {
89                                total += self.count_text(sig);
90                            }
91                        }
92                        AssistantBlock::RedactedReasoning { data } => {
93                            total += self.count_text(data);
94                        }
95                    }
96                }
97            }
98        }
99        total
100    }
101
102    /// Count tokens in a slice of messages — the budget unit
103    /// `History::compact_if_needed` measures against.
104    fn count_messages(&self, messages: &[Message]) -> usize {
105        messages.iter().map(|m| self.count_message(m)).sum()
106    }
107}
108
/// Fallback `Tokenizer` that approximates tokens as `text.len() / 4`.
///
/// This is the rule-of-thumb every provider documentation suggests
/// for back-of-envelope sizing — accurate enough to spot the difference
/// between "10 tokens" and "10k tokens", but **not** a substitute for
/// a real tokenizer when budgets are tight. It is the silent default
/// in `ailoop-history::HistoryBuilder` so dev/test code does not
/// have to wire one up; production callers should pass an explicit
/// provider-specific tokenizer
/// (e.g. `ailoop_anthropic::OnlineCalibratedTokenizer`) via
/// `HistoryBuilder::tokenizer`.
///
/// The type is a stateless unit struct, so it derives the common
/// traits: `Debug` lets it appear in `{:?}` output of types that hold
/// it, `Clone`/`Copy` make it free to pass around, and `Default`
/// lets generic code construct it without naming the type.
#[derive(Debug, Clone, Copy, Default)]
pub struct CharTokenizer;
121
122impl Tokenizer for CharTokenizer {
123    fn count_text(&self, text: &str) -> usize {
124        text.len() / 4
125    }
126}
127
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Tokenizer for tests: one token per whitespace-separated word.
    /// Word counts are easy to verify by eye, which makes it simple to
    /// check that `count_message` visits every block kind without
    /// doing `len() / 4` arithmetic in your head.
    struct WordTokenizer;

    impl Tokenizer for WordTokenizer {
        fn count_text(&self, text: &str) -> usize {
            text.split_whitespace().count()
        }
    }

    #[test]
    fn char_tokenizer_uses_len_div_four() {
        let tokenizer = CharTokenizer;
        // (input, expected) — "hello world" is 11 bytes, 11 / 4 = 2.
        let cases: &[(&str, usize)] = &[("", 0), ("abcd", 1), ("hello world", 2)];
        for &(input, expected) in cases {
            assert_eq!(tokenizer.count_text(input), expected);
        }
    }

    #[test]
    fn count_message_walks_user_blocks() {
        use crate::ToolResultContent;
        let tokenizer = WordTokenizer;
        let text_block = UserBlock::text("hello world from user");
        let result_block = UserBlock::tool_result("call_42", ToolResultContent::text("ok done"));
        let message = Message::User {
            blocks: vec![text_block, result_block],
        };
        // 4 words of text + 1 for the call id + 2 words of tool result = 7
        assert_eq!(tokenizer.count_message(&message), 7);
    }

    #[test]
    fn count_message_image_and_document_blocks_contribute_zero() {
        use crate::Source;
        let tokenizer = WordTokenizer;
        let image_block = UserBlock::image(Source::Url {
            url: "https://example.com/x.png".into(),
        });
        let document_block = UserBlock::document(Source::Url {
            url: "https://example.com/x.pdf".into(),
        });
        let message = Message::User {
            blocks: vec![image_block, document_block],
        };
        assert_eq!(tokenizer.count_message(&message), 0);
    }

    #[test]
    fn count_message_walks_assistant_blocks_including_reasoning() {
        let tokenizer = WordTokenizer;
        let message = Message::Assistant {
            blocks: vec![
                AssistantBlock::text("two words"),
                AssistantBlock::tool_call("id_1", "tool_name", json!({"k": "v"})),
                AssistantBlock::Reasoning {
                    text: "thinking aloud".into(),
                    signature: Some("sig token".into()),
                },
                AssistantBlock::RedactedReasoning {
                    data: "redacted_blob".into(),
                },
            ],
        };
        // Per block:
        //   text                      -> 2
        //   tool call id              -> 1
        //   tool call name            -> 1
        //   args as `{"k":"v"}`       -> 1 (no whitespace, so one word)
        //   reasoning text            -> 2
        //   reasoning signature       -> 2
        //   redacted reasoning data   -> 1
        // Total: 2 + 1 + 1 + 1 + 2 + 2 + 1 = 10
        assert_eq!(tokenizer.count_message(&message), 10);
    }

    #[test]
    fn count_messages_sums_each_message() {
        let tokenizer = WordTokenizer;
        let conversation = vec![
            Message::user("one two three"),
            Message::assistant_text("four five"),
        ];
        // 3 words + 2 words
        assert_eq!(tokenizer.count_messages(&conversation), 5);
    }
}