use crate::{AssistantBlock, Message, ToolResultBlock, UserBlock};
/// Counts tokens in raw text and in structured conversation messages.
///
/// Implementors supply only [`Tokenizer::count_text`]; the message-level
/// methods are derived from it. Implementations must be `Send + Sync`.
pub trait Tokenizer: Send + Sync {
    /// Returns the number of tokens this tokenizer assigns to `text`.
    fn count_text(&self, text: &str) -> usize;

    /// Returns the total token count of every textual part of `message`.
    ///
    /// What is counted:
    /// - user and assistant text blocks;
    /// - tool results: the call id plus every text block of the content;
    /// - tool calls: the id, the name, and the `to_string()` rendering of
    ///   the arguments;
    /// - reasoning blocks: the text plus the signature, when present;
    /// - redacted reasoning blocks: the opaque data string.
    ///
    /// Image and document blocks contribute zero tokens.
    fn count_message(&self, message: &Message) -> usize {
        match message {
            Message::User { blocks } => blocks
                .iter()
                .map(|block| match block {
                    UserBlock::Text { text, .. } => self.count_text(text),
                    UserBlock::ToolResult {
                        call_id, content, ..
                    } => {
                        // The call id is counted before the result body.
                        let id_tokens = self.count_text(call_id);
                        let body_tokens = content
                            .blocks
                            .iter()
                            .map(|part| match part {
                                ToolResultBlock::Text { text } => self.count_text(text),
                                // Images inside tool results are not counted.
                                ToolResultBlock::Image { .. } => 0,
                            })
                            .sum::<usize>();
                        id_tokens + body_tokens
                    }
                    // Non-textual user blocks are not counted.
                    UserBlock::Image { .. } | UserBlock::Document { .. } => 0,
                })
                .sum::<usize>(),
            Message::Assistant { blocks } => blocks
                .iter()
                .map(|block| match block {
                    AssistantBlock::Text { text, .. } => self.count_text(text),
                    AssistantBlock::ToolCall { id, name, args, .. } => {
                        let id_tokens = self.count_text(id);
                        let name_tokens = self.count_text(name);
                        // Arguments are counted through their string rendering.
                        let rendered_args = args.to_string();
                        id_tokens + name_tokens + self.count_text(&rendered_args)
                    }
                    AssistantBlock::Reasoning { text, signature } => {
                        let text_tokens = self.count_text(text);
                        let signature_tokens = signature
                            .as_ref()
                            .map_or(0, |sig| self.count_text(sig));
                        text_tokens + signature_tokens
                    }
                    AssistantBlock::RedactedReasoning { data } => self.count_text(data),
                })
                .sum::<usize>(),
        }
    }

    /// Returns the sum of [`Tokenizer::count_message`] over `messages`.
    ///
    /// An empty slice yields zero.
    fn count_messages(&self, messages: &[Message]) -> usize {
        let mut total = 0;
        for message in messages {
            total += self.count_message(message);
        }
        total
    }
}
/// Tokenizer that estimates token counts purely from the length of the text.
///
/// Every four bytes of UTF-8 input count as one token; any remainder is
/// discarded, so inputs shorter than four bytes count as zero tokens.
pub struct CharTokenizer;

impl Tokenizer for CharTokenizer {
    /// Returns the byte length of `text` divided by four, rounded down.
    fn count_text(&self, text: &str) -> usize {
        /// Number of bytes treated as a single token.
        const BYTES_PER_TOKEN: usize = 4;

        let byte_len = text.len();
        byte_len / BYTES_PER_TOKEN
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Test tokenizer that counts one token per whitespace-separated word,
    /// which keeps the expected totals below easy to compute by hand.
    struct WordTokenizer;

    impl Tokenizer for WordTokenizer {
        fn count_text(&self, text: &str) -> usize {
            text.split_whitespace().count()
        }
    }

    #[test]
    fn char_tokenizer_uses_len_div_four() {
        let tokenizer = CharTokenizer;
        // (input, expected token count); "hello world" is 11 bytes -> 2.
        let cases = [("", 0), ("abcd", 1), ("hello world", 2)];
        for (input, expected) in cases {
            assert_eq!(tokenizer.count_text(input), expected);
        }
    }

    #[test]
    fn count_message_walks_user_blocks() {
        use crate::ToolResultContent;

        let tokenizer = WordTokenizer;
        let text_block = UserBlock::text("hello world from user");
        let result_block = UserBlock::tool_result("call_42", ToolResultContent::text("ok done"));
        let message = Message::User {
            blocks: vec![text_block, result_block],
        };

        // 4 words of text + 1 for the call id + 2 words of tool output.
        assert_eq!(tokenizer.count_message(&message), 7);
    }

    #[test]
    fn count_message_image_and_document_blocks_contribute_zero() {
        use crate::Source;

        let tokenizer = WordTokenizer;
        let image = UserBlock::image(Source::Url {
            url: "https://example.com/x.png".into(),
        });
        let document = UserBlock::document(Source::Url {
            url: "https://example.com/x.pdf".into(),
        });
        let message = Message::User {
            blocks: vec![image, document],
        };

        assert_eq!(tokenizer.count_message(&message), 0);
    }

    #[test]
    fn count_message_walks_assistant_blocks_including_reasoning() {
        let tokenizer = WordTokenizer;
        let text_block = AssistantBlock::text("two words");
        let call_block = AssistantBlock::tool_call("id_1", "tool_name", json!({"k": "v"}));
        let reasoning_block = AssistantBlock::Reasoning {
            text: "thinking aloud".into(),
            signature: Some("sig token".into()),
        };
        let redacted_block = AssistantBlock::RedactedReasoning {
            data: "redacted_blob".into(),
        };
        let message = Message::Assistant {
            blocks: vec![text_block, call_block, reasoning_block, redacted_block],
        };

        // text (2) + tool call id/name/args (1 + 1 + 1)
        // + reasoning text and signature (2 + 2) + redacted data (1).
        assert_eq!(tokenizer.count_message(&message), 10);
    }

    #[test]
    fn count_messages_sums_each_message() {
        let tokenizer = WordTokenizer;
        let conversation = vec![
            Message::user("one two three"),
            Message::assistant_text("four five"),
        ];

        // 3 words in the user message + 2 in the assistant message.
        assert_eq!(tokenizer.count_messages(&conversation), 5);
    }
}