// semantic_memory/tokenizer.rs
1//! Pluggable token counting for context budget management.
2//!
3//! Provides the [`TokenCounter`] trait for text-to-token-count conversion,
4//! with [`EstimateTokenCounter`] as a simple default.
5
6use std::sync::Arc;
7
/// Trait for counting tokens in text.
///
/// Implement this to plug in tiktoken, sentencepiece, or any
/// model-specific tokenizer for accurate context budget management.
///
/// # Examples
///
/// ```rust
/// use semantic_memory::TokenCounter;
///
/// struct MyTokenizer;
/// impl TokenCounter for MyTokenizer {
///     fn count_tokens(&self, text: &str) -> usize {
///         text.split_whitespace().count()
///     }
/// }
/// ```
pub trait TokenCounter: Send + Sync {
    /// Count the number of tokens in the given text.
    ///
    /// Implementations should return `0` for empty input and be
    /// deterministic for a given `text`.
    fn count_tokens(&self, text: &str) -> usize;
}

/// Default token counter: estimates tokens as UTF-8 byte length `/ 4`.
///
/// Acceptable for English prose (~4 bytes per token on average).
/// Inaccurate for CJK text (~1 token per char, but 3 UTF-8 bytes per
/// char), code, or structured data. Replace with a real tokenizer for
/// accurate budget management.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct EstimateTokenCounter;

impl TokenCounter for EstimateTokenCounter {
    fn count_tokens(&self, text: &str) -> usize {
        if text.is_empty() {
            0
        } else {
            // Any non-empty text costs at least one token; without the
            // floor, strings shorter than 4 bytes would be budgeted as free.
            (text.len() / 4).max(1)
        }
    }
}
46
47/// Create the default token counter (estimate-based).
48pub(crate) fn default_token_counter() -> Arc<dyn TokenCounter> {
49 Arc::new(EstimateTokenCounter)
50}