// semantic_memory/tokenizer.rs
1//! Pluggable token counting for context budget management.
2//!
3//! Provides the [`TokenCounter`] trait for text-to-token-count conversion,
4//! with [`EstimateTokenCounter`] as a simple default.
5
6use std::sync::Arc;
7
/// Trait for counting tokens in text.
///
/// Implement this to plug in tiktoken, sentencepiece, or any
/// model-specific tokenizer for accurate context budget management.
///
/// # Examples
///
/// ```rust
/// use semantic_memory::TokenCounter;
///
/// struct MyTokenizer;
/// impl TokenCounter for MyTokenizer {
///     fn count_tokens(&self, text: &str) -> usize {
///         text.split_whitespace().count()
///     }
/// }
/// ```
pub trait TokenCounter: Send + Sync {
    /// Count the number of tokens in the given text.
    ///
    /// Implementations should return `0` for empty input and be
    /// deterministic for a given `text`.
    fn count_tokens(&self, text: &str) -> usize;
}

/// Default token counter: estimates tokens as UTF-8 byte length `/ 4`.
///
/// Acceptable for English prose (~4 bytes per token on average).
/// Inaccurate for CJK text (~1 token per char, but 3 UTF-8 bytes per
/// char), code, or structured data. Replace with a real tokenizer for
/// accurate budget management.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct EstimateTokenCounter;

impl TokenCounter for EstimateTokenCounter {
    fn count_tokens(&self, text: &str) -> usize {
        if text.is_empty() {
            0
        } else {
            // Any non-empty text costs at least one token; without the
            // floor, strings shorter than 4 bytes would be budgeted as free.
            (text.len() / 4).max(1)
        }
    }
}
46
47/// Create the default token counter (estimate-based).
48pub(crate) fn default_token_counter() -> Arc<dyn TokenCounter> {
49 Arc::new(EstimateTokenCounter)
50}