// agpm_cli/tokens/mod.rs

//! Token counting utilities using BPE tokenization.
//!
//! This module provides approximate token counting for resource content using the
//! cl100k BPE encoding, which is compatible with Claude and GPT-4 models.
//!
//! # Usage
//!
//! ```rust,no_run
//! use agpm_cli::tokens;
//!
//! let content = "Hello, world!";
//! let count = tokens::count_tokens(content);
//! println!("Approximate token count: {}", count);
//! ```
//!
//! # Performance
//!
//! The tokenizer is lazily initialized on first use and cached for subsequent calls.
//! Token counting is O(n) and optimized for high throughput.
/// Get the cached tokenizer instance.
///
/// The bpe-openai crate uses LazyLock internally, so we just return the static reference.
/// The first call pays the one-time initialization cost; every subsequent
/// call is a cheap static lookup. Kept private so the choice of encoding
/// (cl100k) stays an implementation detail of this module.
fn get_tokenizer() -> &'static bpe_openai::Tokenizer {
    bpe_openai::cl100k_base()
}
27
28/// Count approximate tokens in content using cl100k encoding.
29///
30/// This uses the cl100k BPE encoding which is compatible with Claude and GPT-4.
31/// The count is approximate since different models may use slightly different
32/// tokenization schemes.
33///
34/// # Arguments
35///
36/// * `content` - The text content to count tokens for
37///
38/// # Returns
39///
40/// The approximate number of tokens in the content.
41///
42/// # Example
43///
44/// ```rust,no_run
45/// use agpm_cli::tokens::count_tokens;
46///
47/// let tokens = count_tokens("Hello, world!");
48/// assert!(tokens > 0);
49/// ```
50#[must_use]
51pub fn count_tokens(content: &str) -> usize {
52 get_tokenizer().count(content)
53}
54
/// Format a token count for human-readable display.
///
/// Counts below 1,000 are printed verbatim; larger counts are scaled to
/// one decimal place with a `k` (thousands) or `M` (millions) suffix.
/// Values just under a million render as e.g. "1000.0k" rather than
/// rolling over to "1.0M".
///
/// # Arguments
///
/// * `count` - The token count to format
///
/// # Returns
///
/// A formatted string representation (e.g., "150.2k", "1.5M").
///
/// # Examples
///
/// ```rust
/// use agpm_cli::tokens::format_token_count;
///
/// assert_eq!(format_token_count(500), "500");
/// assert_eq!(format_token_count(1500), "1.5k");
/// assert_eq!(format_token_count(1500000), "1.5M");
/// ```
#[must_use]
pub fn format_token_count(count: usize) -> String {
    // Small counts need no scaling; bail out early.
    if count < 1_000 {
        return count.to_string();
    }
    let value = count as f64;
    if count < 1_000_000 {
        format!("{:.1}k", value / 1_000.0)
    } else {
        format!("{:.1}M", value / 1_000_000.0)
    }
}
86
#[cfg(test)]
mod tests {
    use super::*;

    // --- count_tokens ---

    #[test]
    fn empty_string_yields_zero_tokens() {
        assert_eq!(count_tokens(""), 0);
    }

    #[test]
    fn short_phrase_yields_a_handful_of_tokens() {
        // "Hello, world!" should tokenize to a small, nonzero count.
        let count = count_tokens("Hello, world!");
        assert!((1..10).contains(&count));
    }

    #[test]
    fn longer_text_yields_more_tokens() {
        let content = "This is a longer piece of text that should result in more tokens. \
            The quick brown fox jumps over the lazy dog.";
        assert!(count_tokens(content) > 10);
    }

    // --- format_token_count ---

    #[test]
    fn sub_thousand_counts_print_verbatim() {
        for (input, expected) in [(0, "0"), (1, "1"), (999, "999")] {
            assert_eq!(format_token_count(input), expected);
        }
    }

    #[test]
    fn thousands_get_a_k_suffix() {
        let cases = [(1000, "1.0k"), (1500, "1.5k"), (12500, "12.5k"), (999999, "1000.0k")];
        for (input, expected) in cases {
            assert_eq!(format_token_count(input), expected);
        }
    }

    #[test]
    fn millions_get_an_m_suffix() {
        let cases = [(1_000_000, "1.0M"), (1_500_000, "1.5M"), (10_000_000, "10.0M")];
        for (input, expected) in cases {
            assert_eq!(format_token_count(input), expected);
        }
    }
}
133}