// agpm_cli/tokens/mod.rs

//! Token counting utilities using BPE tokenization.
//!
//! This module provides approximate token counting for resource content using the
//! cl100k BPE encoding, which is compatible with Claude and GPT-4 models.
//!
//! # Usage
//!
//! ```rust,no_run
//! use agpm_cli::tokens;
//!
//! let content = "Hello, world!";
//! let count = tokens::count_tokens(content);
//! println!("Approximate token count: {}", count);
//! ```
//!
//! # Performance
//!
//! The tokenizer is lazily initialized on first use and cached for subsequent calls.
//! Token counting is O(n) and optimized for high throughput.
/// Get the cached tokenizer instance.
///
/// The bpe-openai crate uses LazyLock internally, so we just return the static reference.
/// The first call pays the one-time initialization cost; every subsequent
/// call is a cheap static lookup. Kept private so the choice of encoding
/// (cl100k) stays an implementation detail of this module.
fn get_tokenizer() -> &'static bpe_openai::Tokenizer {
    bpe_openai::cl100k_base()
}
27
28/// Count approximate tokens in content using cl100k encoding.
29///
30/// This uses the cl100k BPE encoding which is compatible with Claude and GPT-4.
31/// The count is approximate since different models may use slightly different
32/// tokenization schemes.
33///
34/// # Arguments
35///
36/// * `content` - The text content to count tokens for
37///
38/// # Returns
39///
40/// The approximate number of tokens in the content.
41///
42/// # Example
43///
44/// ```rust,no_run
45/// use agpm_cli::tokens::count_tokens;
46///
47/// let tokens = count_tokens("Hello, world!");
48/// assert!(tokens > 0);
49/// ```
50#[must_use]
51pub fn count_tokens(content: &str) -> usize {
52 get_tokenizer().count(content)
53}
54
/// Format a token count for human-readable display.
///
/// Counts below 1,000 are printed verbatim; larger counts are scaled to
/// one decimal place with a `k` (thousands) or `M` (millions) suffix.
/// Values just under a million render as e.g. "1000.0k" rather than
/// rolling over to "1.0M".
///
/// # Arguments
///
/// * `count` - The token count to format
///
/// # Returns
///
/// A formatted string representation (e.g., "150.2k", "1.5M").
///
/// # Examples
///
/// ```rust
/// use agpm_cli::tokens::format_token_count;
///
/// assert_eq!(format_token_count(500), "500");
/// assert_eq!(format_token_count(1500), "1.5k");
/// assert_eq!(format_token_count(1500000), "1.5M");
/// ```
#[must_use]
pub fn format_token_count(count: usize) -> String {
    // Small counts need no scaling; bail out early.
    if count < 1_000 {
        return count.to_string();
    }
    let value = count as f64;
    if count < 1_000_000 {
        format!("{:.1}k", value / 1_000.0)
    } else {
        format!("{:.1}M", value / 1_000_000.0)
    }
}
86
#[cfg(test)]
mod tests {
    use super::*;

    // --- count_tokens ---

    #[test]
    fn empty_string_yields_zero_tokens() {
        assert_eq!(count_tokens(""), 0);
    }

    #[test]
    fn short_phrase_yields_a_handful_of_tokens() {
        // "Hello, world!" should tokenize to a small, nonzero count.
        let count = count_tokens("Hello, world!");
        assert!((1..10).contains(&count));
    }

    #[test]
    fn longer_text_yields_more_tokens() {
        let content = "This is a longer piece of text that should result in more tokens. \
            The quick brown fox jumps over the lazy dog.";
        assert!(count_tokens(content) > 10);
    }

    // --- format_token_count ---

    #[test]
    fn sub_thousand_counts_print_verbatim() {
        for (input, expected) in [(0, "0"), (1, "1"), (999, "999")] {
            assert_eq!(format_token_count(input), expected);
        }
    }

    #[test]
    fn thousands_get_a_k_suffix() {
        let cases = [(1000, "1.0k"), (1500, "1.5k"), (12500, "12.5k"), (999999, "1000.0k")];
        for (input, expected) in cases {
            assert_eq!(format_token_count(input), expected);
        }
    }

    #[test]
    fn millions_get_an_m_suffix() {
        let cases = [(1_000_000, "1.0M"), (1_500_000, "1.5M"), (10_000_000, "10.0M")];
        for (input, expected) in cases {
            assert_eq!(format_token_count(input), expected);
        }
    }
}
133}