Skip to main content

context_builder/
token_count.rs

1use ignore::DirEntry;
2use once_cell::sync::Lazy;
3use std::collections::BTreeMap;
4use std::fs;
5use std::path::Path;
6/// Token counting utilities for estimating LLM token usage
7use tiktoken_rs::{CoreBPE, cl100k_base};
8
9// Initialize the tokenizer once and reuse it
10static TOKENIZER: Lazy<CoreBPE> = Lazy::new(|| cl100k_base().unwrap());
11
12/// Estimates the number of tokens in a text string using a real tokenizer
13pub fn estimate_tokens(text: &str) -> usize {
14    TOKENIZER.encode_with_special_tokens(text).len()
15}
16
17/// Counts the tokens that would be generated for a file
18pub fn count_file_tokens(base_path: &Path, entry: &DirEntry, line_numbers: bool) -> usize {
19    let file_path = entry.path();
20    let relative_path = file_path.strip_prefix(base_path).unwrap_or(file_path);
21
22    // Start with tokens for the file header (path, size, modified time)
23    let mut token_count = estimate_tokens(&format!(
24        "\n### File: `{}`\n\n- Size: {} bytes\n- Modified: {}\n\n",
25        relative_path.display(),
26        entry.metadata().map(|m| m.len()).unwrap_or(0),
27        "Unknown"
28    )); // Using "Unknown" as placeholder for modified time in estimation
29
30    // Add tokens for the code fences
31    token_count += estimate_tokens("```\n```");
32
33    // Try to read file content
34    if let Ok(content) = fs::read_to_string(file_path) {
35        if line_numbers {
36            // When line numbers are enabled, we add the line number prefix to each line
37            let lines_with_numbers: String = content
38                .lines()
39                .enumerate()
40                .map(|(i, line)| format!("{:>4} | {}\n", i + 1, line))
41                .collect();
42            token_count += estimate_tokens(&lines_with_numbers);
43        } else {
44            token_count += estimate_tokens(&content);
45        }
46    }
47
48    token_count
49}
50
51/// Counts the tokens that would be generated for the entire file tree section
52pub fn count_tree_tokens(tree: &BTreeMap<String, crate::tree::FileNode>, depth: usize) -> usize {
53    let mut token_count = 0;
54
55    // Add tokens for indentation
56    let indent = "  ".repeat(depth);
57
58    for (name, node) in tree {
59        match node {
60            crate::tree::FileNode::File => {
61                token_count += estimate_tokens(&format!("{}- πŸ“„ {}\n", indent, name));
62            }
63            crate::tree::FileNode::Directory(children) => {
64                token_count += estimate_tokens(&format!("{}- πŸ“ {}\n", indent, name));
65                token_count += count_tree_tokens(children, depth + 1);
66            }
67        }
68    }
69
70    token_count
71}
72
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::BTreeMap;

    #[test]
    fn test_estimate_tokens() {
        // A short English sentence: 4 tokens under cl100k_base.
        assert_eq!(estimate_tokens("Hello, world!"), 4);

        // A small code snippet: 12 tokens under cl100k_base.
        let snippet = "fn main() {\n    println!(\"Hello, world!\");\n}";
        assert_eq!(estimate_tokens(snippet), 12);
    }

    #[test]
    fn test_count_tree_tokens() {
        // One file at the root plus one file inside "src/".
        let mut root = BTreeMap::new();
        root.insert("file1.rs".to_string(), crate::tree::FileNode::File);

        let mut src_dir = BTreeMap::new();
        src_dir.insert("file2.md".to_string(), crate::tree::FileNode::File);
        root.insert("src".to_string(), crate::tree::FileNode::Directory(src_dir));

        // "- πŸ“„ file1.rs\n" (8) + "- πŸ“ src\n" (6) + "  - πŸ“„ file2.md\n" (9) = 23
        assert_eq!(count_tree_tokens(&root, 0), 23);
    }

    #[test]
    fn test_token_estimation_format_consistency() {
        use tempfile::tempdir;

        let dir = tempdir().unwrap();
        let file = dir.path().join("test.rs");
        std::fs::write(&file, "fn main() {}\n").unwrap();

        let entry = ignore::WalkBuilder::new(&file)
            .build()
            .next()
            .unwrap()
            .unwrap();

        let estimated_tokens = count_file_tokens(dir.path(), &entry, false);

        // Render the real markdown for the same file and count its tokens.
        let mut rendered = Vec::new();
        crate::markdown::process_file(
            dir.path(),
            &file,
            &mut rendered,
            false,
            None,
            &crate::markdown::TreeSitterConfig::default(),
        )
        .unwrap();
        let actual_tokens = estimate_tokens(&String::from_utf8(rendered).unwrap());

        // Timestamps and minor formatting differ between estimate and real
        // output, so allow 10% or 20 tokens of slack, whichever is larger.
        let difference = actual_tokens.abs_diff(estimated_tokens);
        let max_allowed_difference = std::cmp::max(actual_tokens / 10, 20);

        assert!(
            difference <= max_allowed_difference,
            "Token estimation {} differs too much from actual {} (difference: {})",
            estimated_tokens,
            actual_tokens,
            difference
        );
    }

    #[test]
    fn test_estimate_tokens_empty_string() {
        // The empty string tokenizes to nothing.
        assert_eq!(estimate_tokens(""), 0);
    }

    #[test]
    fn test_estimate_tokens_whitespace_only() {
        // Whitespace is still tokenized.
        assert!(estimate_tokens("   \n\t  ") > 0);
    }

    #[test]
    fn test_estimate_tokens_unicode() {
        // Non-ASCII characters may expand to several tokens each.
        let count = estimate_tokens("Hello δΈ–η•Œ! 🌍");
        assert!(count > 0);
        assert!(count >= 4);
    }

    #[test]
    fn test_count_file_tokens_with_line_numbers() {
        use tempfile::tempdir;

        let dir = tempdir().unwrap();
        let file = dir.path().join("test.rs");
        std::fs::write(&file, "line 1\nline 2\nline 3").unwrap();

        let entry = ignore::WalkBuilder::new(&file)
            .build()
            .next()
            .unwrap()
            .unwrap();

        let plain = count_file_tokens(dir.path(), &entry, false);
        let numbered = count_file_tokens(dir.path(), &entry, true);

        // The "   N | " prefixes must cost additional tokens.
        assert!(numbered > plain);
    }

    #[test]
    fn test_count_file_tokens_unreadable_file() {
        use tempfile::tempdir;

        // Simulate a file deleted between discovery and processing: obtain a
        // DirEntry while the file exists, then remove the file.
        let dir = tempdir().unwrap();
        let file = dir.path().join("nonexistent.txt");
        let walker = ignore::WalkBuilder::new(dir.path());

        std::fs::write(&file, "temp").unwrap();
        let found = walker
            .build()
            .filter_map(Result::ok)
            .find(|e| e.path() == file);
        std::fs::remove_file(&file).unwrap();

        if let Some(entry) = found {
            // Header tokens are still counted even though the body is gone.
            assert!(count_file_tokens(dir.path(), &entry, false) > 0);
        }
    }

    #[test]
    fn test_count_tree_tokens_empty_tree() {
        // An empty tree renders nothing, hence zero tokens.
        assert_eq!(count_tree_tokens(&BTreeMap::new(), 0), 0);
    }

    #[test]
    fn test_count_tree_tokens_nested_directories() {
        // Build level1/level2/level3/deep_file.txt from the inside out.
        let mut level3 = BTreeMap::new();
        level3.insert("deep_file.txt".to_string(), crate::tree::FileNode::File);

        let mut level2 = BTreeMap::new();
        level2.insert("level3".to_string(), crate::tree::FileNode::Directory(level3));

        let mut level1 = BTreeMap::new();
        level1.insert("level2".to_string(), crate::tree::FileNode::Directory(level2));

        let mut root = BTreeMap::new();
        root.insert("level1".to_string(), crate::tree::FileNode::Directory(level1));

        let shallow = count_tree_tokens(&root, 0);
        assert!(shallow > 0);

        // Starting deeper adds indentation, which costs extra tokens.
        assert!(count_tree_tokens(&root, 2) > shallow);
    }

    #[test]
    fn test_count_tree_tokens_mixed_content() {
        let mut root = BTreeMap::new();

        // Short, long, and non-ASCII file names.
        root.insert("a.txt".to_string(), crate::tree::FileNode::File);
        root.insert(
            "very_long_filename_with_underscores.rs".to_string(),
            crate::tree::FileNode::File,
        );
        root.insert("Ρ„Π°ΠΉΠ».txt".to_string(), crate::tree::FileNode::File);

        let mut nested = BTreeMap::new();
        nested.insert("nested.md".to_string(), crate::tree::FileNode::File);
        root.insert(
            "directory".to_string(),
            crate::tree::FileNode::Directory(nested),
        );

        let tokens = count_tree_tokens(&root, 0);
        assert!(tokens > 0);

        // Unicode names must not panic and should cost a fair number of tokens.
        assert!(tokens > 20);
    }
}