// context_builder/token_count.rs

1use ignore::DirEntry;
2use once_cell::sync::Lazy;
3use std::collections::BTreeMap;
4use std::fs;
5use std::path::Path;
6/// Token counting utilities for estimating LLM token usage
7use tiktoken_rs::{CoreBPE, cl100k_base};
8
9// Initialize the tokenizer once and reuse it
10static TOKENIZER: Lazy<CoreBPE> = Lazy::new(|| cl100k_base().unwrap());
11
12/// Estimates the number of tokens in a text string using a real tokenizer
13pub fn estimate_tokens(text: &str) -> usize {
14    TOKENIZER.encode_with_special_tokens(text).len()
15}
16
17/// Counts the tokens that would be generated for a file
18pub fn count_file_tokens(base_path: &Path, entry: &DirEntry, line_numbers: bool) -> usize {
19    let file_path = entry.path();
20    let relative_path = file_path.strip_prefix(base_path).unwrap_or(file_path);
21
22    // Start with tokens for the file header (path, size, modified time)
23    let mut token_count = estimate_tokens(&format!(
24        "\n### File: `{}`\n\n- Size: {} bytes\n- Modified: {}\n\n",
25        relative_path.display(),
26        entry.metadata().map(|m| m.len()).unwrap_or(0),
27        "Unknown"
28    )); // Using "Unknown" as placeholder for modified time in estimation
29
30    // Add tokens for the code fences
31    token_count += estimate_tokens("```\n```");
32
33    // Try to read file content
34    if let Ok(content) = fs::read_to_string(file_path) {
35        if line_numbers {
36            // When line numbers are enabled, we add the line number prefix to each line
37            let lines_with_numbers: String = content
38                .lines()
39                .enumerate()
40                .map(|(i, line)| format!("{:>4} | {}\n", i + 1, line))
41                .collect();
42            token_count += estimate_tokens(&lines_with_numbers);
43        } else {
44            token_count += estimate_tokens(&content);
45        }
46    }
47
48    token_count
49}
50
51/// Counts the tokens that would be generated for the entire file tree section
52pub fn count_tree_tokens(tree: &BTreeMap<String, crate::tree::FileNode>, depth: usize) -> usize {
53    let mut token_count = 0;
54
55    // Add tokens for indentation
56    let indent = "  ".repeat(depth);
57
58    for (name, node) in tree {
59        match node {
60            crate::tree::FileNode::File => {
61                token_count += estimate_tokens(&format!("{}- πŸ“„ {}\n", indent, name));
62            }
63            crate::tree::FileNode::Directory(children) => {
64                token_count += estimate_tokens(&format!("{}- πŸ“ {}\n", indent, name));
65                token_count += count_tree_tokens(children, depth + 1);
66            }
67        }
68    }
69
70    token_count
71}
72
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::BTreeMap;

    #[test]
    fn test_estimate_tokens() {
        // "Hello, world!" encodes to exactly 4 tokens under cl100k_base.
        assert_eq!(estimate_tokens("Hello, world!"), 4);

        // Code-like content: this snippet is 12 tokens under cl100k_base.
        let code_text = "fn main() {\n    println!(\"Hello, world!\");\n}";
        assert_eq!(estimate_tokens(code_text), 12);
    }

    #[test]
    fn test_count_tree_tokens() {
        // Small tree: file1.rs at the root plus src/file2.md.
        let mut subdir = BTreeMap::new();
        subdir.insert("file2.md".to_string(), crate::tree::FileNode::File);

        let mut tree = BTreeMap::new();
        tree.insert("file1.rs".to_string(), crate::tree::FileNode::File);
        tree.insert("src".to_string(), crate::tree::FileNode::Directory(subdir));

        // "- πŸ“„ file1.rs\n" -> 8 tokens
        // "- πŸ“ src\n" -> 6 tokens
        // "  - πŸ“„ file2.md\n" -> 9 tokens
        assert_eq!(count_tree_tokens(&tree, 0), 23);
    }

    #[test]
    fn test_token_estimation_format_consistency() {
        use tempfile::tempdir;

        let dir = tempdir().unwrap();
        let test_file = dir.path().join("test.rs");
        std::fs::write(&test_file, "fn main() {}\n").unwrap();

        let entry = ignore::WalkBuilder::new(&test_file)
            .build()
            .next()
            .unwrap()
            .unwrap();

        let estimated_tokens = count_file_tokens(dir.path(), &entry, false);

        // Render the real markdown output for the same file.
        let mut rendered = Vec::new();
        crate::markdown::process_file(dir.path(), &test_file, &mut rendered, false, None)
            .unwrap();
        let rendered_str = String::from_utf8(rendered).unwrap();
        let actual_tokens = estimate_tokens(&rendered_str);

        // Estimation should land near the real count; allow slack for the
        // timestamp placeholder and minor formatting differences: within 10%
        // or 20 tokens, whichever is larger.
        let difference = actual_tokens.abs_diff(estimated_tokens);
        let max_allowed_difference = std::cmp::max(actual_tokens / 10, 20);

        assert!(
            difference <= max_allowed_difference,
            "Token estimation {} differs too much from actual {} (difference: {})",
            estimated_tokens,
            actual_tokens,
            difference
        );
    }

    #[test]
    fn test_estimate_tokens_empty_string() {
        assert_eq!(estimate_tokens(""), 0);
    }

    #[test]
    fn test_estimate_tokens_whitespace_only() {
        // Whitespace still produces at least one token.
        assert!(estimate_tokens("   \n\t  ") > 0);
    }

    #[test]
    fn test_estimate_tokens_unicode() {
        let tokens = estimate_tokens("Hello δΈ–η•Œ! 🌍");
        assert!(tokens > 0);
        // Multi-byte characters may expand to several tokens each.
        assert!(tokens >= 4);
    }

    #[test]
    fn test_count_file_tokens_with_line_numbers() {
        use tempfile::tempdir;

        let dir = tempdir().unwrap();
        let test_file = dir.path().join("test.rs");
        std::fs::write(&test_file, "line 1\nline 2\nline 3").unwrap();

        let entry = ignore::WalkBuilder::new(&test_file)
            .build()
            .next()
            .unwrap()
            .unwrap();

        let without_numbers = count_file_tokens(dir.path(), &entry, false);
        let with_numbers = count_file_tokens(dir.path(), &entry, true);

        // Line-number prefixes can only add tokens.
        assert!(with_numbers > without_numbers);
    }

    #[test]
    fn test_count_file_tokens_unreadable_file() {
        use tempfile::tempdir;

        let dir = tempdir().unwrap();
        let test_file = dir.path().join("nonexistent.txt");

        // Simulate a file deleted between discovery and processing: create it
        // long enough to capture a DirEntry, then remove it before counting.
        std::fs::write(&test_file, "temp").unwrap();

        let mut found_entry = None;
        for entry in ignore::WalkBuilder::new(dir.path()).build() {
            if let Ok(entry) = entry {
                if entry.path() == test_file {
                    found_entry = Some(entry);
                    break;
                }
            }
        }

        std::fs::remove_file(&test_file).unwrap();

        if let Some(entry) = found_entry {
            // Header tokens are still counted even when the content is gone.
            assert!(count_file_tokens(dir.path(), &entry, false) > 0);
        }
    }

    #[test]
    fn test_count_tree_tokens_empty_tree() {
        assert_eq!(count_tree_tokens(&BTreeMap::new(), 0), 0);
    }

    #[test]
    fn test_count_tree_tokens_nested_directories() {
        // Deeply nested structure: level1/level2/level3/deep_file.txt.
        let mut level3 = BTreeMap::new();
        level3.insert("deep_file.txt".to_string(), crate::tree::FileNode::File);

        let mut level2 = BTreeMap::new();
        level2.insert(
            "level3".to_string(),
            crate::tree::FileNode::Directory(level3),
        );

        let mut level1 = BTreeMap::new();
        level1.insert(
            "level2".to_string(),
            crate::tree::FileNode::Directory(level2),
        );

        let mut tree = BTreeMap::new();
        tree.insert(
            "level1".to_string(),
            crate::tree::FileNode::Directory(level1),
        );

        let tokens = count_tree_tokens(&tree, 0);
        assert!(tokens > 0);

        // Starting deeper adds indentation, which adds tokens.
        let tokens_with_depth = count_tree_tokens(&tree, 2);
        assert!(tokens_with_depth > tokens);
    }

    #[test]
    fn test_count_tree_tokens_mixed_content() {
        let mut tree = BTreeMap::new();

        // Files with various name lengths and character sets.
        tree.insert("a.txt".to_string(), crate::tree::FileNode::File);
        tree.insert(
            "very_long_filename_with_underscores.rs".to_string(),
            crate::tree::FileNode::File,
        );
        // Unicode filename — must be handled without panicking.
        tree.insert("Ρ„Π°ΠΉΠ».txt".to_string(), crate::tree::FileNode::File);

        let mut subdir = BTreeMap::new();
        subdir.insert("nested.md".to_string(), crate::tree::FileNode::File);
        tree.insert(
            "directory".to_string(),
            crate::tree::FileNode::Directory(subdir),
        );

        let tokens = count_tree_tokens(&tree, 0);
        assert!(tokens > 0);

        // Should be substantial given the content above.
        assert!(tokens > 20);
    }
}
292}