// context_builder/token_count.rs

use std::collections::BTreeMap;
use std::fmt::Write;
use std::fs;
use std::path::Path;

use ignore::DirEntry;
use once_cell::sync::Lazy;
/// Token counting utilities for estimating LLM token usage
use tiktoken_rs::{CoreBPE, cl100k_base};
8
// Initialize the tokenizer once and reuse it.
// NOTE(review): the unwrap panics at first use if cl100k_base() fails to load;
// presumably the BPE data ships with the crate so this is treated as a
// can't-happen build/data bug rather than a recoverable error — confirm.
static TOKENIZER: Lazy<CoreBPE> = Lazy::new(|| cl100k_base().unwrap());
11
12/// Estimates the number of tokens in a text string using a real tokenizer
13pub fn estimate_tokens(text: &str) -> usize {
14    TOKENIZER.encode_with_special_tokens(text).len()
15}
16
17/// Counts the tokens that would be generated for a file
18pub fn count_file_tokens(base_path: &Path, entry: &DirEntry, line_numbers: bool) -> usize {
19    let file_path = entry.path();
20    let relative_path = file_path.strip_prefix(base_path).unwrap_or(file_path);
21
22    // Start with tokens for the file header (path, size, modified time)
23    let mut token_count = estimate_tokens(&format!(
24        "\n## File: `{}`\n\n- Size: {} bytes\n- Modified: {}\n\n",
25        relative_path.display(),
26        entry.metadata().map(|m| m.len()).unwrap_or(0),
27        "Unknown"
28    )); // Using "Unknown" as placeholder for modified time in estimation
29
30    // Add tokens for the code fences
31    token_count += estimate_tokens("```\n```");
32
33    // Try to read file content
34    if let Ok(content) = fs::read_to_string(file_path) {
35        if line_numbers {
36            // When line numbers are enabled, we add the line number prefix to each line
37            let lines_with_numbers: String = content
38                .lines()
39                .enumerate()
40                .map(|(i, line)| format!("{:>4} | {}\n", i + 1, line))
41                .collect();
42            token_count += estimate_tokens(&lines_with_numbers);
43        } else {
44            token_count += estimate_tokens(&content);
45        }
46    }
47
48    token_count
49}
50
51/// Counts the tokens that would be generated for the entire file tree section
52pub fn count_tree_tokens(tree: &BTreeMap<String, crate::tree::FileNode>, depth: usize) -> usize {
53    let mut token_count = 0;
54
55    // Add tokens for indentation
56    let indent = "  ".repeat(depth);
57
58    for (name, node) in tree {
59        match node {
60            crate::tree::FileNode::File => {
61                token_count += estimate_tokens(&format!("{}- 📄 {}\n", indent, name));
62            }
63            crate::tree::FileNode::Directory(children) => {
64                token_count += estimate_tokens(&format!("{}- 📁 {}\n", indent, name));
65                token_count += count_tree_tokens(children, depth + 1);
66            }
67        }
68    }
69
70    token_count
71}
72
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::BTreeMap;

    // NOTE: the expected values below are pinned to the cl100k_base
    // vocabulary; if the tokenizer is ever swapped, these constants must be
    // re-derived.
    #[test]
    fn test_estimate_tokens() {
        // Test with a simple string
        let text = "Hello, world!";
        let tokens = estimate_tokens(text);
        // "Hello, world!" is 4 tokens with cl100k_base
        assert_eq!(tokens, 4);

        // Test with code-like content
        let code_text = "fn main() {\n    println!(\"Hello, world!\");\n}";
        let tokens = estimate_tokens(code_text);
        // This specific code snippet is 12 tokens with cl100k_base
        assert_eq!(tokens, 12);
    }

    #[test]
    fn test_count_tree_tokens() {
        // Create a simple tree structure: one file at the root plus a
        // directory containing a single file, to exercise both match arms
        // and the recursive depth+1 call.
        let mut tree = BTreeMap::new();
        tree.insert("file1.rs".to_string(), crate::tree::FileNode::File);

        let mut subdir = BTreeMap::new();
        subdir.insert("file2.md".to_string(), crate::tree::FileNode::File);
        tree.insert("src".to_string(), crate::tree::FileNode::Directory(subdir));

        let tokens = count_tree_tokens(&tree, 0);
        // "- 📄 file1.rs\n" -> 8 tokens
        // "- 📁 src\n" -> 6 tokens
        // "  - 📄 file2.md\n" -> 9 tokens (note the depth-1 indent)
        // Total should be 23 tokens
        assert_eq!(tokens, 23);
    }
}