context_builder/
token_count.rs1use ignore::DirEntry;
2use once_cell::sync::Lazy;
3use std::collections::BTreeMap;
4use std::fs;
5use std::path::Path;
6use tiktoken_rs::{CoreBPE, cl100k_base};
8
9static TOKENIZER: Lazy<CoreBPE> = Lazy::new(|| cl100k_base().unwrap());
11
12pub fn estimate_tokens(text: &str) -> usize {
14 TOKENIZER.encode_with_special_tokens(text).len()
15}
16
17pub fn count_file_tokens(base_path: &Path, entry: &DirEntry, line_numbers: bool) -> usize {
19 let file_path = entry.path();
20 let relative_path = file_path.strip_prefix(base_path).unwrap_or(file_path);
21
22 let mut token_count = estimate_tokens(&format!(
24 "\n### File: `{}`\n\n- Size: {} bytes\n- Modified: {}\n\n",
25 relative_path.display(),
26 entry.metadata().map(|m| m.len()).unwrap_or(0),
27 "Unknown"
28 )); token_count += estimate_tokens("```\n```");
32
33 if let Ok(content) = fs::read_to_string(file_path) {
35 if line_numbers {
36 let lines_with_numbers: String = content
38 .lines()
39 .enumerate()
40 .map(|(i, line)| format!("{:>4} | {}\n", i + 1, line))
41 .collect();
42 token_count += estimate_tokens(&lines_with_numbers);
43 } else {
44 token_count += estimate_tokens(&content);
45 }
46 }
47
48 token_count
49}
50
51pub fn count_tree_tokens(tree: &BTreeMap<String, crate::tree::FileNode>, depth: usize) -> usize {
53 let mut token_count = 0;
54
55 let indent = " ".repeat(depth);
57
58 for (name, node) in tree {
59 match node {
60 crate::tree::FileNode::File => {
61 token_count += estimate_tokens(&format!("{}- π {}\n", indent, name));
62 }
63 crate::tree::FileNode::Directory(children) => {
64 token_count += estimate_tokens(&format!("{}- π {}\n", indent, name));
65 token_count += count_tree_tokens(children, depth + 1);
66 }
67 }
68 }
69
70 token_count
71}
72
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::BTreeMap;

    /// Pins the exact token counts of two short, fixed inputs.
    #[test]
    fn test_estimate_tokens() {
        let text = "Hello, world!";
        let tokens = estimate_tokens(text);
        assert_eq!(tokens, 4);

        // NOTE(review): the exact count depends on the whitespace inside this literal.
        let code_text = "fn main() {\n println!(\"Hello, world!\");\n}";
        let tokens = estimate_tokens(code_text);
        assert_eq!(tokens, 12);
    }

    /// Pins the exact token count for a small tree with one file and one directory.
    #[test]
    fn test_count_tree_tokens() {
        let mut tree = BTreeMap::new();
        tree.insert("file1.rs".to_string(), crate::tree::FileNode::File);

        let mut subdir = BTreeMap::new();
        subdir.insert("file2.md".to_string(), crate::tree::FileNode::File);
        tree.insert("src".to_string(), crate::tree::FileNode::Directory(subdir));

        let tokens = count_tree_tokens(&tree, 0);
        assert_eq!(tokens, 23);
    }

    /// Checks that the estimate for a file stays close to the token count of the
    /// markdown actually produced for it.
    #[test]
    fn test_token_estimation_format_consistency() {
        use tempfile::tempdir;

        let dir = tempdir().unwrap();
        let test_file = dir.path().join("test.rs");
        std::fs::write(&test_file, "fn main() {}\n").unwrap();

        let entry = ignore::WalkBuilder::new(&test_file)
            .build()
            .next()
            .unwrap()
            .unwrap();

        let estimated_tokens = count_file_tokens(dir.path(), &entry, false);

        let mut actual_content = Vec::new();
        crate::markdown::process_file(dir.path(), &test_file, &mut actual_content, false, None)
            .unwrap();
        let actual_content_str = String::from_utf8(actual_content).unwrap();

        let actual_tokens = estimate_tokens(&actual_content_str);

        let difference = actual_tokens.abs_diff(estimated_tokens);

        // Tolerance: 10% of the actual count, but never less than 20 tokens.
        let max_allowed_difference = std::cmp::max(actual_tokens / 10, 20);

        assert!(
            difference <= max_allowed_difference,
            "Token estimation {} differs too much from actual {} (difference: {})",
            estimated_tokens,
            actual_tokens,
            difference
        );
    }

    /// An empty string encodes to zero tokens.
    #[test]
    fn test_estimate_tokens_empty_string() {
        let tokens = estimate_tokens("");
        assert_eq!(tokens, 0);
    }

    /// Whitespace-only input still produces tokens.
    #[test]
    fn test_estimate_tokens_whitespace_only() {
        let tokens = estimate_tokens(" \n\t ");
        assert!(tokens > 0);
    }

    /// Non-ASCII input is tokenized and yields at least four tokens.
    #[test]
    fn test_estimate_tokens_unicode() {
        let tokens = estimate_tokens("Hello δΈη! π");
        assert!(tokens >= 4);
    }

    /// Adding line numbers must increase the estimate for the same file.
    #[test]
    fn test_count_file_tokens_with_line_numbers() {
        use tempfile::tempdir;

        let dir = tempdir().unwrap();
        let test_file = dir.path().join("test.rs");
        std::fs::write(&test_file, "line 1\nline 2\nline 3").unwrap();

        let entry = ignore::WalkBuilder::new(&test_file)
            .build()
            .next()
            .unwrap()
            .unwrap();

        let tokens_without_line_numbers = count_file_tokens(dir.path(), &entry, false);
        let tokens_with_line_numbers = count_file_tokens(dir.path(), &entry, true);

        assert!(tokens_with_line_numbers > tokens_without_line_numbers);
    }

    /// A file that disappears after being discovered still yields the header and
    /// fence tokens instead of failing.
    #[test]
    fn test_count_file_tokens_unreadable_file() {
        use tempfile::tempdir;

        let dir = tempdir().unwrap();
        let test_file = dir.path().join("nonexistent.txt");

        // Create the file so the walker can produce an entry for it, the same way the
        // other tests in this module obtain their entries. Unwrapping here (rather
        // than skipping when nothing is found) guarantees the assertion below runs.
        std::fs::write(&test_file, "temp").unwrap();
        let entry = ignore::WalkBuilder::new(&test_file)
            .build()
            .next()
            .expect("walker should yield the temporary file")
            .expect("walking the temporary file should not fail");

        // Remove the file so reading it fails when the tokens are counted.
        std::fs::remove_file(&test_file).unwrap();

        let tokens = count_file_tokens(dir.path(), &entry, false);
        assert!(tokens > 0);
    }

    /// An empty tree renders nothing and therefore costs zero tokens.
    #[test]
    fn test_count_tree_tokens_empty_tree() {
        let tree = BTreeMap::new();
        let tokens = count_tree_tokens(&tree, 0);
        assert_eq!(tokens, 0);
    }

    /// Deeply nested directories are counted, and a larger starting depth costs more
    /// tokens because of the extra indentation.
    #[test]
    fn test_count_tree_tokens_nested_directories() {
        let mut tree = BTreeMap::new();

        let mut level3 = BTreeMap::new();
        level3.insert("deep_file.txt".to_string(), crate::tree::FileNode::File);

        let mut level2 = BTreeMap::new();
        level2.insert(
            "level3".to_string(),
            crate::tree::FileNode::Directory(level3),
        );

        let mut level1 = BTreeMap::new();
        level1.insert(
            "level2".to_string(),
            crate::tree::FileNode::Directory(level2),
        );

        tree.insert(
            "level1".to_string(),
            crate::tree::FileNode::Directory(level1),
        );

        let tokens = count_tree_tokens(&tree, 0);
        assert!(tokens > 0);

        let tokens_with_depth = count_tree_tokens(&tree, 2);
        assert!(tokens_with_depth > tokens);
    }

    /// A tree mixing short, long and non-ASCII names plus a directory is counted.
    #[test]
    fn test_count_tree_tokens_mixed_content() {
        let mut tree = BTreeMap::new();

        tree.insert("a.txt".to_string(), crate::tree::FileNode::File);
        tree.insert(
            "very_long_filename_with_underscores.rs".to_string(),
            crate::tree::FileNode::File,
        );
        tree.insert("ΡΠ°ΠΉΠ».txt".to_string(), crate::tree::FileNode::File);

        let mut subdir = BTreeMap::new();
        subdir.insert("nested.md".to_string(), crate::tree::FileNode::File);
        tree.insert(
            "directory".to_string(),
            crate::tree::FileNode::Directory(subdir),
        );

        let tokens = count_tree_tokens(&tree, 0);
        assert!(tokens > 0);

        assert!(tokens > 20);
    }
}