context_builder/
token_count.rs1use ignore::DirEntry;
2use once_cell::sync::Lazy;
3use std::collections::BTreeMap;
4use std::fs;
5use std::path::Path;
6use tiktoken_rs::{CoreBPE, cl100k_base};
8
9static TOKENIZER: Lazy<CoreBPE> = Lazy::new(|| cl100k_base().unwrap());
11
12pub fn estimate_tokens(text: &str) -> usize {
14 TOKENIZER.encode_with_special_tokens(text).len()
15}
16
17pub fn count_file_tokens(base_path: &Path, entry: &DirEntry, line_numbers: bool) -> usize {
19 let file_path = entry.path();
20 let relative_path = file_path.strip_prefix(base_path).unwrap_or(file_path);
21
22 let mut token_count = estimate_tokens(&format!(
24 "\n### File: `{}`\n\n- Size: {} bytes\n- Modified: {}\n\n",
25 relative_path.display(),
26 entry.metadata().map(|m| m.len()).unwrap_or(0),
27 "Unknown"
28 )); token_count += estimate_tokens("```\n```");
32
33 if let Ok(content) = fs::read_to_string(file_path) {
35 if line_numbers {
36 let lines_with_numbers: String = content
38 .lines()
39 .enumerate()
40 .map(|(i, line)| format!("{:>4} | {}\n", i + 1, line))
41 .collect();
42 token_count += estimate_tokens(&lines_with_numbers);
43 } else {
44 token_count += estimate_tokens(&content);
45 }
46 }
47
48 token_count
49}
50
51pub fn count_tree_tokens(tree: &BTreeMap<String, crate::tree::FileNode>, depth: usize) -> usize {
53 let mut token_count = 0;
54
55 let indent = " ".repeat(depth);
57
58 for (name, node) in tree {
59 match node {
60 crate::tree::FileNode::File => {
61 token_count += estimate_tokens(&format!("{}- π {}\n", indent, name));
62 }
63 crate::tree::FileNode::Directory(children) => {
64 token_count += estimate_tokens(&format!("{}- π {}\n", indent, name));
65 token_count += count_tree_tokens(children, depth + 1);
66 }
67 }
68 }
69
70 token_count
71}
72
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::BTreeMap;

    /// Returns the first entry a walk rooted at `path` yields, unwrapping both
    /// the "no entries" and the "walk error" cases.
    fn root_entry(path: &std::path::Path) -> ignore::DirEntry {
        ignore::WalkBuilder::new(path)
            .build()
            .next()
            .expect("walker yielded no entries")
            .expect("walker failed on its first entry")
    }

    #[test]
    fn test_estimate_tokens() {
        // Exact counts pin the tokenizer's output for two short samples.
        let text = "Hello, world!";
        let tokens = estimate_tokens(text);
        assert_eq!(tokens, 4);

        let code_text = "fn main() {\n println!(\"Hello, world!\");\n}";
        let tokens = estimate_tokens(code_text);
        assert_eq!(tokens, 12);
    }

    #[test]
    fn test_count_tree_tokens() {
        // One top-level file plus one directory holding a single file.
        let mut tree = BTreeMap::new();
        tree.insert("file1.rs".to_string(), crate::tree::FileNode::File);

        let mut subdir = BTreeMap::new();
        subdir.insert("file2.md".to_string(), crate::tree::FileNode::File);
        tree.insert("src".to_string(), crate::tree::FileNode::Directory(subdir));

        let tokens = count_tree_tokens(&tree, 0);
        assert_eq!(tokens, 23);
    }

    #[test]
    fn test_token_estimation_format_consistency() {
        use tempfile::tempdir;

        let dir = tempdir().unwrap();
        let test_file = dir.path().join("test.rs");
        std::fs::write(&test_file, "fn main() {}\n").unwrap();

        let entry = root_entry(&test_file);
        let estimated_tokens = count_file_tokens(dir.path(), &entry, false);

        // Render the same file through the markdown writer and tokenize what
        // it actually produced.
        let mut actual_content = Vec::new();
        crate::markdown::process_file(
            dir.path(),
            &test_file,
            &mut actual_content,
            false,
            None,
            &crate::markdown::TreeSitterConfig::default(),
        )
        .unwrap();
        let actual_content_str = String::from_utf8(actual_content).unwrap();
        let actual_tokens = estimate_tokens(&actual_content_str);

        // The estimate may drift from the rendered output by 10% of the actual
        // count, or by 20 tokens, whichever is larger.
        let difference = actual_tokens.abs_diff(estimated_tokens);
        let max_allowed_difference = std::cmp::max(actual_tokens / 10, 20);

        assert!(
            difference <= max_allowed_difference,
            "Token estimation {} differs too much from actual {} (difference: {})",
            estimated_tokens,
            actual_tokens,
            difference
        );
    }

    #[test]
    fn test_estimate_tokens_empty_string() {
        let tokens = estimate_tokens("");
        assert_eq!(tokens, 0);
    }

    #[test]
    fn test_estimate_tokens_whitespace_only() {
        // Whitespace still encodes to at least one token.
        let tokens = estimate_tokens(" \n\t ");
        assert!(tokens > 0);
    }

    #[test]
    fn test_estimate_tokens_unicode() {
        // NOTE(review): this literal looks like mis-decoded UTF-8 (mojibake);
        // confirm the intended text.
        let tokens = estimate_tokens("Hello δΈη! π");
        assert!(tokens >= 4);
    }

    #[test]
    fn test_count_file_tokens_with_line_numbers() {
        use tempfile::tempdir;

        let dir = tempdir().unwrap();
        let test_file = dir.path().join("test.rs");
        std::fs::write(&test_file, "line 1\nline 2\nline 3").unwrap();

        let entry = root_entry(&test_file);

        let tokens_without_line_numbers = count_file_tokens(dir.path(), &entry, false);
        let tokens_with_line_numbers = count_file_tokens(dir.path(), &entry, true);

        // The numbered prefixes add text, so they must add tokens.
        assert!(tokens_with_line_numbers > tokens_without_line_numbers);
    }

    #[test]
    fn test_count_file_tokens_unreadable_file() {
        use tempfile::tempdir;

        let dir = tempdir().unwrap();
        let test_file = dir.path().join("nonexistent.txt");

        // Capture a walker entry while the file exists, then delete the file so
        // the entry points at something that can no longer be read. Obtaining
        // the entry unconditionally keeps the assertion below from being
        // skipped silently.
        std::fs::write(&test_file, "temp").unwrap();
        let entry = root_entry(&test_file);
        std::fs::remove_file(&test_file).unwrap();

        // The header and fence are still counted even though no contents are.
        let tokens = count_file_tokens(dir.path(), &entry, false);
        assert!(tokens > 0);
    }

    #[test]
    fn test_count_tree_tokens_empty_tree() {
        let tree = BTreeMap::new();
        let tokens = count_tree_tokens(&tree, 0);
        assert_eq!(tokens, 0);
    }

    #[test]
    fn test_count_tree_tokens_nested_directories() {
        // level1/level2/level3/deep_file.txt, built from the innermost map out.
        let mut level3 = BTreeMap::new();
        level3.insert("deep_file.txt".to_string(), crate::tree::FileNode::File);

        let mut level2 = BTreeMap::new();
        level2.insert(
            "level3".to_string(),
            crate::tree::FileNode::Directory(level3),
        );

        let mut level1 = BTreeMap::new();
        level1.insert(
            "level2".to_string(),
            crate::tree::FileNode::Directory(level2),
        );

        let mut tree = BTreeMap::new();
        tree.insert(
            "level1".to_string(),
            crate::tree::FileNode::Directory(level1),
        );

        let tokens = count_tree_tokens(&tree, 0);
        assert!(tokens > 0);

        // Starting deeper means more indentation on every line, hence more
        // tokens for the same tree.
        let tokens_with_depth = count_tree_tokens(&tree, 2);
        assert!(tokens_with_depth > tokens);
    }

    #[test]
    fn test_count_tree_tokens_mixed_content() {
        let mut tree = BTreeMap::new();

        tree.insert("a.txt".to_string(), crate::tree::FileNode::File);
        tree.insert(
            "very_long_filename_with_underscores.rs".to_string(),
            crate::tree::FileNode::File,
        );
        // NOTE(review): this non-ASCII name looks like mis-decoded UTF-8
        // (mojibake); confirm the intended file name.
        tree.insert("ΡΠ°ΠΉΠ».txt".to_string(), crate::tree::FileNode::File);

        let mut subdir = BTreeMap::new();
        subdir.insert("nested.md".to_string(), crate::tree::FileNode::File);
        tree.insert(
            "directory".to_string(),
            crate::tree::FileNode::Directory(subdir),
        );

        let tokens = count_tree_tokens(&tree, 0);
        assert!(tokens > 20);
    }
}