infiniloom_engine/scanner/
process.rs1use std::path::Path;
7
8use crate::parser;
9use crate::tokenizer::{TokenCounts, Tokenizer};
10use crate::types::{RepoFile, Symbol};
11
12use super::io::smart_read_file_with_options;
13use super::{FileInfo, ScannerConfig};
14
15thread_local! {
17 static THREAD_TOKENIZER: Tokenizer = Tokenizer::new();
18}
19
20pub fn parse_with_thread_local(content: &str, path: &Path) -> Vec<Symbol> {
31 parser::parse_file_symbols(content, path)
32}
33
34pub fn count_tokens(content: &str, size_bytes: u64, accurate: bool) -> TokenCounts {
39 if accurate {
40 count_tokens_accurate(content)
41 } else {
42 estimate_tokens(size_bytes, Some(content))
43 }
44}
45
46pub fn count_tokens_accurate(content: &str) -> TokenCounts {
51 THREAD_TOKENIZER.with(|tokenizer| tokenizer.count_all(content))
52}
53
54pub fn estimate_tokens(size_bytes: u64, content: Option<&str>) -> TokenCounts {
59 let len = content.map(|c| c.len() as f32).unwrap_or(size_bytes as f32);
61
62 TokenCounts {
63 o200k: (len / 4.0) as u32, cl100k: (len / 3.7) as u32, claude: (len / 3.5) as u32,
66 gemini: (len / 3.8) as u32,
67 llama: (len / 3.5) as u32,
68 mistral: (len / 3.5) as u32,
69 deepseek: (len / 3.5) as u32,
70 qwen: (len / 3.5) as u32,
71 cohere: (len / 3.6) as u32,
72 grok: (len / 3.5) as u32,
73 }
74}
75
/// Estimates the number of lines in a file from its size in bytes.
///
/// Uses integer division by a fixed 40 bytes per line, so any file smaller
/// than 40 bytes is estimated at zero lines.
pub fn estimate_lines(size_bytes: u64) -> u64 {
    /// Assumed average length of one line, in bytes.
    const ASSUMED_BYTES_PER_LINE: u64 = 40;

    size_bytes / ASSUMED_BYTES_PER_LINE
}
82
83pub fn process_file_content_only(info: FileInfo, config: &ScannerConfig) -> Option<RepoFile> {
87 let size_bytes = info.size_bytes.unwrap_or(0);
88 let content = smart_read_file_with_options(&info.path, size_bytes, config.use_mmap)?;
89 let token_count = count_tokens(&content, size_bytes, config.accurate_tokens);
90
91 Some(RepoFile {
92 path: info.path,
93 relative_path: info.relative_path,
94 language: info.language,
95 size_bytes,
96 token_count,
97 symbols: Vec::new(),
98 importance: 0.5,
99 content: Some(content),
100 })
101}
102
103pub fn process_file_with_content(info: FileInfo, config: &ScannerConfig) -> Option<RepoFile> {
108 let size_bytes = info.size_bytes.unwrap_or(0);
109 let content = smart_read_file_with_options(&info.path, size_bytes, config.use_mmap)?;
110 let token_count = count_tokens(&content, size_bytes, config.accurate_tokens);
111 let symbols = parse_with_thread_local(&content, &info.path);
112
113 Some(RepoFile {
114 path: info.path,
115 relative_path: info.relative_path,
116 language: info.language,
117 size_bytes,
118 token_count,
119 symbols,
120 importance: 0.5,
121 content: Some(content),
122 })
123}
124
125pub fn process_file_without_content(info: FileInfo, config: &ScannerConfig) -> RepoFile {
129 let size_bytes = info.size_bytes.unwrap_or(0);
130 let token_count = if config.accurate_tokens {
131 estimate_tokens(size_bytes, None)
133 } else {
134 estimate_tokens(size_bytes, None)
135 };
136
137 RepoFile {
138 path: info.path,
139 relative_path: info.relative_path,
140 language: info.language,
141 size_bytes,
142 token_count,
143 symbols: Vec::new(),
144 importance: 0.5,
145 content: None,
146 }
147}
148
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::PathBuf;
    use tempfile::tempdir;

    /// Tiny Rust program shared by the parsing and processing tests.
    const RUST_SOURCE: &str = "fn main() {}";

    /// Builds the `FileInfo` for a Rust file named `test.rs` located at `path`.
    fn rust_file_info(path: PathBuf, size_bytes: u64) -> FileInfo {
        FileInfo {
            path,
            relative_path: String::from("test.rs"),
            size_bytes: Some(size_bytes),
            language: Some(String::from("rust")),
        }
    }

    /// Writes `test.rs` containing [`RUST_SOURCE`] into `dir` and returns its metadata.
    fn write_rust_fixture(dir: &std::path::Path) -> FileInfo {
        let file_path = dir.join("test.rs");
        fs::write(&file_path, RUST_SOURCE).unwrap();
        rust_file_info(file_path, 12)
    }

    #[test]
    fn test_estimate_tokens_from_content() {
        let estimate = estimate_tokens(0, Some("Hello, World!"));

        // 13 bytes at 4.0 bytes per token truncates to 3.
        assert_eq!(estimate.o200k, 3);
    }

    #[test]
    fn test_estimate_tokens_from_size() {
        let estimate = estimate_tokens(1000, None);

        // 1000 bytes at 4.0 bytes per token.
        assert_eq!(estimate.o200k, 250);
    }

    #[test]
    fn test_estimate_lines() {
        assert_eq!(estimate_lines(400), 10);
        assert_eq!(estimate_lines(0), 0);
    }

    #[test]
    fn test_count_tokens_configurable() {
        let size = RUST_SOURCE.len() as u64;

        // Estimation path.
        let fast = count_tokens(RUST_SOURCE, size, false);
        // Tokenizer-backed path.
        let accurate = count_tokens(RUST_SOURCE, size, true);

        // Both strategies must report at least one token.
        assert!(fast.o200k > 0);
        assert!(accurate.o200k > 0);
    }

    #[test]
    fn test_process_file_content_only() {
        // `dir` must outlive the processing call so the file still exists.
        let dir = tempdir().unwrap();
        let info = write_rust_fixture(dir.path());

        let result = process_file_content_only(info, &ScannerConfig::default());

        assert!(result.is_some());
        let repo_file = result.unwrap();
        assert!(repo_file.content.is_some());
        assert!(repo_file.symbols.is_empty());
    }

    #[test]
    fn test_process_file_with_content() {
        // `dir` must outlive the processing call so the file still exists.
        let dir = tempdir().unwrap();
        let info = write_rust_fixture(dir.path());

        let result = process_file_with_content(info, &ScannerConfig::default());

        assert!(result.is_some());
        let repo_file = result.unwrap();
        assert!(repo_file.content.is_some());
        // Symbol extraction should find at least `main`'s definition.
        assert!(!repo_file.symbols.is_empty());
    }

    #[test]
    fn test_process_file_without_content() {
        // The path is never opened, so it does not need to exist.
        let info = rust_file_info(PathBuf::from("/path/to/test.rs"), 1000);

        let repo_file = process_file_without_content(info, &ScannerConfig::default());

        assert!(repo_file.content.is_none());
        assert!(repo_file.symbols.is_empty());
        assert_eq!(repo_file.size_bytes, 1000);
    }

    #[test]
    fn test_parse_with_thread_local_rust() {
        let path = PathBuf::from("test.rs");

        let symbols = parse_with_thread_local(RUST_SOURCE, &path);

        assert!(!symbols.is_empty());
    }

    #[test]
    fn test_parse_with_thread_local_unknown_extension() {
        let path = PathBuf::from("test.unknown");

        let symbols = parse_with_thread_local("some content", &path);

        // An unrecognised extension yields no symbols.
        assert!(symbols.is_empty());
    }
}