infiniloom_engine/scanner/
process.rs1use std::cell::RefCell;
7use std::path::Path;
8
9use crate::parser::{Language, Parser};
10use crate::tokenizer::{TokenCounts, Tokenizer};
11use crate::types::{RepoFile, Symbol};
12
13use super::io::smart_read_file_with_options;
14use super::{FileInfo, ScannerConfig};
15
16thread_local! {
18 static THREAD_PARSER: RefCell<Parser> = RefCell::new(Parser::new());
19 static THREAD_TOKENIZER: Tokenizer = Tokenizer::new();
20}
21
22pub fn parse_with_thread_local(content: &str, path: &Path) -> Vec<Symbol> {
26 THREAD_PARSER.with(|parser| {
27 let mut parser = parser.borrow_mut();
28 if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
29 if let Some(lang) = Language::from_extension(ext) {
30 parser.parse(content, lang).unwrap_or_default()
31 } else {
32 Vec::new()
33 }
34 } else {
35 Vec::new()
36 }
37 })
38}
39
40pub fn count_tokens(content: &str, size_bytes: u64, accurate: bool) -> TokenCounts {
45 if accurate {
46 count_tokens_accurate(content)
47 } else {
48 estimate_tokens(size_bytes, Some(content))
49 }
50}
51
52pub fn count_tokens_accurate(content: &str) -> TokenCounts {
57 THREAD_TOKENIZER.with(|tokenizer| tokenizer.count_all(content))
58}
59
60pub fn estimate_tokens(size_bytes: u64, content: Option<&str>) -> TokenCounts {
65 let len = content.map(|c| c.len() as f32).unwrap_or(size_bytes as f32);
67
68 TokenCounts {
69 o200k: (len / 4.0) as u32, cl100k: (len / 3.7) as u32, claude: (len / 3.5) as u32,
72 gemini: (len / 3.8) as u32,
73 llama: (len / 3.5) as u32,
74 mistral: (len / 3.5) as u32,
75 deepseek: (len / 3.5) as u32,
76 qwen: (len / 3.5) as u32,
77 cohere: (len / 3.6) as u32,
78 grok: (len / 3.5) as u32,
79 }
80}
81
/// Estimates a line count from a file size.
///
/// Uses integer division by a fixed 40 bytes per line, so the result rounds
/// down and is `0` for sizes below 40.
pub fn estimate_lines(size_bytes: u64) -> u64 {
    const BYTES_PER_LINE: u64 = 40;
    size_bytes / BYTES_PER_LINE
}
88
89pub fn process_file_content_only(info: FileInfo, config: &ScannerConfig) -> Option<RepoFile> {
93 let size_bytes = info.size_bytes.unwrap_or(0);
94 let content = smart_read_file_with_options(&info.path, size_bytes, config.use_mmap)?;
95 let token_count = count_tokens(&content, size_bytes, config.accurate_tokens);
96
97 Some(RepoFile {
98 path: info.path,
99 relative_path: info.relative_path,
100 language: info.language,
101 size_bytes,
102 token_count,
103 symbols: Vec::new(),
104 importance: 0.5,
105 content: Some(content),
106 })
107}
108
109pub fn process_file_with_content(info: FileInfo, config: &ScannerConfig) -> Option<RepoFile> {
114 let size_bytes = info.size_bytes.unwrap_or(0);
115 let content = smart_read_file_with_options(&info.path, size_bytes, config.use_mmap)?;
116 let token_count = count_tokens(&content, size_bytes, config.accurate_tokens);
117 let symbols = parse_with_thread_local(&content, &info.path);
118
119 Some(RepoFile {
120 path: info.path,
121 relative_path: info.relative_path,
122 language: info.language,
123 size_bytes,
124 token_count,
125 symbols,
126 importance: 0.5,
127 content: Some(content),
128 })
129}
130
131pub fn process_file_without_content(info: FileInfo, config: &ScannerConfig) -> RepoFile {
135 let size_bytes = info.size_bytes.unwrap_or(0);
136 let token_count = if config.accurate_tokens {
137 estimate_tokens(size_bytes, None)
139 } else {
140 estimate_tokens(size_bytes, None)
141 };
142
143 RepoFile {
144 path: info.path,
145 relative_path: info.relative_path,
146 language: info.language,
147 size_bytes,
148 token_count,
149 symbols: Vec::new(),
150 importance: 0.5,
151 content: None,
152 }
153}
154
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::PathBuf;
    use tempfile::tempdir;

    /// Minimal Rust program shared by the parsing and file-processing tests.
    const RUST_SNIPPET: &str = "fn main() {}";

    /// Describes a Rust file at `path` whose reported size is `size_bytes`.
    fn rust_file_info(path: PathBuf, size_bytes: u64) -> FileInfo {
        FileInfo {
            path,
            relative_path: "test.rs".to_string(),
            size_bytes: Some(size_bytes),
            language: Some("rust".to_string()),
        }
    }

    /// Writes `RUST_SNIPPET` to `test.rs` inside `dir` and describes that file.
    fn write_rust_fixture(dir: &Path) -> FileInfo {
        let file_path = dir.join("test.rs");
        fs::write(&file_path, RUST_SNIPPET).unwrap();
        rust_file_info(file_path, RUST_SNIPPET.len() as u64)
    }

    #[test]
    fn test_estimate_tokens_from_content() {
        // 13 bytes at 4 bytes per token truncates to 3.
        let estimate = estimate_tokens(0, Some("Hello, World!"));
        assert_eq!(estimate.o200k, 3);
    }

    #[test]
    fn test_estimate_tokens_from_size() {
        // 1000 bytes at 4 bytes per token.
        let estimate = estimate_tokens(1000, None);
        assert_eq!(estimate.o200k, 250);
    }

    #[test]
    fn test_estimate_lines() {
        assert_eq!(estimate_lines(400), 10);
        assert_eq!(estimate_lines(0), 0);
    }

    #[test]
    fn test_count_tokens_configurable() {
        let size = RUST_SNIPPET.len() as u64;

        let fast = count_tokens(RUST_SNIPPET, size, false);
        let accurate = count_tokens(RUST_SNIPPET, size, true);

        // Both strategies must report at least one token for non-empty input.
        assert!(fast.o200k > 0);
        assert!(accurate.o200k > 0);
    }

    #[test]
    fn test_process_file_content_only() {
        // `dir` must outlive the call so the fixture file still exists.
        let dir = tempdir().unwrap();
        let info = write_rust_fixture(dir.path());

        let result = process_file_content_only(info, &ScannerConfig::default());

        assert!(result.is_some());
        let repo_file = result.unwrap();
        assert!(repo_file.content.is_some());
        assert!(repo_file.symbols.is_empty());
    }

    #[test]
    fn test_process_file_with_content() {
        // `dir` must outlive the call so the fixture file still exists.
        let dir = tempdir().unwrap();
        let info = write_rust_fixture(dir.path());

        let result = process_file_with_content(info, &ScannerConfig::default());

        assert!(result.is_some());
        let repo_file = result.unwrap();
        assert!(repo_file.content.is_some());
        assert!(!repo_file.symbols.is_empty());
    }

    #[test]
    fn test_process_file_without_content() {
        // The path is never read, so it does not need to exist.
        let info = rust_file_info(PathBuf::from("/path/to/test.rs"), 1000);

        let repo_file = process_file_without_content(info, &ScannerConfig::default());

        assert!(repo_file.content.is_none());
        assert!(repo_file.symbols.is_empty());
        assert_eq!(repo_file.size_bytes, 1000);
    }

    #[test]
    fn test_parse_with_thread_local_rust() {
        let symbols = parse_with_thread_local(RUST_SNIPPET, &PathBuf::from("test.rs"));

        assert!(!symbols.is_empty());
    }

    #[test]
    fn test_parse_with_thread_local_unknown_extension() {
        let symbols = parse_with_thread_local("some content", &PathBuf::from("test.unknown"));

        assert!(symbols.is_empty());
    }
}