infiniloom_engine/scanner/
process.rs

1//! File processing utilities
2//!
3//! This module provides file processing functions for the scanner,
4//! including token counting, symbol extraction, and file metadata.
5
6use std::cell::RefCell;
7use std::path::Path;
8
9use crate::parser::{Language, Parser};
10use crate::tokenizer::{TokenCounts, Tokenizer};
11use crate::types::{RepoFile, Symbol};
12
13use super::io::smart_read_file_with_options;
14use super::{FileInfo, ScannerConfig};
15
// Thread-local parser for lock-free parallel parsing
thread_local! {
    // One `Parser` per thread; `RefCell` provides interior mutability because
    // `thread_local!` statics are immutable bindings. No mutex is needed since
    // each thread only ever touches its own instance.
    static THREAD_PARSER: RefCell<Parser> = RefCell::new(Parser::new());
    // The tokenizer is only used through `&self` (see `count_tokens_accurate`),
    // so it needs no `RefCell`.
    static THREAD_TOKENIZER: Tokenizer = Tokenizer::new();
}
21
22/// Parse content using thread-local parser (lock-free)
23///
24/// Each thread has its own parser instance, avoiding mutex contention.
25pub fn parse_with_thread_local(content: &str, path: &Path) -> Vec<Symbol> {
26    THREAD_PARSER.with(|parser| {
27        let mut parser = parser.borrow_mut();
28        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
29            if let Some(lang) = Language::from_extension(ext) {
30                parser.parse(content, lang).unwrap_or_default()
31            } else {
32                Vec::new()
33            }
34        } else {
35            Vec::new()
36        }
37    })
38}
39
40/// Count tokens using configurable method
41///
42/// When `accurate` is true, uses tiktoken for exact BPE counts.
43/// When false, uses fast estimation (~80x faster).
44pub fn count_tokens(content: &str, size_bytes: u64, accurate: bool) -> TokenCounts {
45    if accurate {
46        count_tokens_accurate(content)
47    } else {
48        estimate_tokens(size_bytes, Some(content))
49    }
50}
51
52/// Count tokens using thread-local tokenizer (accurate via tiktoken)
53///
54/// Provides exact BPE token counts for OpenAI models.
55/// More accurate but significantly slower than estimation.
56pub fn count_tokens_accurate(content: &str) -> TokenCounts {
57    THREAD_TOKENIZER.with(|tokenizer| tokenizer.count_all(content))
58}
59
60/// Estimate tokens from file size
61///
62/// Uses calibrated character-per-token ratios for each model family.
63/// Fast (~80x faster than tiktoken) with ~95% accuracy.
64pub fn estimate_tokens(size_bytes: u64, content: Option<&str>) -> TokenCounts {
65    // If we have content, use content length for better accuracy
66    let len = content.map(|c| c.len() as f32).unwrap_or(size_bytes as f32);
67
68    TokenCounts {
69        o200k: (len / 4.0) as u32,  // OpenAI modern (GPT-5.x, GPT-4o, O-series)
70        cl100k: (len / 3.7) as u32, // OpenAI legacy (GPT-4, GPT-3.5)
71        claude: (len / 3.5) as u32,
72        gemini: (len / 3.8) as u32,
73        llama: (len / 3.5) as u32,
74        mistral: (len / 3.5) as u32,
75        deepseek: (len / 3.5) as u32,
76        qwen: (len / 3.5) as u32,
77        cohere: (len / 3.6) as u32,
78        grok: (len / 3.5) as u32,
79    }
80}
81
/// Estimate line count from file size.
///
/// Assumes an average source line of ~40 bytes (including the newline);
/// integer division truncates, so sizes under 40 report zero lines.
pub fn estimate_lines(size_bytes: u64) -> u64 {
    const AVG_BYTES_PER_LINE: u64 = 40;
    size_bytes / AVG_BYTES_PER_LINE
}
88
89/// Process a file with content reading only (no parsing - fast path)
90///
91/// Reads file content but skips symbol extraction for speed.
92pub fn process_file_content_only(info: FileInfo, config: &ScannerConfig) -> Option<RepoFile> {
93    let size_bytes = info.size_bytes.unwrap_or(0);
94    let content = smart_read_file_with_options(&info.path, size_bytes, config.use_mmap)?;
95    let token_count = count_tokens(&content, size_bytes, config.accurate_tokens);
96
97    Some(RepoFile {
98        path: info.path,
99        relative_path: info.relative_path,
100        language: info.language,
101        size_bytes,
102        token_count,
103        symbols: Vec::new(),
104        importance: 0.5,
105        content: Some(content),
106    })
107}
108
109/// Process a file with content reading and parsing (used in parallel)
110///
111/// Uses thread-local parser for lock-free parallel parsing.
112/// Uses memory-mapped I/O for files >= 1MB if enabled.
113pub fn process_file_with_content(info: FileInfo, config: &ScannerConfig) -> Option<RepoFile> {
114    let size_bytes = info.size_bytes.unwrap_or(0);
115    let content = smart_read_file_with_options(&info.path, size_bytes, config.use_mmap)?;
116    let token_count = count_tokens(&content, size_bytes, config.accurate_tokens);
117    let symbols = parse_with_thread_local(&content, &info.path);
118
119    Some(RepoFile {
120        path: info.path,
121        relative_path: info.relative_path,
122        language: info.language,
123        size_bytes,
124        token_count,
125        symbols,
126        importance: 0.5,
127        content: Some(content),
128    })
129}
130
131/// Process a file without reading content (fast path)
132///
133/// Only collects metadata, skipping content reading and parsing.
134pub fn process_file_without_content(info: FileInfo, config: &ScannerConfig) -> RepoFile {
135    let size_bytes = info.size_bytes.unwrap_or(0);
136    let token_count = if config.accurate_tokens {
137        // Can't use accurate counting without content
138        estimate_tokens(size_bytes, None)
139    } else {
140        estimate_tokens(size_bytes, None)
141    };
142
143    RepoFile {
144        path: info.path,
145        relative_path: info.relative_path,
146        language: info.language,
147        size_bytes,
148        token_count,
149        symbols: Vec::new(),
150        importance: 0.5,
151        content: None,
152    }
153}
154
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::PathBuf;
    use tempfile::tempdir;

    // Shared builder: a rust-language `FileInfo` used by the process_* tests.
    fn rust_info(path: PathBuf, size: u64) -> FileInfo {
        FileInfo {
            path,
            relative_path: "test.rs".to_string(),
            size_bytes: Some(size),
            language: Some("rust".to_string()),
        }
    }

    #[test]
    fn test_estimate_tokens_from_content() {
        // 13 chars / 4.0 truncates to 3 for the o200k family.
        let counts = estimate_tokens(0, Some("Hello, World!"));
        assert_eq!(counts.o200k, 3);
    }

    #[test]
    fn test_estimate_tokens_from_size() {
        // No content supplied: falls back to size. 1000 / 4.0 = 250.
        let counts = estimate_tokens(1000, None);
        assert_eq!(counts.o200k, 250);
    }

    #[test]
    fn test_estimate_lines() {
        assert_eq!(estimate_lines(400), 10);
        assert_eq!(estimate_lines(0), 0);
    }

    #[test]
    fn test_count_tokens_configurable() {
        let src = "fn main() {}";
        let len = src.len() as u64;

        let estimated = count_tokens(src, len, false); // fast estimation
        let exact = count_tokens(src, len, true); // accurate (tiktoken)

        // Both strategies should report something nonzero for real code.
        assert!(estimated.o200k > 0);
        assert!(exact.o200k > 0);
    }

    #[test]
    fn test_process_file_content_only() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.rs");
        fs::write(&path, "fn main() {}").unwrap();

        let repo_file = process_file_content_only(rust_info(path, 12), &ScannerConfig::default())
            .expect("readable file should produce a RepoFile");

        assert!(repo_file.content.is_some());
        // Content-only path never parses, so no symbols.
        assert!(repo_file.symbols.is_empty());
    }

    #[test]
    fn test_process_file_with_content() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.rs");
        fs::write(&path, "fn main() {}").unwrap();

        let repo_file = process_file_with_content(rust_info(path, 12), &ScannerConfig::default())
            .expect("readable file should produce a RepoFile");

        assert!(repo_file.content.is_some());
        // Full path parses and should find the `main` function.
        assert!(!repo_file.symbols.is_empty());
    }

    #[test]
    fn test_process_file_without_content() {
        let info = rust_info(PathBuf::from("/path/to/test.rs"), 1000);
        let repo_file = process_file_without_content(info, &ScannerConfig::default());

        assert!(repo_file.content.is_none());
        assert!(repo_file.symbols.is_empty());
        assert_eq!(repo_file.size_bytes, 1000);
    }

    #[test]
    fn test_parse_with_thread_local_rust() {
        // Known extension: the parser should extract the `main` function.
        let symbols = parse_with_thread_local("fn main() {}", &PathBuf::from("test.rs"));
        assert!(!symbols.is_empty());
    }

    #[test]
    fn test_parse_with_thread_local_unknown_extension() {
        // Unmapped extension: parsing is skipped and no symbols come back.
        let symbols = parse_with_thread_local("some content", &PathBuf::from("test.unknown"));
        assert!(symbols.is_empty());
    }
}