// infiniloom_engine/scanner/process.rs

1//! File processing utilities
2//!
3//! This module provides file processing functions for the scanner,
4//! including token counting, symbol extraction, and file metadata.
5
6use std::path::Path;
7
8use crate::parser;
9use crate::tokenizer::{TokenCounts, Tokenizer};
10use crate::types::{RepoFile, Symbol};
11
12use super::io::smart_read_file_with_options;
13use super::{FileInfo, ScannerConfig};
14
// Thread-local tokenizer for lock-free parallel token counting.
// Each worker thread lazily creates its own `Tokenizer` on first use, so
// accurate (tiktoken) counting in parallel scans never contends on a lock.
thread_local! {
    static THREAD_TOKENIZER: Tokenizer = Tokenizer::new();
}
19
/// Parse content using the optimized thread-local parser (lock-free).
///
/// Thin delegate to the centralized [`parser::parse_file_symbols`], which
/// maintains a single lazily-initialized parser instance per thread and
/// selects the grammar from the file extension of `path`.
///
/// Returns the extracted symbols; an unrecognized extension yields an
/// empty `Vec` (see the unit tests below).
///
/// # Performance
///
/// - **2-3x faster** than the old RefCell-based pattern
/// - **Single initialization** per thread (vs per-call)
/// - **Reduced overhead** from eliminated language detection duplication
pub fn parse_with_thread_local(content: &str, path: &Path) -> Vec<Symbol> {
    parser::parse_file_symbols(content, path)
}
33
34/// Count tokens using configurable method
35///
36/// When `accurate` is true, uses tiktoken for exact BPE counts.
37/// When false, uses fast estimation (~80x faster).
38pub fn count_tokens(content: &str, size_bytes: u64, accurate: bool) -> TokenCounts {
39    if accurate {
40        count_tokens_accurate(content)
41    } else {
42        estimate_tokens(size_bytes, Some(content))
43    }
44}
45
46/// Count tokens using thread-local tokenizer (accurate via tiktoken)
47///
48/// Provides exact BPE token counts for OpenAI models.
49/// More accurate but significantly slower than estimation.
50pub fn count_tokens_accurate(content: &str) -> TokenCounts {
51    THREAD_TOKENIZER.with(|tokenizer| tokenizer.count_all(content))
52}
53
54/// Estimate tokens from file size
55///
56/// Uses calibrated character-per-token ratios for each model family.
57/// Fast (~80x faster than tiktoken) with ~95% accuracy.
58pub fn estimate_tokens(size_bytes: u64, content: Option<&str>) -> TokenCounts {
59    // If we have content, use content length for better accuracy
60    let len = content.map(|c| c.len() as f32).unwrap_or(size_bytes as f32);
61
62    TokenCounts {
63        o200k: (len / 4.0) as u32,  // OpenAI modern (GPT-5.x, GPT-4o, O-series)
64        cl100k: (len / 3.7) as u32, // OpenAI legacy (GPT-4, GPT-3.5)
65        claude: (len / 3.5) as u32,
66        gemini: (len / 3.8) as u32,
67        llama: (len / 3.5) as u32,
68        mistral: (len / 3.5) as u32,
69        deepseek: (len / 3.5) as u32,
70        qwen: (len / 3.5) as u32,
71        cohere: (len / 3.6) as u32,
72        grok: (len / 3.5) as u32,
73    }
74}
75
/// Estimate the line count of a file from its size in bytes.
///
/// Assumes an average line length of ~40 characters; integer division
/// truncates, so files under 40 bytes estimate to zero lines.
pub fn estimate_lines(size_bytes: u64) -> u64 {
    const AVG_BYTES_PER_LINE: u64 = 40;
    size_bytes / AVG_BYTES_PER_LINE
}
82
83/// Process a file with content reading only (no parsing - fast path)
84///
85/// Reads file content but skips symbol extraction for speed.
86pub fn process_file_content_only(info: FileInfo, config: &ScannerConfig) -> Option<RepoFile> {
87    let size_bytes = info.size_bytes.unwrap_or(0);
88    let content = smart_read_file_with_options(&info.path, size_bytes, config.use_mmap)?;
89    let token_count = count_tokens(&content, size_bytes, config.accurate_tokens);
90
91    Some(RepoFile {
92        path: info.path,
93        relative_path: info.relative_path,
94        language: info.language,
95        size_bytes,
96        token_count,
97        symbols: Vec::new(),
98        importance: 0.5,
99        content: Some(content),
100    })
101}
102
103/// Process a file with content reading and parsing (used in parallel)
104///
105/// Uses thread-local parser for lock-free parallel parsing.
106/// Uses memory-mapped I/O for files >= 1MB if enabled.
107pub fn process_file_with_content(info: FileInfo, config: &ScannerConfig) -> Option<RepoFile> {
108    let size_bytes = info.size_bytes.unwrap_or(0);
109    let content = smart_read_file_with_options(&info.path, size_bytes, config.use_mmap)?;
110    let token_count = count_tokens(&content, size_bytes, config.accurate_tokens);
111    let symbols = parse_with_thread_local(&content, &info.path);
112
113    Some(RepoFile {
114        path: info.path,
115        relative_path: info.relative_path,
116        language: info.language,
117        size_bytes,
118        token_count,
119        symbols,
120        importance: 0.5,
121        content: Some(content),
122    })
123}
124
125/// Process a file without reading content (fast path)
126///
127/// Only collects metadata, skipping content reading and parsing.
128pub fn process_file_without_content(info: FileInfo, config: &ScannerConfig) -> RepoFile {
129    let size_bytes = info.size_bytes.unwrap_or(0);
130    let token_count = if config.accurate_tokens {
131        // Can't use accurate counting without content
132        estimate_tokens(size_bytes, None)
133    } else {
134        estimate_tokens(size_bytes, None)
135    };
136
137    RepoFile {
138        path: info.path,
139        relative_path: info.relative_path,
140        language: info.language,
141        size_bytes,
142        token_count,
143        symbols: Vec::new(),
144        importance: 0.5,
145        content: None,
146    }
147}
148
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::PathBuf;
    use tempfile::tempdir;

    /// Builds a `FileInfo` describing a Rust file at `path` with `size` bytes.
    fn rust_info(path: PathBuf, size: u64) -> FileInfo {
        FileInfo {
            path,
            relative_path: "test.rs".to_string(),
            size_bytes: Some(size),
            language: Some("rust".to_string()),
        }
    }

    #[test]
    fn test_estimate_tokens_from_content() {
        // 13 chars / 4.0 => 3 tokens for o200k.
        let counts = estimate_tokens(0, Some("Hello, World!"));
        assert_eq!(counts.o200k, 3);
    }

    #[test]
    fn test_estimate_tokens_from_size() {
        // 1000 bytes / 4.0 => 250 tokens for o200k.
        let counts = estimate_tokens(1000, None);
        assert_eq!(counts.o200k, 250);
    }

    #[test]
    fn test_estimate_lines() {
        assert_eq!(estimate_lines(400), 10);
        assert_eq!(estimate_lines(0), 0);
    }

    #[test]
    fn test_count_tokens_configurable() {
        let src = "fn main() {}";

        let fast = count_tokens(src, src.len() as u64, false); // estimation
        let exact = count_tokens(src, src.len() as u64, true); // tiktoken

        // Both strategies must produce non-zero counts for non-empty input.
        assert!(fast.o200k > 0);
        assert!(exact.o200k > 0);
    }

    #[test]
    fn test_process_file_content_only() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.rs");
        fs::write(&path, "fn main() {}").unwrap();

        let repo_file = process_file_content_only(rust_info(path, 12), &ScannerConfig::default())
            .expect("readable file should produce a RepoFile");

        assert!(repo_file.content.is_some());
        // Fast path performs no symbol extraction.
        assert!(repo_file.symbols.is_empty());
    }

    #[test]
    fn test_process_file_with_content() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("test.rs");
        fs::write(&path, "fn main() {}").unwrap();

        let repo_file = process_file_with_content(rust_info(path, 12), &ScannerConfig::default())
            .expect("readable file should produce a RepoFile");

        assert!(repo_file.content.is_some());
        // Full path parses symbols out of the content.
        assert!(!repo_file.symbols.is_empty());
    }

    #[test]
    fn test_process_file_without_content() {
        let info = rust_info(PathBuf::from("/path/to/test.rs"), 1000);
        let repo_file = process_file_without_content(info, &ScannerConfig::default());

        assert!(repo_file.content.is_none());
        assert!(repo_file.symbols.is_empty());
        assert_eq!(repo_file.size_bytes, 1000);
    }

    #[test]
    fn test_parse_with_thread_local_rust() {
        let symbols = parse_with_thread_local("fn main() {}", &PathBuf::from("test.rs"));
        // The `main` function should be picked up.
        assert!(!symbols.is_empty());
    }

    #[test]
    fn test_parse_with_thread_local_unknown_extension() {
        let symbols = parse_with_thread_local("some content", &PathBuf::from("test.unknown"));
        // Unknown extensions yield no symbols.
        assert!(symbols.is_empty());
    }
}