infiniloom_engine/
mmap_scanner.rs

1//! Memory-mapped file scanner for high-performance large repository scanning
2//!
//! Large files are read through a read-only memory map instead of a regular
//! read, enabling efficient scanning of very large files and repositories.
//! Note that scan results still hold an owned copy of each file's text.
5
6use memmap2::{Mmap, MmapOptions};
7use rayon::prelude::*;
8use std::fs::File;
9use std::io;
10use std::path::Path;
11use std::sync::atomic::{AtomicU64, Ordering};
12
13use crate::tokenizer::{TokenCounts, TokenModel, Tokenizer};
14
15/// A memory-mapped file for efficient reading
16pub struct MappedFile {
17    mmap: Mmap,
18    path: String,
19}
20
21impl MappedFile {
22    /// Open a file with memory mapping
23    #[allow(unsafe_code)]
24    pub fn open(path: &Path) -> io::Result<Self> {
25        let file = File::open(path)?;
26        // SAFETY: mapping is read-only and the file remains open for the mmap lifetime.
27        let mmap = unsafe { MmapOptions::new().map(&file)? };
28
29        Ok(Self { mmap, path: path.to_string_lossy().to_string() })
30    }
31
32    /// Get the file contents as a byte slice
33    #[inline]
34    pub fn as_bytes(&self) -> &[u8] {
35        &self.mmap
36    }
37
38    /// Get the file contents as a string (if valid UTF-8)
39    pub fn as_str(&self) -> Option<&str> {
40        std::str::from_utf8(&self.mmap).ok()
41    }
42
43    /// Get file size
44    #[inline]
45    pub fn len(&self) -> usize {
46        self.mmap.len()
47    }
48
49    /// Check if empty
50    #[inline]
51    pub fn is_empty(&self) -> bool {
52        self.mmap.is_empty()
53    }
54
55    /// Get the file path
56    pub fn path(&self) -> &str {
57        &self.path
58    }
59
60    /// Check if content appears to be binary
61    pub fn is_binary(&self) -> bool {
62        // Check first 8KB for binary indicators
63        let check_len = self.mmap.len().min(8192);
64        let sample = &self.mmap[..check_len];
65
66        // Null bytes indicate binary
67        if sample.contains(&0) {
68            return true;
69        }
70
71        // High ratio of non-printable characters
72        let non_printable = sample
73            .iter()
74            .filter(|&&b| b < 32 && b != b'\t' && b != b'\n' && b != b'\r')
75            .count();
76
77        non_printable * 10 > check_len
78    }
79
80    /// Count lines efficiently using SIMD-friendly iteration
81    pub fn count_lines(&self) -> usize {
82        self.mmap.iter().filter(|&&b| b == b'\n').count()
83    }
84}
85
/// High-performance scanner using memory-mapped files
pub struct MmapScanner {
    /// Files whose size is at least this many bytes are read through a memory map;
    /// smaller files use a regular read
    mmap_threshold: u64,
    /// Files larger than this many bytes are skipped
    max_file_size: u64,
    /// Tokenizer used to compute token counts for scanned content
    tokenizer: Tokenizer,
    /// Counters updated as files are scanned
    stats: ScanStats,
}
97
/// Counters collected while scanning.
///
/// Every field is an atomic, so the counters can be updated through a shared reference.
#[derive(Debug, Default)]
pub struct ScanStats {
    /// Number of files scanned
    pub files_scanned: AtomicU64,
    /// Total bytes attributed to scanned files
    pub bytes_read: AtomicU64,
    /// Number of files skipped as binary
    pub files_skipped_binary: AtomicU64,
    /// Number of files skipped for exceeding the size limit
    pub files_skipped_size: AtomicU64,
    /// Number of files handled through a memory map
    pub mmap_used: AtomicU64,
    /// Number of files handled through a regular read
    pub regular_read_used: AtomicU64,
}

impl ScanStats {
    /// Render every counter as a single human-readable line.
    pub fn summary(&self) -> String {
        let read = |counter: &AtomicU64| counter.load(Ordering::Relaxed);

        let scanned = read(&self.files_scanned);
        let bytes = read(&self.bytes_read);
        let binary = read(&self.files_skipped_binary);
        let oversized = read(&self.files_skipped_size);
        let mapped = read(&self.mmap_used);
        let regular = read(&self.regular_read_used);

        format!(
            "Scanned {} files ({} bytes), skipped {} binary + {} oversized, mmap: {}, regular: {}",
            scanned, bytes, binary, oversized, mapped, regular,
        )
    }
}
122
/// Result of scanning a single file
#[derive(Debug)]
pub struct ScannedFile {
    /// Path of the file as given to the scanner (lossily converted to UTF-8)
    pub path: String,
    /// Path relative to the scan base path; the full path when the file does not
    /// lie under the base path
    pub relative_path: String,
    /// File size in bytes as reported by the filesystem metadata
    pub size_bytes: u64,
    /// Number of lines in the content
    pub lines: usize,
    /// Token counts computed for the content
    pub token_counts: TokenCounts,
    /// Language name inferred from the file extension, if recognised
    pub language: Option<String>,
    /// File content; `MmapScanner::scan_file` always populates this
    pub content: Option<String>,
    /// Whether the file was classified as binary; `MmapScanner::scan_file` only
    /// returns non-binary files and sets this to `false`
    pub is_binary: bool,
}
135
136impl MmapScanner {
137    /// Create a new scanner with default settings
138    pub fn new() -> Self {
139        Self {
140            mmap_threshold: 64 * 1024,       // 64KB
141            max_file_size: 50 * 1024 * 1024, // 50MB
142            tokenizer: Tokenizer::new(),
143            stats: ScanStats::default(),
144        }
145    }
146
147    /// Set minimum file size for memory mapping
148    pub fn with_mmap_threshold(mut self, bytes: u64) -> Self {
149        self.mmap_threshold = bytes;
150        self
151    }
152
153    /// Set maximum file size
154    pub fn with_max_file_size(mut self, bytes: u64) -> Self {
155        self.max_file_size = bytes;
156        self
157    }
158
159    /// Scan a single file
160    pub fn scan_file(&self, path: &Path, base_path: &Path) -> io::Result<Option<ScannedFile>> {
161        let metadata = path.metadata()?;
162        let size = metadata.len();
163
164        // Skip files over max size
165        if size > self.max_file_size {
166            self.stats
167                .files_skipped_size
168                .fetch_add(1, Ordering::Relaxed);
169            return Ok(None);
170        }
171
172        let relative_path = path
173            .strip_prefix(base_path)
174            .unwrap_or(path)
175            .to_string_lossy()
176            .to_string();
177
178        // Choose reading strategy based on file size
179        let (content_bytes, _use_mmap) = if size >= self.mmap_threshold {
180            self.stats.mmap_used.fetch_add(1, Ordering::Relaxed);
181            let mapped = MappedFile::open(path)?;
182
183            // Check for binary
184            if mapped.is_binary() {
185                self.stats
186                    .files_skipped_binary
187                    .fetch_add(1, Ordering::Relaxed);
188                return Ok(None);
189            }
190
191            (mapped.as_bytes().to_vec(), true)
192        } else {
193            self.stats.regular_read_used.fetch_add(1, Ordering::Relaxed);
194            let content = std::fs::read(path)?;
195
196            // Check for binary
197            if is_binary_content(&content) {
198                self.stats
199                    .files_skipped_binary
200                    .fetch_add(1, Ordering::Relaxed);
201                return Ok(None);
202            }
203
204            (content, false)
205        };
206
207        // Convert to string
208        let content_str = match String::from_utf8(content_bytes) {
209            Ok(s) => s,
210            Err(_) => {
211                self.stats
212                    .files_skipped_binary
213                    .fetch_add(1, Ordering::Relaxed);
214                return Ok(None);
215            },
216        };
217
218        // Count tokens
219        let token_counts = self.tokenizer.count_all(&content_str);
220
221        // Count lines
222        let lines = content_str.lines().count();
223
224        // Detect language
225        let language = detect_language(path);
226
227        self.stats.files_scanned.fetch_add(1, Ordering::Relaxed);
228        self.stats.bytes_read.fetch_add(size, Ordering::Relaxed);
229
230        Ok(Some(ScannedFile {
231            path: path.to_string_lossy().to_string(),
232            relative_path,
233            size_bytes: size,
234            lines,
235            token_counts,
236            language,
237            content: Some(content_str),
238            is_binary: false,
239        }))
240    }
241
242    /// Scan multiple files in parallel
243    pub fn scan_files_parallel(&self, paths: &[&Path], base_path: &Path) -> Vec<ScannedFile> {
244        paths
245            .par_iter()
246            .filter_map(|path| match self.scan_file(path, base_path) {
247                Ok(Some(file)) => Some(file),
248                Ok(None) => None,
249                Err(e) => {
250                    tracing::debug!("Error scanning {:?}: {}", path, e);
251                    None
252                },
253            })
254            .collect()
255    }
256
257    /// Get scanning statistics
258    pub fn stats(&self) -> &ScanStats {
259        &self.stats
260    }
261
262    /// Reset statistics
263    pub fn reset_stats(&self) {
264        self.stats.files_scanned.store(0, Ordering::Relaxed);
265        self.stats.bytes_read.store(0, Ordering::Relaxed);
266        self.stats.files_skipped_binary.store(0, Ordering::Relaxed);
267        self.stats.files_skipped_size.store(0, Ordering::Relaxed);
268        self.stats.mmap_used.store(0, Ordering::Relaxed);
269        self.stats.regular_read_used.store(0, Ordering::Relaxed);
270    }
271}
272
273impl Default for MmapScanner {
274    fn default() -> Self {
275        Self::new()
276    }
277}
278
/// Heuristically decide whether `content` is binary.
///
/// Only the first 8KB are inspected. Any NUL byte marks the content as binary;
/// otherwise it is binary when more than 10% of the inspected bytes are control
/// characters (below 0x20) other than tab, line feed and carriage return.
fn is_binary_content(content: &[u8]) -> bool {
    const SAMPLE_LIMIT: usize = 8192;

    let sample = content.get(..SAMPLE_LIMIT).unwrap_or(content);
    let mut control_bytes = 0usize;

    for &byte in sample {
        match byte {
            0 => return true,
            b'\t' | b'\n' | b'\r' => {},
            1..=31 => control_bytes += 1,
            _ => {},
        }
    }

    control_bytes * 10 > sample.len()
}
295
/// Map a path's extension to a language name.
///
/// The extension is lower-cased before lookup. Returns `None` when the path has
/// no extension, the extension is not valid UTF-8, or it is not in the table.
fn detect_language(path: &Path) -> Option<String> {
    /// Known extensions (lower-case) grouped by the language they map to.
    const LANGUAGES: &[(&[&str], &str)] = &[
        (&["py", "pyw", "pyi"], "python"),
        (&["js", "mjs", "cjs"], "javascript"),
        (&["jsx"], "jsx"),
        (&["ts", "mts", "cts"], "typescript"),
        (&["tsx"], "tsx"),
        (&["rs"], "rust"),
        (&["go"], "go"),
        (&["java"], "java"),
        (&["c", "h"], "c"),
        (&["cpp", "hpp", "cc", "cxx"], "cpp"),
        (&["cs"], "csharp"),
        (&["rb"], "ruby"),
        (&["php"], "php"),
        (&["swift"], "swift"),
        (&["kt", "kts"], "kotlin"),
        (&["scala"], "scala"),
        (&["sh", "bash"], "bash"),
        (&["lua"], "lua"),
        (&["zig"], "zig"),
        (&["md", "markdown"], "markdown"),
        (&["json"], "json"),
        (&["yaml", "yml"], "yaml"),
        (&["toml"], "toml"),
        (&["xml"], "xml"),
        (&["html", "htm"], "html"),
        (&["css"], "css"),
        (&["scss", "sass"], "scss"),
        (&["sql"], "sql"),
    ];

    let normalized = path.extension().and_then(|ext| ext.to_str())?.to_lowercase();

    LANGUAGES
        .iter()
        .find(|(extensions, _)| extensions.contains(&normalized.as_str()))
        .map(|&(_, language)| language.to_owned())
}
334
335/// Streaming content processor for very large files
336pub struct StreamingProcessor {
337    chunk_size: usize,
338    tokenizer: Tokenizer,
339}
340
341impl StreamingProcessor {
342    /// Create a new streaming processor
343    pub fn new(chunk_size: usize) -> Self {
344        Self { chunk_size, tokenizer: Tokenizer::new() }
345    }
346
347    /// Process a file in chunks, yielding partial results
348    pub fn process_file<F>(&self, path: &Path, mut callback: F) -> io::Result<()>
349    where
350        F: FnMut(&str, usize, TokenCounts),
351    {
352        let mapped = MappedFile::open(path)?;
353
354        if mapped.is_binary() {
355            return Ok(());
356        }
357
358        let content = mapped
359            .as_str()
360            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Invalid UTF-8"))?;
361
362        let mut offset = 0;
363        while offset < content.len() {
364            let end = (offset + self.chunk_size).min(content.len());
365
366            // Find line boundary
367            let chunk_end = if end < content.len() {
368                content[offset..end]
369                    .rfind('\n')
370                    .map(|i| offset + i + 1)
371                    .unwrap_or(end)
372            } else {
373                end
374            };
375
376            let chunk = &content[offset..chunk_end];
377            let tokens = self.tokenizer.count_all(chunk);
378
379            callback(chunk, offset, tokens);
380
381            offset = chunk_end;
382        }
383
384        Ok(())
385    }
386
387    /// Estimate total tokens without loading full content
388    pub fn estimate_tokens(&self, path: &Path, model: TokenModel) -> io::Result<u32> {
389        let metadata = path.metadata()?;
390        let size = metadata.len();
391
392        // Quick estimation based on file size
393        let chars_per_token = model.chars_per_token();
394        Ok((size as f32 / chars_per_token).ceil() as u32)
395    }
396}
397
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::{tempdir, NamedTempFile};

    /// Create a temporary file (no particular extension) holding exactly `content`.
    fn temp_with(content: &[u8]) -> NamedTempFile {
        let mut temp = NamedTempFile::new().unwrap();
        temp.write_all(content).unwrap();
        temp
    }

    /// Create a temporary file whose name ends in `suffix`, holding exactly `content`.
    fn temp_with_suffix(suffix: &str, content: &[u8]) -> NamedTempFile {
        let mut temp = NamedTempFile::with_suffix(suffix).unwrap();
        temp.write_all(content).unwrap();
        temp
    }

    /// Scan `path`, using its parent directory as the base path.
    fn scan(scanner: &MmapScanner, path: &Path) -> Option<ScannedFile> {
        scanner.scan_file(path, path.parent().unwrap()).unwrap()
    }

    /// Read a statistics counter.
    fn load(counter: &AtomicU64) -> u64 {
        counter.load(Ordering::Relaxed)
    }

    #[test]
    fn test_mapped_file() {
        let temp = temp_with(b"Hello, World!\nSecond line\n");
        let mapped = MappedFile::open(temp.path()).unwrap();

        assert!(!mapped.is_empty());
        assert!(!mapped.is_binary());
        assert_eq!(mapped.count_lines(), 2);
    }

    #[test]
    fn test_mapped_file_as_str() {
        let temp = temp_with(b"Valid UTF-8 content\n");
        let mapped = MappedFile::open(temp.path()).unwrap();

        let content = mapped.as_str();
        assert!(content.is_some());
        assert!(content.unwrap().contains("Valid UTF-8"));
    }

    #[test]
    fn test_mapped_file_len_and_path() {
        let temp = temp_with(b"Test content\n");
        let mapped = MappedFile::open(temp.path()).unwrap();

        assert_eq!(mapped.len(), b"Test content\n".len());
        assert!(!mapped.path().is_empty());
        assert!(mapped
            .path()
            .contains(temp.path().file_name().unwrap().to_str().unwrap()));
    }

    #[test]
    fn test_mapped_file_as_bytes() {
        let temp = temp_with(b"Raw bytes");
        let mapped = MappedFile::open(temp.path()).unwrap();

        assert_eq!(mapped.as_bytes(), b"Raw bytes");
    }

    #[test]
    fn test_mapped_file_empty() {
        let temp = NamedTempFile::new().unwrap();
        let mapped = MappedFile::open(temp.path()).unwrap();

        assert!(mapped.is_empty());
        assert_eq!(mapped.len(), 0);
        assert_eq!(mapped.count_lines(), 0);
    }

    #[test]
    fn test_mapped_file_invalid_utf8() {
        // 0xFF and 0xFE can never appear in valid UTF-8.
        let temp = temp_with(&[0xFF, 0xFE, 0x41, 0x42]);
        let mapped = MappedFile::open(temp.path()).unwrap();

        assert!(mapped.as_str().is_none());
    }

    #[test]
    fn test_binary_detection() {
        let temp = temp_with(&[0x00, 0x01, 0x02, 0x03]);
        let mapped = MappedFile::open(temp.path()).unwrap();

        assert!(mapped.is_binary());
    }

    #[test]
    fn test_binary_detection_high_non_printable() {
        // No NUL bytes here, so only the non-printable ratio can trigger detection.
        let mut content = vec![0x01u8; 100];
        content.extend(b"some text");
        let temp = temp_with(&content);

        let mapped = MappedFile::open(temp.path()).unwrap();
        assert!(mapped.is_binary());
    }

    #[test]
    fn test_binary_detection_text_with_tabs() {
        // Text with tabs and newlines should NOT be binary
        let temp = temp_with(b"Line 1\twith\ttabs\nLine 2\twith\ttabs\n");
        let mapped = MappedFile::open(temp.path()).unwrap();

        assert!(!mapped.is_binary());
    }

    #[test]
    fn test_scanner() {
        let temp = temp_with_suffix(".py", b"def hello():\n    print('hello')\n");
        let scanner = MmapScanner::new();

        let result = scan(&scanner, temp.path());

        assert!(result.is_some());
        let file = result.unwrap();
        assert_eq!(file.language.as_deref(), Some("python"));
        assert!(file.token_counts.claude > 0);
    }

    #[test]
    fn test_scanner_default() {
        let scanner = MmapScanner::default();
        // Should have same settings as new()
        assert_eq!(scanner.mmap_threshold, 64 * 1024);
        assert_eq!(scanner.max_file_size, 50 * 1024 * 1024);
    }

    #[test]
    fn test_scanner_with_thresholds() {
        let scanner = MmapScanner::new()
            .with_mmap_threshold(1024)
            .with_max_file_size(1024 * 1024);
        assert_eq!(scanner.mmap_threshold, 1024);
        assert_eq!(scanner.max_file_size, 1024 * 1024);
    }

    #[test]
    fn test_scanner_skips_large_files() {
        // 14 bytes of content against a 5-byte limit.
        let temp = temp_with(b"Small content\n");
        let scanner = MmapScanner::new().with_max_file_size(5);

        let result = scan(&scanner, temp.path());

        assert!(result.is_none());
        assert_eq!(load(&scanner.stats().files_skipped_size), 1);
    }

    #[test]
    fn test_scanner_skips_binary_files() {
        let temp = temp_with(&[0x00, 0x01, 0x02, 0x03]);
        let scanner = MmapScanner::new();

        let result = scan(&scanner, temp.path());

        assert!(result.is_none());
        assert_eq!(load(&scanner.stats().files_skipped_binary), 1);
    }

    #[test]
    fn test_scanner_uses_mmap_for_large_files() {
        // ~130KB of content, far above the 1KB mmap threshold configured below.
        let content = "fn test() {}\n".repeat(10000);
        let temp = temp_with_suffix(".rs", content.as_bytes());

        let scanner = MmapScanner::new().with_mmap_threshold(1024);
        let result = scan(&scanner, temp.path());

        assert!(result.is_some());
        assert!(load(&scanner.stats().mmap_used) >= 1);
    }

    #[test]
    fn test_scanner_uses_regular_read_for_small_files() {
        let temp = temp_with_suffix(".py", b"x = 1\n");

        // Threshold far above the file size forces the regular-read path.
        let scanner = MmapScanner::new().with_mmap_threshold(1024 * 1024);
        let result = scan(&scanner, temp.path());

        assert!(result.is_some());
        assert_eq!(load(&scanner.stats().regular_read_used), 1);
    }

    #[test]
    fn test_scanner_reset_stats() {
        let temp = temp_with_suffix(".py", b"x = 1\n");
        let scanner = MmapScanner::new();
        scan(&scanner, temp.path());

        assert!(load(&scanner.stats().files_scanned) >= 1);

        scanner.reset_stats();

        let stats = scanner.stats();
        assert_eq!(load(&stats.files_scanned), 0);
        assert_eq!(load(&stats.bytes_read), 0);
        assert_eq!(load(&stats.files_skipped_binary), 0);
        assert_eq!(load(&stats.files_skipped_size), 0);
        assert_eq!(load(&stats.mmap_used), 0);
        assert_eq!(load(&stats.regular_read_used), 0);
    }

    #[test]
    fn test_scan_stats_summary() {
        let stats = ScanStats::default();
        stats.files_scanned.store(10, Ordering::Relaxed);
        stats.bytes_read.store(5000, Ordering::Relaxed);
        stats.files_skipped_binary.store(2, Ordering::Relaxed);
        stats.files_skipped_size.store(1, Ordering::Relaxed);
        stats.mmap_used.store(5, Ordering::Relaxed);
        stats.regular_read_used.store(5, Ordering::Relaxed);

        let summary = stats.summary();
        assert!(summary.contains("10 files"));
        assert!(summary.contains("5000 bytes"));
        assert!(summary.contains("2 binary"));
        assert!(summary.contains("1 oversized"));
        assert!(summary.contains("mmap: 5"));
        assert!(summary.contains("regular: 5"));
    }

    #[test]
    fn test_scan_files_parallel() {
        let dir = tempdir().unwrap();
        let file1 = dir.path().join("test1.py");
        let file2 = dir.path().join("test2.rs");
        let file3 = dir.path().join("binary.bin");

        std::fs::write(&file1, "def foo(): pass\n").unwrap();
        std::fs::write(&file2, "fn main() {}\n").unwrap();
        std::fs::write(&file3, [0x00, 0x01, 0x02]).unwrap(); // Binary

        let scanner = MmapScanner::new();
        let paths: Vec<&Path> = vec![file1.as_path(), file2.as_path(), file3.as_path()];
        let results = scanner.scan_files_parallel(&paths, dir.path());

        // Should get 2 files (binary skipped)
        assert_eq!(results.len(), 2);
        assert!(results
            .iter()
            .any(|f| f.language.as_deref() == Some("python")));
        assert!(results
            .iter()
            .any(|f| f.language.as_deref() == Some("rust")));
    }

    #[test]
    fn test_scan_files_parallel_with_errors() {
        let dir = tempdir().unwrap();
        let file1 = dir.path().join("test.py");
        std::fs::write(&file1, "x = 1\n").unwrap();

        let scanner = MmapScanner::new();
        let nonexistent = Path::new("/nonexistent/file.py");
        let paths: Vec<&Path> = vec![file1.as_path(), nonexistent];
        let results = scanner.scan_files_parallel(&paths, dir.path());

        // Should get 1 file (nonexistent skipped with error)
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_detect_language() {
        assert_eq!(detect_language(Path::new("test.py")).as_deref(), Some("python"));
        assert_eq!(detect_language(Path::new("test.rs")).as_deref(), Some("rust"));
        assert_eq!(detect_language(Path::new("test.ts")).as_deref(), Some("typescript"));
        assert_eq!(detect_language(Path::new("test.unknown")), None);
    }

    #[test]
    fn test_detect_language_all_extensions() {
        let cases: &[(&str, &str)] = &[
            // Python
            ("py", "python"),
            ("pyw", "python"),
            ("pyi", "python"),
            // JavaScript
            ("js", "javascript"),
            ("mjs", "javascript"),
            ("cjs", "javascript"),
            ("jsx", "jsx"),
            // TypeScript
            ("ts", "typescript"),
            ("mts", "typescript"),
            ("cts", "typescript"),
            ("tsx", "tsx"),
            // Systems languages
            ("rs", "rust"),
            ("go", "go"),
            ("c", "c"),
            ("h", "c"),
            ("cpp", "cpp"),
            ("hpp", "cpp"),
            ("cc", "cpp"),
            ("cxx", "cpp"),
            ("zig", "zig"),
            // JVM languages
            ("java", "java"),
            ("kt", "kotlin"),
            ("kts", "kotlin"),
            ("scala", "scala"),
            // Other languages
            ("cs", "csharp"),
            ("rb", "ruby"),
            ("php", "php"),
            ("swift", "swift"),
            ("lua", "lua"),
            // Shell
            ("sh", "bash"),
            ("bash", "bash"),
            // Markup/Data
            ("md", "markdown"),
            ("markdown", "markdown"),
            ("json", "json"),
            ("yaml", "yaml"),
            ("yml", "yaml"),
            ("toml", "toml"),
            ("xml", "xml"),
            ("html", "html"),
            ("htm", "html"),
            ("css", "css"),
            ("scss", "scss"),
            ("sass", "scss"),
            ("sql", "sql"),
        ];

        for &(ext, expected) in cases {
            let file_name = format!("test.{}", ext);
            assert_eq!(
                detect_language(Path::new(&file_name)).as_deref(),
                Some(expected),
                "extension {:?}",
                ext
            );
        }

        // No extension
        assert_eq!(detect_language(Path::new("Makefile")), None);
        assert_eq!(detect_language(Path::new("README")), None);
    }

    #[test]
    fn test_detect_language_case_insensitive() {
        // Extensions should be case-insensitive
        assert_eq!(detect_language(Path::new("test.PY")).as_deref(), Some("python"));
        assert_eq!(detect_language(Path::new("test.RS")).as_deref(), Some("rust"));
        assert_eq!(detect_language(Path::new("test.Js")).as_deref(), Some("javascript"));
    }

    #[test]
    fn test_is_binary_content() {
        // Text content should not be binary
        assert!(!is_binary_content(b"Hello, world!\n"));
        assert!(!is_binary_content(b"Line 1\nLine 2\nLine 3\n"));
        assert!(!is_binary_content(b"Tab\tseparated\tvalues\n"));

        // Null bytes indicate binary
        assert!(is_binary_content(&[0x00, 0x01, 0x02]));
        assert!(is_binary_content(b"text\x00with\x00nulls"));

        // High non-printable ratio indicates binary. The range starts at 1 so the
        // sample contains no NUL byte and really exercises the ratio check.
        let mostly_binary: Vec<u8> = (1u8..100).collect();
        assert!(is_binary_content(&mostly_binary));
    }

    #[test]
    fn test_streaming_processor() {
        let content: String = (0..100)
            .map(|i| format!("Line {}: Some content here\n", i))
            .collect();
        let temp = temp_with(content.as_bytes());

        let processor = StreamingProcessor::new(256);
        let mut chunks = 0;

        processor
            .process_file(temp.path(), |_chunk, _offset, _tokens| {
                chunks += 1;
            })
            .unwrap();

        assert!(chunks > 1);
    }

    #[test]
    fn test_streaming_processor_single_chunk() {
        let temp = temp_with(b"Short content\n");

        let processor = StreamingProcessor::new(1024 * 1024); // Large chunk size
        let mut chunks = 0;
        let mut total_offset = 0;

        processor
            .process_file(temp.path(), |_chunk, offset, _tokens| {
                chunks += 1;
                total_offset = offset;
            })
            .unwrap();

        assert_eq!(chunks, 1);
        assert_eq!(total_offset, 0);
    }

    #[test]
    fn test_streaming_processor_binary_file() {
        let temp = temp_with(&[0x00, 0x01, 0x02]);

        let processor = StreamingProcessor::new(256);
        let mut chunks = 0;

        // Should return Ok(()) but not call callback for binary files
        processor
            .process_file(temp.path(), |_chunk, _offset, _tokens| {
                chunks += 1;
            })
            .unwrap();

        assert_eq!(chunks, 0);
    }

    #[test]
    fn test_streaming_processor_estimate_tokens() {
        let temp = temp_with("x".repeat(1000).as_bytes());

        let processor = StreamingProcessor::new(256);
        let estimate = processor
            .estimate_tokens(temp.path(), TokenModel::Claude)
            .unwrap();

        // Claude has ~4 chars per token, so 1000 chars should be ~250 tokens
        assert!(estimate > 0);
        assert!(estimate < 500);
    }

    #[test]
    fn test_scanned_file_struct() {
        let file = ScannedFile {
            path: "/tmp/test.py".to_owned(),
            relative_path: "test.py".to_owned(),
            size_bytes: 100,
            lines: 10,
            token_counts: TokenCounts::default(),
            language: Some("python".to_owned()),
            content: Some("x = 1".to_owned()),
            is_binary: false,
        };

        assert_eq!(file.path, "/tmp/test.py");
        assert_eq!(file.relative_path, "test.py");
        assert_eq!(file.size_bytes, 100);
        assert_eq!(file.lines, 10);
        assert!(!file.is_binary);
    }

    #[test]
    fn test_mapped_file_open_nonexistent() {
        let result = MappedFile::open(Path::new("/nonexistent/file.txt"));
        assert!(result.is_err());
    }

    #[test]
    fn test_scanner_nonexistent_file() {
        let scanner = MmapScanner::new();
        let result =
            scanner.scan_file(Path::new("/nonexistent/file.py"), Path::new("/nonexistent"));
        assert!(result.is_err());
    }

    #[test]
    fn test_streaming_processor_invalid_utf8() {
        // The content must get past the binary check (no NUL bytes, low
        // non-printable ratio) and still be invalid UTF-8, so embed an invalid
        // sequence in otherwise plain text.
        let temp = temp_with(b"Hello \xFF\xFE World");

        let processor = StreamingProcessor::new(256);
        let result = processor.process_file(temp.path(), |_, _, _| {});

        // Should return an error for invalid UTF-8
        assert!(result.is_err());
    }
}