1use memmap2::{Mmap, MmapOptions};
7use rayon::prelude::*;
8use std::fs::File;
9use std::io;
10use std::path::Path;
11use std::sync::atomic::{AtomicU64, Ordering};
12
13use crate::tokenizer::{TokenCounts, TokenModel, Tokenizer};
14
15pub struct MappedFile {
17 mmap: Mmap,
18 path: String,
19}
20
21impl MappedFile {
22 #[allow(unsafe_code)]
24 pub fn open(path: &Path) -> io::Result<Self> {
25 let file = File::open(path)?;
26 let mmap = unsafe { MmapOptions::new().map(&file)? };
28
29 Ok(Self { mmap, path: path.to_string_lossy().to_string() })
30 }
31
32 #[inline]
34 pub fn as_bytes(&self) -> &[u8] {
35 &self.mmap
36 }
37
38 pub fn as_str(&self) -> Option<&str> {
40 std::str::from_utf8(&self.mmap).ok()
41 }
42
43 #[inline]
45 pub fn len(&self) -> usize {
46 self.mmap.len()
47 }
48
49 #[inline]
51 pub fn is_empty(&self) -> bool {
52 self.mmap.is_empty()
53 }
54
55 pub fn path(&self) -> &str {
57 &self.path
58 }
59
60 pub fn is_binary(&self) -> bool {
62 let check_len = self.mmap.len().min(8192);
64 let sample = &self.mmap[..check_len];
65
66 if sample.contains(&0) {
68 return true;
69 }
70
71 let non_printable = sample
73 .iter()
74 .filter(|&&b| b < 32 && b != b'\t' && b != b'\n' && b != b'\r')
75 .count();
76
77 non_printable * 10 > check_len
78 }
79
80 pub fn count_lines(&self) -> usize {
82 self.mmap.iter().filter(|&&b| b == b'\n').count()
83 }
84}
85
86pub struct MmapScanner {
88 mmap_threshold: u64,
90 max_file_size: u64,
92 tokenizer: Tokenizer,
94 stats: ScanStats,
96}
97
/// Atomic counters describing what a scanner has done so far.
#[derive(Debug, Default)]
pub struct ScanStats {
    /// Files that were fully scanned.
    pub files_scanned: AtomicU64,
    /// Total size in bytes of the scanned files.
    pub bytes_read: AtomicU64,
    /// Files skipped because they looked binary or were not valid UTF-8.
    pub files_skipped_binary: AtomicU64,
    /// Files skipped because they exceeded the size limit.
    pub files_skipped_size: AtomicU64,
    /// Files for which the memory-mapped read path was chosen.
    pub mmap_used: AtomicU64,
    /// Files for which the regular read path was chosen.
    pub regular_read_used: AtomicU64,
}

impl ScanStats {
    /// Renders all counters as a single human-readable line.
    ///
    /// Each counter is read with relaxed ordering, so the line is not an
    /// atomic snapshot if other threads are still scanning.
    pub fn summary(&self) -> String {
        let read = |counter: &AtomicU64| counter.load(Ordering::Relaxed);

        let scanned = read(&self.files_scanned);
        let bytes = read(&self.bytes_read);
        let binary = read(&self.files_skipped_binary);
        let oversized = read(&self.files_skipped_size);
        let mapped = read(&self.mmap_used);
        let regular = read(&self.regular_read_used);

        format!(
            "Scanned {} files ({} bytes), skipped {} binary + {} oversized, mmap: {}, regular: {}",
            scanned, bytes, binary, oversized, mapped, regular,
        )
    }
}
122
123#[derive(Debug)]
125pub struct ScannedFile {
126 pub path: String,
127 pub relative_path: String,
128 pub size_bytes: u64,
129 pub lines: usize,
130 pub token_counts: TokenCounts,
131 pub language: Option<String>,
132 pub content: Option<String>,
133 pub is_binary: bool,
134}
135
136impl MmapScanner {
137 pub fn new() -> Self {
139 Self {
140 mmap_threshold: 64 * 1024, max_file_size: 50 * 1024 * 1024, tokenizer: Tokenizer::new(),
143 stats: ScanStats::default(),
144 }
145 }
146
147 pub fn with_mmap_threshold(mut self, bytes: u64) -> Self {
149 self.mmap_threshold = bytes;
150 self
151 }
152
153 pub fn with_max_file_size(mut self, bytes: u64) -> Self {
155 self.max_file_size = bytes;
156 self
157 }
158
159 pub fn scan_file(&self, path: &Path, base_path: &Path) -> io::Result<Option<ScannedFile>> {
161 let metadata = path.metadata()?;
162 let size = metadata.len();
163
164 if size > self.max_file_size {
166 self.stats
167 .files_skipped_size
168 .fetch_add(1, Ordering::Relaxed);
169 return Ok(None);
170 }
171
172 let relative_path = path
173 .strip_prefix(base_path)
174 .unwrap_or(path)
175 .to_string_lossy()
176 .to_string();
177
178 let (content_bytes, _use_mmap) = if size >= self.mmap_threshold {
180 self.stats.mmap_used.fetch_add(1, Ordering::Relaxed);
181 let mapped = MappedFile::open(path)?;
182
183 if mapped.is_binary() {
185 self.stats
186 .files_skipped_binary
187 .fetch_add(1, Ordering::Relaxed);
188 return Ok(None);
189 }
190
191 (mapped.as_bytes().to_vec(), true)
192 } else {
193 self.stats.regular_read_used.fetch_add(1, Ordering::Relaxed);
194 let content = std::fs::read(path)?;
195
196 if is_binary_content(&content) {
198 self.stats
199 .files_skipped_binary
200 .fetch_add(1, Ordering::Relaxed);
201 return Ok(None);
202 }
203
204 (content, false)
205 };
206
207 let content_str = match String::from_utf8(content_bytes) {
209 Ok(s) => s,
210 Err(_) => {
211 self.stats
212 .files_skipped_binary
213 .fetch_add(1, Ordering::Relaxed);
214 return Ok(None);
215 },
216 };
217
218 let token_counts = self.tokenizer.count_all(&content_str);
220
221 let lines = content_str.lines().count();
223
224 let language = detect_language(path);
226
227 self.stats.files_scanned.fetch_add(1, Ordering::Relaxed);
228 self.stats.bytes_read.fetch_add(size, Ordering::Relaxed);
229
230 Ok(Some(ScannedFile {
231 path: path.to_string_lossy().to_string(),
232 relative_path,
233 size_bytes: size,
234 lines,
235 token_counts,
236 language,
237 content: Some(content_str),
238 is_binary: false,
239 }))
240 }
241
242 pub fn scan_files_parallel(&self, paths: &[&Path], base_path: &Path) -> Vec<ScannedFile> {
244 paths
245 .par_iter()
246 .filter_map(|path| match self.scan_file(path, base_path) {
247 Ok(Some(file)) => Some(file),
248 Ok(None) => None,
249 Err(e) => {
250 tracing::debug!("Error scanning {:?}: {}", path, e);
251 None
252 },
253 })
254 .collect()
255 }
256
257 pub fn stats(&self) -> &ScanStats {
259 &self.stats
260 }
261
262 pub fn reset_stats(&self) {
264 self.stats.files_scanned.store(0, Ordering::Relaxed);
265 self.stats.bytes_read.store(0, Ordering::Relaxed);
266 self.stats.files_skipped_binary.store(0, Ordering::Relaxed);
267 self.stats.files_skipped_size.store(0, Ordering::Relaxed);
268 self.stats.mmap_used.store(0, Ordering::Relaxed);
269 self.stats.regular_read_used.store(0, Ordering::Relaxed);
270 }
271}
272
273impl Default for MmapScanner {
274 fn default() -> Self {
275 Self::new()
276 }
277}
278
/// Heuristically decides whether `content` looks binary.
///
/// Only the first 8192 bytes are inspected. The content is considered binary
/// when that sample contains a NUL byte, or when more than 10% of it is made
/// of control bytes (below 32) other than tab, line feed and carriage return.
fn is_binary_content(content: &[u8]) -> bool {
    const SAMPLE_LIMIT: usize = 8192;

    let sample = &content[..content.len().min(SAMPLE_LIMIT)];
    let mut control_bytes = 0usize;

    for &byte in sample {
        match byte {
            // A single NUL byte is decisive.
            0 => return true,
            b'\t' | b'\n' | b'\r' => {},
            other if other < 32 => control_bytes += 1,
            _ => {},
        }
    }

    control_bytes * 10 > sample.len()
}
295
/// Maps a path's extension (case-insensitively) to a language name.
///
/// Returns `None` when the path has no extension, the extension is not valid
/// UTF-8, or the extension is not in the table below.
fn detect_language(path: &Path) -> Option<String> {
    // (extensions, language name)
    const LANGUAGES: &[(&[&str], &str)] = &[
        (&["py", "pyw", "pyi"], "python"),
        (&["js", "mjs", "cjs"], "javascript"),
        (&["jsx"], "jsx"),
        (&["ts", "mts", "cts"], "typescript"),
        (&["tsx"], "tsx"),
        (&["rs"], "rust"),
        (&["go"], "go"),
        (&["java"], "java"),
        (&["c", "h"], "c"),
        (&["cpp", "hpp", "cc", "cxx"], "cpp"),
        (&["cs"], "csharp"),
        (&["rb"], "ruby"),
        (&["php"], "php"),
        (&["swift"], "swift"),
        (&["kt", "kts"], "kotlin"),
        (&["scala"], "scala"),
        (&["sh", "bash"], "bash"),
        (&["lua"], "lua"),
        (&["zig"], "zig"),
        (&["md", "markdown"], "markdown"),
        (&["json"], "json"),
        (&["yaml", "yml"], "yaml"),
        (&["toml"], "toml"),
        (&["xml"], "xml"),
        (&["html", "htm"], "html"),
        (&["css"], "css"),
        (&["scss", "sass"], "scss"),
        (&["sql"], "sql"),
    ];

    let extension = path.extension()?.to_str()?.to_lowercase();

    LANGUAGES
        .iter()
        .find(|(extensions, _)| extensions.contains(&extension.as_str()))
        .map(|&(_, language)| language.to_owned())
}
334
335pub struct StreamingProcessor {
337 chunk_size: usize,
338 tokenizer: Tokenizer,
339}
340
341impl StreamingProcessor {
342 pub fn new(chunk_size: usize) -> Self {
344 Self { chunk_size, tokenizer: Tokenizer::new() }
345 }
346
347 pub fn process_file<F>(&self, path: &Path, mut callback: F) -> io::Result<()>
349 where
350 F: FnMut(&str, usize, TokenCounts),
351 {
352 let mapped = MappedFile::open(path)?;
353
354 if mapped.is_binary() {
355 return Ok(());
356 }
357
358 let content = mapped
359 .as_str()
360 .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Invalid UTF-8"))?;
361
362 let mut offset = 0;
363 while offset < content.len() {
364 let end = (offset + self.chunk_size).min(content.len());
365
366 let chunk_end = if end < content.len() {
368 content[offset..end]
369 .rfind('\n')
370 .map_or(end, |i| offset + i + 1)
371 } else {
372 end
373 };
374
375 let chunk = &content[offset..chunk_end];
376 let tokens = self.tokenizer.count_all(chunk);
377
378 callback(chunk, offset, tokens);
379
380 offset = chunk_end;
381 }
382
383 Ok(())
384 }
385
386 pub fn estimate_tokens(&self, path: &Path, model: TokenModel) -> io::Result<u32> {
388 let metadata = path.metadata()?;
389 let size = metadata.len();
390
391 let chars_per_token = model.chars_per_token();
393 Ok((size as f32 / chars_per_token).ceil() as u32)
394 }
395}
396
#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::{tempdir, NamedTempFile};

    // ---- shared fixtures -------------------------------------------------

    /// Creates a temporary file holding exactly `bytes`.
    fn temp_with(bytes: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(bytes).unwrap();
        file
    }

    /// Creates a temporary file with the given name suffix holding exactly `bytes`.
    fn temp_with_suffix(suffix: &str, bytes: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::with_suffix(suffix).unwrap();
        file.write_all(bytes).unwrap();
        file
    }

    /// Maps a temporary file, panicking on failure.
    fn map(file: &NamedTempFile) -> MappedFile {
        MappedFile::open(file.path()).unwrap()
    }

    /// Scans a temporary file relative to its parent directory.
    fn scan(scanner: &MmapScanner, file: &NamedTempFile) -> Option<ScannedFile> {
        scanner
            .scan_file(file.path(), file.path().parent().unwrap())
            .unwrap()
    }

    /// Reads a statistics counter.
    fn get(counter: &AtomicU64) -> u64 {
        counter.load(Ordering::Relaxed)
    }

    /// Detects the language for a bare file name.
    fn lang(file_name: &str) -> Option<String> {
        detect_language(Path::new(file_name))
    }

    /// Streams a file and returns the offset reported for each chunk.
    fn chunk_offsets(chunk_size: usize, file: &NamedTempFile) -> Vec<usize> {
        let mut offsets = Vec::new();
        StreamingProcessor::new(chunk_size)
            .process_file(file.path(), |_chunk, offset, _tokens| offsets.push(offset))
            .unwrap();
        offsets
    }

    // ---- MappedFile ------------------------------------------------------

    #[test]
    fn test_mapped_file() {
        let temp = temp_with(b"Hello, World!\nSecond line\n");
        let mapped = map(&temp);

        assert!(!mapped.is_empty());
        assert!(!mapped.is_binary());
        assert_eq!(mapped.count_lines(), 2);
    }

    #[test]
    fn test_mapped_file_as_str() {
        let temp = temp_with(b"Valid UTF-8 content\n");
        let mapped = map(&temp);

        let text = mapped.as_str();
        assert!(text.is_some());
        assert!(text.unwrap().contains("Valid UTF-8"));
    }

    #[test]
    fn test_mapped_file_len_and_path() {
        let temp = temp_with(b"Test content\n");
        let mapped = map(&temp);
        let file_name = temp.path().file_name().unwrap().to_str().unwrap();

        assert!(!mapped.is_empty());
        assert!(!mapped.path().is_empty());
        assert!(mapped.path().contains(file_name));
    }

    #[test]
    fn test_mapped_file_as_bytes() {
        let temp = temp_with(b"Raw bytes");
        let mapped = map(&temp);

        assert_eq!(&mapped.as_bytes()[..9], b"Raw bytes");
    }

    #[test]
    fn test_mapped_file_empty() {
        let temp = temp_with(b"");
        let mapped = map(&temp);

        assert!(mapped.is_empty());
        assert_eq!(mapped.len(), 0);
        assert_eq!(mapped.count_lines(), 0);
    }

    #[test]
    fn test_mapped_file_invalid_utf8() {
        let temp = temp_with(&[0xFF, 0xFE, 0x41, 0x42]);

        assert!(map(&temp).as_str().is_none());
    }

    #[test]
    fn test_mapped_file_open_nonexistent() {
        assert!(MappedFile::open(Path::new("/nonexistent/file.txt")).is_err());
    }

    #[test]
    fn test_binary_detection() {
        let temp = temp_with(&[0x00, 0x01, 0x02, 0x03]);

        assert!(map(&temp).is_binary());
    }

    #[test]
    fn test_binary_detection_high_non_printable() {
        let mut content = vec![0x01u8; 100];
        content.extend(b"some text");
        let temp = temp_with(&content);

        assert!(map(&temp).is_binary());
    }

    #[test]
    fn test_binary_detection_text_with_tabs() {
        let temp = temp_with(b"Line 1\twith\ttabs\nLine 2\twith\ttabs\n");

        assert!(!map(&temp).is_binary());
    }

    // ---- MmapScanner -----------------------------------------------------

    #[test]
    fn test_scanner() {
        let temp = temp_with_suffix(".py", b"def hello():\n print('hello')\n");
        let scanner = MmapScanner::new();

        let result = scan(&scanner, &temp);

        assert!(result.is_some());
        let file = result.unwrap();
        assert_eq!(file.language, Some("python".to_string()));
        assert!(file.token_counts.claude > 0);
    }

    #[test]
    fn test_scanner_default() {
        let scanner = MmapScanner::default();

        assert_eq!(scanner.mmap_threshold, 64 * 1024);
        assert_eq!(scanner.max_file_size, 50 * 1024 * 1024);
    }

    #[test]
    fn test_scanner_with_thresholds() {
        let scanner = MmapScanner::new()
            .with_mmap_threshold(1024)
            .with_max_file_size(1024 * 1024);

        assert_eq!(scanner.mmap_threshold, 1024);
        assert_eq!(scanner.max_file_size, 1024 * 1024);
    }

    #[test]
    fn test_scanner_skips_large_files() {
        let temp = temp_with(b"Small content\n");
        let scanner = MmapScanner::new().with_max_file_size(5);

        assert!(scan(&scanner, &temp).is_none());
        assert_eq!(get(&scanner.stats().files_skipped_size), 1);
    }

    #[test]
    fn test_scanner_skips_binary_files() {
        let temp = temp_with(&[0x00, 0x01, 0x02, 0x03]);
        let scanner = MmapScanner::new();

        assert!(scan(&scanner, &temp).is_none());
        assert_eq!(get(&scanner.stats().files_skipped_binary), 1);
    }

    #[test]
    fn test_scanner_uses_mmap_for_large_files() {
        let content = "fn test() {}\n".repeat(10000);
        let temp = temp_with_suffix(".rs", content.as_bytes());
        let scanner = MmapScanner::new().with_mmap_threshold(1024);

        assert!(scan(&scanner, &temp).is_some());
        assert!(get(&scanner.stats().mmap_used) >= 1);
    }

    #[test]
    fn test_scanner_uses_regular_read_for_small_files() {
        let temp = temp_with_suffix(".py", b"x = 1\n");
        let scanner = MmapScanner::new().with_mmap_threshold(1024 * 1024);

        assert!(scan(&scanner, &temp).is_some());
        assert_eq!(get(&scanner.stats().regular_read_used), 1);
    }

    #[test]
    fn test_scanner_reset_stats() {
        let temp = temp_with_suffix(".py", b"x = 1\n");
        let scanner = MmapScanner::new();
        scan(&scanner, &temp);

        assert!(get(&scanner.stats().files_scanned) >= 1);

        scanner.reset_stats();

        let stats = scanner.stats();
        assert_eq!(get(&stats.files_scanned), 0);
        assert_eq!(get(&stats.bytes_read), 0);
        assert_eq!(get(&stats.files_skipped_binary), 0);
        assert_eq!(get(&stats.files_skipped_size), 0);
        assert_eq!(get(&stats.mmap_used), 0);
        assert_eq!(get(&stats.regular_read_used), 0);
    }

    #[test]
    fn test_scanner_nonexistent_file() {
        let scanner = MmapScanner::new();

        let result =
            scanner.scan_file(Path::new("/nonexistent/file.py"), Path::new("/nonexistent"));

        assert!(result.is_err());
    }

    #[test]
    fn test_scan_stats_summary() {
        let stats = ScanStats::default();
        stats.files_scanned.store(10, Ordering::Relaxed);
        stats.bytes_read.store(5000, Ordering::Relaxed);
        stats.files_skipped_binary.store(2, Ordering::Relaxed);
        stats.files_skipped_size.store(1, Ordering::Relaxed);
        stats.mmap_used.store(5, Ordering::Relaxed);
        stats.regular_read_used.store(5, Ordering::Relaxed);

        let summary = stats.summary();

        let expected_fragments = [
            "10 files",
            "5000 bytes",
            "2 binary",
            "1 oversized",
            "mmap: 5",
            "regular: 5",
        ];
        for fragment in expected_fragments.iter() {
            assert!(summary.contains(fragment));
        }
    }

    #[test]
    fn test_scan_files_parallel() {
        let dir = tempdir().unwrap();
        let python_file = dir.path().join("test1.py");
        let rust_file = dir.path().join("test2.rs");
        let binary_file = dir.path().join("binary.bin");

        std::fs::write(&python_file, "def foo(): pass\n").unwrap();
        std::fs::write(&rust_file, "fn main() {}\n").unwrap();
        std::fs::write(&binary_file, [0x00, 0x01, 0x02]).unwrap();

        let scanner = MmapScanner::new();
        let paths: Vec<&Path> =
            vec![python_file.as_path(), rust_file.as_path(), binary_file.as_path()];
        let results = scanner.scan_files_parallel(&paths, dir.path());

        // The binary file is skipped; the two source files remain.
        assert_eq!(results.len(), 2);
        for expected in ["python", "rust"].iter() {
            assert!(results
                .iter()
                .any(|file| file.language == Some(expected.to_string())));
        }
    }

    #[test]
    fn test_scan_files_parallel_with_errors() {
        let dir = tempdir().unwrap();
        let existing = dir.path().join("test.py");
        std::fs::write(&existing, "x = 1\n").unwrap();

        let scanner = MmapScanner::new();
        let paths: Vec<&Path> = vec![existing.as_path(), Path::new("/nonexistent/file.py")];
        let results = scanner.scan_files_parallel(&paths, dir.path());

        // The unreadable path is dropped rather than failing the whole batch.
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_scanned_file_struct() {
        let file = ScannedFile {
            path: "/tmp/test.py".to_string(),
            relative_path: "test.py".to_string(),
            size_bytes: 100,
            lines: 10,
            token_counts: TokenCounts::default(),
            language: Some("python".to_string()),
            content: Some("x = 1".to_string()),
            is_binary: false,
        };

        assert_eq!(file.path, "/tmp/test.py");
        assert_eq!(file.relative_path, "test.py");
        assert_eq!(file.size_bytes, 100);
        assert_eq!(file.lines, 10);
        assert!(!file.is_binary);
    }

    // ---- detect_language / is_binary_content -----------------------------

    #[test]
    fn test_detect_language() {
        assert_eq!(lang("test.py"), Some("python".to_string()));
        assert_eq!(lang("test.rs"), Some("rust".to_string()));
        assert_eq!(lang("test.ts"), Some("typescript".to_string()));
        assert_eq!(lang("test.unknown"), None);
    }

    #[test]
    fn test_detect_language_all_extensions() {
        // (expected language, extensions that must map to it)
        let cases: &[(&str, &[&str])] = &[
            ("python", &["py", "pyw", "pyi"]),
            ("javascript", &["js", "mjs", "cjs"]),
            ("jsx", &["jsx"]),
            ("typescript", &["ts", "mts", "cts"]),
            ("tsx", &["tsx"]),
            ("rust", &["rs"]),
            ("go", &["go"]),
            ("c", &["c", "h"]),
            ("cpp", &["cpp", "hpp", "cc", "cxx"]),
            ("zig", &["zig"]),
            ("java", &["java"]),
            ("kotlin", &["kt", "kts"]),
            ("scala", &["scala"]),
            ("csharp", &["cs"]),
            ("ruby", &["rb"]),
            ("php", &["php"]),
            ("swift", &["swift"]),
            ("lua", &["lua"]),
            ("bash", &["sh", "bash"]),
            ("markdown", &["md", "markdown"]),
            ("json", &["json"]),
            ("yaml", &["yaml", "yml"]),
            ("toml", &["toml"]),
            ("xml", &["xml"]),
            ("html", &["html", "htm"]),
            ("css", &["css"]),
            ("scss", &["scss", "sass"]),
            ("sql", &["sql"]),
        ];

        for (language, extensions) in cases.iter() {
            for extension in extensions.iter() {
                let file_name = format!("test.{}", extension);
                assert_eq!(lang(&file_name), Some(language.to_string()));
            }
        }

        // Files without an extension have no detectable language.
        assert_eq!(lang("Makefile"), None);
        assert_eq!(lang("README"), None);
    }

    #[test]
    fn test_detect_language_case_insensitive() {
        assert_eq!(lang("test.PY"), Some("python".to_string()));
        assert_eq!(lang("test.RS"), Some("rust".to_string()));
        assert_eq!(lang("test.Js"), Some("javascript".to_string()));
    }

    #[test]
    fn test_is_binary_content() {
        let text_samples: [&[u8]; 3] = [
            b"Hello, world!\n",
            b"Line 1\nLine 2\nLine 3\n",
            b"Tab\tseparated\tvalues\n",
        ];
        for sample in text_samples.iter() {
            assert!(!is_binary_content(sample));
        }

        assert!(is_binary_content(&[0x00, 0x01, 0x02]));
        assert!(is_binary_content(b"text\x00with\x00nulls"));

        let mostly_binary: Vec<u8> = (0u8..100).collect();
        assert!(is_binary_content(&mostly_binary));
    }

    // ---- StreamingProcessor ----------------------------------------------

    #[test]
    fn test_streaming_processor() {
        let mut temp = NamedTempFile::new().unwrap();
        for i in 0..100 {
            writeln!(temp, "Line {}: Some content here", i).unwrap();
        }

        assert!(chunk_offsets(256, &temp).len() > 1);
    }

    #[test]
    fn test_streaming_processor_single_chunk() {
        let temp = temp_with(b"Short content\n");

        // Exactly one chunk, starting at the beginning of the file.
        assert_eq!(chunk_offsets(1024 * 1024, &temp), vec![0]);
    }

    #[test]
    fn test_streaming_processor_binary_file() {
        let temp = temp_with(&[0x00, 0x01, 0x02]);

        assert_eq!(chunk_offsets(256, &temp).len(), 0);
    }

    #[test]
    fn test_streaming_processor_estimate_tokens() {
        let temp = temp_with("x".repeat(1000).as_bytes());
        let processor = StreamingProcessor::new(256);

        let estimate = processor
            .estimate_tokens(temp.path(), TokenModel::Claude)
            .unwrap();

        assert!(estimate > 0);
        assert!(estimate < 500);
    }

    #[test]
    fn test_streaming_processor_invalid_utf8() {
        let mut bytes = b"Hello ".to_vec();
        bytes.extend_from_slice(&[0xFF, 0xFE]);
        bytes.extend_from_slice(b" World");
        let temp = temp_with(&bytes);

        let processor = StreamingProcessor::new(256);
        let result = processor.process_file(temp.path(), |_, _, _| {});

        assert!(result.is_err());
    }
}