1use memmap2::{Mmap, MmapOptions};
7use rayon::prelude::*;
8use std::fs::File;
9use std::io;
10use std::path::Path;
11use std::sync::atomic::{AtomicU64, Ordering};
12
13use crate::tokenizer::{TokenCounts, TokenModel, Tokenizer};
14
15pub struct MappedFile {
17 mmap: Mmap,
18 path: String,
19}
20
21impl MappedFile {
22 #[allow(unsafe_code)]
24 pub fn open(path: &Path) -> io::Result<Self> {
25 let file = File::open(path)?;
26 let mmap = unsafe { MmapOptions::new().map(&file)? };
28
29 Ok(Self { mmap, path: path.to_string_lossy().to_string() })
30 }
31
32 #[inline]
34 pub fn as_bytes(&self) -> &[u8] {
35 &self.mmap
36 }
37
38 pub fn as_str(&self) -> Option<&str> {
40 std::str::from_utf8(&self.mmap).ok()
41 }
42
43 #[inline]
45 pub fn len(&self) -> usize {
46 self.mmap.len()
47 }
48
49 #[inline]
51 pub fn is_empty(&self) -> bool {
52 self.mmap.is_empty()
53 }
54
55 pub fn path(&self) -> &str {
57 &self.path
58 }
59
60 pub fn is_binary(&self) -> bool {
62 let check_len = self.mmap.len().min(8192);
64 let sample = &self.mmap[..check_len];
65
66 if sample.contains(&0) {
68 return true;
69 }
70
71 let non_printable = sample
73 .iter()
74 .filter(|&&b| b < 32 && b != b'\t' && b != b'\n' && b != b'\r')
75 .count();
76
77 non_printable * 10 > check_len
78 }
79
80 pub fn count_lines(&self) -> usize {
82 self.mmap.iter().filter(|&&b| b == b'\n').count()
83 }
84}
85
86pub struct MmapScanner {
88 mmap_threshold: u64,
90 max_file_size: u64,
92 tokenizer: Tokenizer,
94 stats: ScanStats,
96}
97
/// Counters describing what a scan did; every field is an atomic so the
/// counters can be updated through a shared reference.
#[derive(Debug, Default)]
pub struct ScanStats {
    /// Number of files that were scanned successfully.
    pub files_scanned: AtomicU64,
    /// Total size in bytes of the successfully scanned files.
    pub bytes_read: AtomicU64,
    /// Number of files skipped because they looked binary.
    pub files_skipped_binary: AtomicU64,
    /// Number of files skipped because they exceeded the size limit.
    pub files_skipped_size: AtomicU64,
    /// Number of files handled through a memory map.
    pub mmap_used: AtomicU64,
    /// Number of files handled through a regular read.
    pub regular_read_used: AtomicU64,
}

impl ScanStats {
    /// Renders all counters as a single human-readable line.
    ///
    /// Each counter is loaded with relaxed ordering, so the line is not an
    /// atomic snapshot if other threads are updating the counters.
    pub fn summary(&self) -> String {
        let read = |counter: &AtomicU64| counter.load(Ordering::Relaxed);

        let scanned = read(&self.files_scanned);
        let bytes = read(&self.bytes_read);
        let binary = read(&self.files_skipped_binary);
        let oversized = read(&self.files_skipped_size);
        let mapped = read(&self.mmap_used);
        let regular = read(&self.regular_read_used);

        format!(
            "Scanned {} files ({} bytes), skipped {} binary + {} oversized, mmap: {}, regular: {}",
            scanned, bytes, binary, oversized, mapped, regular,
        )
    }
}
122
123#[derive(Debug)]
125pub struct ScannedFile {
126 pub path: String,
127 pub relative_path: String,
128 pub size_bytes: u64,
129 pub lines: usize,
130 pub token_counts: TokenCounts,
131 pub language: Option<String>,
132 pub content: Option<String>,
133 pub is_binary: bool,
134}
135
136impl MmapScanner {
137 pub fn new() -> Self {
139 Self {
140 mmap_threshold: 64 * 1024, max_file_size: 50 * 1024 * 1024, tokenizer: Tokenizer::new(),
143 stats: ScanStats::default(),
144 }
145 }
146
147 pub fn with_mmap_threshold(mut self, bytes: u64) -> Self {
149 self.mmap_threshold = bytes;
150 self
151 }
152
153 pub fn with_max_file_size(mut self, bytes: u64) -> Self {
155 self.max_file_size = bytes;
156 self
157 }
158
159 pub fn scan_file(&self, path: &Path, base_path: &Path) -> io::Result<Option<ScannedFile>> {
161 let metadata = path.metadata()?;
162 let size = metadata.len();
163
164 if size > self.max_file_size {
166 self.stats
167 .files_skipped_size
168 .fetch_add(1, Ordering::Relaxed);
169 return Ok(None);
170 }
171
172 let relative_path = path
173 .strip_prefix(base_path)
174 .unwrap_or(path)
175 .to_string_lossy()
176 .to_string();
177
178 let (content_bytes, _use_mmap) = if size >= self.mmap_threshold {
180 self.stats.mmap_used.fetch_add(1, Ordering::Relaxed);
181 let mapped = MappedFile::open(path)?;
182
183 if mapped.is_binary() {
185 self.stats
186 .files_skipped_binary
187 .fetch_add(1, Ordering::Relaxed);
188 return Ok(None);
189 }
190
191 (mapped.as_bytes().to_vec(), true)
192 } else {
193 self.stats.regular_read_used.fetch_add(1, Ordering::Relaxed);
194 let content = std::fs::read(path)?;
195
196 if is_binary_content(&content) {
198 self.stats
199 .files_skipped_binary
200 .fetch_add(1, Ordering::Relaxed);
201 return Ok(None);
202 }
203
204 (content, false)
205 };
206
207 let content_str = match String::from_utf8(content_bytes) {
209 Ok(s) => s,
210 Err(_) => {
211 self.stats
212 .files_skipped_binary
213 .fetch_add(1, Ordering::Relaxed);
214 return Ok(None);
215 },
216 };
217
218 let token_counts = self.tokenizer.count_all(&content_str);
220
221 let lines = content_str.lines().count();
223
224 let language = detect_language(path);
226
227 self.stats.files_scanned.fetch_add(1, Ordering::Relaxed);
228 self.stats.bytes_read.fetch_add(size, Ordering::Relaxed);
229
230 Ok(Some(ScannedFile {
231 path: path.to_string_lossy().to_string(),
232 relative_path,
233 size_bytes: size,
234 lines,
235 token_counts,
236 language,
237 content: Some(content_str),
238 is_binary: false,
239 }))
240 }
241
242 pub fn scan_files_parallel(&self, paths: &[&Path], base_path: &Path) -> Vec<ScannedFile> {
244 paths
245 .par_iter()
246 .filter_map(|path| match self.scan_file(path, base_path) {
247 Ok(Some(file)) => Some(file),
248 Ok(None) => None,
249 Err(e) => {
250 tracing::debug!("Error scanning {:?}: {}", path, e);
251 None
252 },
253 })
254 .collect()
255 }
256
257 pub fn stats(&self) -> &ScanStats {
259 &self.stats
260 }
261
262 pub fn reset_stats(&self) {
264 self.stats.files_scanned.store(0, Ordering::Relaxed);
265 self.stats.bytes_read.store(0, Ordering::Relaxed);
266 self.stats.files_skipped_binary.store(0, Ordering::Relaxed);
267 self.stats.files_skipped_size.store(0, Ordering::Relaxed);
268 self.stats.mmap_used.store(0, Ordering::Relaxed);
269 self.stats.regular_read_used.store(0, Ordering::Relaxed);
270 }
271}
272
273impl Default for MmapScanner {
274 fn default() -> Self {
275 Self::new()
276 }
277}
278
/// Heuristically decides whether `content` is binary data.
///
/// Only the first 8192 bytes are inspected. The content is treated as binary
/// when that sample contains a NUL byte, or when more than 10% of it is made
/// of control bytes (below 32) other than tab, line feed and carriage return.
fn is_binary_content(content: &[u8]) -> bool {
    const SAMPLE_LIMIT: usize = 8192;

    let sample = content.get(..SAMPLE_LIMIT).unwrap_or(content);

    let mut control_bytes = 0usize;
    for &byte in sample {
        match byte {
            // A single NUL anywhere in the sample settles it.
            0 => return true,
            b'\t' | b'\n' | b'\r' => {}
            1..=31 => control_bytes += 1,
            _ => {}
        }
    }

    control_bytes * 10 > sample.len()
}
295
/// Maps a path's file extension to a language name.
///
/// The extension is lower-cased before the lookup. Returns `None` when the
/// path has no extension, the extension is not valid UTF-8, or it is not in
/// the table below.
fn detect_language(path: &Path) -> Option<String> {
    /// Each entry pairs the extensions of one language with its name.
    const EXTENSION_TABLE: &[(&[&str], &str)] = &[
        (&["py", "pyw", "pyi"], "python"),
        (&["js", "mjs", "cjs"], "javascript"),
        (&["jsx"], "jsx"),
        (&["ts", "mts", "cts"], "typescript"),
        (&["tsx"], "tsx"),
        (&["rs"], "rust"),
        (&["go"], "go"),
        (&["java"], "java"),
        (&["c", "h"], "c"),
        (&["cpp", "hpp", "cc", "cxx"], "cpp"),
        (&["cs"], "csharp"),
        (&["rb"], "ruby"),
        (&["php"], "php"),
        (&["swift"], "swift"),
        (&["kt", "kts"], "kotlin"),
        (&["scala"], "scala"),
        (&["sh", "bash"], "bash"),
        (&["lua"], "lua"),
        (&["zig"], "zig"),
        (&["md", "markdown"], "markdown"),
        (&["json"], "json"),
        (&["yaml", "yml"], "yaml"),
        (&["toml"], "toml"),
        (&["xml"], "xml"),
        (&["html", "htm"], "html"),
        (&["css"], "css"),
        (&["scss", "sass"], "scss"),
        (&["sql"], "sql"),
    ];

    let extension = path
        .extension()
        .and_then(|raw| raw.to_str())?
        .to_lowercase();

    EXTENSION_TABLE
        .iter()
        .find(|(extensions, _)| {
            extensions
                .iter()
                .any(|&candidate| candidate == extension.as_str())
        })
        .map(|&(_, language)| language.to_owned())
}
334
335pub struct StreamingProcessor {
337 chunk_size: usize,
338 tokenizer: Tokenizer,
339}
340
341impl StreamingProcessor {
342 pub fn new(chunk_size: usize) -> Self {
344 Self { chunk_size, tokenizer: Tokenizer::new() }
345 }
346
347 pub fn process_file<F>(&self, path: &Path, mut callback: F) -> io::Result<()>
349 where
350 F: FnMut(&str, usize, TokenCounts),
351 {
352 let mapped = MappedFile::open(path)?;
353
354 if mapped.is_binary() {
355 return Ok(());
356 }
357
358 let content = mapped
359 .as_str()
360 .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Invalid UTF-8"))?;
361
362 let mut offset = 0;
363 while offset < content.len() {
364 let end = (offset + self.chunk_size).min(content.len());
365
366 let chunk_end = if end < content.len() {
368 content[offset..end]
369 .rfind('\n')
370 .map(|i| offset + i + 1)
371 .unwrap_or(end)
372 } else {
373 end
374 };
375
376 let chunk = &content[offset..chunk_end];
377 let tokens = self.tokenizer.count_all(chunk);
378
379 callback(chunk, offset, tokens);
380
381 offset = chunk_end;
382 }
383
384 Ok(())
385 }
386
387 pub fn estimate_tokens(&self, path: &Path, model: TokenModel) -> io::Result<u32> {
389 let metadata = path.metadata()?;
390 let size = metadata.len();
391
392 let chars_per_token = model.chars_per_token();
394 Ok((size as f32 / chars_per_token).ceil() as u32)
395 }
396}
397
#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::{tempdir, NamedTempFile};

    /// Creates a temp file that holds exactly `bytes`.
    fn temp_with(bytes: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(bytes).unwrap();
        file
    }

    /// Creates a temp file with the given name suffix that holds `bytes`.
    fn temp_with_suffix(suffix: &str, bytes: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::with_suffix(suffix).unwrap();
        file.write_all(bytes).unwrap();
        file
    }

    /// Writes `bytes` to a temp file and maps it; the temp file is returned
    /// too so that it outlives the mapping.
    fn map_bytes(bytes: &[u8]) -> (NamedTempFile, MappedFile) {
        let file = temp_with(bytes);
        let mapped = MappedFile::open(file.path()).unwrap();
        (file, mapped)
    }

    /// Scans `file` using its parent directory as the base path.
    fn scan_in_parent(scanner: &MmapScanner, file: &NamedTempFile) -> Option<ScannedFile> {
        scanner
            .scan_file(file.path(), file.path().parent().unwrap())
            .unwrap()
    }

    /// Reads a statistics counter.
    fn load(counter: &AtomicU64) -> u64 {
        counter.load(Ordering::Relaxed)
    }

    /// Detects the language for a bare file name.
    fn language_of(file_name: &str) -> Option<String> {
        detect_language(Path::new(file_name))
    }

    /// Runs the streaming processor over `file` and returns each chunk's offset.
    fn chunk_offsets(processor: &StreamingProcessor, file: &NamedTempFile) -> Vec<usize> {
        let mut offsets = Vec::new();
        processor
            .process_file(file.path(), |_chunk, offset, _tokens| offsets.push(offset))
            .unwrap();
        offsets
    }

    #[test]
    fn test_mapped_file() {
        let (_guard, mapped) = map_bytes(b"Hello, World!\nSecond line\n");

        assert!(!mapped.is_empty());
        assert!(!mapped.is_binary());
        assert_eq!(mapped.count_lines(), 2);
    }

    #[test]
    fn test_mapped_file_as_str() {
        let (_guard, mapped) = map_bytes(b"Valid UTF-8 content\n");

        let text = mapped.as_str();
        assert!(text.is_some());
        assert!(text.unwrap().contains("Valid UTF-8"));
    }

    #[test]
    fn test_mapped_file_len_and_path() {
        let (temp, mapped) = map_bytes(b"Test content\n");
        let file_name = temp.path().file_name().unwrap().to_str().unwrap();

        assert!(mapped.len() > 0);
        assert!(!mapped.path().is_empty());
        assert!(mapped.path().contains(file_name));
    }

    #[test]
    fn test_mapped_file_as_bytes() {
        let (_guard, mapped) = map_bytes(b"Raw bytes");

        assert_eq!(&mapped.as_bytes()[..9], b"Raw bytes");
    }

    #[test]
    fn test_mapped_file_empty() {
        let (_guard, mapped) = map_bytes(b"");

        assert!(mapped.is_empty());
        assert_eq!(mapped.len(), 0);
        assert_eq!(mapped.count_lines(), 0);
    }

    #[test]
    fn test_mapped_file_invalid_utf8() {
        let (_guard, mapped) = map_bytes(&[0xFF, 0xFE, 0x41, 0x42]);

        assert!(mapped.as_str().is_none());
    }

    #[test]
    fn test_binary_detection() {
        let (_guard, mapped) = map_bytes(&[0x00, 0x01, 0x02, 0x03]);

        assert!(mapped.is_binary());
    }

    #[test]
    fn test_binary_detection_high_non_printable() {
        let mut payload = vec![0x01u8; 100];
        payload.extend(b"some text");
        let (_guard, mapped) = map_bytes(&payload);

        assert!(mapped.is_binary());
    }

    #[test]
    fn test_binary_detection_text_with_tabs() {
        let (_guard, mapped) = map_bytes(b"Line 1\twith\ttabs\nLine 2\twith\ttabs\n");

        assert!(!mapped.is_binary());
    }

    #[test]
    fn test_scanner() {
        let temp = temp_with_suffix(".py", b"def hello():\n print('hello')\n");
        let scanner = MmapScanner::new();

        let scanned = scan_in_parent(&scanner, &temp);

        assert!(scanned.is_some());
        let file = scanned.unwrap();
        assert_eq!(file.language, Some("python".to_string()));
        assert!(file.token_counts.claude > 0);
    }

    #[test]
    fn test_scanner_default() {
        let scanner = MmapScanner::default();

        assert_eq!(scanner.mmap_threshold, 64 * 1024);
        assert_eq!(scanner.max_file_size, 50 * 1024 * 1024);
    }

    #[test]
    fn test_scanner_with_thresholds() {
        let scanner = MmapScanner::new()
            .with_mmap_threshold(1024)
            .with_max_file_size(1024 * 1024);

        assert_eq!(scanner.mmap_threshold, 1024);
        assert_eq!(scanner.max_file_size, 1024 * 1024);
    }

    #[test]
    fn test_scanner_skips_large_files() {
        let temp = temp_with(b"Small content\n");
        let scanner = MmapScanner::new().with_max_file_size(5);

        let scanned = scan_in_parent(&scanner, &temp);

        assert!(scanned.is_none());
        assert_eq!(load(&scanner.stats().files_skipped_size), 1);
    }

    #[test]
    fn test_scanner_skips_binary_files() {
        let temp = temp_with(&[0x00, 0x01, 0x02, 0x03]);
        let scanner = MmapScanner::new();

        let scanned = scan_in_parent(&scanner, &temp);

        assert!(scanned.is_none());
        assert_eq!(load(&scanner.stats().files_skipped_binary), 1);
    }

    #[test]
    fn test_scanner_uses_mmap_for_large_files() {
        let source = "fn test() {}\n".repeat(10000);
        let temp = temp_with_suffix(".rs", source.as_bytes());
        let scanner = MmapScanner::new().with_mmap_threshold(1024);

        let scanned = scan_in_parent(&scanner, &temp);

        assert!(scanned.is_some());
        assert!(load(&scanner.stats().mmap_used) >= 1);
    }

    #[test]
    fn test_scanner_uses_regular_read_for_small_files() {
        let temp = temp_with_suffix(".py", b"x = 1\n");
        let scanner = MmapScanner::new().with_mmap_threshold(1024 * 1024);

        let scanned = scan_in_parent(&scanner, &temp);

        assert!(scanned.is_some());
        assert_eq!(load(&scanner.stats().regular_read_used), 1);
    }

    #[test]
    fn test_scanner_reset_stats() {
        let temp = temp_with_suffix(".py", b"x = 1\n");
        let scanner = MmapScanner::new();
        scan_in_parent(&scanner, &temp);

        assert!(load(&scanner.stats().files_scanned) >= 1);

        scanner.reset_stats();

        let stats = scanner.stats();
        assert_eq!(load(&stats.files_scanned), 0);
        assert_eq!(load(&stats.bytes_read), 0);
        assert_eq!(load(&stats.files_skipped_binary), 0);
        assert_eq!(load(&stats.files_skipped_size), 0);
        assert_eq!(load(&stats.mmap_used), 0);
        assert_eq!(load(&stats.regular_read_used), 0);
    }

    #[test]
    fn test_scan_stats_summary() {
        let stats = ScanStats::default();
        stats.files_scanned.store(10, Ordering::Relaxed);
        stats.bytes_read.store(5000, Ordering::Relaxed);
        stats.files_skipped_binary.store(2, Ordering::Relaxed);
        stats.files_skipped_size.store(1, Ordering::Relaxed);
        stats.mmap_used.store(5, Ordering::Relaxed);
        stats.regular_read_used.store(5, Ordering::Relaxed);

        let summary = stats.summary();

        for fragment in [
            "10 files",
            "5000 bytes",
            "2 binary",
            "1 oversized",
            "mmap: 5",
            "regular: 5",
        ] {
            assert!(summary.contains(fragment));
        }
    }

    #[test]
    fn test_scan_files_parallel() {
        let dir = tempdir().unwrap();
        let python_file = dir.path().join("test1.py");
        let rust_file = dir.path().join("test2.rs");
        let binary_file = dir.path().join("binary.bin");

        std::fs::write(&python_file, "def foo(): pass\n").unwrap();
        std::fs::write(&rust_file, "fn main() {}\n").unwrap();
        std::fs::write(&binary_file, [0x00u8, 0x01, 0x02]).unwrap();

        let scanner = MmapScanner::new();
        let paths: Vec<&Path> = vec![
            python_file.as_path(),
            rust_file.as_path(),
            binary_file.as_path(),
        ];
        let results = scanner.scan_files_parallel(&paths, dir.path());

        assert_eq!(results.len(), 2);
        assert!(results.iter().any(|f| f.language.as_deref() == Some("python")));
        assert!(results.iter().any(|f| f.language.as_deref() == Some("rust")));
    }

    #[test]
    fn test_scan_files_parallel_with_errors() {
        let dir = tempdir().unwrap();
        let existing = dir.path().join("test.py");
        std::fs::write(&existing, "x = 1\n").unwrap();

        let scanner = MmapScanner::new();
        let missing = Path::new("/nonexistent/file.py");
        let paths: Vec<&Path> = vec![existing.as_path(), missing];
        let results = scanner.scan_files_parallel(&paths, dir.path());

        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_detect_language() {
        assert_eq!(language_of("test.py"), Some("python".to_string()));
        assert_eq!(language_of("test.rs"), Some("rust".to_string()));
        assert_eq!(language_of("test.ts"), Some("typescript".to_string()));
        assert_eq!(language_of("test.unknown"), None);
    }

    #[test]
    fn test_detect_language_all_extensions() {
        let expectations: &[(&str, Option<&str>)] = &[
            ("test.py", Some("python")),
            ("test.pyw", Some("python")),
            ("test.pyi", Some("python")),
            ("test.js", Some("javascript")),
            ("test.mjs", Some("javascript")),
            ("test.cjs", Some("javascript")),
            ("test.jsx", Some("jsx")),
            ("test.ts", Some("typescript")),
            ("test.mts", Some("typescript")),
            ("test.cts", Some("typescript")),
            ("test.tsx", Some("tsx")),
            ("test.rs", Some("rust")),
            ("test.go", Some("go")),
            ("test.c", Some("c")),
            ("test.h", Some("c")),
            ("test.cpp", Some("cpp")),
            ("test.hpp", Some("cpp")),
            ("test.cc", Some("cpp")),
            ("test.cxx", Some("cpp")),
            ("test.zig", Some("zig")),
            ("test.java", Some("java")),
            ("test.kt", Some("kotlin")),
            ("test.kts", Some("kotlin")),
            ("test.scala", Some("scala")),
            ("test.cs", Some("csharp")),
            ("test.rb", Some("ruby")),
            ("test.php", Some("php")),
            ("test.swift", Some("swift")),
            ("test.lua", Some("lua")),
            ("test.sh", Some("bash")),
            ("test.bash", Some("bash")),
            ("test.md", Some("markdown")),
            ("test.markdown", Some("markdown")),
            ("test.json", Some("json")),
            ("test.yaml", Some("yaml")),
            ("test.yml", Some("yaml")),
            ("test.toml", Some("toml")),
            ("test.xml", Some("xml")),
            ("test.html", Some("html")),
            ("test.htm", Some("html")),
            ("test.css", Some("css")),
            ("test.scss", Some("scss")),
            ("test.sass", Some("scss")),
            ("test.sql", Some("sql")),
            ("Makefile", None),
            ("README", None),
        ];

        for &(file_name, expected) in expectations {
            assert_eq!(
                language_of(file_name).as_deref(),
                expected,
                "unexpected language for {}",
                file_name
            );
        }
    }

    #[test]
    fn test_detect_language_case_insensitive() {
        assert_eq!(language_of("test.PY"), Some("python".to_string()));
        assert_eq!(language_of("test.RS"), Some("rust".to_string()));
        assert_eq!(language_of("test.Js"), Some("javascript".to_string()));
    }

    #[test]
    fn test_is_binary_content() {
        let text_samples: [&[u8]; 3] = [
            b"Hello, world!\n",
            b"Line 1\nLine 2\nLine 3\n",
            b"Tab\tseparated\tvalues\n",
        ];
        for sample in text_samples {
            assert!(!is_binary_content(sample));
        }

        assert!(is_binary_content(&[0x00, 0x01, 0x02]));
        assert!(is_binary_content(b"text\x00with\x00nulls"));

        let mostly_binary: Vec<u8> = (0u8..100).collect();
        assert!(is_binary_content(&mostly_binary));
    }

    #[test]
    fn test_streaming_processor() {
        let body: String = (0..100)
            .map(|i| format!("Line {}: Some content here\n", i))
            .collect();
        let temp = temp_with(body.as_bytes());

        let processor = StreamingProcessor::new(256);
        let offsets = chunk_offsets(&processor, &temp);

        assert!(offsets.len() > 1);
    }

    #[test]
    fn test_streaming_processor_single_chunk() {
        let temp = temp_with(b"Short content\n");

        let processor = StreamingProcessor::new(1024 * 1024);
        let offsets = chunk_offsets(&processor, &temp);

        // Exactly one chunk, starting at the beginning of the file.
        assert_eq!(offsets, vec![0]);
    }

    #[test]
    fn test_streaming_processor_binary_file() {
        let temp = temp_with(&[0x00, 0x01, 0x02]);

        let processor = StreamingProcessor::new(256);
        let offsets = chunk_offsets(&processor, &temp);

        assert_eq!(offsets.len(), 0);
    }

    #[test]
    fn test_streaming_processor_estimate_tokens() {
        let temp = temp_with("x".repeat(1000).as_bytes());

        let processor = StreamingProcessor::new(256);
        let estimate = processor
            .estimate_tokens(temp.path(), TokenModel::Claude)
            .unwrap();

        assert!(estimate > 0);
        assert!(estimate < 500);
    }

    #[test]
    fn test_scanned_file_struct() {
        let file = ScannedFile {
            path: "/tmp/test.py".to_string(),
            relative_path: "test.py".to_string(),
            size_bytes: 100,
            lines: 10,
            token_counts: TokenCounts::default(),
            language: Some("python".to_string()),
            content: Some("x = 1".to_string()),
            is_binary: false,
        };

        assert_eq!(file.path, "/tmp/test.py");
        assert_eq!(file.relative_path, "test.py");
        assert_eq!(file.size_bytes, 100);
        assert_eq!(file.lines, 10);
        assert!(!file.is_binary);
    }

    #[test]
    fn test_mapped_file_open_nonexistent() {
        let outcome = MappedFile::open(Path::new("/nonexistent/file.txt"));

        assert!(outcome.is_err());
    }

    #[test]
    fn test_scanner_nonexistent_file() {
        let scanner = MmapScanner::new();

        let outcome =
            scanner.scan_file(Path::new("/nonexistent/file.py"), Path::new("/nonexistent"));

        assert!(outcome.is_err());
    }

    #[test]
    fn test_streaming_processor_invalid_utf8() {
        let mut payload = b"Hello ".to_vec();
        payload.extend_from_slice(&[0xFF, 0xFE]);
        payload.extend_from_slice(b" World");
        let temp = temp_with(&payload);

        let processor = StreamingProcessor::new(256);
        let outcome = processor.process_file(temp.path(), |_, _, _| {});

        assert!(outcome.is_err());
    }
}