1use memmap2::{Mmap, MmapOptions};
7use rayon::prelude::*;
8use std::fs::File;
9use std::io;
10use std::path::Path;
11use std::sync::atomic::{AtomicU64, Ordering};
12
13use crate::tokenizer::{TokenCounts, TokenModel, Tokenizer};
14
15pub struct MappedFile {
17 mmap: Mmap,
18 path: String,
19}
20
21impl MappedFile {
22 #[allow(unsafe_code)]
24 pub fn open(path: &Path) -> io::Result<Self> {
25 let file = File::open(path)?;
26 let mmap = unsafe { MmapOptions::new().map(&file)? };
28
29 Ok(Self { mmap, path: path.to_string_lossy().to_string() })
30 }
31
32 #[inline]
34 pub fn as_bytes(&self) -> &[u8] {
35 &self.mmap
36 }
37
38 pub fn as_str(&self) -> Option<&str> {
40 std::str::from_utf8(&self.mmap).ok()
41 }
42
43 #[inline]
45 pub fn len(&self) -> usize {
46 self.mmap.len()
47 }
48
49 #[inline]
51 pub fn is_empty(&self) -> bool {
52 self.mmap.is_empty()
53 }
54
55 pub fn path(&self) -> &str {
57 &self.path
58 }
59
60 pub fn is_binary(&self) -> bool {
62 let check_len = self.mmap.len().min(8192);
64 let sample = &self.mmap[..check_len];
65
66 if sample.contains(&0) {
68 return true;
69 }
70
71 let non_printable = sample
73 .iter()
74 .filter(|&&b| b < 32 && b != b'\t' && b != b'\n' && b != b'\r')
75 .count();
76
77 non_printable * 10 > check_len
78 }
79
80 pub fn count_lines(&self) -> usize {
82 self.mmap.iter().filter(|&&b| b == b'\n').count()
83 }
84}
85
86pub struct MmapScanner {
88 mmap_threshold: u64,
90 max_file_size: u64,
92 tokenizer: Tokenizer,
94 stats: ScanStats,
96}
97
/// Atomic counters describing the work a scanner has performed.
#[derive(Debug, Default)]
pub struct ScanStats {
    /// Files that were scanned successfully.
    pub files_scanned: AtomicU64,
    /// Total size in bytes of the successfully scanned files.
    pub bytes_read: AtomicU64,
    /// Files skipped because they were classified as binary.
    pub files_skipped_binary: AtomicU64,
    /// Files skipped because they exceeded the size limit.
    pub files_skipped_size: AtomicU64,
    /// Files handled through the memory-mapped path.
    pub mmap_used: AtomicU64,
    /// Files handled through the regular read path.
    pub regular_read_used: AtomicU64,
}

impl ScanStats {
    /// Renders every counter into a single human-readable line.
    pub fn summary(&self) -> String {
        let value = |counter: &AtomicU64| counter.load(Ordering::Relaxed);

        format!(
            "Scanned {} files ({} bytes), skipped {} binary + {} oversized, mmap: {}, regular: {}",
            value(&self.files_scanned),
            value(&self.bytes_read),
            value(&self.files_skipped_binary),
            value(&self.files_skipped_size),
            value(&self.mmap_used),
            value(&self.regular_read_used),
        )
    }
}
122
123#[derive(Debug)]
125pub struct ScannedFile {
126 pub path: String,
127 pub relative_path: String,
128 pub size_bytes: u64,
129 pub lines: usize,
130 pub token_counts: TokenCounts,
131 pub language: Option<String>,
132 pub content: Option<String>,
133 pub is_binary: bool,
134}
135
136impl MmapScanner {
137 pub fn new() -> Self {
139 Self {
140 mmap_threshold: 64 * 1024, max_file_size: 50 * 1024 * 1024, tokenizer: Tokenizer::new(),
143 stats: ScanStats::default(),
144 }
145 }
146
147 pub fn with_mmap_threshold(mut self, bytes: u64) -> Self {
149 self.mmap_threshold = bytes;
150 self
151 }
152
153 pub fn with_max_file_size(mut self, bytes: u64) -> Self {
155 self.max_file_size = bytes;
156 self
157 }
158
159 pub fn scan_file(&self, path: &Path, base_path: &Path) -> io::Result<Option<ScannedFile>> {
161 let metadata = path.metadata()?;
162 let size = metadata.len();
163
164 if size > self.max_file_size {
166 self.stats
167 .files_skipped_size
168 .fetch_add(1, Ordering::Relaxed);
169 return Ok(None);
170 }
171
172 let relative_path = path
173 .strip_prefix(base_path)
174 .unwrap_or(path)
175 .to_string_lossy()
176 .to_string();
177
178 let (content_bytes, _use_mmap) = if size >= self.mmap_threshold {
180 self.stats.mmap_used.fetch_add(1, Ordering::Relaxed);
181 let mapped = MappedFile::open(path)?;
182
183 if mapped.is_binary() {
185 self.stats
186 .files_skipped_binary
187 .fetch_add(1, Ordering::Relaxed);
188 return Ok(None);
189 }
190
191 (mapped.as_bytes().to_vec(), true)
192 } else {
193 self.stats.regular_read_used.fetch_add(1, Ordering::Relaxed);
194 let content = std::fs::read(path)?;
195
196 if is_binary_content(&content) {
198 self.stats
199 .files_skipped_binary
200 .fetch_add(1, Ordering::Relaxed);
201 return Ok(None);
202 }
203
204 (content, false)
205 };
206
207 let content_str = match String::from_utf8(content_bytes) {
209 Ok(s) => s,
210 Err(_) => {
211 self.stats
212 .files_skipped_binary
213 .fetch_add(1, Ordering::Relaxed);
214 return Ok(None);
215 },
216 };
217
218 let token_counts = self.tokenizer.count_all(&content_str);
220
221 let lines = content_str.lines().count();
223
224 let language = detect_language(path);
226
227 self.stats.files_scanned.fetch_add(1, Ordering::Relaxed);
228 self.stats.bytes_read.fetch_add(size, Ordering::Relaxed);
229
230 Ok(Some(ScannedFile {
231 path: path.to_string_lossy().to_string(),
232 relative_path,
233 size_bytes: size,
234 lines,
235 token_counts,
236 language,
237 content: Some(content_str),
238 is_binary: false,
239 }))
240 }
241
242 pub fn scan_files_parallel(&self, paths: &[&Path], base_path: &Path) -> Vec<ScannedFile> {
244 paths
245 .par_iter()
246 .filter_map(|path| match self.scan_file(path, base_path) {
247 Ok(Some(file)) => Some(file),
248 Ok(None) => None,
249 Err(e) => {
250 tracing::debug!("Error scanning {:?}: {}", path, e);
251 None
252 },
253 })
254 .collect()
255 }
256
257 pub fn stats(&self) -> &ScanStats {
259 &self.stats
260 }
261
262 pub fn reset_stats(&self) {
264 self.stats.files_scanned.store(0, Ordering::Relaxed);
265 self.stats.bytes_read.store(0, Ordering::Relaxed);
266 self.stats.files_skipped_binary.store(0, Ordering::Relaxed);
267 self.stats.files_skipped_size.store(0, Ordering::Relaxed);
268 self.stats.mmap_used.store(0, Ordering::Relaxed);
269 self.stats.regular_read_used.store(0, Ordering::Relaxed);
270 }
271}
272
273impl Default for MmapScanner {
274 fn default() -> Self {
275 Self::new()
276 }
277}
278
/// Heuristically decides whether `content` is binary data.
///
/// Only the first 8192 bytes are inspected. Any NUL byte marks the sample as binary;
/// otherwise it is binary when more than 10% of the sampled bytes are control characters
/// (below 0x20) other than tab, line feed and carriage return.
fn is_binary_content(content: &[u8]) -> bool {
    const SAMPLE_LIMIT: usize = 8192;

    let sample = content.get(..SAMPLE_LIMIT).unwrap_or(content);
    let mut control_bytes = 0usize;

    for &byte in sample {
        match byte {
            0 => return true,
            b'\t' | b'\n' | b'\r' => {},
            1..=31 => control_bytes += 1,
            _ => {},
        }
    }

    control_bytes * 10 > sample.len()
}
295
/// Maps a file's extension (case-insensitively) to a language name.
///
/// Returns `None` when the path has no extension, the extension is not valid UTF-8, or the
/// extension is not in the table below.
fn detect_language(path: &Path) -> Option<String> {
    /// Recognised extensions, grouped by the language name they map to.
    const LANGUAGES: &[(&[&str], &str)] = &[
        (&["py", "pyw", "pyi"], "python"),
        (&["js", "mjs", "cjs"], "javascript"),
        (&["jsx"], "jsx"),
        (&["ts", "mts", "cts"], "typescript"),
        (&["tsx"], "tsx"),
        (&["rs"], "rust"),
        (&["go"], "go"),
        (&["java"], "java"),
        (&["c", "h"], "c"),
        (&["cpp", "hpp", "cc", "cxx"], "cpp"),
        (&["cs"], "csharp"),
        (&["rb"], "ruby"),
        (&["php"], "php"),
        (&["swift"], "swift"),
        (&["kt", "kts"], "kotlin"),
        (&["scala"], "scala"),
        (&["sh", "bash"], "bash"),
        (&["lua"], "lua"),
        (&["zig"], "zig"),
        (&["md", "markdown"], "markdown"),
        (&["json"], "json"),
        (&["yaml", "yml"], "yaml"),
        (&["toml"], "toml"),
        (&["xml"], "xml"),
        (&["html", "htm"], "html"),
        (&["css"], "css"),
        (&["scss", "sass"], "scss"),
        (&["sql"], "sql"),
    ];

    let extension = path.extension().and_then(|raw| raw.to_str())?.to_lowercase();

    LANGUAGES
        .iter()
        .find(|(extensions, _)| extensions.contains(&extension.as_str()))
        .map(|&(_, language)| language.to_owned())
}
334
335pub struct StreamingProcessor {
337 chunk_size: usize,
338 tokenizer: Tokenizer,
339}
340
341impl StreamingProcessor {
342 pub fn new(chunk_size: usize) -> Self {
344 Self { chunk_size, tokenizer: Tokenizer::new() }
345 }
346
347 pub fn process_file<F>(&self, path: &Path, mut callback: F) -> io::Result<()>
349 where
350 F: FnMut(&str, usize, TokenCounts),
351 {
352 let mapped = MappedFile::open(path)?;
353
354 if mapped.is_binary() {
355 return Ok(());
356 }
357
358 let content = mapped
359 .as_str()
360 .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Invalid UTF-8"))?;
361
362 let mut offset = 0;
363 while offset < content.len() {
364 let end = (offset + self.chunk_size).min(content.len());
365
366 let chunk_end = if end < content.len() {
368 content[offset..end]
369 .rfind('\n')
370 .map(|i| offset + i + 1)
371 .unwrap_or(end)
372 } else {
373 end
374 };
375
376 let chunk = &content[offset..chunk_end];
377 let tokens = self.tokenizer.count_all(chunk);
378
379 callback(chunk, offset, tokens);
380
381 offset = chunk_end;
382 }
383
384 Ok(())
385 }
386
387 pub fn estimate_tokens(&self, path: &Path, model: TokenModel) -> io::Result<u32> {
389 let metadata = path.metadata()?;
390 let size = metadata.len();
391
392 let chars_per_token = model.chars_per_token();
394 Ok((size as f32 / chars_per_token).ceil() as u32)
395 }
396}
397
#[cfg(test)]
// Test fixtures favour brevity, so plain `.to_string()` on string slices is accepted here.
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::{tempdir, NamedTempFile};

    /// Creates a temporary file, optionally with `suffix`, containing exactly `contents`.
    fn temp_file(suffix: Option<&str>, contents: &[u8]) -> NamedTempFile {
        let created = match suffix {
            Some(ext) => NamedTempFile::with_suffix(ext),
            None => NamedTempFile::new(),
        };
        let mut file = created.unwrap();
        file.write_all(contents).unwrap();
        file
    }

    /// Maps `file`, panicking on failure.
    fn map(file: &NamedTempFile) -> MappedFile {
        MappedFile::open(file.path()).unwrap()
    }

    /// Scans `file` relative to its parent directory, panicking on I/O errors.
    fn scan(scanner: &MmapScanner, file: &NamedTempFile) -> Option<ScannedFile> {
        let parent = file.path().parent().unwrap();
        scanner.scan_file(file.path(), parent).unwrap()
    }

    /// Reads the current value of a stats counter.
    fn read(counter: &AtomicU64) -> u64 {
        counter.load(Ordering::Relaxed)
    }

    /// Runs language detection on a bare file name.
    fn language_of(file_name: &str) -> Option<String> {
        detect_language(Path::new(file_name))
    }

    /// Streams `file` through `processor` and returns how many chunks were delivered.
    fn count_chunks(processor: &StreamingProcessor, file: &NamedTempFile) -> usize {
        let mut chunks = 0;
        processor
            .process_file(file.path(), |_chunk, _offset, _tokens| chunks += 1)
            .unwrap();
        chunks
    }

    #[test]
    fn test_mapped_file() {
        let temp = temp_file(None, b"Hello, World!\nSecond line\n");
        let mapped = map(&temp);

        assert!(!mapped.is_empty());
        assert!(!mapped.is_binary());
        assert_eq!(mapped.count_lines(), 2);
    }

    #[test]
    fn test_mapped_file_as_str() {
        let temp = temp_file(None, b"Valid UTF-8 content\n");
        let mapped = map(&temp);

        let text = mapped.as_str().expect("content should be valid UTF-8");
        assert!(text.contains("Valid UTF-8"));
    }

    #[test]
    fn test_mapped_file_len_and_path() {
        let temp = temp_file(None, b"Test content\n");
        let mapped = map(&temp);
        let file_name = temp.path().file_name().unwrap().to_str().unwrap();

        assert!(mapped.len() > 0);
        assert!(!mapped.path().is_empty());
        assert!(mapped.path().contains(file_name));
    }

    #[test]
    fn test_mapped_file_as_bytes() {
        let temp = temp_file(None, b"Raw bytes");
        let mapped = map(&temp);

        assert!(mapped.as_bytes().starts_with(b"Raw bytes"));
    }

    #[test]
    fn test_mapped_file_empty() {
        let temp = temp_file(None, b"");
        let mapped = map(&temp);

        assert!(mapped.is_empty());
        assert_eq!(mapped.len(), 0);
        assert_eq!(mapped.count_lines(), 0);
    }

    #[test]
    fn test_mapped_file_invalid_utf8() {
        let temp = temp_file(None, &[0xFF, 0xFE, 0x41, 0x42]);

        assert!(map(&temp).as_str().is_none());
    }

    #[test]
    fn test_binary_detection() {
        let temp = temp_file(None, &[0x00, 0x01, 0x02, 0x03]);

        assert!(map(&temp).is_binary());
    }

    #[test]
    fn test_binary_detection_high_non_printable() {
        // No NUL bytes, but the vast majority of the sample is control characters.
        let mut content = vec![0x01u8; 100];
        content.extend_from_slice(b"some text");
        let temp = temp_file(None, &content);

        assert!(map(&temp).is_binary());
    }

    #[test]
    fn test_binary_detection_text_with_tabs() {
        let temp = temp_file(None, b"Line 1\twith\ttabs\nLine 2\twith\ttabs\n");

        assert!(!map(&temp).is_binary());
    }

    #[test]
    fn test_scanner() {
        let temp = temp_file(Some(".py"), b"def hello():\n print('hello')\n");
        let scanner = MmapScanner::new();

        let file = scan(&scanner, &temp).expect("python file should be scanned");

        assert_eq!(file.language, Some("python".to_string()));
        assert!(file.token_counts.claude > 0);
    }

    #[test]
    fn test_scanner_default() {
        let scanner = MmapScanner::default();

        assert_eq!(scanner.mmap_threshold, 64 * 1024);
        assert_eq!(scanner.max_file_size, 50 * 1024 * 1024);
    }

    #[test]
    fn test_scanner_with_thresholds() {
        let scanner = MmapScanner::new()
            .with_mmap_threshold(1024)
            .with_max_file_size(1024 * 1024);

        assert_eq!(scanner.mmap_threshold, 1024);
        assert_eq!(scanner.max_file_size, 1024 * 1024);
    }

    #[test]
    fn test_scanner_skips_large_files() {
        let temp = temp_file(None, b"Small content\n");
        // A five-byte limit makes even this tiny file "oversized".
        let scanner = MmapScanner::new().with_max_file_size(5);

        assert!(scan(&scanner, &temp).is_none());
        assert_eq!(read(&scanner.stats().files_skipped_size), 1);
    }

    #[test]
    fn test_scanner_skips_binary_files() {
        let temp = temp_file(None, &[0x00, 0x01, 0x02, 0x03]);
        let scanner = MmapScanner::new();

        assert!(scan(&scanner, &temp).is_none());
        assert_eq!(read(&scanner.stats().files_skipped_binary), 1);
    }

    #[test]
    fn test_scanner_uses_mmap_for_large_files() {
        let content = "fn test() {}\n".repeat(10000);
        let temp = temp_file(Some(".rs"), content.as_bytes());
        // Low threshold so the file is guaranteed to take the mmap path.
        let scanner = MmapScanner::new().with_mmap_threshold(1024);

        assert!(scan(&scanner, &temp).is_some());
        assert!(read(&scanner.stats().mmap_used) >= 1);
    }

    #[test]
    fn test_scanner_uses_regular_read_for_small_files() {
        let temp = temp_file(Some(".py"), b"x = 1\n");
        // High threshold so the file is guaranteed to take the regular read path.
        let scanner = MmapScanner::new().with_mmap_threshold(1024 * 1024);

        assert!(scan(&scanner, &temp).is_some());
        assert_eq!(read(&scanner.stats().regular_read_used), 1);
    }

    #[test]
    fn test_scanner_reset_stats() {
        let temp = temp_file(Some(".py"), b"x = 1\n");
        let scanner = MmapScanner::new();
        scan(&scanner, &temp);

        assert!(read(&scanner.stats().files_scanned) >= 1);

        scanner.reset_stats();

        let stats = scanner.stats();
        assert_eq!(read(&stats.files_scanned), 0);
        assert_eq!(read(&stats.bytes_read), 0);
        assert_eq!(read(&stats.files_skipped_binary), 0);
        assert_eq!(read(&stats.files_skipped_size), 0);
        assert_eq!(read(&stats.mmap_used), 0);
        assert_eq!(read(&stats.regular_read_used), 0);
    }

    #[test]
    fn test_scan_stats_summary() {
        let stats = ScanStats::default();
        stats.files_scanned.store(10, Ordering::Relaxed);
        stats.bytes_read.store(5000, Ordering::Relaxed);
        stats.files_skipped_binary.store(2, Ordering::Relaxed);
        stats.files_skipped_size.store(1, Ordering::Relaxed);
        stats.mmap_used.store(5, Ordering::Relaxed);
        stats.regular_read_used.store(5, Ordering::Relaxed);

        let summary = stats.summary();

        let expected_fragments = [
            "10 files",
            "5000 bytes",
            "2 binary",
            "1 oversized",
            "mmap: 5",
            "regular: 5",
        ];
        for fragment in expected_fragments.iter() {
            assert!(summary.contains(fragment), "missing {:?} in {:?}", fragment, summary);
        }
    }

    #[test]
    fn test_scan_files_parallel() {
        let dir = tempdir().unwrap();
        let python = dir.path().join("test1.py");
        let rust = dir.path().join("test2.rs");
        let binary = dir.path().join("binary.bin");

        std::fs::write(&python, "def foo(): pass\n").unwrap();
        std::fs::write(&rust, "fn main() {}\n").unwrap();
        std::fs::write(&binary, b"\x00\x01\x02").unwrap();

        let scanner = MmapScanner::new();
        let paths: Vec<&Path> = vec![python.as_path(), rust.as_path(), binary.as_path()];
        let results = scanner.scan_files_parallel(&paths, dir.path());

        // The binary file is filtered out.
        assert_eq!(results.len(), 2);
        for expected in ["python", "rust"].iter() {
            assert!(results
                .iter()
                .any(|file| file.language == Some(expected.to_string())));
        }
    }

    #[test]
    fn test_scan_files_parallel_with_errors() {
        let dir = tempdir().unwrap();
        let existing = dir.path().join("test.py");
        std::fs::write(&existing, "x = 1\n").unwrap();

        let scanner = MmapScanner::new();
        let missing = Path::new("/nonexistent/file.py");
        let paths: Vec<&Path> = vec![existing.as_path(), missing];
        let results = scanner.scan_files_parallel(&paths, dir.path());

        // The missing file produces an error that is logged and dropped.
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_detect_language() {
        assert_eq!(language_of("test.py"), Some("python".to_string()));
        assert_eq!(language_of("test.rs"), Some("rust".to_string()));
        assert_eq!(language_of("test.ts"), Some("typescript".to_string()));
        assert_eq!(language_of("test.unknown"), None);
    }

    #[test]
    fn test_detect_language_all_extensions() {
        const CASES: &[(&str, &str)] = &[
            ("test.py", "python"),
            ("test.pyw", "python"),
            ("test.pyi", "python"),
            ("test.js", "javascript"),
            ("test.mjs", "javascript"),
            ("test.cjs", "javascript"),
            ("test.jsx", "jsx"),
            ("test.ts", "typescript"),
            ("test.mts", "typescript"),
            ("test.cts", "typescript"),
            ("test.tsx", "tsx"),
            ("test.rs", "rust"),
            ("test.go", "go"),
            ("test.c", "c"),
            ("test.h", "c"),
            ("test.cpp", "cpp"),
            ("test.hpp", "cpp"),
            ("test.cc", "cpp"),
            ("test.cxx", "cpp"),
            ("test.zig", "zig"),
            ("test.java", "java"),
            ("test.kt", "kotlin"),
            ("test.kts", "kotlin"),
            ("test.scala", "scala"),
            ("test.cs", "csharp"),
            ("test.rb", "ruby"),
            ("test.php", "php"),
            ("test.swift", "swift"),
            ("test.lua", "lua"),
            ("test.sh", "bash"),
            ("test.bash", "bash"),
            ("test.md", "markdown"),
            ("test.markdown", "markdown"),
            ("test.json", "json"),
            ("test.yaml", "yaml"),
            ("test.yml", "yaml"),
            ("test.toml", "toml"),
            ("test.xml", "xml"),
            ("test.html", "html"),
            ("test.htm", "html"),
            ("test.css", "css"),
            ("test.scss", "scss"),
            ("test.sass", "scss"),
            ("test.sql", "sql"),
        ];

        for (file_name, language) in CASES {
            assert_eq!(
                language_of(file_name),
                Some(language.to_string()),
                "unexpected language for {}",
                file_name
            );
        }

        // Files without an extension have no detectable language.
        assert_eq!(language_of("Makefile"), None);
        assert_eq!(language_of("README"), None);
    }

    #[test]
    fn test_detect_language_case_insensitive() {
        assert_eq!(language_of("test.PY"), Some("python".to_string()));
        assert_eq!(language_of("test.RS"), Some("rust".to_string()));
        assert_eq!(language_of("test.Js"), Some("javascript".to_string()));
    }

    #[test]
    fn test_is_binary_content() {
        // Plain text, including tabs and newlines, is not binary.
        assert!(!is_binary_content(b"Hello, world!\n"));
        assert!(!is_binary_content(b"Line 1\nLine 2\nLine 3\n"));
        assert!(!is_binary_content(b"Tab\tseparated\tvalues\n"));

        // Any NUL byte marks the content as binary.
        assert!(is_binary_content(&[0x00, 0x01, 0x02]));
        assert!(is_binary_content(b"text\x00with\x00nulls"));

        let mostly_binary: Vec<u8> = (0u8..100).collect();
        assert!(is_binary_content(&mostly_binary));
    }

    #[test]
    fn test_streaming_processor() {
        let content: String = (0..100)
            .map(|i| format!("Line {}: Some content here\n", i))
            .collect();
        let temp = temp_file(None, content.as_bytes());

        let processor = StreamingProcessor::new(256);

        assert!(count_chunks(&processor, &temp) > 1);
    }

    #[test]
    fn test_streaming_processor_single_chunk() {
        let temp = temp_file(None, b"Short content\n");
        // The chunk size dwarfs the file, so everything arrives in one chunk.
        let processor = StreamingProcessor::new(1024 * 1024);

        let mut offsets = Vec::new();
        processor
            .process_file(temp.path(), |_chunk, offset, _tokens| offsets.push(offset))
            .unwrap();

        assert_eq!(offsets.len(), 1);
        assert_eq!(offsets[0], 0);
    }

    #[test]
    fn test_streaming_processor_binary_file() {
        let temp = temp_file(None, &[0x00, 0x01, 0x02]);
        let processor = StreamingProcessor::new(256);

        assert_eq!(count_chunks(&processor, &temp), 0);
    }

    #[test]
    fn test_streaming_processor_estimate_tokens() {
        let temp = temp_file(None, "x".repeat(1000).as_bytes());
        let processor = StreamingProcessor::new(256);

        let estimate = processor
            .estimate_tokens(temp.path(), TokenModel::Claude)
            .unwrap();

        assert!(estimate > 0);
        assert!(estimate < 500);
    }

    #[test]
    fn test_scanned_file_struct() {
        let file = ScannedFile {
            path: "/tmp/test.py".to_string(),
            relative_path: "test.py".to_string(),
            size_bytes: 100,
            lines: 10,
            token_counts: TokenCounts::default(),
            language: Some("python".to_string()),
            content: Some("x = 1".to_string()),
            is_binary: false,
        };

        assert_eq!(file.path, "/tmp/test.py");
        assert_eq!(file.relative_path, "test.py");
        assert_eq!(file.size_bytes, 100);
        assert_eq!(file.lines, 10);
        assert!(!file.is_binary);
    }

    #[test]
    fn test_mapped_file_open_nonexistent() {
        assert!(MappedFile::open(Path::new("/nonexistent/file.txt")).is_err());
    }

    #[test]
    fn test_scanner_nonexistent_file() {
        let scanner = MmapScanner::new();
        let outcome =
            scanner.scan_file(Path::new("/nonexistent/file.py"), Path::new("/nonexistent"));

        assert!(outcome.is_err());
    }

    #[test]
    fn test_streaming_processor_invalid_utf8() {
        // Printable ASCII around an invalid UTF-8 sequence: not "binary", but not text either.
        let mut content = b"Hello ".to_vec();
        content.extend_from_slice(&[0xFF, 0xFE]);
        content.extend_from_slice(b" World");
        let temp = temp_file(None, &content);

        let processor = StreamingProcessor::new(256);
        let outcome = processor.process_file(temp.path(), |_, _, _| {});

        assert!(outcome.is_err());
    }
}