1use crate::{ContentAnalyzer, GitIntegrator, LanguageDetector, MetadataExtractor};
7use scribe_core::{
8 FileInfo, GitFileStatus, GitStatus, Language, RenderDecision, Result, ScribeError,
9};
10
11use std::path::{Path, PathBuf};
12use std::sync::atomic::{AtomicUsize, Ordering};
13use std::sync::Arc;
14use std::time::Instant;
15
16use futures::stream::{self, StreamExt};
17use ignore::{DirEntry as IgnoreDirEntry, WalkBuilder, WalkState};
18use rayon::prelude::*;
19use tokio::sync::{RwLock, Semaphore};
20use walkdir::{DirEntry, WalkDir};
21
/// Asynchronous filesystem scanner that discovers files and turns them into
/// [`FileInfo`] records, tracking run statistics in shared atomic counters.
#[derive(Debug)]
pub struct Scanner {
    /// Shared counters (files processed, errors, …) updated during scans.
    stats: Arc<ScannerStats>,
    /// Concurrency limiter created with a fixed 16 permits in `new`.
    /// NOTE(review): `process_files_parallel` builds its own semaphore from
    /// `ScanOptions::max_concurrency`, so this field appears unused by the
    /// scan path — confirm before removing.
    semaphore: Arc<Semaphore>,
}
28
/// Atomic counters recording scanner activity; safe to read concurrently
/// while a scan is in progress (all access uses `Ordering::Relaxed`).
#[derive(Debug, Default)]
pub struct ScannerStats {
    // Files successfully converted into FileInfo records.
    files_processed: AtomicUsize,
    // Directories visited during filesystem traversal.
    directories_traversed: AtomicUsize,
    // Files skipped because they were classified as likely binary.
    binary_files_skipped: AtomicUsize,
    // Traversal or per-file processing errors (logged, not propagated).
    errors_encountered: AtomicUsize,
}
37
/// Configuration for a [`Scanner::scan`] run; build with the `with_*` methods
/// or start from [`ScanOptions::default`].
#[derive(Debug, Clone)]
pub struct ScanOptions {
    /// Process files concurrently instead of one at a time.
    pub parallel_processing: bool,
    /// Maximum number of files processed concurrently (parallel mode only).
    pub max_concurrency: usize,
    /// Extract filesystem metadata for each file.
    pub metadata_extraction: bool,
    /// Run content analysis on each file.
    pub content_analysis: bool,
    /// Use git for file discovery and per-file status lookup.
    pub git_integration: bool,
    /// Follow symbolic links during the filesystem walk.
    pub follow_symlinks: bool,
    /// Include hidden (dot-prefixed) files and directories.
    pub include_hidden: bool,
    /// Skip files larger than this many bytes (`None` = no size limit).
    pub max_file_size: Option<u64>,
    /// If set, include only files whose extension matches (case-insensitive).
    pub include_extensions: Option<Vec<String>>,
    /// If set, exclude files whose extension matches (takes precedence over
    /// `include_extensions`).
    pub exclude_extensions: Option<Vec<String>>,
}
62
/// Aggregated outcome of a scan: files, statistics snapshot, timing, errors.
/// NOTE(review): [`Scanner::scan`] currently returns `Vec<FileInfo>` directly
/// and never constructs this type — confirm whether it is consumed elsewhere.
#[derive(Debug, Clone)]
pub struct ScanResult {
    /// Files that passed all filters and were processed.
    pub files: Vec<FileInfo>,
    /// Counter snapshot taken at the end of the scan.
    pub stats: ScanProgress,
    /// Wall-clock duration of the scan.
    pub duration: std::time::Duration,
    /// Human-readable descriptions of non-fatal errors encountered.
    pub errors: Vec<String>,
}
71
/// Plain-value snapshot of [`ScannerStats`], suitable for reporting.
#[derive(Debug, Clone)]
pub struct ScanProgress {
    /// Files successfully processed.
    pub files_processed: usize,
    /// Directories visited during traversal.
    pub directories_traversed: usize,
    /// Files skipped as likely binary.
    pub binary_files_skipped: usize,
    /// Non-fatal errors encountered.
    pub errors_encountered: usize,
    /// Total bytes of processed file content.
    pub bytes_processed: u64,
}
81
82impl Default for ScanOptions {
83 fn default() -> Self {
84 Self {
85 parallel_processing: true,
86 max_concurrency: num_cpus::get().min(16), metadata_extraction: true,
88 content_analysis: false,
89 git_integration: false,
90 follow_symlinks: false,
91 include_hidden: false,
92 max_file_size: Some(50 * 1024 * 1024), include_extensions: None,
94 exclude_extensions: None,
95 }
96 }
97}
98
99impl ScanOptions {
100 pub fn with_parallel_processing(mut self, enabled: bool) -> Self {
102 self.parallel_processing = enabled;
103 self
104 }
105
106 pub fn with_max_concurrency(mut self, max: usize) -> Self {
108 self.max_concurrency = max;
109 self
110 }
111
112 pub fn with_metadata_extraction(mut self, enabled: bool) -> Self {
114 self.metadata_extraction = enabled;
115 self
116 }
117
118 pub fn with_content_analysis(mut self, enabled: bool) -> Self {
120 self.content_analysis = enabled;
121 self
122 }
123
124 pub fn with_git_integration(mut self, enabled: bool) -> Self {
126 self.git_integration = enabled;
127 self
128 }
129
130 pub fn with_follow_symlinks(mut self, enabled: bool) -> Self {
132 self.follow_symlinks = enabled;
133 self
134 }
135
136 pub fn with_include_hidden(mut self, enabled: bool) -> Self {
138 self.include_hidden = enabled;
139 self
140 }
141
142 pub fn with_max_file_size(mut self, size: Option<u64>) -> Self {
144 self.max_file_size = size;
145 self
146 }
147
148 pub fn with_include_extensions(mut self, extensions: Vec<String>) -> Self {
150 self.include_extensions = Some(extensions);
151 self
152 }
153
154 pub fn with_exclude_extensions(mut self, extensions: Vec<String>) -> Self {
156 self.exclude_extensions = Some(extensions);
157 self
158 }
159}
160
impl Scanner {
    /// Creates a scanner with fresh (zeroed) statistics counters.
    ///
    /// NOTE(review): the semaphore is hard-coded to 16 permits and is not
    /// derived from `ScanOptions::max_concurrency`; the parallel path builds
    /// its own semaphore, so this one appears unused — confirm.
    pub fn new() -> Self {
        Self {
            stats: Arc::new(ScannerStats::default()),
            semaphore: Arc::new(Semaphore::new(16)),
        }
    }

    /// Scans `path` (which must be an existing directory) and returns one
    /// [`FileInfo`] per included file.
    ///
    /// Discovery prefers `git ls-files` when git integration is enabled and
    /// the repository opens successfully, falling back to a filesystem walk
    /// otherwise. Files are then processed in parallel or sequentially per
    /// `options.parallel_processing`. In parallel mode the result order is
    /// nondeterministic.
    ///
    /// NOTE(review): git-discovered paths bypass `should_include_file`, so
    /// the include/exclude extension filters only apply on the filesystem
    /// walk path — confirm whether that is intended.
    ///
    /// # Errors
    /// Returns a path error if `path` does not exist or is not a directory.
    /// Per-file failures are counted and logged, not propagated.
    pub async fn scan<P: AsRef<Path>>(
        &self,
        path: P,
        options: ScanOptions,
    ) -> Result<Vec<FileInfo>> {
        let start_time = Instant::now();
        let path = path.as_ref();

        // Validate the root up front so callers get a clear error.
        if !path.exists() {
            return Err(ScribeError::path(
                format!("Path does not exist: {}", path.display()),
                path,
            ));
        }

        if !path.is_dir() {
            return Err(ScribeError::path(
                format!("Path is not a directory: {}", path.display()),
                path,
            ));
        }

        // Optional helpers, constructed only when the corresponding option
        // is enabled so disabled features cost nothing.
        let metadata_extractor = if options.metadata_extraction {
            Some(MetadataExtractor::new())
        } else {
            None
        };

        let content_analyzer = if options.content_analysis {
            Some(ContentAnalyzer::new())
        } else {
            None
        };

        // A failure to open the repository silently disables git features.
        let git_integrator = if options.git_integration {
            GitIntegrator::new(path).ok()
        } else {
            None
        };

        let language_detector = LanguageDetector::new();

        // File discovery: git listing when available, filesystem walk as the
        // fallback (and when git integration is off).
        let file_paths = if let Some(ref git) = git_integrator {
            match git.list_tracked_files().await {
                Ok(paths) => {
                    log::debug!(
                        "Using git ls-files for file discovery: {} files",
                        paths.len()
                    );
                    paths
                }
                Err(_) => {
                    log::debug!("Git discovery failed, falling back to filesystem walk");
                    self.discover_files_filesystem(path, &options).await?
                }
            }
        } else {
            self.discover_files_filesystem(path, &options).await?
        };

        log::info!("Discovered {} files for processing", file_paths.len());

        // Warm the git status cache in one batch before per-file lookups;
        // a failure here only degrades status accuracy, so it is non-fatal.
        if let Some(ref git) = git_integrator {
            if let Err(e) = git.load_batch_file_statuses().await {
                log::debug!("Failed to load batch git statuses: {}", e);
            }
        }

        let files = if options.parallel_processing {
            log::debug!(
                "Processing files in parallel with concurrency={}",
                options.max_concurrency
            );
            self.process_files_parallel(
                file_paths,
                &options,
                metadata_extractor.as_ref(),
                content_analyzer.as_ref(),
                git_integrator.as_ref(),
                &language_detector,
            )
            .await?
        } else {
            log::debug!("Processing files sequentially");
            self.process_files_sequential(
                file_paths,
                &options,
                metadata_extractor.as_ref(),
                content_analyzer.as_ref(),
                git_integrator.as_ref(),
                &language_detector,
            )
            .await?
        };

        log::info!(
            "Scanning completed in {:.2}s: {} files processed",
            start_time.elapsed().as_secs_f64(),
            files.len()
        );

        Ok(files)
    }

    /// Walks the tree under `root` (honoring `.gitignore` and git excludes
    /// via the `ignore` crate) and returns candidate file paths that pass
    /// the extension filters in `should_include_file`.
    ///
    /// Also increments the directories-traversed and errors counters.
    ///
    /// NOTE(review): the walk itself is synchronous inside an async fn, so
    /// large trees block the executor thread — consider `spawn_blocking`.
    async fn discover_files_filesystem(
        &self,
        root: &Path,
        options: &ScanOptions,
    ) -> Result<Vec<PathBuf>> {
        let mut builder = WalkBuilder::new(root);

        builder
            .follow_links(options.follow_symlinks)
            // `hidden(true)` means "skip hidden", hence the negation.
            .hidden(!options.include_hidden)
            .git_ignore(true)
            .git_exclude(true)
            // Apply git ignore rules even outside a git repository.
            .require_git(false);

        let mut files = Vec::new();

        builder.build().for_each(|entry| {
            match entry {
                Ok(entry) => {
                    if entry.file_type().map_or(false, |ft| ft.is_file()) {
                        let path = entry.path().to_path_buf();

                        // Extension include/exclude filtering happens here,
                        // at discovery time.
                        if self.should_include_file(&path, options) {
                            files.push(path);
                        }
                    }

                    if entry.file_type().map_or(false, |ft| ft.is_dir()) {
                        self.stats
                            .directories_traversed
                            .fetch_add(1, Ordering::Relaxed);
                    }
                }
                Err(err) => {
                    // Traversal errors (permissions, broken links) are
                    // counted but do not abort the walk.
                    log::warn!("Error during filesystem traversal: {}", err);
                    self.stats
                        .errors_encountered
                        .fetch_add(1, Ordering::Relaxed);
                }
            }
        });

        Ok(files)
    }

    /// Processes `file_paths` concurrently, bounded by
    /// `options.max_concurrency`, and collects the resulting [`FileInfo`]s.
    /// Result order is nondeterministic.
    ///
    /// NOTE(review): concurrency is limited twice — by the semaphore and by
    /// `buffer_unordered` — which is redundant but harmless. The chunking
    /// bounds how many futures are materialized at once.
    async fn process_files_parallel(
        &self,
        file_paths: Vec<PathBuf>,
        options: &ScanOptions,
        metadata_extractor: Option<&MetadataExtractor>,
        content_analyzer: Option<&ContentAnalyzer>,
        git_integrator: Option<&GitIntegrator>,
        language_detector: &LanguageDetector,
    ) -> Result<Vec<FileInfo>> {
        let semaphore = Arc::new(Semaphore::new(options.max_concurrency));
        let results = Arc::new(RwLock::new(Vec::new()));

        // Build at most 1000 futures at a time to cap memory usage.
        let chunk_size = 1000;
        for chunk in file_paths.chunks(chunk_size) {
            let futures: Vec<_> = chunk
                .iter()
                .map(|path| {
                    let semaphore = Arc::clone(&semaphore);
                    let results = Arc::clone(&results);
                    let path = path.clone();

                    async move {
                        // Acquire only fails if the semaphore is closed,
                        // which never happens here.
                        let _permit = semaphore.acquire().await.unwrap();

                        match self
                            .process_single_file(
                                &path,
                                options,
                                metadata_extractor,
                                content_analyzer,
                                git_integrator,
                                language_detector,
                            )
                            .await
                        {
                            Ok(Some(file_info)) => {
                                results.write().await.push(file_info);
                            }
                            Ok(None) => {
                                // File was intentionally skipped (filtered,
                                // too large, binary, or vanished).
                            }
                            Err(err) => {
                                // Per-file errors are non-fatal; count them.
                                log::debug!("Error processing file {}: {}", path.display(), err);
                                self.stats
                                    .errors_encountered
                                    .fetch_add(1, Ordering::Relaxed);
                            }
                        }
                    }
                })
                .collect();

            // Drive the chunk's futures with bounded concurrency.
            stream::iter(futures)
                .buffer_unordered(options.max_concurrency)
                .collect::<Vec<_>>()
                .await;
        }

        // All tasks are done; clone the accumulated Vec out of the lock.
        let results = results.read().await;
        Ok(results.clone())
    }

    /// Processes `file_paths` one at a time, preserving discovery order.
    /// Per-file errors are counted and logged, never propagated.
    async fn process_files_sequential(
        &self,
        file_paths: Vec<PathBuf>,
        options: &ScanOptions,
        metadata_extractor: Option<&MetadataExtractor>,
        content_analyzer: Option<&ContentAnalyzer>,
        git_integrator: Option<&GitIntegrator>,
        language_detector: &LanguageDetector,
    ) -> Result<Vec<FileInfo>> {
        let mut results = Vec::new();

        for path in file_paths {
            match self
                .process_single_file(
                    &path,
                    options,
                    metadata_extractor,
                    content_analyzer,
                    git_integrator,
                    language_detector,
                )
                .await
            {
                Ok(Some(file_info)) => {
                    results.push(file_info);
                }
                Ok(None) => {
                    // Intentionally skipped file; nothing to record.
                }
                Err(err) => {
                    log::debug!("Error processing file {}: {}", path.display(), err);
                    self.stats
                        .errors_encountered
                        .fetch_add(1, Ordering::Relaxed);
                }
            }
        }

        Ok(results)
    }

    /// Processes one file into a [`FileInfo`].
    ///
    /// Returns `Ok(None)` when the file is skipped: it no longer exists,
    /// exceeds `max_file_size`, or looks binary. Returns `Err` only on
    /// metadata I/O failure.
    async fn process_single_file(
        &self,
        path: &Path,
        options: &ScanOptions,
        metadata_extractor: Option<&MetadataExtractor>,
        content_analyzer: Option<&ContentAnalyzer>,
        git_integrator: Option<&GitIntegrator>,
        language_detector: &LanguageDetector,
    ) -> Result<Option<FileInfo>> {
        // The file may have been deleted between discovery and processing.
        if !path.exists() {
            return Ok(None);
        }

        let metadata = tokio::fs::metadata(path).await?;

        // Size limit is enforced here for both discovery paths.
        if let Some(max_size) = options.max_file_size {
            if metadata.len() > max_size {
                log::debug!(
                    "Skipping large file: {} ({} bytes)",
                    path.display(),
                    metadata.len()
                );
                return Ok(None);
            }
        }

        let language = language_detector.detect_language(path);

        // Binary detection is by extension and unknown-language heuristic;
        // skipped files are counted but produce no FileInfo.
        if self.is_likely_binary(path, &language) {
            self.stats
                .binary_files_skipped
                .fetch_add(1, Ordering::Relaxed);
            return Ok(None);
        }

        // NOTE(review): despite the name, this is the full (possibly
        // absolute) path rendered lossily, not a path relative to the scan
        // root — confirm downstream expectations.
        let relative_path = path.to_string_lossy().to_string();

        let file_type = FileInfo::classify_file_type(
            &relative_path,
            &language,
            path.extension().and_then(|e| e.to_str()).unwrap_or(""),
        );

        let mut file_info = FileInfo {
            path: path.to_path_buf(),
            relative_path,
            size: metadata.len(),
            modified: metadata.modified().ok(),
            decision: RenderDecision::include("scanned file"),
            file_type,
            language,
            content: None,
            token_estimate: None,
            line_count: None,
            char_count: None,
            // We only reach this point for non-binary files.
            is_binary: false,
            git_status: None,
            centrality_score: None,
        };

        // Optional metadata extraction overrides the size with the
        // extractor's value; extraction failure is silently ignored.
        if let Some(extractor) = metadata_extractor {
            if let Ok(file_metadata) = extractor.extract_metadata(path).await {
                file_info.size = file_metadata.size;
            }
        }

        if let Some(analyzer) = content_analyzer {
            if let Ok(content_stats) = analyzer.analyze_file(path).await {
                // TODO(review): content_stats is computed but discarded —
                // presumably it should populate line_count/char_count/
                // token_estimate on file_info; confirm and wire up.
            }
        }

        // Per-file git status; the index side is hard-coded to Unmodified,
        // so only the working-tree status is meaningful here.
        if let Some(git) = git_integrator {
            if let Ok(git_info) = git.get_file_info(path).await {
                file_info.git_status = Some(GitStatus {
                    working_tree: git_info.status,
                    index: GitFileStatus::Unmodified,
                });
            }
        }

        self.stats.files_processed.fetch_add(1, Ordering::Relaxed);
        Ok(Some(file_info))
    }

    /// Applies the case-insensitive extension filters from `options`.
    /// Exclusion wins over inclusion; with no include list, everything not
    /// excluded passes (extensionless files compare as "").
    fn should_include_file(&self, path: &Path, options: &ScanOptions) -> bool {
        let extension = path
            .extension()
            .and_then(|ext| ext.to_str())
            .unwrap_or("")
            .to_lowercase();

        if let Some(ref exclude) = options.exclude_extensions {
            if exclude.iter().any(|ext| ext.to_lowercase() == extension) {
                return false;
            }
        }

        if let Some(ref include) = options.include_extensions {
            return include.iter().any(|ext| ext.to_lowercase() == extension);
        }

        true
    }

    /// Heuristic binary check: a known binary extension, or an unknown
    /// language, classifies the file as binary.
    ///
    /// NOTE(review): `Language::Unknown` as binary means extensionless text
    /// files (e.g. LICENSE, Makefile-style names the detector misses) are
    /// skipped — confirm this is intended.
    fn is_likely_binary(&self, path: &Path, language: &Language) -> bool {
        if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
            let binary_extensions = [
                "bin", "exe", "dll", "so", "dylib", "a", "lib", "obj", "o", "class", "jar", "war",
                "ear", "png", "jpg", "jpeg", "gif", "bmp", "ico", "svg", "pdf", "doc", "docx",
                "xls", "xlsx", "ppt", "pptx", "zip", "tar", "gz", "bz2", "rar", "7z", "mp3", "mp4",
                "avi", "mkv", "mov", "wmv", "ttf", "otf", "woff", "woff2",
            ];

            if binary_extensions.contains(&extension.to_lowercase().as_str()) {
                return true;
            }
        }

        matches!(language, Language::Unknown)
    }

    /// Number of files successfully processed so far.
    pub fn files_processed(&self) -> usize {
        self.stats.files_processed.load(Ordering::Relaxed)
    }

    /// Number of directories visited during filesystem traversal.
    pub fn directories_traversed(&self) -> usize {
        self.stats.directories_traversed.load(Ordering::Relaxed)
    }

    /// Number of files skipped as likely binary.
    pub fn binary_files_skipped(&self) -> usize {
        self.stats.binary_files_skipped.load(Ordering::Relaxed)
    }

    /// Number of non-fatal errors encountered so far.
    pub fn errors_encountered(&self) -> usize {
        self.stats.errors_encountered.load(Ordering::Relaxed)
    }
}
597
598impl Default for Scanner {
599 fn default() -> Self {
600 Self::new()
601 }
602}
603
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// A fresh scanner starts with zeroed counters.
    #[tokio::test]
    async fn test_scanner_creation() {
        let scanner = Scanner::new();
        assert_eq!(scanner.files_processed(), 0);
        assert_eq!(scanner.directories_traversed(), 0);
    }

    /// Scanning an empty directory yields no files and no error.
    #[tokio::test]
    async fn test_scan_empty_directory() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        let options = ScanOptions::default();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        assert!(results.is_empty());
    }

    /// Source files are included with the right language; the `.bin` file is
    /// skipped by binary-extension detection.
    #[tokio::test]
    async fn test_scan_with_files() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        let rust_file = temp_dir.path().join("test.rs");
        let python_file = temp_dir.path().join("test.py");
        let binary_file = temp_dir.path().join("test.bin");

        fs::write(&rust_file, "fn main() { println!(\"Hello, world!\"); }").unwrap();
        fs::write(&python_file, "print('Hello, world!')").unwrap();
        fs::write(&binary_file, &[0u8; 256]).unwrap();

        let options = ScanOptions::default();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        // Only the two text source files survive.
        assert_eq!(results.len(), 2);
        assert!(results
            .iter()
            .any(|f| f.path.file_name().unwrap() == "test.rs"));
        assert!(results
            .iter()
            .any(|f| f.path.file_name().unwrap() == "test.py"));

        let rust_file_info = results
            .iter()
            .find(|f| f.path.file_name().unwrap() == "test.rs")
            .unwrap();
        assert_eq!(rust_file_info.language, Language::Rust);

        let python_file_info = results
            .iter()
            .find(|f| f.path.file_name().unwrap() == "test.py")
            .unwrap();
        assert_eq!(python_file_info.language, Language::Python);
    }

    /// `with_include_extensions` limits results to the listed extensions.
    #[tokio::test]
    async fn test_scan_options_extension_filtering() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        fs::write(temp_dir.path().join("test.rs"), "fn main() {}").unwrap();
        fs::write(temp_dir.path().join("test.py"), "print('hello')").unwrap();
        fs::write(temp_dir.path().join("test.js"), "console.log('hello')").unwrap();

        let options = ScanOptions::default()
            .with_include_extensions(vec!["rs".to_string(), "py".to_string()]);
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        assert_eq!(results.len(), 2);
        assert!(results.iter().any(|f| f.path.extension().unwrap() == "rs"));
        assert!(results.iter().any(|f| f.path.extension().unwrap() == "py"));
        assert!(!results.iter().any(|f| f.path.extension().unwrap() == "js"));
    }

    /// Parallel mode processes every file exactly once; order is not checked
    /// because parallel results are nondeterministic.
    #[tokio::test]
    async fn test_parallel_processing() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        for i in 0..150 {
            let file_path = temp_dir.path().join(format!("test_{}.rs", i));
            fs::write(&file_path, format!("fn main_{i}() {{}}")).unwrap();
        }

        let options = ScanOptions::default()
            .with_parallel_processing(true)
            .with_max_concurrency(4);

        let start = Instant::now();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();
        let duration = start.elapsed();

        assert_eq!(results.len(), 150);
        log::info!("Parallel scan of 150 files took: {:?}", duration);

        for i in 0..150 {
            assert!(results
                .iter()
                .any(|f| { f.path.file_name().unwrap() == format!("test_{}.rs", i).as_str() }));
        }
    }

    /// The builder methods set exactly the fields they name.
    #[test]
    fn test_scan_options_builder() {
        let options = ScanOptions::default()
            .with_parallel_processing(true)
            .with_max_concurrency(8)
            .with_metadata_extraction(true)
            .with_content_analysis(true)
            .with_git_integration(false)
            .with_follow_symlinks(false)
            .with_include_hidden(true)
            .with_max_file_size(Some(1024 * 1024));

        assert!(options.parallel_processing);
        assert_eq!(options.max_concurrency, 8);
        assert!(options.metadata_extraction);
        assert!(options.content_analysis);
        assert!(!options.git_integration);
        assert!(!options.follow_symlinks);
        assert!(options.include_hidden);
        assert_eq!(options.max_file_size, Some(1024 * 1024));
    }

    /// Known binary extensions are flagged; recognized source files are not.
    #[test]
    fn test_binary_file_detection() {
        let scanner = Scanner::new();

        assert!(scanner.is_likely_binary(Path::new("test.exe"), &Language::Unknown));
        assert!(scanner.is_likely_binary(Path::new("test.png"), &Language::Unknown));
        assert!(scanner.is_likely_binary(Path::new("test.pdf"), &Language::Unknown));

        assert!(!scanner.is_likely_binary(Path::new("test.rs"), &Language::Rust));
        assert!(!scanner.is_likely_binary(Path::new("test.py"), &Language::Python));
        assert!(!scanner.is_likely_binary(Path::new("test.md"), &Language::Markdown));
    }
}