1use crate::{GitIntegrator, LanguageDetector, MetadataExtractor};
7use scribe_core::{
8 FileInfo, GitFileStatus, GitStatus, Language, RenderDecision, Result, ScribeError,
9};
10
11use std::path::{Path, PathBuf};
12use std::sync::atomic::{AtomicUsize, Ordering};
13use std::sync::Arc;
14use std::time::Instant;
15
16use futures::stream::{self, StreamExt};
17use ignore::{DirEntry as IgnoreDirEntry, WalkBuilder, WalkState};
18use rayon::prelude::*;
19use tokio::sync::{RwLock, Semaphore};
20use walkdir::{DirEntry, WalkDir};
21
22#[derive(Debug)]
24pub struct Scanner {
25 stats: Arc<ScannerStats>,
26 semaphore: Arc<Semaphore>,
27}
28
/// Atomic counters describing the work a [`Scanner`] has performed.
///
/// All fields start at zero via `Default` and are bumped with atomic adds, so the
/// struct can be shared behind an `Arc` without additional locking.
#[derive(Debug, Default)]
pub struct ScannerStats {
    /// Incremented once for every file that yields a `FileInfo`.
    files_processed: AtomicUsize,
    /// Incremented for every directory entry seen during the filesystem walk.
    directories_traversed: AtomicUsize,
    /// Incremented for every file skipped because it was classified as binary.
    binary_files_skipped: AtomicUsize,
    /// Incremented for every traversal or per-file processing error.
    errors_encountered: AtomicUsize,
}
37
/// Knobs controlling how a [`Scanner`] discovers and processes files.
#[derive(Debug, Clone)]
pub struct ScanOptions {
    /// Process discovered files concurrently instead of one after another.
    pub parallel_processing: bool,
    /// Upper bound on the number of files processed at the same time in parallel mode.
    pub max_concurrency: usize,
    /// Create a `MetadataExtractor` and consult it for every file.
    pub metadata_extraction: bool,
    /// Try to create a `GitIntegrator` for the scan root and use it for discovery and status.
    pub git_integration: bool,
    /// Forwarded to the filesystem walker's `follow_links` setting.
    pub follow_symlinks: bool,
    /// When `false`, the filesystem walker is told to skip hidden entries.
    pub include_hidden: bool,
    /// Files whose size in bytes exceeds this limit are skipped; `None` disables the check.
    pub max_file_size: Option<u64>,
    /// When set, only files whose extension matches one of these (case-insensitively) are kept.
    pub include_extensions: Option<Vec<String>>,
    /// When set, files whose extension matches one of these (case-insensitively) are dropped.
    pub exclude_extensions: Option<Vec<String>>,
}
60
61#[derive(Debug, Clone)]
63pub struct ScanResult {
64 pub files: Vec<FileInfo>,
65 pub stats: ScanProgress,
66 pub duration: std::time::Duration,
67 pub errors: Vec<String>,
68}
69
/// Plain-value snapshot of scan counters.
///
/// The first four fields share their names with the atomic counters in `ScannerStats`;
/// `bytes_processed` has no counterpart there.
#[derive(Debug, Clone)]
pub struct ScanProgress {
    /// Count of files processed.
    pub files_processed: usize,
    /// Count of directories traversed.
    pub directories_traversed: usize,
    /// Count of binary files skipped.
    pub binary_files_skipped: usize,
    /// Count of errors encountered.
    pub errors_encountered: usize,
    /// Count of bytes processed.
    pub bytes_processed: u64,
}
79
80impl Default for ScanOptions {
81 fn default() -> Self {
82 Self {
83 parallel_processing: true,
84 max_concurrency: num_cpus::get().min(16), metadata_extraction: true,
86 git_integration: false,
87 follow_symlinks: false,
88 include_hidden: false,
89 max_file_size: Some(50 * 1024 * 1024), include_extensions: None,
91 exclude_extensions: None,
92 }
93 }
94}
95
96impl ScanOptions {
97 pub fn with_parallel_processing(mut self, enabled: bool) -> Self {
99 self.parallel_processing = enabled;
100 self
101 }
102
103 pub fn with_max_concurrency(mut self, max: usize) -> Self {
105 self.max_concurrency = max;
106 self
107 }
108
109 pub fn with_metadata_extraction(mut self, enabled: bool) -> Self {
111 self.metadata_extraction = enabled;
112 self
113 }
114
115 pub fn with_git_integration(mut self, enabled: bool) -> Self {
117 self.git_integration = enabled;
118 self
119 }
120
121 pub fn with_follow_symlinks(mut self, enabled: bool) -> Self {
123 self.follow_symlinks = enabled;
124 self
125 }
126
127 pub fn with_include_hidden(mut self, enabled: bool) -> Self {
129 self.include_hidden = enabled;
130 self
131 }
132
133 pub fn with_max_file_size(mut self, size: Option<u64>) -> Self {
135 self.max_file_size = size;
136 self
137 }
138
139 pub fn with_include_extensions(mut self, extensions: Vec<String>) -> Self {
141 self.include_extensions = Some(extensions);
142 self
143 }
144
145 pub fn with_exclude_extensions(mut self, extensions: Vec<String>) -> Self {
147 self.exclude_extensions = Some(extensions);
148 self
149 }
150}
151
152impl Scanner {
153 pub fn new() -> Self {
155 Self {
156 stats: Arc::new(ScannerStats::default()),
157 semaphore: Arc::new(Semaphore::new(16)), }
159 }
160
161 pub async fn scan<P: AsRef<Path>>(
163 &self,
164 path: P,
165 options: ScanOptions,
166 ) -> Result<Vec<FileInfo>> {
167 let start_time = Instant::now();
168 let path = path.as_ref();
169
170 if !path.exists() {
172 return Err(ScribeError::path(
173 format!("Path does not exist: {}", path.display()),
174 path,
175 ));
176 }
177
178 if !path.is_dir() {
179 return Err(ScribeError::path(
180 format!("Path is not a directory: {}", path.display()),
181 path,
182 ));
183 }
184
185 let metadata_extractor = if options.metadata_extraction {
187 Some(MetadataExtractor::new())
188 } else {
189 None
190 };
191
192 let git_integrator = if options.git_integration {
193 GitIntegrator::new(path).ok()
194 } else {
195 None
196 };
197
198 let language_detector = LanguageDetector::new();
199
200 let file_paths = if let Some(ref git) = git_integrator {
202 match git.list_tracked_files().await {
203 Ok(paths) => {
204 log::debug!(
205 "Using git ls-files for file discovery: {} files",
206 paths.len()
207 );
208 paths
209 }
210 Err(_) => {
211 log::debug!("Git discovery failed, falling back to filesystem walk");
212 self.discover_files_filesystem(path, &options).await?
213 }
214 }
215 } else {
216 self.discover_files_filesystem(path, &options).await?
217 };
218
219 log::info!("Discovered {} files for processing", file_paths.len());
220
221 if let Some(ref git) = git_integrator {
223 if let Err(e) = git.load_batch_file_statuses().await {
224 log::debug!("Failed to load batch git statuses: {}", e);
225 }
226 }
227
228 let files = if options.parallel_processing {
230 log::debug!(
231 "Processing files in parallel with concurrency={}",
232 options.max_concurrency
233 );
234 self.process_files_parallel(
235 file_paths,
236 &options,
237 metadata_extractor.as_ref(),
238 git_integrator.as_ref(),
239 &language_detector,
240 )
241 .await?
242 } else {
243 log::debug!("Processing files sequentially");
244 self.process_files_sequential(
245 file_paths,
246 &options,
247 metadata_extractor.as_ref(),
248 git_integrator.as_ref(),
249 &language_detector,
250 )
251 .await?
252 };
253
254 log::info!(
255 "Scanning completed in {:.2}s: {} files processed",
256 start_time.elapsed().as_secs_f64(),
257 files.len()
258 );
259
260 Ok(files)
261 }
262
263 async fn discover_files_filesystem(
265 &self,
266 root: &Path,
267 options: &ScanOptions,
268 ) -> Result<Vec<PathBuf>> {
269 let mut builder = WalkBuilder::new(root);
270
271 builder
272 .follow_links(options.follow_symlinks)
273 .hidden(!options.include_hidden)
274 .git_ignore(true)
275 .git_exclude(true)
276 .require_git(false);
277
278 let mut files = Vec::new();
279
280 builder.build().for_each(|entry| {
282 match entry {
283 Ok(entry) => {
284 if entry.file_type().map_or(false, |ft| ft.is_file()) {
285 let path = entry.path().to_path_buf();
286
287 if self.should_include_file(&path, options) {
289 files.push(path);
290 }
291 }
292
293 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
294 self.stats
295 .directories_traversed
296 .fetch_add(1, Ordering::Relaxed);
297 }
298 }
299 Err(err) => {
300 log::warn!("Error during filesystem traversal: {}", err);
301 self.stats
302 .errors_encountered
303 .fetch_add(1, Ordering::Relaxed);
304 }
305 }
306 });
308
309 Ok(files)
310 }
311
312 async fn process_files_parallel(
314 &self,
315 file_paths: Vec<PathBuf>,
316 options: &ScanOptions,
317 metadata_extractor: Option<&MetadataExtractor>,
318 git_integrator: Option<&GitIntegrator>,
319 language_detector: &LanguageDetector,
320 ) -> Result<Vec<FileInfo>> {
321 let semaphore = Arc::new(Semaphore::new(options.max_concurrency));
322 let results = Arc::new(RwLock::new(Vec::new()));
323
324 let chunk_size = 1000;
326 for chunk in file_paths.chunks(chunk_size) {
327 let futures: Vec<_> = chunk
328 .iter()
329 .map(|path| {
330 let semaphore = Arc::clone(&semaphore);
331 let results = Arc::clone(&results);
332 let path = path.clone();
333
334 async move {
335 let _permit = semaphore.acquire().await.unwrap();
336
337 match self
338 .process_single_file(
339 &path,
340 options,
341 metadata_extractor,
342 git_integrator,
343 language_detector,
344 )
345 .await
346 {
347 Ok(Some(file_info)) => {
348 results.write().await.push(file_info);
349 }
350 Ok(None) => {
351 }
353 Err(err) => {
354 log::debug!("Error processing file {}: {}", path.display(), err);
355 self.stats
356 .errors_encountered
357 .fetch_add(1, Ordering::Relaxed);
358 }
359 }
360 }
361 })
362 .collect();
363
364 stream::iter(futures)
366 .buffer_unordered(options.max_concurrency)
367 .collect::<Vec<_>>()
368 .await;
369 }
370
371 let results = results.read().await;
372 Ok(results.clone())
373 }
374
375 async fn process_files_sequential(
377 &self,
378 file_paths: Vec<PathBuf>,
379 options: &ScanOptions,
380 metadata_extractor: Option<&MetadataExtractor>,
381 git_integrator: Option<&GitIntegrator>,
382 language_detector: &LanguageDetector,
383 ) -> Result<Vec<FileInfo>> {
384 let mut results = Vec::new();
385
386 for path in file_paths {
387 match self
388 .process_single_file(
389 &path,
390 options,
391 metadata_extractor,
392 git_integrator,
393 language_detector,
394 )
395 .await
396 {
397 Ok(Some(file_info)) => {
398 results.push(file_info);
399 }
400 Ok(None) => {
401 }
403 Err(err) => {
404 log::debug!("Error processing file {}: {}", path.display(), err);
405 self.stats
406 .errors_encountered
407 .fetch_add(1, Ordering::Relaxed);
408 }
409 }
410 }
411
412 Ok(results)
413 }
414
415 async fn process_single_file(
417 &self,
418 path: &Path,
419 options: &ScanOptions,
420 metadata_extractor: Option<&MetadataExtractor>,
421 git_integrator: Option<&GitIntegrator>,
422 language_detector: &LanguageDetector,
423 ) -> Result<Option<FileInfo>> {
424 if !path.exists() {
426 return Ok(None);
427 }
428
429 let metadata = tokio::fs::metadata(path).await?;
430
431 if let Some(max_size) = options.max_file_size {
433 if metadata.len() > max_size {
434 log::debug!(
435 "Skipping large file: {} ({} bytes)",
436 path.display(),
437 metadata.len()
438 );
439 return Ok(None);
440 }
441 }
442
443 let language = language_detector.detect_language(path);
445
446 if self.is_likely_binary(path, &language) {
448 self.stats
449 .binary_files_skipped
450 .fetch_add(1, Ordering::Relaxed);
451 return Ok(None);
452 }
453
454 let relative_path = path.to_string_lossy().to_string();
456
457 let file_type = FileInfo::classify_file_type(
458 &relative_path,
459 &language,
460 path.extension().and_then(|e| e.to_str()).unwrap_or(""),
461 );
462
463 let mut file_info = FileInfo {
464 path: path.to_path_buf(),
465 relative_path,
466 size: metadata.len(),
467 modified: metadata.modified().ok(),
468 decision: RenderDecision::include("scanned file"),
469 file_type,
470 language,
471 content: None,
472 token_estimate: None,
473 line_count: None,
474 char_count: None,
475 is_binary: false, git_status: None,
477 centrality_score: None, };
479
480 if let Some(extractor) = metadata_extractor {
482 if let Ok(file_metadata) = extractor.extract_metadata(path).await {
483 file_info.size = file_metadata.size;
484 }
486 }
487
488 if let Some(git) = git_integrator {
490 if let Ok(git_info) = git.get_file_info(path).await {
491 file_info.git_status = Some(GitStatus {
493 working_tree: git_info.status,
494 index: GitFileStatus::Unmodified,
495 });
496 }
497 }
498
499 self.stats.files_processed.fetch_add(1, Ordering::Relaxed);
500 Ok(Some(file_info))
501 }
502
503 fn should_include_file(&self, path: &Path, options: &ScanOptions) -> bool {
505 let extension = path
506 .extension()
507 .and_then(|ext| ext.to_str())
508 .unwrap_or("")
509 .to_lowercase();
510
511 if let Some(ref exclude) = options.exclude_extensions {
513 if exclude.iter().any(|ext| ext.to_lowercase() == extension) {
514 return false;
515 }
516 }
517
518 if let Some(ref include) = options.include_extensions {
520 return include.iter().any(|ext| ext.to_lowercase() == extension);
521 }
522
523 true
524 }
525
526 fn is_likely_binary(&self, path: &Path, _language: &Language) -> bool {
528 let extension = path.extension().and_then(|ext| ext.to_str()).unwrap_or("");
529 FileInfo::detect_binary_with_hint(path, extension)
530 }
531
532 pub fn files_processed(&self) -> usize {
534 self.stats.files_processed.load(Ordering::Relaxed)
535 }
536
537 pub fn directories_traversed(&self) -> usize {
539 self.stats.directories_traversed.load(Ordering::Relaxed)
540 }
541
542 pub fn binary_files_skipped(&self) -> usize {
544 self.stats.binary_files_skipped.load(Ordering::Relaxed)
545 }
546
547 pub fn errors_encountered(&self) -> usize {
549 self.stats.errors_encountered.load(Ordering::Relaxed)
550 }
551}
552
553impl Default for Scanner {
554 fn default() -> Self {
555 Self::new()
556 }
557}
558
559#[cfg(test)]
560mod tests {
561 use super::*;
562 use std::fs;
563 use tempfile::TempDir;
564 use tokio::fs as async_fs;
565
566 #[tokio::test]
567 async fn test_scanner_creation() {
568 let scanner = Scanner::new();
569 assert_eq!(scanner.files_processed(), 0);
570 assert_eq!(scanner.directories_traversed(), 0);
571 }
572
573 #[tokio::test]
574 async fn test_scan_empty_directory() {
575 let scanner = Scanner::new();
576 let temp_dir = TempDir::new().unwrap();
577
578 let options = ScanOptions::default();
579 let results = scanner.scan(temp_dir.path(), options).await.unwrap();
580
581 assert!(results.is_empty());
582 }
583
584 #[tokio::test]
585 async fn test_scan_with_files() {
586 let scanner = Scanner::new();
587 let temp_dir = TempDir::new().unwrap();
588
589 let rust_file = temp_dir.path().join("test.rs");
591 let python_file = temp_dir.path().join("test.py");
592 let binary_file = temp_dir.path().join("test.bin");
593
594 fs::write(&rust_file, "fn main() { println!(\"Hello, world!\"); }").unwrap();
595 fs::write(&python_file, "print('Hello, world!')").unwrap();
596 fs::write(&binary_file, &[0u8; 256]).unwrap(); let options = ScanOptions::default();
599 let results = scanner.scan(temp_dir.path(), options).await.unwrap();
600
601 assert_eq!(results.len(), 2);
603 assert!(results
604 .iter()
605 .any(|f| f.path.file_name().unwrap() == "test.rs"));
606 assert!(results
607 .iter()
608 .any(|f| f.path.file_name().unwrap() == "test.py"));
609
610 let rust_file_info = results
612 .iter()
613 .find(|f| f.path.file_name().unwrap() == "test.rs")
614 .unwrap();
615 assert_eq!(rust_file_info.language, Language::Rust);
616
617 let python_file_info = results
618 .iter()
619 .find(|f| f.path.file_name().unwrap() == "test.py")
620 .unwrap();
621 assert_eq!(python_file_info.language, Language::Python);
622 }
623
624 #[tokio::test]
625 async fn test_scan_options_extension_filtering() {
626 let scanner = Scanner::new();
627 let temp_dir = TempDir::new().unwrap();
628
629 fs::write(temp_dir.path().join("test.rs"), "fn main() {}").unwrap();
631 fs::write(temp_dir.path().join("test.py"), "print('hello')").unwrap();
632 fs::write(temp_dir.path().join("test.js"), "console.log('hello')").unwrap();
633
634 let options = ScanOptions::default()
636 .with_include_extensions(vec!["rs".to_string(), "py".to_string()]);
637 let results = scanner.scan(temp_dir.path(), options).await.unwrap();
638
639 assert_eq!(results.len(), 2);
640 assert!(results.iter().any(|f| f.path.extension().unwrap() == "rs"));
641 assert!(results.iter().any(|f| f.path.extension().unwrap() == "py"));
642 assert!(!results.iter().any(|f| f.path.extension().unwrap() == "js"));
643 }
644
645 #[tokio::test]
646 async fn test_parallel_processing() {
647 let scanner = Scanner::new();
648 let temp_dir = TempDir::new().unwrap();
649
650 for i in 0..150 {
652 let file_path = temp_dir.path().join(format!("test_{}.rs", i));
653 fs::write(&file_path, format!("fn main_{i}() {{}}")).unwrap();
654 }
655
656 let options = ScanOptions::default()
657 .with_parallel_processing(true)
658 .with_max_concurrency(4);
659
660 let start = Instant::now();
661 let results = scanner.scan(temp_dir.path(), options).await.unwrap();
662 let duration = start.elapsed();
663
664 assert_eq!(results.len(), 150);
665 log::info!("Parallel scan of 150 files took: {:?}", duration);
666
667 for i in 0..150 {
669 assert!(results
670 .iter()
671 .any(|f| { f.path.file_name().unwrap() == format!("test_{}.rs", i).as_str() }));
672 }
673 }
674
/// Each builder call is reflected in the corresponding field of the resulting options.
#[test]
fn test_scan_options_builder() {
    let size_limit = Some(1024 * 1024);

    let options = ScanOptions::default()
        .with_parallel_processing(true)
        .with_max_concurrency(8)
        .with_metadata_extraction(true)
        .with_git_integration(false)
        .with_follow_symlinks(false)
        .with_include_hidden(true)
        .with_max_file_size(size_limit);

    assert!(options.parallel_processing);
    assert_eq!(options.max_concurrency, 8);
    assert!(options.metadata_extraction);
    assert!(!options.git_integration);
    assert!(!options.follow_symlinks);
    assert!(options.include_hidden);
    assert_eq!(options.max_file_size, size_limit);
}
694
/// A `.png` file containing NUL bytes is flagged as binary; a small Rust source file is not.
#[test]
fn test_binary_file_detection() {
    let temp_dir = TempDir::new().unwrap();
    let root = temp_dir.path();

    let text_path = root.join("test.rs");
    fs::write(&text_path, "fn main() {}\n").unwrap();

    let binary_path = root.join("image.png");
    fs::write(&binary_path, [0u8, 159, 146, 150, 0, 1]).unwrap();

    let scanner = Scanner::new();
    let binary_flagged = scanner.is_likely_binary(&binary_path, &Language::Unknown);
    let text_flagged = scanner.is_likely_binary(&text_path, &Language::Rust);

    assert!(binary_flagged);
    assert!(!text_flagged);
}
709}