1use scribe_core::{Result, ScribeError, FileInfo, Language, GitStatus, GitFileStatus, RenderDecision};
7use crate::{MetadataExtractor, ContentAnalyzer, GitIntegrator, LanguageDetector};
8
9use std::path::{Path, PathBuf};
10use std::sync::atomic::{AtomicUsize, Ordering};
11use std::sync::Arc;
12use std::time::Instant;
13
14use walkdir::{WalkDir, DirEntry};
15use ignore::{WalkBuilder, WalkState, DirEntry as IgnoreDirEntry};
16use rayon::prelude::*;
17use tokio::sync::{Semaphore, RwLock};
18use futures::stream::{self, StreamExt};
19
20#[derive(Debug)]
22pub struct Scanner {
23 stats: Arc<ScannerStats>,
24 semaphore: Arc<Semaphore>,
25}
26
/// Atomic counters describing the work a `Scanner` has performed.
///
/// Every field starts at zero (via `Default`) and is only ever incremented
/// by the scanner.
#[derive(Debug, Default)]
pub struct ScannerStats {
    /// Number of files for which a `FileInfo` was produced.
    files_processed: AtomicUsize,
    /// Number of directory entries seen during the filesystem walk.
    directories_traversed: AtomicUsize,
    /// Number of files skipped because they were classified as binary.
    binary_files_skipped: AtomicUsize,
    /// Number of traversal or per-file processing errors.
    errors_encountered: AtomicUsize,
}
35
/// Configuration for a single `Scanner::scan` call.
///
/// Build one with `ScanOptions::default()` and refine it with the `with_*`
/// builder methods.
#[derive(Debug, Clone)]
pub struct ScanOptions {
    /// Requests parallel file processing.
    /// NOTE(review): `Scanner::scan` in this file always takes the sequential
    /// path and does not read this flag — confirm the intended behavior.
    pub parallel_processing: bool,
    /// Upper bound on concurrently processed files in the parallel path.
    pub max_concurrency: usize,
    /// When `true`, a `MetadataExtractor` is created and the size it reports
    /// replaces the size read from the filesystem.
    pub metadata_extraction: bool,
    /// When `true`, a `ContentAnalyzer` is created and run on every file.
    pub content_analysis: bool,
    /// When `true`, the scanner tries to open the root as a git repository,
    /// uses the tracked-file list for discovery, and records per-file git
    /// status. If git is unavailable it falls back to a filesystem walk.
    pub git_integration: bool,
    /// Forwarded to the directory walker's `follow_links` setting.
    pub follow_symlinks: bool,
    /// When `false`, the directory walker skips hidden entries.
    pub include_hidden: bool,
    /// Files larger than this many bytes are skipped; `None` disables the limit.
    pub max_file_size: Option<u64>,
    /// If set, only files whose extension matches one of these entries
    /// (compared case-insensitively) are kept.
    pub include_extensions: Option<Vec<String>>,
    /// If set, files whose extension matches one of these entries
    /// (compared case-insensitively) are dropped. Exclusion is checked before
    /// inclusion.
    pub exclude_extensions: Option<Vec<String>>,
}
60
61#[derive(Debug, Clone)]
63pub struct ScanResult {
64 pub files: Vec<FileInfo>,
65 pub stats: ScanProgress,
66 pub duration: std::time::Duration,
67 pub errors: Vec<String>,
68}
69
/// Plain-value snapshot of scan counters.
///
/// The first four fields mirror the atomic counters in `ScannerStats`.
#[derive(Debug, Clone)]
pub struct ScanProgress {
    /// Number of files processed so far.
    pub files_processed: usize,
    /// Number of directories visited so far.
    pub directories_traversed: usize,
    /// Number of files skipped as binary.
    pub binary_files_skipped: usize,
    /// Number of errors encountered.
    pub errors_encountered: usize,
    /// Total bytes processed.
    pub bytes_processed: u64,
}
79
80impl Default for ScanOptions {
81 fn default() -> Self {
82 Self {
83 parallel_processing: true,
84 max_concurrency: num_cpus::get().min(16), metadata_extraction: true,
86 content_analysis: false,
87 git_integration: false,
88 follow_symlinks: false,
89 include_hidden: false,
90 max_file_size: Some(50 * 1024 * 1024), include_extensions: None,
92 exclude_extensions: None,
93 }
94 }
95}
96
97impl ScanOptions {
98 pub fn with_parallel_processing(mut self, enabled: bool) -> Self {
100 self.parallel_processing = enabled;
101 self
102 }
103
104 pub fn with_max_concurrency(mut self, max: usize) -> Self {
106 self.max_concurrency = max;
107 self
108 }
109
110 pub fn with_metadata_extraction(mut self, enabled: bool) -> Self {
112 self.metadata_extraction = enabled;
113 self
114 }
115
116 pub fn with_content_analysis(mut self, enabled: bool) -> Self {
118 self.content_analysis = enabled;
119 self
120 }
121
122 pub fn with_git_integration(mut self, enabled: bool) -> Self {
124 self.git_integration = enabled;
125 self
126 }
127
128 pub fn with_follow_symlinks(mut self, enabled: bool) -> Self {
130 self.follow_symlinks = enabled;
131 self
132 }
133
134 pub fn with_include_hidden(mut self, enabled: bool) -> Self {
136 self.include_hidden = enabled;
137 self
138 }
139
140 pub fn with_max_file_size(mut self, size: Option<u64>) -> Self {
142 self.max_file_size = size;
143 self
144 }
145
146 pub fn with_include_extensions(mut self, extensions: Vec<String>) -> Self {
148 self.include_extensions = Some(extensions);
149 self
150 }
151
152 pub fn with_exclude_extensions(mut self, extensions: Vec<String>) -> Self {
154 self.exclude_extensions = Some(extensions);
155 self
156 }
157}
158
159impl Scanner {
160 pub fn new() -> Self {
162 Self {
163 stats: Arc::new(ScannerStats::default()),
164 semaphore: Arc::new(Semaphore::new(16)), }
166 }
167
168 pub async fn scan<P: AsRef<Path>>(&self, path: P, options: ScanOptions) -> Result<Vec<FileInfo>> {
170 let start_time = Instant::now();
171 let path = path.as_ref();
172
173 if !path.exists() {
175 return Err(ScribeError::path(format!("Path does not exist: {}", path.display()), path));
176 }
177
178 if !path.is_dir() {
179 return Err(ScribeError::path(format!("Path is not a directory: {}", path.display()), path));
180 }
181
182 let metadata_extractor = if options.metadata_extraction {
184 Some(MetadataExtractor::new())
185 } else {
186 None
187 };
188
189 let content_analyzer = if options.content_analysis {
190 Some(ContentAnalyzer::new())
191 } else {
192 None
193 };
194
195 let git_integrator = if options.git_integration {
196 GitIntegrator::new(path).ok()
197 } else {
198 None
199 };
200
201 let language_detector = LanguageDetector::new();
202
203 let file_paths = if let Some(ref git) = git_integrator {
205 match git.list_tracked_files().await {
206 Ok(paths) => {
207 log::debug!("Using git ls-files for file discovery: {} files", paths.len());
208 paths
209 }
210 Err(_) => {
211 log::debug!("Git discovery failed, falling back to filesystem walk");
212 self.discover_files_filesystem(path, &options).await?
213 }
214 }
215 } else {
216 self.discover_files_filesystem(path, &options).await?
217 };
218
219 log::info!("Discovered {} files for processing", file_paths.len());
220
221 let files = self.process_files_sequential(
223 file_paths,
224 &options,
225 metadata_extractor.as_ref(),
226 content_analyzer.as_ref(),
227 git_integrator.as_ref(),
228 &language_detector,
229 ).await?;
230
231 log::info!(
232 "Scanning completed in {:.2}s: {} files processed",
233 start_time.elapsed().as_secs_f64(),
234 files.len()
235 );
236
237 Ok(files)
238 }
239
240 async fn discover_files_filesystem(&self, root: &Path, options: &ScanOptions) -> Result<Vec<PathBuf>> {
242 let mut builder = WalkBuilder::new(root);
243
244 builder
245 .follow_links(options.follow_symlinks)
246 .hidden(!options.include_hidden)
247 .git_ignore(true)
248 .git_exclude(true)
249 .require_git(false);
250
251 let mut files = Vec::new();
252
253 builder.build().for_each(|entry| {
255 match entry {
256 Ok(entry) => {
257 if entry.file_type().map_or(false, |ft| ft.is_file()) {
258 let path = entry.path().to_path_buf();
259
260 if self.should_include_file(&path, options) {
262 files.push(path);
263 }
264 }
265
266 if entry.file_type().map_or(false, |ft| ft.is_dir()) {
267 self.stats.directories_traversed.fetch_add(1, Ordering::Relaxed);
268 }
269 }
270 Err(err) => {
271 log::warn!("Error during filesystem traversal: {}", err);
272 self.stats.errors_encountered.fetch_add(1, Ordering::Relaxed);
273 }
274 }
275 });
277
278 Ok(files)
279 }
280
281 async fn process_files_parallel(
283 &self,
284 file_paths: Vec<PathBuf>,
285 options: &ScanOptions,
286 metadata_extractor: Option<&MetadataExtractor>,
287 content_analyzer: Option<&ContentAnalyzer>,
288 git_integrator: Option<&GitIntegrator>,
289 language_detector: &LanguageDetector,
290 ) -> Result<Vec<FileInfo>> {
291 let semaphore = Arc::new(Semaphore::new(options.max_concurrency));
292 let results = Arc::new(RwLock::new(Vec::new()));
293
294 let chunk_size = 1000;
296 for chunk in file_paths.chunks(chunk_size) {
297 let futures: Vec<_> = chunk.iter().map(|path| {
298 let semaphore = Arc::clone(&semaphore);
299 let results = Arc::clone(&results);
300 let path = path.clone();
301
302 async move {
303 let _permit = semaphore.acquire().await.unwrap();
304
305 match self.process_single_file(
306 &path,
307 options,
308 metadata_extractor,
309 content_analyzer,
310 git_integrator,
311 language_detector,
312 ).await {
313 Ok(Some(file_info)) => {
314 results.write().await.push(file_info);
315 }
316 Ok(None) => {
317 }
319 Err(err) => {
320 log::debug!("Error processing file {}: {}", path.display(), err);
321 self.stats.errors_encountered.fetch_add(1, Ordering::Relaxed);
322 }
323 }
324 }
325 }).collect();
326
327 stream::iter(futures)
329 .buffer_unordered(options.max_concurrency)
330 .collect::<Vec<_>>()
331 .await;
332 }
333
334 let results = results.read().await;
335 Ok(results.clone())
336 }
337
338 async fn process_files_sequential(
340 &self,
341 file_paths: Vec<PathBuf>,
342 options: &ScanOptions,
343 metadata_extractor: Option<&MetadataExtractor>,
344 content_analyzer: Option<&ContentAnalyzer>,
345 git_integrator: Option<&GitIntegrator>,
346 language_detector: &LanguageDetector,
347 ) -> Result<Vec<FileInfo>> {
348 let mut results = Vec::new();
349
350 for path in file_paths {
351 match self.process_single_file(
352 &path,
353 options,
354 metadata_extractor,
355 content_analyzer,
356 git_integrator,
357 language_detector,
358 ).await {
359 Ok(Some(file_info)) => {
360 results.push(file_info);
361 }
362 Ok(None) => {
363 }
365 Err(err) => {
366 log::debug!("Error processing file {}: {}", path.display(), err);
367 self.stats.errors_encountered.fetch_add(1, Ordering::Relaxed);
368 }
369 }
370 }
371
372 Ok(results)
373 }
374
375 async fn process_single_file(
377 &self,
378 path: &Path,
379 options: &ScanOptions,
380 metadata_extractor: Option<&MetadataExtractor>,
381 content_analyzer: Option<&ContentAnalyzer>,
382 git_integrator: Option<&GitIntegrator>,
383 language_detector: &LanguageDetector,
384 ) -> Result<Option<FileInfo>> {
385 if !path.exists() {
387 return Ok(None);
388 }
389
390 let metadata = std::fs::metadata(path)?;
391
392 if let Some(max_size) = options.max_file_size {
394 if metadata.len() > max_size {
395 log::debug!("Skipping large file: {} ({} bytes)", path.display(), metadata.len());
396 return Ok(None);
397 }
398 }
399
400 let language = language_detector.detect_language(path);
402
403 if self.is_likely_binary(path, &language) {
405 self.stats.binary_files_skipped.fetch_add(1, Ordering::Relaxed);
406 return Ok(None);
407 }
408
409 let relative_path = path.to_string_lossy().to_string();
411
412 let file_type = FileInfo::classify_file_type(&relative_path, &language,
413 path.extension().and_then(|e| e.to_str()).unwrap_or(""));
414
415 let mut file_info = FileInfo {
416 path: path.to_path_buf(),
417 relative_path,
418 size: metadata.len(),
419 modified: metadata.modified().ok(),
420 decision: RenderDecision::include("scanned file"),
421 file_type,
422 language,
423 content: None,
424 token_estimate: None,
425 line_count: None,
426 char_count: None,
427 is_binary: false, git_status: None,
429 };
430
431 if let Some(extractor) = metadata_extractor {
433 if let Ok(file_metadata) = extractor.extract_metadata(path).await {
434 file_info.size = file_metadata.size;
435 }
437 }
438
439 if let Some(analyzer) = content_analyzer {
441 if let Ok(content_stats) = analyzer.analyze_file(path).await {
442 }
445 }
446
447 if let Some(git) = git_integrator {
449 if let Ok(git_info) = git.get_file_info(path).await {
450 file_info.git_status = Some(GitStatus {
452 working_tree: git_info.status,
453 index: GitFileStatus::Unmodified,
454 });
455 }
456 }
457
458 self.stats.files_processed.fetch_add(1, Ordering::Relaxed);
459 Ok(Some(file_info))
460 }
461
462 fn should_include_file(&self, path: &Path, options: &ScanOptions) -> bool {
464 let extension = path.extension()
465 .and_then(|ext| ext.to_str())
466 .unwrap_or("")
467 .to_lowercase();
468
469 if let Some(ref exclude) = options.exclude_extensions {
471 if exclude.iter().any(|ext| ext.to_lowercase() == extension) {
472 return false;
473 }
474 }
475
476 if let Some(ref include) = options.include_extensions {
478 return include.iter().any(|ext| ext.to_lowercase() == extension);
479 }
480
481 true
482 }
483
484 fn is_likely_binary(&self, path: &Path, language: &Language) -> bool {
486 if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
488 let binary_extensions = [
489 "bin", "exe", "dll", "so", "dylib", "a", "lib",
490 "obj", "o", "class", "jar", "war", "ear",
491 "png", "jpg", "jpeg", "gif", "bmp", "ico", "svg",
492 "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
493 "zip", "tar", "gz", "bz2", "rar", "7z",
494 "mp3", "mp4", "avi", "mkv", "mov", "wmv",
495 "ttf", "otf", "woff", "woff2",
496 ];
497
498 if binary_extensions.contains(&extension.to_lowercase().as_str()) {
499 return true;
500 }
501 }
502
503 matches!(language, Language::Unknown)
506 }
507
508 pub fn files_processed(&self) -> usize {
510 self.stats.files_processed.load(Ordering::Relaxed)
511 }
512
513 pub fn directories_traversed(&self) -> usize {
515 self.stats.directories_traversed.load(Ordering::Relaxed)
516 }
517
518 pub fn binary_files_skipped(&self) -> usize {
520 self.stats.binary_files_skipped.load(Ordering::Relaxed)
521 }
522
523 pub fn errors_encountered(&self) -> usize {
525 self.stats.errors_encountered.load(Ordering::Relaxed)
526 }
527}
528
529impl Default for Scanner {
530 fn default() -> Self {
531 Self::new()
532 }
533}
534
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;
    use std::fs;
    use tempfile::TempDir;

    /// Returns the final path component of a scanned file as an owned string.
    fn file_name(info: &FileInfo) -> String {
        info.path
            .file_name()
            .map(|name| name.to_string_lossy().into_owned())
            .unwrap_or_default()
    }

    /// Returns the extension of a scanned file as an owned string.
    fn extension(info: &FileInfo) -> String {
        info.path
            .extension()
            .map(|ext| ext.to_string_lossy().into_owned())
            .unwrap_or_default()
    }

    #[tokio::test]
    async fn test_scanner_creation() {
        let scanner = Scanner::new();
        assert_eq!(scanner.files_processed(), 0);
        assert_eq!(scanner.directories_traversed(), 0);
    }

    #[tokio::test]
    async fn test_scan_empty_directory() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        let options = ScanOptions::default();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        assert!(results.is_empty());
    }

    #[tokio::test]
    async fn test_scan_with_files() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        let rust_file = temp_dir.path().join("test.rs");
        let python_file = temp_dir.path().join("test.py");
        let binary_file = temp_dir.path().join("test.bin");

        fs::write(&rust_file, "fn main() { println!(\"Hello, world!\"); }").unwrap();
        fs::write(&python_file, "print('Hello, world!')").unwrap();
        fs::write(&binary_file, [0u8; 256]).unwrap();

        let options = ScanOptions::default();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        // The `.bin` file is skipped as binary, leaving the two source files.
        assert_eq!(results.len(), 2);

        let rust_file_info = results
            .iter()
            .find(|f| file_name(f) == "test.rs")
            .expect("test.rs should be scanned");
        assert_eq!(rust_file_info.language, Language::Rust);

        let python_file_info = results
            .iter()
            .find(|f| file_name(f) == "test.py")
            .expect("test.py should be scanned");
        assert_eq!(python_file_info.language, Language::Python);
    }

    #[tokio::test]
    async fn test_scan_options_extension_filtering() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        fs::write(temp_dir.path().join("test.rs"), "fn main() {}").unwrap();
        fs::write(temp_dir.path().join("test.py"), "print('hello')").unwrap();
        fs::write(temp_dir.path().join("test.js"), "console.log('hello')").unwrap();

        let options = ScanOptions::default()
            .with_include_extensions(vec!["rs".to_string(), "py".to_string()]);
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        assert_eq!(results.len(), 2);
        assert!(results.iter().any(|f| extension(f) == "rs"));
        assert!(results.iter().any(|f| extension(f) == "py"));
        assert!(!results.iter().any(|f| extension(f) == "js"));
    }

    #[tokio::test]
    async fn test_scan_options_extension_exclusion() {
        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        fs::write(temp_dir.path().join("test.rs"), "fn main() {}").unwrap();
        fs::write(temp_dir.path().join("test.py"), "print('hello')").unwrap();
        fs::write(temp_dir.path().join("test.js"), "console.log('hello')").unwrap();

        let options = ScanOptions::default().with_exclude_extensions(vec!["js".to_string()]);
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();

        assert_eq!(results.len(), 2);
        assert!(results.iter().all(|f| extension(f) != "js"));
    }

    #[tokio::test]
    async fn test_parallel_processing() {
        const FILE_COUNT: usize = 150;

        let scanner = Scanner::new();
        let temp_dir = TempDir::new().unwrap();

        for i in 0..FILE_COUNT {
            let file_path = temp_dir.path().join(format!("test_{}.rs", i));
            fs::write(&file_path, format!("fn main_{i}() {{}}")).unwrap();
        }

        let options = ScanOptions::default()
            .with_parallel_processing(true)
            .with_max_concurrency(4);

        let start = Instant::now();
        let results = scanner.scan(temp_dir.path(), options).await.unwrap();
        let duration = start.elapsed();

        assert_eq!(results.len(), FILE_COUNT);
        log::info!("Parallel scan of 150 files took: {:?}", duration);

        // One set lookup per expected file instead of rescanning the results
        // for every index.
        let found: HashSet<String> = results.iter().map(file_name).collect();
        for i in 0..FILE_COUNT {
            assert!(found.contains(&format!("test_{}.rs", i)), "missing test_{}.rs", i);
        }
    }

    #[test]
    fn test_scan_options_builder() {
        let options = ScanOptions::default()
            .with_parallel_processing(true)
            .with_max_concurrency(8)
            .with_metadata_extraction(true)
            .with_content_analysis(true)
            .with_git_integration(false)
            .with_follow_symlinks(false)
            .with_include_hidden(true)
            .with_max_file_size(Some(1024 * 1024))
            .with_include_extensions(vec!["rs".to_string()])
            .with_exclude_extensions(vec!["lock".to_string()]);

        assert!(options.parallel_processing);
        assert_eq!(options.max_concurrency, 8);
        assert!(options.metadata_extraction);
        assert!(options.content_analysis);
        assert!(!options.git_integration);
        assert!(!options.follow_symlinks);
        assert!(options.include_hidden);
        assert_eq!(options.max_file_size, Some(1024 * 1024));
        assert_eq!(options.include_extensions, Some(vec!["rs".to_string()]));
        assert_eq!(options.exclude_extensions, Some(vec!["lock".to_string()]));
    }

    #[test]
    fn test_binary_file_detection() {
        let scanner = Scanner::new();

        // Known binary extensions are always reported as binary.
        assert!(scanner.is_likely_binary(Path::new("test.exe"), &Language::Unknown));
        assert!(scanner.is_likely_binary(Path::new("test.png"), &Language::Unknown));
        assert!(scanner.is_likely_binary(Path::new("test.pdf"), &Language::Unknown));

        // The extension check is case-insensitive and independent of language.
        assert!(scanner.is_likely_binary(Path::new("PHOTO.PNG"), &Language::Rust));

        // Recognised source languages with non-binary extensions are text.
        assert!(!scanner.is_likely_binary(Path::new("test.rs"), &Language::Rust));
        assert!(!scanner.is_likely_binary(Path::new("test.py"), &Language::Python));
        assert!(!scanner.is_likely_binary(Path::new("test.md"), &Language::Markdown));
    }
}