// st/scanner.rs
//
// -----------------------------------------------------------------------------
// WELCOME TO THE JUNGLE! ...The filesystem jungle, that is. 🌴
//
// You've found scanner.rs, the intrepid explorer and engine room of st.
// This module is the Indiana Jones of our codebase. It bravely dives into
// the deepest, darkest directories, dodges `.gitignore` traps, inspects
// every file for treasure (metadata), and reports back its findings.
//
// So grab your hat, and let's go on an adventure!
//
// Brought to you by The Cheet - making filesystem traversal a rock concert! 🎸
// -----------------------------------------------------------------------------
//

16use crate::interest_calculator::InterestCalculator;
17use crate::scanner_interest::{ChangeType, InterestScore, TraversalContext};
18use crate::scanner_safety::{estimate_node_size, ScannerSafetyLimits, ScannerSafetyTracker};
19use crate::scanner_state::ScanState;
20use crate::security_scan::{SecurityFinding, SecurityScanner};
21use anyhow::Result;
22use globset::{Glob, GlobSet, GlobSetBuilder}; // For powerful gitignore-style pattern matching.
23use regex::Regex; // For user-defined find patterns.
24use std::collections::{HashMap, HashSet}; // Our trusty hash-based collections.
25use std::fs; // Filesystem operations, the bread and butter here.
26use std::io::{BufRead, BufReader}; // For efficient reading, especially for content search.
27use std::path::{Path, PathBuf}; // Path manipulation is key.
28use std::sync::mpsc; // For streaming results from a worker thread.
29use std::time::SystemTime; // To know when files were last touched.
30use walkdir::{DirEntry, WalkDir}; // The excellent `walkdir` crate does the actual directory walking.
31
32// Unix-specific imports for richer metadata like permissions, UID, GID.
33// On other platforms, we'll use sensible defaults.
34#[cfg(unix)]
35use std::os::unix::fs::{MetadataExt, PermissionsExt};
36
/// # FileNode: The Ultimate Backstage Pass
///
/// Every file and directory we meet gets one of these. It's a VIP pass that
/// holds all the juicy details: its name, size, when it was last cool (modified),
/// and whether it's on the super-secret "ignored" list. It's the atom of our
/// `st` universe.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FileNode {
    /// The full path to the file or directory. The source of truth for location!
    pub path: PathBuf,
    /// Is it a directory? `true` if yes, `false` if it's a file or symlink.
    pub is_dir: bool,
    /// Size of the file in bytes. For directories, this is often 0 or metadata-dependent.
    pub size: u64,
    /// File permissions (e.g., `rwxr-xr-x`). Stored as a u32, typically from Unix mode.
    /// On non-Unix platforms this holds a sensible default instead.
    pub permissions: u32,
    /// User ID of the owner (Unix-specific; defaulted on other platforms).
    pub uid: u32,
    /// Group ID of the owner (Unix-specific; defaulted on other platforms).
    pub gid: u32,
    /// Timestamp of the last modification. Tells us how fresh or ancient a file is.
    pub modified: SystemTime,
    /// Is it a symbolic link? `true` if yes. We handle these with care.
    pub is_symlink: bool,
    /// Is it a hidden file (e.g., starts with a `.` on Unix)?
    pub is_hidden: bool,
    /// Did we encounter a "Permission Denied" error when trying to access this?
    /// Important for gracefully handling parts of the filesystem we can't read.
    pub permission_denied: bool,
    /// Is this file or directory ignored based on `.gitignore` or default ignore rules?
    pub is_ignored: bool,
    /// The depth of this entry relative to the scan root (root is depth 0).
    pub depth: usize,
    /// The specific type of the file (e.g., RegularFile, Symlink, Executable).
    pub file_type: FileType,
    /// A category assigned based on extension or name, used for coloring and context.
    /// (e.g., Rust, Python, Image, Archive).
    pub category: FileCategory,
    /// For content search: Information about where matches were found.
    /// `None` if no search was performed or no matches.
    pub search_matches: Option<SearchMatches>,
    /// The filesystem type this file resides on.
    pub filesystem_type: FilesystemType,
    /// Git branch if this directory contains a .git folder.
    pub git_branch: Option<String>,

    // --- Smart Scanning Fields (Phase 2: Intelligent Context-Aware Scanning) ---
    // These fields enable "surface what matters" scanning. They are all optional
    // and skipped during serialization when absent, keeping plain scans compact.

    /// How we reached this location (direct, symlink, mount, dependency).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub traversal_context: Option<TraversalContext>,

    /// Interest score - how relevant is this file right now?
    #[serde(skip_serializing_if = "Option::is_none")]
    pub interest: Option<InterestScore>,

    /// Security findings detected during scan.
    /// `default` lets deserialization of older payloads (without this field) succeed.
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub security_findings: Vec<SecurityFinding>,

    /// Change status since last scan (Added, Modified, Deleted, etc.).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub change_status: Option<ChangeType>,

    /// Content hash for change detection (Blake3/SHA256).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content_hash: Option<String>,
}
106
/// Information about search matches within a file.
///
/// Produced when a content search (`ScannerConfig::search_keyword`) is active;
/// attached to a `FileNode` via `search_matches`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct SearchMatches {
    /// First match position (line, column).
    pub first_match: (usize, usize),
    /// Total number of matches found.
    pub total_count: usize,
    /// List of all match positions (line, column) - limited to prevent memory issues.
    pub positions: Vec<(usize, usize)>,
    /// Whether the search was truncated due to too many matches.
    /// When `true`, `positions` holds fewer entries than `total_count`.
    pub truncated: bool,
    /// Line content for each match (line number, line content, column) - optional for compatibility.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub line_content: Option<Vec<(usize, String, usize)>>,
}
122
/// # FileType: Distinguishing Different Kinds of Filesystem Objects
///
/// This enum helps us categorize entries beyond just "file" or "directory".
/// It's especially useful on Unix-like systems where you have sockets, pipes, etc.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub enum FileType {
    Directory,   // A folder, a container of other things.
    RegularFile, // Your everyday, garden-variety file.
    Symlink,     // A pointer to another file or directory.
    Executable,  // A file that can be run (has execute permissions).
    Socket,      // A Unix domain socket.
    Pipe,        // A named pipe (FIFO).
    BlockDevice, // A block special file (e.g., /dev/sda).
    CharDevice,  // A character special file (e.g., /dev/tty).
}
138
139/// # FilesystemType: Identifying the underlying filesystem
140///
141/// This enum represents different filesystem types with single-character codes
142/// for compact display. The mapping is designed to be memorable and intuitive.
143#[derive(Debug, Clone, Copy, PartialEq, serde::Serialize, serde::Deserialize)]
144pub enum FilesystemType {
145 Ext4, // '4' - The most common Linux filesystem
146 Ext3, // '3' - Older ext filesystem
147 Ext2, // '2' - Even older ext filesystem
148 Xfs, // 'X' - XFS filesystem
149 Btrfs, // 'B' - Btrfs (B-tree filesystem)
150 Zfs, // 'Z' - ZFS filesystem
151 Ntfs, // 'N' - Windows NTFS
152 Fat32, // 'F' - FAT32
153 ExFat, // 'E' - exFAT
154 Apfs, // 'A' - Apple File System
155 Hfs, // 'H' - HFS+ (older Mac)
156 Nfs, // 'R' - Remote NFS mount
157 Smb, // 'S' - SMB/CIFS network filesystem
158 Tmpfs, // 'T' - Temporary filesystem (RAM)
159 Procfs, // 'P' - /proc virtual filesystem
160 Sysfs, // 'Y' - /sys virtual filesystem
161 Devfs, // 'D' - /dev virtual filesystem
162 Mem8, // 'M' - MEM|8 filesystem (Coming soon - Quantum File System) - https://m8.is
163 Unknown, // '?' - Unknown filesystem
164}
165
166impl FilesystemType {
167 /// Get the single-character code for this filesystem type
168 pub fn to_char(&self) -> char {
169 match self {
170 FilesystemType::Ext4 => '4',
171 FilesystemType::Ext3 => '3',
172 FilesystemType::Ext2 => '2',
173 FilesystemType::Xfs => 'X',
174 FilesystemType::Btrfs => 'B',
175 FilesystemType::Zfs => 'Z',
176 FilesystemType::Ntfs => 'N',
177 FilesystemType::Fat32 => 'F',
178 FilesystemType::ExFat => 'E',
179 FilesystemType::Apfs => 'A',
180 FilesystemType::Hfs => 'H',
181 FilesystemType::Nfs => 'R',
182 FilesystemType::Smb => 'S',
183 FilesystemType::Tmpfs => 'T',
184 FilesystemType::Procfs => 'P',
185 FilesystemType::Sysfs => 'Y',
186 FilesystemType::Devfs => 'D',
187 FilesystemType::Mem8 => 'M',
188 FilesystemType::Unknown => '?',
189 }
190 }
191
192 /// Check if this is a virtual filesystem that should be skipped
193 pub fn is_virtual(&self) -> bool {
194 matches!(
195 self,
196 FilesystemType::Procfs
197 | FilesystemType::Sysfs
198 | FilesystemType::Devfs
199 | FilesystemType::Tmpfs
200 )
201 }
202
203 /// Check if this filesystem type should be shown by default
204 /// (only "interesting" filesystems based on platform)
205 pub fn should_show_by_default(&self) -> bool {
206 #[cfg(target_os = "linux")]
207 {
208 matches!(
209 self,
210 FilesystemType::Ext4
211 | FilesystemType::Ext3
212 | FilesystemType::Xfs
213 | FilesystemType::Btrfs
214 | FilesystemType::Zfs
215 | FilesystemType::Nfs
216 | FilesystemType::Smb
217 | FilesystemType::Mem8
218 )
219 }
220 #[cfg(target_os = "macos")]
221 {
222 matches!(
223 self,
224 FilesystemType::Apfs
225 | FilesystemType::Hfs
226 | FilesystemType::Nfs
227 | FilesystemType::Smb
228 | FilesystemType::Mem8
229 )
230 }
231 #[cfg(target_os = "windows")]
232 {
233 matches!(
234 self,
235 FilesystemType::Ntfs
236 | FilesystemType::Fat32
237 | FilesystemType::ExFat
238 | FilesystemType::Mem8
239 )
240 }
241 #[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
242 {
243 // Show all non-virtual filesystems on other platforms
244 !self.is_virtual()
245 }
246 }
247}
248
/// # FileCategory: Adding Semantic Flavor to Files
///
/// This enum provides a higher-level categorization based on common file extensions
/// or names. It's primarily used for display purposes, like coloring output,
/// and can also help in understanding the nature of a directory's contents.
/// See `Scanner::get_file_category` for the mapping heuristics.
/// Trish loves how this makes the tree output more intuitive!
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub enum FileCategory {
    // --- Programming Languages ---
    Rust,       // .rs
    Python,     // .py, .pyw, .pyx, .pyi
    JavaScript, // .js, .mjs, .cjs
    TypeScript, // .ts, .tsx
    Java,       // .java, .class, .jar
    C,          // .c, .h
    Cpp,        // .cpp, .cc, .cxx, .hpp, .hxx
    Go,         // .go
    Ruby,       // .rb
    PHP,        // .php - Not sure php is programming.
    Shell,      // .sh, .bash, .zsh, .fish (plus .ps1/.bat/.cmd in practice)

    // --- Markup & Data Formats ---
    Markdown, // .md, .markdown
    Html,     // .html, .htm
    Css,      // .css, .scss, .sass, .less
    Json,     // .json, .jsonc
    Yaml,     // .yaml, .yml
    Xml,      // .xml, .svg (SVG is XML-based)
    Toml,     // .toml
    Csv,      // .csv

    // --- Build Systems & Configuration ---
    Makefile,   // Makefile, makefile, GNUmakefile
    Dockerfile, // Dockerfile, .dockerfile
    GitConfig,  // .gitignore, .gitconfig, .gitmodules

    // --- Archives & Compressed Files ---
    Archive, // .zip, .tar, .gz, .bz2, .xz, .7z, .rar

    // --- Media Files ---
    Image, // .jpg, .jpeg, .png, .gif, .bmp, .ico, .webp
    Video, // .mp4, .avi, .mkv, .mov, .wmv, .flv, .webm
    Audio, // .mp3, .wav, .flac, .aac, .ogg, .wma

    // --- System & Binary Files ---
    SystemFile, // Special system files like swap.img, vmlinuz
    Binary,     // Executables, shared libraries (.exe, .dll, .so, .dylib, .o, .a)

    // --- Database ---
    Database, // .db, .sqlite, .mdb, .accdb, .dbf

    // --- Office & Documents ---
    Office,      // .doc, .docx, .odt
    Spreadsheet, // .xls, .xlsx, .ods, .csv
    PowerPoint,  // .ppt, .pptx, .odp
    Pdf,         // .pdf
    Ebook,       // .epub, .mobi, .azw

    // --- Text Variants ---
    Log,     // .log
    Config,  // .ini, .cfg, .conf, .env, .properties
    License, // LICENSE, COPYING files
    Readme,  // README files
    Txt,     // .txt
    Rtf,     // .rtf

    // --- Security & Crypto ---
    Certificate, // .crt, .cert, .pem, .key
    Encrypted,   // .gpg, .pgp, .aes

    // --- Fonts ---
    Font, // .ttf, .otf, .woff, .woff2

    // --- Virtual & Disk Images ---
    DiskImage, // .img, .iso, .vdi, .vmdk, .vhd, .dd, .dmg

    // --- 3D & CAD ---
    Model3D, // .obj, .stl, .dae, .fbx, .blend

    // --- Scientific & Data ---
    Jupyter, // .ipynb
    RData,   // .rdata, .rds
    Matlab,  // .m, .mat

    // --- Web Assets ---
    WebAsset, // .wasm, .map

    // --- Package & Dependencies ---
    Package, // package.json, Cargo.toml, requirements.txt, etc.
    Lock,    // package-lock.json, Cargo.lock, yarn.lock

    // --- Testing ---
    Test, // Files with test_, _test, .test, .spec patterns

    // --- Memory Files (Our special type!) ---
    Memory, // .mem8, .m8 - MEM|8 memory files

    // --- Others ---
    Backup,  // .bak, .backup, ~
    Temp,    // .tmp, .temp, .swp
    Unknown, // If we can't categorize it, it's a mysterious Unknown!
}
351
/// # TreeStats: The Final Scoreboard
///
/// After the concert is over, this is where we see how we did. It's the
/// scoreboard that tracks total files, total directories, the biggest hits
/// (largest files), and more. It's the answer to "So, how was the show?"
#[derive(Debug, Default)]
pub struct TreeStats {
    /// Total number of files encountered (excluding directories).
    pub total_files: u64,
    /// Total number of directories encountered.
    pub total_dirs: u64,
    /// Total size of all files (in bytes).
    pub total_size: u64,
    /// A map of file extensions to their counts (e.g., {"rs": 10, "toml": 2}).
    pub file_types: HashMap<String, u64>,
    /// Top N largest files found as `(size, path)` tuples. N is usually 10.
    pub largest_files: Vec<(u64, PathBuf)>,
    /// Top N newest files found as `(modification time, path)` tuples.
    pub newest_files: Vec<(SystemTime, PathBuf)>,
    /// Top N oldest files found as `(modification time, path)` tuples.
    pub oldest_files: Vec<(SystemTime, PathBuf)>,
}
374
375impl TreeStats {
376 /// Updates the statistics based on a newly processed `FileNode`.
377 /// This method is called for each non-permission-denied node.
378 pub fn update_file(&mut self, node: &FileNode) {
379 if node.is_dir {
380 self.total_dirs += 1;
381 } else {
382 // It's a file!
383 self.total_files += 1;
384 self.total_size += node.size;
385
386 // Track file extensions for type distribution.
387 if let Some(ext) = node.path.extension() {
388 if let Some(ext_str) = ext.to_str() {
389 *self.file_types.entry(ext_str.to_string()).or_insert(0) += 1;
390 }
391 }
392
393 // --- Update Top N Lists ---
394 // These lists are kept sorted and truncated to maintain a fixed size (e.g., top 10).
395
396 // Update largest files: Add, sort by size (desc), truncate.
397 self.largest_files.push((node.size, node.path.clone()));
398 self.largest_files.sort_by(|a, b| b.0.cmp(&a.0)); // Largest first
399 self.largest_files.truncate(10); // Keep only the top 10
400
401 // Update newest files: Add, sort by modification time (desc), truncate.
402 self.newest_files.push((node.modified, node.path.clone()));
403 self.newest_files.sort_by(|a, b| b.0.cmp(&a.0)); // Newest first
404 self.newest_files.truncate(10);
405
406 // Update oldest files: Add, sort by modification time (asc), truncate.
407 self.oldest_files.push((node.modified, node.path.clone()));
408 self.oldest_files.sort_by(|a, b| a.0.cmp(&b.0)); // Oldest first
409 self.oldest_files.truncate(10);
410 }
411 }
412}
413
/// # ScannerConfig: The Rider for our Rock Star Scanner
///
/// This is the list of demands for our scanner. "Don't show me hidden files,"
/// "I only want to see files bigger than a tour bus," "Ignore the messy backstage
/// area (`.gitignore`)." We build this from the user's command-line arguments
/// to make sure the scanner puts on the exact show the user wants to see.
#[derive(Default, Clone)]
pub struct ScannerConfig {
    /// Maximum depth to traverse into subdirectories.
    pub max_depth: usize,
    /// Should symbolic links be followed? (Currently always `false`).
    pub follow_symlinks: bool,
    /// Should `.gitignore` files be respected?
    pub respect_gitignore: bool,
    /// Should hidden files (starting with `.`) be shown?
    pub show_hidden: bool,
    /// Should ignored files/directories be shown (usually in brackets)?
    pub show_ignored: bool,
    /// An optional regex pattern to filter files/directories by name.
    pub find_pattern: Option<Regex>,
    /// An optional file extension to filter by (e.g., "rs").
    pub file_type_filter: Option<String>,
    /// Optional entry type filter ("f" for files, "d" for directories).
    pub entry_type_filter: Option<String>,
    /// Optional minimum file size filter (bytes).
    pub min_size: Option<u64>,
    /// Optional maximum file size filter (bytes).
    pub max_size: Option<u64>,
    /// Optional filter for files newer than a specific date.
    pub newer_than: Option<SystemTime>,
    /// Optional filter for files older than a specific date.
    pub older_than: Option<SystemTime>,
    /// Should the scanner use its built-in list of default ignore patterns
    /// (like `node_modules`, `__pycache__`, `target/`)?
    pub use_default_ignores: bool,
    /// An optional keyword to search for within file contents.
    pub search_keyword: Option<String>,
    /// Should filesystem type indicators be shown?
    pub show_filesystems: bool,
    /// Sort field for results (name, size, date, type).
    pub sort_field: Option<String>,
    /// Limit results to top N entries (useful with sort).
    pub top_n: Option<usize>,
    /// Include actual line content in search results (for AI/MCP use).
    pub include_line_content: bool,

    // --- Smart Scanning Options (Phase 2: Intelligent Context-Aware Scanning) ---

    /// Compute interest scores for each node (default: true when smart mode is enabled).
    pub compute_interest: bool,
    /// Perform security scanning during traversal (default: true).
    pub security_scan: bool,
    /// Minimum interest score to include in results (0.0-1.0, default: 0.0).
    pub min_interest: f32,
    /// Track how we reached each location (symlink, mount, etc.).
    pub track_traversal: bool,
    /// Only show changes since last scan.
    pub changes_only: bool,
    /// Path to previous state file for comparison (or auto-detect from ~/.st/scan_states/).
    pub compare_state: Option<PathBuf>,
    /// Enable smart mode - groups by interest, shows changes, minimal output.
    pub smart_mode: bool,
}
477
// --- Default Ignore Patterns: The "Please Don't Play These Songs" List ---
// Every band has songs they'd rather not play. This is our list of files and
// directories (`node_modules`, `target/`, etc.) that we usually skip to keep
// the show clean and focused on the hits. A tidy tree is a happy tree!
//
// NOTE: each pattern appears exactly once. Duplicates are harmless to the glob
// matcher but are noise for maintainers (previously `.pytest_cache`, `.tox`,
// `.coverage` and `.sass-cache` were listed twice).
const DEFAULT_IGNORE_PATTERNS: &[&str] = &[
    // Version control systems (but not all hidden dirs like .ssh)
    ".git",
    ".svn",
    ".hg",
    ".bzr",
    "_darcs",
    // Python artifacts and tool caches
    "__pycache__",
    "*.pyc",
    "*.pyo",
    "*.pyd",
    ".Python",
    ".pytest_cache",
    ".tox",
    ".coverage",
    "*.egg-info",
    ".eggs",
    // Node.js / JavaScript artifacts
    "node_modules",
    ".npm",
    ".yarn",
    ".pnpm-store",
    "bower_components",
    ".next",
    ".nuxt",
    // General cache directories often found in projects
    ".cache", // Common cache dir name
    // Virtual environments
    "venv",
    "env",
    "ENV",
    "virtualenv",
    ".venv",
    ".env",
    "conda-meta",
    // Build/compilation artifacts from various languages/systems
    "target", // Rust
    "build",
    "dist",
    "out",
    "bin",
    "obj", // Common build output dirs
    "*.o",
    "*.a",
    "*.so",
    "*.dll",
    "*.dylib", // Object files, libraries
    // Package manager caches/data
    ".cargo",
    ".rustup", // Rust
    ".gem",
    ".bundle", // Ruby
    // IDEs and editor-specific files/directories
    ".idea",
    ".vscode",
    ".vs", // Common IDE metadata
    "*.swp",
    "*.swo",
    "*~", // Vim/editor backup/swap files
    ".project",
    ".classpath",
    ".settings", // Eclipse/Java
    // Development tool caches (Python-specific ones live in the section above)
    ".mypy_cache",
    ".ruff_cache",
    ".hypothesis",
    ".sass-cache", // Sass CSS preprocessor
    "__MACOSX",    // macOS archive metadata
    // OS-specific junk files
    ".DS_Store",    // macOS
    "Thumbs.db",    // Windows
    "desktop.ini",  // Windows
    "$RECYCLE.BIN", // Windows recycle bin
    // Common temporary file/directory names and patterns
    "tmp",
    "temp",
    ".tmp",
    ".temp",
    "*.tmp",
    "*.temp",
    // System directories that are almost never useful to traverse deeply from a user's project root.
    // These are more aggressively ignored if `st` is run on `/`.
    // "proc", "sys", "dev", "lost+found", "mnt", "media", // Handled by DEFAULT_SYSTEM_PATHS
    // Other common ignores
    ".vagrant",
    ".terraform",
];
575
// Default paths that are almost always too noisy or problematic to scan,
// especially if `st` is run from `/` or a very high-level directory.
// These are typically mount points for virtual filesystems or system-critical areas.
// Collected into `Scanner::system_paths` by `Scanner::new` when
// `use_default_ignores` is set.
const DEFAULT_SYSTEM_PATHS: &[&str] = &[
    "/proc",
    "/sys",
    "/dev",
    "/run",
    "/tmp",
    "/var/tmp",
    "/lost+found",
    "/mnt",
    "/media",
    "/snap", // Common mount points or special dirs
];
591
// Specific individual files (absolute paths) that should always be ignored
// due to their special nature (e.g., virtual files representing system memory).
// Reading these can hang or balloon memory, so they are excluded outright.
const DEFAULT_IGNORE_FILES: &[&str] = &[
    "/proc/kcore",    // Virtual file representing physical memory, can be huge & slow.
    "/proc/kmsg",     // Kernel messages, can be an infinite stream.
    "/proc/kallsyms", // Kernel symbols, can be large.
];
599
/// # Scanner: The Rock Star of our Show
///
/// BEHOLD! The `Scanner` itself! This is the main act. It takes the config,
/// the ignore lists, and a path, and it puts on a spectacular show of directory
/// traversal. It's fast, it's smart, and it knows all the best moves.
pub struct Scanner {
    /// The configuration for this scanning operation.
    config: ScannerConfig,
    /// Compiled `GlobSet` from `.gitignore` files, if respected and found.
    gitignore: Option<GlobSet>,
    /// Compiled `GlobSet` from our `DEFAULT_IGNORE_PATTERNS`.
    default_ignores: Option<GlobSet>,
    /// A set of absolute system paths to ignore (e.g., /proc, /sys).
    /// Populated from `DEFAULT_SYSTEM_PATHS` when default ignores are enabled.
    system_paths: HashSet<PathBuf>,
    /// A set of specific absolute file paths to ignore (e.g., /proc/kcore).
    /// Populated from `DEFAULT_IGNORE_FILES` when default ignores are enabled.
    ignore_files: HashSet<PathBuf>,
    /// The root path from which the scan originates.
    root: PathBuf,
    /// Safety limits to prevent crashes on large directories.
    safety_limits: ScannerSafetyLimits,

    // --- Smart Scanning Components (Phase 4) ---

    /// Security scanner for detecting supply chain attack patterns.
    security_scanner: Option<SecurityScanner>,
    /// Interest calculator for scoring file relevance.
    interest_calculator: Option<InterestCalculator>,
}
628
629impl Scanner {
    /// Returns the root path this scanner operates on.
    ///
    /// Normally this is the canonicalized form of the path passed to
    /// `Scanner::new`; if canonicalization failed there, it falls back to a
    /// best-effort absolute (or original) path — see the fallback chain in
    /// the constructor.
    pub fn root(&self) -> &Path {
        &self.root
    }
634
635 /// Quick scan for basic project analysis - lighter weight than full scan
636 /// Returns only basic stats and key files for faster integration
637 pub fn quick_scan(&self) -> Result<(Vec<FileNode>, TreeStats)> {
638 let mut config = self.config.clone();
639 config.max_depth = 3; // Limit depth for quick scan
640
641 let quick_scanner = Scanner::new(&self.root, config)?;
642 quick_scanner.scan()
643 }
644
645 /// Find files modified within a specific time range
646 /// Useful for finding recent activity in projects
647 pub fn find_recent_files(&self, hours_ago: u64) -> Result<Vec<FileNode>> {
648 let cutoff_time =
649 std::time::SystemTime::now() - std::time::Duration::from_secs(hours_ago * 3600);
650
651 let (nodes, _) = self.scan()?;
652 Ok(nodes
653 .into_iter()
654 .filter(|node| !node.is_dir && node.modified > cutoff_time)
655 .collect())
656 }
657
658 /// Get key project files (build configs, main files, etc.)
659 /// Returns a filtered list of important files for project analysis
660 pub fn find_key_files(&self) -> Result<Vec<FileNode>> {
661 let (nodes, _) = self.scan()?;
662
663 let important_patterns = [
664 "main.rs",
665 "lib.rs",
666 "mod.rs",
667 "package.json",
668 "Cargo.toml",
669 "requirements.txt",
670 "pyproject.toml",
671 "README.md",
672 "LICENSE",
673 "Makefile",
674 "CMakeLists.txt",
675 "index.js",
676 "app.js",
677 "server.js",
678 "main.js",
679 "main.py",
680 "__init__.py",
681 "setup.py",
682 "go.mod",
683 "main.go",
684 "pom.xml",
685 "build.gradle",
686 "build.xml",
687 ".gitignore",
688 "docker-compose.yml",
689 "Dockerfile",
690 ];
691
692 Ok(nodes
693 .into_iter()
694 .filter(|node| {
695 if node.is_dir {
696 return false;
697 }
698
699 let file_name = node.path.file_name().and_then(|n| n.to_str()).unwrap_or("");
700
701 important_patterns.contains(&file_name)
702 })
703 .collect())
704 }
705
    /// ## `get_file_category`
    /// Determines a `FileCategory` for a given path and `FileType`.
    ///
    /// Heuristics are applied in priority order:
    /// 1. Directories always get `Unknown` (their contents define them).
    /// 2. A few special system file names (`swap.img`, `vmlinuz*`, ...).
    /// 3. The lowercased file extension (the common case).
    /// 4. For extension-less files: test-file name patterns, then well-known
    ///    filenames (`Makefile`, `LICENSE`, lock files, ...), then fallbacks.
    ///
    /// It's like a quick identification guide for files!
    fn get_file_category(path: &Path, file_type: FileType) -> FileCategory {
        // Directories don't get a specific content category here; their content defines them.
        if matches!(file_type, FileType::Directory) {
            return FileCategory::Unknown;
        }

        // First, check for some very specific system file names.
        // These take priority over extension matching (e.g., "swap.img" would
        // otherwise be categorized as DiskImage via the "img" extension).
        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
            if name == "swap.img"
                || name == "swapfile"
                || name.starts_with("vmlinuz")
                || name.starts_with("initrd")
            {
                return FileCategory::SystemFile;
            }
        }

        // Primary categorization is by file extension (matched case-insensitively).
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            match ext.to_lowercase().as_str() {
                // --- Programming Languages ---
                "rs" => FileCategory::Rust,
                "py" | "pyw" | "pyx" | "pyi" => FileCategory::Python,
                "js" | "mjs" | "cjs" => FileCategory::JavaScript,
                "ts" | "tsx" => FileCategory::TypeScript,
                "java" | "class" | "jar" => FileCategory::Java,
                "c" | "h" => FileCategory::C,
                "cpp" | "cc" | "cxx" | "hpp" | "hxx" => FileCategory::Cpp,
                "go" => FileCategory::Go,
                "rb" => FileCategory::Ruby,
                "php" => FileCategory::PHP,
                "sh" | "bash" | "zsh" | "fish" | "ps1" | "bat" | "cmd" => FileCategory::Shell,

                // --- Markup/Data ---
                "md" | "markdown" => FileCategory::Markdown,
                "html" | "htm" => FileCategory::Html,
                "css" | "scss" | "sass" | "less" => FileCategory::Css,
                "json" | "jsonc" | "geojson" => FileCategory::Json,
                "yaml" | "yml" => FileCategory::Yaml,
                "xml" | "svg" | "plist" | "kml" | "gpx" => FileCategory::Xml, // SVG and others are XML-based
                "toml" => FileCategory::Toml,

                // --- Build/Config (some are also by name) ---
                "dockerfile" => FileCategory::Dockerfile, // Extension variant
                // .gitignore, .gitconfig are usually by name, handled below

                // --- Archives ---
                "zip" | "tar" | "gz" | "tgz" | "bz2" | "tbz2" | "xz" | "txz" | "7z" | "rar" => {
                    FileCategory::Archive
                }

                // --- Media ---
                "jpg" | "jpeg" | "png" | "gif" | "bmp" | "ico" | "webp" | "tiff" | "tif"
                | "heic" | "heif" => FileCategory::Image,
                "mp4" | "avi" | "mkv" | "mov" | "wmv" | "flv" | "webm" | "mpeg" | "mpg" => {
                    FileCategory::Video
                }
                "mp3" | "wav" | "flac" | "aac" | "ogg" | "wma" | "m4a" => FileCategory::Audio,

                // --- Binary/Executable (some overlap with system, but these are common distributable/object formats) ---
                "exe" | "dll" | "so" | "dylib" | "o" | "a" | "lib" | "msi" | "deb" | "rpm"
                | "app" => FileCategory::Binary,

                // --- Database Files ---
                "db" | "sqlite" | "sqlitedb" | "sqlite3" | "db3" | "db4" | "db5" | "mdb"
                | "accdb" | "dbf" => FileCategory::Database,

                // --- Office & Documents ---
                "doc" | "docx" | "odt" | "rtf" => FileCategory::Office,
                "xls" | "xlsx" | "ods" | "csv" | "tsv" => FileCategory::Spreadsheet,
                "ppt" | "pptx" | "odp" => FileCategory::PowerPoint,
                "pdf" => FileCategory::Pdf,
                "epub" | "mobi" | "azw" | "azw3" | "fb2" => FileCategory::Ebook,

                // --- Text & Config Files ---
                "txt" | "text" => FileCategory::Txt,
                "log" => FileCategory::Log,
                "ini" | "cfg" | "conf" | "config" | "properties" | "env" => FileCategory::Config,

                // --- Security & Crypto ---
                "crt" | "cert" | "pem" | "key" | "pub" | "cer" | "der" => FileCategory::Certificate,
                "gpg" | "pgp" | "aes" | "enc" | "asc" => FileCategory::Encrypted,

                // --- Fonts ---
                "ttf" | "otf" | "woff" | "woff2" | "eot" | "fon" | "fnt" => FileCategory::Font,

                // --- Disk Images ---
                "img" | "vdi" | "vmdk" | "vhd" | "vhdx" | "dd" | "hdd" | "qcow" | "qcow2" => {
                    FileCategory::DiskImage
                }
                "iso" | "dmg" => FileCategory::DiskImage, // These can be both archives and disk images, but treating as disk images

                // --- 3D & CAD ---
                "obj" | "stl" | "dae" | "fbx" | "blend" | "3ds" | "ply" | "gltf" | "glb" => {
                    FileCategory::Model3D
                }

                // --- Scientific & Data ---
                "ipynb" => FileCategory::Jupyter,
                "rdata" | "rds" | "rda" => FileCategory::RData,
                // NOTE(review): "m" is claimed for Matlab here, but .m is also
                // Objective-C — confirm this is the intended precedence.
                "m" | "mat" | "mlx" => FileCategory::Matlab,

                // --- Web Assets ---
                "wasm" | "map" | "sourcemap" => FileCategory::WebAsset,

                // --- Memory Files (MEM|8!) ---
                "mem8" | "m8" | "mq" => FileCategory::Memory,

                // --- Backup & Temp ---
                "bak" | "backup" | "old" | "orig" => FileCategory::Backup,
                "tmp" | "temp" | "swp" | "swo" | "swn" => FileCategory::Temp,

                _ => FileCategory::Unknown, // Extension not recognized
            }
        } else {
            // No extension, or extension parsing failed. Try common filenames.
            if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
                // Check for test files.
                // NOTE(review): this branch only runs for extension-less names,
                // but a name containing ".test." or ".spec." always has an
                // extension, so those two conditions appear unreachable here —
                // confirm whether test detection was meant to run before the
                // extension match above (e.g., for "foo.test.js").
                if name.starts_with("test_")
                    || name.ends_with("_test")
                    || name.contains(".test.")
                    || name.contains(".spec.")
                {
                    return FileCategory::Test;
                }

                // Check for specific filenames.
                match name {
                    "Makefile" | "makefile" | "GNUmakefile" => FileCategory::Makefile,
                    "Dockerfile" => FileCategory::Dockerfile,
                    ".gitignore" | ".gitconfig" | ".gitattributes" | ".gitmodules" => {
                        FileCategory::GitConfig
                    }
                    "LICENSE" | "LICENCE" | "COPYING" => FileCategory::License,
                    "README" | "README.md" | "README.txt" | "README.rst" => FileCategory::Readme,
                    "package.json" | "Cargo.toml" | "requirements.txt" | "pyproject.toml"
                    | "pom.xml" | "build.gradle" | "go.mod" | "composer.json" => {
                        FileCategory::Package
                    }
                    "package-lock.json" | "Cargo.lock" | "yarn.lock" | "pnpm-lock.yaml"
                    | "poetry.lock" | "Gemfile.lock" => FileCategory::Lock,
                    _ => {
                        // Check for backup files ending with ~
                        if name.ends_with('~') {
                            FileCategory::Backup
                        } else if matches!(file_type, FileType::Executable) {
                            // Extension-less executables (e.g., compiled binaries).
                            FileCategory::Binary
                        } else {
                            FileCategory::Unknown
                        }
                    }
                }
            } else {
                FileCategory::Unknown // Path has no filename component (should be rare for actual files).
            }
        }
    }
868
869 /// ## `Scanner::new` - Constructor
870 ///
871 /// Creates a new `Scanner` instance. This involves:
872 /// 1. Storing the provided `config` and `root` path.
873 /// 2. Loading and compiling `.gitignore` patterns if `config.respect_gitignore` is true.
874 /// 3. Compiling the `DEFAULT_IGNORE_PATTERNS` if `config.use_default_ignores` is true.
875 /// 4. Initializing sets of system paths and specific files to always ignore.
876 ///
877 /// This setup prepares the scanner for efficient `should_ignore` checks during traversal.
878 pub fn new(root: &Path, config: ScannerConfig) -> Result<Self> {
879 // Canonicalize the root path to get the absolute path
880 // If canonicalize fails (e.g., path doesn't exist), fall back to absolute path
881 let canonical_root = root
882 .canonicalize()
883 .or_else(|_| std::env::current_dir().map(|cwd| cwd.join(root)))
884 .unwrap_or_else(|_| root.to_path_buf());
885
886 // Load .gitignore patterns from the root directory if requested.
887 let gitignore = if config.respect_gitignore {
888 Self::load_gitignore(&canonical_root)? // This can return None if no .gitignore or error.
889 } else {
890 None // Not respecting .gitignore.
891 };
892
893 // Build the GlobSet for default ignore patterns if requested.
894 let default_ignores = if config.use_default_ignores {
895 Self::build_default_ignores()? // This can return None if patterns are invalid (unlikely for defaults).
896 } else {
897 None // Not using default ignores.st
898 };
899
900 // Initialize the set of system paths to ignore (e.g., /proc, /sys).
901 let system_paths: HashSet<PathBuf> = if config.use_default_ignores {
902 DEFAULT_SYSTEM_PATHS
903 .iter()
904 .map(PathBuf::from) // Convert string slices to PathBufs
905 .collect() // Collect into a HashSet for quick lookups.
906 } else {
907 HashSet::new() // Empty set if not using default ignores.
908 };
909
910 // Initialize the set of specific files to ignore (e.g., /proc/kcore).
911 let ignore_files: HashSet<PathBuf> = if config.use_default_ignores {
912 DEFAULT_IGNORE_FILES.iter().map(PathBuf::from).collect()
913 } else {
914 HashSet::new()
915 };
916
917 // Determine appropriate safety limits based on the path
918 let safety_limits =
919 if canonical_root.as_os_str() == std::env::var("HOME").unwrap_or_default().as_str() {
920 // Home directory needs special care
921 ScannerSafetyLimits::for_home_directory()
922 } else if canonical_root.starts_with("/") && canonical_root.components().count() <= 2 {
923 // Root or near-root paths need limits
924 ScannerSafetyLimits::for_home_directory()
925 } else {
926 // Regular directories can use default limits
927 ScannerSafetyLimits::default()
928 };
929
930 // Initialize security scanner if enabled
931 let security_scanner = if config.security_scan {
932 Some(SecurityScanner::new())
933 } else {
934 None
935 };
936
937 // Initialize interest calculator if smart mode or interest computation enabled
938 let interest_calculator = if config.compute_interest || config.smart_mode {
939 // Try to load previous state for change detection
940 let calc = InterestCalculator::new();
941 let calc = if let Ok(Some(prev_state)) = ScanState::load(&canonical_root) {
942 calc.with_previous_state(prev_state)
943 } else {
944 calc
945 };
946 Some(calc)
947 } else {
948 None
949 };
950
951 Ok(Self {
952 config,
953 gitignore,
954 default_ignores,
955 system_paths,
956 ignore_files,
957 root: canonical_root, // Store a copy of the root path.
958 safety_limits,
959 security_scanner,
960 interest_calculator,
961 })
962 }
963
964 /// ## `build_default_ignores`
965 ///
966 /// Compiles the `DEFAULT_IGNORE_PATTERNS` array into a `GlobSet` for efficient matching.
967 /// This `GlobSet` is used to quickly check if a file/directory name matches any of the
968 /// common patterns we want to ignore by default (like `node_modules`, `target/`).
969 /// Returns `Ok(Some(GlobSet))` on success, or `Ok(None)` if no patterns (should not happen),
970 /// or an `Err` if glob compilation fails (very unlikely for our hardcoded patterns).
971 fn build_default_ignores() -> Result<Option<GlobSet>> {
972 let mut builder = GlobSetBuilder::new(); // Start with an empty builder.
973
974 // Add each default pattern to the builder.
975 for pattern_str in DEFAULT_IGNORE_PATTERNS {
976 // Glob::new can fail if the pattern is malformed, but ours should be fine.
977 if let Ok(glob) = Glob::new(pattern_str) {
978 builder.add(glob);
979 }
980 // Silently ignore malformed default patterns, though this shouldn't occur.
981 }
982
983 // Build the GlobSet from the accumulated patterns.
984 // This can fail if, for example, the set is empty or patterns are incompatible,
985 // but again, highly unlikely for our predefined set.
986 Ok(Some(builder.build()?))
987 }
988
989 /// ## `load_gitignore`
990 ///
991 /// Reads the `.gitignore` file from the specified `root` directory (if it exists)
992 /// and compiles its patterns into a `GlobSet`.
993 /// Lines starting with `#` (comments) and empty lines are ignored.
994 /// Returns `Ok(Some(GlobSet))` if `.gitignore` is found and parsed,
995 /// `Ok(None)` if no `.gitignore` file exists, or an `Err` on I/O or parsing issues.
996 fn load_gitignore(root: &Path) -> Result<Option<GlobSet>> {
997 let gitignore_path = root.join(".gitignore"); // Construct path to .gitignore.
998 if !gitignore_path.exists() {
999 return Ok(None); // No .gitignore file found, nothing to load.
1000 }
1001
1002 let mut builder = GlobSetBuilder::new();
1003 // Read the entire .gitignore file, handling non-UTF-8 content gracefully
1004 let content = match fs::read(&gitignore_path) {
1005 Ok(bytes) => String::from_utf8_lossy(&bytes).to_string(),
1006 Err(e) => {
1007 eprintln!(
1008 "Warning: Could not read .gitignore at {:?}: {}",
1009 gitignore_path, e
1010 );
1011 return Ok(None);
1012 }
1013 };
1014
1015 // Process each line of the .gitignore file.
1016 for line in content.lines() {
1017 let trimmed_line = line.trim(); // Remove leading/trailing whitespace.
1018 // Ignore empty lines and lines that are comments (start with '#').
1019 if !trimmed_line.is_empty() && !trimmed_line.starts_with('#') {
1020 // Attempt to compile the line as a glob pattern.
1021 // If successful, add it to our GlobSet builder.
1022 if let Ok(glob) = Glob::new(trimmed_line) {
1023 builder.add(glob);
1024 }
1025 // Malformed patterns in user's .gitignore are silently skipped.
1026 }
1027 }
1028
1029 // Build the final GlobSet from all valid patterns.
1030 Ok(Some(builder.build()?))
1031 }
1032
    /// ## `scan_stream` - Streaming Scan
    ///
    /// Optimized for the `--stream` flag: sends each `FileNode` through
    /// `sender` as soon as it is processed, so the formatter can start
    /// printing immediately on large directories. Returns the final
    /// `TreeStats` once the walk completes.
    ///
    /// Caveat: when a search keyword is configured, streaming falls back to
    /// the full `scan()` (all nodes must be collected first to know which
    /// directories to show) and then replays the results in order.
    pub fn scan_stream(&self, sender: mpsc::Sender<FileNode>) -> Result<TreeStats> {
        let mut stats = TreeStats::default();

        // Search mode cannot stream incrementally — delegate to scan() and
        // forward the already-filtered nodes in order.
        if self.config.search_keyword.is_some() {
            // NOTE: this `stats` shadows the outer one; the outer default is unused here.
            let (nodes, stats) = self.scan()?;
            for node in nodes {
                if sender.send(node).is_err() {
                    break; // Receiver disconnected
                }
            }
            return Ok(stats);
        }

        // Initialize safety tracker for streaming mode
        let safety_tracker = ScannerSafetyTracker::new(self.safety_limits.clone());

        // Original streaming logic for non-search cases
        let mut walker = WalkDir::new(&self.root)
            .max_depth(self.config.max_depth)
            .follow_links(self.config.follow_symlinks)
            .into_iter();

        // Loop through each entry provided by WalkDir.
        while let Some(entry_result) = walker.next() {
            // Abort the whole walk once the safety tracker trips a limit.
            if let Err(safety_error) = safety_tracker.should_continue() {
                eprintln!("โ ๏ธ {}", safety_error);
                eprintln!("   Use --max-depth or scan a more specific directory");
                break;
            }

            match entry_result {
                Ok(entry) => {
                    // Successfully read a directory entry.
                    let depth = entry.depth();
                    let path = entry.path();

                    // Determine if this entry should be ignored based on various rules.
                    let is_ignored_by_rules = self.should_ignore(path)?;

                    if is_ignored_by_rules {
                        // The entry matches an ignore rule.
                        if self.config.show_ignored {
                            // Showing ignored items: process it, but keep it marked as ignored.
                            if let Some(mut node) =
                                self.process_entry(&entry, depth, is_ignored_by_rules)?
                            {
                                // Perform content search if applicable, even for ignored files being shown.
                                if !node.is_dir && self.should_search_file(&node) {
                                    node.search_matches = self.search_in_file(&node.path);
                                }

                                // Track node for safety limits
                                safety_tracker.add_file(estimate_node_size(
                                    node.path.to_string_lossy().len(),
                                ));

                                // Send the (ignored) node through the channel.
                                if sender.send(node.clone()).is_err() {
                                    break; // Receiver has disconnected, stop scanning.
                                }

                                // Update stats for ignored items if they aren't permission-denied,
                                // so `show_ignored` still gives a full picture.
                                if !node.permission_denied {
                                    stats.update_file(&node);
                                }
                            }
                            // If this ignored item is a directory, tell WalkDir not to descend into it.
                            if entry.file_type().is_dir() {
                                walker.skip_current_dir();
                            }
                        } else {
                            // Not showing ignored items: skip directory contents;
                            // ignored plain files are dropped by simply not processing them.
                            if entry.file_type().is_dir() {
                                walker.skip_current_dir();
                            }
                        }
                    } else {
                        // The entry is NOT ignored by rules. Process it normally.
                        if let Some(mut node) = self.process_entry(&entry, depth, false)? {
                            // Perform content search if applicable.
                            if !node.is_dir && self.should_search_file(&node) {
                                node.search_matches = self.search_in_file(&node.path);
                            }

                            // Inclusion rule: directories always pass; files pass
                            // the filters, or (with a search keyword) must have a match.
                            let has_search_match = node
                                .search_matches
                                .as_ref()
                                .is_some_and(|m| m.total_count > 0);

                            // With a search keyword, only files with matches are included.
                            let should_include_file = if self.config.search_keyword.is_some() {
                                has_search_match
                            } else {
                                self.should_include(&node)
                            };

                            if node.is_dir || should_include_file {
                                // Track node for safety limits
                                safety_tracker.add_file(estimate_node_size(
                                    node.path.to_string_lossy().len(),
                                ));

                                // Send the processed node through the channel.
                                if sender.send(node.clone()).is_err() {
                                    break; // Receiver disconnected.
                                }

                                // Update statistics for included, non-permission-denied items.
                                if !node.permission_denied {
                                    stats.update_file(&node);
                                }
                            }
                        } else {
                            // process_entry returned None: hidden entry while show_hidden
                            // is off. If it's a directory, skip its contents too.
                            if entry.file_type().is_dir() {
                                walker.skip_current_dir();
                            }
                        }
                    }
                }
                Err(e) => {
                    // An error occurred trying to access a directory entry (e.g., permission denied).
                    if let Some(path) = e.path() {
                        let depth = e.depth();

                        // A PermissionDenied IO error here usually means the directory
                        // *entry* was already emitted (marked permission_denied) and only
                        // its *contents* are unreadable — avoid emitting a duplicate node.
                        let is_contents_error = e.io_error().is_some_and(|io_err| {
                            io_err.kind() == std::io::ErrorKind::PermissionDenied
                        });

                        if !is_contents_error {
                            // Create a special node representing the permission-denied entry.
                            let node = self.create_permission_denied_node(path, depth);
                            safety_tracker
                                .add_file(estimate_node_size(node.path.to_string_lossy().len()));

                            if sender.send(node.clone()).is_err() {
                                break; // Receiver disconnected.
                            }
                            // Still update stats (e.g., directory count) for permission-denied entries if shown.
                            stats.update_file(&node);
                        }

                        // Tell WalkDir not to try to descend into this unreadable directory.
                        walker.skip_current_dir();
                    }
                    // Errors without an associated path are silently ignored for now.
                }
            }
        }
        // Scan complete, return the accumulated statistics.
        Ok(stats)
    }
1206
1207 /// ## `should_search_file`
1208 /// This function is called before `search_in_file` to decide if it's worth attempting a search.
1209 /// It checks if a search keyword is configured and if the file is likely text-based.
1210 fn should_search_file(&self, node: &FileNode) -> bool {
1211 // No search keyword? No search.
1212 if self.config.search_keyword.is_none() {
1213 return false;
1214 }
1215
1216 // If there's a file type filter, only search files that match it
1217 if let Some(ref filter_ext) = self.config.file_type_filter {
1218 if let Some(ext) = node.path.extension() {
1219 if ext.to_str() != Some(filter_ext) {
1220 return false;
1221 }
1222 } else {
1223 // No extension, doesn't match filter
1224 return false;
1225 }
1226 }
1227
1228 // Skip directories, symlinks, and special files.
1229 if node.is_dir || node.is_symlink || node.permission_denied {
1230 return false;
1231 }
1232
1233 // Skip binary and system files based on category.
1234 matches!(
1235 node.category,
1236 FileCategory::Rust
1237 | FileCategory::Python
1238 | FileCategory::JavaScript
1239 | FileCategory::TypeScript
1240 | FileCategory::Java
1241 | FileCategory::C
1242 | FileCategory::Cpp
1243 | FileCategory::Go
1244 | FileCategory::Ruby
1245 | FileCategory::PHP
1246 | FileCategory::Shell
1247 | FileCategory::Markdown
1248 | FileCategory::Html
1249 | FileCategory::Css
1250 | FileCategory::Json
1251 | FileCategory::Yaml
1252 | FileCategory::Xml
1253 | FileCategory::Toml
1254 | FileCategory::Makefile
1255 | FileCategory::Dockerfile
1256 | FileCategory::GitConfig
1257 )
1258 }
1259
    /// ## `search_in_file`
    ///
    /// Searches for the configured keyword within the file at `path`.
    /// Returns `Some(SearchMatches)` when at least one match is found, or
    /// `None` when there is no keyword, the file cannot be opened, or nothing
    /// matches. The search is case-sensitive and line-oriented.
    ///
    /// Bounds that keep memory in check:
    /// - at most 100 `(line, column)` positions are stored;
    /// - once more than 100 total matches are seen, scanning stops early and
    ///   the result is flagged `truncated` (the line that triggers truncation
    ///   is not added to `line_content`);
    /// - at most 100 matching lines are kept when line content is requested.
    ///
    /// NOTE(review): columns are 1-based *byte* offsets (from `match_indices`),
    /// not character columns — they differ on non-ASCII lines. Reading stops at
    /// the first non-UTF-8 (or otherwise unreadable) line, so matches after
    /// such a line are not reported.
    fn search_in_file(&self, path: &Path) -> Option<SearchMatches> {
        // Ensure there's a keyword to search for.
        let keyword = self.config.search_keyword.as_ref()?;
        if keyword.is_empty() {
            return None;
        }

        // Attempt to open the file for reading; unreadable files yield None.
        let file = match fs::File::open(path) {
            Ok(f) => f,
            Err(_) => return None,
        };

        let mut positions = Vec::new(); // Up to 100 (line, column) pairs.
        let mut line_content_vec = Vec::new(); // Matching lines, when requested.
        let reader = BufReader::new(file);
        let mut line_number = 1; // 1-based for user display.
        let mut first_match: Option<(usize, usize)> = None;
        let mut total_count = 0;

        // Read and process the file line by line.
        for line_result in reader.lines() {
            match line_result {
                Ok(line_content) => {
                    // Find all (non-overlapping) occurrences of the keyword in this line.
                    let mut line_has_match = false;
                    let mut first_column_in_line = None;

                    for (column_index, _) in line_content.match_indices(keyword) {
                        total_count += 1;
                        line_has_match = true;

                        // Column numbers are 1-based for user display
                        let match_pos = (line_number, column_index + 1);

                        if first_match.is_none() {
                            first_match = Some(match_pos);
                        }

                        if first_column_in_line.is_none() {
                            first_column_in_line = Some(column_index + 1);
                        }

                        // Only store first 100 positions to prevent memory issues
                        if positions.len() < 100 {
                            positions.push(match_pos);
                        }

                        // Stop processing this file if we've found too many matches
                        if total_count > 100 {
                            let line_content_option = if self.config.include_line_content {
                                Some(line_content_vec)
                            } else {
                                None
                            };

                            return Some(SearchMatches {
                                // Safe unwrap: at least one match was just recorded.
                                first_match: first_match.unwrap(),
                                total_count,
                                positions,
                                truncated: true,
                                line_content: line_content_option,
                            });
                        }
                    }

                    // If this line has matches and we're including content, add it
                    // (capped at 100 stored lines).
                    if line_has_match
                        && self.config.include_line_content
                        && line_content_vec.len() < 100
                    {
                        line_content_vec.push((
                            line_number,
                            line_content.clone(),
                            // Safe unwrap: line_has_match implies a column was recorded.
                            first_column_in_line.unwrap(),
                        ));
                    }

                    line_number += 1;
                }
                Err(_) => {
                    // Invalid UTF-8 or other read error: stop searching this file.
                    break;
                }
            }
        }

        // Return matches if any were found
        first_match.map(|first| {
            let line_content_option =
                if self.config.include_line_content && !line_content_vec.is_empty() {
                    Some(line_content_vec)
                } else {
                    None
                };

            SearchMatches {
                first_match: first,
                total_count,
                positions,
                truncated: false,
                line_content: line_content_option,
            }
        })
    }
1370
1371 /// ## `enrich_with_smart_scanning` - Add Security & Interest Data
1372 ///
1373 /// Enriches a FileNode with security findings and interest scores.
1374 /// This is the heart of "surface what matters" - we analyze each file
1375 /// for potential security issues and calculate how interesting it is.
1376 fn enrich_with_smart_scanning(&self, node: &mut FileNode) {
1377 // Skip directories and very large files for content-based analysis
1378 if node.is_dir || node.size > 10_000_000 {
1379 // Still calculate interest score for directories
1380 if let Some(calc) = &self.interest_calculator {
1381 node.interest = Some(calc.calculate(node));
1382 node.traversal_context = Some(calc.build_traversal_context(node, None));
1383 }
1384 return;
1385 }
1386
1387 // Try to read file content for security scanning
1388 let content = if self.security_scanner.is_some() && self.should_scan_for_security(node) {
1389 fs::read_to_string(&node.path).ok()
1390 } else {
1391 None
1392 };
1393
1394 // Security scanning
1395 if let (Some(scanner), Some(ref content)) = (&self.security_scanner, &content) {
1396 let findings = scanner.scan_file_content(&node.path, content);
1397 if !findings.is_empty() {
1398 node.security_findings = findings;
1399 }
1400 }
1401
1402 // Interest calculation (with or without security findings)
1403 if let Some(calc) = &self.interest_calculator {
1404 let (score, _additional_findings) = if let Some(ref content) = content {
1405 calc.calculate_with_security(node, Some(content))
1406 } else {
1407 (calc.calculate(node), Vec::new())
1408 };
1409 node.interest = Some(score);
1410 node.traversal_context = Some(calc.build_traversal_context(node, None));
1411 }
1412 }
1413
1414 /// Check if a file should be scanned for security patterns
1415 fn should_scan_for_security(&self, node: &FileNode) -> bool {
1416 // Skip binary files based on category
1417 !matches!(
1418 node.category,
1419 FileCategory::Binary
1420 | FileCategory::Archive
1421 | FileCategory::Image
1422 | FileCategory::Video
1423 | FileCategory::Audio
1424 | FileCategory::DiskImage
1425 | FileCategory::Font
1426 | FileCategory::Encrypted
1427 )
1428 }
1429
    /// ## `scan` - The Full Scan (Non-Streaming)
    ///
    /// Performs a complete directory scan and returns `(Vec<FileNode>, TreeStats)`.
    /// It's a two-act show:
    /// 1. **Act I**: walk the whole tree (bounded by `config.max_depth`),
    ///    collecting a `FileNode` for every entry that survives the ignore
    ///    rules, enriched with search matches and smart-scan data.
    /// 2. **Act II**: when filters are active, run a second pass that keeps
    ///    only matching files plus the directories needed to reach them, and
    ///    compute stats over that final list.
    /// Finally the list is sorted/limited per config, and (in smart mode) the
    /// scan state is persisted for future change detection.
    pub fn scan(&self) -> Result<(Vec<FileNode>, TreeStats)> {
        let mut all_nodes_collected = Vec::new(); // Stores all nodes initially encountered.

        // Initialize safety tracker
        let safety_tracker = ScannerSafetyTracker::new(self.safety_limits.clone());

        let mut walker = WalkDir::new(&self.root)
            .max_depth(self.config.max_depth)
            .follow_links(self.config.follow_symlinks)
            .into_iter();

        while let Some(entry_result) = walker.next() {
            // Abort the whole walk once the safety tracker trips a limit.
            if let Err(safety_error) = safety_tracker.should_continue() {
                eprintln!("โ ๏ธ {}", safety_error);
                eprintln!("   Use --max-depth, --stream mode, or scan a more specific directory");
                break;
            }

            match entry_result {
                Ok(entry) => {
                    let depth = entry.depth();
                    let path = entry.path();
                    let is_ignored_by_rules = self.should_ignore(path)?;

                    if is_ignored_by_rules {
                        if self.config.show_ignored {
                            // Process and add the ignored entry.
                            if let Some(mut node) = self.process_entry(&entry, depth, true)? {
                                if !node.is_dir && self.should_search_file(&node) {
                                    node.search_matches = self.search_in_file(&node.path);
                                }
                                // Smart scanning even for ignored files (they might have security issues!)
                                self.enrich_with_smart_scanning(&mut node);
                                safety_tracker.add_file(estimate_node_size(
                                    node.path.to_string_lossy().len(),
                                ));
                                all_nodes_collected.push(node);
                            }
                            if entry.file_type().is_dir() {
                                walker.skip_current_dir(); // Don't descend into ignored dirs if showing them.
                            }
                        } else {
                            // Not showing ignored, and it's a directory: skip its contents.
                            if entry.file_type().is_dir() {
                                walker.skip_current_dir();
                            }
                            // If it's a file, it's simply skipped by not adding to `all_nodes_collected`.
                        }
                    } else {
                        // Not ignored by rules, process normally.
                        if let Some(mut node) = self.process_entry(&entry, depth, false)? {
                            if !node.is_dir && self.should_search_file(&node) {
                                node.search_matches = self.search_in_file(&node.path);
                            }
                            // Smart scanning: add security findings and interest scores
                            self.enrich_with_smart_scanning(&mut node);
                            all_nodes_collected.push(node);
                        } else {
                            // process_entry returned None: hidden entry while show_hidden
                            // is off. If it's a directory, skip its contents too.
                            if entry.file_type().is_dir() {
                                walker.skip_current_dir();
                            }
                        }
                    }
                }
                Err(e) => {
                    // Handle errors like permission denied.
                    if let Some(path) = e.path() {
                        let depth = e.depth();
                        all_nodes_collected.push(self.create_permission_denied_node(path, depth));
                        // Only a PermissionDenied IO error means the directory's
                        // *contents* are unreadable and descent must be skipped.
                        if e.io_error().is_some_and(|io_err| {
                            io_err.kind() == std::io::ErrorKind::PermissionDenied
                        }) {
                            walker.skip_current_dir(); // Skip unreadable directory.
                        }
                    }
                }
            }
        }

        // With active filters, a second pass keeps only matching files and the
        // directories leading to them, computing stats on that final list.
        let (final_nodes, final_stats) = if self.has_active_filters() {
            self.filter_nodes_and_calculate_stats(all_nodes_collected)
        } else {
            // No filters: every collected node is final; compute stats directly.
            let mut stats = TreeStats::default();
            for node in &all_nodes_collected {
                // Permission-denied files are placeholders (size 0) and aren't
                // counted; directories contribute even when denied.
                if !node.permission_denied || node.is_dir {
                    stats.update_file(node);
                }
            }
            (all_nodes_collected, stats)
        };

        // Apply sorting and top-N filtering if requested
        let sorted_nodes = self.apply_sorting_and_limit(final_nodes);

        // Save scan state for future change detection (if smart mode enabled)
        if self.config.smart_mode || self.config.compute_interest {
            self.save_scan_state(&sorted_nodes);
        }

        Ok((sorted_nodes, final_stats))
    }
1558
1559 /// Save the current scan state for future change detection
1560 fn save_scan_state(&self, nodes: &[FileNode]) {
1561 use crate::scanner_state::FileSignature;
1562
1563 let mut state = ScanState::new(self.root.clone());
1564
1565 for node in nodes {
1566 if let Ok(sig) = FileSignature::from_path(&node.path) {
1567 state.add_signature(node.path.clone(), sig);
1568 }
1569 }
1570
1571 // Save state (ignore errors - this is best-effort)
1572 if let Err(e) = state.save() {
1573 // Only log in debug mode, don't clutter normal output
1574 tracing::debug!("Could not save scan state: {}", e);
1575 }
1576 }
1577
1578 /// ## `has_active_filters`
1579 ///
1580 /// Helper function to quickly check if any of the primary filtering criteria
1581 /// (find pattern, type, size, date) are currently set in the configuration.
1582 /// This determines if the second filtering pass (`filter_nodes_and_calculate_stats`) is needed.
1583 /// Note: `search_keyword` is handled slightly differently; it can make a file appear
1584 /// even if other filters would exclude it, so it's part of `should_include` logic.
1585 fn has_active_filters(&self) -> bool {
1586 self.config.find_pattern.is_some()
1587 || self.config.file_type_filter.is_some()
1588 || self.config.entry_type_filter.is_some()
1589 || self.config.min_size.is_some()
1590 || self.config.max_size.is_some()
1591 || self.config.newer_than.is_some()
1592 || self.config.older_than.is_some()
1593 || self.config.search_keyword.is_some() // Now search_keyword is also a filter
1594 }
1595
1596 /// ## `filter_nodes_and_calculate_stats` (Formerly `filter_nodes_with_ancestors`)
1597 ///
1598 /// This crucial function takes all nodes collected during the initial traversal
1599 /// and filters them based on the `ScannerConfig`. It ensures that:
1600 /// 1. Files are included if they directly match all active filters OR if they contain a search match.
1601 /// 2. Directories are included if they themselves match a `--find` pattern OR
1602 /// if they are an ancestor of an included file.
1603 /// It then calculates `TreeStats` based on this final, filtered list of nodes.
1604 /// This replaces the older `filter_nodes_with_ancestors` to integrate stat calculation
1605 /// and clarify the logic for directory inclusion with `--find`.
1606 fn filter_nodes_and_calculate_stats(
1607 &self,
1608 all_nodes_collected: Vec<FileNode>,
1609 ) -> (Vec<FileNode>, TreeStats) {
1610 let mut final_stats = TreeStats::default();
1611 let mut included_files_and_matching_dirs = Vec::new(); // Files that pass filters, and Dirs that match --find
1612 let mut required_ancestor_dirs = HashSet::new(); // Ancestors of included_files
1613
1614 // --- Pass 1: Identify matching files and directories that directly match --find ---
1615 for node in &all_nodes_collected {
1616 if node.permission_denied {
1617 // Skip permission denied entries for filtering logic
1618 continue;
1619 }
1620
1621 let has_search_match = node
1622 .search_matches
1623 .as_ref()
1624 .is_some_and(|m| m.total_count > 0);
1625
1626 if node.is_dir {
1627 // For directories, only the --find pattern applies directly.
1628 // Other filters (size, date, type) don't apply to directories themselves.
1629 if self
1630 .config
1631 .find_pattern
1632 .as_ref()
1633 .is_some_and(|p| p.is_match(&node.path.to_string_lossy()))
1634 {
1635 included_files_and_matching_dirs.push(node.clone());
1636 // Add ancestors of this directly matched directory
1637 let mut current = node.path.parent();
1638 while let Some(parent_path) = current {
1639 if parent_path == self.root || required_ancestor_dirs.contains(parent_path)
1640 {
1641 break;
1642 }
1643 required_ancestor_dirs.insert(parent_path.to_path_buf());
1644 current = parent_path.parent();
1645 }
1646 }
1647 } else {
1648 // For files, check if it passes all filters OR has a search match.
1649 // If we have a search keyword, ONLY include files with search matches
1650 if self.config.search_keyword.is_some() {
1651 if has_search_match {
1652 // Even with search matches, the file must still pass other filters
1653 if self.should_include(node) {
1654 included_files_and_matching_dirs.push(node.clone());
1655 // Add all ancestors of this matching file to `required_ancestor_dirs`.
1656 let mut current = node.path.parent();
1657 while let Some(parent_path) = current {
1658 // Stop if we reach the root or an already added ancestor.
1659 if parent_path == self.root
1660 || required_ancestor_dirs.contains(parent_path)
1661 {
1662 break;
1663 }
1664 required_ancestor_dirs.insert(parent_path.to_path_buf());
1665 current = parent_path.parent();
1666 }
1667 }
1668 }
1669 } else {
1670 // No search keyword, use normal filtering
1671 if has_search_match || self.should_include(node) {
1672 included_files_and_matching_dirs.push(node.clone());
1673 // Add all ancestors of this matching file to `required_ancestor_dirs`.
1674 let mut current = node.path.parent();
1675 while let Some(parent_path) = current {
1676 // Stop if we reach the root or an already added ancestor.
1677 if parent_path == self.root
1678 || required_ancestor_dirs.contains(parent_path)
1679 {
1680 break;
1681 }
1682 required_ancestor_dirs.insert(parent_path.to_path_buf());
1683 current = parent_path.parent();
1684 }
1685 }
1686 }
1687 }
1688 }
1689
1690 // --- Pass 2: Build the final list of nodes ---
1691 let mut final_node_list = Vec::new();
1692 let mut added_paths = HashSet::new(); // To prevent duplicates if a dir is both an ancestor and matches --find
1693
1694 // Always add the root node if there's anything to show.
1695 if !included_files_and_matching_dirs.is_empty() {
1696 if let Some(root_node) = all_nodes_collected.iter().find(|n| n.path == self.root) {
1697 if added_paths.insert(root_node.path.clone()) {
1698 final_node_list.push(root_node.clone());
1699 }
1700 }
1701 }
1702
1703 // Add required ancestor directories and directly matching directories from `all_nodes_collected`.
1704 for node in &all_nodes_collected {
1705 if node.permission_denied {
1706 // Also include permission denied nodes if they are part of the path
1707 if (required_ancestor_dirs.contains(&node.path)
1708 || node.path == self.root && !final_node_list.is_empty())
1709 && added_paths.insert(node.path.clone())
1710 {
1711 final_node_list.push(node.clone());
1712 }
1713 continue;
1714 }
1715
1716 if node.is_dir {
1717 // Is it a required ancestor OR a directory that itself matched --find?
1718 let is_find_match = self
1719 .config
1720 .find_pattern
1721 .as_ref()
1722 .is_some_and(|p| p.is_match(&node.path.to_string_lossy()));
1723 if (required_ancestor_dirs.contains(&node.path)
1724 || (is_find_match && node.path != self.root))
1725 && added_paths.insert(node.path.clone())
1726 {
1727 final_node_list.push(node.clone());
1728 }
1729 }
1730 }
1731
1732 // Add the files that passed filters or had search matches.
1733 for node in included_files_and_matching_dirs {
1734 // If it's a directory, it was already handled above (if it matched --find).
1735 // If it's a file, add it now.
1736 if !node.is_dir {
1737 if added_paths.insert(node.path.clone()) {
1738 final_node_list.push(node);
1739 }
1740 } else {
1741 // It's a directory that matched --find
1742 if added_paths.insert(node.path.clone()) {
1743 final_node_list.push(node);
1744 }
1745 }
1746 }
1747
1748 // Sort the final list by path for consistent output.
1749 final_node_list.sort_by(|a, b| a.path.cmp(&b.path));
1750
1751 // --- Pass 3: Calculate stats on the final_node_list ---
1752 for node in &final_node_list {
1753 // Update stats, ensuring not to double-count or miscount permission-denied entries.
1754 if !node.permission_denied || node.is_dir {
1755 // Dirs (even denied) contribute to dir count.
1756 final_stats.update_file(node);
1757 }
1758 }
1759
1760 (final_node_list, final_stats)
1761 }
1762
1763 /// ## `process_entry`
1764 ///
1765 /// Converts a `walkdir::DirEntry` into our `FileNode` struct.
1766 /// This involves fetching metadata, determining file type, category, hidden status, etc.
1767 /// It also incorporates the `is_ignored_by_rules` status passed to it.
1768 /// Returns `Ok(Some(FileNode))` on success, `Ok(None)` if the entry should be skipped
1769 /// (e.g., hidden and not showing hidden), or an `Err` if metadata access fails.
1770 /// The `is_ignored_by_rules` parameter tells this function if `should_ignore` already determined this node is ignored.
1771 fn process_entry(
1772 &self,
1773 entry: &DirEntry,
1774 depth: usize,
1775 is_ignored_by_rules: bool,
1776 ) -> Result<Option<FileNode>> {
1777 let path = entry.path();
1778
1779 // Determine if the file is hidden (starts with '.').
1780 let is_hidden = path
1781 .file_name()
1782 .and_then(|name_osstr| name_osstr.to_str()) // Convert OsStr to &str
1783 .is_some_and(|name_str| name_str.starts_with('.'));
1784
1785 // Skip if hidden and we are not configured to show hidden files,
1786 // UNLESS it's an ignored item that we *are* configured to show (is_ignored_by_rules = true, config.show_ignored = true).
1787 // The `is_ignored_by_rules` flag takes precedence for display if `config.show_ignored` is true.
1788 if is_hidden && !self.config.show_hidden && !is_ignored_by_rules {
1789 // If it's a directory, we need to tell walkdir to skip its contents.
1790 if entry.file_type().is_dir() {
1791 // This is tricky because `process_entry` doesn't have `walker` to call `skip_current_dir()`.
1792 // The caller (`scan` or `scan_stream`) handles `skip_current_dir` based on `should_ignore`
1793 // and hidden status before calling `process_entry` or by checking the returned node.
1794 // For now, returning None signals to the caller that this node (and its children if a dir)
1795 // should not be further processed or added, unless `show_ignored` logic overrides.
1796 }
1797 return Ok(None); // Skip this hidden entry.
1798 }
1799
1800 // Try to get metadata for the entry. This can fail (e.g., permission denied).
1801 let metadata = match entry.metadata() {
1802 Ok(md) => md,
1803 Err(_e) => {
1804 // If metadata fails, it's likely a permission issue or a broken symlink.
1805 // We create a special "permission_denied_node" in the calling `scan`/`scan_stream` methods
1806 // because they have access to `walker.skip_current_dir()`.
1807 // Here, we can't fully form that node, so we might return an error or a partial node.
1808 // For simplicity, if metadata fails here, we treat it as an inaccessible entry.
1809 // The main scan loops handle creating a FileNode for permission denied errors from WalkDir.
1810 // This specific call path implies WalkDir *could* read the entry but metadata() failed.
1811 // This is less common than WalkDir itself erroring.
1812 // Let's assume the main loops catch this via `Err(e)` from `walker.next()`.
1813 // If `process_entry` is called on an entry that `WalkDir` gave Ok for, but `metadata()` fails,
1814 // it's an edge case. We'll return a basic node marked as permission denied.
1815 return Ok(Some(self.create_permission_denied_node(path, depth)));
1816 }
1817 };
1818
1819 let file_type = self.determine_file_type(&metadata);
1820 let category = Self::get_file_category(path, file_type);
1821
1822 // Determine the size. For special virtual files (like in /proc or /sys),
1823 // reported size can be misleading (e.g., 0 or huge). We mark these as size 0.
1824 let size = if self.is_special_virtual_file(path, &metadata) {
1825 0
1826 } else {
1827 metadata.len()
1828 };
1829
1830 // Check if this is a directory that we can't read the contents of
1831 let permission_denied = if metadata.is_dir() {
1832 // Try to read the directory to see if we have permission
1833 std::fs::read_dir(path).is_err()
1834 } else {
1835 false
1836 };
1837
1838 // Check for git branch if this is a directory
1839 let git_branch = if metadata.is_dir() {
1840 Self::get_git_branch(path)
1841 } else {
1842 None
1843 };
1844
1845 Ok(Some(FileNode {
1846 path: path.to_path_buf(),
1847 is_dir: metadata.is_dir(),
1848 size,
1849 permissions: Self::get_permissions(&metadata),
1850 uid: Self::get_uid(&metadata),
1851 gid: Self::get_gid(&metadata),
1852 modified: metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH), // Fallback for modified time
1853 is_symlink: metadata.file_type().is_symlink(), // Use file_type() for symlink check
1854 is_hidden,
1855 permission_denied, // Set based on whether we can read directory contents
1856 is_ignored: is_ignored_by_rules, // Use the pre-determined ignore status.
1857 depth,
1858 file_type,
1859 category,
1860 search_matches: None, // Search matches are added later by the caller if needed.
1861 filesystem_type: Self::get_filesystem_type(path),
1862 git_branch,
1863 // Smart scanning fields - populated later by interest calculator
1864 traversal_context: None,
1865 interest: None,
1866 security_findings: Vec::new(),
1867 change_status: None,
1868 content_hash: None,
1869 }))
1870 }
1871
1872 /// ## `get_git_branch`
1873 ///
1874 /// Gets the current git branch if this directory contains a .git folder.
1875 /// Reads directly from .git/HEAD for speed (no subprocess).
1876 fn get_git_branch(path: &Path) -> Option<String> {
1877 let git_dir = path.join(".git");
1878 if !git_dir.exists() {
1879 return None;
1880 }
1881
1882 // Read .git/HEAD to get the current ref
1883 let head_path = git_dir.join("HEAD");
1884 let head_content = std::fs::read_to_string(&head_path).ok()?;
1885 let head_content = head_content.trim();
1886
1887 // HEAD can be either:
1888 // 1. "ref: refs/heads/branch-name" (normal branch)
1889 // 2. A raw commit hash (detached HEAD)
1890 if let Some(branch_ref) = head_content.strip_prefix("ref: refs/heads/") {
1891 Some(branch_ref.to_string())
1892 } else if head_content.len() >= 7 {
1893 // Detached HEAD - show abbreviated commit hash
1894 Some(format!(":{}", &head_content[..7]))
1895 } else {
1896 None
1897 }
1898 }
1899
1900 /// ## `get_filesystem_type`
1901 ///
1902 /// Detects the filesystem type for a given path
1903 #[cfg(unix)]
1904 fn get_filesystem_type(path: &Path) -> FilesystemType {
1905 // Skip filesystem detection in CI environments to avoid hangs
1906 if std::env::var("CI").is_ok() || std::env::var("GITHUB_ACTIONS").is_ok() {
1907 return FilesystemType::Unknown;
1908 }
1909
1910 #[cfg(target_os = "linux")]
1911 {
1912 Self::get_filesystem_type_linux(path)
1913 }
1914 #[cfg(not(target_os = "linux"))]
1915 {
1916 // On non-Linux Unix systems, we can't reliably detect filesystem type
1917 // Just check for special paths
1918 if let Some(path_str) = path.to_str() {
1919 if path_str.starts_with("/proc") {
1920 return FilesystemType::Procfs;
1921 } else if path_str.starts_with("/sys") {
1922 return FilesystemType::Sysfs;
1923 } else if path_str.starts_with("/dev") {
1924 return FilesystemType::Devfs;
1925 }
1926 }
1927 FilesystemType::Unknown
1928 }
1929 }
1930
    /// ## `get_filesystem_type_linux`
    ///
    /// Detects the filesystem type for a given path using statfs on Linux systems.
    /// Falls back to path-prefix heuristics for virtual filesystems when the
    /// `statfs` call fails, and returns `Unknown` in CI environments or when
    /// the magic number is not recognized.
    #[cfg(target_os = "linux")]
    fn get_filesystem_type_linux(path: &Path) -> FilesystemType {
        // Double-check for CI environment (the caller also checks, but this
        // function could be reached through other paths in the future).
        if std::env::var("CI").is_ok() || std::env::var("GITHUB_ACTIONS").is_ok() {
            return FilesystemType::Unknown;
        }

        use libc::statfs;
        use std::ffi::CString;
        use std::mem;

        // Filesystem magic numbers from statfs.h.
        // NOTE(review): assumes a 64-bit `f_type` (`__fsword_t` on x86_64);
        // 32-bit Linux targets use a narrower type — confirm if those targets
        // are supported.
        type FsType = i64;

        const EXT4_SUPER_MAGIC: FsType = 0xef53;
        const XFS_SUPER_MAGIC: FsType = 0x58465342;
        const BTRFS_SUPER_MAGIC: FsType = 0x9123683e;
        const ZFS_SUPER_MAGIC: FsType = 0x2fc12fc1;
        const NTFS_SB_MAGIC: FsType = 0x5346544e;
        const MSDOS_SUPER_MAGIC: FsType = 0x4d44; // FAT
        const EXFAT_SUPER_MAGIC: FsType = 0x2011bab0;
        const APFS_SUPER_MAGIC: FsType = 0x42535041; // 'APFS'
        const HFS_SUPER_MAGIC: FsType = 0x482b; // HFS+
        const NFS_SUPER_MAGIC: FsType = 0x6969;
        const SMB_SUPER_MAGIC: FsType = 0x517b;
        const TMPFS_MAGIC: FsType = 0x01021994;
        const PROC_SUPER_MAGIC: FsType = 0x9fa0;
        const SYSFS_MAGIC: FsType = 0x62656572;
        const DEVFS_SUPER_MAGIC: FsType = 0x1373;

        // statfs needs a NUL-terminated C string; a path containing an interior
        // NUL byte cannot be represented, so treat that as Unknown.
        let path_cstr = match CString::new(path.to_string_lossy().as_bytes()) {
            Ok(s) => s,
            Err(_) => return FilesystemType::Unknown,
        };

        // SAFETY: `libc::statfs` is plain-old-data for which an all-zero bit
        // pattern is a valid value, so zero-initializing it is sound.
        let mut stat_buf: libc::statfs = unsafe { mem::zeroed() };
        // SAFETY: `path_cstr` is a valid NUL-terminated string and `stat_buf`
        // is a live, writable statfs struct for the duration of the call.
        let result = unsafe { statfs(path_cstr.as_ptr(), &mut stat_buf) };

        if result != 0 {
            // statfs failed, fall back to path-based detection for virtual filesystems.
            if let Some(path_str) = path.to_str() {
                if path_str.starts_with("/proc") {
                    return FilesystemType::Procfs;
                } else if path_str.starts_with("/sys") {
                    return FilesystemType::Sysfs;
                } else if path_str.starts_with("/dev") {
                    return FilesystemType::Devfs;
                }
            }
            return FilesystemType::Unknown;
        }

        // Check for Mem8 filesystem by looking for .mem8 marker files.
        // NOTE(review): the `contains("mem8")` substring check can also match
        // ordinary directories whose path merely mentions "mem8" — confirm
        // this is intentional.
        if path.join(".mem8").exists() || path.to_string_lossy().contains("mem8") {
            return FilesystemType::Mem8;
        }

        // Map the reported magic number onto our enum.
        match stat_buf.f_type {
            EXT4_SUPER_MAGIC => FilesystemType::Ext4, // TODO: Distinguish ext2/3/4
            XFS_SUPER_MAGIC => FilesystemType::Xfs,
            BTRFS_SUPER_MAGIC => FilesystemType::Btrfs,
            ZFS_SUPER_MAGIC => FilesystemType::Zfs,
            NTFS_SB_MAGIC => FilesystemType::Ntfs,
            MSDOS_SUPER_MAGIC => FilesystemType::Fat32,
            EXFAT_SUPER_MAGIC => FilesystemType::ExFat,
            APFS_SUPER_MAGIC => FilesystemType::Apfs,
            HFS_SUPER_MAGIC => FilesystemType::Hfs,
            NFS_SUPER_MAGIC => FilesystemType::Nfs,
            SMB_SUPER_MAGIC => FilesystemType::Smb,
            TMPFS_MAGIC => FilesystemType::Tmpfs,
            PROC_SUPER_MAGIC => FilesystemType::Procfs,
            SYSFS_MAGIC => FilesystemType::Sysfs,
            DEVFS_SUPER_MAGIC => FilesystemType::Devfs,
            _ => FilesystemType::Unknown,
        }
    }
2010
    /// ## `get_filesystem_type` (non-Unix fallback)
    ///
    /// Filesystem detection relies on Unix-only APIs, so on other platforms
    /// every path reports `Unknown`.
    #[cfg(not(unix))]
    fn get_filesystem_type(_path: &Path) -> FilesystemType {
        // On non-Unix systems, we can't easily detect filesystem type
        FilesystemType::Unknown
    }
2016
    /// ## `is_virtual_filesystem`
    ///
    /// Checks if a path is on a virtual filesystem by delegating to
    /// `get_filesystem_type` and the `FilesystemType::is_virtual` helper.
    fn is_virtual_filesystem(path: &Path) -> bool {
        Self::get_filesystem_type(path).is_virtual()
    }
2023
2024 /// ## `is_special_virtual_file`
2025 ///
2026 /// Checks if a file is likely a special virtual file (e.g., in /proc, /sys, /dev)
2027 /// where reported metadata like size might be zero, misleading, or cause issues if read.
2028 /// This helps in deciding to report size as 0 for such files.
2029 #[allow(unused_variables)]
2030 fn is_special_virtual_file(&self, path: &Path, metadata: &fs::Metadata) -> bool {
2031 // Check if the path starts with known virtual filesystem prefixes.
2032 if let Some(path_str) = path.to_str() {
2033 if path_str.starts_with("/proc/")
2034 || path_str.starts_with("/sys/")
2035 || path_str.starts_with("/dev/")
2036 {
2037 return true;
2038 }
2039 }
2040
2041 // Check for specific problematic files by absolute path.
2042 if self.ignore_files.contains(path) {
2043 // Uses the pre-built HashSet of specific problem files.
2044 return true;
2045 }
2046
2047 // On Unix, check for special file types like character devices, block devices, FIFOs, sockets.
2048 // These often have size 0 or non-standard size reporting.
2049 #[cfg(unix)]
2050 {
2051 use std::os::unix::fs::FileTypeExt; // For is_char_device(), is_block_device(), etc.
2052 let ft = metadata.file_type();
2053 if ft.is_char_device() || ft.is_block_device() || ft.is_fifo() || ft.is_socket() {
2054 return true;
2055 }
2056 }
2057
2058 false // Not determined to be a special virtual file by these checks.
2059 }
2060
2061 /// ## `create_permission_denied_node`
2062 ///
2063 /// Helper to create a `FileNode` representing an entry (usually a directory)
2064 /// that could not be accessed due to permission errors.
2065 /// These nodes are marked specially so formatters can indicate the issue.
2066 fn create_permission_denied_node(&self, path: &Path, depth: usize) -> FileNode {
2067 FileNode {
2068 path: path.to_path_buf(),
2069 is_dir: true, // Assume it's a directory, as that's common for permission errors during traversal.
2070 size: 0, // No size info available.
2071 permissions: 0, // No permission info.
2072 uid: 0, // No UID info.
2073 gid: 0, // No GID info.
2074 modified: SystemTime::UNIX_EPOCH, // Default timestamp.
2075 is_symlink: false,
2076 is_hidden: false, // Cannot determine if hidden.
2077 permission_denied: true, // Mark as permission denied.
2078 is_ignored: false, // Not ignored by rules, but inaccessible.
2079 depth,
2080 file_type: FileType::Directory, // Assume directory.
2081 category: FileCategory::Unknown,
2082 search_matches: None,
2083 filesystem_type: Self::get_filesystem_type(path),
2084 git_branch: None, // Can't check git for permission-denied directories
2085 // Smart scanning fields - N/A for permission denied nodes
2086 traversal_context: None,
2087 interest: None,
2088 security_findings: Vec::new(),
2089 change_status: None,
2090 content_hash: None,
2091 }
2092 }
2093
2094 /// ## `should_ignore` - The Bouncer at the Club Door
2095 ///
2096 /// This function is our tough-but-fair bouncer. It checks every file and
2097 /// directory against our lists (`.gitignore`, default ignores, etc.).
2098 /// "Sorry, `node_modules`, you're not on the list tonight."
2099 /// It's the first line of defense against clutter.
2100 fn should_ignore(&self, path: &Path) -> Result<bool> {
2101 // --- Rule 0: Never ignore the root path itself ---
2102 // If the user explicitly asks to scan a directory, we should show it
2103 // even if it would normally be ignored (e.g., scanning 'target' directory)
2104 if path == self.root {
2105 return Ok(false);
2106 }
2107
2108 // --- Rule 1: Check against specific, always-ignored files (absolute paths) ---
2109 if self.config.use_default_ignores && self.ignore_files.contains(path) {
2110 return Ok(true); // Matches a specific problematic file.
2111 }
2112
2113 // --- Rule 2: ALWAYS skip virtual filesystems like /proc, /sys, /dev ---
2114 // These are checked regardless of use_default_ignores because they're not real files
2115 // and can cause issues (huge fake sizes, hangs, etc.)
2116 if Self::is_virtual_filesystem(path) {
2117 return Ok(true);
2118 }
2119
2120 // --- Rule 3: Check against other system paths if using default ignores ---
2121 if self.config.use_default_ignores {
2122 // Check for exact match of a system path.
2123 if self.system_paths.contains(path) {
2124 return Ok(true);
2125 }
2126 // Check if the current path is a child of any registered system path.
2127 for system_root_path in &self.system_paths {
2128 if path.starts_with(system_root_path) {
2129 return Ok(true); // It's inside /tmp, /var/tmp, etc.
2130 }
2131 }
2132 }
2133
2134 // --- Rule 3: Check against default ignore patterns (GlobSet) ---
2135 // These patterns usually match file/directory names or relative paths within a project.
2136 if let Some(ref default_ignore_set) = self.default_ignores {
2137 // Check if the simple file/directory name matches any default pattern.
2138 // (e.g., "node_modules" will match `path/to/project/node_modules`)
2139 if let Some(file_name) = path.file_name() {
2140 if default_ignore_set.is_match(Path::new(file_name)) {
2141 return Ok(true);
2142 }
2143 }
2144 // Also check the path relative to the scan root against default patterns.
2145 // This handles patterns like "*.pyc" or "build/outputs/".
2146 if let Ok(relative_path_to_root) = path.strip_prefix(&self.root) {
2147 if default_ignore_set.is_match(relative_path_to_root) {
2148 return Ok(true);
2149 }
2150 }
2151 }
2152
2153 // --- Rule 4: Check against .gitignore patterns (GlobSet) ---
2154 // These patterns are always relative to the root of the scan (where .gitignore is located).
2155 if let Some(ref gitignore_set) = self.gitignore {
2156 if let Ok(relative_path_to_root) = path.strip_prefix(&self.root) {
2157 if gitignore_set.is_match(relative_path_to_root) {
2158 return Ok(true); // Matches a .gitignore pattern.
2159 }
2160 }
2161 // If strip_prefix fails (path is not under root), it can't match .gitignore relative patterns.
2162 }
2163
2164 // If none of the above rules triggered, the path is not ignored.
2165 Ok(false)
2166 }
2167
2168 /// ## `should_include` - The Velvet Rope
2169 ///
2170 /// Once a file gets past the bouncer (`should_ignore`), it has to get past
2171 /// the velvet rope. This function checks if the file meets the specific criteria
2172 /// for this party: "Are you a `.rs` file? Are you bigger than 1MB?"
2173 /// Only the coolest files that match all the rules get in.
2174 fn should_include(&self, node: &FileNode) -> bool {
2175 // --- Filter by --find pattern (applies to both files and directories) ---
2176 if let Some(ref find_regex_pattern) = self.config.find_pattern {
2177 // Convert path to string for regex matching. Lossy conversion is acceptable for matching.
2178 let path_str = node.path.to_string_lossy();
2179 if !find_regex_pattern.is_match(&path_str) {
2180 return false; // Path doesn't match the --find pattern.
2181 }
2182 }
2183
2184 // --- Filter by entry type (--entry-type) ---
2185 if let Some(ref entry_type) = self.config.entry_type_filter {
2186 match entry_type.as_str() {
2187 "f" => {
2188 if node.is_dir {
2189 return false; // Looking for files only, but this is a directory
2190 }
2191 }
2192 "d" => {
2193 if !node.is_dir {
2194 return false; // Looking for directories only, but this is a file
2195 }
2196 }
2197 _ => {} // Should not happen due to clap validation
2198 }
2199 }
2200
2201 // --- Filters below only apply to files, not directories ---
2202 if !node.is_dir {
2203 // --- Filter by file extension (--type) ---
2204 if let Some(ref required_extension) = self.config.file_type_filter {
2205 match node
2206 .path
2207 .extension()
2208 .and_then(|ext_osstr| ext_osstr.to_str())
2209 {
2210 Some(file_ext_str) => {
2211 if !file_ext_str.eq_ignore_ascii_case(required_extension) {
2212 return false; // Extension doesn't match.
2213 }
2214 }
2215 None => return false, // File has no extension, so cannot match.
2216 }
2217 }
2218
2219 // --- Filter by minimum size (--min-size) ---
2220 if let Some(min_allowed_size) = self.config.min_size {
2221 if node.size < min_allowed_size {
2222 return false; // File is too small.
2223 }
2224 }
2225
2226 // --- Filter by maximum size (--max-size) ---
2227 if let Some(max_allowed_size) = self.config.max_size {
2228 if node.size > max_allowed_size {
2229 return false; // File is too large.
2230 }
2231 }
2232 } // End of file-only filters
2233
2234 // --- Date filters (apply to both files and directories based on their modification time) ---
2235 // --- Filter by newer_than date (--newer-than) ---
2236 if let Some(min_modification_date) = self.config.newer_than {
2237 if node.modified < min_modification_date {
2238 return false; // Entry is older than required.
2239 }
2240 }
2241
2242 // --- Filter by older_than date (--older-than) ---
2243 if let Some(max_modification_date) = self.config.older_than {
2244 if node.modified > max_modification_date {
2245 return false; // Entry is newer than allowed.
2246 }
2247 }
2248
2249 // If all applicable filters passed (or no filters were active for a category), include the node.
2250 true
2251 }
2252
2253 /// ## `determine_file_type` (Helper for `process_entry`)
2254 ///
2255 /// Examines `fs::Metadata` to determine a more specific `FileType`
2256 /// than just `is_dir` or `is_file`. On Unix, this can identify symlinks,
2257 /// sockets, FIFOs, block/char devices, and executables (by permission).
2258 /// On non-Unix, it's simpler (dir, symlink, or regular file).
2259 fn determine_file_type(&self, metadata: &fs::Metadata) -> FileType {
2260 #[cfg(unix)] // Unix-specific detailed file type detection
2261 {
2262 use std::os::unix::fs::FileTypeExt; // For is_socket, is_fifo, etc.
2263 let ft = metadata.file_type(); // Get the rich FileType from metadata.
2264
2265 if ft.is_dir() {
2266 FileType::Directory
2267 } else if ft.is_symlink() {
2268 // Check symlink before other types, as it can point to them.
2269 FileType::Symlink
2270 } else if ft.is_socket() {
2271 FileType::Socket
2272 } else if ft.is_fifo() {
2273 // Named pipe
2274 FileType::Pipe
2275 } else if ft.is_block_device() {
2276 FileType::BlockDevice
2277 } else if ft.is_char_device() {
2278 FileType::CharDevice
2279 // Check for executable permission (any of user, group, other execute bits are set).
2280 // This applies to regular files that are not dirs, symlinks, or other special types.
2281 } else if ft.is_file() && (metadata.permissions().mode() & 0o111 != 0) {
2282 FileType::Executable
2283 } else {
2284 // If none of the above, it's a regular (non-executable) file.
2285 FileType::RegularFile
2286 }
2287 }
2288
2289 #[cfg(not(unix))] // Simpler detection for non-Unix platforms
2290 {
2291 if metadata.is_dir() {
2292 FileType::Directory
2293 } else if metadata.file_type().is_symlink() {
2294 // `is_symlink()` is part of stable `fs::FileType`
2295 FileType::Symlink
2296 } else {
2297 // No easy cross-platform way to check executable bit without external crates or OS-specific calls.
2298 // So, on non-Unix, we don't distinguish Executable from RegularFile here.
2299 FileType::RegularFile
2300 }
2301 }
2302 }
2303
    // --- Platform-Dependent Metadata Helpers ---
    // These provide a consistent way to get permissions, UID, and GID,
    // with sensible defaults for non-Unix systems where these concepts might not directly apply
    // or be easily accessible via standard Rust fs::Metadata.

    /// Returns the Unix permission bits, masked to `0o777`.
    #[cfg(unix)]
    fn get_permissions(metadata: &fs::Metadata) -> u32 {
        // On Unix, get the mode and mask it to get the permission bits (e.g., 0o755).
        metadata.permissions().mode() & 0o777
    }
    /// Non-Unix fallback: reports a conventional default mode.
    #[cfg(not(unix))]
    fn get_permissions(_metadata: &fs::Metadata) -> u32 {
        0o755 // A common default permission (rwxr-xr-x) for non-Unix.
    }

    /// Returns the owning user id from the metadata.
    #[cfg(unix)]
    fn get_uid(metadata: &fs::Metadata) -> u32 {
        metadata.uid() // Get User ID from metadata.
    }
    /// Non-Unix fallback: reports a placeholder UID.
    #[cfg(not(unix))]
    fn get_uid(_metadata: &fs::Metadata) -> u32 {
        1000 // Common default UID placeholder for non-Unix.
    }

    /// Returns the owning group id from the metadata.
    #[cfg(unix)]
    fn get_gid(metadata: &fs::Metadata) -> u32 {
        metadata.gid() // Get Group ID from metadata.
    }
    /// Non-Unix fallback: reports GID 0 as a placeholder.
    #[cfg(not(unix))]
    fn get_gid(_metadata: &fs::Metadata) -> u32 {
        0
    }
2336
2337 /// Apply sorting and optional top-N limit to the results
2338 fn apply_sorting_and_limit(&self, mut nodes: Vec<FileNode>) -> Vec<FileNode> {
2339 // If no sort field specified, return as-is
2340 let sort_field = match &self.config.sort_field {
2341 Some(field) => field,
2342 None => return nodes,
2343 };
2344
2345 // Sort based on the field
2346 match sort_field.as_str() {
2347 "name" | "a-to-z" => {
2348 // Sort by name alphabetically (A to Z)
2349 nodes.sort_by(|a, b| {
2350 let name_a = a.path.file_name().unwrap_or_default().to_string_lossy();
2351 let name_b = b.path.file_name().unwrap_or_default().to_string_lossy();
2352 name_a.cmp(&name_b)
2353 });
2354 }
2355 "z-to-a" => {
2356 // Sort by name reverse alphabetically (Z to A)
2357 nodes.sort_by(|a, b| {
2358 let name_a = a.path.file_name().unwrap_or_default().to_string_lossy();
2359 let name_b = b.path.file_name().unwrap_or_default().to_string_lossy();
2360 name_b.cmp(&name_a)
2361 });
2362 }
2363 "size" | "largest" => {
2364 // Sort by size descending (largest first)
2365 nodes.sort_by(|a, b| b.size.cmp(&a.size));
2366 }
2367 "smallest" => {
2368 // Sort by size ascending (smallest first)
2369 nodes.sort_by(|a, b| a.size.cmp(&b.size));
2370 }
2371 "date" | "newest" => {
2372 // Sort by modification time descending (newest first)
2373 nodes.sort_by(|a, b| b.modified.cmp(&a.modified));
2374 }
2375 "oldest" => {
2376 // Sort by modification time ascending (oldest first)
2377 nodes.sort_by(|a, b| a.modified.cmp(&b.modified));
2378 }
2379 "type" => {
2380 // Sort by file extension, then by name
2381 nodes.sort_by(|a, b| {
2382 let ext_a = a.path.extension().unwrap_or_default().to_string_lossy();
2383 let ext_b = b.path.extension().unwrap_or_default().to_string_lossy();
2384 match ext_a.cmp(&ext_b) {
2385 std::cmp::Ordering::Equal => {
2386 let name_a = a.path.file_name().unwrap_or_default().to_string_lossy();
2387 let name_b = b.path.file_name().unwrap_or_default().to_string_lossy();
2388 name_a.cmp(&name_b)
2389 }
2390 other => other,
2391 }
2392 });
2393 }
2394 _ => {
2395 // Unknown sort field, don't sort
2396 eprintln!("Warning: Unknown sort field '{}', ignoring", sort_field);
2397 }
2398 }
2399
2400 // Apply top-N limit if specified
2401 if let Some(limit) = self.config.top_n {
2402 nodes.truncate(limit);
2403 }
2404
2405 nodes
2406 }
2407} // end impl Scanner
2408
2409/// # `parse_size` - The Universal Translator for Sizes
2410///
2411/// This handy function takes something a human understands, like "2.5M", and
2412/// translates it into something a computer understands (2,621,440 bytes).
2413/// It's like having a Babel fish for file sizes. Why should we have to do
2414/// that math when the computer can do it for us?
2415pub fn parse_size(size_str: &str) -> Result<u64> {
2416 let size_str = size_str.trim().to_uppercase();
2417 if size_str.is_empty() {
2418 return Err(anyhow::anyhow!("Empty size string"));
2419 }
2420
2421 // Find the first alphabetic character which marks the start of the unit.
2422 let unit_start_index = size_str
2423 .find(|c: char| c.is_alphabetic())
2424 .unwrap_or(size_str.len());
2425 let (num_part_str, unit_part) = size_str.split_at(unit_start_index);
2426
2427 // Trim any space from the number part before parsing.
2428 let num_part_str = num_part_str.trim();
2429
2430 if num_part_str.is_empty() {
2431 return Err(anyhow::anyhow!("Missing number for size string"));
2432 }
2433
2434 let num: f64 = match num_part_str.parse() {
2435 Ok(n) => n,
2436 Err(e) => return Err(anyhow::anyhow!("Invalid number '{}': {}", num_part_str, e)),
2437 };
2438
2439 // Check for negative numbers.
2440 if num.is_sign_negative() {
2441 return Err(anyhow::anyhow!("Size cannot be negative: {}", num));
2442 }
2443
2444 let multiplier = match unit_part {
2445 "K" | "KB" => 1024.0,
2446 "M" | "MB" => 1024.0 * 1024.0,
2447 "G" | "GB" => 1024.0 * 1024.0 * 1024.0,
2448 "T" | "TB" => 1024.0 * 1024.0 * 1024.0 * 1024.0,
2449 "B" | "" => 1.0,
2450 _ => return Err(anyhow::anyhow!("Invalid size unit: '{}'", unit_part)),
2451 };
2452
2453 Ok((num * multiplier) as u64)
2454}
2455
2456// --- Unit Tests: Ensuring Our Scanner Behaves ---
2457// Aye, even the most brilliant code needs tests to keep it honest!
2458// These tests cover some basic functionality of the scanner.
#[cfg(test)]
mod tests {
    use super::*; // Import everything from the parent module (scanner.rs).

    // Happy-path size strings: bare bytes, binary units (upper and lower
    // case), fractional values, and surrounding whitespace.
    #[test]
    fn test_parse_size_valid_inputs() {
        assert_eq!(parse_size("100").unwrap(), 100);
        assert_eq!(parse_size("100B").unwrap(), 100);
        assert_eq!(parse_size("1k").unwrap(), 1024);
        assert_eq!(parse_size("1K").unwrap(), 1024);
        assert_eq!(parse_size("1KB").unwrap(), 1024);
        assert_eq!(parse_size("2.5M").unwrap(), (2.5 * 1024.0 * 1024.0) as u64);
        assert_eq!(parse_size("1GB").unwrap(), 1024 * 1024 * 1024);
        assert_eq!(
            parse_size("0.5T").unwrap(),
            (0.5 * 1024.0 * 1024.0 * 1024.0 * 1024.0) as u64
        );
        assert_eq!(parse_size(" 2 MB ").unwrap(), 2 * 1024 * 1024); // Test with whitespace
    }

    // Inputs that must be rejected: unknown units, non-numeric text,
    // negative sizes, and malformed numbers.
    #[test]
    fn test_parse_size_invalid_inputs() {
        assert!(parse_size("100X").is_err());
        assert!(parse_size("garbage").is_err());
        assert!(parse_size("-100M").is_err());
        assert!(parse_size("1..5K").is_err());
    }

    // Zero is a valid size; empty or whitespace-only strings are not.
    #[test]
    fn test_parse_size_zero_and_empty() {
        assert_eq!(parse_size("0").unwrap(), 0);
        assert!(parse_size("").is_err());
        assert!(parse_size(" ").is_err());
    }

    // Basic test for Scanner creation. More comprehensive tests would involve
    // creating a temporary directory structure and verifying scan results.
    #[test]
    fn test_scanner_creation_defaults() {
        let temp_dir = tempfile::tempdir().unwrap();
        let config = ScannerConfig {
            max_depth: 5,
            follow_symlinks: false,
            respect_gitignore: true,
            show_hidden: false,
            show_ignored: false,
            find_pattern: None,
            file_type_filter: None,
            entry_type_filter: None,
            min_size: None,
            max_size: None,
            newer_than: None,
            older_than: None,
            use_default_ignores: true,
            search_keyword: None,
            show_filesystems: false,
            sort_field: None,
            top_n: None,
            include_line_content: false,
            // Smart scanning options
            compute_interest: false,
            security_scan: false,
            min_interest: 0.0,
            track_traversal: false,
            changes_only: false,
            compare_state: None,
            smart_mode: false,
        };
        let scanner_result = Scanner::new(temp_dir.path(), config);
        assert!(scanner_result.is_ok());
    }
}