// st/scanner.rs
//
// -----------------------------------------------------------------------------
// WELCOME TO THE JUNGLE! ...The filesystem jungle, that is. 🌴
//
// You've found scanner.rs, the intrepid explorer and engine room of st.
// This module is the Indiana Jones of our codebase. It bravely dives into
// the deepest, darkest directories, dodges `.gitignore` traps, inspects
// every file for treasure (metadata), and reports back its findings.
//
// So grab your hat, and let's go on an adventure!
//
// Brought to you by The Cheet - making filesystem traversal a rock concert! 🎸
// -----------------------------------------------------------------------------
//

16use crate::interest_calculator::InterestCalculator;
17use crate::scanner_interest::{ChangeType, InterestScore, TraversalContext};
18use crate::scanner_safety::{estimate_node_size, ScannerSafetyLimits, ScannerSafetyTracker};
19use crate::scanner_state::ScanState;
20use crate::security_scan::{SecurityFinding, SecurityScanner};
21use anyhow::Result;
22use globset::{Glob, GlobSet, GlobSetBuilder}; // For powerful gitignore-style pattern matching.
23use regex::Regex; // For user-defined find patterns.
24use std::collections::{HashMap, HashSet}; // Our trusty hash-based collections.
25use std::fs; // Filesystem operations, the bread and butter here.
26use std::io::{BufRead, BufReader}; // For efficient reading, especially for content search.
27use std::path::{Path, PathBuf}; // Path manipulation is key.
28use std::sync::mpsc; // For streaming results from a worker thread.
29use std::time::SystemTime; // To know when files were last touched.
30use walkdir::{DirEntry, WalkDir}; // The excellent `walkdir` crate does the actual directory walking.
31
32// Unix-specific imports for richer metadata like permissions, UID, GID.
33// On other platforms, we'll use sensible defaults.
34#[cfg(unix)]
35use std::os::unix::fs::{MetadataExt, PermissionsExt};
36
/// # FileNode: The Ultimate Backstage Pass
///
/// Every file and directory we meet gets one of these. It's a VIP pass that
/// holds all the juicy details: its name, size, when it was last cool (modified),
/// and whether it's on the super-secret "ignored" list. It's the atom of our
/// `st` universe.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FileNode {
    /// The full path to the file or directory. The source of truth for location!
    pub path: PathBuf,
    /// Is it a directory? `true` if yes, `false` if it's a file or symlink.
    pub is_dir: bool,
    /// Size of the file in bytes. For directories, this is often 0 or metadata-dependent.
    pub size: u64,
    /// File permissions (e.g., `rwxr-xr-x`). Stored as a u32, typically from Unix mode.
    /// On non-Unix platforms this holds a sensible default instead.
    pub permissions: u32,
    /// User ID of the owner (Unix-specific; defaulted on other platforms).
    pub uid: u32,
    /// Group ID of the owner (Unix-specific; defaulted on other platforms).
    pub gid: u32,
    /// Timestamp of the last modification. Tells us how fresh or ancient a file is.
    pub modified: SystemTime,
    /// Is it a symbolic link? `true` if yes. We handle these with care.
    pub is_symlink: bool,
    /// Is it a hidden file (e.g., starts with a `.` on Unix)?
    pub is_hidden: bool,
    /// Did we encounter a "Permission Denied" error when trying to access this?
    /// Important for gracefully handling parts of the filesystem we can't read.
    pub permission_denied: bool,
    /// Is this file or directory ignored based on `.gitignore` or default ignore rules?
    pub is_ignored: bool,
    /// The depth of this entry relative to the scan root (root is depth 0).
    pub depth: usize,
    /// The specific type of the file (e.g., RegularFile, Symlink, Executable).
    pub file_type: FileType,
    /// A category assigned based on extension or name, used for coloring and context.
    /// (e.g., Rust, Python, Image, Archive).
    pub category: FileCategory,
    /// For content search: Information about where matches were found.
    /// `None` if no search was performed or no matches.
    pub search_matches: Option<SearchMatches>,
    /// The filesystem type this file resides on.
    pub filesystem_type: FilesystemType,
    /// Git branch if this directory contains a .git folder.
    pub git_branch: Option<String>,

    // --- Smart Scanning Fields (Phase 2: Intelligent Context-Aware Scanning) ---
    // These fields enable "surface what matters" scanning. They are all optional
    // and skipped during serialization when absent, keeping plain scans compact.

    /// How we reached this location (direct, symlink, mount, dependency).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub traversal_context: Option<TraversalContext>,

    /// Interest score - how relevant is this file right now?
    #[serde(skip_serializing_if = "Option::is_none")]
    pub interest: Option<InterestScore>,

    /// Security findings detected during scan.
    /// `default` lets deserialization of older payloads (without this field) succeed.
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub security_findings: Vec<SecurityFinding>,

    /// Change status since last scan (Added, Modified, Deleted, etc.).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub change_status: Option<ChangeType>,

    /// Content hash for change detection (Blake3/SHA256).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content_hash: Option<String>,
}
106
/// Information about search matches within a file.
///
/// Produced when a content search (`ScannerConfig::search_keyword`) is active;
/// attached to a `FileNode` via `search_matches`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct SearchMatches {
    /// First match position (line, column).
    pub first_match: (usize, usize),
    /// Total number of matches found.
    pub total_count: usize,
    /// List of all match positions (line, column) - limited to prevent memory issues.
    pub positions: Vec<(usize, usize)>,
    /// Whether the search was truncated due to too many matches.
    /// When `true`, `positions` holds fewer entries than `total_count`.
    pub truncated: bool,
    /// Line content for each match (line number, line content, column) - optional for compatibility.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub line_content: Option<Vec<(usize, String, usize)>>,
}
122
/// # FileType: Distinguishing Different Kinds of Filesystem Objects
///
/// This enum helps us categorize entries beyond just "file" or "directory".
/// It's especially useful on Unix-like systems where you have sockets, pipes, etc.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub enum FileType {
    Directory,   // A folder, a container of other things.
    RegularFile, // Your everyday, garden-variety file.
    Symlink,     // A pointer to another file or directory.
    Executable,  // A file that can be run (has execute permissions).
    Socket,      // A Unix domain socket.
    Pipe,        // A named pipe (FIFO).
    BlockDevice, // A block special file (e.g., /dev/sda).
    CharDevice,  // A character special file (e.g., /dev/tty).
}
138
139/// # FilesystemType: Identifying the underlying filesystem
140///
141/// This enum represents different filesystem types with single-character codes
142/// for compact display. The mapping is designed to be memorable and intuitive.
143#[derive(Debug, Clone, Copy, PartialEq, serde::Serialize, serde::Deserialize)]
144pub enum FilesystemType {
145 Ext4, // '4' - The most common Linux filesystem
146 Ext3, // '3' - Older ext filesystem
147 Ext2, // '2' - Even older ext filesystem
148 Xfs, // 'X' - XFS filesystem
149 Btrfs, // 'B' - Btrfs (B-tree filesystem)
150 Zfs, // 'Z' - ZFS filesystem
151 Ntfs, // 'N' - Windows NTFS
152 Fat32, // 'F' - FAT32
153 ExFat, // 'E' - exFAT
154 Apfs, // 'A' - Apple File System
155 Hfs, // 'H' - HFS+ (older Mac)
156 Nfs, // 'R' - Remote NFS mount
157 Smb, // 'S' - SMB/CIFS network filesystem
158 Tmpfs, // 'T' - Temporary filesystem (RAM)
159 Procfs, // 'P' - /proc virtual filesystem
160 Sysfs, // 'Y' - /sys virtual filesystem
161 Devfs, // 'D' - /dev virtual filesystem
162 Mem8, // 'M' - MEM|8 filesystem (Coming soon - Quantum File System) - https://m8.is
163 Unknown, // '?' - Unknown filesystem
164}
165
166impl FilesystemType {
167 /// Get the single-character code for this filesystem type
168 pub fn to_char(&self) -> char {
169 match self {
170 FilesystemType::Ext4 => '4',
171 FilesystemType::Ext3 => '3',
172 FilesystemType::Ext2 => '2',
173 FilesystemType::Xfs => 'X',
174 FilesystemType::Btrfs => 'B',
175 FilesystemType::Zfs => 'Z',
176 FilesystemType::Ntfs => 'N',
177 FilesystemType::Fat32 => 'F',
178 FilesystemType::ExFat => 'E',
179 FilesystemType::Apfs => 'A',
180 FilesystemType::Hfs => 'H',
181 FilesystemType::Nfs => 'R',
182 FilesystemType::Smb => 'S',
183 FilesystemType::Tmpfs => 'T',
184 FilesystemType::Procfs => 'P',
185 FilesystemType::Sysfs => 'Y',
186 FilesystemType::Devfs => 'D',
187 FilesystemType::Mem8 => 'M',
188 FilesystemType::Unknown => '?',
189 }
190 }
191
192 /// Check if this is a virtual filesystem that should be skipped
193 pub fn is_virtual(&self) -> bool {
194 matches!(
195 self,
196 FilesystemType::Procfs
197 | FilesystemType::Sysfs
198 | FilesystemType::Devfs
199 | FilesystemType::Tmpfs
200 )
201 }
202
203 /// Check if this filesystem type should be shown by default
204 /// (only "interesting" filesystems based on platform)
205 pub fn should_show_by_default(&self) -> bool {
206 #[cfg(target_os = "linux")]
207 {
208 matches!(
209 self,
210 FilesystemType::Ext4
211 | FilesystemType::Ext3
212 | FilesystemType::Xfs
213 | FilesystemType::Btrfs
214 | FilesystemType::Zfs
215 | FilesystemType::Nfs
216 | FilesystemType::Smb
217 | FilesystemType::Mem8
218 )
219 }
220 #[cfg(target_os = "macos")]
221 {
222 matches!(
223 self,
224 FilesystemType::Apfs
225 | FilesystemType::Hfs
226 | FilesystemType::Nfs
227 | FilesystemType::Smb
228 | FilesystemType::Mem8
229 )
230 }
231 #[cfg(target_os = "windows")]
232 {
233 matches!(
234 self,
235 FilesystemType::Ntfs
236 | FilesystemType::Fat32
237 | FilesystemType::ExFat
238 | FilesystemType::Mem8
239 )
240 }
241 #[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
242 {
243 // Show all non-virtual filesystems on other platforms
244 !self.is_virtual()
245 }
246 }
247}
248
/// # FileCategory: Adding Semantic Flavor to Files
///
/// This enum provides a higher-level categorization based on common file extensions
/// or names. It's primarily used for display purposes, like coloring output,
/// and can also help in understanding the nature of a directory's contents.
/// See `Scanner::get_file_category` for the mapping heuristics.
/// Trish loves how this makes the tree output more intuitive!
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub enum FileCategory {
    // --- Programming Languages ---
    Rust,       // .rs
    Python,     // .py, .pyw, .pyx, .pyi
    JavaScript, // .js, .mjs, .cjs
    TypeScript, // .ts, .tsx
    Java,       // .java, .class, .jar
    C,          // .c, .h
    Cpp,        // .cpp, .cc, .cxx, .hpp, .hxx
    Go,         // .go
    Ruby,       // .rb
    PHP,        // .php - Not sure php is programming.
    Shell,      // .sh, .bash, .zsh, .fish (plus .ps1/.bat/.cmd in practice)

    // --- Markup & Data Formats ---
    Markdown, // .md, .markdown
    Html,     // .html, .htm
    Css,      // .css, .scss, .sass, .less
    Json,     // .json, .jsonc
    Yaml,     // .yaml, .yml
    Xml,      // .xml, .svg (SVG is XML-based)
    Toml,     // .toml
    Csv,      // .csv

    // --- Build Systems & Configuration ---
    Makefile,   // Makefile, makefile, GNUmakefile
    Dockerfile, // Dockerfile, .dockerfile
    GitConfig,  // .gitignore, .gitconfig, .gitmodules

    // --- Archives & Compressed Files ---
    Archive, // .zip, .tar, .gz, .bz2, .xz, .7z, .rar

    // --- Media Files ---
    Image, // .jpg, .jpeg, .png, .gif, .bmp, .ico, .webp
    Video, // .mp4, .avi, .mkv, .mov, .wmv, .flv, .webm
    Audio, // .mp3, .wav, .flac, .aac, .ogg, .wma

    // --- System & Binary Files ---
    SystemFile, // Special system files like swap.img, vmlinuz
    Binary,     // Executables, shared libraries (.exe, .dll, .so, .dylib, .o, .a)

    // --- Database ---
    Database, // .db, .sqlite, .mdb, .accdb, .dbf

    // --- Office & Documents ---
    Office,      // .doc, .docx, .odt
    Spreadsheet, // .xls, .xlsx, .ods, .csv
    PowerPoint,  // .ppt, .pptx, .odp
    Pdf,         // .pdf
    Ebook,       // .epub, .mobi, .azw

    // --- Text Variants ---
    Log,     // .log
    Config,  // .ini, .cfg, .conf, .env, .properties
    License, // LICENSE, COPYING files
    Readme,  // README files
    Txt,     // .txt
    Rtf,     // .rtf

    // --- Security & Crypto ---
    Certificate, // .crt, .cert, .pem, .key
    Encrypted,   // .gpg, .pgp, .aes

    // --- Fonts ---
    Font, // .ttf, .otf, .woff, .woff2

    // --- Virtual & Disk Images ---
    DiskImage, // .img, .iso, .vdi, .vmdk, .vhd, .dd, .dmg

    // --- 3D & CAD ---
    Model3D, // .obj, .stl, .dae, .fbx, .blend

    // --- Scientific & Data ---
    Jupyter, // .ipynb
    RData,   // .rdata, .rds
    Matlab,  // .m, .mat

    // --- Web Assets ---
    WebAsset, // .wasm, .map

    // --- Package & Dependencies ---
    Package, // package.json, Cargo.toml, requirements.txt, etc.
    Lock,    // package-lock.json, Cargo.lock, yarn.lock

    // --- Testing ---
    Test, // Files with test_, _test, .test, .spec patterns

    // --- Memory Files (Our special type!) ---
    Memory, // .mem8, .m8 - MEM|8 memory files

    // --- Others ---
    Backup,  // .bak, .backup, ~
    Temp,    // .tmp, .temp, .swp
    Unknown, // If we can't categorize it, it's a mysterious Unknown!
}
351
/// # TreeStats: The Final Scoreboard
///
/// After the concert is over, this is where we see how we did. It's the
/// scoreboard that tracks total files, total directories, the biggest hits
/// (largest files), and more. It's the answer to "So, how was the show?"
#[derive(Debug, Default)]
pub struct TreeStats {
    /// Total number of files encountered (excluding directories).
    pub total_files: u64,
    /// Total number of directories encountered.
    pub total_dirs: u64,
    /// Total size of all files (in bytes).
    pub total_size: u64,
    /// A map of file extensions to their counts (e.g., {"rs": 10, "toml": 2}).
    pub file_types: HashMap<String, u64>,
    /// Top N largest files found as `(size, path)` tuples. N is usually 10.
    pub largest_files: Vec<(u64, PathBuf)>,
    /// Top N newest files found as `(modification time, path)` tuples.
    pub newest_files: Vec<(SystemTime, PathBuf)>,
    /// Top N oldest files found as `(modification time, path)` tuples.
    pub oldest_files: Vec<(SystemTime, PathBuf)>,
}
374
375impl TreeStats {
376 /// Updates the statistics based on a newly processed `FileNode`.
377 /// This method is called for each non-permission-denied node.
378 pub fn update_file(&mut self, node: &FileNode) {
379 if node.is_dir {
380 self.total_dirs += 1;
381 } else {
382 // It's a file!
383 self.total_files += 1;
384 self.total_size += node.size;
385
386 // Track file extensions for type distribution.
387 if let Some(ext) = node.path.extension() {
388 if let Some(ext_str) = ext.to_str() {
389 *self.file_types.entry(ext_str.to_string()).or_insert(0) += 1;
390 }
391 }
392
393 // --- Update Top N Lists ---
394 // These lists are kept sorted and truncated to maintain a fixed size (e.g., top 10).
395
396 // Update largest files: Add, sort by size (desc), truncate.
397 self.largest_files.push((node.size, node.path.clone()));
398 self.largest_files.sort_by(|a, b| b.0.cmp(&a.0)); // Largest first
399 self.largest_files.truncate(10); // Keep only the top 10
400
401 // Update newest files: Add, sort by modification time (desc), truncate.
402 self.newest_files.push((node.modified, node.path.clone()));
403 self.newest_files.sort_by(|a, b| b.0.cmp(&a.0)); // Newest first
404 self.newest_files.truncate(10);
405
406 // Update oldest files: Add, sort by modification time (asc), truncate.
407 self.oldest_files.push((node.modified, node.path.clone()));
408 self.oldest_files.sort_by(|a, b| a.0.cmp(&b.0)); // Oldest first
409 self.oldest_files.truncate(10);
410 }
411 }
412}
413
/// # ScannerConfig: The Rider for our Rock Star Scanner
///
/// This is the list of demands for our scanner. "Don't show me hidden files,"
/// "I only want to see files bigger than a tour bus," "Ignore the messy backstage
/// area (`.gitignore`)." We build this from the user's command-line arguments
/// to make sure the scanner puts on the exact show the user wants to see.
#[derive(Default, Clone)]
pub struct ScannerConfig {
    /// Maximum depth to traverse into subdirectories.
    pub max_depth: usize,
    /// Should symbolic links be followed? (Currently always `false`).
    pub follow_symlinks: bool,
    /// Should `.gitignore` files be respected?
    pub respect_gitignore: bool,
    /// Should hidden files (starting with `.`) be shown?
    pub show_hidden: bool,
    /// Should ignored files/directories be shown (usually in brackets)?
    pub show_ignored: bool,
    /// An optional regex pattern to filter files/directories by name.
    pub find_pattern: Option<Regex>,
    /// An optional file extension to filter by (e.g., "rs").
    pub file_type_filter: Option<String>,
    /// Optional entry type filter ("f" for files, "d" for directories).
    pub entry_type_filter: Option<String>,
    /// Optional minimum file size filter (bytes).
    pub min_size: Option<u64>,
    /// Optional maximum file size filter (bytes).
    pub max_size: Option<u64>,
    /// Optional filter for files newer than a specific date.
    pub newer_than: Option<SystemTime>,
    /// Optional filter for files older than a specific date.
    pub older_than: Option<SystemTime>,
    /// Should the scanner use its built-in list of default ignore patterns
    /// (like `node_modules`, `__pycache__`, `target/`)?
    pub use_default_ignores: bool,
    /// An optional keyword to search for within file contents.
    pub search_keyword: Option<String>,
    /// Should filesystem type indicators be shown?
    pub show_filesystems: bool,
    /// Sort field for results (name, size, date, type).
    pub sort_field: Option<String>,
    /// Limit results to top N entries (useful with sort).
    pub top_n: Option<usize>,
    /// Include actual line content in search results (for AI/MCP use).
    pub include_line_content: bool,

    // --- Smart Scanning Options (Phase 2: Intelligent Context-Aware Scanning) ---

    /// Compute interest scores for each node (default: true when smart mode is enabled).
    pub compute_interest: bool,
    /// Perform security scanning during traversal (default: true).
    pub security_scan: bool,
    /// Minimum interest score to include in results (0.0-1.0, default: 0.0).
    pub min_interest: f32,
    /// Track how we reached each location (symlink, mount, etc.).
    pub track_traversal: bool,
    /// Only show changes since last scan.
    pub changes_only: bool,
    /// Path to previous state file for comparison (or auto-detect from ~/.st/scan_states/).
    pub compare_state: Option<PathBuf>,
    /// Enable smart mode - groups by interest, shows changes, minimal output.
    pub smart_mode: bool,
}
477
// --- Default Ignore Patterns: The "Please Don't Play These Songs" List ---
// Every band has songs they'd rather not play. This is our list of files and
// directories (`node_modules`, `target/`, etc.) that we usually skip to keep
// the show clean and focused on the hits. A tidy tree is a happy tree!
//
// NOTE: each pattern appears exactly once. Duplicates are harmless to the glob
// matcher but are noise for maintainers (previously `.pytest_cache`, `.tox`,
// `.coverage` and `.sass-cache` were listed twice).
const DEFAULT_IGNORE_PATTERNS: &[&str] = &[
    // Version control systems (but not all hidden dirs like .ssh)
    ".git",
    ".svn",
    ".hg",
    ".bzr",
    "_darcs",
    // Python artifacts and tool caches
    "__pycache__",
    "*.pyc",
    "*.pyo",
    "*.pyd",
    ".Python",
    ".pytest_cache",
    ".tox",
    ".coverage",
    "*.egg-info",
    ".eggs",
    // Node.js / JavaScript artifacts
    "node_modules",
    ".npm",
    ".yarn",
    ".pnpm-store",
    "bower_components",
    ".next",
    ".nuxt",
    // General cache directories often found in projects
    ".cache", // Common cache dir name
    // Virtual environments
    "venv",
    "env",
    "ENV",
    "virtualenv",
    ".venv",
    ".env",
    "conda-meta",
    // Build/compilation artifacts from various languages/systems
    "target", // Rust
    "build",
    "dist",
    "out",
    "bin",
    "obj", // Common build output dirs
    "*.o",
    "*.a",
    "*.so",
    "*.dll",
    "*.dylib", // Object files, libraries
    // Package manager caches/data
    ".cargo",
    ".rustup", // Rust
    ".gem",
    ".bundle", // Ruby
    // IDEs and editor-specific files/directories
    ".idea",
    ".vscode",
    ".vs", // Common IDE metadata
    "*.swp",
    "*.swo",
    "*~", // Vim/editor backup/swap files
    ".project",
    ".classpath",
    ".settings", // Eclipse/Java
    // Development tool caches (Python-specific ones live in the section above)
    ".mypy_cache",
    ".ruff_cache",
    ".hypothesis",
    ".sass-cache", // Sass CSS preprocessor
    "__MACOSX",    // macOS archive metadata
    // OS-specific junk files
    ".DS_Store",    // macOS
    "Thumbs.db",    // Windows
    "desktop.ini",  // Windows
    "$RECYCLE.BIN", // Windows recycle bin
    // Common temporary file/directory names and patterns
    "tmp",
    "temp",
    ".tmp",
    ".temp",
    "*.tmp",
    "*.temp",
    // System directories that are almost never useful to traverse deeply from a user's project root.
    // These are more aggressively ignored if `st` is run on `/`.
    // "proc", "sys", "dev", "lost+found", "mnt", "media", // Handled by DEFAULT_SYSTEM_PATHS
    // Other common ignores
    ".vagrant",
    ".terraform",
];
575
// Default paths that are almost always too noisy or problematic to scan,
// especially if `st` is run from `/` or a very high-level directory.
// These are typically mount points for virtual filesystems or system-critical areas.
// Collected into `Scanner::system_paths` by `Scanner::new` when
// `use_default_ignores` is set.
const DEFAULT_SYSTEM_PATHS: &[&str] = &[
    "/proc",
    "/sys",
    "/dev",
    "/run",
    "/tmp",
    "/var/tmp",
    "/lost+found",
    "/mnt",
    "/media",
    "/snap", // Common mount points or special dirs
];
591
// Specific individual files (absolute paths) that should always be ignored
// due to their special nature (e.g., virtual files representing system memory).
// Reading these can hang or balloon memory, so they are excluded outright.
const DEFAULT_IGNORE_FILES: &[&str] = &[
    "/proc/kcore",    // Virtual file representing physical memory, can be huge & slow.
    "/proc/kmsg",     // Kernel messages, can be an infinite stream.
    "/proc/kallsyms", // Kernel symbols, can be large.
];
599
/// # Scanner: The Rock Star of our Show
///
/// BEHOLD! The `Scanner` itself! This is the main act. It takes the config,
/// the ignore lists, and a path, and it puts on a spectacular show of directory
/// traversal. It's fast, it's smart, and it knows all the best moves.
pub struct Scanner {
    /// The configuration for this scanning operation.
    config: ScannerConfig,
    /// Compiled `GlobSet` from `.gitignore` files, if respected and found.
    gitignore: Option<GlobSet>,
    /// Compiled `GlobSet` from our `DEFAULT_IGNORE_PATTERNS`.
    default_ignores: Option<GlobSet>,
    /// A set of absolute system paths to ignore (e.g., /proc, /sys).
    /// Populated from `DEFAULT_SYSTEM_PATHS` when default ignores are enabled.
    system_paths: HashSet<PathBuf>,
    /// A set of specific absolute file paths to ignore (e.g., /proc/kcore).
    /// Populated from `DEFAULT_IGNORE_FILES` when default ignores are enabled.
    ignore_files: HashSet<PathBuf>,
    /// The root path from which the scan originates.
    root: PathBuf,
    /// Safety limits to prevent crashes on large directories.
    safety_limits: ScannerSafetyLimits,

    // --- Smart Scanning Components (Phase 4) ---

    /// Security scanner for detecting supply chain attack patterns.
    security_scanner: Option<SecurityScanner>,
    /// Interest calculator for scoring file relevance.
    interest_calculator: Option<InterestCalculator>,
}
628
629impl Scanner {
    /// Returns the root path this scanner operates on.
    ///
    /// Normally this is the canonicalized form of the path passed to
    /// `Scanner::new`; if canonicalization failed there, it falls back to a
    /// best-effort absolute (or original) path — see the fallback chain in
    /// the constructor.
    pub fn root(&self) -> &Path {
        &self.root
    }
634
635 /// Quick scan for basic project analysis - lighter weight than full scan
636 /// Returns only basic stats and key files for faster integration
637 pub fn quick_scan(&self) -> Result<(Vec<FileNode>, TreeStats)> {
638 let mut config = self.config.clone();
639 config.max_depth = 3; // Limit depth for quick scan
640
641 let quick_scanner = Scanner::new(&self.root, config)?;
642 quick_scanner.scan()
643 }
644
645 /// Find files modified within a specific time range
646 /// Useful for finding recent activity in projects
647 pub fn find_recent_files(&self, hours_ago: u64) -> Result<Vec<FileNode>> {
648 let cutoff_time =
649 std::time::SystemTime::now() - std::time::Duration::from_secs(hours_ago * 3600);
650
651 let (nodes, _) = self.scan()?;
652 Ok(nodes
653 .into_iter()
654 .filter(|node| !node.is_dir && node.modified > cutoff_time)
655 .collect())
656 }
657
658 /// Get key project files (build configs, main files, etc.)
659 /// Returns a filtered list of important files for project analysis
660 pub fn find_key_files(&self) -> Result<Vec<FileNode>> {
661 let (nodes, _) = self.scan()?;
662
663 let important_patterns = [
664 "main.rs",
665 "lib.rs",
666 "mod.rs",
667 "package.json",
668 "Cargo.toml",
669 "requirements.txt",
670 "pyproject.toml",
671 "README.md",
672 "LICENSE",
673 "Makefile",
674 "CMakeLists.txt",
675 "index.js",
676 "app.js",
677 "server.js",
678 "main.js",
679 "main.py",
680 "__init__.py",
681 "setup.py",
682 "go.mod",
683 "main.go",
684 "pom.xml",
685 "build.gradle",
686 "build.xml",
687 ".gitignore",
688 "docker-compose.yml",
689 "Dockerfile",
690 ];
691
692 Ok(nodes
693 .into_iter()
694 .filter(|node| {
695 if node.is_dir {
696 return false;
697 }
698
699 let file_name = node.path.file_name().and_then(|n| n.to_str()).unwrap_or("");
700
701 important_patterns.contains(&file_name)
702 })
703 .collect())
704 }
705
    /// ## `get_file_category`
    /// Determines a `FileCategory` for a given path and `FileType`.
    ///
    /// Heuristics are applied in priority order:
    /// 1. Directories always get `Unknown` (their contents define them).
    /// 2. A few special system file names (`swap.img`, `vmlinuz*`, ...).
    /// 3. The lowercased file extension (the common case).
    /// 4. For extension-less files: test-file name patterns, then well-known
    ///    filenames (`Makefile`, `LICENSE`, lock files, ...), then fallbacks.
    ///
    /// It's like a quick identification guide for files!
    fn get_file_category(path: &Path, file_type: FileType) -> FileCategory {
        // Directories don't get a specific content category here; their content defines them.
        if matches!(file_type, FileType::Directory) {
            return FileCategory::Unknown;
        }

        // First, check for some very specific system file names.
        // These take priority over extension matching (e.g., "swap.img" would
        // otherwise be categorized as DiskImage via the "img" extension).
        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
            if name == "swap.img"
                || name == "swapfile"
                || name.starts_with("vmlinuz")
                || name.starts_with("initrd")
            {
                return FileCategory::SystemFile;
            }
        }

        // Primary categorization is by file extension (matched case-insensitively).
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            match ext.to_lowercase().as_str() {
                // --- Programming Languages ---
                "rs" => FileCategory::Rust,
                "py" | "pyw" | "pyx" | "pyi" => FileCategory::Python,
                "js" | "mjs" | "cjs" => FileCategory::JavaScript,
                "ts" | "tsx" => FileCategory::TypeScript,
                "java" | "class" | "jar" => FileCategory::Java,
                "c" | "h" => FileCategory::C,
                "cpp" | "cc" | "cxx" | "hpp" | "hxx" => FileCategory::Cpp,
                "go" => FileCategory::Go,
                "rb" => FileCategory::Ruby,
                "php" => FileCategory::PHP,
                "sh" | "bash" | "zsh" | "fish" | "ps1" | "bat" | "cmd" => FileCategory::Shell,

                // --- Markup/Data ---
                "md" | "markdown" => FileCategory::Markdown,
                "html" | "htm" => FileCategory::Html,
                "css" | "scss" | "sass" | "less" => FileCategory::Css,
                "json" | "jsonc" | "geojson" => FileCategory::Json,
                "yaml" | "yml" => FileCategory::Yaml,
                "xml" | "svg" | "plist" | "kml" | "gpx" => FileCategory::Xml, // SVG and others are XML-based
                "toml" => FileCategory::Toml,

                // --- Build/Config (some are also by name) ---
                "dockerfile" => FileCategory::Dockerfile, // Extension variant
                // .gitignore, .gitconfig are usually by name, handled below

                // --- Archives ---
                "zip" | "tar" | "gz" | "tgz" | "bz2" | "tbz2" | "xz" | "txz" | "7z" | "rar" => {
                    FileCategory::Archive
                }

                // --- Media ---
                "jpg" | "jpeg" | "png" | "gif" | "bmp" | "ico" | "webp" | "tiff" | "tif"
                | "heic" | "heif" => FileCategory::Image,
                "mp4" | "avi" | "mkv" | "mov" | "wmv" | "flv" | "webm" | "mpeg" | "mpg" => {
                    FileCategory::Video
                }
                "mp3" | "wav" | "flac" | "aac" | "ogg" | "wma" | "m4a" => FileCategory::Audio,

                // --- Binary/Executable (some overlap with system, but these are common distributable/object formats) ---
                "exe" | "dll" | "so" | "dylib" | "o" | "a" | "lib" | "msi" | "deb" | "rpm"
                | "app" => FileCategory::Binary,

                // --- Database Files ---
                "db" | "sqlite" | "sqlitedb" | "sqlite3" | "db3" | "db4" | "db5" | "mdb"
                | "accdb" | "dbf" => FileCategory::Database,

                // --- Office & Documents ---
                "doc" | "docx" | "odt" | "rtf" => FileCategory::Office,
                "xls" | "xlsx" | "ods" | "csv" | "tsv" => FileCategory::Spreadsheet,
                "ppt" | "pptx" | "odp" => FileCategory::PowerPoint,
                "pdf" => FileCategory::Pdf,
                "epub" | "mobi" | "azw" | "azw3" | "fb2" => FileCategory::Ebook,

                // --- Text & Config Files ---
                "txt" | "text" => FileCategory::Txt,
                "log" => FileCategory::Log,
                "ini" | "cfg" | "conf" | "config" | "properties" | "env" => FileCategory::Config,

                // --- Security & Crypto ---
                "crt" | "cert" | "pem" | "key" | "pub" | "cer" | "der" => FileCategory::Certificate,
                "gpg" | "pgp" | "aes" | "enc" | "asc" => FileCategory::Encrypted,

                // --- Fonts ---
                "ttf" | "otf" | "woff" | "woff2" | "eot" | "fon" | "fnt" => FileCategory::Font,

                // --- Disk Images ---
                "img" | "vdi" | "vmdk" | "vhd" | "vhdx" | "dd" | "hdd" | "qcow" | "qcow2" => {
                    FileCategory::DiskImage
                }
                "iso" | "dmg" => FileCategory::DiskImage, // These can be both archives and disk images, but treating as disk images

                // --- 3D & CAD ---
                "obj" | "stl" | "dae" | "fbx" | "blend" | "3ds" | "ply" | "gltf" | "glb" => {
                    FileCategory::Model3D
                }

                // --- Scientific & Data ---
                "ipynb" => FileCategory::Jupyter,
                "rdata" | "rds" | "rda" => FileCategory::RData,
                // NOTE(review): "m" is claimed for Matlab here, but .m is also
                // Objective-C — confirm this is the intended precedence.
                "m" | "mat" | "mlx" => FileCategory::Matlab,

                // --- Web Assets ---
                "wasm" | "map" | "sourcemap" => FileCategory::WebAsset,

                // --- Memory Files (MEM|8!) ---
                "mem8" | "m8" | "mq" => FileCategory::Memory,

                // --- Backup & Temp ---
                "bak" | "backup" | "old" | "orig" => FileCategory::Backup,
                "tmp" | "temp" | "swp" | "swo" | "swn" => FileCategory::Temp,

                _ => FileCategory::Unknown, // Extension not recognized
            }
        } else {
            // No extension, or extension parsing failed. Try common filenames.
            if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
                // Check for test files.
                // NOTE(review): this branch only runs for extension-less names,
                // but a name containing ".test." or ".spec." always has an
                // extension, so those two conditions appear unreachable here —
                // confirm whether test detection was meant to run before the
                // extension match above (e.g., for "foo.test.js").
                if name.starts_with("test_")
                    || name.ends_with("_test")
                    || name.contains(".test.")
                    || name.contains(".spec.")
                {
                    return FileCategory::Test;
                }

                // Check for specific filenames.
                match name {
                    "Makefile" | "makefile" | "GNUmakefile" => FileCategory::Makefile,
                    "Dockerfile" => FileCategory::Dockerfile,
                    ".gitignore" | ".gitconfig" | ".gitattributes" | ".gitmodules" => {
                        FileCategory::GitConfig
                    }
                    "LICENSE" | "LICENCE" | "COPYING" => FileCategory::License,
                    "README" | "README.md" | "README.txt" | "README.rst" => FileCategory::Readme,
                    "package.json" | "Cargo.toml" | "requirements.txt" | "pyproject.toml"
                    | "pom.xml" | "build.gradle" | "go.mod" | "composer.json" => {
                        FileCategory::Package
                    }
                    "package-lock.json" | "Cargo.lock" | "yarn.lock" | "pnpm-lock.yaml"
                    | "poetry.lock" | "Gemfile.lock" => FileCategory::Lock,
                    _ => {
                        // Check for backup files ending with ~
                        if name.ends_with('~') {
                            FileCategory::Backup
                        } else if matches!(file_type, FileType::Executable) {
                            // Extension-less executables (e.g., compiled binaries).
                            FileCategory::Binary
                        } else {
                            FileCategory::Unknown
                        }
                    }
                }
            } else {
                FileCategory::Unknown // Path has no filename component (should be rare for actual files).
            }
        }
    }
868
869 /// ## `Scanner::new` - Constructor
870 ///
871 /// Creates a new `Scanner` instance. This involves:
872 /// 1. Storing the provided `config` and `root` path.
873 /// 2. Loading and compiling `.gitignore` patterns if `config.respect_gitignore` is true.
874 /// 3. Compiling the `DEFAULT_IGNORE_PATTERNS` if `config.use_default_ignores` is true.
875 /// 4. Initializing sets of system paths and specific files to always ignore.
876 ///
877 /// This setup prepares the scanner for efficient `should_ignore` checks during traversal.
878 pub fn new(root: &Path, config: ScannerConfig) -> Result<Self> {
879 // Canonicalize the root path to get the absolute path
880 // If canonicalize fails (e.g., path doesn't exist), fall back to absolute path
881 let canonical_root = root
882 .canonicalize()
883 .or_else(|_| std::env::current_dir().map(|cwd| cwd.join(root)))
884 .unwrap_or_else(|_| root.to_path_buf());
885
886 // Load .gitignore patterns from the root directory if requested.
887 let gitignore = if config.respect_gitignore {
888 Self::load_gitignore(&canonical_root)? // This can return None if no .gitignore or error.
889 } else {
890 None // Not respecting .gitignore.
891 };
892
893 // Build the GlobSet for default ignore patterns if requested.
894 let default_ignores = if config.use_default_ignores {
895 Self::build_default_ignores()? // This can return None if patterns are invalid (unlikely for defaults).
896 } else {
897 None // Not using default ignores.st
898 };
899
900 // Initialize the set of system paths to ignore (e.g., /proc, /sys).
901 let system_paths: HashSet<PathBuf> = if config.use_default_ignores {
902 DEFAULT_SYSTEM_PATHS
903 .iter()
904 .map(PathBuf::from) // Convert string slices to PathBufs
905 .collect() // Collect into a HashSet for quick lookups.
906 } else {
907 HashSet::new() // Empty set if not using default ignores.
908 };
909
910 // Initialize the set of specific files to ignore (e.g., /proc/kcore).
911 let ignore_files: HashSet<PathBuf> = if config.use_default_ignores {
912 DEFAULT_IGNORE_FILES.iter().map(PathBuf::from).collect()
913 } else {
914 HashSet::new()
915 };
916
917 // Determine appropriate safety limits based on the path
918 let safety_limits =
919 if canonical_root.as_os_str() == std::env::var("HOME").unwrap_or_default().as_str() {
920 // Home directory needs special care
921 ScannerSafetyLimits::for_home_directory()
922 } else if canonical_root.starts_with("/") && canonical_root.components().count() <= 2 {
923 // Root or near-root paths need limits
924 ScannerSafetyLimits::for_home_directory()
925 } else {
926 // Regular directories can use default limits
927 ScannerSafetyLimits::default()
928 };
929
930 // Initialize security scanner if enabled
931 let security_scanner = if config.security_scan {
932 Some(SecurityScanner::new())
933 } else {
934 None
935 };
936
937 // Initialize interest calculator if smart mode or interest computation enabled
938 let interest_calculator = if config.compute_interest || config.smart_mode {
939 // Try to load previous state for change detection
940 let calc = InterestCalculator::new();
941 let calc = if let Ok(Some(prev_state)) = ScanState::load(&canonical_root) {
942 calc.with_previous_state(prev_state)
943 } else {
944 calc
945 };
946 Some(calc)
947 } else {
948 None
949 };
950
951 Ok(Self {
952 config,
953 gitignore,
954 default_ignores,
955 system_paths,
956 ignore_files,
957 root: canonical_root, // Store a copy of the root path.
958 safety_limits,
959 security_scanner,
960 interest_calculator,
961 })
962 }
963
964 /// ## `build_default_ignores`
965 ///
966 /// Compiles the `DEFAULT_IGNORE_PATTERNS` array into a `GlobSet` for efficient matching.
967 /// This `GlobSet` is used to quickly check if a file/directory name matches any of the
968 /// common patterns we want to ignore by default (like `node_modules`, `target/`).
969 /// Returns `Ok(Some(GlobSet))` on success, or `Ok(None)` if no patterns (should not happen),
970 /// or an `Err` if glob compilation fails (very unlikely for our hardcoded patterns).
971 fn build_default_ignores() -> Result<Option<GlobSet>> {
972 let mut builder = GlobSetBuilder::new(); // Start with an empty builder.
973
974 // Add each default pattern to the builder.
975 for pattern_str in DEFAULT_IGNORE_PATTERNS {
976 // Glob::new can fail if the pattern is malformed, but ours should be fine.
977 if let Ok(glob) = Glob::new(pattern_str) {
978 builder.add(glob);
979 }
980 // Silently ignore malformed default patterns, though this shouldn't occur.
981 }
982
983 // Build the GlobSet from the accumulated patterns.
984 // This can fail if, for example, the set is empty or patterns are incompatible,
985 // but again, highly unlikely for our predefined set.
986 Ok(Some(builder.build()?))
987 }
988
989 /// ## `load_gitignore`
990 ///
991 /// Reads the `.gitignore` file from the specified `root` directory (if it exists)
992 /// and compiles its patterns into a `GlobSet`.
993 /// Lines starting with `#` (comments) and empty lines are ignored.
994 /// Returns `Ok(Some(GlobSet))` if `.gitignore` is found and parsed,
995 /// `Ok(None)` if no `.gitignore` file exists, or an `Err` on I/O or parsing issues.
996 fn load_gitignore(root: &Path) -> Result<Option<GlobSet>> {
997 let gitignore_path = root.join(".gitignore"); // Construct path to .gitignore.
998 if !gitignore_path.exists() {
999 return Ok(None); // No .gitignore file found, nothing to load.
1000 }
1001
1002 let mut builder = GlobSetBuilder::new();
1003 // Read the entire .gitignore file, handling non-UTF-8 content gracefully
1004 let content = match fs::read(&gitignore_path) {
1005 Ok(bytes) => String::from_utf8_lossy(&bytes).to_string(),
1006 Err(e) => {
1007 eprintln!(
1008 "Warning: Could not read .gitignore at {:?}: {}",
1009 gitignore_path, e
1010 );
1011 return Ok(None);
1012 }
1013 };
1014
1015 // Process each line of the .gitignore file.
1016 for line in content.lines() {
1017 let trimmed_line = line.trim(); // Remove leading/trailing whitespace.
1018 // Ignore empty lines and lines that are comments (start with '#').
1019 if !trimmed_line.is_empty() && !trimmed_line.starts_with('#') {
1020 // Attempt to compile the line as a glob pattern.
1021 // If successful, add it to our GlobSet builder.
1022 if let Ok(glob) = Glob::new(trimmed_line) {
1023 builder.add(glob);
1024 }
1025 // Malformed patterns in user's .gitignore are silently skipped.
1026 }
1027 }
1028
1029 // Build the final GlobSet from all valid patterns.
1030 Ok(Some(builder.build()?))
1031 }
1032
    /// ## `scan_stream` - Streaming Scan
    ///
    /// Optimized for the `--stream` flag: sends each `FileNode` through
    /// `sender` as soon as it is processed, so the formatter can start
    /// printing immediately on large directories. Returns the final
    /// `TreeStats` once the walk completes.
    ///
    /// Caveat: when a search keyword is configured, streaming falls back to
    /// the full `scan()` (all nodes must be collected first to know which
    /// directories to show) and then replays the results in order.
    pub fn scan_stream(&self, sender: mpsc::Sender<FileNode>) -> Result<TreeStats> {
        let mut stats = TreeStats::default();

        // Search mode cannot stream incrementally — delegate to scan() and
        // forward the already-filtered nodes in order.
        if self.config.search_keyword.is_some() {
            // NOTE: this `stats` shadows the outer one; the outer default is unused here.
            let (nodes, stats) = self.scan()?;
            for node in nodes {
                if sender.send(node).is_err() {
                    break; // Receiver disconnected
                }
            }
            return Ok(stats);
        }

        // Initialize safety tracker for streaming mode
        let safety_tracker = ScannerSafetyTracker::new(self.safety_limits.clone());

        // Original streaming logic for non-search cases
        let mut walker = WalkDir::new(&self.root)
            .max_depth(self.config.max_depth)
            .follow_links(self.config.follow_symlinks)
            .into_iter();

        // Loop through each entry provided by WalkDir.
        while let Some(entry_result) = walker.next() {
            // Abort the whole walk once the safety tracker trips a limit.
            if let Err(safety_error) = safety_tracker.should_continue() {
                eprintln!("โ ๏ธ {}", safety_error);
                eprintln!("   Use --max-depth or scan a more specific directory");
                break;
            }

            match entry_result {
                Ok(entry) => {
                    // Successfully read a directory entry.
                    let depth = entry.depth();
                    let path = entry.path();

                    // Determine if this entry should be ignored based on various rules.
                    let is_ignored_by_rules = self.should_ignore(path)?;

                    if is_ignored_by_rules {
                        // The entry matches an ignore rule.
                        if self.config.show_ignored {
                            // Showing ignored items: process it, but keep it marked as ignored.
                            if let Some(mut node) =
                                self.process_entry(&entry, depth, is_ignored_by_rules)?
                            {
                                // Perform content search if applicable, even for ignored files being shown.
                                if !node.is_dir && self.should_search_file(&node) {
                                    node.search_matches = self.search_in_file(&node.path);
                                }

                                // Track node for safety limits
                                safety_tracker.add_file(estimate_node_size(
                                    node.path.to_string_lossy().len(),
                                ));

                                // Send the (ignored) node through the channel.
                                if sender.send(node.clone()).is_err() {
                                    break; // Receiver has disconnected, stop scanning.
                                }

                                // Update stats for ignored items if they aren't permission-denied,
                                // so `show_ignored` still gives a full picture.
                                if !node.permission_denied {
                                    stats.update_file(&node);
                                }
                            }
                            // If this ignored item is a directory, tell WalkDir not to descend into it.
                            if entry.file_type().is_dir() {
                                walker.skip_current_dir();
                            }
                        } else {
                            // Not showing ignored items: skip directory contents;
                            // ignored plain files are dropped by simply not processing them.
                            if entry.file_type().is_dir() {
                                walker.skip_current_dir();
                            }
                        }
                    } else {
                        // The entry is NOT ignored by rules. Process it normally.
                        if let Some(mut node) = self.process_entry(&entry, depth, false)? {
                            // Perform content search if applicable.
                            if !node.is_dir && self.should_search_file(&node) {
                                node.search_matches = self.search_in_file(&node.path);
                            }

                            // Inclusion rule: directories always pass; files pass
                            // the filters, or (with a search keyword) must have a match.
                            let has_search_match = node
                                .search_matches
                                .as_ref()
                                .is_some_and(|m| m.total_count > 0);

                            // With a search keyword, only files with matches are included.
                            let should_include_file = if self.config.search_keyword.is_some() {
                                has_search_match
                            } else {
                                self.should_include(&node)
                            };

                            if node.is_dir || should_include_file {
                                // Track node for safety limits
                                safety_tracker.add_file(estimate_node_size(
                                    node.path.to_string_lossy().len(),
                                ));

                                // Send the processed node through the channel.
                                if sender.send(node.clone()).is_err() {
                                    break; // Receiver disconnected.
                                }

                                // Update statistics for included, non-permission-denied items.
                                if !node.permission_denied {
                                    stats.update_file(&node);
                                }
                            }
                        } else {
                            // process_entry returned None: hidden entry while show_hidden
                            // is off. If it's a directory, skip its contents too.
                            if entry.file_type().is_dir() {
                                walker.skip_current_dir();
                            }
                        }
                    }
                }
                Err(e) => {
                    // An error occurred trying to access a directory entry (e.g., permission denied).
                    if let Some(path) = e.path() {
                        let depth = e.depth();

                        // A PermissionDenied IO error here usually means the directory
                        // *entry* was already emitted (marked permission_denied) and only
                        // its *contents* are unreadable — avoid emitting a duplicate node.
                        let is_contents_error = e.io_error().is_some_and(|io_err| {
                            io_err.kind() == std::io::ErrorKind::PermissionDenied
                        });

                        if !is_contents_error {
                            // Create a special node representing the permission-denied entry.
                            let node = self.create_permission_denied_node(path, depth);
                            safety_tracker
                                .add_file(estimate_node_size(node.path.to_string_lossy().len()));

                            if sender.send(node.clone()).is_err() {
                                break; // Receiver disconnected.
                            }
                            // Still update stats (e.g., directory count) for permission-denied entries if shown.
                            stats.update_file(&node);
                        }

                        // Tell WalkDir not to try to descend into this unreadable directory.
                        walker.skip_current_dir();
                    }
                    // Errors without an associated path are silently ignored for now.
                }
            }
        }
        // Scan complete, return the accumulated statistics.
        Ok(stats)
    }
1206
1207 /// ## `should_search_file`
1208 /// This function is called before `search_in_file` to decide if it's worth attempting a search.
1209 /// It checks if a search keyword is configured and if the file is likely text-based.
1210 fn should_search_file(&self, node: &FileNode) -> bool {
1211 // No search keyword? No search.
1212 if self.config.search_keyword.is_none() {
1213 return false;
1214 }
1215
1216 // If there's a file type filter, only search files that match it
1217 if let Some(ref filter_ext) = self.config.file_type_filter {
1218 if let Some(ext) = node.path.extension() {
1219 if ext.to_str() != Some(filter_ext) {
1220 return false;
1221 }
1222 } else {
1223 // No extension, doesn't match filter
1224 return false;
1225 }
1226 }
1227
1228 // Skip directories, symlinks, and special files.
1229 if node.is_dir || node.is_symlink || node.permission_denied {
1230 return false;
1231 }
1232
1233 // Skip binary and system files based on category.
1234 matches!(
1235 node.category,
1236 FileCategory::Rust
1237 | FileCategory::Python
1238 | FileCategory::JavaScript
1239 | FileCategory::TypeScript
1240 | FileCategory::Java
1241 | FileCategory::C
1242 | FileCategory::Cpp
1243 | FileCategory::Go
1244 | FileCategory::Ruby
1245 | FileCategory::PHP
1246 | FileCategory::Shell
1247 | FileCategory::Markdown
1248 | FileCategory::Html
1249 | FileCategory::Css
1250 | FileCategory::Json
1251 | FileCategory::Yaml
1252 | FileCategory::Xml
1253 | FileCategory::Toml
1254 | FileCategory::Makefile
1255 | FileCategory::Dockerfile
1256 | FileCategory::GitConfig
1257 )
1258 }
1259
    /// ## `search_in_file`
    ///
    /// Searches for the configured keyword within the file at `path`.
    /// Returns `Some(SearchMatches)` when at least one match is found, or
    /// `None` when there is no keyword, the file cannot be opened, or nothing
    /// matches. The search is case-sensitive and line-oriented.
    ///
    /// Bounds that keep memory in check:
    /// - at most 100 `(line, column)` positions are stored;
    /// - once more than 100 total matches are seen, scanning stops early and
    ///   the result is flagged `truncated` (the line that triggers truncation
    ///   is not added to `line_content`);
    /// - at most 100 matching lines are kept when line content is requested.
    ///
    /// NOTE(review): columns are 1-based *byte* offsets (from `match_indices`),
    /// not character columns — they differ on non-ASCII lines. Reading stops at
    /// the first non-UTF-8 (or otherwise unreadable) line, so matches after
    /// such a line are not reported.
    fn search_in_file(&self, path: &Path) -> Option<SearchMatches> {
        // Ensure there's a keyword to search for.
        let keyword = self.config.search_keyword.as_ref()?;
        if keyword.is_empty() {
            return None;
        }

        // Attempt to open the file for reading; unreadable files yield None.
        let file = match fs::File::open(path) {
            Ok(f) => f,
            Err(_) => return None,
        };

        let mut positions = Vec::new(); // Up to 100 (line, column) pairs.
        let mut line_content_vec = Vec::new(); // Matching lines, when requested.
        let reader = BufReader::new(file);
        let mut line_number = 1; // 1-based for user display.
        let mut first_match: Option<(usize, usize)> = None;
        let mut total_count = 0;

        // Read and process the file line by line.
        for line_result in reader.lines() {
            match line_result {
                Ok(line_content) => {
                    // Find all (non-overlapping) occurrences of the keyword in this line.
                    let mut line_has_match = false;
                    let mut first_column_in_line = None;

                    for (column_index, _) in line_content.match_indices(keyword) {
                        total_count += 1;
                        line_has_match = true;

                        // Column numbers are 1-based for user display
                        let match_pos = (line_number, column_index + 1);

                        if first_match.is_none() {
                            first_match = Some(match_pos);
                        }

                        if first_column_in_line.is_none() {
                            first_column_in_line = Some(column_index + 1);
                        }

                        // Only store first 100 positions to prevent memory issues
                        if positions.len() < 100 {
                            positions.push(match_pos);
                        }

                        // Stop processing this file if we've found too many matches
                        if total_count > 100 {
                            let line_content_option = if self.config.include_line_content {
                                Some(line_content_vec)
                            } else {
                                None
                            };

                            return Some(SearchMatches {
                                // Safe unwrap: at least one match was just recorded.
                                first_match: first_match.unwrap(),
                                total_count,
                                positions,
                                truncated: true,
                                line_content: line_content_option,
                            });
                        }
                    }

                    // If this line has matches and we're including content, add it
                    // (capped at 100 stored lines).
                    if line_has_match
                        && self.config.include_line_content
                        && line_content_vec.len() < 100
                    {
                        line_content_vec.push((
                            line_number,
                            line_content.clone(),
                            // Safe unwrap: line_has_match implies a column was recorded.
                            first_column_in_line.unwrap(),
                        ));
                    }

                    line_number += 1;
                }
                Err(_) => {
                    // Invalid UTF-8 or other read error: stop searching this file.
                    break;
                }
            }
        }

        // Return matches if any were found
        first_match.map(|first| {
            let line_content_option =
                if self.config.include_line_content && !line_content_vec.is_empty() {
                    Some(line_content_vec)
                } else {
                    None
                };

            SearchMatches {
                first_match: first,
                total_count,
                positions,
                truncated: false,
                line_content: line_content_option,
            }
        })
    }
1370
1371 /// ## `enrich_with_smart_scanning` - Add Security & Interest Data
1372 ///
1373 /// Enriches a FileNode with security findings and interest scores.
1374 /// This is the heart of "surface what matters" - we analyze each file
1375 /// for potential security issues and calculate how interesting it is.
1376 fn enrich_with_smart_scanning(&self, node: &mut FileNode) {
1377 // Skip directories and very large files for content-based analysis
1378 if node.is_dir || node.size > 10_000_000 {
1379 // Still calculate interest score for directories
1380 if let Some(calc) = &self.interest_calculator {
1381 node.interest = Some(calc.calculate(node));
1382 node.traversal_context = Some(calc.build_traversal_context(node, None));
1383 }
1384 return;
1385 }
1386
1387 // Try to read file content for security scanning
1388 let content = if self.security_scanner.is_some() && self.should_scan_for_security(node) {
1389 fs::read_to_string(&node.path).ok()
1390 } else {
1391 None
1392 };
1393
1394 // Security scanning
1395 if let (Some(scanner), Some(ref content)) = (&self.security_scanner, &content) {
1396 let findings = scanner.scan_file_content(&node.path, content);
1397 if !findings.is_empty() {
1398 node.security_findings = findings;
1399 }
1400 }
1401
1402 // Interest calculation (with or without security findings)
1403 if let Some(calc) = &self.interest_calculator {
1404 let (score, _additional_findings) = if let Some(ref content) = content {
1405 calc.calculate_with_security(node, Some(content))
1406 } else {
1407 (calc.calculate(node), Vec::new())
1408 };
1409 node.interest = Some(score);
1410 node.traversal_context = Some(calc.build_traversal_context(node, None));
1411 }
1412 }
1413
1414 /// Check if a file should be scanned for security patterns
1415 fn should_scan_for_security(&self, node: &FileNode) -> bool {
1416 // Skip binary files based on category
1417 !matches!(
1418 node.category,
1419 FileCategory::Binary
1420 | FileCategory::Archive
1421 | FileCategory::Image
1422 | FileCategory::Video
1423 | FileCategory::Audio
1424 | FileCategory::DiskImage
1425 | FileCategory::Font
1426 | FileCategory::Encrypted
1427 )
1428 }
1429
    /// ## `scan` - The Full Scan (Non-Streaming)
    ///
    /// Performs a complete directory scan and returns `(Vec<FileNode>, TreeStats)`.
    /// It's a two-act show:
    /// 1. **Act I**: walk the whole tree (bounded by `config.max_depth`),
    ///    collecting a `FileNode` for every entry that survives the ignore
    ///    rules, enriched with search matches and smart-scan data.
    /// 2. **Act II**: when filters are active, run a second pass that keeps
    ///    only matching files plus the directories needed to reach them, and
    ///    compute stats over that final list.
    /// Finally the list is sorted/limited per config, and (in smart mode) the
    /// scan state is persisted for future change detection.
    pub fn scan(&self) -> Result<(Vec<FileNode>, TreeStats)> {
        let mut all_nodes_collected = Vec::new(); // Stores all nodes initially encountered.

        // Initialize safety tracker
        let safety_tracker = ScannerSafetyTracker::new(self.safety_limits.clone());

        let mut walker = WalkDir::new(&self.root)
            .max_depth(self.config.max_depth)
            .follow_links(self.config.follow_symlinks)
            .into_iter();

        while let Some(entry_result) = walker.next() {
            // Abort the whole walk once the safety tracker trips a limit.
            if let Err(safety_error) = safety_tracker.should_continue() {
                eprintln!("โ ๏ธ {}", safety_error);
                eprintln!("   Use --max-depth, --stream mode, or scan a more specific directory");
                break;
            }

            match entry_result {
                Ok(entry) => {
                    let depth = entry.depth();
                    let path = entry.path();
                    let is_ignored_by_rules = self.should_ignore(path)?;

                    if is_ignored_by_rules {
                        if self.config.show_ignored {
                            // Process and add the ignored entry.
                            if let Some(mut node) = self.process_entry(&entry, depth, true)? {
                                if !node.is_dir && self.should_search_file(&node) {
                                    node.search_matches = self.search_in_file(&node.path);
                                }
                                // Smart scanning even for ignored files (they might have security issues!)
                                self.enrich_with_smart_scanning(&mut node);
                                safety_tracker.add_file(estimate_node_size(
                                    node.path.to_string_lossy().len(),
                                ));
                                all_nodes_collected.push(node);
                            }
                            if entry.file_type().is_dir() {
                                walker.skip_current_dir(); // Don't descend into ignored dirs if showing them.
                            }
                        } else {
                            // Not showing ignored, and it's a directory: skip its contents.
                            if entry.file_type().is_dir() {
                                walker.skip_current_dir();
                            }
                            // If it's a file, it's simply skipped by not adding to `all_nodes_collected`.
                        }
                    } else {
                        // Not ignored by rules, process normally.
                        if let Some(mut node) = self.process_entry(&entry, depth, false)? {
                            if !node.is_dir && self.should_search_file(&node) {
                                node.search_matches = self.search_in_file(&node.path);
                            }
                            // Smart scanning: add security findings and interest scores
                            self.enrich_with_smart_scanning(&mut node);
                            all_nodes_collected.push(node);
                        } else {
                            // process_entry returned None: hidden entry while show_hidden
                            // is off. If it's a directory, skip its contents too.
                            if entry.file_type().is_dir() {
                                walker.skip_current_dir();
                            }
                        }
                    }
                }
                Err(e) => {
                    // Handle errors like permission denied.
                    if let Some(path) = e.path() {
                        let depth = e.depth();
                        all_nodes_collected.push(self.create_permission_denied_node(path, depth));
                        // Only a PermissionDenied IO error means the directory's
                        // *contents* are unreadable and descent must be skipped.
                        if e.io_error().is_some_and(|io_err| {
                            io_err.kind() == std::io::ErrorKind::PermissionDenied
                        }) {
                            walker.skip_current_dir(); // Skip unreadable directory.
                        }
                    }
                }
            }
        }

        // With active filters, a second pass keeps only matching files and the
        // directories leading to them, computing stats on that final list.
        let (final_nodes, final_stats) = if self.has_active_filters() {
            self.filter_nodes_and_calculate_stats(all_nodes_collected)
        } else {
            // No filters: every collected node is final; compute stats directly.
            let mut stats = TreeStats::default();
            for node in &all_nodes_collected {
                // Permission-denied files are placeholders (size 0) and aren't
                // counted; directories contribute even when denied.
                if !node.permission_denied || node.is_dir {
                    stats.update_file(node);
                }
            }
            (all_nodes_collected, stats)
        };

        // Apply sorting and top-N filtering if requested
        let sorted_nodes = self.apply_sorting_and_limit(final_nodes);

        // Save scan state for future change detection (if smart mode enabled)
        if self.config.smart_mode || self.config.compute_interest {
            self.save_scan_state(&sorted_nodes);
        }

        Ok((sorted_nodes, final_stats))
    }
1558
1559 /// Save the current scan state for future change detection
1560 fn save_scan_state(&self, nodes: &[FileNode]) {
1561 use crate::scanner_state::FileSignature;
1562
1563 let mut state = ScanState::new(self.root.clone());
1564
1565 for node in nodes {
1566 if let Ok(sig) = FileSignature::from_path(&node.path) {
1567 state.add_signature(node.path.clone(), sig);
1568 }
1569 }
1570
1571 // Save state (ignore errors - this is best-effort)
1572 if let Err(e) = state.save() {
1573 // Only log in debug mode, don't clutter normal output
1574 tracing::debug!("Could not save scan state: {}", e);
1575 }
1576 }
1577
1578 /// ## `has_active_filters`
1579 ///
1580 /// Helper function to quickly check if any of the primary filtering criteria
1581 /// (find pattern, type, size, date) are currently set in the configuration.
1582 /// This determines if the second filtering pass (`filter_nodes_and_calculate_stats`) is needed.
1583 /// Note: `search_keyword` is handled slightly differently; it can make a file appear
1584 /// even if other filters would exclude it, so it's part of `should_include` logic.
1585 fn has_active_filters(&self) -> bool {
1586 self.config.find_pattern.is_some()
1587 || self.config.file_type_filter.is_some()
1588 || self.config.entry_type_filter.is_some()
1589 || self.config.min_size.is_some()
1590 || self.config.max_size.is_some()
1591 || self.config.newer_than.is_some()
1592 || self.config.older_than.is_some()
1593 || self.config.search_keyword.is_some() // Now search_keyword is also a filter
1594 }
1595
1596 /// ## `filter_nodes_and_calculate_stats` (Formerly `filter_nodes_with_ancestors`)
1597 ///
1598 /// This crucial function takes all nodes collected during the initial traversal
1599 /// and filters them based on the `ScannerConfig`. It ensures that:
1600 /// 1. Files are included if they directly match all active filters OR if they contain a search match.
1601 /// 2. Directories are included if they themselves match a `--find` pattern OR
1602 /// if they are an ancestor of an included file.
1603 /// It then calculates `TreeStats` based on this final, filtered list of nodes.
1604 /// This replaces the older `filter_nodes_with_ancestors` to integrate stat calculation
1605 /// and clarify the logic for directory inclusion with `--find`.
1606 fn filter_nodes_and_calculate_stats(
1607 &self,
1608 all_nodes_collected: Vec<FileNode>,
1609 ) -> (Vec<FileNode>, TreeStats) {
1610 let mut final_stats = TreeStats::default();
1611 let mut included_files_and_matching_dirs = Vec::new(); // Files that pass filters, and Dirs that match --find
1612 let mut required_ancestor_dirs = HashSet::new(); // Ancestors of included_files
1613
1614 // --- Pass 1: Identify matching files and directories that directly match --find ---
1615 for node in &all_nodes_collected {
1616 if node.permission_denied {
1617 // Skip permission denied entries for filtering logic
1618 continue;
1619 }
1620
1621 let has_search_match = node
1622 .search_matches
1623 .as_ref()
1624 .is_some_and(|m| m.total_count > 0);
1625
1626 if node.is_dir {
1627 // For directories, only the --find pattern applies directly.
1628 // Other filters (size, date, type) don't apply to directories themselves.
1629 if self
1630 .config
1631 .find_pattern
1632 .as_ref()
1633 .is_some_and(|p| p.is_match(&node.path.to_string_lossy()))
1634 {
1635 included_files_and_matching_dirs.push(node.clone());
1636 // Add ancestors of this directly matched directory
1637 let mut current = node.path.parent();
1638 while let Some(parent_path) = current {
1639 if parent_path == self.root || required_ancestor_dirs.contains(parent_path)
1640 {
1641 break;
1642 }
1643 required_ancestor_dirs.insert(parent_path.to_path_buf());
1644 current = parent_path.parent();
1645 }
1646 }
1647 } else {
1648 // For files, check if it passes all filters OR has a search match.
1649 // If we have a search keyword, ONLY include files with search matches
1650 if self.config.search_keyword.is_some() {
1651 if has_search_match {
1652 // Even with search matches, the file must still pass other filters
1653 if self.should_include(node) {
1654 included_files_and_matching_dirs.push(node.clone());
1655 // Add all ancestors of this matching file to `required_ancestor_dirs`.
1656 let mut current = node.path.parent();
1657 while let Some(parent_path) = current {
1658 // Stop if we reach the root or an already added ancestor.
1659 if parent_path == self.root
1660 || required_ancestor_dirs.contains(parent_path)
1661 {
1662 break;
1663 }
1664 required_ancestor_dirs.insert(parent_path.to_path_buf());
1665 current = parent_path.parent();
1666 }
1667 }
1668 }
1669 } else {
1670 // No search keyword, use normal filtering
1671 if has_search_match || self.should_include(node) {
1672 included_files_and_matching_dirs.push(node.clone());
1673 // Add all ancestors of this matching file to `required_ancestor_dirs`.
1674 let mut current = node.path.parent();
1675 while let Some(parent_path) = current {
1676 // Stop if we reach the root or an already added ancestor.
1677 if parent_path == self.root
1678 || required_ancestor_dirs.contains(parent_path)
1679 {
1680 break;
1681 }
1682 required_ancestor_dirs.insert(parent_path.to_path_buf());
1683 current = parent_path.parent();
1684 }
1685 }
1686 }
1687 }
1688 }
1689
1690 // --- Pass 2: Build the final list of nodes ---
1691 let mut final_node_list = Vec::new();
1692 let mut added_paths = HashSet::new(); // To prevent duplicates if a dir is both an ancestor and matches --find
1693
1694 // Always add the root node if there's anything to show.
1695 if !included_files_and_matching_dirs.is_empty() {
1696 if let Some(root_node) = all_nodes_collected.iter().find(|n| n.path == self.root) {
1697 if added_paths.insert(root_node.path.clone()) {
1698 final_node_list.push(root_node.clone());
1699 }
1700 }
1701 }
1702
1703 // Add required ancestor directories and directly matching directories from `all_nodes_collected`.
1704 for node in &all_nodes_collected {
1705 if node.permission_denied {
1706 // Also include permission denied nodes if they are part of the path
1707 if (required_ancestor_dirs.contains(&node.path)
1708 || node.path == self.root && !final_node_list.is_empty())
1709 && added_paths.insert(node.path.clone())
1710 {
1711 final_node_list.push(node.clone());
1712 }
1713 continue;
1714 }
1715
1716 if node.is_dir {
1717 // Is it a required ancestor OR a directory that itself matched --find?
1718 let is_find_match = self
1719 .config
1720 .find_pattern
1721 .as_ref()
1722 .is_some_and(|p| p.is_match(&node.path.to_string_lossy()));
1723 if (required_ancestor_dirs.contains(&node.path)
1724 || (is_find_match && node.path != self.root))
1725 && added_paths.insert(node.path.clone())
1726 {
1727 final_node_list.push(node.clone());
1728 }
1729 }
1730 }
1731
1732 // Add the files that passed filters or had search matches.
1733 for node in included_files_and_matching_dirs {
1734 // If it's a directory, it was already handled above (if it matched --find).
1735 // If it's a file, add it now.
1736 if !node.is_dir {
1737 if added_paths.insert(node.path.clone()) {
1738 final_node_list.push(node);
1739 }
1740 } else {
1741 // It's a directory that matched --find
1742 if added_paths.insert(node.path.clone()) {
1743 final_node_list.push(node);
1744 }
1745 }
1746 }
1747
1748 // Sort the final list by path for consistent output.
1749 final_node_list.sort_by(|a, b| a.path.cmp(&b.path));
1750
1751 // --- Pass 3: Calculate stats on the final_node_list ---
1752 for node in &final_node_list {
1753 // Update stats, ensuring not to double-count or miscount permission-denied entries.
1754 if !node.permission_denied || node.is_dir {
1755 // Dirs (even denied) contribute to dir count.
1756 final_stats.update_file(node);
1757 }
1758 }
1759
1760 (final_node_list, final_stats)
1761 }
1762
1763 /// ## `process_entry`
1764 ///
1765 /// Converts a `walkdir::DirEntry` into our `FileNode` struct.
1766 /// This involves fetching metadata, determining file type, category, hidden status, etc.
1767 /// It also incorporates the `is_ignored_by_rules` status passed to it.
1768 /// Returns `Ok(Some(FileNode))` on success, `Ok(None)` if the entry should be skipped
1769 /// (e.g., hidden and not showing hidden), or an `Err` if metadata access fails.
1770 /// The `is_ignored_by_rules` parameter tells this function if `should_ignore` already determined this node is ignored.
1771 fn process_entry(
1772 &self,
1773 entry: &DirEntry,
1774 depth: usize,
1775 is_ignored_by_rules: bool,
1776 ) -> Result<Option<FileNode>> {
1777 let path = entry.path();
1778
1779 // Determine if the file is hidden (starts with '.').
1780 let is_hidden = path
1781 .file_name()
1782 .and_then(|name_osstr| name_osstr.to_str()) // Convert OsStr to &str
1783 .is_some_and(|name_str| name_str.starts_with('.'));
1784
1785 // Skip if hidden and we are not configured to show hidden files,
1786 // UNLESS it's an ignored item that we *are* configured to show (is_ignored_by_rules = true, config.show_ignored = true).
1787 // The `is_ignored_by_rules` flag takes precedence for display if `config.show_ignored` is true.
1788 if is_hidden && !self.config.show_hidden && !is_ignored_by_rules {
1789 // If it's a directory, we need to tell walkdir to skip its contents.
1790 if entry.file_type().is_dir() {
1791 // This is tricky because `process_entry` doesn't have `walker` to call `skip_current_dir()`.
1792 // The caller (`scan` or `scan_stream`) handles `skip_current_dir` based on `should_ignore`
1793 // and hidden status before calling `process_entry` or by checking the returned node.
1794 // For now, returning None signals to the caller that this node (and its children if a dir)
1795 // should not be further processed or added, unless `show_ignored` logic overrides.
1796 }
1797 return Ok(None); // Skip this hidden entry.
1798 }
1799
1800 // Try to get metadata for the entry. This can fail (e.g., permission denied).
1801 let metadata = match entry.metadata() {
1802 Ok(md) => md,
1803 Err(_e) => {
1804 // If metadata fails, it's likely a permission issue or a broken symlink.
1805 // We create a special "permission_denied_node" in the calling `scan`/`scan_stream` methods
1806 // because they have access to `walker.skip_current_dir()`.
1807 // Here, we can't fully form that node, so we might return an error or a partial node.
1808 // For simplicity, if metadata fails here, we treat it as an inaccessible entry.
1809 // The main scan loops handle creating a FileNode for permission denied errors from WalkDir.
1810 // This specific call path implies WalkDir *could* read the entry but metadata() failed.
1811 // This is less common than WalkDir itself erroring.
1812 // Let's assume the main loops catch this via `Err(e)` from `walker.next()`.
1813 // If `process_entry` is called on an entry that `WalkDir` gave Ok for, but `metadata()` fails,
1814 // it's an edge case. We'll return a basic node marked as permission denied.
1815 return Ok(Some(self.create_permission_denied_node(path, depth)));
1816 }
1817 };
1818
1819 let file_type = self.determine_file_type(&metadata);
1820 let category = Self::get_file_category(path, file_type);
1821
1822 // Determine the size. For special virtual files (like in /proc or /sys),
1823 // reported size can be misleading (e.g., 0 or huge). We mark these as size 0.
1824 let size = if self.is_special_virtual_file(path, &metadata) {
1825 0
1826 } else {
1827 metadata.len()
1828 };
1829
1830 // Check if this is a directory that we can't read the contents of
1831 let permission_denied = if metadata.is_dir() {
1832 // Try to read the directory to see if we have permission
1833 std::fs::read_dir(path).is_err()
1834 } else {
1835 false
1836 };
1837
1838 // Check for git branch if this is a directory
1839 let git_branch = if metadata.is_dir() {
1840 Self::get_git_branch(path)
1841 } else {
1842 None
1843 };
1844
1845 Ok(Some(FileNode {
1846 path: path.to_path_buf(),
1847 is_dir: metadata.is_dir(),
1848 size,
1849 permissions: Self::get_permissions(&metadata),
1850 uid: Self::get_uid(&metadata),
1851 gid: Self::get_gid(&metadata),
1852 modified: metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH), // Fallback for modified time
1853 is_symlink: metadata.file_type().is_symlink(), // Use file_type() for symlink check
1854 is_hidden,
1855 permission_denied, // Set based on whether we can read directory contents
1856 is_ignored: is_ignored_by_rules, // Use the pre-determined ignore status.
1857 depth,
1858 file_type,
1859 category,
1860 search_matches: None, // Search matches are added later by the caller if needed.
1861 filesystem_type: Self::get_filesystem_type(path),
1862 git_branch,
1863 // Smart scanning fields - populated later by interest calculator
1864 traversal_context: None,
1865 interest: None,
1866 security_findings: Vec::new(),
1867 change_status: None,
1868 content_hash: None,
1869 }))
1870 }
1871
1872 /// ## `get_git_branch`
1873 ///
1874 /// Gets the current git branch if this directory contains a .git folder.
1875 /// Reads directly from .git/HEAD for speed (no subprocess).
1876 fn get_git_branch(path: &Path) -> Option<String> {
1877 let git_dir = path.join(".git");
1878 if !git_dir.exists() {
1879 return None;
1880 }
1881
1882 // Read .git/HEAD to get the current ref
1883 let head_path = git_dir.join("HEAD");
1884 let head_content = std::fs::read_to_string(&head_path).ok()?;
1885 let head_content = head_content.trim();
1886
1887 // HEAD can be either:
1888 // 1. "ref: refs/heads/branch-name" (normal branch)
1889 // 2. A raw commit hash (detached HEAD)
1890 if let Some(branch_ref) = head_content.strip_prefix("ref: refs/heads/") {
1891 Some(branch_ref.to_string())
1892 } else if head_content.len() >= 7 {
1893 // Detached HEAD - show abbreviated commit hash
1894 Some(format!(":{}", &head_content[..7]))
1895 } else {
1896 None
1897 }
1898 }
1899
1900 /// ## `get_filesystem_type`
1901 ///
1902 /// Detects the filesystem type for a given path
1903 #[cfg(unix)]
1904 fn get_filesystem_type(path: &Path) -> FilesystemType {
1905 // Skip filesystem detection in CI environments to avoid hangs
1906 if std::env::var("CI").is_ok() || std::env::var("GITHUB_ACTIONS").is_ok() {
1907 return FilesystemType::Unknown;
1908 }
1909
1910 #[cfg(target_os = "linux")]
1911 {
1912 Self::get_filesystem_type_linux(path)
1913 }
1914 #[cfg(not(target_os = "linux"))]
1915 {
1916 // On non-Linux Unix systems, we can't reliably detect filesystem type
1917 // Just check for special paths
1918 if let Some(path_str) = path.to_str() {
1919 if path_str.starts_with("/proc") {
1920 return FilesystemType::Procfs;
1921 } else if path_str.starts_with("/sys") {
1922 return FilesystemType::Sysfs;
1923 } else if path_str.starts_with("/dev") {
1924 return FilesystemType::Devfs;
1925 }
1926 }
1927 FilesystemType::Unknown
1928 }
1929 }
1930
    /// ## `get_filesystem_type_linux`
    ///
    /// Detects the filesystem type for a given path using statfs on Linux systems.
    /// Falls back to path-prefix heuristics for virtual filesystems when the
    /// `statfs` call fails, and returns `Unknown` in CI environments or when
    /// the magic number is not recognized.
    #[cfg(target_os = "linux")]
    fn get_filesystem_type_linux(path: &Path) -> FilesystemType {
        // Double-check for CI environment (the caller also checks, but this
        // function could be reached through other paths in the future).
        if std::env::var("CI").is_ok() || std::env::var("GITHUB_ACTIONS").is_ok() {
            return FilesystemType::Unknown;
        }

        use libc::statfs;
        use std::ffi::CString;
        use std::mem;

        // Filesystem magic numbers from statfs.h.
        // NOTE(review): assumes a 64-bit `f_type` (`__fsword_t` on x86_64);
        // 32-bit Linux targets use a narrower type — confirm if those targets
        // are supported.
        type FsType = i64;

        const EXT4_SUPER_MAGIC: FsType = 0xef53;
        const XFS_SUPER_MAGIC: FsType = 0x58465342;
        const BTRFS_SUPER_MAGIC: FsType = 0x9123683e;
        const ZFS_SUPER_MAGIC: FsType = 0x2fc12fc1;
        const NTFS_SB_MAGIC: FsType = 0x5346544e;
        const MSDOS_SUPER_MAGIC: FsType = 0x4d44; // FAT
        const EXFAT_SUPER_MAGIC: FsType = 0x2011bab0;
        const APFS_SUPER_MAGIC: FsType = 0x42535041; // 'APFS'
        const HFS_SUPER_MAGIC: FsType = 0x482b; // HFS+
        const NFS_SUPER_MAGIC: FsType = 0x6969;
        const SMB_SUPER_MAGIC: FsType = 0x517b;
        const TMPFS_MAGIC: FsType = 0x01021994;
        const PROC_SUPER_MAGIC: FsType = 0x9fa0;
        const SYSFS_MAGIC: FsType = 0x62656572;
        const DEVFS_SUPER_MAGIC: FsType = 0x1373;

        // statfs needs a NUL-terminated C string; a path containing an interior
        // NUL byte cannot be represented, so treat that as Unknown.
        let path_cstr = match CString::new(path.to_string_lossy().as_bytes()) {
            Ok(s) => s,
            Err(_) => return FilesystemType::Unknown,
        };

        // SAFETY: `libc::statfs` is plain-old-data for which an all-zero bit
        // pattern is a valid value, so zero-initializing it is sound.
        let mut stat_buf: libc::statfs = unsafe { mem::zeroed() };
        // SAFETY: `path_cstr` is a valid NUL-terminated string and `stat_buf`
        // is a live, writable statfs struct for the duration of the call.
        let result = unsafe { statfs(path_cstr.as_ptr(), &mut stat_buf) };

        if result != 0 {
            // statfs failed, fall back to path-based detection for virtual filesystems.
            if let Some(path_str) = path.to_str() {
                if path_str.starts_with("/proc") {
                    return FilesystemType::Procfs;
                } else if path_str.starts_with("/sys") {
                    return FilesystemType::Sysfs;
                } else if path_str.starts_with("/dev") {
                    return FilesystemType::Devfs;
                }
            }
            return FilesystemType::Unknown;
        }

        // Check for Mem8 filesystem by looking for .mem8 marker files.
        // NOTE(review): the `contains("mem8")` substring check can also match
        // ordinary directories whose path merely mentions "mem8" — confirm
        // this is intentional.
        if path.join(".mem8").exists() || path.to_string_lossy().contains("mem8") {
            return FilesystemType::Mem8;
        }

        // Map the reported magic number onto our enum.
        match stat_buf.f_type {
            EXT4_SUPER_MAGIC => FilesystemType::Ext4, // TODO: Distinguish ext2/3/4
            XFS_SUPER_MAGIC => FilesystemType::Xfs,
            BTRFS_SUPER_MAGIC => FilesystemType::Btrfs,
            ZFS_SUPER_MAGIC => FilesystemType::Zfs,
            NTFS_SB_MAGIC => FilesystemType::Ntfs,
            MSDOS_SUPER_MAGIC => FilesystemType::Fat32,
            EXFAT_SUPER_MAGIC => FilesystemType::ExFat,
            APFS_SUPER_MAGIC => FilesystemType::Apfs,
            HFS_SUPER_MAGIC => FilesystemType::Hfs,
            NFS_SUPER_MAGIC => FilesystemType::Nfs,
            SMB_SUPER_MAGIC => FilesystemType::Smb,
            TMPFS_MAGIC => FilesystemType::Tmpfs,
            PROC_SUPER_MAGIC => FilesystemType::Procfs,
            SYSFS_MAGIC => FilesystemType::Sysfs,
            DEVFS_SUPER_MAGIC => FilesystemType::Devfs,
            _ => FilesystemType::Unknown,
        }
    }
2010
    /// ## `get_filesystem_type` (non-Unix fallback)
    ///
    /// Filesystem detection relies on Unix-only APIs, so on other platforms
    /// every path reports `Unknown`.
    #[cfg(not(unix))]
    fn get_filesystem_type(_path: &Path) -> FilesystemType {
        // On non-Unix systems, we can't easily detect filesystem type
        FilesystemType::Unknown
    }
2016
    /// ## `is_virtual_filesystem`
    ///
    /// Checks if a path is on a virtual filesystem by delegating to
    /// `get_filesystem_type` and the `FilesystemType::is_virtual` helper.
    fn is_virtual_filesystem(path: &Path) -> bool {
        Self::get_filesystem_type(path).is_virtual()
    }
2023
2024 /// ## `is_special_virtual_file`
2025 ///
2026 /// Checks if a file is likely a special virtual file (e.g., in /proc, /sys, /dev)
2027 /// where reported metadata like size might be zero, misleading, or cause issues if read.
2028 /// This helps in deciding to report size as 0 for such files.
2029 #[allow(unused_variables)]
2030 fn is_special_virtual_file(&self, path: &Path, metadata: &fs::Metadata) -> bool {
2031 // Check if the path starts with known virtual filesystem prefixes.
2032 if let Some(path_str) = path.to_str() {
2033 if path_str.starts_with("/proc/")
2034 || path_str.starts_with("/sys/")
2035 || path_str.starts_with("/dev/")
2036 {
2037 return true;
2038 }
2039 }
2040
2041 // Check for specific problematic files by absolute path.
2042 if self.ignore_files.contains(path) {
2043 // Uses the pre-built HashSet of specific problem files.
2044 return true;
2045 }
2046
2047 // On Unix, check for special file types like character devices, block devices, FIFOs, sockets.
2048 // These often have size 0 or non-standard size reporting.
2049 #[cfg(unix)]
2050 {
2051 use std::os::unix::fs::FileTypeExt; // For is_char_device(), is_block_device(), etc.
2052 let ft = metadata.file_type();
2053 if ft.is_char_device() || ft.is_block_device() || ft.is_fifo() || ft.is_socket() {
2054 return true;
2055 }
2056 }
2057
2058 false // Not determined to be a special virtual file by these checks.
2059 }
2060
2061 /// ## `create_permission_denied_node`
2062 ///
2063 /// Helper to create a `FileNode` representing an entry (usually a directory)
2064 /// that could not be accessed due to permission errors.
2065 /// These nodes are marked specially so formatters can indicate the issue.
2066 fn create_permission_denied_node(&self, path: &Path, depth: usize) -> FileNode {
2067 FileNode {
2068 path: path.to_path_buf(),
2069 is_dir: true, // Assume it's a directory, as that's common for permission errors during traversal.
2070 size: 0, // No size info available.
2071 permissions: 0, // No permission info.
2072 uid: 0, // No UID info.
2073 gid: 0, // No GID info.
2074 modified: SystemTime::UNIX_EPOCH, // Default timestamp.
2075 is_symlink: false,
2076 is_hidden: false, // Cannot determine if hidden.
2077 permission_denied: true, // Mark as permission denied.
2078 is_ignored: false, // Not ignored by rules, but inaccessible.
2079 depth,
2080 file_type: FileType::Directory, // Assume directory.
2081 category: FileCategory::Unknown,
2082 search_matches: None,
2083 filesystem_type: Self::get_filesystem_type(path),
2084 git_branch: None, // Can't check git for permission-denied directories
2085 // Smart scanning fields - N/A for permission denied nodes
2086 traversal_context: None,
2087 interest: None,
2088 security_findings: Vec::new(),
2089 change_status: None,
2090 content_hash: None,
2091 }
2092 }
2093
2094 /// ## `should_ignore` - The Bouncer at the Club Door
2095 ///
2096 /// This function is our tough-but-fair bouncer. It checks every file and
2097 /// directory against our lists (`.gitignore`, default ignores, etc.).
2098 /// "Sorry, `node_modules`, you're not on the list tonight."
2099 /// It's the first line of defense against clutter.
2100 fn should_ignore(&self, path: &Path) -> Result<bool> {
2101 // --- Rule 0: Never ignore the root path itself ---
2102 // If the user explicitly asks to scan a directory, we should show it
2103 // even if it would normally be ignored (e.g., scanning 'target' directory)
2104 if path == self.root {
2105 return Ok(false);
2106 }
2107
2108 // --- Rule 1: Check against specific, always-ignored files (absolute paths) ---
2109 if self.config.use_default_ignores && self.ignore_files.contains(path) {
2110 return Ok(true); // Matches a specific problematic file.
2111 }
2112
2113 // --- Rule 2: ALWAYS skip virtual filesystems like /proc, /sys, /dev ---
2114 // These are checked regardless of use_default_ignores because they're not real files
2115 // and can cause issues (huge fake sizes, hangs, etc.)
2116 if Self::is_virtual_filesystem(path) {
2117 return Ok(true);
2118 }
2119
2120 // --- Rule 3: Check against other system paths if using default ignores ---
2121 if self.config.use_default_ignores {
2122 // Check for exact match of a system path.
2123 if self.system_paths.contains(path) {
2124 return Ok(true);
2125 }
2126 // Check if the current path is a child of any registered system path.
2127 for system_root_path in &self.system_paths {
2128 if path.starts_with(system_root_path) {
2129 return Ok(true); // It's inside /tmp, /var/tmp, etc.
2130 }
2131 }
2132 }
2133
2134 // --- Rule 3: Check against default ignore patterns (GlobSet) ---
2135 // These patterns usually match file/directory names or relative paths within a project.
2136 if let Some(ref default_ignore_set) = self.default_ignores {
2137 // Check if the simple file/directory name matches any default pattern.
2138 // (e.g., "node_modules" will match `path/to/project/node_modules`)
2139 if let Some(file_name) = path.file_name() {
2140 if default_ignore_set.is_match(Path::new(file_name)) {
2141 return Ok(true);
2142 }
2143 }
2144 // Also check the path relative to the scan root against default patterns.
2145 // This handles patterns like "*.pyc" or "build/outputs/".
2146 if let Ok(relative_path_to_root) = path.strip_prefix(&self.root) {
2147 if default_ignore_set.is_match(relative_path_to_root) {
2148 return Ok(true);
2149 }
2150 }
2151 }
2152
2153 // --- Rule 4: Check against .gitignore patterns (GlobSet) ---
2154 // These patterns are always relative to the root of the scan (where .gitignore is located).
2155 if let Some(ref gitignore_set) = self.gitignore {
2156 if let Ok(relative_path_to_root) = path.strip_prefix(&self.root) {
2157 if gitignore_set.is_match(relative_path_to_root) {
2158 return Ok(true); // Matches a .gitignore pattern.
2159 }
2160 }
2161 // If strip_prefix fails (path is not under root), it can't match .gitignore relative patterns.
2162 }
2163
2164 // If none of the above rules triggered, the path is not ignored.
2165 Ok(false)
2166 }
2167
2168 /// ## `should_include` - The Velvet Rope
2169 ///
2170 /// Once a file gets past the bouncer (`should_ignore`), it has to get past
2171 /// the velvet rope. This function checks if the file meets the specific criteria
2172 /// for this party: "Are you a `.rs` file? Are you bigger than 1MB?"
2173 /// Only the coolest files that match all the rules get in.
2174 fn should_include(&self, node: &FileNode) -> bool {
2175 // --- Filter by --find pattern (applies to both files and directories) ---
2176 if let Some(ref find_regex_pattern) = self.config.find_pattern {
2177 // Convert path to string for regex matching. Lossy conversion is acceptable for matching.
2178 let path_str = node.path.to_string_lossy();
2179 if !find_regex_pattern.is_match(&path_str) {
2180 return false; // Path doesn't match the --find pattern.
2181 }
2182 }
2183
2184 // --- Filter by entry type (--entry-type) ---
2185 if let Some(ref entry_type) = self.config.entry_type_filter {
2186 match entry_type.as_str() {
2187 "f" => {
2188 if node.is_dir {
2189 return false; // Looking for files only, but this is a directory
2190 }
2191 }
2192 "d" => {
2193 if !node.is_dir {
2194 return false; // Looking for directories only, but this is a file
2195 }
2196 }
2197 _ => {} // Should not happen due to clap validation
2198 }
2199 }
2200
2201 // --- Filters below only apply to files, not directories ---
2202 if !node.is_dir {
2203 // --- Filter by file extension (--type) ---
2204 if let Some(ref required_extension) = self.config.file_type_filter {
2205 match node
2206 .path
2207 .extension()
2208 .and_then(|ext_osstr| ext_osstr.to_str())
2209 {
2210 Some(file_ext_str) => {
2211 if !file_ext_str.eq_ignore_ascii_case(required_extension) {
2212 return false; // Extension doesn't match.
2213 }
2214 }
2215 None => return false, // File has no extension, so cannot match.
2216 }
2217 }
2218
2219 // --- Filter by minimum size (--min-size) ---
2220 if let Some(min_allowed_size) = self.config.min_size {
2221 if node.size < min_allowed_size {
2222 return false; // File is too small.
2223 }
2224 }
2225
2226 // --- Filter by maximum size (--max-size) ---
2227 if let Some(max_allowed_size) = self.config.max_size {
2228 if node.size > max_allowed_size {
2229 return false; // File is too large.
2230 }
2231 }
2232 } // End of file-only filters
2233
2234 // --- Date filters (apply to both files and directories based on their modification time) ---
2235 // --- Filter by newer_than date (--newer-than) ---
2236 if let Some(min_modification_date) = self.config.newer_than {
2237 if node.modified < min_modification_date {
2238 return false; // Entry is older than required.
2239 }
2240 }
2241
2242 // --- Filter by older_than date (--older-than) ---
2243 if let Some(max_modification_date) = self.config.older_than {
2244 if node.modified > max_modification_date {
2245 return false; // Entry is newer than allowed.
2246 }
2247 }
2248
2249 // If all applicable filters passed (or no filters were active for a category), include the node.
2250 true
2251 }
2252
2253 /// ## `determine_file_type` (Helper for `process_entry`)
2254 ///
2255 /// Examines `fs::Metadata` to determine a more specific `FileType`
2256 /// than just `is_dir` or `is_file`. On Unix, this can identify symlinks,
2257 /// sockets, FIFOs, block/char devices, and executables (by permission).
2258 /// On non-Unix, it's simpler (dir, symlink, or regular file).
2259 fn determine_file_type(&self, metadata: &fs::Metadata) -> FileType {
2260 #[cfg(unix)] // Unix-specific detailed file type detection
2261 {
2262 use std::os::unix::fs::FileTypeExt; // For is_socket, is_fifo, etc.
2263 let ft = metadata.file_type(); // Get the rich FileType from metadata.
2264
2265 if ft.is_dir() {
2266 FileType::Directory
2267 } else if ft.is_symlink() {
2268 // Check symlink before other types, as it can point to them.
2269 FileType::Symlink
2270 } else if ft.is_socket() {
2271 FileType::Socket
2272 } else if ft.is_fifo() {
2273 // Named pipe
2274 FileType::Pipe
2275 } else if ft.is_block_device() {
2276 FileType::BlockDevice
2277 } else if ft.is_char_device() {
2278 FileType::CharDevice
2279 // Check for executable permission (any of user, group, other execute bits are set).
2280 // This applies to regular files that are not dirs, symlinks, or other special types.
2281 } else if ft.is_file() && (metadata.permissions().mode() & 0o111 != 0) {
2282 FileType::Executable
2283 } else {
2284 // If none of the above, it's a regular (non-executable) file.
2285 FileType::RegularFile
2286 }
2287 }
2288
2289 #[cfg(not(unix))] // Simpler detection for non-Unix platforms
2290 {
2291 if metadata.is_dir() {
2292 FileType::Directory
2293 } else if metadata.file_type().is_symlink() {
2294 // `is_symlink()` is part of stable `fs::FileType`
2295 FileType::Symlink
2296 } else {
2297 // No easy cross-platform way to check executable bit without external crates or OS-specific calls.
2298 // So, on non-Unix, we don't distinguish Executable from RegularFile here.
2299 FileType::RegularFile
2300 }
2301 }
2302 }
2303
    // --- Platform-Dependent Metadata Helpers ---
    // These provide a consistent way to get permissions, UID, and GID,
    // with sensible defaults for non-Unix systems where these concepts might not directly apply
    // or be easily accessible via standard Rust fs::Metadata.

    /// Returns the Unix permission bits, masked to `0o777`.
    #[cfg(unix)]
    fn get_permissions(metadata: &fs::Metadata) -> u32 {
        // On Unix, get the mode and mask it to get the permission bits (e.g., 0o755).
        metadata.permissions().mode() & 0o777
    }
    /// Non-Unix fallback: reports a conventional default mode.
    #[cfg(not(unix))]
    fn get_permissions(_metadata: &fs::Metadata) -> u32 {
        0o755 // A common default permission (rwxr-xr-x) for non-Unix.
    }

    /// Returns the owning user id from the metadata.
    #[cfg(unix)]
    fn get_uid(metadata: &fs::Metadata) -> u32 {
        metadata.uid() // Get User ID from metadata.
    }
    /// Non-Unix fallback: reports a placeholder UID.
    #[cfg(not(unix))]
    fn get_uid(_metadata: &fs::Metadata) -> u32 {
        1000 // Common default UID placeholder for non-Unix.
    }

    /// Returns the owning group id from the metadata.
    #[cfg(unix)]
    fn get_gid(metadata: &fs::Metadata) -> u32 {
        metadata.gid() // Get Group ID from metadata.
    }
    /// Non-Unix fallback: reports GID 0 as a placeholder.
    #[cfg(not(unix))]
    fn get_gid(_metadata: &fs::Metadata) -> u32 {
        0
    }
2336
2337 /// Apply sorting and optional top-N limit to the results
2338 fn apply_sorting_and_limit(&self, mut nodes: Vec<FileNode>) -> Vec<FileNode> {
2339 // If no sort field specified, return as-is
2340 let sort_field = match &self.config.sort_field {
2341 Some(field) => field,
2342 None => return nodes,
2343 };
2344
2345 // Sort based on the field
2346 match sort_field.as_str() {
2347 "name" | "a-to-z" => {
2348 // Sort by name alphabetically (A to Z)
2349 nodes.sort_by(|a, b| {
2350 let name_a = a.path.file_name().unwrap_or_default().to_string_lossy();
2351 let name_b = b.path.file_name().unwrap_or_default().to_string_lossy();
2352 name_a.cmp(&name_b)
2353 });
2354 }
2355 "z-to-a" => {
2356 // Sort by name reverse alphabetically (Z to A)
2357 nodes.sort_by(|a, b| {
2358 let name_a = a.path.file_name().unwrap_or_default().to_string_lossy();
2359 let name_b = b.path.file_name().unwrap_or_default().to_string_lossy();
2360 name_b.cmp(&name_a)
2361 });
2362 }
2363 "size" | "largest" => {
2364 // Sort by size descending (largest first)
2365 nodes.sort_by(|a, b| b.size.cmp(&a.size));
2366 }
2367 "smallest" => {
2368 // Sort by size ascending (smallest first)
2369 nodes.sort_by(|a, b| a.size.cmp(&b.size));
2370 }
2371 "date" | "newest" => {
2372 // Sort by modification time descending (newest first)
2373 nodes.sort_by(|a, b| b.modified.cmp(&a.modified));
2374 }
2375 "oldest" => {
2376 // Sort by modification time ascending (oldest first)
2377 nodes.sort_by(|a, b| a.modified.cmp(&b.modified));
2378 }
2379 "type" => {
2380 // Sort by file extension, then by name
2381 nodes.sort_by(|a, b| {
2382 let ext_a = a.path.extension().unwrap_or_default().to_string_lossy();
2383 let ext_b = b.path.extension().unwrap_or_default().to_string_lossy();
2384 match ext_a.cmp(&ext_b) {
2385 std::cmp::Ordering::Equal => {
2386 let name_a = a.path.file_name().unwrap_or_default().to_string_lossy();
2387 let name_b = b.path.file_name().unwrap_or_default().to_string_lossy();
2388 name_a.cmp(&name_b)
2389 }
2390 other => other,
2391 }
2392 });
2393 }
2394 _ => {
2395 // Unknown sort field, don't sort
2396 eprintln!("Warning: Unknown sort field '{}', ignoring", sort_field);
2397 }
2398 }
2399
2400 // Apply top-N limit if specified
2401 if let Some(limit) = self.config.top_n {
2402 nodes.truncate(limit);
2403 }
2404
2405 nodes
2406 }
2407} // end impl Scanner
2408
2409/// # `parse_size` - The Universal Translator for Sizes
2410///
2411/// This handy function takes something a human understands, like "2.5M", and
2412/// translates it into something a computer understands (2,621,440 bytes).
2413/// It's like having a Babel fish for file sizes. Why should we have to do
2414/// that math when the computer can do it for us?
2415pub fn parse_size(size_str: &str) -> Result<u64> {
2416 let size_str = size_str.trim().to_uppercase();
2417 if size_str.is_empty() {
2418 return Err(anyhow::anyhow!("Empty size string"));
2419 }
2420
2421 // Find the first alphabetic character which marks the start of the unit.
2422 let unit_start_index = size_str
2423 .find(|c: char| c.is_alphabetic())
2424 .unwrap_or(size_str.len());
2425 let (num_part_str, unit_part) = size_str.split_at(unit_start_index);
2426
2427 // Trim any space from the number part before parsing.
2428 let num_part_str = num_part_str.trim();
2429
2430 if num_part_str.is_empty() {
2431 return Err(anyhow::anyhow!("Missing number for size string"));
2432 }
2433
2434 let num: f64 = match num_part_str.parse() {
2435 Ok(n) => n,
2436 Err(e) => return Err(anyhow::anyhow!("Invalid number '{}': {}", num_part_str, e)),
2437 };
2438
2439 // Check for negative numbers.
2440 if num.is_sign_negative() {
2441 return Err(anyhow::anyhow!("Size cannot be negative: {}", num));
2442 }
2443
2444 let multiplier = match unit_part {
2445 "K" | "KB" => 1024.0,
2446 "M" | "MB" => 1024.0 * 1024.0,
2447 "G" | "GB" => 1024.0 * 1024.0 * 1024.0,
2448 "T" | "TB" => 1024.0 * 1024.0 * 1024.0 * 1024.0,
2449 "B" | "" => 1.0,
2450 _ => return Err(anyhow::anyhow!("Invalid size unit: '{}'", unit_part)),
2451 };
2452
2453 Ok((num * multiplier) as u64)
2454}
2455
2456// --- Unit Tests: Ensuring Our Scanner Behaves ---
2457// Aye, even the most brilliant code needs tests to keep it honest!
2458// These tests cover some basic functionality of the scanner.
#[cfg(test)]
mod tests {
    use super::*; // Import everything from the parent module (scanner.rs).

    // Happy-path size strings: bare bytes, binary units (upper and lower
    // case), fractional values, and surrounding whitespace.
    #[test]
    fn test_parse_size_valid_inputs() {
        assert_eq!(parse_size("100").unwrap(), 100);
        assert_eq!(parse_size("100B").unwrap(), 100);
        assert_eq!(parse_size("1k").unwrap(), 1024);
        assert_eq!(parse_size("1K").unwrap(), 1024);
        assert_eq!(parse_size("1KB").unwrap(), 1024);
        assert_eq!(parse_size("2.5M").unwrap(), (2.5 * 1024.0 * 1024.0) as u64);
        assert_eq!(parse_size("1GB").unwrap(), 1024 * 1024 * 1024);
        assert_eq!(
            parse_size("0.5T").unwrap(),
            (0.5 * 1024.0 * 1024.0 * 1024.0 * 1024.0) as u64
        );
        assert_eq!(parse_size(" 2 MB ").unwrap(), 2 * 1024 * 1024); // Test with whitespace
    }

    // Inputs that must be rejected: unknown units, non-numeric text,
    // negative sizes, and malformed numbers.
    #[test]
    fn test_parse_size_invalid_inputs() {
        assert!(parse_size("100X").is_err());
        assert!(parse_size("garbage").is_err());
        assert!(parse_size("-100M").is_err());
        assert!(parse_size("1..5K").is_err());
    }

    // Zero is a valid size; empty or whitespace-only strings are not.
    #[test]
    fn test_parse_size_zero_and_empty() {
        assert_eq!(parse_size("0").unwrap(), 0);
        assert!(parse_size("").is_err());
        assert!(parse_size(" ").is_err());
    }

    // Basic test for Scanner creation. More comprehensive tests would involve
    // creating a temporary directory structure and verifying scan results.
    #[test]
    fn test_scanner_creation_defaults() {
        let temp_dir = tempfile::tempdir().unwrap();
        let config = ScannerConfig {
            max_depth: 5,
            follow_symlinks: false,
            respect_gitignore: true,
            show_hidden: false,
            show_ignored: false,
            find_pattern: None,
            file_type_filter: None,
            entry_type_filter: None,
            min_size: None,
            max_size: None,
            newer_than: None,
            older_than: None,
            use_default_ignores: true,
            search_keyword: None,
            show_filesystems: false,
            sort_field: None,
            top_n: None,
            include_line_content: false,
            // Smart scanning options
            compute_interest: false,
            security_scan: false,
            min_interest: 0.0,
            track_traversal: false,
            changes_only: false,
            compare_state: None,
            smart_mode: false,
        };
        let scanner_result = Scanner::new(temp_dir.path(), config);
        assert!(scanner_result.is_ok());
    }
}