gravityfile_scan/
scanner.rs

1//! JWalk-based parallel directory scanner.
2
3use std::collections::HashMap;
4use std::path::{Path, PathBuf};
5use std::sync::atomic::{AtomicU64, Ordering};
6use std::sync::Arc;
7use std::time::Instant;
8
9#[cfg(unix)]
10use std::os::unix::fs::MetadataExt;
11
12use compact_str::CompactString;
13use jwalk::{Parallelism, WalkDir};
14use tokio::sync::broadcast;
15
16use gravityfile_core::{
17    FileNode, FileTree, InodeInfo, NodeId, NodeKind, ScanConfig, ScanError, ScanWarning,
18    Timestamps, TreeStats, WarningKind,
19};
20
21use crate::inode::InodeTracker;
22use crate::progress::ScanProgress;
23
24/// High-performance scanner using jwalk for parallel traversal.
25pub struct JwalkScanner {
26    progress_tx: broadcast::Sender<ScanProgress>,
27}
28
29impl JwalkScanner {
30    /// Create a new scanner.
31    pub fn new() -> Self {
32        let (progress_tx, _) = broadcast::channel(100);
33        Self { progress_tx }
34    }
35
36    /// Subscribe to scan progress updates.
37    pub fn subscribe(&self) -> broadcast::Receiver<ScanProgress> {
38        self.progress_tx.subscribe()
39    }
40
41    /// Perform a scan of the given path.
42    pub fn scan(&self, config: &ScanConfig) -> Result<FileTree, ScanError> {
43        let start = Instant::now();
44        let root_path = config.root.canonicalize().map_err(|e| ScanError::io(&config.root, e))?;
45
46        // Verify root is a directory
47        if !root_path.is_dir() {
48            return Err(ScanError::NotADirectory { path: root_path });
49        }
50
51        // Get root device for cross-filesystem detection
52        let root_metadata = std::fs::metadata(&root_path).map_err(|e| ScanError::io(&root_path, e))?;
53        let root_device = get_dev(&root_metadata);
54
55        // Set up tracking
56        let inode_tracker = InodeTracker::new();
57        let node_id_counter = AtomicU64::new(0);
58        let mut stats = TreeStats::new();
59        let mut warnings = Vec::new();
60
61        // Collect all entries first
62        let entries = self.collect_entries(config, &root_path, root_device, &inode_tracker, &mut stats, &mut warnings)?;
63
64        // Build tree from collected entries
65        let root_node = self.build_tree(&root_path, entries, &node_id_counter, &mut stats);
66
67        let scan_duration = start.elapsed();
68
69        Ok(FileTree::new(
70            root_node,
71            root_path,
72            config.clone(),
73            stats,
74            scan_duration,
75            warnings,
76        ))
77    }
78
79    /// Collect all entries using jwalk.
80    fn collect_entries(
81        &self,
82        config: &ScanConfig,
83        root_path: &Path,
84        root_device: u64,
85        inode_tracker: &InodeTracker,
86        stats: &mut TreeStats,
87        warnings: &mut Vec<ScanWarning>,
88    ) -> Result<HashMap<PathBuf, Vec<EntryInfo>>, ScanError> {
89        let parallelism = match config.threads {
90            0 => Parallelism::RayonDefaultPool { busy_timeout: std::time::Duration::from_millis(100) },
91            n => Parallelism::RayonNewPool(n),
92        };
93
94        let walker = WalkDir::new(root_path)
95            .parallelism(parallelism)
96            .skip_hidden(!config.include_hidden)
97            .follow_links(config.follow_symlinks)
98            .min_depth(0)
99            .max_depth(config.max_depth.map(|d| d as usize).unwrap_or(usize::MAX));
100
101        // Map from parent path to children
102        let mut entries_by_parent: HashMap<PathBuf, Vec<EntryInfo>> = HashMap::new();
103        let progress_counter = Arc::new(AtomicU64::new(0));
104
105        for entry_result in walker {
106            let entry = match entry_result {
107                Ok(e) => e,
108                Err(err) => {
109                    let path = err.path().map(|p| p.to_path_buf()).unwrap_or_default();
110                    warnings.push(ScanWarning::new(
111                        path,
112                        err.to_string(),
113                        WarningKind::ReadError,
114                    ));
115                    continue;
116                }
117            };
118
119            let path = entry.path();
120            let file_name = entry.file_name().to_string_lossy().to_string();
121
122            // Check ignore patterns
123            if config.should_ignore(&file_name) {
124                continue;
125            }
126
127            // Get metadata
128            let metadata = match entry.metadata() {
129                Ok(m) => m,
130                Err(err) => {
131                    warnings.push(ScanWarning::new(
132                        &path,
133                        err.to_string(),
134                        WarningKind::MetadataError,
135                    ));
136                    continue;
137                }
138            };
139
140            // Check cross-filesystem
141            if !config.cross_filesystems && get_dev(&metadata) != root_device {
142                continue;
143            }
144
145            // Handle different file types
146            let file_type = entry.file_type();
147            let depth = entry.depth() as u32;
148
149            if file_type.is_dir() {
150                stats.record_dir(depth);
151
152                // For directories, track them but size will be aggregated later
153                if let Some(parent) = path.parent() {
154                    let entry_info = EntryInfo {
155                        name: file_name.into(),
156                        path: path.clone(),
157                        size: 0,
158                        blocks: 0,
159                        is_dir: true,
160                        is_symlink: false,
161                        executable: false,
162                        timestamps: Timestamps::new(
163                            metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
164                            metadata.accessed().ok(),
165                            metadata.created().ok(),
166                        ),
167                        inode: Some(InodeInfo::new(get_ino(&metadata), get_dev(&metadata))),
168                    };
169
170                    entries_by_parent
171                        .entry(parent.to_path_buf())
172                        .or_default()
173                        .push(entry_info);
174                }
175            } else if file_type.is_file() {
176                // Check for hardlinks
177                let inode_info = InodeInfo::new(get_ino(&metadata), get_dev(&metadata));
178                let size = if config.apparent_size {
179                    metadata.len()
180                } else {
181                    // Only count size for first hardlink
182                    if get_nlink(&metadata) > 1 && !inode_tracker.track(inode_info) {
183                        0 // Already counted this inode
184                    } else {
185                        metadata.len()
186                    }
187                };
188
189                let blocks = get_blocks(&metadata);
190
191                stats.record_file(
192                    path.clone(),
193                    size,
194                    metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
195                    depth,
196                );
197
198                if let Some(parent) = path.parent() {
199                    let executable = is_executable(&metadata);
200                    let entry_info = EntryInfo {
201                        name: file_name.into(),
202                        path: path.clone(),
203                        size,
204                        blocks,
205                        is_dir: false,
206                        is_symlink: false,
207                        executable,
208                        timestamps: Timestamps::new(
209                            metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
210                            metadata.accessed().ok(),
211                            metadata.created().ok(),
212                        ),
213                        inode: Some(inode_info),
214                    };
215
216                    entries_by_parent
217                        .entry(parent.to_path_buf())
218                        .or_default()
219                        .push(entry_info);
220                }
221
222                // Update progress periodically
223                let count = progress_counter.fetch_add(1, Ordering::Relaxed);
224                if count % 1000 == 0 {
225                    let _ = self.progress_tx.send(ScanProgress {
226                        files_scanned: stats.total_files,
227                        dirs_scanned: stats.total_dirs,
228                        bytes_scanned: stats.total_size,
229                        current_path: path.clone(),
230                        errors_count: warnings.len() as u64,
231                        elapsed: std::time::Duration::ZERO, // Will be set properly at end
232                    });
233                }
234            } else if file_type.is_symlink() {
235                stats.record_symlink();
236
237                if let Some(parent) = path.parent() {
238                    let target = std::fs::read_link(&path)
239                        .map(|p| p.to_string_lossy().to_string())
240                        .unwrap_or_default();
241
242                    let broken = !path.exists();
243                    if broken {
244                        warnings.push(ScanWarning::broken_symlink(&path, &target));
245                    }
246
247                    let entry_info = EntryInfo {
248                        name: file_name.into(),
249                        path: path.clone(),
250                        size: 0,
251                        blocks: 0,
252                        is_dir: false,
253                        is_symlink: true,
254                        executable: false,
255                        timestamps: Timestamps::new(
256                            metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
257                            metadata.accessed().ok(),
258                            metadata.created().ok(),
259                        ),
260                        inode: None,
261                    };
262
263                    entries_by_parent
264                        .entry(parent.to_path_buf())
265                        .or_default()
266                        .push(entry_info);
267                }
268            }
269        }
270
271        Ok(entries_by_parent)
272    }
273
274    /// Build tree structure from collected entries.
275    fn build_tree(
276        &self,
277        root_path: &Path,
278        mut entries_by_parent: HashMap<PathBuf, Vec<EntryInfo>>,
279        node_id_counter: &AtomicU64,
280        stats: &mut TreeStats,
281    ) -> FileNode {
282        self.build_node(root_path, &mut entries_by_parent, node_id_counter, stats)
283    }
284
285    /// Recursively build a node and its children.
286    fn build_node(
287        &self,
288        path: &Path,
289        entries_by_parent: &mut HashMap<PathBuf, Vec<EntryInfo>>,
290        node_id_counter: &AtomicU64,
291        stats: &mut TreeStats,
292    ) -> FileNode {
293        let id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
294        let name = path
295            .file_name()
296            .map(|n| n.to_string_lossy().to_string())
297            .unwrap_or_else(|| path.to_string_lossy().to_string());
298
299        let metadata = std::fs::metadata(path).ok();
300        let timestamps = metadata
301            .as_ref()
302            .map(|m| {
303                Timestamps::new(
304                    m.modified().unwrap_or(std::time::UNIX_EPOCH),
305                    m.accessed().ok(),
306                    m.created().ok(),
307                )
308            })
309            .unwrap_or_else(|| Timestamps::with_modified(std::time::UNIX_EPOCH));
310
311        let mut node = FileNode::new_directory(id, name, timestamps);
312
313        // Get children for this path
314        let children_entries = entries_by_parent.remove(path).unwrap_or_default();
315
316        let mut total_size: u64 = 0;
317        let mut file_count: u64 = 0;
318        let mut dir_count: u64 = 0;
319
320        for entry in children_entries {
321            if entry.is_dir {
322                // Recursively build directory
323                let child_node = self.build_node(&entry.path, entries_by_parent, node_id_counter, stats);
324                total_size += child_node.size;
325                file_count += child_node.file_count();
326                dir_count += child_node.dir_count() + 1;
327                node.children.push(child_node);
328            } else if entry.is_symlink {
329                // Create symlink node
330                let child_id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
331                let target = std::fs::read_link(&entry.path)
332                    .map(|p| CompactString::new(p.to_string_lossy()))
333                    .unwrap_or_default();
334                let broken = !entry.path.exists();
335
336                let child_node = FileNode {
337                    id: child_id,
338                    name: entry.name,
339                    kind: NodeKind::Symlink { target, broken },
340                    size: 0,
341                    blocks: 0,
342                    timestamps: entry.timestamps,
343                    inode: None,
344                    content_hash: None,
345                    children: Vec::new(),
346                };
347                node.children.push(child_node);
348            } else {
349                // Create file node
350                let child_id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
351                let mut child_node = FileNode::new_file(
352                    child_id,
353                    entry.name,
354                    entry.size,
355                    entry.blocks,
356                    entry.timestamps,
357                    entry.executable,
358                );
359                child_node.inode = entry.inode;
360
361                total_size += entry.size;
362                file_count += 1;
363                node.children.push(child_node);
364            }
365        }
366
367        // Update node with aggregated values
368        node.size = total_size;
369        node.kind = NodeKind::Directory {
370            file_count,
371            dir_count,
372        };
373
374        // Sort children by size (descending)
375        node.children.sort_by(|a, b| b.size.cmp(&a.size));
376
377        node
378    }
379}
380
381impl Default for JwalkScanner {
382    fn default() -> Self {
383        Self::new()
384    }
385}
386
387/// Create a quick, non-recursive directory listing for immediate display.
388/// This function reads only the immediate children of a directory without
389/// recursing into subdirectories. Directory sizes will be 0 (unknown).
390pub fn quick_list(path: &Path) -> Result<FileTree, ScanError> {
391    use std::sync::atomic::{AtomicU64, Ordering};
392    use std::time::Instant;
393
394    let start = Instant::now();
395    let root_path = path.canonicalize().map_err(|e| ScanError::io(path, e))?;
396
397    if !root_path.is_dir() {
398        return Err(ScanError::NotADirectory {
399            path: root_path.clone(),
400        });
401    }
402
403    let node_id_counter = AtomicU64::new(0);
404    let mut stats = TreeStats::new();
405
406    // Get root directory metadata
407    let root_metadata = std::fs::metadata(&root_path).map_err(|e| ScanError::io(&root_path, e))?;
408    let root_timestamps = Timestamps::new(
409        root_metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
410        root_metadata.accessed().ok(),
411        root_metadata.created().ok(),
412    );
413
414    let root_name = root_path
415        .file_name()
416        .map(|n| n.to_string_lossy().to_string())
417        .unwrap_or_else(|| root_path.to_string_lossy().to_string());
418
419    let root_id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
420    let mut root_node = FileNode::new_directory(root_id, root_name, root_timestamps);
421
422    // Read immediate children
423    let entries = std::fs::read_dir(&root_path).map_err(|e| ScanError::io(&root_path, e))?;
424
425    let mut total_size: u64 = 0;
426    let mut file_count: u64 = 0;
427    let mut dir_count: u64 = 0;
428
429    for entry_result in entries {
430        let entry = match entry_result {
431            Ok(e) => e,
432            Err(_) => continue,
433        };
434
435        let entry_path = entry.path();
436        let entry_name = entry.file_name().to_string_lossy().to_string();
437
438        // Skip hidden files by default
439        if entry_name.starts_with('.') {
440            continue;
441        }
442
443        let metadata = match entry.metadata() {
444            Ok(m) => m,
445            Err(_) => continue,
446        };
447
448        let timestamps = Timestamps::new(
449            metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
450            metadata.accessed().ok(),
451            metadata.created().ok(),
452        );
453
454        let child_id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
455
456        if metadata.is_dir() {
457            // Directory - size is unknown (0) until full scan
458            let child_node = FileNode::new_directory(child_id, entry_name, timestamps);
459            root_node.children.push(child_node);
460            dir_count += 1;
461            stats.record_dir(1);
462        } else if metadata.is_file() {
463            // File - we know the size
464            let size = metadata.len();
465            let blocks = get_blocks(&metadata);
466            let executable = is_executable(&metadata);
467
468            let mut child_node =
469                FileNode::new_file(child_id, entry_name, size, blocks, timestamps, executable);
470
471            // Set inode info for potential hardlink detection
472            let inode = InodeInfo::new(get_ino(&metadata), get_dev(&metadata));
473            child_node.inode = Some(inode);
474
475            total_size += size;
476            file_count += 1;
477            root_node.children.push(child_node);
478            stats.record_file(entry_path, size, timestamps.modified, 1);
479        } else if metadata.is_symlink() {
480            // Symlink
481            let target = std::fs::read_link(&entry_path)
482                .map(|p| CompactString::new(p.to_string_lossy()))
483                .unwrap_or_default();
484            let broken = !entry_path.exists();
485
486            let child_node = FileNode {
487                id: child_id,
488                name: CompactString::new(entry_name),
489                kind: NodeKind::Symlink { target, broken },
490                size: 0,
491                blocks: 0,
492                timestamps,
493                inode: None,
494                content_hash: None,
495                children: Vec::new(),
496            };
497            root_node.children.push(child_node);
498            stats.record_symlink();
499        }
500    }
501
502    // Update root node with aggregated values
503    root_node.size = total_size;
504    root_node.kind = NodeKind::Directory {
505        file_count,
506        dir_count,
507    };
508
509    // Sort children by name for initial display (scan will re-sort by size later)
510    root_node.children.sort_by(|a, b| a.name.cmp(&b.name));
511
512    stats.record_dir(0);
513
514    let config = ScanConfig::new(&root_path);
515    let scan_duration = start.elapsed();
516
517    Ok(FileTree::new(
518        root_node,
519        root_path,
520        config,
521        stats,
522        scan_duration,
523        Vec::new(),
524    ))
525}
526
/// Temporary struct for collecting entry information.
///
/// One value is produced per accepted walker entry and stored under the path
/// of its parent directory until the tree is built.
struct EntryInfo {
    /// File name of the entry (lossily converted to UTF-8).
    name: CompactString,
    /// Full path of the entry as reported by the walker.
    path: PathBuf,
    /// Size in bytes counted for this entry; 0 for directories and symlinks,
    /// and for hardlinked files whose inode was already counted.
    size: u64,
    /// Block count as returned by `get_blocks` for files; 0 for directories
    /// and symlinks.
    blocks: u64,
    /// Whether the entry is a directory.
    is_dir: bool,
    /// Whether the entry is a symbolic link.
    is_symlink: bool,
    /// Result of `is_executable` for files; `false` for other entries.
    executable: bool,
    /// Modification, access and creation times taken from the entry's metadata.
    timestamps: Timestamps,
    /// Inode/device pair for files and directories; `None` for symlinks.
    inode: Option<InodeInfo>,
}
539
/// Report whether any execute permission bit (owner, group or other) is set.
#[cfg(unix)]
fn is_executable(metadata: &std::fs::Metadata) -> bool {
    use std::os::unix::fs::PermissionsExt;

    // Execute bits for owner, group and other.
    const ANY_EXECUTE_BIT: u32 = 0o111;

    let mode = metadata.permissions().mode();
    (mode & ANY_EXECUTE_BIT) != 0
}

/// Without Unix permission bits, no file is reported as executable.
#[cfg(not(unix))]
fn is_executable(_metadata: &std::fs::Metadata) -> bool {
    false
}
551
552// Cross-platform metadata helpers
553
/// Return the id of the device that holds the file described by `metadata`.
#[cfg(unix)]
fn get_dev(metadata: &std::fs::Metadata) -> u64 {
    MetadataExt::dev(metadata)
}

/// Non-Unix fallback: always 0.
#[cfg(not(unix))]
fn get_dev(_metadata: &std::fs::Metadata) -> u64 {
    // Windows doesn't have device IDs in the same way
    0
}
564
/// Return the inode number of the file described by `metadata`.
#[cfg(unix)]
fn get_ino(metadata: &std::fs::Metadata) -> u64 {
    MetadataExt::ino(metadata)
}

/// Non-Unix fallback: always 0.
#[cfg(not(unix))]
fn get_ino(_metadata: &std::fs::Metadata) -> u64 {
    // Windows doesn't have inodes
    0
}
575
/// Return how many hard links point at the file described by `metadata`.
#[cfg(unix)]
fn get_nlink(metadata: &std::fs::Metadata) -> u64 {
    MetadataExt::nlink(metadata)
}

/// Non-Unix fallback: always 1.
#[cfg(not(unix))]
fn get_nlink(_metadata: &std::fs::Metadata) -> u64 {
    // Assume single link on Windows
    1
}
586
/// Return the number of 512-byte blocks allocated to the file, as reported
/// by the file system.
#[cfg(unix)]
fn get_blocks(metadata: &std::fs::Metadata) -> u64 {
    MetadataExt::blocks(metadata)
}

/// Non-Unix fallback: estimate the block count from the file length, rounding
/// up to whole 512-byte blocks.
#[cfg(not(unix))]
fn get_blocks(metadata: &std::fs::Metadata) -> u64 {
    const BLOCK_SIZE: u64 = 512;

    let length = metadata.len();
    (length + (BLOCK_SIZE - 1)) / BLOCK_SIZE
}
598
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Lay out a small fixture in a fresh temporary directory:
    ///
    /// ```text
    /// file1.txt
    /// dir1/file2.txt
    /// dir1/subdir/file3.txt
    /// dir2/file4.txt
    /// ```
    fn create_test_tree() -> TempDir {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        // Parents are listed before the directories nested inside them.
        for dir in ["dir1", "dir2", "dir1/subdir"] {
            fs::create_dir(root.join(dir)).unwrap();
        }

        let files = [
            ("file1.txt", "hello"),
            ("dir1/file2.txt", "world world world"),
            ("dir1/subdir/file3.txt", "test"),
            ("dir2/file4.txt", "another file here"),
        ];
        for (relative_path, contents) in files {
            fs::write(root.join(relative_path), contents).unwrap();
        }

        temp
    }

    /// Run a scan of `config` with a fresh scanner.
    fn run_scan(config: &ScanConfig) -> FileTree {
        JwalkScanner::new().scan(config).unwrap()
    }

    #[test]
    fn test_basic_scan() {
        let temp = create_test_tree();
        let tree = run_scan(&ScanConfig::new(temp.path()));

        assert_eq!(tree.stats.total_files, 4);
        // dir1, dir2 and subdir are always counted. The walker starts at
        // depth 0, so the root may be counted as well; only the lower bound
        // is asserted.
        assert!(tree.stats.total_dirs >= 3);
        assert!(tree.root.size > 0);
    }

    #[test]
    fn test_children_sorted_by_size() {
        let temp = create_test_tree();
        let tree = run_scan(&ScanConfig::new(temp.path()));

        // Every neighbouring pair must be in descending size order.
        for pair in tree.root.children.windows(2) {
            assert!(pair[0].size >= pair[1].size);
        }
    }

    #[test]
    fn test_ignore_patterns() {
        let temp = create_test_tree();
        let config = ScanConfig::builder()
            .root(temp.path())
            .ignore_patterns(vec!["dir2".to_string()])
            .build()
            .unwrap();

        let tree = run_scan(&config);

        // dir2 matches the ignore pattern and must not show up under the root.
        let dir2_listed = tree
            .root
            .children
            .iter()
            .any(|child| child.name.as_str() == "dir2");
        assert!(!dir2_listed);
    }
}