gravityfile_scan/
scanner.rs

1//! JWalk-based parallel directory scanner.
2
3use std::collections::HashMap;
4use std::os::unix::fs::MetadataExt;
5use std::path::{Path, PathBuf};
6use std::sync::atomic::{AtomicU64, Ordering};
7use std::sync::Arc;
8use std::time::Instant;
9
10use compact_str::CompactString;
11use jwalk::{Parallelism, WalkDir};
12use tokio::sync::broadcast;
13
14use gravityfile_core::{
15    FileNode, FileTree, InodeInfo, NodeId, NodeKind, ScanConfig, ScanError, ScanWarning,
16    Timestamps, TreeStats, WarningKind,
17};
18
19use crate::inode::InodeTracker;
20use crate::progress::ScanProgress;
21
/// High-performance scanner using jwalk for parallel traversal.
///
/// The scanner itself holds no scan state; it only owns the sending half of
/// the broadcast channel on which [`ScanProgress`] updates are published.
/// Receivers are obtained through [`JwalkScanner::subscribe`].
pub struct JwalkScanner {
    /// Sending half of the progress channel; `scan` publishes on it.
    progress_tx: broadcast::Sender<ScanProgress>,
}
26
27impl JwalkScanner {
28    /// Create a new scanner.
29    pub fn new() -> Self {
30        let (progress_tx, _) = broadcast::channel(100);
31        Self { progress_tx }
32    }
33
34    /// Subscribe to scan progress updates.
35    pub fn subscribe(&self) -> broadcast::Receiver<ScanProgress> {
36        self.progress_tx.subscribe()
37    }
38
39    /// Perform a scan of the given path.
40    pub fn scan(&self, config: &ScanConfig) -> Result<FileTree, ScanError> {
41        let start = Instant::now();
42        let root_path = config.root.canonicalize().map_err(|e| ScanError::io(&config.root, e))?;
43
44        // Verify root is a directory
45        if !root_path.is_dir() {
46            return Err(ScanError::NotADirectory { path: root_path });
47        }
48
49        // Get root device for cross-filesystem detection
50        let root_metadata = std::fs::metadata(&root_path).map_err(|e| ScanError::io(&root_path, e))?;
51        let root_device = root_metadata.dev();
52
53        // Set up tracking
54        let inode_tracker = InodeTracker::new();
55        let node_id_counter = AtomicU64::new(0);
56        let mut stats = TreeStats::new();
57        let mut warnings = Vec::new();
58
59        // Collect all entries first
60        let entries = self.collect_entries(config, &root_path, root_device, &inode_tracker, &mut stats, &mut warnings)?;
61
62        // Build tree from collected entries
63        let root_node = self.build_tree(&root_path, entries, &node_id_counter, &mut stats);
64
65        let scan_duration = start.elapsed();
66
67        Ok(FileTree::new(
68            root_node,
69            root_path,
70            config.clone(),
71            stats,
72            scan_duration,
73            warnings,
74        ))
75    }
76
77    /// Collect all entries using jwalk.
78    fn collect_entries(
79        &self,
80        config: &ScanConfig,
81        root_path: &Path,
82        root_device: u64,
83        inode_tracker: &InodeTracker,
84        stats: &mut TreeStats,
85        warnings: &mut Vec<ScanWarning>,
86    ) -> Result<HashMap<PathBuf, Vec<EntryInfo>>, ScanError> {
87        let parallelism = match config.threads {
88            0 => Parallelism::RayonDefaultPool { busy_timeout: std::time::Duration::from_millis(100) },
89            n => Parallelism::RayonNewPool(n),
90        };
91
92        let walker = WalkDir::new(root_path)
93            .parallelism(parallelism)
94            .skip_hidden(!config.include_hidden)
95            .follow_links(config.follow_symlinks)
96            .min_depth(0)
97            .max_depth(config.max_depth.map(|d| d as usize).unwrap_or(usize::MAX));
98
99        // Map from parent path to children
100        let mut entries_by_parent: HashMap<PathBuf, Vec<EntryInfo>> = HashMap::new();
101        let progress_counter = Arc::new(AtomicU64::new(0));
102
103        for entry_result in walker {
104            let entry = match entry_result {
105                Ok(e) => e,
106                Err(err) => {
107                    let path = err.path().map(|p| p.to_path_buf()).unwrap_or_default();
108                    warnings.push(ScanWarning::new(
109                        path,
110                        err.to_string(),
111                        WarningKind::ReadError,
112                    ));
113                    continue;
114                }
115            };
116
117            let path = entry.path();
118            let file_name = entry.file_name().to_string_lossy().to_string();
119
120            // Check ignore patterns
121            if config.should_ignore(&file_name) {
122                continue;
123            }
124
125            // Get metadata
126            let metadata = match entry.metadata() {
127                Ok(m) => m,
128                Err(err) => {
129                    warnings.push(ScanWarning::new(
130                        &path,
131                        err.to_string(),
132                        WarningKind::MetadataError,
133                    ));
134                    continue;
135                }
136            };
137
138            // Check cross-filesystem
139            if !config.cross_filesystems && metadata.dev() != root_device {
140                continue;
141            }
142
143            // Handle different file types
144            let file_type = entry.file_type();
145            let depth = entry.depth() as u32;
146
147            if file_type.is_dir() {
148                stats.record_dir(depth);
149
150                // For directories, track them but size will be aggregated later
151                if let Some(parent) = path.parent() {
152                    let entry_info = EntryInfo {
153                        name: file_name.into(),
154                        path: path.clone(),
155                        size: 0,
156                        blocks: 0,
157                        is_dir: true,
158                        is_symlink: false,
159                        executable: false,
160                        timestamps: Timestamps::new(
161                            metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
162                            metadata.accessed().ok(),
163                            metadata.created().ok(),
164                        ),
165                        inode: Some(InodeInfo::new(metadata.ino(), metadata.dev())),
166                    };
167
168                    entries_by_parent
169                        .entry(parent.to_path_buf())
170                        .or_default()
171                        .push(entry_info);
172                }
173            } else if file_type.is_file() {
174                // Check for hardlinks
175                let inode_info = InodeInfo::new(metadata.ino(), metadata.dev());
176                let size = if config.apparent_size {
177                    metadata.len()
178                } else {
179                    // Only count size for first hardlink
180                    if metadata.nlink() > 1 && !inode_tracker.track(inode_info) {
181                        0 // Already counted this inode
182                    } else {
183                        metadata.len()
184                    }
185                };
186
187                let blocks = metadata.blocks();
188
189                stats.record_file(
190                    path.clone(),
191                    size,
192                    metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
193                    depth,
194                );
195
196                if let Some(parent) = path.parent() {
197                    let executable = is_executable(&metadata);
198                    let entry_info = EntryInfo {
199                        name: file_name.into(),
200                        path: path.clone(),
201                        size,
202                        blocks,
203                        is_dir: false,
204                        is_symlink: false,
205                        executable,
206                        timestamps: Timestamps::new(
207                            metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
208                            metadata.accessed().ok(),
209                            metadata.created().ok(),
210                        ),
211                        inode: Some(inode_info),
212                    };
213
214                    entries_by_parent
215                        .entry(parent.to_path_buf())
216                        .or_default()
217                        .push(entry_info);
218                }
219
220                // Update progress periodically
221                let count = progress_counter.fetch_add(1, Ordering::Relaxed);
222                if count % 1000 == 0 {
223                    let _ = self.progress_tx.send(ScanProgress {
224                        files_scanned: stats.total_files,
225                        dirs_scanned: stats.total_dirs,
226                        bytes_scanned: stats.total_size,
227                        current_path: path.clone(),
228                        errors_count: warnings.len() as u64,
229                        elapsed: std::time::Duration::ZERO, // Will be set properly at end
230                    });
231                }
232            } else if file_type.is_symlink() {
233                stats.record_symlink();
234
235                if let Some(parent) = path.parent() {
236                    let target = std::fs::read_link(&path)
237                        .map(|p| p.to_string_lossy().to_string())
238                        .unwrap_or_default();
239
240                    let broken = !path.exists();
241                    if broken {
242                        warnings.push(ScanWarning::broken_symlink(&path, &target));
243                    }
244
245                    let entry_info = EntryInfo {
246                        name: file_name.into(),
247                        path: path.clone(),
248                        size: 0,
249                        blocks: 0,
250                        is_dir: false,
251                        is_symlink: true,
252                        executable: false,
253                        timestamps: Timestamps::new(
254                            metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
255                            metadata.accessed().ok(),
256                            metadata.created().ok(),
257                        ),
258                        inode: None,
259                    };
260
261                    entries_by_parent
262                        .entry(parent.to_path_buf())
263                        .or_default()
264                        .push(entry_info);
265                }
266            }
267        }
268
269        Ok(entries_by_parent)
270    }
271
272    /// Build tree structure from collected entries.
273    fn build_tree(
274        &self,
275        root_path: &Path,
276        mut entries_by_parent: HashMap<PathBuf, Vec<EntryInfo>>,
277        node_id_counter: &AtomicU64,
278        stats: &mut TreeStats,
279    ) -> FileNode {
280        self.build_node(root_path, &mut entries_by_parent, node_id_counter, stats)
281    }
282
283    /// Recursively build a node and its children.
284    fn build_node(
285        &self,
286        path: &Path,
287        entries_by_parent: &mut HashMap<PathBuf, Vec<EntryInfo>>,
288        node_id_counter: &AtomicU64,
289        stats: &mut TreeStats,
290    ) -> FileNode {
291        let id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
292        let name = path
293            .file_name()
294            .map(|n| n.to_string_lossy().to_string())
295            .unwrap_or_else(|| path.to_string_lossy().to_string());
296
297        let metadata = std::fs::metadata(path).ok();
298        let timestamps = metadata
299            .as_ref()
300            .map(|m| {
301                Timestamps::new(
302                    m.modified().unwrap_or(std::time::UNIX_EPOCH),
303                    m.accessed().ok(),
304                    m.created().ok(),
305                )
306            })
307            .unwrap_or_else(|| Timestamps::with_modified(std::time::UNIX_EPOCH));
308
309        let mut node = FileNode::new_directory(id, name, timestamps);
310
311        // Get children for this path
312        let children_entries = entries_by_parent.remove(path).unwrap_or_default();
313
314        let mut total_size: u64 = 0;
315        let mut file_count: u64 = 0;
316        let mut dir_count: u64 = 0;
317
318        for entry in children_entries {
319            if entry.is_dir {
320                // Recursively build directory
321                let child_node = self.build_node(&entry.path, entries_by_parent, node_id_counter, stats);
322                total_size += child_node.size;
323                file_count += child_node.file_count();
324                dir_count += child_node.dir_count() + 1;
325                node.children.push(child_node);
326            } else if entry.is_symlink {
327                // Create symlink node
328                let child_id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
329                let target = std::fs::read_link(&entry.path)
330                    .map(|p| CompactString::new(p.to_string_lossy()))
331                    .unwrap_or_default();
332                let broken = !entry.path.exists();
333
334                let child_node = FileNode {
335                    id: child_id,
336                    name: entry.name,
337                    kind: NodeKind::Symlink { target, broken },
338                    size: 0,
339                    blocks: 0,
340                    timestamps: entry.timestamps,
341                    inode: None,
342                    content_hash: None,
343                    children: Vec::new(),
344                };
345                node.children.push(child_node);
346            } else {
347                // Create file node
348                let child_id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
349                let mut child_node = FileNode::new_file(
350                    child_id,
351                    entry.name,
352                    entry.size,
353                    entry.blocks,
354                    entry.timestamps,
355                    entry.executable,
356                );
357                child_node.inode = entry.inode;
358
359                total_size += entry.size;
360                file_count += 1;
361                node.children.push(child_node);
362            }
363        }
364
365        // Update node with aggregated values
366        node.size = total_size;
367        node.kind = NodeKind::Directory {
368            file_count,
369            dir_count,
370        };
371
372        // Sort children by size (descending)
373        node.children.sort_by(|a, b| b.size.cmp(&a.size));
374
375        node
376    }
377}
378
379impl Default for JwalkScanner {
380    fn default() -> Self {
381        Self::new()
382    }
383}
384
/// Temporary struct for collecting entry information.
///
/// One value is recorded per directory, regular file or symlink accepted by
/// `collect_entries`; `build_node` later consumes them to create tree nodes.
struct EntryInfo {
    /// File name of the entry, converted lossily to UTF-8.
    name: CompactString,
    /// Path of the entry as reported by the walker.
    path: PathBuf,
    /// Size in bytes for regular files; 0 for directories and symlinks, and
    /// for hard-linked files whose inode was already counted.
    size: u64,
    /// Block count from the file's metadata; 0 for directories and symlinks.
    blocks: u64,
    /// True when the entry is a directory.
    is_dir: bool,
    /// True when the entry is a symbolic link.
    is_symlink: bool,
    /// Result of `is_executable` for regular files; false for other entries.
    executable: bool,
    /// Modified / accessed / created times taken from the entry's metadata.
    timestamps: Timestamps,
    /// Inode and device identity; set for directories and files, `None` for symlinks.
    inode: Option<InodeInfo>,
}
397
/// Report whether any execute permission bit (owner, group or other) is set.
#[cfg(unix)]
fn is_executable(metadata: &std::fs::Metadata) -> bool {
    use std::os::unix::fs::PermissionsExt;

    // 0o111 selects the three execute bits of the permission mode.
    let mode = metadata.permissions().mode();
    (mode & 0o111) != 0
}

/// Non-Unix fallback: there are no execute bits to inspect, so nothing is
/// reported as executable.
#[cfg(not(unix))]
fn is_executable(_metadata: &std::fs::Metadata) -> bool {
    false
}
409
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;
    use gravityfile_core::ScanConfigBuilder;

    /// Create a temporary fixture:
    ///
    /// ```text
    /// file1.txt
    /// dir1/file2.txt
    /// dir1/subdir/file3.txt
    /// dir2/file4.txt
    /// ```
    fn create_test_tree() -> TempDir {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        // Parents are listed before their subdirectories.
        for dir in ["dir1", "dir2", "dir1/subdir"].iter() {
            fs::create_dir(root.join(dir)).unwrap();
        }

        let files = [
            ("file1.txt", "hello"),
            ("dir1/file2.txt", "world world world"),
            ("dir1/subdir/file3.txt", "test"),
            ("dir2/file4.txt", "another file here"),
        ];
        for (relative, contents) in files.iter() {
            fs::write(root.join(relative), contents).unwrap();
        }

        temp
    }

    /// Scan the fixture with the default configuration for `root`.
    fn scan_with_defaults(root: &Path) -> FileTree {
        let config = ScanConfig::new(root);
        JwalkScanner::new().scan(&config).unwrap()
    }

    #[test]
    fn test_basic_scan() {
        let temp = create_test_tree();
        let tree = scan_with_defaults(temp.path());

        assert_eq!(tree.stats.total_files, 4);
        // dir1, dir2 and subdir are always counted; the root may be counted
        // as well, hence the lower bound.
        assert!(tree.stats.total_dirs >= 3);
        assert!(tree.root.size > 0);
    }

    #[test]
    fn test_children_sorted_by_size() {
        let temp = create_test_tree();
        let tree = scan_with_defaults(temp.path());

        // Every adjacent pair of root children must be in non-increasing size order.
        for pair in tree.root.children.windows(2) {
            assert!(pair[0].size >= pair[1].size);
        }
    }

    #[test]
    fn test_ignore_patterns() {
        let temp = create_test_tree();
        let config = ScanConfig::builder()
            .root(temp.path())
            .ignore_patterns(vec!["dir2".to_string()])
            .build()
            .unwrap();

        let tree = JwalkScanner::new().scan(&config).unwrap();

        // No direct child of the root may be the ignored directory.
        let dir2_present = tree
            .root
            .children
            .iter()
            .any(|c| c.name.as_str() == "dir2");
        assert!(!dir2_present);
    }
}