1use std::collections::HashMap;
4use std::path::{Path, PathBuf};
5use std::sync::atomic::{AtomicU64, Ordering};
6use std::sync::Arc;
7use std::time::Instant;
8
9#[cfg(unix)]
10use std::os::unix::fs::MetadataExt;
11
12use compact_str::CompactString;
13use jwalk::{Parallelism, WalkDir};
14use tokio::sync::broadcast;
15
16use gravityfile_core::{
17 FileNode, FileTree, InodeInfo, NodeId, NodeKind, ScanConfig, ScanError, ScanWarning,
18 Timestamps, TreeStats, WarningKind,
19};
20
21use crate::inode::InodeTracker;
22use crate::progress::ScanProgress;
23
24pub struct JwalkScanner {
26 progress_tx: broadcast::Sender<ScanProgress>,
27}
28
29impl JwalkScanner {
30 pub fn new() -> Self {
32 let (progress_tx, _) = broadcast::channel(100);
33 Self { progress_tx }
34 }
35
36 pub fn subscribe(&self) -> broadcast::Receiver<ScanProgress> {
38 self.progress_tx.subscribe()
39 }
40
41 pub fn scan(&self, config: &ScanConfig) -> Result<FileTree, ScanError> {
43 let start = Instant::now();
44 let root_path = config.root.canonicalize().map_err(|e| ScanError::io(&config.root, e))?;
45
46 if !root_path.is_dir() {
48 return Err(ScanError::NotADirectory { path: root_path });
49 }
50
51 let root_metadata = std::fs::metadata(&root_path).map_err(|e| ScanError::io(&root_path, e))?;
53 let root_device = get_dev(&root_metadata);
54
55 let inode_tracker = InodeTracker::new();
57 let node_id_counter = AtomicU64::new(0);
58 let mut stats = TreeStats::new();
59 let mut warnings = Vec::new();
60
61 let entries = self.collect_entries(config, &root_path, root_device, &inode_tracker, &mut stats, &mut warnings)?;
63
64 let root_node = self.build_tree(&root_path, entries, &node_id_counter, &mut stats);
66
67 let scan_duration = start.elapsed();
68
69 Ok(FileTree::new(
70 root_node,
71 root_path,
72 config.clone(),
73 stats,
74 scan_duration,
75 warnings,
76 ))
77 }
78
79 fn collect_entries(
81 &self,
82 config: &ScanConfig,
83 root_path: &Path,
84 root_device: u64,
85 inode_tracker: &InodeTracker,
86 stats: &mut TreeStats,
87 warnings: &mut Vec<ScanWarning>,
88 ) -> Result<HashMap<PathBuf, Vec<EntryInfo>>, ScanError> {
89 let parallelism = match config.threads {
90 0 => Parallelism::RayonDefaultPool { busy_timeout: std::time::Duration::from_millis(100) },
91 n => Parallelism::RayonNewPool(n),
92 };
93
94 let walker = WalkDir::new(root_path)
95 .parallelism(parallelism)
96 .skip_hidden(!config.include_hidden)
97 .follow_links(config.follow_symlinks)
98 .min_depth(0)
99 .max_depth(config.max_depth.map(|d| d as usize).unwrap_or(usize::MAX));
100
101 let mut entries_by_parent: HashMap<PathBuf, Vec<EntryInfo>> = HashMap::new();
103 let progress_counter = Arc::new(AtomicU64::new(0));
104
105 for entry_result in walker {
106 let entry = match entry_result {
107 Ok(e) => e,
108 Err(err) => {
109 let path = err.path().map(|p| p.to_path_buf()).unwrap_or_default();
110 warnings.push(ScanWarning::new(
111 path,
112 err.to_string(),
113 WarningKind::ReadError,
114 ));
115 continue;
116 }
117 };
118
119 let path = entry.path();
120 let file_name = entry.file_name().to_string_lossy().to_string();
121
122 if config.should_ignore(&file_name) {
124 continue;
125 }
126
127 let metadata = match entry.metadata() {
129 Ok(m) => m,
130 Err(err) => {
131 warnings.push(ScanWarning::new(
132 &path,
133 err.to_string(),
134 WarningKind::MetadataError,
135 ));
136 continue;
137 }
138 };
139
140 if !config.cross_filesystems && get_dev(&metadata) != root_device {
142 continue;
143 }
144
145 let file_type = entry.file_type();
147 let depth = entry.depth() as u32;
148
149 if file_type.is_dir() {
150 stats.record_dir(depth);
151
152 if let Some(parent) = path.parent() {
154 let entry_info = EntryInfo {
155 name: file_name.into(),
156 path: path.clone(),
157 size: 0,
158 blocks: 0,
159 is_dir: true,
160 is_symlink: false,
161 executable: false,
162 timestamps: Timestamps::new(
163 metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
164 metadata.accessed().ok(),
165 metadata.created().ok(),
166 ),
167 inode: Some(InodeInfo::new(get_ino(&metadata), get_dev(&metadata))),
168 };
169
170 entries_by_parent
171 .entry(parent.to_path_buf())
172 .or_default()
173 .push(entry_info);
174 }
175 } else if file_type.is_file() {
176 let inode_info = InodeInfo::new(get_ino(&metadata), get_dev(&metadata));
178 let size = if config.apparent_size {
179 metadata.len()
180 } else {
181 if get_nlink(&metadata) > 1 && !inode_tracker.track(inode_info) {
183 0 } else {
185 metadata.len()
186 }
187 };
188
189 let blocks = get_blocks(&metadata);
190
191 stats.record_file(
192 path.clone(),
193 size,
194 metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
195 depth,
196 );
197
198 if let Some(parent) = path.parent() {
199 let executable = is_executable(&metadata);
200 let entry_info = EntryInfo {
201 name: file_name.into(),
202 path: path.clone(),
203 size,
204 blocks,
205 is_dir: false,
206 is_symlink: false,
207 executable,
208 timestamps: Timestamps::new(
209 metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
210 metadata.accessed().ok(),
211 metadata.created().ok(),
212 ),
213 inode: Some(inode_info),
214 };
215
216 entries_by_parent
217 .entry(parent.to_path_buf())
218 .or_default()
219 .push(entry_info);
220 }
221
222 let count = progress_counter.fetch_add(1, Ordering::Relaxed);
224 if count % 1000 == 0 {
225 let _ = self.progress_tx.send(ScanProgress {
226 files_scanned: stats.total_files,
227 dirs_scanned: stats.total_dirs,
228 bytes_scanned: stats.total_size,
229 current_path: path.clone(),
230 errors_count: warnings.len() as u64,
231 elapsed: std::time::Duration::ZERO, });
233 }
234 } else if file_type.is_symlink() {
235 stats.record_symlink();
236
237 if let Some(parent) = path.parent() {
238 let target = std::fs::read_link(&path)
239 .map(|p| p.to_string_lossy().to_string())
240 .unwrap_or_default();
241
242 let broken = !path.exists();
243 if broken {
244 warnings.push(ScanWarning::broken_symlink(&path, &target));
245 }
246
247 let entry_info = EntryInfo {
248 name: file_name.into(),
249 path: path.clone(),
250 size: 0,
251 blocks: 0,
252 is_dir: false,
253 is_symlink: true,
254 executable: false,
255 timestamps: Timestamps::new(
256 metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
257 metadata.accessed().ok(),
258 metadata.created().ok(),
259 ),
260 inode: None,
261 };
262
263 entries_by_parent
264 .entry(parent.to_path_buf())
265 .or_default()
266 .push(entry_info);
267 }
268 }
269 }
270
271 Ok(entries_by_parent)
272 }
273
274 fn build_tree(
276 &self,
277 root_path: &Path,
278 mut entries_by_parent: HashMap<PathBuf, Vec<EntryInfo>>,
279 node_id_counter: &AtomicU64,
280 stats: &mut TreeStats,
281 ) -> FileNode {
282 self.build_node(root_path, &mut entries_by_parent, node_id_counter, stats)
283 }
284
285 fn build_node(
287 &self,
288 path: &Path,
289 entries_by_parent: &mut HashMap<PathBuf, Vec<EntryInfo>>,
290 node_id_counter: &AtomicU64,
291 stats: &mut TreeStats,
292 ) -> FileNode {
293 let id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
294 let name = path
295 .file_name()
296 .map(|n| n.to_string_lossy().to_string())
297 .unwrap_or_else(|| path.to_string_lossy().to_string());
298
299 let metadata = std::fs::metadata(path).ok();
300 let timestamps = metadata
301 .as_ref()
302 .map(|m| {
303 Timestamps::new(
304 m.modified().unwrap_or(std::time::UNIX_EPOCH),
305 m.accessed().ok(),
306 m.created().ok(),
307 )
308 })
309 .unwrap_or_else(|| Timestamps::with_modified(std::time::UNIX_EPOCH));
310
311 let mut node = FileNode::new_directory(id, name, timestamps);
312
313 let children_entries = entries_by_parent.remove(path).unwrap_or_default();
315
316 let mut total_size: u64 = 0;
317 let mut file_count: u64 = 0;
318 let mut dir_count: u64 = 0;
319
320 for entry in children_entries {
321 if entry.is_dir {
322 let child_node = self.build_node(&entry.path, entries_by_parent, node_id_counter, stats);
324 total_size += child_node.size;
325 file_count += child_node.file_count();
326 dir_count += child_node.dir_count() + 1;
327 node.children.push(child_node);
328 } else if entry.is_symlink {
329 let child_id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
331 let target = std::fs::read_link(&entry.path)
332 .map(|p| CompactString::new(p.to_string_lossy()))
333 .unwrap_or_default();
334 let broken = !entry.path.exists();
335
336 let child_node = FileNode {
337 id: child_id,
338 name: entry.name,
339 kind: NodeKind::Symlink { target, broken },
340 size: 0,
341 blocks: 0,
342 timestamps: entry.timestamps,
343 inode: None,
344 content_hash: None,
345 children: Vec::new(),
346 };
347 node.children.push(child_node);
348 } else {
349 let child_id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
351 let mut child_node = FileNode::new_file(
352 child_id,
353 entry.name,
354 entry.size,
355 entry.blocks,
356 entry.timestamps,
357 entry.executable,
358 );
359 child_node.inode = entry.inode;
360
361 total_size += entry.size;
362 file_count += 1;
363 node.children.push(child_node);
364 }
365 }
366
367 node.size = total_size;
369 node.kind = NodeKind::Directory {
370 file_count,
371 dir_count,
372 };
373
374 node.children.sort_by(|a, b| b.size.cmp(&a.size));
376
377 node
378 }
379}
380
381impl Default for JwalkScanner {
382 fn default() -> Self {
383 Self::new()
384 }
385}
386
387pub fn quick_list(path: &Path) -> Result<FileTree, ScanError> {
391 use std::sync::atomic::{AtomicU64, Ordering};
392 use std::time::Instant;
393
394 let start = Instant::now();
395 let root_path = path.canonicalize().map_err(|e| ScanError::io(path, e))?;
396
397 if !root_path.is_dir() {
398 return Err(ScanError::NotADirectory {
399 path: root_path.clone(),
400 });
401 }
402
403 let node_id_counter = AtomicU64::new(0);
404 let mut stats = TreeStats::new();
405
406 let root_metadata = std::fs::metadata(&root_path).map_err(|e| ScanError::io(&root_path, e))?;
408 let root_timestamps = Timestamps::new(
409 root_metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
410 root_metadata.accessed().ok(),
411 root_metadata.created().ok(),
412 );
413
414 let root_name = root_path
415 .file_name()
416 .map(|n| n.to_string_lossy().to_string())
417 .unwrap_or_else(|| root_path.to_string_lossy().to_string());
418
419 let root_id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
420 let mut root_node = FileNode::new_directory(root_id, root_name, root_timestamps);
421
422 let entries = std::fs::read_dir(&root_path).map_err(|e| ScanError::io(&root_path, e))?;
424
425 let mut total_size: u64 = 0;
426 let mut file_count: u64 = 0;
427 let mut dir_count: u64 = 0;
428
429 for entry_result in entries {
430 let entry = match entry_result {
431 Ok(e) => e,
432 Err(_) => continue,
433 };
434
435 let entry_path = entry.path();
436 let entry_name = entry.file_name().to_string_lossy().to_string();
437
438 if entry_name.starts_with('.') {
440 continue;
441 }
442
443 let metadata = match entry.metadata() {
444 Ok(m) => m,
445 Err(_) => continue,
446 };
447
448 let timestamps = Timestamps::new(
449 metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
450 metadata.accessed().ok(),
451 metadata.created().ok(),
452 );
453
454 let child_id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
455
456 if metadata.is_dir() {
457 let child_node = FileNode::new_directory(child_id, entry_name, timestamps);
459 root_node.children.push(child_node);
460 dir_count += 1;
461 stats.record_dir(1);
462 } else if metadata.is_file() {
463 let size = metadata.len();
465 let blocks = get_blocks(&metadata);
466 let executable = is_executable(&metadata);
467
468 let mut child_node =
469 FileNode::new_file(child_id, entry_name, size, blocks, timestamps, executable);
470
471 let inode = InodeInfo::new(get_ino(&metadata), get_dev(&metadata));
473 child_node.inode = Some(inode);
474
475 total_size += size;
476 file_count += 1;
477 root_node.children.push(child_node);
478 stats.record_file(entry_path, size, timestamps.modified, 1);
479 } else if metadata.is_symlink() {
480 let target = std::fs::read_link(&entry_path)
482 .map(|p| CompactString::new(p.to_string_lossy()))
483 .unwrap_or_default();
484 let broken = !entry_path.exists();
485
486 let child_node = FileNode {
487 id: child_id,
488 name: CompactString::new(entry_name),
489 kind: NodeKind::Symlink { target, broken },
490 size: 0,
491 blocks: 0,
492 timestamps,
493 inode: None,
494 content_hash: None,
495 children: Vec::new(),
496 };
497 root_node.children.push(child_node);
498 stats.record_symlink();
499 }
500 }
501
502 root_node.size = total_size;
504 root_node.kind = NodeKind::Directory {
505 file_count,
506 dir_count,
507 };
508
509 root_node.children.sort_by(|a, b| a.name.cmp(&b.name));
511
512 stats.record_dir(0);
513
514 let config = ScanConfig::new(&root_path);
515 let scan_duration = start.elapsed();
516
517 Ok(FileTree::new(
518 root_node,
519 root_path,
520 config,
521 stats,
522 scan_duration,
523 Vec::new(),
524 ))
525}
526
527struct EntryInfo {
529 name: CompactString,
530 path: PathBuf,
531 size: u64,
532 blocks: u64,
533 is_dir: bool,
534 is_symlink: bool,
535 executable: bool,
536 timestamps: Timestamps,
537 inode: Option<InodeInfo>,
538}
539
/// Reports whether any execute bit (owner, group or other) is set in the mode.
#[cfg(unix)]
fn is_executable(metadata: &std::fs::Metadata) -> bool {
    use std::os::unix::fs::PermissionsExt;

    const ANY_EXECUTE_BIT: u32 = 0o111;
    let mode = metadata.permissions().mode();
    (mode & ANY_EXECUTE_BIT) != 0
}

/// Non-unix fallback: always reports `false`.
#[cfg(not(unix))]
fn is_executable(_metadata: &std::fs::Metadata) -> bool {
    false
}
551
/// Returns the id of the device that holds the file.
#[cfg(unix)]
fn get_dev(metadata: &std::fs::Metadata) -> u64 {
    MetadataExt::dev(metadata)
}

/// Non-unix fallback: always returns the placeholder `0`.
#[cfg(not(unix))]
fn get_dev(_metadata: &std::fs::Metadata) -> u64 {
    0
}
564
/// Returns the file's inode number.
#[cfg(unix)]
fn get_ino(metadata: &std::fs::Metadata) -> u64 {
    MetadataExt::ino(metadata)
}

/// Non-unix fallback: always returns the placeholder `0`.
#[cfg(not(unix))]
fn get_ino(_metadata: &std::fs::Metadata) -> u64 {
    0
}
575
/// Returns the number of hard links to the file.
#[cfg(unix)]
fn get_nlink(metadata: &std::fs::Metadata) -> u64 {
    MetadataExt::nlink(metadata)
}

/// Non-unix fallback: always reports a single link.
#[cfg(not(unix))]
fn get_nlink(_metadata: &std::fs::Metadata) -> u64 {
    1
}
586
/// Returns the number of 512-byte blocks allocated to the file.
#[cfg(unix)]
fn get_blocks(metadata: &std::fs::Metadata) -> u64 {
    metadata.blocks()
}

/// Non-unix fallback: estimates the block count as the file length rounded up
/// to whole 512-byte units.
#[cfg(not(unix))]
fn get_blocks(metadata: &std::fs::Metadata) -> u64 {
    const BLOCK_SIZE: u64 = 512;
    let len = metadata.len();
    // Ceiling division written so it cannot overflow, unlike `(len + 511) / 512`.
    len / BLOCK_SIZE + u64::from(len % BLOCK_SIZE != 0)
}
598
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Builds a temporary fixture with three directories and four files:
    ///
    /// ```text
    /// file1.txt
    /// dir1/file2.txt
    /// dir1/subdir/file3.txt
    /// dir2/file4.txt
    /// ```
    fn create_test_tree() -> TempDir {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        // Parents are listed before their children so `create_dir` succeeds.
        let directories = ["dir1", "dir2", "dir1/subdir"];
        for dir in directories.iter() {
            fs::create_dir(root.join(dir)).unwrap();
        }

        let files = [
            ("file1.txt", "hello"),
            ("dir1/file2.txt", "world world world"),
            ("dir1/subdir/file3.txt", "test"),
            ("dir2/file4.txt", "another file here"),
        ];
        for &(relative, contents) in files.iter() {
            fs::write(root.join(relative), contents).unwrap();
        }

        temp
    }

    /// A default scan counts every file and directory in the fixture.
    #[test]
    fn test_basic_scan() {
        let temp = create_test_tree();
        let config = ScanConfig::new(temp.path());

        let tree = JwalkScanner::new().scan(&config).unwrap();

        assert_eq!(tree.stats.total_files, 4);
        assert!(tree.stats.total_dirs >= 3);
        assert!(tree.root.size > 0);
    }

    /// The root's children are ordered by size, largest first.
    #[test]
    fn test_children_sorted_by_size() {
        let temp = create_test_tree();
        let config = ScanConfig::new(temp.path());

        let tree = JwalkScanner::new().scan(&config).unwrap();

        for pair in tree.root.children.windows(2) {
            assert!(pair[0].size >= pair[1].size);
        }
    }

    /// A directory matching an ignore pattern does not appear under the root.
    #[test]
    fn test_ignore_patterns() {
        let temp = create_test_tree();
        let config = ScanConfig::builder()
            .root(temp.path())
            .ignore_patterns(vec!["dir2".to_string()])
            .build()
            .unwrap();

        let tree = JwalkScanner::new().scan(&config).unwrap();

        let dir2_listed = tree
            .root
            .children
            .iter()
            .any(|child| child.name.as_str() == "dir2");
        assert!(!dir2_listed);
    }
}