1use std::collections::HashMap;
4use std::os::unix::fs::MetadataExt;
5use std::path::{Path, PathBuf};
6use std::sync::atomic::{AtomicU64, Ordering};
7use std::sync::Arc;
8use std::time::Instant;
9
10use compact_str::CompactString;
11use jwalk::{Parallelism, WalkDir};
12use tokio::sync::broadcast;
13
14use gravityfile_core::{
15 FileNode, FileTree, InodeInfo, NodeId, NodeKind, ScanConfig, ScanError, ScanWarning,
16 Timestamps, TreeStats, WarningKind,
17};
18
19use crate::inode::InodeTracker;
20use crate::progress::ScanProgress;
21
/// Directory scanner that walks a tree with `jwalk` and assembles a [`FileTree`].
///
/// Progress updates produced while scanning are published on an internal
/// broadcast channel; see [`JwalkScanner::subscribe`].
pub struct JwalkScanner {
    /// Sending half of the broadcast channel that carries [`ScanProgress`] updates.
    progress_tx: broadcast::Sender<ScanProgress>,
}
26
27impl JwalkScanner {
28 pub fn new() -> Self {
30 let (progress_tx, _) = broadcast::channel(100);
31 Self { progress_tx }
32 }
33
34 pub fn subscribe(&self) -> broadcast::Receiver<ScanProgress> {
36 self.progress_tx.subscribe()
37 }
38
39 pub fn scan(&self, config: &ScanConfig) -> Result<FileTree, ScanError> {
41 let start = Instant::now();
42 let root_path = config.root.canonicalize().map_err(|e| ScanError::io(&config.root, e))?;
43
44 if !root_path.is_dir() {
46 return Err(ScanError::NotADirectory { path: root_path });
47 }
48
49 let root_metadata = std::fs::metadata(&root_path).map_err(|e| ScanError::io(&root_path, e))?;
51 let root_device = root_metadata.dev();
52
53 let inode_tracker = InodeTracker::new();
55 let node_id_counter = AtomicU64::new(0);
56 let mut stats = TreeStats::new();
57 let mut warnings = Vec::new();
58
59 let entries = self.collect_entries(config, &root_path, root_device, &inode_tracker, &mut stats, &mut warnings)?;
61
62 let root_node = self.build_tree(&root_path, entries, &node_id_counter, &mut stats);
64
65 let scan_duration = start.elapsed();
66
67 Ok(FileTree::new(
68 root_node,
69 root_path,
70 config.clone(),
71 stats,
72 scan_duration,
73 warnings,
74 ))
75 }
76
77 fn collect_entries(
79 &self,
80 config: &ScanConfig,
81 root_path: &Path,
82 root_device: u64,
83 inode_tracker: &InodeTracker,
84 stats: &mut TreeStats,
85 warnings: &mut Vec<ScanWarning>,
86 ) -> Result<HashMap<PathBuf, Vec<EntryInfo>>, ScanError> {
87 let parallelism = match config.threads {
88 0 => Parallelism::RayonDefaultPool { busy_timeout: std::time::Duration::from_millis(100) },
89 n => Parallelism::RayonNewPool(n),
90 };
91
92 let walker = WalkDir::new(root_path)
93 .parallelism(parallelism)
94 .skip_hidden(!config.include_hidden)
95 .follow_links(config.follow_symlinks)
96 .min_depth(0)
97 .max_depth(config.max_depth.map(|d| d as usize).unwrap_or(usize::MAX));
98
99 let mut entries_by_parent: HashMap<PathBuf, Vec<EntryInfo>> = HashMap::new();
101 let progress_counter = Arc::new(AtomicU64::new(0));
102
103 for entry_result in walker {
104 let entry = match entry_result {
105 Ok(e) => e,
106 Err(err) => {
107 let path = err.path().map(|p| p.to_path_buf()).unwrap_or_default();
108 warnings.push(ScanWarning::new(
109 path,
110 err.to_string(),
111 WarningKind::ReadError,
112 ));
113 continue;
114 }
115 };
116
117 let path = entry.path();
118 let file_name = entry.file_name().to_string_lossy().to_string();
119
120 if config.should_ignore(&file_name) {
122 continue;
123 }
124
125 let metadata = match entry.metadata() {
127 Ok(m) => m,
128 Err(err) => {
129 warnings.push(ScanWarning::new(
130 &path,
131 err.to_string(),
132 WarningKind::MetadataError,
133 ));
134 continue;
135 }
136 };
137
138 if !config.cross_filesystems && metadata.dev() != root_device {
140 continue;
141 }
142
143 let file_type = entry.file_type();
145 let depth = entry.depth() as u32;
146
147 if file_type.is_dir() {
148 stats.record_dir(depth);
149
150 if let Some(parent) = path.parent() {
152 let entry_info = EntryInfo {
153 name: file_name.into(),
154 path: path.clone(),
155 size: 0,
156 blocks: 0,
157 is_dir: true,
158 is_symlink: false,
159 executable: false,
160 timestamps: Timestamps::new(
161 metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
162 metadata.accessed().ok(),
163 metadata.created().ok(),
164 ),
165 inode: Some(InodeInfo::new(metadata.ino(), metadata.dev())),
166 };
167
168 entries_by_parent
169 .entry(parent.to_path_buf())
170 .or_default()
171 .push(entry_info);
172 }
173 } else if file_type.is_file() {
174 let inode_info = InodeInfo::new(metadata.ino(), metadata.dev());
176 let size = if config.apparent_size {
177 metadata.len()
178 } else {
179 if metadata.nlink() > 1 && !inode_tracker.track(inode_info) {
181 0 } else {
183 metadata.len()
184 }
185 };
186
187 let blocks = metadata.blocks();
188
189 stats.record_file(
190 path.clone(),
191 size,
192 metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
193 depth,
194 );
195
196 if let Some(parent) = path.parent() {
197 let executable = is_executable(&metadata);
198 let entry_info = EntryInfo {
199 name: file_name.into(),
200 path: path.clone(),
201 size,
202 blocks,
203 is_dir: false,
204 is_symlink: false,
205 executable,
206 timestamps: Timestamps::new(
207 metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
208 metadata.accessed().ok(),
209 metadata.created().ok(),
210 ),
211 inode: Some(inode_info),
212 };
213
214 entries_by_parent
215 .entry(parent.to_path_buf())
216 .or_default()
217 .push(entry_info);
218 }
219
220 let count = progress_counter.fetch_add(1, Ordering::Relaxed);
222 if count % 1000 == 0 {
223 let _ = self.progress_tx.send(ScanProgress {
224 files_scanned: stats.total_files,
225 dirs_scanned: stats.total_dirs,
226 bytes_scanned: stats.total_size,
227 current_path: path.clone(),
228 errors_count: warnings.len() as u64,
229 elapsed: std::time::Duration::ZERO, });
231 }
232 } else if file_type.is_symlink() {
233 stats.record_symlink();
234
235 if let Some(parent) = path.parent() {
236 let target = std::fs::read_link(&path)
237 .map(|p| p.to_string_lossy().to_string())
238 .unwrap_or_default();
239
240 let broken = !path.exists();
241 if broken {
242 warnings.push(ScanWarning::broken_symlink(&path, &target));
243 }
244
245 let entry_info = EntryInfo {
246 name: file_name.into(),
247 path: path.clone(),
248 size: 0,
249 blocks: 0,
250 is_dir: false,
251 is_symlink: true,
252 executable: false,
253 timestamps: Timestamps::new(
254 metadata.modified().unwrap_or(std::time::UNIX_EPOCH),
255 metadata.accessed().ok(),
256 metadata.created().ok(),
257 ),
258 inode: None,
259 };
260
261 entries_by_parent
262 .entry(parent.to_path_buf())
263 .or_default()
264 .push(entry_info);
265 }
266 }
267 }
268
269 Ok(entries_by_parent)
270 }
271
272 fn build_tree(
274 &self,
275 root_path: &Path,
276 mut entries_by_parent: HashMap<PathBuf, Vec<EntryInfo>>,
277 node_id_counter: &AtomicU64,
278 stats: &mut TreeStats,
279 ) -> FileNode {
280 self.build_node(root_path, &mut entries_by_parent, node_id_counter, stats)
281 }
282
283 fn build_node(
285 &self,
286 path: &Path,
287 entries_by_parent: &mut HashMap<PathBuf, Vec<EntryInfo>>,
288 node_id_counter: &AtomicU64,
289 stats: &mut TreeStats,
290 ) -> FileNode {
291 let id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
292 let name = path
293 .file_name()
294 .map(|n| n.to_string_lossy().to_string())
295 .unwrap_or_else(|| path.to_string_lossy().to_string());
296
297 let metadata = std::fs::metadata(path).ok();
298 let timestamps = metadata
299 .as_ref()
300 .map(|m| {
301 Timestamps::new(
302 m.modified().unwrap_or(std::time::UNIX_EPOCH),
303 m.accessed().ok(),
304 m.created().ok(),
305 )
306 })
307 .unwrap_or_else(|| Timestamps::with_modified(std::time::UNIX_EPOCH));
308
309 let mut node = FileNode::new_directory(id, name, timestamps);
310
311 let children_entries = entries_by_parent.remove(path).unwrap_or_default();
313
314 let mut total_size: u64 = 0;
315 let mut file_count: u64 = 0;
316 let mut dir_count: u64 = 0;
317
318 for entry in children_entries {
319 if entry.is_dir {
320 let child_node = self.build_node(&entry.path, entries_by_parent, node_id_counter, stats);
322 total_size += child_node.size;
323 file_count += child_node.file_count();
324 dir_count += child_node.dir_count() + 1;
325 node.children.push(child_node);
326 } else if entry.is_symlink {
327 let child_id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
329 let target = std::fs::read_link(&entry.path)
330 .map(|p| CompactString::new(p.to_string_lossy()))
331 .unwrap_or_default();
332 let broken = !entry.path.exists();
333
334 let child_node = FileNode {
335 id: child_id,
336 name: entry.name,
337 kind: NodeKind::Symlink { target, broken },
338 size: 0,
339 blocks: 0,
340 timestamps: entry.timestamps,
341 inode: None,
342 content_hash: None,
343 children: Vec::new(),
344 };
345 node.children.push(child_node);
346 } else {
347 let child_id = NodeId::new(node_id_counter.fetch_add(1, Ordering::Relaxed));
349 let mut child_node = FileNode::new_file(
350 child_id,
351 entry.name,
352 entry.size,
353 entry.blocks,
354 entry.timestamps,
355 entry.executable,
356 );
357 child_node.inode = entry.inode;
358
359 total_size += entry.size;
360 file_count += 1;
361 node.children.push(child_node);
362 }
363 }
364
365 node.size = total_size;
367 node.kind = NodeKind::Directory {
368 file_count,
369 dir_count,
370 };
371
372 node.children.sort_by(|a, b| b.size.cmp(&a.size));
374
375 node
376 }
377}
378
379impl Default for JwalkScanner {
380 fn default() -> Self {
381 Self::new()
382 }
383}
384
/// One filesystem entry recorded during the walk, kept until the tree is assembled.
struct EntryInfo {
    /// File name of the entry, converted lossily to UTF-8.
    name: CompactString,
    /// Path of the entry as reported by the walker.
    path: PathBuf,
    /// Size in bytes attributed to the entry; always 0 for directories and symlinks.
    size: u64,
    /// Block count taken from the file's metadata; 0 for directories and symlinks.
    blocks: u64,
    /// Whether the entry is a directory.
    is_dir: bool,
    /// Whether the entry is a symbolic link.
    is_symlink: bool,
    /// Whether a regular file has an execute permission bit set; `false` for other kinds.
    executable: bool,
    /// Modified / accessed / created times read from the entry's metadata.
    timestamps: Timestamps,
    /// Inode and device identity; set for directories and files, `None` for symlinks.
    inode: Option<InodeInfo>,
}
397
/// Reports whether any execute permission bit is set in `metadata`'s mode.
#[cfg(unix)]
fn is_executable(metadata: &std::fs::Metadata) -> bool {
    use std::os::unix::fs::PermissionsExt;

    // Execute bits for owner, group and others.
    const EXEC_MASK: u32 = 0o111;

    (metadata.permissions().mode() & EXEC_MASK) != 0
}

/// Fallback for non-Unix targets: always reports `false`.
#[cfg(not(unix))]
fn is_executable(_metadata: &std::fs::Metadata) -> bool {
    false
}
409
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;
    use gravityfile_core::ScanConfigBuilder;

    /// Builds a small fixture: three directories (one nested) and four text files.
    fn create_test_tree() -> TempDir {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        // Parents are listed before their children.
        for dir in ["dir1", "dir2", "dir1/subdir"] {
            fs::create_dir(root.join(dir)).unwrap();
        }

        let files = [
            ("file1.txt", "hello"),
            ("dir1/file2.txt", "world world world"),
            ("dir1/subdir/file3.txt", "test"),
            ("dir2/file4.txt", "another file here"),
        ];
        for (relative, contents) in files {
            fs::write(root.join(relative), contents).unwrap();
        }

        temp
    }

    /// Runs a scan of `temp` with the default configuration.
    fn scan_default(temp: &TempDir) -> FileTree {
        let config = ScanConfig::new(temp.path());
        JwalkScanner::new().scan(&config).unwrap()
    }

    /// A default scan sees every file and directory and reports a non-zero size.
    #[test]
    fn test_basic_scan() {
        let temp = create_test_tree();
        let tree = scan_default(&temp);

        assert_eq!(tree.stats.total_files, 4);
        assert!(tree.stats.total_dirs >= 3);
        assert!(tree.root.size > 0);
    }

    /// The root's children are ordered from largest to smallest.
    #[test]
    fn test_children_sorted_by_size() {
        let temp = create_test_tree();
        let tree = scan_default(&temp);

        for pair in tree.root.children.windows(2) {
            assert!(pair[0].size >= pair[1].size);
        }
    }

    /// A directory matching an ignore pattern does not appear under the root.
    #[test]
    fn test_ignore_patterns() {
        let temp = create_test_tree();
        let config = ScanConfig::builder()
            .root(temp.path())
            .ignore_patterns(vec!["dir2".to_string()])
            .build()
            .unwrap();

        let tree = JwalkScanner::new().scan(&config).unwrap();

        let has_dir2 = tree
            .root
            .children
            .iter()
            .any(|child| child.name.as_str() == "dir2");
        assert!(!has_dir2);
    }
}