acme_disk_use/
scanner.rs

1//! Directory scanning module for calculating disk usage statistics
2
3use rayon::prelude::*;
4use serde::{Deserialize, Serialize};
5use std::{
6    collections::HashMap,
7    fs, io,
8    path::{Path, PathBuf},
9    time::SystemTime,
10};
11
12/// Statistics for a directory and its contents
13#[derive(Serialize, Deserialize, Debug, Clone)]
14pub struct DirStat {
15    pub(crate) path: PathBuf,
16    pub(crate) total_size: u64, // Logical sum of st_size of all files
17    pub(crate) file_count: u64,
18    pub(crate) last_scan: SystemTime, // When this subtree was last scanned
19    pub(crate) children: HashMap<PathBuf, DirStat>,
20}
21
22impl DirStat {
23    /// Get the total size of this directory
24    pub fn total_size(&self) -> u64 {
25        self.total_size
26    }
27
28    /// Get the file count in this directory
29    pub fn file_count(&self) -> u64 {
30        self.file_count
31    }
32
33    /// Get the last scan time
34    pub fn last_scan(&self) -> SystemTime {
35        self.last_scan
36    }
37
38    /// Get the path of this directory
39    pub fn path(&self) -> &Path {
40        &self.path
41    }
42}
43
44/// Prune deleted directories from the cache recursively
45///
46/// Removes any child DirStat entries whose paths no longer exist on disk.
47/// Returns true if any deletions were found and pruned.
48fn prune_deleted_dirs(cached: &mut DirStat) -> bool {
49    let mut found_deletions = false;
50
51    // Check direct children for deletions
52    cached.children.retain(|child_path, child_stat| {
53        if !child_path.exists() {
54            found_deletions = true;
55            false // Remove this entry
56        } else {
57            // Recursively prune this child's children
58            if prune_deleted_dirs(child_stat) {
59                found_deletions = true;
60            }
61            true // Keep this entry
62        }
63    });
64
65    found_deletions
66}
67
68/// Check if a directory or any of its subdirectories have been modified
69///
70/// Assumes deleted directories have already been pruned via prune_deleted_dirs.
71/// Uses a recursive mtime comparison approach:
72/// 1. Check if directory's own mtime > last_scan (files/dirs added/removed)
73/// 2. Recursively validate cached subdirectories
74///
75/// OPTIMIZATION: We skip reading the directory contents (fs::read_dir) and rely
76/// entirely on the directory's mtime. If files were added or removed, the
77/// directory mtime would have been updated by the OS.
78fn dir_changed_since_last_scan(path: &Path, cached: &DirStat) -> bool {
79    // Check if the directory itself was modified
80    match fs::metadata(path).and_then(|m| m.modified()) {
81        Ok(mtime) => {
82            if mtime > cached.last_scan {
83                return true;
84            }
85        }
86        Err(_) => return true, // If we can't stat it, assume it changed (or is gone/inaccessible)
87    }
88
89    // If directory mtime hasn't changed, we assume no files were added/removed
90    // at this level. However, subdirectories might have changed internally
91    // without updating the parent's mtime.
92    for (child_path, child_stat) in &cached.children {
93        if dir_changed_since_last_scan(child_path, child_stat) {
94            return true;
95        }
96    }
97
98    false
99}
100
101/// Scan a directory recursively and return statistics
102///
103/// # Arguments
104/// * `path` - The directory path to scan
105/// * `cache` - Optional cached statistics for this directory
106///
107/// # Returns
108/// Directory statistics including size, file count, and child directories
109pub fn scan_directory(path: &Path, cache: Option<&DirStat>) -> io::Result<DirStat> {
110    // If cache exists, first prune deleted directories, then check if rescan needed
111    if let Some(cached) = cache {
112        let mut pruned_cache = cached.clone();
113        let had_deletions = prune_deleted_dirs(&mut pruned_cache);
114
115        // If we found deletions, we need to recalculate totals from remaining children
116        if had_deletions {
117            // Recalculate total_size and file_count from remaining children
118            let mut total_size = 0;
119            let mut file_count = 0;
120
121            for child in pruned_cache.children.values() {
122                total_size += child.total_size;
123                file_count += child.file_count;
124            }
125
126            // Count files at this level (not in subdirs)
127            if let Ok(entries) = fs::read_dir(path) {
128                for entry in entries.flatten() {
129                    if let Ok(meta) = entry.metadata() {
130                        if meta.is_file() {
131                            total_size += meta.len();
132                            file_count += 1;
133                        }
134                    }
135                }
136            }
137
138            pruned_cache.total_size = total_size;
139            pruned_cache.file_count = file_count;
140            pruned_cache.last_scan = SystemTime::now();
141        }
142
143        // Now check if directory changed (excluding deletion checks)
144        if !dir_changed_since_last_scan(path, &pruned_cache) {
145            return Ok(pruned_cache);
146        }
147    }
148
149    let mut total_size = 0;
150    let mut file_count = 0;
151    let mut children = HashMap::new();
152
153    // Collect entries first for potential parallel processing
154    let entries: Vec<_> = fs::read_dir(path)?.filter_map(|e| e.ok()).collect();
155
156    // Process files and collect subdirectories
157    let mut subdirs = Vec::new();
158
159    for entry in entries {
160        let entry_path = entry.path();
161        if let Ok(meta) = entry.metadata() {
162            if meta.is_file() {
163                total_size += meta.len();
164                file_count += 1;
165            } else if meta.is_dir() {
166                subdirs.push(entry_path);
167            }
168        }
169    }
170
171    // Process subdirectories in parallel if we have multiple
172    if subdirs.len() > 1 {
173        let results: Vec<_> = subdirs
174            .par_iter()
175            .filter_map(|entry_path| {
176                let child_cache = cache.and_then(|c| c.children.get(entry_path));
177                scan_directory(entry_path, child_cache).ok()
178            })
179            .collect();
180
181        for child_stat in results {
182            total_size += child_stat.total_size;
183            file_count += child_stat.file_count;
184            children.insert(child_stat.path.clone(), child_stat);
185        }
186    } else {
187        // Sequential processing for single subdirectory
188        for entry_path in subdirs {
189            let child_cache = cache.and_then(|c| c.children.get(&entry_path));
190            if let Ok(child_stat) = scan_directory(&entry_path, child_cache) {
191                total_size += child_stat.total_size;
192                file_count += child_stat.file_count;
193                children.insert(entry_path, child_stat);
194            }
195        }
196    }
197
198    Ok(DirStat {
199        path: path.to_path_buf(),
200        total_size,
201        file_count,
202        last_scan: SystemTime::now(),
203        children,
204    })
205}
206
207/// Count files in a directory recursively (without using cache)
208pub fn count_files(path: &Path) -> io::Result<u64> {
209    let mut count = 0;
210
211    for entry in fs::read_dir(path)? {
212        let entry = entry?;
213        let meta = entry.metadata()?;
214
215        if meta.is_file() {
216            count += 1;
217        } else if meta.is_dir() {
218            count += count_files(&entry.path())?;
219        }
220    }
221
222    Ok(count)
223}
224
225#[cfg(test)]
226mod tests {
227    use super::*;
228    use tempfile::TempDir;
229
230    fn create_test_structure(base: &Path) -> io::Result<()> {
231        fs::create_dir_all(base.join("subdir1"))?;
232        fs::create_dir_all(base.join("subdir2/nested"))?;
233
234        fs::write(base.join("file1.txt"), "Hello World")?; // 11 bytes
235        fs::write(base.join("file2.txt"), "Test content")?; // 12 bytes
236        fs::write(base.join("subdir1/nested_file.txt"), "Nested content here")?; // 19 bytes
237        fs::write(base.join("subdir2/another.txt"), "More content")?; // 12 bytes
238        fs::write(base.join("subdir2/nested/deep.txt"), "Deep file content")?; // 17 bytes
239
240        Ok(())
241    }
242
243    #[test]
244    fn test_scan_directory() -> io::Result<()> {
245        let temp_dir = TempDir::new()?;
246        let test_dir = temp_dir.path().join("test");
247        fs::create_dir(&test_dir)?;
248
249        create_test_structure(&test_dir)?;
250
251        let result = scan_directory(&test_dir, None)?;
252
253        // Expected total: 11 + 12 + 19 + 12 + 17 = 71 bytes
254        assert_eq!(result.total_size(), 71);
255        assert_eq!(result.file_count(), 5);
256        assert_eq!(result.children.len(), 2); // subdir1 and subdir2
257
258        Ok(())
259    }
260
261    #[test]
262    fn test_count_files() -> io::Result<()> {
263        let temp_dir = TempDir::new()?;
264        let test_dir = temp_dir.path().join("test");
265        fs::create_dir(&test_dir)?;
266
267        create_test_structure(&test_dir)?;
268
269        let count = count_files(&test_dir)?;
270        assert_eq!(count, 5);
271
272        Ok(())
273    }
274
275    #[test]
276    fn test_scan_with_cache() -> io::Result<()> {
277        let temp_dir = TempDir::new()?;
278        let test_dir = temp_dir.path().join("test");
279        fs::create_dir(&test_dir)?;
280
281        create_test_structure(&test_dir)?;
282
283        // First scan without cache
284        let stats1 = scan_directory(&test_dir, None)?;
285        let scan_time1 = stats1.last_scan();
286
287        // Second scan with cache (should reuse if directory hasn't changed)
288        let stats2 = scan_directory(&test_dir, Some(&stats1))?;
289        let scan_time2 = stats2.last_scan();
290
291        // Since directory hasn't changed, should return cached stats with same timestamp
292        assert_eq!(scan_time1, scan_time2);
293
294        Ok(())
295    }
296
297    #[test]
298    fn test_detects_new_nested_subdirectory() -> io::Result<()> {
299        use std::thread::sleep;
300        use std::time::Duration;
301
302        let temp_dir = TempDir::new()?;
303        let test_dir = temp_dir.path().join("test");
304        fs::create_dir(&test_dir)?;
305
306        // Create initial structure: test/a/
307        fs::create_dir(test_dir.join("a"))?;
308        fs::write(test_dir.join("a/file1.txt"), "content")?;
309
310        // First scan
311        let stats1 = scan_directory(&test_dir, None)?;
312        assert_eq!(stats1.file_count(), 1);
313
314        // Wait a moment to ensure time difference
315        sleep(Duration::from_millis(10));
316
317        // Now create test/a/b/ (this updates a's mtime but NOT test's mtime)
318        fs::create_dir(test_dir.join("a/b"))?;
319        fs::write(test_dir.join("a/b/file2.txt"), "new content")?;
320
321        // Second scan with cache - should detect the new subdirectory
322        let stats2 = scan_directory(&test_dir, Some(&stats1))?;
323
324        // Should have scanned and found the new file
325        assert_eq!(stats2.file_count(), 2);
326        assert!(
327            stats2.last_scan() > stats1.last_scan(),
328            "Should have rescanned since new subdirectory was added"
329        );
330
331        Ok(())
332    }
333
334    #[test]
335    fn test_detects_deleted_subdirectory() -> io::Result<()> {
336        use std::thread::sleep;
337        use std::time::Duration;
338
339        let temp_dir = TempDir::new()?;
340        let test_dir = temp_dir.path().join("test");
341        fs::create_dir(&test_dir)?;
342
343        // Create initial structure
344        fs::create_dir(test_dir.join("a"))?;
345        fs::create_dir(test_dir.join("b"))?;
346        fs::write(test_dir.join("a/file1.txt"), "content")?;
347        fs::write(test_dir.join("b/file2.txt"), "content")?;
348
349        // First scan
350        let stats1 = scan_directory(&test_dir, None)?;
351        assert_eq!(stats1.file_count(), 2);
352
353        // Wait a moment
354        sleep(Duration::from_millis(10));
355
356        // Delete subdirectory b
357        fs::remove_file(test_dir.join("b/file2.txt"))?;
358        fs::remove_dir(test_dir.join("b"))?;
359
360        // Second scan with cache - should detect the deleted subdirectory
361        let stats2 = scan_directory(&test_dir, Some(&stats1))?;
362
363        // Should have rescanned and found only 1 file now
364        assert_eq!(stats2.file_count(), 1);
365        assert!(
366            stats2.last_scan() > stats1.last_scan(),
367            "Should have rescanned since subdirectory was deleted"
368        );
369
370        Ok(())
371    }
372
373    #[test]
374    fn test_prunes_deeply_nested_deleted_directory() -> io::Result<()> {
375        use std::thread::sleep;
376        use std::time::Duration;
377
378        let temp_dir = TempDir::new()?;
379        let test_dir = temp_dir.path().join("test");
380        fs::create_dir(&test_dir)?;
381
382        // Create deeply nested structure: test/a/b/c/d/
383        fs::create_dir_all(test_dir.join("a/b/c/d"))?;
384        fs::write(test_dir.join("a/file1.txt"), "content1")?;
385        fs::write(test_dir.join("a/b/file2.txt"), "content2")?;
386        fs::write(test_dir.join("a/b/c/file3.txt"), "content3")?;
387        fs::write(test_dir.join("a/b/c/d/file4.txt"), "content4")?;
388
389        // First scan
390        let stats1 = scan_directory(&test_dir, None)?;
391        assert_eq!(stats1.file_count(), 4);
392
393        // Wait a moment
394        sleep(Duration::from_millis(10));
395
396        // Delete deeply nested directory c (and its child d)
397        fs::remove_file(test_dir.join("a/b/c/d/file4.txt"))?;
398        fs::remove_dir(test_dir.join("a/b/c/d"))?;
399        fs::remove_file(test_dir.join("a/b/c/file3.txt"))?;
400        fs::remove_dir(test_dir.join("a/b/c"))?;
401
402        // Second scan with cache - should prune deleted dirs and update counts
403        let stats2 = scan_directory(&test_dir, Some(&stats1))?;
404
405        // Should have only 2 files now (file1.txt and file2.txt)
406        assert_eq!(stats2.file_count(), 2);
407
408        // Verify cache structure is updated (b should exist, but c and d should be gone)
409        let a_stats = stats2.children.get(&test_dir.join("a")).unwrap();
410        let b_stats = a_stats.children.get(&test_dir.join("a/b")).unwrap();
411        assert!(
412            !b_stats.children.contains_key(&test_dir.join("a/b/c")),
413            "Deleted directory c should be pruned from cache"
414        );
415
416        Ok(())
417    }
418}