acme_disk_use/
scanner.rs

1//! Directory scanning module for calculating disk usage statistics
2
3use rayon::prelude::*;
4use serde::{Deserialize, Serialize};
5use std::{
6    collections::HashMap,
7    fs, io,
8    path::{Path, PathBuf},
9    time::SystemTime,
10};
11
12#[cfg(unix)]
13use std::os::unix::fs::MetadataExt;
14
15use crate::error::DiskUseError;
16
/// Report how many bytes a file occupies on disk.
///
/// On Unix the `blocks` metadata field counts allocated 512-byte units, so
/// the result is `blocks * 512`; this reflects sparse files and block
/// rounding rather than the logical length. On every other platform the
/// logical length reported by `len()` is returned instead.
fn get_block_size(meta: &fs::Metadata) -> u64 {
    #[cfg(unix)]
    let on_disk_bytes = {
        // `blocks()` is always expressed in 512-byte units, independent of
        // the filesystem's own block size.
        const BLOCK_UNIT_BYTES: u64 = 512;
        meta.blocks() * BLOCK_UNIT_BYTES
    };
    #[cfg(not(unix))]
    let on_disk_bytes = meta.len();

    on_disk_bytes
}
32
/// Statistics for a directory and its contents
///
/// Each `DirStat` is one node of a tree that mirrors the scanned directory
/// hierarchy: the node for a directory owns the nodes of its immediate
/// subdirectories in `children`. The derived `Serialize`/`Deserialize`
/// impls allow the whole tree to be serialized and later handed back to
/// `scan_directory` as a cache.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct DirStat {
    /// Path of the directory this node describes, as it was passed to the scan.
    pub(crate) path: PathBuf,
    /// Sum of the per-file sizes reported by `get_block_size` for every
    /// regular file in this directory and its scanned subdirectories. On
    /// Unix that is allocated disk space (`blocks * 512`), elsewhere the
    /// logical file length.
    pub(crate) total_size: u64,
    /// Number of regular files in this directory and its scanned subdirectories.
    pub(crate) file_count: u64,
    /// Timestamp recorded by the scan that produced this node. A directory
    /// whose mtime is later than this value is treated as changed.
    pub(crate) last_scan: SystemTime,
    /// Stats of the immediate subdirectories that were scanned successfully,
    /// keyed by the subdirectory's path.
    pub(crate) children: HashMap<PathBuf, DirStat>,
}
42
43impl DirStat {
44    /// Get the total size of this directory
45    pub fn total_size(&self) -> u64 {
46        self.total_size
47    }
48
49    /// Get the file count in this directory
50    pub fn file_count(&self) -> u64 {
51        self.file_count
52    }
53
54    /// Get the last scan time
55    pub fn last_scan(&self) -> SystemTime {
56        self.last_scan
57    }
58
59    /// Get the path of this directory
60    pub fn path(&self) -> &Path {
61        &self.path
62    }
63
64    /// Get the child directories
65    pub fn children(&self) -> &HashMap<PathBuf, DirStat> {
66        &self.children
67    }
68}
69
70/// Check if a directory or any of its subdirectories have been modified
71///
72/// Uses a recursive mtime comparison approach:
73/// 1. Check if directory's own mtime > last_scan (files/dirs added/removed)
74/// 2. Recursively validate cached subdirectories
75///
76/// If files were added or removed in subdirectories, the
77/// directory mtime would have been updated by the OS.
78fn dir_changed_since_last_scan(path: &Path, cached: &DirStat) -> bool {
79    // Check if the directory itself was modified
80    match fs::metadata(path).and_then(|m| m.modified()) {
81        Ok(mtime) => {
82            if mtime > cached.last_scan {
83                return true;
84            }
85        }
86        Err(_) => return true, // If we can't stat it, assume it changed (or is gone/inaccessible)
87    }
88
89    // If directory mtime hasn't changed, we assume no files were added/removed
90    // at this level. However, subdirectories might have changed internally
91    // without updating the parent's mtime.
92    // Parallelize the check for children
93    cached
94        .children
95        .par_iter()
96        .any(|(child_path, child_stat)| dir_changed_since_last_scan(child_path, child_stat))
97}
98
99/// Scan a directory recursively and return statistics
100///
101/// # Arguments
102/// * `path` - The directory path to scan
103/// * `cache` - Optional cached statistics for this directory
104///
105/// # Returns
106/// Directory statistics including size, file count, and child directories
107pub fn scan_directory(path: &Path, cache: Option<&DirStat>) -> io::Result<DirStat> {
108    // If cache exists, check if rescan needed BEFORE cloning
109    if let Some(cached) = cache {
110        // If directory hasn't changed, return the cached version
111        // This avoids cloning if we are going to discard it anyway
112        if !dir_changed_since_last_scan(path, cached) {
113            return Ok(cached.clone());
114        }
115    }
116
117    let mut total_size = 0;
118    let mut file_count = 0;
119    let mut children = HashMap::new();
120
121    // Collect entries first for potential parallel processing
122    let entries: Vec<_> = match fs::read_dir(path) {
123        Ok(entries) => entries
124            .filter_map(|e| match e {
125                Ok(entry) => Some(entry),
126                Err(err) => {
127                    // Log warning for individual entry read errors but continue
128                    eprintln!(
129                        "Warning: Failed to read directory entry in '{}': {}",
130                        path.display(),
131                        err
132                    );
133                    None
134                }
135            })
136            .collect(),
137        Err(err) => {
138            return Err(io::Error::from(DiskUseError::ScanError {
139                path: path.to_path_buf(),
140                source: err,
141            }));
142        }
143    };
144
145    // Process files and collect subdirectories
146    let mut subdirs = Vec::new();
147
148    for entry in entries {
149        let entry_path = entry.path();
150        match entry.metadata() {
151            Ok(meta) => {
152                if meta.is_file() {
153                    total_size += get_block_size(&meta);
154                    file_count += 1;
155                } else if meta.is_dir() {
156                    subdirs.push(entry_path);
157                }
158            }
159            Err(err) => {
160                // Log warning for metadata read errors but continue scanning
161                eprintln!(
162                    "Warning: Failed to read metadata for '{}': {}",
163                    entry_path.display(),
164                    err
165                );
166            }
167        }
168    }
169
170    // Process subdirectories in parallel if we have multiple
171    if subdirs.len() > 1 {
172        let results: Vec<_> = subdirs
173            .par_iter()
174            .filter_map(|entry_path| {
175                let child_cache = cache.and_then(|c| c.children.get(entry_path));
176                match scan_directory(entry_path, child_cache) {
177                    Ok(stat) => Some(stat),
178                    Err(err) => {
179                        // Log warning for subdirectory scan errors but continue
180                        eprintln!("Warning: Failed to scan subdirectory: {}", err);
181                        None
182                    }
183                }
184            })
185            .collect();
186
187        for child_stat in results {
188            total_size += child_stat.total_size;
189            file_count += child_stat.file_count;
190            children.insert(child_stat.path.clone(), child_stat);
191        }
192    } else {
193        // Sequential processing for single subdirectory
194        for entry_path in subdirs {
195            let child_cache = cache.and_then(|c| c.children.get(&entry_path));
196            match scan_directory(&entry_path, child_cache) {
197                Ok(child_stat) => {
198                    total_size += child_stat.total_size;
199                    file_count += child_stat.file_count;
200                    children.insert(entry_path, child_stat);
201                }
202                Err(err) => {
203                    // Log warning for subdirectory scan errors but continue
204                    eprintln!("Warning: Failed to scan subdirectory: {}", err);
205                }
206            }
207        }
208    }
209
210    Ok(DirStat {
211        path: path.to_path_buf(),
212        total_size,
213        file_count,
214        last_scan: SystemTime::now(),
215        children,
216    })
217}
218
219/// Count files in a directory recursively (without using cache)
220pub fn count_files(path: &Path) -> io::Result<u64> {
221    let mut count = 0;
222
223    let entries = fs::read_dir(path).map_err(|err| {
224        io::Error::from(DiskUseError::ScanError {
225            path: path.to_path_buf(),
226            source: err,
227        })
228    })?;
229
230    for entry in entries {
231        let entry = match entry {
232            Ok(e) => e,
233            Err(err) => {
234                eprintln!(
235                    "Warning: Failed to read directory entry in '{}': {}",
236                    path.display(),
237                    err
238                );
239                continue;
240            }
241        };
242
243        let meta = match entry.metadata() {
244            Ok(m) => m,
245            Err(err) => {
246                eprintln!(
247                    "Warning: Failed to read metadata for '{}': {}",
248                    entry.path().display(),
249                    err
250                );
251                continue;
252            }
253        };
254
255        if meta.is_file() {
256            count += 1;
257        } else if meta.is_dir() {
258            match count_files(&entry.path()) {
259                Ok(subcount) => count += subcount,
260                Err(err) => {
261                    eprintln!("Warning: Failed to count files in subdirectory: {}", err);
262                }
263            }
264        }
265    }
266
267    Ok(count)
268}
269
#[cfg(test)]
mod tests {
    use super::*;
    use std::thread::sleep;
    use std::time::Duration;
    use tempfile::TempDir;

    /// Pause inserted between a scan and a subsequent filesystem change.
    ///
    /// Directory mtimes may be stamped from a clock that is coarser than
    /// `SystemTime::now()`, so a change made immediately after a scan could
    /// carry an mtime that is not later than the recorded `last_scan`. The
    /// pause leaves headroom for that.
    const MTIME_SETTLE: Duration = Duration::from_millis(50);

    /// Create a fresh `<tmp>/test` directory.
    ///
    /// The returned `TempDir` guard must be kept alive for the duration of
    /// the test; dropping it removes the directory tree.
    fn new_test_dir() -> io::Result<(TempDir, PathBuf)> {
        let temp_dir = TempDir::new()?;
        let test_dir = temp_dir.path().join("test");
        fs::create_dir(&test_dir)?;
        Ok((temp_dir, test_dir))
    }

    /// Populate `base` with five files spread over three directories
    /// (71 logical bytes in total).
    fn create_test_structure(base: &Path) -> io::Result<()> {
        fs::create_dir_all(base.join("subdir1"))?;
        fs::create_dir_all(base.join("subdir2/nested"))?;

        fs::write(base.join("file1.txt"), "Hello World")?; // 11 bytes
        fs::write(base.join("file2.txt"), "Test content")?; // 12 bytes
        fs::write(base.join("subdir1/nested_file.txt"), "Nested content here")?; // 19 bytes
        fs::write(base.join("subdir2/another.txt"), "More content")?; // 12 bytes
        fs::write(base.join("subdir2/nested/deep.txt"), "Deep file content")?; // 17 bytes

        Ok(())
    }

    #[test]
    fn test_scan_directory() -> io::Result<()> {
        // An uncached scan must report every file, a size that is at least
        // the logical size (block rounding can only add to it), and one
        // child entry per immediate subdirectory.
        let (_guard, test_dir) = new_test_dir()?;
        create_test_structure(&test_dir)?;

        let result = scan_directory(&test_dir, None)?;

        // Expected total: 11 + 12 + 19 + 12 + 17 = 71 bytes (logical)
        // With block size, it will be larger.
        assert!(result.total_size() >= 71);
        assert_eq!(result.file_count(), 5);
        assert_eq!(result.children.len(), 2); // subdir1 and subdir2

        Ok(())
    }

    #[test]
    fn test_count_files() -> io::Result<()> {
        // `count_files` must find every file in the tree without any cache.
        let (_guard, test_dir) = new_test_dir()?;
        create_test_structure(&test_dir)?;

        assert_eq!(count_files(&test_dir)?, 5);

        Ok(())
    }

    #[test]
    fn test_scan_with_cache() -> io::Result<()> {
        // A second scan over an untouched tree must hand back the cached
        // result, which is observable through an unchanged `last_scan`.
        let (_guard, test_dir) = new_test_dir()?;
        create_test_structure(&test_dir)?;

        // First scan without cache
        let stats1 = scan_directory(&test_dir, None)?;

        // Second scan with cache (should reuse since nothing changed)
        let stats2 = scan_directory(&test_dir, Some(&stats1))?;

        assert_eq!(stats1.last_scan(), stats2.last_scan());

        Ok(())
    }

    #[test]
    fn test_detects_new_nested_subdirectory() -> io::Result<()> {
        // A directory added below the root changes only its parent's mtime,
        // not the root's. The cached scan must still notice it, rescan, and
        // produce a newer `last_scan`.
        let (_guard, test_dir) = new_test_dir()?;

        // Create initial structure: test/a/
        fs::create_dir(test_dir.join("a"))?;
        fs::write(test_dir.join("a/file1.txt"), "content")?;

        // First scan
        let stats1 = scan_directory(&test_dir, None)?;
        assert_eq!(stats1.file_count(), 1);

        sleep(MTIME_SETTLE);

        // Now create test/a/b/ (this updates a's mtime but NOT test's mtime)
        fs::create_dir(test_dir.join("a/b"))?;
        fs::write(test_dir.join("a/b/file2.txt"), "new content")?;

        // Second scan with cache - should detect the new subdirectory
        let stats2 = scan_directory(&test_dir, Some(&stats1))?;

        assert_eq!(stats2.file_count(), 2);
        assert!(
            stats2.last_scan() > stats1.last_scan(),
            "Should have rescanned since new subdirectory was added"
        );

        Ok(())
    }

    #[test]
    fn test_detects_deleted_subdirectory() -> io::Result<()> {
        // Removing a subdirectory must trigger a rescan that reports the
        // reduced file count and a newer `last_scan`.
        let (_guard, test_dir) = new_test_dir()?;

        // Create initial structure
        fs::create_dir(test_dir.join("a"))?;
        fs::create_dir(test_dir.join("b"))?;
        fs::write(test_dir.join("a/file1.txt"), "content")?;
        fs::write(test_dir.join("b/file2.txt"), "content")?;

        // First scan
        let stats1 = scan_directory(&test_dir, None)?;
        assert_eq!(stats1.file_count(), 2);

        sleep(MTIME_SETTLE);

        // Delete subdirectory b
        fs::remove_file(test_dir.join("b/file2.txt"))?;
        fs::remove_dir(test_dir.join("b"))?;

        // Second scan with cache - should detect the deleted subdirectory
        let stats2 = scan_directory(&test_dir, Some(&stats1))?;

        assert_eq!(stats2.file_count(), 1);
        assert!(
            stats2.last_scan() > stats1.last_scan(),
            "Should have rescanned since subdirectory was deleted"
        );

        Ok(())
    }

    #[test]
    fn test_prunes_deeply_nested_deleted_directory() -> io::Result<()> {
        // Deleting a directory several levels down must both lower the
        // counts and remove the stale node from the returned tree.
        let (_guard, test_dir) = new_test_dir()?;

        // Create deeply nested structure: test/a/b/c/d/
        fs::create_dir_all(test_dir.join("a/b/c/d"))?;
        fs::write(test_dir.join("a/file1.txt"), "content1")?;
        fs::write(test_dir.join("a/b/file2.txt"), "content2")?;
        fs::write(test_dir.join("a/b/c/file3.txt"), "content3")?;
        fs::write(test_dir.join("a/b/c/d/file4.txt"), "content4")?;

        // First scan
        let stats1 = scan_directory(&test_dir, None)?;
        assert_eq!(stats1.file_count(), 4);

        sleep(MTIME_SETTLE);

        // Delete deeply nested directory c (and its child d)
        fs::remove_file(test_dir.join("a/b/c/d/file4.txt"))?;
        fs::remove_dir(test_dir.join("a/b/c/d"))?;
        fs::remove_file(test_dir.join("a/b/c/file3.txt"))?;
        fs::remove_dir(test_dir.join("a/b/c"))?;

        // Second scan with cache - should prune deleted dirs and update counts
        let stats2 = scan_directory(&test_dir, Some(&stats1))?;

        // Should have only 2 files now (file1.txt and file2.txt)
        assert_eq!(stats2.file_count(), 2);

        // Verify cache structure is updated (b should exist, but c and d should be gone)
        let a_stats = stats2.children.get(&test_dir.join("a")).unwrap();
        let b_stats = a_stats.children.get(&test_dir.join("a/b")).unwrap();
        assert!(
            !b_stats.children.contains_key(&test_dir.join("a/b/c")),
            "Deleted directory c should be pruned from cache"
        );

        Ok(())
    }

    #[test]
    fn test_scan_nonexistent_path() {
        // Scanning a path that does not exist must fail with a descriptive error.
        let result = scan_directory(Path::new("/nonexistent/path/that/does/not/exist"), None);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(
            err.to_string().contains("Failed to scan directory"),
            "Error message should be descriptive"
        );
    }

    #[test]
    fn test_count_files_with_inaccessible_subdirectory() -> io::Result<()> {
        // `count_files` must keep going when a subdirectory cannot be listed:
        // the files it can reach are still counted and no error is returned.
        let (_guard, test_dir) = new_test_dir()?;

        fs::write(test_dir.join("file1.txt"), "content")?;
        let subdir = test_dir.join("subdir");
        fs::create_dir(&subdir)?;
        fs::write(subdir.join("file2.txt"), "content")?;

        // Baseline: everything is readable.
        assert_eq!(count_files(&test_dir)?, 2);

        #[cfg(unix)]
        {
            use std::os::unix::fs::PermissionsExt;

            fs::set_permissions(&subdir, fs::Permissions::from_mode(0o000))?;
            // A privileged user can still list a mode-000 directory, so probe
            // rather than assume the directory is now unreadable.
            let still_readable = fs::read_dir(&subdir).is_ok();
            let outcome = count_files(&test_dir);
            // Restore access before asserting so the temp dir can always be
            // cleaned up, even if the assertion below fails.
            fs::set_permissions(&subdir, fs::Permissions::from_mode(0o755))?;

            let expected = if still_readable { 2 } else { 1 };
            assert_eq!(outcome?, expected);
        }

        Ok(())
    }
}