acme_disk_use/
scanner.rs

1//! Directory scanning module for calculating disk usage statistics
2
3use rayon::prelude::*;
4use serde::{Deserialize, Serialize};
5use std::{
6    collections::HashMap,
7    fs, io,
8    path::{Path, PathBuf},
9    time::SystemTime,
10};
11
12#[cfg(unix)]
13use std::os::unix::fs::MetadataExt;
14
15use crate::error::DiskUseError;
16
/// Report how many bytes a file occupies on disk.
///
/// This is the Unix implementation: it multiplies the `blocks` metadata field,
/// which is expressed in 512-byte units, by 512. Sparse files and block
/// rounding are therefore reflected in the result.
#[cfg(unix)]
fn get_block_size(meta: &fs::Metadata) -> u64 {
    // Unit in which `MetadataExt::blocks` reports allocated space.
    const BYTES_PER_BLOCK: u64 = 512;
    meta.blocks() * BYTES_PER_BLOCK
}

/// Report how many bytes a file occupies on disk.
///
/// This is the non-Unix fallback: no block count is read here, so the logical
/// file length is returned instead.
#[cfg(not(unix))]
fn get_block_size(meta: &fs::Metadata) -> u64 {
    meta.len()
}
32
33/// Statistics for a directory and its contents
34#[derive(Serialize, Deserialize, Debug, Clone)]
35pub struct DirStat {
36    pub(crate) path: PathBuf,                       // Directory path
37    pub(crate) total_size: u64,                     // Logical sum of st_size of all files
38    pub(crate) file_count: u64, // Number of files in this directory and subdirectories
39    pub(crate) last_scan: SystemTime, // When this subtree was last scanned
40    pub(crate) children: HashMap<PathBuf, DirStat>, // Child directories' stats
41}
42
43impl DirStat {
44    /// Get the total size of this directory
45    pub fn total_size(&self) -> u64 {
46        self.total_size
47    }
48
49    /// Get the file count in this directory
50    pub fn file_count(&self) -> u64 {
51        self.file_count
52    }
53
54    /// Get the last scan time
55    pub fn last_scan(&self) -> SystemTime {
56        self.last_scan
57    }
58
59    /// Get the path of this directory
60    pub fn path(&self) -> &Path {
61        &self.path
62    }
63}
64
65/// Check if a directory or any of its subdirectories have been modified
66///
67/// Uses a recursive mtime comparison approach:
68/// 1. Check if directory's own mtime > last_scan (files/dirs added/removed)
69/// 2. Recursively validate cached subdirectories
70///
71/// If files were added or removed in subdirectories, the
72/// directory mtime would have been updated by the OS.
73fn dir_changed_since_last_scan(path: &Path, cached: &DirStat) -> bool {
74    // Check if the directory itself was modified
75    match fs::metadata(path).and_then(|m| m.modified()) {
76        Ok(mtime) => {
77            if mtime > cached.last_scan {
78                return true;
79            }
80        }
81        Err(_) => return true, // If we can't stat it, assume it changed (or is gone/inaccessible)
82    }
83
84    // If directory mtime hasn't changed, we assume no files were added/removed
85    // at this level. However, subdirectories might have changed internally
86    // without updating the parent's mtime.
87    // Parallelize the check for children
88    cached
89        .children
90        .par_iter()
91        .any(|(child_path, child_stat)| dir_changed_since_last_scan(child_path, child_stat))
92}
93
94/// Scan a directory recursively and return statistics
95///
96/// # Arguments
97/// * `path` - The directory path to scan
98/// * `cache` - Optional cached statistics for this directory
99///
100/// # Returns
101/// Directory statistics including size, file count, and child directories
102pub fn scan_directory(path: &Path, cache: Option<&DirStat>) -> io::Result<DirStat> {
103    // If cache exists, check if rescan needed BEFORE cloning
104    if let Some(cached) = cache {
105        // If directory hasn't changed, return the cached version
106        // This avoids cloning if we are going to discard it anyway
107        if !dir_changed_since_last_scan(path, cached) {
108            return Ok(cached.clone());
109        }
110    }
111
112    let mut total_size = 0;
113    let mut file_count = 0;
114    let mut children = HashMap::new();
115
116    // Collect entries first for potential parallel processing
117    let entries: Vec<_> = match fs::read_dir(path) {
118        Ok(entries) => entries
119            .filter_map(|e| match e {
120                Ok(entry) => Some(entry),
121                Err(err) => {
122                    // Log warning for individual entry read errors but continue
123                    eprintln!(
124                        "Warning: Failed to read directory entry in '{}': {}",
125                        path.display(),
126                        err
127                    );
128                    None
129                }
130            })
131            .collect(),
132        Err(err) => {
133            return Err(io::Error::from(DiskUseError::ScanError {
134                path: path.to_path_buf(),
135                source: err,
136            }));
137        }
138    };
139
140    // Process files and collect subdirectories
141    let mut subdirs = Vec::new();
142
143    for entry in entries {
144        let entry_path = entry.path();
145        match entry.metadata() {
146            Ok(meta) => {
147                if meta.is_file() {
148                    total_size += get_block_size(&meta);
149                    file_count += 1;
150                } else if meta.is_dir() {
151                    subdirs.push(entry_path);
152                }
153            }
154            Err(err) => {
155                // Log warning for metadata read errors but continue scanning
156                eprintln!(
157                    "Warning: Failed to read metadata for '{}': {}",
158                    entry_path.display(),
159                    err
160                );
161            }
162        }
163    }
164
165    // Process subdirectories in parallel if we have multiple
166    if subdirs.len() > 1 {
167        let results: Vec<_> = subdirs
168            .par_iter()
169            .filter_map(|entry_path| {
170                let child_cache = cache.and_then(|c| c.children.get(entry_path));
171                match scan_directory(entry_path, child_cache) {
172                    Ok(stat) => Some(stat),
173                    Err(err) => {
174                        // Log warning for subdirectory scan errors but continue
175                        eprintln!("Warning: Failed to scan subdirectory: {}", err);
176                        None
177                    }
178                }
179            })
180            .collect();
181
182        for child_stat in results {
183            total_size += child_stat.total_size;
184            file_count += child_stat.file_count;
185            children.insert(child_stat.path.clone(), child_stat);
186        }
187    } else {
188        // Sequential processing for single subdirectory
189        for entry_path in subdirs {
190            let child_cache = cache.and_then(|c| c.children.get(&entry_path));
191            match scan_directory(&entry_path, child_cache) {
192                Ok(child_stat) => {
193                    total_size += child_stat.total_size;
194                    file_count += child_stat.file_count;
195                    children.insert(entry_path, child_stat);
196                }
197                Err(err) => {
198                    // Log warning for subdirectory scan errors but continue
199                    eprintln!("Warning: Failed to scan subdirectory: {}", err);
200                }
201            }
202        }
203    }
204
205    Ok(DirStat {
206        path: path.to_path_buf(),
207        total_size,
208        file_count,
209        last_scan: SystemTime::now(),
210        children,
211    })
212}
213
214/// Count files in a directory recursively (without using cache)
215pub fn count_files(path: &Path) -> io::Result<u64> {
216    let mut count = 0;
217
218    let entries = fs::read_dir(path).map_err(|err| {
219        io::Error::from(DiskUseError::ScanError {
220            path: path.to_path_buf(),
221            source: err,
222        })
223    })?;
224
225    for entry in entries {
226        let entry = match entry {
227            Ok(e) => e,
228            Err(err) => {
229                eprintln!(
230                    "Warning: Failed to read directory entry in '{}': {}",
231                    path.display(),
232                    err
233                );
234                continue;
235            }
236        };
237
238        let meta = match entry.metadata() {
239            Ok(m) => m,
240            Err(err) => {
241                eprintln!(
242                    "Warning: Failed to read metadata for '{}': {}",
243                    entry.path().display(),
244                    err
245                );
246                continue;
247            }
248        };
249
250        if meta.is_file() {
251            count += 1;
252        } else if meta.is_dir() {
253            match count_files(&entry.path()) {
254                Ok(subcount) => count += subcount,
255                Err(err) => {
256                    eprintln!("Warning: Failed to count files in subdirectory: {}", err);
257                }
258            }
259        }
260    }
261
262    Ok(count)
263}
264
#[cfg(test)]
mod tests {
    use super::*;
    use std::thread::sleep;
    use std::time::Duration;
    use tempfile::TempDir;

    /// Creates an empty `test` directory inside a fresh temporary directory.
    ///
    /// The `TempDir` guard is returned together with the path: the tree is
    /// removed when the guard is dropped, so callers must keep it alive for the
    /// duration of the test.
    fn new_test_dir() -> io::Result<(TempDir, PathBuf)> {
        let temp_dir = TempDir::new()?;
        let test_dir = temp_dir.path().join("test");
        fs::create_dir(&test_dir)?;
        Ok((temp_dir, test_dir))
    }

    /// Pauses briefly so that filesystem changes made afterwards receive an
    /// mtime later than the `last_scan` of a scan performed before the pause.
    fn wait_for_mtime_tick() {
        sleep(Duration::from_millis(10));
    }

    /// Populates `base` with two subdirectories (one of them nested) and five
    /// small files whose logical sizes add up to 71 bytes.
    fn create_test_structure(base: &Path) -> io::Result<()> {
        fs::create_dir_all(base.join("subdir1"))?;
        fs::create_dir_all(base.join("subdir2/nested"))?;

        let files = [
            ("file1.txt", "Hello World"),                        // 11 bytes
            ("file2.txt", "Test content"),                       // 12 bytes
            ("subdir1/nested_file.txt", "Nested content here"),  // 19 bytes
            ("subdir2/another.txt", "More content"),             // 12 bytes
            ("subdir2/nested/deep.txt", "Deep file content"),    // 17 bytes
        ];
        for &(relative, contents) in &files {
            fs::write(base.join(relative), contents)?;
        }

        Ok(())
    }

    #[test]
    fn test_scan_directory() -> io::Result<()> {
        // Verifies that `scan_directory` reports the size, file count and
        // direct child directories of a small tree.
        let (_temp_dir, test_dir) = new_test_dir()?;
        create_test_structure(&test_dir)?;

        let result = scan_directory(&test_dir, None)?;

        // Logical total is 11 + 12 + 19 + 12 + 17 = 71 bytes; the reported size
        // is based on allocated blocks and is expected to be at least that.
        assert!(result.total_size() >= 71);
        assert_eq!(result.file_count(), 5);
        assert_eq!(result.children.len(), 2); // subdir1 and subdir2

        Ok(())
    }

    #[test]
    fn test_count_files() -> io::Result<()> {
        // Verifies that `count_files` counts every file in the tree
        // recursively, without using any cache.
        let (_temp_dir, test_dir) = new_test_dir()?;
        create_test_structure(&test_dir)?;

        assert_eq!(count_files(&test_dir)?, 5);

        Ok(())
    }

    #[test]
    fn test_scan_with_cache() -> io::Result<()> {
        // Verifies that a second scan of an untouched tree hands back the
        // cached result, which shows up as an identical `last_scan` timestamp.
        let (_temp_dir, test_dir) = new_test_dir()?;
        create_test_structure(&test_dir)?;

        let first = scan_directory(&test_dir, None)?;
        let second = scan_directory(&test_dir, Some(&first))?;

        assert_eq!(first.last_scan(), second.last_scan());

        Ok(())
    }

    #[test]
    fn test_detects_new_nested_subdirectory() -> io::Result<()> {
        // Verifies that a change deep in the tree is noticed even though the
        // root directory's own mtime is not affected by it.
        let (_temp_dir, test_dir) = new_test_dir()?;

        // Initial structure: test/a/file1.txt
        fs::create_dir(test_dir.join("a"))?;
        fs::write(test_dir.join("a/file1.txt"), "content")?;

        let stats1 = scan_directory(&test_dir, None)?;
        assert_eq!(stats1.file_count(), 1);

        wait_for_mtime_tick();

        // Creating test/a/b/ updates a's mtime but NOT test's mtime.
        fs::create_dir(test_dir.join("a/b"))?;
        fs::write(test_dir.join("a/b/file2.txt"), "new content")?;

        let stats2 = scan_directory(&test_dir, Some(&stats1))?;

        assert_eq!(stats2.file_count(), 2);
        assert!(
            stats2.last_scan() > stats1.last_scan(),
            "Should have rescanned since new subdirectory was added"
        );

        Ok(())
    }

    #[test]
    fn test_detects_deleted_subdirectory() -> io::Result<()> {
        // Verifies that removing a subdirectory triggers a rescan and lowers
        // the reported file count.
        let (_temp_dir, test_dir) = new_test_dir()?;

        fs::create_dir(test_dir.join("a"))?;
        fs::create_dir(test_dir.join("b"))?;
        fs::write(test_dir.join("a/file1.txt"), "content")?;
        fs::write(test_dir.join("b/file2.txt"), "content")?;

        let stats1 = scan_directory(&test_dir, None)?;
        assert_eq!(stats1.file_count(), 2);

        wait_for_mtime_tick();

        // Delete subdirectory b together with its only file.
        fs::remove_file(test_dir.join("b/file2.txt"))?;
        fs::remove_dir(test_dir.join("b"))?;

        let stats2 = scan_directory(&test_dir, Some(&stats1))?;

        assert_eq!(stats2.file_count(), 1);
        assert!(
            stats2.last_scan() > stats1.last_scan(),
            "Should have rescanned since subdirectory was deleted"
        );

        Ok(())
    }

    #[test]
    fn test_prunes_deeply_nested_deleted_directory() -> io::Result<()> {
        // Verifies that deleting a directory in the middle of a deep tree
        // removes it (and everything below it) from the refreshed statistics.
        let (_temp_dir, test_dir) = new_test_dir()?;

        // Deep structure: test/a/b/c/d/ with one file per level below test.
        fs::create_dir_all(test_dir.join("a/b/c/d"))?;
        fs::write(test_dir.join("a/file1.txt"), "content1")?;
        fs::write(test_dir.join("a/b/file2.txt"), "content2")?;
        fs::write(test_dir.join("a/b/c/file3.txt"), "content3")?;
        fs::write(test_dir.join("a/b/c/d/file4.txt"), "content4")?;

        let stats1 = scan_directory(&test_dir, None)?;
        assert_eq!(stats1.file_count(), 4);

        wait_for_mtime_tick();

        // Delete directory c and its child d.
        fs::remove_file(test_dir.join("a/b/c/d/file4.txt"))?;
        fs::remove_dir(test_dir.join("a/b/c/d"))?;
        fs::remove_file(test_dir.join("a/b/c/file3.txt"))?;
        fs::remove_dir(test_dir.join("a/b/c"))?;

        let stats2 = scan_directory(&test_dir, Some(&stats1))?;

        // Only file1.txt and file2.txt are left.
        assert_eq!(stats2.file_count(), 2);

        // b must still be present, while c (and with it d) must be gone.
        let a_stats = stats2
            .children
            .get(&test_dir.join("a"))
            .expect("directory a should still be tracked");
        let b_stats = a_stats
            .children
            .get(&test_dir.join("a/b"))
            .expect("directory b should still be tracked");
        assert!(
            !b_stats.children.contains_key(&test_dir.join("a/b/c")),
            "Deleted directory c should be pruned from cache"
        );

        Ok(())
    }

    #[test]
    fn test_scan_nonexistent_path() {
        // Scanning a path that does not exist must fail with a descriptive error.
        let result = scan_directory(Path::new("/nonexistent/path/that/does/not/exist"), None);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(
            err.to_string().contains("Failed to scan directory"),
            "Error message should be descriptive"
        );
    }

    #[test]
    fn test_count_files_with_inaccessible_subdirectory() -> io::Result<()> {
        // Verifies that `count_files` keeps going when a subdirectory cannot be
        // listed: the unreadable subtree is skipped and the rest is counted.
        let (_temp_dir, test_dir) = new_test_dir()?;
        let subdir = test_dir.join("subdir");

        fs::write(test_dir.join("file1.txt"), "content")?;
        fs::create_dir(&subdir)?;
        fs::write(subdir.join("file2.txt"), "content")?;

        // Baseline: with everything readable both files are counted.
        assert_eq!(count_files(&test_dir)?, 2);

        #[cfg(unix)]
        {
            use std::os::unix::fs::PermissionsExt;

            fs::set_permissions(&subdir, fs::Permissions::from_mode(0o000))?;

            // Privileged users (e.g. root) are not stopped by permission bits,
            // so probe whether the directory really became unreadable.
            let still_readable = fs::read_dir(&subdir).is_ok();
            let counted = count_files(&test_dir);

            // Restore access before asserting so the temporary directory can be
            // cleaned up even when the assertion fails.
            fs::set_permissions(&subdir, fs::Permissions::from_mode(0o755))?;

            let expected = if still_readable { 2 } else { 1 };
            assert_eq!(counted?, expected);
        }

        Ok(())
    }
}