acme_disk_use/
scanner.rs

1//! Directory scanning module for calculating disk usage statistics
2
3use rayon::prelude::*;
4use serde::{Deserialize, Serialize};
5use std::{
6    collections::HashMap,
7    fs, io,
8    path::{Path, PathBuf},
9    time::SystemTime,
10};
11
12#[cfg(unix)]
13use std::os::unix::fs::MetadataExt;
14
15/// Get the physical size of a file on disk in bytes
16///
17/// On Unix systems, this uses the `blocks` metadata field multiplied by 512
18/// to get the actual disk usage, which accounts for sparse files and block alignment.
19/// On non-Unix systems, it falls back to the logical file size.
20fn get_block_size(meta: &fs::Metadata) -> u64 {
21    #[cfg(unix)]
22    {
23        meta.blocks() * 512
24    }
25    #[cfg(not(unix))]
26    {
27        meta.len()
28    }
29}
30
31/// Statistics for a directory and its contents
32#[derive(Serialize, Deserialize, Debug, Clone)]
33pub struct DirStat {
34    pub(crate) path: PathBuf,                       // Directory path
35    pub(crate) total_size: u64,                     // Logical sum of st_size of all files
36    pub(crate) file_count: u64, // Number of files in this directory and subdirectories
37    pub(crate) last_scan: SystemTime, // When this subtree was last scanned
38    pub(crate) children: HashMap<PathBuf, DirStat>, // Child directories' stats
39}
40
41impl DirStat {
42    /// Get the total size of this directory
43    pub fn total_size(&self) -> u64 {
44        self.total_size
45    }
46
47    /// Get the file count in this directory
48    pub fn file_count(&self) -> u64 {
49        self.file_count
50    }
51
52    /// Get the last scan time
53    pub fn last_scan(&self) -> SystemTime {
54        self.last_scan
55    }
56
57    /// Get the path of this directory
58    pub fn path(&self) -> &Path {
59        &self.path
60    }
61}
62
63/// Check if a directory or any of its subdirectories have been modified
64///
65/// Uses a recursive mtime comparison approach:
66/// 1. Check if directory's own mtime > last_scan (files/dirs added/removed)
67/// 2. Recursively validate cached subdirectories
68///
69/// If files were added or removed in subdirectories, the
70/// directory mtime would have been updated by the OS.
71fn dir_changed_since_last_scan(path: &Path, cached: &DirStat) -> bool {
72    // Check if the directory itself was modified
73    match fs::metadata(path).and_then(|m| m.modified()) {
74        Ok(mtime) => {
75            if mtime > cached.last_scan {
76                return true;
77            }
78        }
79        Err(_) => return true, // If we can't stat it, assume it changed (or is gone/inaccessible)
80    }
81
82    // If directory mtime hasn't changed, we assume no files were added/removed
83    // at this level. However, subdirectories might have changed internally
84    // without updating the parent's mtime.
85    // Parallelize the check for children
86    cached
87        .children
88        .par_iter()
89        .any(|(child_path, child_stat)| dir_changed_since_last_scan(child_path, child_stat))
90}
91
92/// Scan a directory recursively and return statistics
93///
94/// # Arguments
95/// * `path` - The directory path to scan
96/// * `cache` - Optional cached statistics for this directory
97///
98/// # Returns
99/// Directory statistics including size, file count, and child directories
100pub fn scan_directory(path: &Path, cache: Option<&DirStat>) -> io::Result<DirStat> {
101    // If cache exists, check if rescan needed BEFORE cloning
102    if let Some(cached) = cache {
103        // If directory hasn't changed, return the cached version
104        // This avoids cloning if we are going to discard it anyway
105        if !dir_changed_since_last_scan(path, cached) {
106            return Ok(cached.clone());
107        }
108    }
109
110    let mut total_size = 0;
111    let mut file_count = 0;
112    let mut children = HashMap::new();
113
114    // Collect entries first for potential parallel processing
115    let entries: Vec<_> = fs::read_dir(path)?.filter_map(|e| e.ok()).collect();
116
117    // Process files and collect subdirectories
118    let mut subdirs = Vec::new();
119
120    for entry in entries {
121        let entry_path = entry.path();
122        if let Ok(meta) = entry.metadata() {
123            if meta.is_file() {
124                total_size += get_block_size(&meta);
125                file_count += 1;
126            } else if meta.is_dir() {
127                subdirs.push(entry_path);
128            }
129        }
130    }
131
132    // Process subdirectories in parallel if we have multiple
133    if subdirs.len() > 1 {
134        let results: Vec<_> = subdirs
135            .par_iter()
136            .filter_map(|entry_path| {
137                let child_cache = cache.and_then(|c| c.children.get(entry_path));
138                scan_directory(entry_path, child_cache).ok()
139            })
140            .collect();
141
142        for child_stat in results {
143            total_size += child_stat.total_size;
144            file_count += child_stat.file_count;
145            children.insert(child_stat.path.clone(), child_stat);
146        }
147    } else {
148        // Sequential processing for single subdirectory
149        for entry_path in subdirs {
150            let child_cache = cache.and_then(|c| c.children.get(&entry_path));
151            if let Ok(child_stat) = scan_directory(&entry_path, child_cache) {
152                total_size += child_stat.total_size;
153                file_count += child_stat.file_count;
154                children.insert(entry_path, child_stat);
155            }
156        }
157    }
158
159    Ok(DirStat {
160        path: path.to_path_buf(),
161        total_size,
162        file_count,
163        last_scan: SystemTime::now(),
164        children,
165    })
166}
167
168/// Count files in a directory recursively (without using cache)
169pub fn count_files(path: &Path) -> io::Result<u64> {
170    let mut count = 0;
171
172    for entry in fs::read_dir(path)? {
173        let entry = entry?;
174        let meta = entry.metadata()?;
175
176        if meta.is_file() {
177            count += 1;
178        } else if meta.is_dir() {
179            count += count_files(&entry.path())?;
180        }
181    }
182
183    Ok(count)
184}
185
186#[cfg(test)]
187mod tests {
188    use super::*;
189    use tempfile::TempDir;
190
191    fn create_test_structure(base: &Path) -> io::Result<()> {
192        fs::create_dir_all(base.join("subdir1"))?;
193        fs::create_dir_all(base.join("subdir2/nested"))?;
194
195        fs::write(base.join("file1.txt"), "Hello World")?; // 11 bytes
196        fs::write(base.join("file2.txt"), "Test content")?; // 12 bytes
197        fs::write(base.join("subdir1/nested_file.txt"), "Nested content here")?; // 19 bytes
198        fs::write(base.join("subdir2/another.txt"), "More content")?; // 12 bytes
199        fs::write(base.join("subdir2/nested/deep.txt"), "Deep file content")?; // 17 bytes
200
201        Ok(())
202    }
203
204    #[test]
205    fn test_scan_directory() -> io::Result<()> {
206        // This test verifies that `scan_directory` correctly calculates the total size
207        // and file count of a directory structure. It checks if the calculated size
208        // is at least the logical size (accounting for block overhead) and if the
209        // file count and subdirectory count match the expected values.
210        let temp_dir = TempDir::new()?;
211        let test_dir = temp_dir.path().join("test");
212        fs::create_dir(&test_dir)?;
213
214        create_test_structure(&test_dir)?;
215
216        let result = scan_directory(&test_dir, None)?;
217
218        // Expected total: 11 + 12 + 19 + 12 + 17 = 71 bytes (logical)
219        // With block size, it will be larger.
220        assert!(result.total_size() >= 71);
221        assert_eq!(result.file_count(), 5);
222        assert_eq!(result.children.len(), 2); // subdir1 and subdir2
223
224        Ok(())
225    }
226
227    #[test]
228    fn test_count_files() -> io::Result<()> {
229        // This test verifies that `count_files` correctly counts the total number
230        // of files in a directory tree recursively, without using any cache.
231        let temp_dir = TempDir::new()?;
232        let test_dir = temp_dir.path().join("test");
233        fs::create_dir(&test_dir)?;
234
235        create_test_structure(&test_dir)?;
236
237        let count = count_files(&test_dir)?;
238        assert_eq!(count, 5);
239
240        Ok(())
241    }
242
243    #[test]
244    fn test_scan_with_cache() -> io::Result<()> {
245        // This test verifies that the caching mechanism works correctly.
246        // It performs an initial scan, then a second scan with the cache.
247        // It asserts that the second scan reuses the cached result (indicated by
248        // the same `last_scan` timestamp) since the directory hasn't changed.
249        let temp_dir = TempDir::new()?;
250        let test_dir = temp_dir.path().join("test");
251        fs::create_dir(&test_dir)?;
252
253        create_test_structure(&test_dir)?;
254
255        // First scan without cache
256        let stats1 = scan_directory(&test_dir, None)?;
257        let scan_time1 = stats1.last_scan();
258
259        // Second scan with cache (should reuse if directory hasn't changed)
260        let stats2 = scan_directory(&test_dir, Some(&stats1))?;
261        let scan_time2 = stats2.last_scan();
262
263        // Since directory hasn't changed, should return cached stats with same timestamp
264        assert_eq!(scan_time1, scan_time2);
265
266        Ok(())
267    }
268
269    #[test]
270    fn test_detects_new_nested_subdirectory() -> io::Result<()> {
271        // This test ensures that the scanner detects changes deep in the directory tree.
272        // It creates a structure, scans it, then adds a new nested subdirectory and file.
273        // It verifies that the subsequent scan detects the new file and updates the
274        // `last_scan` timestamp, indicating a re-scan occurred.
275        use std::thread::sleep;
276        use std::time::Duration;
277
278        let temp_dir = TempDir::new()?;
279        let test_dir = temp_dir.path().join("test");
280        fs::create_dir(&test_dir)?;
281
282        // Create initial structure: test/a/
283        fs::create_dir(test_dir.join("a"))?;
284        fs::write(test_dir.join("a/file1.txt"), "content")?;
285
286        // First scan
287        let stats1 = scan_directory(&test_dir, None)?;
288        assert_eq!(stats1.file_count(), 1);
289
290        // Wait a moment to ensure time difference
291        sleep(Duration::from_millis(10));
292
293        // Now create test/a/b/ (this updates a's mtime but NOT test's mtime)
294        fs::create_dir(test_dir.join("a/b"))?;
295        fs::write(test_dir.join("a/b/file2.txt"), "new content")?;
296
297        // Second scan with cache - should detect the new subdirectory
298        let stats2 = scan_directory(&test_dir, Some(&stats1))?;
299
300        // Should have scanned and found the new file
301        assert_eq!(stats2.file_count(), 2);
302        assert!(
303            stats2.last_scan() > stats1.last_scan(),
304            "Should have rescanned since new subdirectory was added"
305        );
306
307        Ok(())
308    }
309
310    #[test]
311    fn test_detects_deleted_subdirectory() -> io::Result<()> {
312        // This test ensures that the scanner detects when a subdirectory is deleted.
313        // It creates a structure, scans it, then deletes a subdirectory.
314        // It verifies that the subsequent scan correctly reports the reduced file count
315        // and updates the `last_scan` timestamp.
316        use std::thread::sleep;
317        use std::time::Duration;
318
319        let temp_dir = TempDir::new()?;
320        let test_dir = temp_dir.path().join("test");
321        fs::create_dir(&test_dir)?;
322
323        // Create initial structure
324        fs::create_dir(test_dir.join("a"))?;
325        fs::create_dir(test_dir.join("b"))?;
326        fs::write(test_dir.join("a/file1.txt"), "content")?;
327        fs::write(test_dir.join("b/file2.txt"), "content")?;
328
329        // First scan
330        let stats1 = scan_directory(&test_dir, None)?;
331        assert_eq!(stats1.file_count(), 2);
332
333        // Wait a moment
334        sleep(Duration::from_millis(10));
335
336        // Delete subdirectory b
337        fs::remove_file(test_dir.join("b/file2.txt"))?;
338        fs::remove_dir(test_dir.join("b"))?;
339
340        // Second scan with cache - should detect the deleted subdirectory
341        let stats2 = scan_directory(&test_dir, Some(&stats1))?;
342
343        // Should have rescanned and found only 1 file now
344        assert_eq!(stats2.file_count(), 1);
345        assert!(
346            stats2.last_scan() > stats1.last_scan(),
347            "Should have rescanned since subdirectory was deleted"
348        );
349
350        Ok(())
351    }
352
353    #[test]
354    fn test_prunes_deeply_nested_deleted_directory() -> io::Result<()> {
355        // This test verifies that the scanner correctly handles the deletion of
356        // deeply nested directories. It creates a deep structure, scans it,
357        // then deletes a middle part of the tree. It checks if the cache is
358        // correctly updated to reflect the removal of the nested structure.
359        use std::thread::sleep;
360        use std::time::Duration;
361
362        let temp_dir = TempDir::new()?;
363        let test_dir = temp_dir.path().join("test");
364        fs::create_dir(&test_dir)?;
365
366        // Create deeply nested structure: test/a/b/c/d/
367        fs::create_dir_all(test_dir.join("a/b/c/d"))?;
368        fs::write(test_dir.join("a/file1.txt"), "content1")?;
369        fs::write(test_dir.join("a/b/file2.txt"), "content2")?;
370        fs::write(test_dir.join("a/b/c/file3.txt"), "content3")?;
371        fs::write(test_dir.join("a/b/c/d/file4.txt"), "content4")?;
372
373        // First scan
374        let stats1 = scan_directory(&test_dir, None)?;
375        assert_eq!(stats1.file_count(), 4);
376
377        // Wait a moment
378        sleep(Duration::from_millis(10));
379
380        // Delete deeply nested directory c (and its child d)
381        fs::remove_file(test_dir.join("a/b/c/d/file4.txt"))?;
382        fs::remove_dir(test_dir.join("a/b/c/d"))?;
383        fs::remove_file(test_dir.join("a/b/c/file3.txt"))?;
384        fs::remove_dir(test_dir.join("a/b/c"))?;
385
386        // Second scan with cache - should prune deleted dirs and update counts
387        let stats2 = scan_directory(&test_dir, Some(&stats1))?;
388
389        // Should have only 2 files now (file1.txt and file2.txt)
390        assert_eq!(stats2.file_count(), 2);
391
392        // Verify cache structure is updated (b should exist, but c and d should be gone)
393        let a_stats = stats2.children.get(&test_dir.join("a")).unwrap();
394        let b_stats = a_stats.children.get(&test_dir.join("a/b")).unwrap();
395        assert!(
396            !b_stats.children.contains_key(&test_dir.join("a/b/c")),
397            "Deleted directory c should be pruned from cache"
398        );
399
400        Ok(())
401    }
402}