Skip to main content

putzen_cli/caches/
scan.rs

1//! Filesystem walk: enumerate seeds → ranked `Cache` entries.
2
3use crate::caches::model::{Cache, TopFile};
4use jwalk::WalkDir;
5use std::cmp::Reverse;
6use std::collections::{BinaryHeap, HashSet};
7use std::path::{Path, PathBuf};
8use std::time::SystemTime;
9
/// Maximum number of files retained in a cache's `top_files` list; once the
/// bounded heap in `stat_dir_with_progress` grows past this, its smallest
/// entry is evicted.
10const TOP_K: usize = 64;
11
12/// Walk a single directory and aggregate its size, newest mtime, and counts.
13/// Symlinks are not followed. Permission errors are silenced.
14pub fn stat_dir(root: &Path) -> Cache {
15    stat_dir_with_progress(root, &mut || {})
16}
17
18/// Same as `stat_dir` but invokes `on_dir` once per directory entry observed
19/// during the walk. The callback is the per-cache progress hook used by the
20/// startup scan to feed `Msg::ScanProgress` to the TUI.
21pub fn stat_dir_with_progress(root: &Path, on_dir: &mut dyn FnMut()) -> Cache {
22    let mut size_bytes = 0u64;
23    let mut newest = None::<SystemTime>;
24    let mut file_count = 0u64;
25    let mut dir_count = 0u64;
26    let mut unreadable = 0u64;
27    let mut heap: BinaryHeap<Reverse<(u64, String, Option<SystemTime>)>> = BinaryHeap::new();
28
29    // skip_hidden(false): cache directories often signal "fresh use" via
30    // dotfiles (`.lock`, `.tmp`, `.index`); excluding them would shift
31    // newest_mtime onto the OLDER visible files and make active caches
32    // look dormant.
33    for entry in WalkDir::new(root)
34        .follow_links(false)
35        .skip_hidden(false)
36        .into_iter()
37        .flatten()
38    {
39        let meta = match entry.metadata() {
40            Ok(m) => m,
41            Err(_) => {
42                unreadable += 1;
43                continue;
44            }
45        };
46        if meta.is_dir() {
47            dir_count += 1;
48            on_dir();
49            continue;
50        }
51        if !meta.is_file() {
52            continue;
53        }
54        file_count += 1;
55        size_bytes += meta.len();
56        let file_mtime = meta.modified().ok();
57        if let Some(m) = file_mtime {
58            newest = Some(newest.map_or(m, |prev| prev.max(m)));
59        }
60        let name = entry.file_name().to_string_lossy().to_string();
61        heap.push(Reverse((meta.len(), name, file_mtime)));
62        if heap.len() > TOP_K {
63            heap.pop();
64        }
65    }
66
67    // dir_count includes `root` itself; subtract.
68    let dir_count = dir_count.saturating_sub(1);
69
70    // Preserve the literal directory name, dotfiles included.  Stripping the
71    // leading '.' looked tidier for the well-known built-in seeds (`.cargo`
72    // → `cargo`) but is misleading for user-supplied `--root` paths and any
73    // hidden folder that just happens to land in the rank table.
74    let label = root
75        .file_name()
76        .map(|s| s.to_string_lossy().to_string())
77        .unwrap_or_default();
78
79    let mut top_files: Vec<TopFile> = heap
80        .into_iter()
81        .map(|Reverse((size, name, mtime))| TopFile {
82            name,
83            size_bytes: size,
84            mtime,
85        })
86        .collect();
87    top_files.sort_by_key(|f| Reverse(f.size_bytes));
88
89    Cache {
90        label,
91        path: root.to_path_buf(),
92        size_bytes,
93        newest_mtime: newest,
94        file_count,
95        dir_count,
96        top_files,
97        unreadable,
98    }
99}
100
101/// Enumerate immediate children of `seed`; each becomes one `Cache`.
102/// Non-existent or non-directory seeds yield an empty vec.
103pub fn enumerate_seed(seed: &Path) -> Vec<Cache> {
104    enumerate_seed_with_progress(seed, &mut || {})
105}
106
107/// Same as `enumerate_seed` but invokes `on_dir` once per directory entry
108/// observed during each child cache's walk.  Drill-in workers use this to
109/// feed `Msg::ScanProgress` to the TUI so the spinner counts folders the
110/// same way the startup scan does.
111pub fn enumerate_seed_with_progress(seed: &Path, on_dir: &mut dyn FnMut()) -> Vec<Cache> {
112    let Ok(read) = std::fs::read_dir(seed) else {
113        return Vec::new();
114    };
115    read.flatten()
116        .filter(|e| e.file_type().map(|t| t.is_dir()).unwrap_or(false))
117        .map(|e| stat_dir_with_progress(&e.path(), on_dir))
118        .collect()
119}
120
121/// Walk every seed and concatenate, de-duplicating by canonicalised absolute
122/// path. Order is preserved (first occurrence wins).
123pub fn collect(seeds: &[PathBuf]) -> Vec<Cache> {
124    collect_with_progress(seeds, &mut || {})
125}
126
127/// Same as `collect` but invokes `on_dir` once per directory entry observed
128/// during every cache's walk. The startup-scan worker uses this to send
129/// `Msg::ScanProgress` updates to the TUI.
130pub fn collect_with_progress(seeds: &[PathBuf], on_dir: &mut dyn FnMut()) -> Vec<Cache> {
131    let mut seen = HashSet::new();
132    let mut out = Vec::new();
133    for s in seeds {
134        let Ok(canonical) = s.canonicalize() else {
135            continue;
136        };
137        let Ok(read) = std::fs::read_dir(&canonical) else {
138            continue;
139        };
140        for entry in read.flatten() {
141            if !entry.file_type().map(|t| t.is_dir()).unwrap_or(false) {
142                continue;
143            }
144            let c = stat_dir_with_progress(&entry.path(), on_dir);
145            let canon = c.path.canonicalize().unwrap_or_else(|_| c.path.clone());
146            if seen.insert(canon) {
147                out.push(c);
148            }
149        }
150    }
151    out
152}
153
#[cfg(test)]
mod tests {
    //! Unit tests for the scan functions, run against throwaway temp
    //! directories. Fixture setup failures (file creation, mtime changes)
    //! panic immediately so that no assertion can pass vacuously.

    use super::*;
    use std::fs;
    use std::time::{Duration, SystemTime};

    /// Set `path`'s mtime to `when`, panicking if the filesystem refuses.
    fn set_mtime(path: &Path, when: SystemTime) {
        filetime::set_file_mtime(path, filetime::FileTime::from_system_time(when))
            .expect("failed to set mtime on test fixture");
    }

    #[test]
    fn stat_empty_dir() {
        let tmp = tempfile::tempdir().unwrap();
        let c = stat_dir(tmp.path());
        assert_eq!(c.size_bytes, 0);
        assert_eq!(c.file_count, 0);
        assert_eq!(c.dir_count, 0);
        assert!(c.newest_mtime.is_none());
    }

    #[test]
    fn stat_sums_sizes_and_counts() {
        let tmp = tempfile::tempdir().unwrap();
        let nested = tmp.path().join("a/b");
        fs::create_dir_all(&nested).unwrap();
        fs::write(tmp.path().join("a/one"), [0u8; 100]).unwrap();
        fs::write(tmp.path().join("a/b/two"), [0u8; 200]).unwrap();

        let c = stat_dir(tmp.path());
        assert_eq!(c.size_bytes, 300);
        assert_eq!(c.file_count, 2);
        // a/ and a/b/ are 2 dirs (root is subtracted)
        assert_eq!(c.dir_count, 2);
        assert!(c.newest_mtime.is_some());
    }

    #[test]
    fn newest_mtime_picks_max_across_files() {
        let tmp = tempfile::tempdir().unwrap();
        // Older file: back-dated by one day.
        let old = tmp.path().join("old");
        fs::write(&old, [0u8; 10]).unwrap();
        // Newer file: dated one minute into the future.
        let new = tmp.path().join("new");
        fs::write(&new, [0u8; 10]).unwrap();
        let later = SystemTime::now() + Duration::from_secs(60);
        set_mtime(&old, SystemTime::now() - Duration::from_secs(86_400));
        set_mtime(&new, later);

        let c = stat_dir(tmp.path());
        // The youngest file's mtime wins (1s slack for coarse timestamps).
        let nm = c.newest_mtime.expect("expected a newest_mtime");
        assert!(nm >= later - Duration::from_secs(1));
    }

    #[test]
    fn hidden_files_count_toward_newest_mtime() {
        let tmp = tempfile::tempdir().unwrap();
        // One visible old file, one hidden recent file. If skip_hidden defaulted
        // to true the hidden file would not contribute and newest_mtime would
        // be the old file. With our explicit skip_hidden(false) the hidden
        // file's recent mtime wins.
        let old = tmp.path().join("old");
        fs::write(&old, [0u8; 1]).unwrap();
        // This back-dating must succeed: if it were skipped, the visible
        // file would also be "recent" and the assertion below would pass
        // even with hidden files excluded.
        set_mtime(&old, SystemTime::UNIX_EPOCH + Duration::from_secs(60));

        let hidden = tmp.path().join(".lock");
        fs::write(&hidden, [0u8; 1]).unwrap();
        set_mtime(&hidden, SystemTime::now());

        let c = stat_dir(tmp.path());
        let nm = c.newest_mtime.expect("expected a newest_mtime");
        // newest_mtime is from the hidden file (today), not the visible 1970 one.
        assert!(nm > SystemTime::UNIX_EPOCH + Duration::from_secs(3600 * 24 * 365));
    }

    #[test]
    fn label_preserves_leading_dot() {
        let tmp = tempfile::tempdir().unwrap();
        let hidden = tmp.path().join(".npm");
        fs::create_dir(&hidden).unwrap();
        let c = stat_dir(&hidden);
        assert_eq!(c.label, ".npm");
    }

    #[test]
    fn enumerate_returns_immediate_children() {
        let tmp = tempfile::tempdir().unwrap();
        fs::create_dir(tmp.path().join("alpha")).unwrap();
        fs::create_dir(tmp.path().join("beta")).unwrap();
        fs::write(tmp.path().join("alpha/file"), [0u8; 50]).unwrap();

        let mut caches = super::enumerate_seed(tmp.path());
        // read_dir order is unspecified; sort before comparing.
        caches.sort_by(|a, b| a.label.cmp(&b.label));
        let labels: Vec<_> = caches.iter().map(|c| c.label.as_str()).collect();
        assert_eq!(labels, ["alpha", "beta"]);
    }

    #[test]
    fn enumerate_seed_skips_missing() {
        let path = std::path::PathBuf::from("/nonexistent/putzen/should/never/exist");
        assert!(super::enumerate_seed(&path).is_empty());
    }

    #[test]
    fn top_files_lists_largest_files_sorted_desc() {
        let tmp = tempfile::tempdir().unwrap();
        fs::write(tmp.path().join("small"), [0u8; 10]).unwrap();
        fs::write(tmp.path().join("big"), [0u8; 1_000_000]).unwrap();
        fs::write(tmp.path().join("medium"), [0u8; 5_000]).unwrap();
        let c = stat_dir(tmp.path());
        let names: Vec<_> = c.top_files.iter().map(|f| f.name.as_str()).collect();
        assert_eq!(names, ["big", "medium", "small"]);
    }

    #[test]
    fn top_files_capped_at_64() {
        let tmp = tempfile::tempdir().unwrap();
        // 100 files with distinct sizes 1..=100 bytes.
        for i in 0..100 {
            fs::write(
                tmp.path().join(format!("f{:03}", i)),
                vec![0u8; (i + 1) as usize],
            )
            .unwrap();
        }
        let c = stat_dir(tmp.path());
        assert_eq!(c.top_files.len(), TOP_K);
        // largest one ("f099" with 100 bytes) must be present, and first
        assert!(c.top_files.iter().any(|f| f.name == "f099"));
        assert_eq!(c.top_files[0].name, "f099");
        // smallest one ("f000" with 1 byte) must have been evicted
        assert!(c.top_files.iter().all(|f| f.name != "f000"));
    }

    #[test]
    fn collect_dedups_by_canonical_path() {
        let tmp = tempfile::tempdir().unwrap();
        fs::create_dir(tmp.path().join("alpha")).unwrap();
        // pass the same seed twice
        let seeds = vec![tmp.path().to_path_buf(), tmp.path().to_path_buf()];
        let caches = super::collect(&seeds);
        assert_eq!(caches.len(), 1, "duplicate seed should yield one cache");
        assert_eq!(caches[0].label, "alpha");
    }
}