Skip to main content

provenant/scanner/
collect.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use glob::Pattern;
5use std::collections::HashSet;
6use std::fs;
7use std::path::{Path, PathBuf};
8
9use crate::utils::file::is_path_excluded;
10
/// Outcome of a collection pass: what was found, what was skipped, and what failed.
///
/// Derives `Debug` so results can be logged and used in assertion messages.
#[derive(Debug)]
pub struct CollectedPaths {
    /// Regular files that were collected, each paired with its metadata.
    pub files: Vec<(PathBuf, fs::Metadata)>,
    /// Directories that were collected, each paired with its metadata.
    pub directories: Vec<(PathBuf, fs::Metadata)>,
    /// Number of paths skipped because they matched an exclude pattern.
    pub excluded_count: usize,
    /// Combined size in bytes (`Metadata::len`) of the collected files.
    pub total_file_bytes: u64,
    /// Paths that could not be read, each paired with the error message.
    pub collection_errors: Vec<(PathBuf, String)>,
}
18
/// One selected path plus how much of it should be collected.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CollectionFrontier {
    /// Selected location; it is joined onto the scan root during collection.
    pub path: PathBuf,
    /// For a directory: `true` walks its whole subtree, `false` records only the
    /// directory itself. A recursive entry also covers every selection beneath it.
    pub recurse: bool,
}
24
/// Mutable working state used while several selections are merged into one result.
///
/// Each result list is paired with a set of the paths already stored in it, so
/// overlapping selections never produce duplicate entries.
struct CollectionAccumulator {
    /// Files gathered so far, in insertion order.
    files: Vec<(PathBuf, fs::Metadata)>,
    /// Paths already present in `files`.
    file_seen: HashSet<PathBuf>,
    /// Directories gathered so far, in insertion order.
    directories: Vec<(PathBuf, fs::Metadata)>,
    /// Paths already handled as directories.
    dir_seen: HashSet<PathBuf>,
    /// Running count of paths skipped by exclude patterns.
    excluded_count: usize,
    /// Running total of the sizes of the entries in `files`.
    total_file_bytes: u64,
    /// Paths that could not be read, each paired with the error message.
    collection_errors: Vec<(PathBuf, String)>,
}
34
35impl CollectedPaths {
36    pub fn file_count(&self) -> usize {
37        self.files.len()
38    }
39
40    pub fn directory_count(&self) -> usize {
41        self.directories.len()
42    }
43
44    pub fn scan_root(&self) -> Option<&Path> {
45        self.directories
46            .first()
47            .map(|(path, _)| path.as_path())
48            .or_else(|| {
49                self.files
50                    .first()
51                    .and_then(|(path, _)| path.parent().or(Some(path.as_path())))
52            })
53    }
54}
55
56pub fn collect_paths<P: AsRef<Path>>(
57    root: P,
58    max_depth: usize,
59    exclude_patterns: &[Pattern],
60) -> CollectedPaths {
61    let depth_limit = depth_limit_from_cli(max_depth);
62    let root = root.as_ref();
63
64    if is_path_excluded(root, exclude_patterns) {
65        return CollectedPaths {
66            files: Vec::new(),
67            directories: Vec::new(),
68            excluded_count: 1,
69            total_file_bytes: 0,
70            collection_errors: Vec::new(),
71        };
72    }
73
74    let metadata = match fs::metadata(root) {
75        Ok(metadata) => metadata,
76        Err(error) => {
77            return CollectedPaths {
78                files: Vec::new(),
79                directories: Vec::new(),
80                excluded_count: 0,
81                total_file_bytes: 0,
82                collection_errors: vec![(root.to_path_buf(), error.to_string())],
83            };
84        }
85    };
86
87    if metadata.is_file() {
88        return CollectedPaths {
89            total_file_bytes: metadata.len(),
90            files: vec![(root.to_path_buf(), metadata)],
91            directories: Vec::new(),
92            excluded_count: 0,
93            collection_errors: Vec::new(),
94        };
95    }
96
97    collect_all_paths(root, &metadata, depth_limit, exclude_patterns)
98}
99
100pub fn collect_selected_paths(
101    root: &Path,
102    selected: &[CollectionFrontier],
103    max_depth: usize,
104    exclude_patterns: &[Pattern],
105) -> CollectedPaths {
106    let depth_limit = depth_limit_from_cli(max_depth);
107
108    if is_path_excluded(root, exclude_patterns) {
109        return CollectedPaths {
110            files: Vec::new(),
111            directories: Vec::new(),
112            excluded_count: 1,
113            total_file_bytes: 0,
114            collection_errors: Vec::new(),
115        };
116    }
117
118    let root_metadata = match fs::metadata(root) {
119        Ok(metadata) => metadata,
120        Err(error) => {
121            return CollectedPaths {
122                files: Vec::new(),
123                directories: Vec::new(),
124                excluded_count: 0,
125                total_file_bytes: 0,
126                collection_errors: vec![(root.to_path_buf(), error.to_string())],
127            };
128        }
129    };
130
131    let mut accumulator = CollectionAccumulator {
132        files: Vec::new(),
133        directories: vec![(root.to_path_buf(), root_metadata)],
134        file_seen: HashSet::new(),
135        dir_seen: HashSet::from([root.to_path_buf()]),
136        excluded_count: 0,
137        total_file_bytes: 0,
138        collection_errors: Vec::new(),
139    };
140
141    for frontier in minimize_frontier(selected) {
142        let relative_depth = frontier.path.components().count();
143        if depth_limit.is_some_and(|limit| relative_depth > limit) {
144            continue;
145        }
146
147        let absolute = root.join(&frontier.path);
148        if is_path_or_any_ancestor_excluded(root, &absolute, exclude_patterns) {
149            accumulator.excluded_count += 1;
150            continue;
151        }
152
153        let metadata = match fs::metadata(&absolute) {
154            Ok(metadata) => metadata,
155            Err(error) => {
156                accumulator
157                    .collection_errors
158                    .push((absolute, error.to_string()));
159                continue;
160            }
161        };
162
163        add_ancestor_directories(root, &absolute, &mut accumulator);
164
165        if metadata.is_file() {
166            insert_file(&mut accumulator, absolute, metadata);
167            continue;
168        }
169
170        if !metadata.is_dir() {
171            continue;
172        }
173
174        let subtree_depth_limit = depth_limit.map(|limit| limit.saturating_sub(relative_depth));
175        let collected = if frontier.recurse {
176            collect_all_paths(&absolute, &metadata, subtree_depth_limit, exclude_patterns)
177        } else {
178            CollectedPaths {
179                files: Vec::new(),
180                directories: vec![(absolute, metadata)],
181                excluded_count: 0,
182                total_file_bytes: 0,
183                collection_errors: Vec::new(),
184            }
185        };
186        merge_collected(&mut accumulator, collected);
187    }
188
189    CollectedPaths {
190        files: accumulator.files,
191        directories: accumulator.directories,
192        excluded_count: accumulator.excluded_count,
193        total_file_bytes: accumulator.total_file_bytes,
194        collection_errors: accumulator.collection_errors,
195    }
196}
197
198fn collect_all_paths(
199    root: &Path,
200    root_metadata: &fs::Metadata,
201    depth_limit: Option<usize>,
202    exclude_patterns: &[Pattern],
203) -> CollectedPaths {
204    let mut files = Vec::new();
205    let mut directories = vec![(root.to_path_buf(), root_metadata.clone())];
206    let mut excluded_count = 0;
207    let mut total_file_bytes = 0_u64;
208    let mut collection_errors = Vec::new();
209
210    let mut pending_dirs: Vec<(PathBuf, Option<usize>)> = vec![(root.to_path_buf(), depth_limit)];
211
212    while let Some((dir_path, current_depth)) = pending_dirs.pop() {
213        let entries: Vec<_> = match fs::read_dir(&dir_path) {
214            Ok(entries) => entries.filter_map(Result::ok).collect(),
215            Err(e) => {
216                collection_errors.push((dir_path.clone(), e.to_string()));
217                continue;
218            }
219        };
220
221        for entry in entries {
222            let path = entry.path();
223
224            if is_path_excluded(&path, exclude_patterns) {
225                excluded_count += 1;
226                continue;
227            }
228
229            match entry.metadata() {
230                Ok(metadata) if metadata.is_file() => {
231                    total_file_bytes += metadata.len();
232                    files.push((path, metadata));
233                }
234                Ok(metadata) if metadata.is_dir() => {
235                    directories.push((path.clone(), metadata));
236                    let should_recurse = current_depth.is_none_or(|d| d > 0);
237                    if should_recurse {
238                        let next_depth = current_depth.map(|d| d - 1);
239                        pending_dirs.push((path, next_depth));
240                    }
241                }
242                _ => continue,
243            }
244        }
245    }
246
247    CollectedPaths {
248        files,
249        directories,
250        excluded_count,
251        total_file_bytes,
252        collection_errors,
253    }
254}
255
/// Translates the command-line depth value into an optional limit.
///
/// `0` is the "no limit" sentinel and becomes `None`; every other value is returned
/// unchanged inside `Some`.
fn depth_limit_from_cli(max_depth: usize) -> Option<usize> {
    match max_depth {
        0 => None,
        limit => Some(limit),
    }
}
263
264fn is_path_or_any_ancestor_excluded(
265    path_root: &Path,
266    path: &Path,
267    exclude_patterns: &[Pattern],
268) -> bool {
269    let mut current = Some(path);
270    while let Some(candidate) = current {
271        if is_path_excluded(candidate, exclude_patterns) {
272            return true;
273        }
274        if candidate == path_root {
275            break;
276        }
277        current = candidate.parent();
278    }
279    false
280}
281
282fn minimize_frontier(selected: &[CollectionFrontier]) -> Vec<CollectionFrontier> {
283    let mut ordered = selected.to_vec();
284    ordered.sort_by_key(|entry| (entry.path.components().count(), !entry.recurse));
285
286    let mut minimized = Vec::new();
287    for entry in ordered {
288        let covered = minimized.iter().any(|existing: &CollectionFrontier| {
289            existing.recurse
290                && (entry.path == existing.path || entry.path.starts_with(&existing.path))
291        });
292        if !covered {
293            minimized.push(entry);
294        }
295    }
296    minimized
297}
298
299fn add_ancestor_directories(root: &Path, path: &Path, accumulator: &mut CollectionAccumulator) {
300    let mut current = path.parent();
301    while let Some(dir) = current {
302        if dir == root {
303            break;
304        }
305        if accumulator.dir_seen.insert(dir.to_path_buf()) {
306            match fs::metadata(dir) {
307                Ok(metadata) => accumulator.directories.push((dir.to_path_buf(), metadata)),
308                Err(error) => accumulator
309                    .collection_errors
310                    .push((dir.to_path_buf(), error.to_string())),
311            }
312        }
313        current = dir.parent();
314    }
315}
316
317fn insert_file(accumulator: &mut CollectionAccumulator, path: PathBuf, metadata: fs::Metadata) {
318    if accumulator.file_seen.insert(path.clone()) {
319        accumulator.total_file_bytes += metadata.len();
320        accumulator.files.push((path, metadata));
321    }
322}
323
324fn merge_collected(accumulator: &mut CollectionAccumulator, collected: CollectedPaths) {
325    accumulator.excluded_count += collected.excluded_count;
326    accumulator
327        .collection_errors
328        .extend(collected.collection_errors);
329
330    for (path, metadata) in collected.files {
331        insert_file(accumulator, path, metadata);
332    }
333    for (path, metadata) in collected.directories {
334        if accumulator.dir_seen.insert(path.clone()) {
335            accumulator.directories.push((path, metadata));
336        }
337    }
338}
339
#[cfg(test)]
mod tests {
    use super::collect_paths;
    use std::fs;

    /// Scanning a single file reports that file's parent directory as the scan root.
    #[test]
    fn file_scan_root_uses_parent_directory() {
        let workspace = tempfile::tempdir().expect("temp dir");
        let props_file = workspace.path().join("Directory.Packages.props");
        fs::write(&props_file, "<Project />").expect("write props file");

        // Depth 0 means unlimited; no exclude patterns are in play.
        let result = collect_paths(&props_file, 0, &[]);

        assert_eq!(result.file_count(), 1);
        assert_eq!(result.directory_count(), 0);
        assert_eq!(result.scan_root(), Some(workspace.path()));
    }
}