Skip to main content

provenant/scanner/
collect.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use glob::Pattern;
5use std::collections::HashSet;
6use std::fs;
7use std::path::{Path, PathBuf};
8
9use crate::utils::file::is_path_excluded;
10
/// Outcome of a collection pass: the files and directories that were found, plus
/// bookkeeping about what was skipped or could not be read.
///
/// `Debug` is derived so results can be logged and used in assertion messages.
#[derive(Debug)]
pub struct CollectedPaths {
    /// Collected files, each paired with the metadata read for it.
    pub files: Vec<(PathBuf, fs::Metadata)>,
    /// Collected directories, each paired with the metadata read for it.
    pub directories: Vec<(PathBuf, fs::Metadata)>,
    /// Number of paths skipped because they matched an exclude pattern.
    pub excluded_count: usize,
    /// Sum of `fs::Metadata::len` over the collected files, as filled in by the
    /// collectors in this module.
    pub total_file_bytes: u64,
    /// Paths that could not be inspected, each paired with the error message.
    pub collection_errors: Vec<(PathBuf, String)>,
}
18
/// One selected entry point for `collect_selected_paths`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CollectionFrontier {
    /// Location of the entry. It is joined onto the collection root, so it is
    /// expected to be relative to that root.
    pub path: PathBuf,
    /// For a directory: `true` collects the whole subtree, `false` records only
    /// the directory itself.
    pub recurse: bool,
}
24
/// Mutable working state used while several frontier entries are merged into a
/// single result; the `*_seen` sets keep a path from being recorded twice.
struct CollectionAccumulator {
    /// Files recorded so far, in insertion order.
    files: Vec<(PathBuf, fs::Metadata)>,
    /// Paths already present in `files`.
    file_seen: HashSet<PathBuf>,
    /// Directories recorded so far, in insertion order.
    directories: Vec<(PathBuf, fs::Metadata)>,
    /// Paths already handled as directories.
    dir_seen: HashSet<PathBuf>,
    /// Running count of paths skipped by an exclude pattern.
    excluded_count: usize,
    /// Running sum of the byte lengths of the entries in `files`.
    total_file_bytes: u64,
    /// Paths that could not be inspected, with the error message.
    collection_errors: Vec<(PathBuf, String)>,
}
34
35impl CollectedPaths {
36    pub fn file_count(&self) -> usize {
37        self.files.len()
38    }
39
40    pub fn directory_count(&self) -> usize {
41        self.directories.len()
42    }
43}
44
45pub fn collect_paths<P: AsRef<Path>>(
46    root: P,
47    max_depth: usize,
48    exclude_patterns: &[Pattern],
49) -> CollectedPaths {
50    let depth_limit = depth_limit_from_cli(max_depth);
51    let root = root.as_ref();
52
53    if is_path_excluded(root, exclude_patterns) {
54        return CollectedPaths {
55            files: Vec::new(),
56            directories: Vec::new(),
57            excluded_count: 1,
58            total_file_bytes: 0,
59            collection_errors: Vec::new(),
60        };
61    }
62
63    let metadata = match fs::metadata(root) {
64        Ok(metadata) => metadata,
65        Err(error) => {
66            return CollectedPaths {
67                files: Vec::new(),
68                directories: Vec::new(),
69                excluded_count: 0,
70                total_file_bytes: 0,
71                collection_errors: vec![(root.to_path_buf(), error.to_string())],
72            };
73        }
74    };
75
76    if metadata.is_file() {
77        return CollectedPaths {
78            total_file_bytes: metadata.len(),
79            files: vec![(root.to_path_buf(), metadata)],
80            directories: Vec::new(),
81            excluded_count: 0,
82            collection_errors: Vec::new(),
83        };
84    }
85
86    collect_all_paths(root, &metadata, depth_limit, exclude_patterns)
87}
88
89pub fn collect_selected_paths(
90    root: &Path,
91    selected: &[CollectionFrontier],
92    max_depth: usize,
93    exclude_patterns: &[Pattern],
94) -> CollectedPaths {
95    let depth_limit = depth_limit_from_cli(max_depth);
96
97    if is_path_excluded(root, exclude_patterns) {
98        return CollectedPaths {
99            files: Vec::new(),
100            directories: Vec::new(),
101            excluded_count: 1,
102            total_file_bytes: 0,
103            collection_errors: Vec::new(),
104        };
105    }
106
107    let root_metadata = match fs::metadata(root) {
108        Ok(metadata) => metadata,
109        Err(error) => {
110            return CollectedPaths {
111                files: Vec::new(),
112                directories: Vec::new(),
113                excluded_count: 0,
114                total_file_bytes: 0,
115                collection_errors: vec![(root.to_path_buf(), error.to_string())],
116            };
117        }
118    };
119
120    let mut accumulator = CollectionAccumulator {
121        files: Vec::new(),
122        directories: vec![(root.to_path_buf(), root_metadata)],
123        file_seen: HashSet::new(),
124        dir_seen: HashSet::from([root.to_path_buf()]),
125        excluded_count: 0,
126        total_file_bytes: 0,
127        collection_errors: Vec::new(),
128    };
129
130    for frontier in minimize_frontier(selected) {
131        let relative_depth = frontier.path.components().count();
132        if depth_limit.is_some_and(|limit| relative_depth > limit) {
133            continue;
134        }
135
136        let absolute = root.join(&frontier.path);
137        if is_path_or_any_ancestor_excluded(root, &absolute, exclude_patterns) {
138            accumulator.excluded_count += 1;
139            continue;
140        }
141
142        let metadata = match fs::metadata(&absolute) {
143            Ok(metadata) => metadata,
144            Err(error) => {
145                accumulator
146                    .collection_errors
147                    .push((absolute, error.to_string()));
148                continue;
149            }
150        };
151
152        add_ancestor_directories(root, &absolute, &mut accumulator);
153
154        if metadata.is_file() {
155            insert_file(&mut accumulator, absolute, metadata);
156            continue;
157        }
158
159        if !metadata.is_dir() {
160            continue;
161        }
162
163        let subtree_depth_limit = depth_limit.map(|limit| limit.saturating_sub(relative_depth));
164        let collected = if frontier.recurse {
165            collect_all_paths(&absolute, &metadata, subtree_depth_limit, exclude_patterns)
166        } else {
167            CollectedPaths {
168                files: Vec::new(),
169                directories: vec![(absolute, metadata)],
170                excluded_count: 0,
171                total_file_bytes: 0,
172                collection_errors: Vec::new(),
173            }
174        };
175        merge_collected(&mut accumulator, collected);
176    }
177
178    CollectedPaths {
179        files: accumulator.files,
180        directories: accumulator.directories,
181        excluded_count: accumulator.excluded_count,
182        total_file_bytes: accumulator.total_file_bytes,
183        collection_errors: accumulator.collection_errors,
184    }
185}
186
187fn collect_all_paths(
188    root: &Path,
189    root_metadata: &fs::Metadata,
190    depth_limit: Option<usize>,
191    exclude_patterns: &[Pattern],
192) -> CollectedPaths {
193    let mut files = Vec::new();
194    let mut directories = vec![(root.to_path_buf(), root_metadata.clone())];
195    let mut excluded_count = 0;
196    let mut total_file_bytes = 0_u64;
197    let mut collection_errors = Vec::new();
198
199    let mut pending_dirs: Vec<(PathBuf, Option<usize>)> = vec![(root.to_path_buf(), depth_limit)];
200
201    while let Some((dir_path, current_depth)) = pending_dirs.pop() {
202        let entries: Vec<_> = match fs::read_dir(&dir_path) {
203            Ok(entries) => entries.filter_map(Result::ok).collect(),
204            Err(e) => {
205                collection_errors.push((dir_path.clone(), e.to_string()));
206                continue;
207            }
208        };
209
210        for entry in entries {
211            let path = entry.path();
212
213            if is_path_excluded(&path, exclude_patterns) {
214                excluded_count += 1;
215                continue;
216            }
217
218            match entry.metadata() {
219                Ok(metadata) if metadata.is_file() => {
220                    total_file_bytes += metadata.len();
221                    files.push((path, metadata));
222                }
223                Ok(metadata) if metadata.is_dir() => {
224                    directories.push((path.clone(), metadata));
225                    let should_recurse = current_depth.is_none_or(|d| d > 0);
226                    if should_recurse {
227                        let next_depth = current_depth.map(|d| d - 1);
228                        pending_dirs.push((path, next_depth));
229                    }
230                }
231                _ => continue,
232            }
233        }
234    }
235
236    CollectedPaths {
237        files,
238        directories,
239        excluded_count,
240        total_file_bytes,
241        collection_errors,
242    }
243}
244
/// Converts a raw `max_depth` value into an optional limit, where `0` stands for
/// "no limit" and any other value is the limit itself.
fn depth_limit_from_cli(max_depth: usize) -> Option<usize> {
    match max_depth {
        0 => None,
        limit => Some(limit),
    }
}
252
253fn is_path_or_any_ancestor_excluded(
254    path_root: &Path,
255    path: &Path,
256    exclude_patterns: &[Pattern],
257) -> bool {
258    let mut current = Some(path);
259    while let Some(candidate) = current {
260        if is_path_excluded(candidate, exclude_patterns) {
261            return true;
262        }
263        if candidate == path_root {
264            break;
265        }
266        current = candidate.parent();
267    }
268    false
269}
270
271fn minimize_frontier(selected: &[CollectionFrontier]) -> Vec<CollectionFrontier> {
272    let mut ordered = selected.to_vec();
273    ordered.sort_by_key(|entry| (entry.path.components().count(), !entry.recurse));
274
275    let mut minimized = Vec::new();
276    for entry in ordered {
277        let covered = minimized.iter().any(|existing: &CollectionFrontier| {
278            existing.recurse
279                && (entry.path == existing.path || entry.path.starts_with(&existing.path))
280        });
281        if !covered {
282            minimized.push(entry);
283        }
284    }
285    minimized
286}
287
288fn add_ancestor_directories(root: &Path, path: &Path, accumulator: &mut CollectionAccumulator) {
289    let mut current = path.parent();
290    while let Some(dir) = current {
291        if dir == root {
292            break;
293        }
294        if accumulator.dir_seen.insert(dir.to_path_buf()) {
295            match fs::metadata(dir) {
296                Ok(metadata) => accumulator.directories.push((dir.to_path_buf(), metadata)),
297                Err(error) => accumulator
298                    .collection_errors
299                    .push((dir.to_path_buf(), error.to_string())),
300            }
301        }
302        current = dir.parent();
303    }
304}
305
306fn insert_file(accumulator: &mut CollectionAccumulator, path: PathBuf, metadata: fs::Metadata) {
307    if accumulator.file_seen.insert(path.clone()) {
308        accumulator.total_file_bytes += metadata.len();
309        accumulator.files.push((path, metadata));
310    }
311}
312
313fn merge_collected(accumulator: &mut CollectionAccumulator, collected: CollectedPaths) {
314    accumulator.excluded_count += collected.excluded_count;
315    accumulator
316        .collection_errors
317        .extend(collected.collection_errors);
318
319    for (path, metadata) in collected.files {
320        insert_file(accumulator, path, metadata);
321    }
322    for (path, metadata) in collected.directories {
323        if accumulator.dir_seen.insert(path.clone()) {
324            accumulator.directories.push((path, metadata));
325        }
326    }
327}