Skip to main content

provenant/assembly/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4mod assemblers;
5#[cfg(all(test, feature = "golden-tests"))]
6mod assembly_golden_test;
7#[cfg(test)]
8mod assembly_test;
9mod bazel_merge;
10mod bazel_prune;
11mod cargo_resource_assign;
12mod cargo_workspace_merge;
13mod composer_resource_assign;
14mod conda_rootfs_merge;
15mod debian_source_merge;
16pub mod file_ref_resolve;
17mod hackage_merge;
18mod nested_merge;
19mod nix_flake_compat_merge;
20mod npm_resource_assign;
21mod npm_workspace_merge;
22mod nuget_cpm_resolve;
23mod python_requirements_assign;
24mod ruby_resource_assign;
25mod sibling_merge;
26mod swift_merge;
27mod topology;
28mod windows_update_merge;
29
30use std::collections::{HashMap, HashSet};
31use std::path::PathBuf;
32use std::sync::LazyLock;
33
34use crate::models::{DatasourceId, FileInfo, Package, PackageUid, TopLevelDependency};
35
36pub use assemblers::ASSEMBLERS;
37
/// One result produced by a directory-level merge step, as consumed by
/// `apply_directory_merge_results`:
///
/// 1. the assembled package, if the step produced one,
/// 2. the top-level dependencies to append to the global list,
/// 3. indices into the scanned `files` slice of the files that should be
///    associated with the package.
type DirectoryMergeOutput = (Option<Package>, Vec<TopLevelDependency>, Vec<usize>);
39
40/// Pre-computed lookup: DatasourceId → config key (first DatasourceId in config).
41/// Built once on first use, avoiding HashMap allocation on every `assemble()` call.
42static ASSEMBLER_LOOKUP: LazyLock<HashMap<DatasourceId, DatasourceId>> = LazyLock::new(|| {
43    let mut lookup = HashMap::new();
44    for config in ASSEMBLERS {
45        let key = *config
46            .datasource_ids
47            .first()
48            .expect("assembler must have at least one datasource_id");
49        for &dsid in config.datasource_ids {
50            lookup.insert(dsid, key);
51        }
52    }
53    lookup
54});
55
56static ASSEMBLER_CONFIG_LOOKUP: LazyLock<HashMap<DatasourceId, &'static AssemblerConfig>> =
57    LazyLock::new(|| {
58        let mut lookup = HashMap::new();
59        for config in ASSEMBLERS {
60            let key = *config
61                .datasource_ids
62                .first()
63                .expect("assembler must have at least one datasource_id");
64            lookup.insert(key, config);
65        }
66        lookup
67    });
68
/// Result of the assembly phase: the assembled top-level packages and the
/// hoisted top-level dependencies.
///
/// File-to-package associations are not part of this value; `assemble`
/// records them in place on each `FileInfo.for_packages`.
pub struct AssemblyResult {
    /// Assembled packages, in the order `assemble` sorted them.
    pub packages: Vec<Package>,
    /// Top-level dependencies, in the order `assemble` sorted them.
    pub dependencies: Vec<TopLevelDependency>,
}
75
/// How an assembler groups PackageData into Packages.
///
/// `assemble` dispatches on this value for every assembler that has no
/// special directory merger, and only runs the nested-pattern pass for
/// assemblers in `SiblingMerge` mode.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AssemblyMode {
    /// Merge related files in the same directory (or nested) into one Package.
    SiblingMerge,
    /// Each PackageData becomes its own independent Package (e.g., database files
    /// containing many installed packages like Alpine DB, RPM DB, Debian status).
    OnePerPackageData,
}
85
/// Static description of one assembler.
pub struct AssemblerConfig {
    /// Datasource ids this assembler handles. Must be non-empty: the first
    /// entry serves as the assembler's key in the lookup tables.
    pub datasource_ids: &'static [DatasourceId],
    /// Patterns used by the merge modules to locate related files.
    /// NOTE(review): exact matching semantics live in the merge modules, which
    /// are not visible from this file — confirm there before relying on them.
    pub sibling_file_patterns: &'static [&'static str],
    /// Grouping strategy applied when no special directory merger is registered.
    pub mode: AssemblyMode,
}
91
92/// Run the assembly phase over all scanned files.
93///
94/// Groups files by parent directory, finds related manifests/lockfiles,
95/// merges them into top-level `Package` objects, and hoists dependencies.
96/// Updates each `FileInfo.for_packages` with the UIDs of packages it belongs to.
97pub fn assemble(files: &mut [FileInfo]) -> AssemblyResult {
98    let assembler_lookup = &*ASSEMBLER_LOOKUP;
99    let assembler_config_lookup = &*ASSEMBLER_CONFIG_LOOKUP;
100    let mut packages = Vec::new();
101    let mut dependencies = Vec::new();
102
103    let dir_files = group_files_by_directory(files);
104    let topology_plan = topology::TopologyPlan::build(files, &dir_files);
105
106    for file_indices in dir_files.values() {
107        let mut groups: HashSet<DatasourceId> = HashSet::new();
108
109        for &idx in file_indices {
110            for pkg_data in &files[idx].package_data {
111                if let Some(dsid) = pkg_data.datasource_id
112                    && let Some(&config_key) = assembler_lookup.get(&dsid)
113                {
114                    groups.insert(config_key);
115                }
116            }
117        }
118
119        for &config_key in &groups {
120            let config = assembler_config_lookup
121                .get(&config_key)
122                .copied()
123                .expect("assembler config must exist");
124
125            if topology_plan.claims_directory_assembly(config, file_indices, files) {
126                continue;
127            }
128
129            if let Some(special_merger) = assemblers::special_directory_merger_for(config_key) {
130                let results = special_merger.run(config, files, file_indices);
131                apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
132                continue;
133            }
134
135            match config.mode {
136                AssemblyMode::SiblingMerge => {
137                    let results = sibling_merge::assemble_siblings(config, files, file_indices);
138                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
139                }
140                AssemblyMode::OnePerPackageData => {
141                    let results = assemble_one_per_package_data(config, files, file_indices)
142                        .into_iter()
143                        .map(|(pkg, deps, affected_idx)| (Some(pkg), deps, vec![affected_idx]))
144                        .collect();
145                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
146                }
147            }
148        }
149    }
150
151    topology_plan.apply_directory_scoped_domains(files, &mut packages, &mut dependencies);
152
153    for config in ASSEMBLERS {
154        if config.mode != AssemblyMode::SiblingMerge {
155            continue;
156        }
157        if let Some((pkg, deps, affected_indices)) =
158            nested_merge::assemble_nested_patterns(files, config)
159        {
160            let package_uid = pkg.package_uid.clone();
161            let purl = pkg.purl.clone();
162            let removed_package_uids: Vec<PackageUid> = packages
163                .iter()
164                .filter(|p| p.purl == purl)
165                .map(|p| p.package_uid.clone())
166                .collect();
167
168            packages.retain(|p| p.purl != purl);
169            dependencies.retain(|d| {
170                d.for_package_uid.as_ref() != Some(&package_uid)
171                    && !removed_package_uids
172                        .iter()
173                        .any(|old_uid| d.for_package_uid.as_ref() == Some(old_uid))
174            });
175
176            for idx in &affected_indices {
177                files[*idx].for_packages.clear();
178                files[*idx].for_packages.push(package_uid.clone());
179            }
180
181            packages.push(pkg);
182            dependencies.extend(deps);
183        }
184    }
185
186    assemblers::run_post_assembly_passes(files, &mut packages, &mut dependencies, &topology_plan);
187    hoist_unassembled_file_dependencies(files, &mut dependencies);
188
189    for package in &mut packages {
190        package.datafile_paths.sort();
191        package.datafile_paths.dedup();
192        package.datasource_ids.sort_by_key(|left| left.to_string());
193        package.datasource_ids.dedup();
194    }
195
196    for file in files.iter_mut() {
197        file.for_packages
198            .sort_by(|left, right| left.stable_key().cmp(right.stable_key()));
199        file.for_packages.dedup();
200    }
201
202    packages
203        .sort_by(|left, right| stable_package_sort_key(left).cmp(&stable_package_sort_key(right)));
204    dependencies.sort_by(|left, right| {
205        left.purl
206            .as_deref()
207            .cmp(&right.purl.as_deref())
208            .then_with(|| {
209                left.extracted_requirement
210                    .as_deref()
211                    .cmp(&right.extracted_requirement.as_deref())
212            })
213            .then_with(|| left.scope.as_deref().cmp(&right.scope.as_deref()))
214            .then_with(|| left.datafile_path.cmp(&right.datafile_path))
215            .then_with(|| {
216                left.datasource_id
217                    .to_string()
218                    .cmp(&right.datasource_id.to_string())
219            })
220            .then_with(|| {
221                left.for_package_uid
222                    .as_ref()
223                    .map(|uid| uid.stable_key())
224                    .cmp(&right.for_package_uid.as_ref().map(|uid| uid.stable_key()))
225            })
226    });
227
228    AssemblyResult {
229        packages,
230        dependencies,
231    }
232}
233
234fn apply_directory_merge_results(
235    files: &mut [FileInfo],
236    packages: &mut Vec<Package>,
237    dependencies: &mut Vec<TopLevelDependency>,
238    results: Vec<DirectoryMergeOutput>,
239) {
240    for (package, deps, affected_indices) in results {
241        if let Some(package) = package {
242            let package_uid = package.package_uid.clone();
243            for idx in &affected_indices {
244                if !files[*idx].for_packages.contains(&package_uid) {
245                    files[*idx].for_packages.push(package_uid.clone());
246                }
247            }
248            packages.push(package);
249        }
250        dependencies.extend(deps);
251    }
252}
253
254fn hoist_unassembled_file_dependencies(
255    files: &[FileInfo],
256    dependencies: &mut Vec<TopLevelDependency>,
257) {
258    for file in files {
259        if !file.for_packages.is_empty() {
260            continue;
261        }
262
263        for pkg_data in &file.package_data {
264            let Some(datasource_id) = pkg_data.datasource_id else {
265                continue;
266            };
267
268            if !should_hoist_unassembled_dependencies(datasource_id) {
269                continue;
270            }
271
272            dependencies.extend(pkg_data.dependencies.iter().map(|dep| {
273                TopLevelDependency::from_dependency(dep, file.path.clone(), datasource_id, None)
274            }));
275        }
276    }
277}
278
/// Datasource ids whose dependencies are always eligible for hoisting when the
/// file that declares them was not assigned to any package, regardless of
/// whether the id appears in `assemblers::UNASSEMBLED_DATASOURCE_IDS`.
const HOIST_IF_UNOWNED_DATASOURCE_IDS: &[DatasourceId] = &[DatasourceId::PipRequirements];
280
281fn should_hoist_unassembled_dependencies(datasource_id: DatasourceId) -> bool {
282    if HOIST_IF_UNOWNED_DATASOURCE_IDS.contains(&datasource_id) {
283        return true;
284    }
285
286    if !assemblers::UNASSEMBLED_DATASOURCE_IDS.contains(&datasource_id) {
287        return false;
288    }
289
290    !matches!(
291        datasource_id,
292        DatasourceId::NugetDirectoryBuildProps | DatasourceId::NugetDirectoryPackagesProps
293    )
294}
295
296fn stable_package_sort_key(package: &Package) -> (Option<&str>, Option<&str>, Option<&str>, &str) {
297    (
298        package.purl.as_deref(),
299        package.name.as_deref(),
300        package.version.as_deref(),
301        package
302            .datafile_paths
303            .first()
304            .map(String::as_str)
305            .unwrap_or(""),
306    )
307}
308
309fn assemble_one_per_package_data(
310    config: &AssemblerConfig,
311    files: &[FileInfo],
312    file_indices: &[usize],
313) -> Vec<(Package, Vec<TopLevelDependency>, usize)> {
314    let mut results = Vec::new();
315
316    for &idx in file_indices {
317        let file = &files[idx];
318        for pkg_data in &file.package_data {
319            let dsid_matches = pkg_data
320                .datasource_id
321                .is_some_and(|dsid| config.datasource_ids.contains(&dsid));
322
323            if !dsid_matches || pkg_data.purl.is_none() {
324                continue;
325            }
326
327            let datafile_path = file.path.clone();
328            let datasource_id = pkg_data.datasource_id.expect("datasource_id must be Some");
329            let pkg = Package::from_package_data(pkg_data, datafile_path.clone());
330            let for_package_uid = Some(pkg.package_uid.clone());
331
332            let deps: Vec<TopLevelDependency> = pkg_data
333                .dependencies
334                .iter()
335                .filter(|dep| dep.purl.is_some())
336                .map(|dep| {
337                    TopLevelDependency::from_dependency(
338                        dep,
339                        datafile_path.clone(),
340                        datasource_id,
341                        for_package_uid.clone(),
342                    )
343                })
344                .collect();
345
346            results.push((pkg, deps, idx));
347        }
348    }
349
350    results
351}
352
353/// Group file indices by their parent directory path.
354fn group_files_by_directory(files: &[FileInfo]) -> HashMap<PathBuf, Vec<usize>> {
355    let mut groups: HashMap<PathBuf, Vec<usize>> = HashMap::new();
356    for (idx, file) in files.iter().enumerate() {
357        if let Some(parent) = std::path::Path::new(&file.path).parent() {
358            groups.entry(parent.to_path_buf()).or_default().push(idx);
359        }
360    }
361    groups
362}