Skip to main content

provenant/assembly/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4mod assemblers;
5#[cfg(all(test, feature = "golden-tests"))]
6mod assembly_golden_test;
7#[cfg(test)]
8mod assembly_test;
9mod bazel_merge;
10mod bazel_prune;
11mod cargo_resource_assign;
12mod cargo_workspace_merge;
13mod composer_resource_assign;
14mod conda_rootfs_merge;
15mod debian_source_merge;
16pub mod file_ref_resolve;
17mod hackage_merge;
18mod nested_merge;
19mod nix_flake_compat_merge;
20mod npm_resource_assign;
21mod npm_workspace_merge;
22mod nuget_cpm_resolve;
23mod python_requirements_assign;
24mod ruby_resource_assign;
25mod sibling_merge;
26mod swift_merge;
27mod topology;
28
29use std::collections::{HashMap, HashSet};
30use std::path::PathBuf;
31use std::sync::LazyLock;
32
33use crate::models::{DatasourceId, FileInfo, Package, PackageUid, TopLevelDependency};
34
35pub use assemblers::ASSEMBLERS;
36
/// Output of one directory-level merge step, as consumed by
/// `apply_directory_merge_results`:
///
/// 1. the assembled package, if the step produced one;
/// 2. the dependencies to append to the top-level dependency list;
/// 3. indices into the scanned-file slice whose `for_packages` should record
///    the package's UID.
type DirectoryMergeOutput = (Option<Package>, Vec<TopLevelDependency>, Vec<usize>);
38
39/// Pre-computed lookup: DatasourceId → config key (first DatasourceId in config).
40/// Built once on first use, avoiding HashMap allocation on every `assemble()` call.
41static ASSEMBLER_LOOKUP: LazyLock<HashMap<DatasourceId, DatasourceId>> = LazyLock::new(|| {
42    let mut lookup = HashMap::new();
43    for config in ASSEMBLERS {
44        let key = *config
45            .datasource_ids
46            .first()
47            .expect("assembler must have at least one datasource_id");
48        for &dsid in config.datasource_ids {
49            lookup.insert(dsid, key);
50        }
51    }
52    lookup
53});
54
55static ASSEMBLER_CONFIG_LOOKUP: LazyLock<HashMap<DatasourceId, &'static AssemblerConfig>> =
56    LazyLock::new(|| {
57        let mut lookup = HashMap::new();
58        for config in ASSEMBLERS {
59            let key = *config
60                .datasource_ids
61                .first()
62                .expect("assembler must have at least one datasource_id");
63            lookup.insert(key, config);
64        }
65        lookup
66    });
67
/// Result of the assembly phase: the top-level packages and dependencies.
///
/// File-to-package associations are not part of this value; `assemble`
/// records them in place on each `FileInfo.for_packages`.
pub struct AssemblyResult {
    /// Assembled top-level packages, in the order `assemble` sorted them.
    pub packages: Vec<Package>,
    /// Hoisted top-level dependencies, in the order `assemble` sorted them.
    pub dependencies: Vec<TopLevelDependency>,
}
74
/// How an assembler groups PackageData into Packages.
///
/// `assemble` dispatches on this value for every assembler that has no
/// special directory merger and whose directory is not claimed by the
/// topology plan.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AssemblyMode {
    /// Merge related files in the same directory (or nested) into one Package.
    /// Only assemblers in this mode take part in the nested-pattern merge pass.
    SiblingMerge,
    /// Each PackageData becomes its own independent Package (e.g., database files
    /// containing many installed packages like Alpine DB, RPM DB, Debian status).
    OnePerPackageData,
}
84
/// Static description of one assembler.
pub struct AssemblerConfig {
    /// Datasource ids handled by this assembler. Must be non-empty: the first
    /// entry doubles as the assembler's key in the lookup tables.
    pub datasource_ids: &'static [DatasourceId],
    /// File-name patterns of related files for this assembler.
    /// NOTE(review): not read in this module; presumably consumed by the
    /// sibling/nested merge modules — confirm there.
    pub sibling_file_patterns: &'static [&'static str],
    /// Grouping strategy `assemble` applies for this assembler.
    pub mode: AssemblyMode,
}
90
91/// Run the assembly phase over all scanned files.
92///
93/// Groups files by parent directory, finds related manifests/lockfiles,
94/// merges them into top-level `Package` objects, and hoists dependencies.
95/// Updates each `FileInfo.for_packages` with the UIDs of packages it belongs to.
96pub fn assemble(files: &mut [FileInfo]) -> AssemblyResult {
97    let assembler_lookup = &*ASSEMBLER_LOOKUP;
98    let assembler_config_lookup = &*ASSEMBLER_CONFIG_LOOKUP;
99    let mut packages = Vec::new();
100    let mut dependencies = Vec::new();
101
102    let dir_files = group_files_by_directory(files);
103    let topology_plan = topology::TopologyPlan::build(files, &dir_files);
104
105    for file_indices in dir_files.values() {
106        let mut groups: HashSet<DatasourceId> = HashSet::new();
107
108        for &idx in file_indices {
109            for pkg_data in &files[idx].package_data {
110                if let Some(dsid) = pkg_data.datasource_id
111                    && let Some(&config_key) = assembler_lookup.get(&dsid)
112                {
113                    groups.insert(config_key);
114                }
115            }
116        }
117
118        for &config_key in &groups {
119            let config = assembler_config_lookup
120                .get(&config_key)
121                .copied()
122                .expect("assembler config must exist");
123
124            if topology_plan.claims_directory_assembly(config, file_indices, files) {
125                continue;
126            }
127
128            if let Some(special_merger) = assemblers::special_directory_merger_for(config_key) {
129                let results = special_merger.run(config, files, file_indices);
130                apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
131                continue;
132            }
133
134            match config.mode {
135                AssemblyMode::SiblingMerge => {
136                    let results = sibling_merge::assemble_siblings(config, files, file_indices);
137                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
138                }
139                AssemblyMode::OnePerPackageData => {
140                    let results = assemble_one_per_package_data(config, files, file_indices)
141                        .into_iter()
142                        .map(|(pkg, deps, affected_idx)| (Some(pkg), deps, vec![affected_idx]))
143                        .collect();
144                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
145                }
146            }
147        }
148    }
149
150    topology_plan.apply_directory_scoped_domains(files, &mut packages, &mut dependencies);
151
152    for config in ASSEMBLERS {
153        if config.mode != AssemblyMode::SiblingMerge {
154            continue;
155        }
156        if let Some((pkg, deps, affected_indices)) =
157            nested_merge::assemble_nested_patterns(files, config)
158        {
159            let package_uid = pkg.package_uid.clone();
160            let purl = pkg.purl.clone();
161            let removed_package_uids: Vec<PackageUid> = packages
162                .iter()
163                .filter(|p| p.purl == purl)
164                .map(|p| p.package_uid.clone())
165                .collect();
166
167            packages.retain(|p| p.purl != purl);
168            dependencies.retain(|d| {
169                d.for_package_uid.as_ref() != Some(&package_uid)
170                    && !removed_package_uids
171                        .iter()
172                        .any(|old_uid| d.for_package_uid.as_ref() == Some(old_uid))
173            });
174
175            for idx in &affected_indices {
176                files[*idx].for_packages.clear();
177                files[*idx].for_packages.push(package_uid.clone());
178            }
179
180            packages.push(pkg);
181            dependencies.extend(deps);
182        }
183    }
184
185    assemblers::run_post_assembly_passes(files, &mut packages, &mut dependencies, &topology_plan);
186    hoist_unassembled_file_dependencies(files, &mut dependencies);
187
188    for package in &mut packages {
189        package.datafile_paths.sort();
190        package.datafile_paths.dedup();
191        package.datasource_ids.sort_by_key(|left| left.to_string());
192        package.datasource_ids.dedup();
193    }
194
195    for file in files.iter_mut() {
196        file.for_packages
197            .sort_by(|left, right| left.stable_key().cmp(right.stable_key()));
198        file.for_packages.dedup();
199    }
200
201    packages
202        .sort_by(|left, right| stable_package_sort_key(left).cmp(&stable_package_sort_key(right)));
203    dependencies.sort_by(|left, right| {
204        left.purl
205            .as_deref()
206            .cmp(&right.purl.as_deref())
207            .then_with(|| {
208                left.extracted_requirement
209                    .as_deref()
210                    .cmp(&right.extracted_requirement.as_deref())
211            })
212            .then_with(|| left.scope.as_deref().cmp(&right.scope.as_deref()))
213            .then_with(|| left.datafile_path.cmp(&right.datafile_path))
214            .then_with(|| {
215                left.datasource_id
216                    .to_string()
217                    .cmp(&right.datasource_id.to_string())
218            })
219            .then_with(|| {
220                left.for_package_uid
221                    .as_ref()
222                    .map(|uid| uid.stable_key())
223                    .cmp(&right.for_package_uid.as_ref().map(|uid| uid.stable_key()))
224            })
225    });
226
227    AssemblyResult {
228        packages,
229        dependencies,
230    }
231}
232
233fn apply_directory_merge_results(
234    files: &mut [FileInfo],
235    packages: &mut Vec<Package>,
236    dependencies: &mut Vec<TopLevelDependency>,
237    results: Vec<DirectoryMergeOutput>,
238) {
239    for (package, deps, affected_indices) in results {
240        if let Some(package) = package {
241            let package_uid = package.package_uid.clone();
242            for idx in &affected_indices {
243                if !files[*idx].for_packages.contains(&package_uid) {
244                    files[*idx].for_packages.push(package_uid.clone());
245                }
246            }
247            packages.push(package);
248        }
249        dependencies.extend(deps);
250    }
251}
252
253fn hoist_unassembled_file_dependencies(
254    files: &[FileInfo],
255    dependencies: &mut Vec<TopLevelDependency>,
256) {
257    for file in files {
258        if !file.for_packages.is_empty() {
259            continue;
260        }
261
262        for pkg_data in &file.package_data {
263            let Some(datasource_id) = pkg_data.datasource_id else {
264                continue;
265            };
266
267            if !should_hoist_unassembled_dependencies(datasource_id) {
268                continue;
269            }
270
271            dependencies.extend(pkg_data.dependencies.iter().map(|dep| {
272                TopLevelDependency::from_dependency(dep, file.path.clone(), datasource_id, None)
273            }));
274        }
275    }
276}
277
278const HOIST_IF_UNOWNED_DATASOURCE_IDS: &[DatasourceId] = &[DatasourceId::PipRequirements];
279
280fn should_hoist_unassembled_dependencies(datasource_id: DatasourceId) -> bool {
281    if HOIST_IF_UNOWNED_DATASOURCE_IDS.contains(&datasource_id) {
282        return true;
283    }
284
285    if !assemblers::UNASSEMBLED_DATASOURCE_IDS.contains(&datasource_id) {
286        return false;
287    }
288
289    !matches!(
290        datasource_id,
291        DatasourceId::NugetDirectoryBuildProps | DatasourceId::NugetDirectoryPackagesProps
292    )
293}
294
295fn stable_package_sort_key(package: &Package) -> (Option<&str>, Option<&str>, Option<&str>, &str) {
296    (
297        package.purl.as_deref(),
298        package.name.as_deref(),
299        package.version.as_deref(),
300        package
301            .datafile_paths
302            .first()
303            .map(String::as_str)
304            .unwrap_or(""),
305    )
306}
307
308fn assemble_one_per_package_data(
309    config: &AssemblerConfig,
310    files: &[FileInfo],
311    file_indices: &[usize],
312) -> Vec<(Package, Vec<TopLevelDependency>, usize)> {
313    let mut results = Vec::new();
314
315    for &idx in file_indices {
316        let file = &files[idx];
317        for pkg_data in &file.package_data {
318            let dsid_matches = pkg_data
319                .datasource_id
320                .is_some_and(|dsid| config.datasource_ids.contains(&dsid));
321
322            if !dsid_matches || pkg_data.purl.is_none() {
323                continue;
324            }
325
326            let datafile_path = file.path.clone();
327            let datasource_id = pkg_data.datasource_id.expect("datasource_id must be Some");
328            let pkg = Package::from_package_data(pkg_data, datafile_path.clone());
329            let for_package_uid = Some(pkg.package_uid.clone());
330
331            let deps: Vec<TopLevelDependency> = pkg_data
332                .dependencies
333                .iter()
334                .filter(|dep| dep.purl.is_some())
335                .map(|dep| {
336                    TopLevelDependency::from_dependency(
337                        dep,
338                        datafile_path.clone(),
339                        datasource_id,
340                        for_package_uid.clone(),
341                    )
342                })
343                .collect();
344
345            results.push((pkg, deps, idx));
346        }
347    }
348
349    results
350}
351
352/// Group file indices by their parent directory path.
353fn group_files_by_directory(files: &[FileInfo]) -> HashMap<PathBuf, Vec<usize>> {
354    let mut groups: HashMap<PathBuf, Vec<usize>> = HashMap::new();
355    for (idx, file) in files.iter().enumerate() {
356        if let Some(parent) = std::path::Path::new(&file.path).parent() {
357            groups.entry(parent.to_path_buf()).or_default().push(idx);
358        }
359    }
360    groups
361}