Skip to main content

provenant/assembly/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4mod assemblers;
5#[cfg(test)]
6mod assembly_test;
7mod bazel_merge;
8mod bazel_prune;
9mod cargo_resource_assign;
10mod cargo_workspace_merge;
11mod composer_resource_assign;
12mod conda_rootfs_merge;
13mod debian_source_merge;
14pub mod file_ref_resolve;
15mod hackage_merge;
16mod nested_merge;
17mod nix_flake_compat_merge;
18mod npm_resource_assign;
19mod npm_workspace_merge;
20mod nuget_cpm_resolve;
21mod python_requirements_assign;
22mod ruby_resource_assign;
23mod sibling_merge;
24mod swift_merge;
25mod topology;
26mod windows_update_merge;
27
28use std::collections::{HashMap, HashSet};
29use std::path::PathBuf;
30use std::sync::LazyLock;
31
32use crate::models::{DatasourceId, FileInfo, Package, PackageUid, TopLevelDependency};
33
34pub use assemblers::ASSEMBLERS;
35
36type DirectoryMergeOutput = (Option<Package>, Vec<TopLevelDependency>, Vec<usize>);
37
38/// Pre-computed lookup: DatasourceId → config key (first DatasourceId in config).
39/// Built once on first use, avoiding HashMap allocation on every `assemble()` call.
40static ASSEMBLER_LOOKUP: LazyLock<HashMap<DatasourceId, DatasourceId>> = LazyLock::new(|| {
41    let mut lookup = HashMap::new();
42    for config in ASSEMBLERS {
43        let key = *config
44            .datasource_ids
45            .first()
46            .expect("assembler must have at least one datasource_id");
47        for &dsid in config.datasource_ids {
48            lookup.insert(dsid, key);
49        }
50    }
51    lookup
52});
53
54static ASSEMBLER_CONFIG_LOOKUP: LazyLock<HashMap<DatasourceId, &'static AssemblerConfig>> =
55    LazyLock::new(|| {
56        let mut lookup = HashMap::new();
57        for config in ASSEMBLERS {
58            let key = *config
59                .datasource_ids
60                .first()
61                .expect("assembler must have at least one datasource_id");
62            lookup.insert(key, config);
63        }
64        lookup
65    });
66
67/// Result of the assembly phase: top-level packages and dependencies,
68/// plus updated file-to-package associations.
69pub struct AssemblyResult {
70    pub packages: Vec<Package>,
71    pub dependencies: Vec<TopLevelDependency>,
72}
73
/// Strategy an assembler uses to turn `PackageData` records into `Package`s.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AssemblyMode {
    /// Related files in the same directory (or nested below it) are merged
    /// into a single `Package`.
    SiblingMerge,
    /// Every `PackageData` record becomes its own independent `Package`
    /// (e.g. database files listing many installed packages, such as the
    /// Alpine DB, RPM DB, or Debian status file).
    OnePerPackageData,
}
83
84pub struct AssemblerConfig {
85    pub datasource_ids: &'static [DatasourceId],
86    pub sibling_file_patterns: &'static [&'static str],
87    pub mode: AssemblyMode,
88}
89
90/// Run the assembly phase over all scanned files.
91///
92/// Groups files by parent directory, finds related manifests/lockfiles,
93/// merges them into top-level `Package` objects, and hoists dependencies.
94/// Updates each `FileInfo.for_packages` with the UIDs of packages it belongs to.
95pub fn assemble(files: &mut [FileInfo]) -> AssemblyResult {
96    let assembler_lookup = &*ASSEMBLER_LOOKUP;
97    let assembler_config_lookup = &*ASSEMBLER_CONFIG_LOOKUP;
98    let mut packages = Vec::new();
99    let mut dependencies = Vec::new();
100
101    let dir_files = group_files_by_directory(files);
102    let topology_plan = topology::TopologyPlan::build(files, &dir_files);
103
104    for file_indices in dir_files.values() {
105        let mut groups: HashSet<DatasourceId> = HashSet::new();
106
107        for &idx in file_indices {
108            for pkg_data in &files[idx].package_data {
109                if let Some(dsid) = pkg_data.datasource_id
110                    && let Some(&config_key) = assembler_lookup.get(&dsid)
111                {
112                    groups.insert(config_key);
113                }
114            }
115        }
116
117        for &config_key in &groups {
118            let config = assembler_config_lookup
119                .get(&config_key)
120                .copied()
121                .expect("assembler config must exist");
122
123            if topology_plan.claims_directory_assembly(config, file_indices, files) {
124                continue;
125            }
126
127            if let Some(special_merger) = assemblers::special_directory_merger_for(config_key) {
128                let results = special_merger.run(config, files, file_indices);
129                apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
130                continue;
131            }
132
133            match config.mode {
134                AssemblyMode::SiblingMerge => {
135                    let results = sibling_merge::assemble_siblings(config, files, file_indices);
136                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
137                }
138                AssemblyMode::OnePerPackageData => {
139                    let results = assemble_one_per_package_data(config, files, file_indices)
140                        .into_iter()
141                        .map(|(pkg, deps, affected_idx)| (Some(pkg), deps, vec![affected_idx]))
142                        .collect();
143                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
144                }
145            }
146        }
147    }
148
149    topology_plan.apply_directory_scoped_domains(files, &mut packages, &mut dependencies);
150
151    for config in ASSEMBLERS {
152        if config.mode != AssemblyMode::SiblingMerge {
153            continue;
154        }
155        if let Some((pkg, deps, affected_indices)) =
156            nested_merge::assemble_nested_patterns(files, config)
157        {
158            let package_uid = pkg.package_uid.clone();
159            let purl = pkg.purl.clone();
160            let removed_package_uids: Vec<PackageUid> = packages
161                .iter()
162                .filter(|p| p.purl == purl)
163                .map(|p| p.package_uid.clone())
164                .collect();
165
166            packages.retain(|p| p.purl != purl);
167            dependencies.retain(|d| {
168                d.for_package_uid.as_ref() != Some(&package_uid)
169                    && !removed_package_uids
170                        .iter()
171                        .any(|old_uid| d.for_package_uid.as_ref() == Some(old_uid))
172            });
173
174            for idx in &affected_indices {
175                files[*idx].for_packages.clear();
176                files[*idx].for_packages.push(package_uid.clone());
177            }
178
179            packages.push(pkg);
180            dependencies.extend(deps);
181        }
182    }
183
184    assemblers::run_post_assembly_passes(files, &mut packages, &mut dependencies, &topology_plan);
185    hoist_unassembled_file_dependencies(files, &mut dependencies);
186
187    for package in &mut packages {
188        package.datafile_paths.sort();
189        package.datafile_paths.dedup();
190        package.datasource_ids.sort_by_key(|left| left.to_string());
191        package.datasource_ids.dedup();
192    }
193
194    for file in files.iter_mut() {
195        file.for_packages
196            .sort_by(|left, right| left.stable_key().cmp(right.stable_key()));
197        file.for_packages.dedup();
198    }
199
200    packages
201        .sort_by(|left, right| stable_package_sort_key(left).cmp(&stable_package_sort_key(right)));
202    dependencies.sort_by(|left, right| {
203        left.purl
204            .as_deref()
205            .cmp(&right.purl.as_deref())
206            .then_with(|| {
207                left.extracted_requirement
208                    .as_deref()
209                    .cmp(&right.extracted_requirement.as_deref())
210            })
211            .then_with(|| left.scope.as_deref().cmp(&right.scope.as_deref()))
212            .then_with(|| left.datafile_path.cmp(&right.datafile_path))
213            .then_with(|| {
214                left.datasource_id
215                    .to_string()
216                    .cmp(&right.datasource_id.to_string())
217            })
218            .then_with(|| {
219                left.for_package_uid
220                    .as_ref()
221                    .map(|uid| uid.stable_key())
222                    .cmp(&right.for_package_uid.as_ref().map(|uid| uid.stable_key()))
223            })
224    });
225
226    AssemblyResult {
227        packages,
228        dependencies,
229    }
230}
231
232fn apply_directory_merge_results(
233    files: &mut [FileInfo],
234    packages: &mut Vec<Package>,
235    dependencies: &mut Vec<TopLevelDependency>,
236    results: Vec<DirectoryMergeOutput>,
237) {
238    for (package, deps, affected_indices) in results {
239        if let Some(package) = package {
240            let package_uid = package.package_uid.clone();
241            for idx in &affected_indices {
242                if !files[*idx].for_packages.contains(&package_uid) {
243                    files[*idx].for_packages.push(package_uid.clone());
244                }
245            }
246            packages.push(package);
247        }
248        dependencies.extend(deps);
249    }
250}
251
252fn hoist_unassembled_file_dependencies(
253    files: &[FileInfo],
254    dependencies: &mut Vec<TopLevelDependency>,
255) {
256    for file in files {
257        if !file.for_packages.is_empty() {
258            continue;
259        }
260
261        for pkg_data in &file.package_data {
262            let Some(datasource_id) = pkg_data.datasource_id else {
263                continue;
264            };
265
266            if !should_hoist_unassembled_dependencies(datasource_id) {
267                continue;
268            }
269
270            dependencies.extend(pkg_data.dependencies.iter().map(|dep| {
271                TopLevelDependency::from_dependency(dep, file.path.clone(), datasource_id, None)
272            }));
273        }
274    }
275}
276
277const HOIST_IF_UNOWNED_DATASOURCE_IDS: &[DatasourceId] = &[DatasourceId::PipRequirements];
278
279fn should_hoist_unassembled_dependencies(datasource_id: DatasourceId) -> bool {
280    if HOIST_IF_UNOWNED_DATASOURCE_IDS.contains(&datasource_id) {
281        return true;
282    }
283
284    if !assemblers::UNASSEMBLED_DATASOURCE_IDS.contains(&datasource_id) {
285        return false;
286    }
287
288    !matches!(
289        datasource_id,
290        DatasourceId::NugetDirectoryBuildProps | DatasourceId::NugetDirectoryPackagesProps
291    )
292}
293
294fn stable_package_sort_key(package: &Package) -> (Option<&str>, Option<&str>, Option<&str>, &str) {
295    (
296        package.purl.as_deref(),
297        package.name.as_deref(),
298        package.version.as_deref(),
299        package
300            .datafile_paths
301            .first()
302            .map(String::as_str)
303            .unwrap_or(""),
304    )
305}
306
307fn assemble_one_per_package_data(
308    config: &AssemblerConfig,
309    files: &[FileInfo],
310    file_indices: &[usize],
311) -> Vec<(Package, Vec<TopLevelDependency>, usize)> {
312    let mut results = Vec::new();
313
314    for &idx in file_indices {
315        let file = &files[idx];
316        for pkg_data in &file.package_data {
317            let dsid_matches = pkg_data
318                .datasource_id
319                .is_some_and(|dsid| config.datasource_ids.contains(&dsid));
320
321            if !dsid_matches
322                || pkg_data.purl.is_none()
323                || should_skip_placeholder_only_cocoapods_podspec(pkg_data)
324            {
325                continue;
326            }
327
328            let datafile_path = file.path.clone();
329            let datasource_id = pkg_data.datasource_id.expect("datasource_id must be Some");
330            let pkg = Package::from_package_data(pkg_data, datafile_path.clone());
331            let for_package_uid = Some(pkg.package_uid.clone());
332
333            let deps: Vec<TopLevelDependency> = pkg_data
334                .dependencies
335                .iter()
336                .filter(|dep| dep.purl.is_some() || dep.extracted_requirement.is_some())
337                .map(|dep| {
338                    TopLevelDependency::from_dependency(
339                        dep,
340                        datafile_path.clone(),
341                        datasource_id,
342                        for_package_uid.clone(),
343                    )
344                })
345                .collect();
346
347            results.push((pkg, deps, idx));
348        }
349    }
350
351    results
352}
353
354pub(super) fn should_skip_placeholder_only_cocoapods_podspec(
355    pkg_data: &crate::models::PackageData,
356) -> bool {
357    pkg_data.datasource_id == Some(DatasourceId::CocoapodsPodspec)
358        && pkg_data
359            .extra_data
360            .as_ref()
361            .and_then(|data| data.get("dynamic_identity_placeholders"))
362            .and_then(|value| value.as_bool())
363            == Some(true)
364}
365
366/// Group file indices by their parent directory path.
367fn group_files_by_directory(files: &[FileInfo]) -> HashMap<PathBuf, Vec<usize>> {
368    let mut groups: HashMap<PathBuf, Vec<usize>> = HashMap::new();
369    for (idx, file) in files.iter().enumerate() {
370        if let Some(parent) = std::path::Path::new(&file.path).parent() {
371            groups.entry(parent.to_path_buf()).or_default().push(idx);
372        }
373    }
374    groups
375}