// provenant/assembly/mod.rs

1mod assemblers;
2#[cfg(all(test, feature = "golden-tests"))]
3mod assembly_golden_test;
4#[cfg(test)]
5mod assembly_test;
6mod bazel_merge;
7mod bazel_prune;
8mod cargo_resource_assign;
9mod cargo_workspace_merge;
10mod composer_resource_assign;
11mod conda_rootfs_merge;
12pub mod file_ref_resolve;
13mod hackage_merge;
14mod nested_merge;
15mod nix_flake_compat_merge;
16mod npm_resource_assign;
17mod npm_workspace_merge;
18mod nuget_cpm_resolve;
19mod python_requirements_assign;
20mod ruby_resource_assign;
21mod sibling_merge;
22mod swift_merge;
23mod topology;
24
25use std::collections::{HashMap, HashSet};
26use std::path::PathBuf;
27use std::sync::LazyLock;
28
29use crate::models::{DatasourceId, FileInfo, Package, PackageUid, TopLevelDependency};
30
31pub use assemblers::ASSEMBLERS;
32
33type DirectoryMergeOutput = (Option<Package>, Vec<TopLevelDependency>, Vec<usize>);
34
35/// Pre-computed lookup: DatasourceId → config key (first DatasourceId in config).
36/// Built once on first use, avoiding HashMap allocation on every `assemble()` call.
37static ASSEMBLER_LOOKUP: LazyLock<HashMap<DatasourceId, DatasourceId>> = LazyLock::new(|| {
38    let mut lookup = HashMap::new();
39    for config in ASSEMBLERS {
40        let key = *config
41            .datasource_ids
42            .first()
43            .expect("assembler must have at least one datasource_id");
44        for &dsid in config.datasource_ids {
45            lookup.insert(dsid, key);
46        }
47    }
48    lookup
49});
50
51static ASSEMBLER_CONFIG_LOOKUP: LazyLock<HashMap<DatasourceId, &'static AssemblerConfig>> =
52    LazyLock::new(|| {
53        let mut lookup = HashMap::new();
54        for config in ASSEMBLERS {
55            let key = *config
56                .datasource_ids
57                .first()
58                .expect("assembler must have at least one datasource_id");
59            lookup.insert(key, config);
60        }
61        lookup
62    });
63
64/// Result of the assembly phase: top-level packages and dependencies,
65/// plus updated file-to-package associations.
66pub struct AssemblyResult {
67    pub packages: Vec<Package>,
68    pub dependencies: Vec<TopLevelDependency>,
69}
70
/// Strategy an assembler uses to turn `PackageData` entries into `Package`s.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum AssemblyMode {
    /// Related files that live in the same directory (or are nested) are
    /// merged into a single `Package`.
    SiblingMerge,
    /// Every `PackageData` yields its own independent `Package`. Used for
    /// database-style files that list many installed packages, such as the
    /// Alpine DB, RPM DB, or Debian status file.
    OnePerPackageData,
}
80
81pub struct AssemblerConfig {
82    pub datasource_ids: &'static [DatasourceId],
83    pub sibling_file_patterns: &'static [&'static str],
84    pub mode: AssemblyMode,
85}
86
87/// Run the assembly phase over all scanned files.
88///
89/// Groups files by parent directory, finds related manifests/lockfiles,
90/// merges them into top-level `Package` objects, and hoists dependencies.
91/// Updates each `FileInfo.for_packages` with the UIDs of packages it belongs to.
92pub fn assemble(files: &mut [FileInfo]) -> AssemblyResult {
93    let assembler_lookup = &*ASSEMBLER_LOOKUP;
94    let assembler_config_lookup = &*ASSEMBLER_CONFIG_LOOKUP;
95    let mut packages = Vec::new();
96    let mut dependencies = Vec::new();
97
98    let dir_files = group_files_by_directory(files);
99    let topology_plan = topology::TopologyPlan::build(files, &dir_files);
100
101    for file_indices in dir_files.values() {
102        let mut groups: HashSet<DatasourceId> = HashSet::new();
103
104        for &idx in file_indices {
105            for pkg_data in &files[idx].package_data {
106                if let Some(dsid) = pkg_data.datasource_id
107                    && let Some(&config_key) = assembler_lookup.get(&dsid)
108                {
109                    groups.insert(config_key);
110                }
111            }
112        }
113
114        for &config_key in &groups {
115            let config = assembler_config_lookup
116                .get(&config_key)
117                .copied()
118                .expect("assembler config must exist");
119
120            if topology_plan.claims_directory_assembly(config, file_indices, files) {
121                continue;
122            }
123
124            if let Some(special_merger) = assemblers::special_directory_merger_for(config_key) {
125                let results = special_merger.run(config, files, file_indices);
126                apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
127                continue;
128            }
129
130            match config.mode {
131                AssemblyMode::SiblingMerge => {
132                    let results = sibling_merge::assemble_siblings(config, files, file_indices)
133                        .into_iter()
134                        .collect();
135                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
136                }
137                AssemblyMode::OnePerPackageData => {
138                    let results = assemble_one_per_package_data(config, files, file_indices)
139                        .into_iter()
140                        .map(|(pkg, deps, affected_idx)| (Some(pkg), deps, vec![affected_idx]))
141                        .collect();
142                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
143                }
144            }
145        }
146    }
147
148    topology_plan.apply_directory_scoped_domains(files, &mut packages, &mut dependencies);
149
150    for config in ASSEMBLERS {
151        if config.mode != AssemblyMode::SiblingMerge {
152            continue;
153        }
154        if let Some((pkg, deps, affected_indices)) =
155            nested_merge::assemble_nested_patterns(files, config)
156        {
157            let package_uid = pkg.package_uid.clone();
158            let purl = pkg.purl.clone();
159            let removed_package_uids: Vec<PackageUid> = packages
160                .iter()
161                .filter(|p| p.purl == purl)
162                .map(|p| p.package_uid.clone())
163                .collect();
164
165            packages.retain(|p| p.purl != purl);
166            dependencies.retain(|d| {
167                d.for_package_uid.as_ref() != Some(&package_uid)
168                    && !removed_package_uids
169                        .iter()
170                        .any(|old_uid| d.for_package_uid.as_ref() == Some(old_uid))
171            });
172
173            for idx in &affected_indices {
174                files[*idx].for_packages.clear();
175                files[*idx].for_packages.push(package_uid.clone());
176            }
177
178            packages.push(pkg);
179            dependencies.extend(deps);
180        }
181    }
182
183    assemblers::run_post_assembly_passes(files, &mut packages, &mut dependencies, &topology_plan);
184    hoist_unassembled_file_dependencies(files, &mut dependencies);
185
186    for package in &mut packages {
187        package.datafile_paths.sort();
188        package.datafile_paths.dedup();
189        package.datasource_ids.sort_by_key(|left| left.to_string());
190        package.datasource_ids.dedup();
191    }
192
193    for file in files.iter_mut() {
194        file.for_packages
195            .sort_by(|left, right| left.stable_key().cmp(right.stable_key()));
196        file.for_packages.dedup();
197    }
198
199    packages
200        .sort_by(|left, right| stable_package_sort_key(left).cmp(&stable_package_sort_key(right)));
201    dependencies.sort_by(|left, right| {
202        left.purl
203            .as_deref()
204            .cmp(&right.purl.as_deref())
205            .then_with(|| {
206                left.extracted_requirement
207                    .as_deref()
208                    .cmp(&right.extracted_requirement.as_deref())
209            })
210            .then_with(|| left.scope.as_deref().cmp(&right.scope.as_deref()))
211            .then_with(|| left.datafile_path.cmp(&right.datafile_path))
212            .then_with(|| {
213                left.datasource_id
214                    .to_string()
215                    .cmp(&right.datasource_id.to_string())
216            })
217            .then_with(|| {
218                left.for_package_uid
219                    .as_ref()
220                    .map(|uid| uid.stable_key())
221                    .cmp(&right.for_package_uid.as_ref().map(|uid| uid.stable_key()))
222            })
223    });
224
225    AssemblyResult {
226        packages,
227        dependencies,
228    }
229}
230
231fn apply_directory_merge_results(
232    files: &mut [FileInfo],
233    packages: &mut Vec<Package>,
234    dependencies: &mut Vec<TopLevelDependency>,
235    results: Vec<DirectoryMergeOutput>,
236) {
237    for (package, deps, affected_indices) in results {
238        if let Some(package) = package {
239            let package_uid = package.package_uid.clone();
240            for idx in &affected_indices {
241                if !files[*idx].for_packages.contains(&package_uid) {
242                    files[*idx].for_packages.push(package_uid.clone());
243                }
244            }
245            packages.push(package);
246        }
247        dependencies.extend(deps);
248    }
249}
250
251fn hoist_unassembled_file_dependencies(
252    files: &[FileInfo],
253    dependencies: &mut Vec<TopLevelDependency>,
254) {
255    for file in files {
256        if !file.for_packages.is_empty() {
257            continue;
258        }
259
260        for pkg_data in &file.package_data {
261            let Some(datasource_id) = pkg_data.datasource_id else {
262                continue;
263            };
264
265            if !should_hoist_unassembled_dependencies(datasource_id) {
266                continue;
267            }
268
269            dependencies.extend(pkg_data.dependencies.iter().map(|dep| {
270                TopLevelDependency::from_dependency(dep, file.path.clone(), datasource_id, None)
271            }));
272        }
273    }
274}
275
276const HOIST_IF_UNOWNED_DATASOURCE_IDS: &[DatasourceId] = &[DatasourceId::PipRequirements];
277
278fn should_hoist_unassembled_dependencies(datasource_id: DatasourceId) -> bool {
279    if HOIST_IF_UNOWNED_DATASOURCE_IDS.contains(&datasource_id) {
280        return true;
281    }
282
283    if !assemblers::UNASSEMBLED_DATASOURCE_IDS.contains(&datasource_id) {
284        return false;
285    }
286
287    !matches!(
288        datasource_id,
289        DatasourceId::NugetDirectoryBuildProps | DatasourceId::NugetDirectoryPackagesProps
290    )
291}
292
293fn stable_package_sort_key(package: &Package) -> (Option<&str>, Option<&str>, Option<&str>, &str) {
294    (
295        package.purl.as_deref(),
296        package.name.as_deref(),
297        package.version.as_deref(),
298        package
299            .datafile_paths
300            .first()
301            .map(String::as_str)
302            .unwrap_or(""),
303    )
304}
305
306fn assemble_one_per_package_data(
307    config: &AssemblerConfig,
308    files: &[FileInfo],
309    file_indices: &[usize],
310) -> Vec<(Package, Vec<TopLevelDependency>, usize)> {
311    let mut results = Vec::new();
312
313    for &idx in file_indices {
314        let file = &files[idx];
315        for pkg_data in &file.package_data {
316            let dsid_matches = pkg_data
317                .datasource_id
318                .is_some_and(|dsid| config.datasource_ids.contains(&dsid));
319
320            if !dsid_matches || pkg_data.purl.is_none() {
321                continue;
322            }
323
324            let datafile_path = file.path.clone();
325            let datasource_id = pkg_data.datasource_id.expect("datasource_id must be Some");
326            let pkg = Package::from_package_data(pkg_data, datafile_path.clone());
327            let for_package_uid = Some(pkg.package_uid.clone());
328
329            let deps: Vec<TopLevelDependency> = pkg_data
330                .dependencies
331                .iter()
332                .filter(|dep| dep.purl.is_some())
333                .map(|dep| {
334                    TopLevelDependency::from_dependency(
335                        dep,
336                        datafile_path.clone(),
337                        datasource_id,
338                        for_package_uid.clone(),
339                    )
340                })
341                .collect();
342
343            results.push((pkg, deps, idx));
344        }
345    }
346
347    results
348}
349
350/// Group file indices by their parent directory path.
351fn group_files_by_directory(files: &[FileInfo]) -> HashMap<PathBuf, Vec<usize>> {
352    let mut groups: HashMap<PathBuf, Vec<usize>> = HashMap::new();
353    for (idx, file) in files.iter().enumerate() {
354        if let Some(parent) = std::path::Path::new(&file.path).parent() {
355            groups.entry(parent.to_path_buf()).or_default().push(idx);
356        }
357    }
358    groups
359}