Skip to main content

provenant/assembly/
mod.rs

1mod assemblers;
2#[cfg(test)]
3mod assembly_golden_test;
4#[cfg(test)]
5mod assembly_test;
6mod bazel_merge;
7mod bazel_prune;
8mod cargo_resource_assign;
9mod cargo_workspace_merge;
10mod composer_resource_assign;
11mod conda_rootfs_merge;
12pub mod file_ref_resolve;
13mod hackage_merge;
14mod nested_merge;
15mod npm_resource_assign;
16mod npm_workspace_merge;
17mod nuget_cpm_resolve;
18mod python_requirements_assign;
19mod ruby_resource_assign;
20mod sibling_merge;
21mod swift_merge;
22mod topology;
23
24use std::collections::{HashMap, HashSet};
25use std::path::PathBuf;
26use std::sync::LazyLock;
27
28use crate::models::{DatasourceId, FileInfo, Package, PackageUid, TopLevelDependency};
29
30pub use assemblers::ASSEMBLERS;
31
/// Output of one directory-level merge step: the assembled package (if any),
/// the dependencies to hoist, and the indices into the scanned-file slice of
/// the files that should be associated with that package.
type DirectoryMergeOutput = (Option<Package>, Vec<TopLevelDependency>, Vec<usize>);
33
34/// Pre-computed lookup: DatasourceId → config key (first DatasourceId in config).
35/// Built once on first use, avoiding HashMap allocation on every `assemble()` call.
36static ASSEMBLER_LOOKUP: LazyLock<HashMap<DatasourceId, DatasourceId>> = LazyLock::new(|| {
37    let mut lookup = HashMap::new();
38    for config in ASSEMBLERS {
39        let key = *config
40            .datasource_ids
41            .first()
42            .expect("assembler must have at least one datasource_id");
43        for &dsid in config.datasource_ids {
44            lookup.insert(dsid, key);
45        }
46    }
47    lookup
48});
49
50static ASSEMBLER_CONFIG_LOOKUP: LazyLock<HashMap<DatasourceId, &'static AssemblerConfig>> =
51    LazyLock::new(|| {
52        let mut lookup = HashMap::new();
53        for config in ASSEMBLERS {
54            let key = *config
55                .datasource_ids
56                .first()
57                .expect("assembler must have at least one datasource_id");
58            lookup.insert(key, config);
59        }
60        lookup
61    });
62
/// Result of the assembly phase: the assembled top-level packages and the
/// dependencies hoisted alongside them.
///
/// File-to-package associations are not stored here; `assemble` records them
/// in place on each `FileInfo.for_packages`.
pub struct AssemblyResult {
    /// Assembled packages, in the order left by `assemble`'s final sort.
    pub packages: Vec<Package>,
    /// Top-level dependencies, in the order left by `assemble`'s final sort.
    pub dependencies: Vec<TopLevelDependency>,
}
69
/// How an assembler groups PackageData into Packages.
///
/// `assemble` dispatches on this value for every assembler that has no
/// special directory merger; only `SiblingMerge` assemblers take part in the
/// later nested-pattern pass.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AssemblyMode {
    /// Merge related files in the same directory (or nested) into one Package.
    SiblingMerge,
    /// Each PackageData becomes its own independent Package (e.g., database files
    /// containing many installed packages like Alpine DB, RPM DB, Debian status).
    OnePerPackageData,
}
79
/// Static description of one assembler.
pub struct AssemblerConfig {
    /// Datasource ids handled by this assembler. Must be non-empty: the first
    /// entry is used as the assembler's canonical lookup key.
    pub datasource_ids: &'static [DatasourceId],
    /// File patterns associated with this assembler.
    // NOTE(review): presumably consumed by the sibling/nested merge modules to
    // find related files; not used in this module — confirm there.
    pub sibling_file_patterns: &'static [&'static str],
    /// Grouping strategy applied when no special directory merger is registered.
    pub mode: AssemblyMode,
}
85
86/// Run the assembly phase over all scanned files.
87///
88/// Groups files by parent directory, finds related manifests/lockfiles,
89/// merges them into top-level `Package` objects, and hoists dependencies.
90/// Updates each `FileInfo.for_packages` with the UIDs of packages it belongs to.
91pub fn assemble(files: &mut [FileInfo]) -> AssemblyResult {
92    let assembler_lookup = &*ASSEMBLER_LOOKUP;
93    let assembler_config_lookup = &*ASSEMBLER_CONFIG_LOOKUP;
94    let mut packages = Vec::new();
95    let mut dependencies = Vec::new();
96
97    let dir_files = group_files_by_directory(files);
98    let topology_plan = topology::TopologyPlan::build(files, &dir_files);
99
100    for file_indices in dir_files.values() {
101        let mut groups: HashSet<DatasourceId> = HashSet::new();
102
103        for &idx in file_indices {
104            for pkg_data in &files[idx].package_data {
105                if let Some(dsid) = pkg_data.datasource_id
106                    && let Some(&config_key) = assembler_lookup.get(&dsid)
107                {
108                    groups.insert(config_key);
109                }
110            }
111        }
112
113        for &config_key in &groups {
114            let config = assembler_config_lookup
115                .get(&config_key)
116                .copied()
117                .expect("assembler config must exist");
118
119            if topology_plan.claims_directory_assembly(config, file_indices, files) {
120                continue;
121            }
122
123            if let Some(special_merger) = assemblers::special_directory_merger_for(config_key) {
124                let results = special_merger.run(config, files, file_indices);
125                apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
126                continue;
127            }
128
129            match config.mode {
130                AssemblyMode::SiblingMerge => {
131                    let results = sibling_merge::assemble_siblings(config, files, file_indices)
132                        .into_iter()
133                        .collect();
134                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
135                }
136                AssemblyMode::OnePerPackageData => {
137                    let results = assemble_one_per_package_data(config, files, file_indices)
138                        .into_iter()
139                        .map(|(pkg, deps, affected_idx)| (Some(pkg), deps, vec![affected_idx]))
140                        .collect();
141                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
142                }
143            }
144        }
145    }
146
147    topology_plan.apply_directory_scoped_domains(files, &mut packages, &mut dependencies);
148
149    for config in ASSEMBLERS {
150        if config.mode != AssemblyMode::SiblingMerge {
151            continue;
152        }
153        if let Some((pkg, deps, affected_indices)) =
154            nested_merge::assemble_nested_patterns(files, config)
155        {
156            let package_uid = pkg.package_uid.clone();
157            let purl = pkg.purl.clone();
158            let removed_package_uids: Vec<PackageUid> = packages
159                .iter()
160                .filter(|p| p.purl == purl)
161                .map(|p| p.package_uid.clone())
162                .collect();
163
164            packages.retain(|p| p.purl != purl);
165            dependencies.retain(|d| {
166                d.for_package_uid.as_ref() != Some(&package_uid)
167                    && !removed_package_uids
168                        .iter()
169                        .any(|old_uid| d.for_package_uid.as_ref() == Some(old_uid))
170            });
171
172            for idx in &affected_indices {
173                files[*idx].for_packages.clear();
174                files[*idx].for_packages.push(package_uid.clone());
175            }
176
177            packages.push(pkg);
178            dependencies.extend(deps);
179        }
180    }
181
182    assemblers::run_post_assembly_passes(files, &mut packages, &mut dependencies, &topology_plan);
183    hoist_unassembled_file_dependencies(files, &mut dependencies);
184
185    for package in &mut packages {
186        package.datafile_paths.sort();
187        package.datafile_paths.dedup();
188        package.datasource_ids.sort_by_key(|left| left.to_string());
189        package.datasource_ids.dedup();
190    }
191
192    for file in files.iter_mut() {
193        file.for_packages
194            .sort_by(|left, right| left.stable_key().cmp(right.stable_key()));
195        file.for_packages.dedup();
196    }
197
198    packages
199        .sort_by(|left, right| stable_package_sort_key(left).cmp(&stable_package_sort_key(right)));
200    dependencies.sort_by(|left, right| {
201        left.purl
202            .as_deref()
203            .cmp(&right.purl.as_deref())
204            .then_with(|| {
205                left.extracted_requirement
206                    .as_deref()
207                    .cmp(&right.extracted_requirement.as_deref())
208            })
209            .then_with(|| left.scope.as_deref().cmp(&right.scope.as_deref()))
210            .then_with(|| left.datafile_path.cmp(&right.datafile_path))
211            .then_with(|| {
212                left.datasource_id
213                    .to_string()
214                    .cmp(&right.datasource_id.to_string())
215            })
216            .then_with(|| {
217                left.for_package_uid
218                    .as_ref()
219                    .map(|uid| uid.stable_key())
220                    .cmp(&right.for_package_uid.as_ref().map(|uid| uid.stable_key()))
221            })
222    });
223
224    AssemblyResult {
225        packages,
226        dependencies,
227    }
228}
229
230fn apply_directory_merge_results(
231    files: &mut [FileInfo],
232    packages: &mut Vec<Package>,
233    dependencies: &mut Vec<TopLevelDependency>,
234    results: Vec<DirectoryMergeOutput>,
235) {
236    for (package, deps, affected_indices) in results {
237        if let Some(package) = package {
238            let package_uid = package.package_uid.clone();
239            for idx in &affected_indices {
240                if !files[*idx].for_packages.contains(&package_uid) {
241                    files[*idx].for_packages.push(package_uid.clone());
242                }
243            }
244            packages.push(package);
245        }
246        dependencies.extend(deps);
247    }
248}
249
250fn hoist_unassembled_file_dependencies(
251    files: &[FileInfo],
252    dependencies: &mut Vec<TopLevelDependency>,
253) {
254    for file in files {
255        if !file.for_packages.is_empty() {
256            continue;
257        }
258
259        for pkg_data in &file.package_data {
260            let Some(datasource_id) = pkg_data.datasource_id else {
261                continue;
262            };
263
264            if !should_hoist_unassembled_dependencies(datasource_id) {
265                continue;
266            }
267
268            dependencies.extend(pkg_data.dependencies.iter().map(|dep| {
269                TopLevelDependency::from_dependency(dep, file.path.clone(), datasource_id, None)
270            }));
271        }
272    }
273}
274
/// Datasource ids whose dependencies are always hoisted from a file that
/// belongs to no package, regardless of `assemblers::UNASSEMBLED_DATASOURCE_IDS`.
const HOIST_IF_UNOWNED_DATASOURCE_IDS: &[DatasourceId] = &[DatasourceId::PipRequirements];
276
277fn should_hoist_unassembled_dependencies(datasource_id: DatasourceId) -> bool {
278    if HOIST_IF_UNOWNED_DATASOURCE_IDS.contains(&datasource_id) {
279        return true;
280    }
281
282    if !assemblers::UNASSEMBLED_DATASOURCE_IDS.contains(&datasource_id) {
283        return false;
284    }
285
286    !matches!(
287        datasource_id,
288        DatasourceId::NugetDirectoryBuildProps | DatasourceId::NugetDirectoryPackagesProps
289    )
290}
291
292fn stable_package_sort_key(package: &Package) -> (Option<&str>, Option<&str>, Option<&str>, &str) {
293    (
294        package.purl.as_deref(),
295        package.name.as_deref(),
296        package.version.as_deref(),
297        package
298            .datafile_paths
299            .first()
300            .map(String::as_str)
301            .unwrap_or(""),
302    )
303}
304
305fn assemble_one_per_package_data(
306    config: &AssemblerConfig,
307    files: &[FileInfo],
308    file_indices: &[usize],
309) -> Vec<(Package, Vec<TopLevelDependency>, usize)> {
310    let mut results = Vec::new();
311
312    for &idx in file_indices {
313        let file = &files[idx];
314        for pkg_data in &file.package_data {
315            let dsid_matches = pkg_data
316                .datasource_id
317                .is_some_and(|dsid| config.datasource_ids.contains(&dsid));
318
319            if !dsid_matches || pkg_data.purl.is_none() {
320                continue;
321            }
322
323            let datafile_path = file.path.clone();
324            let datasource_id = pkg_data.datasource_id.expect("datasource_id must be Some");
325            let pkg = Package::from_package_data(pkg_data, datafile_path.clone());
326            let for_package_uid = Some(pkg.package_uid.clone());
327
328            let deps: Vec<TopLevelDependency> = pkg_data
329                .dependencies
330                .iter()
331                .filter(|dep| dep.purl.is_some())
332                .map(|dep| {
333                    TopLevelDependency::from_dependency(
334                        dep,
335                        datafile_path.clone(),
336                        datasource_id,
337                        for_package_uid.clone(),
338                    )
339                })
340                .collect();
341
342            results.push((pkg, deps, idx));
343        }
344    }
345
346    results
347}
348
349/// Group file indices by their parent directory path.
350fn group_files_by_directory(files: &[FileInfo]) -> HashMap<PathBuf, Vec<usize>> {
351    let mut groups: HashMap<PathBuf, Vec<usize>> = HashMap::new();
352    for (idx, file) in files.iter().enumerate() {
353        if let Some(parent) = std::path::Path::new(&file.path).parent() {
354            groups.entry(parent.to_path_buf()).or_default().push(idx);
355        }
356    }
357    groups
358}