Skip to main content

provenant/assembly/
mod.rs

1mod assemblers;
2#[cfg(test)]
3mod assembly_golden_test;
4#[cfg(test)]
5mod assembly_test;
6mod cargo_resource_assign;
7mod cargo_workspace_merge;
8mod composer_resource_assign;
9mod conda_rootfs_merge;
10pub mod file_ref_resolve;
11mod hackage_merge;
12mod nested_merge;
13mod npm_resource_assign;
14mod npm_workspace_merge;
15mod nuget_cpm_resolve;
16mod python_requirements_assign;
17mod ruby_resource_assign;
18mod sibling_merge;
19mod swift_merge;
20
21use std::collections::{HashMap, HashSet};
22use std::path::PathBuf;
23use std::sync::LazyLock;
24
25use crate::models::{DatasourceId, FileInfo, Package, TopLevelDependency};
26
27pub use assemblers::ASSEMBLERS;
28
29type DirectoryMergeOutput = (Option<Package>, Vec<TopLevelDependency>, Vec<usize>);
30
31/// Pre-computed lookup: DatasourceId → config key (first DatasourceId in config).
32/// Built once on first use, avoiding HashMap allocation on every `assemble()` call.
33static ASSEMBLER_LOOKUP: LazyLock<HashMap<DatasourceId, DatasourceId>> = LazyLock::new(|| {
34    let mut lookup = HashMap::new();
35    for config in ASSEMBLERS {
36        let key = *config
37            .datasource_ids
38            .first()
39            .expect("assembler must have at least one datasource_id");
40        for &dsid in config.datasource_ids {
41            lookup.insert(dsid, key);
42        }
43    }
44    lookup
45});
46
47static ASSEMBLER_CONFIG_LOOKUP: LazyLock<HashMap<DatasourceId, &'static AssemblerConfig>> =
48    LazyLock::new(|| {
49        let mut lookup = HashMap::new();
50        for config in ASSEMBLERS {
51            let key = *config
52                .datasource_ids
53                .first()
54                .expect("assembler must have at least one datasource_id");
55            lookup.insert(key, config);
56        }
57        lookup
58    });
59
60/// Result of the assembly phase: top-level packages and dependencies,
61/// plus updated file-to-package associations.
62#[derive(serde::Serialize)]
63pub struct AssemblyResult {
64    pub packages: Vec<Package>,
65    pub dependencies: Vec<TopLevelDependency>,
66}
67
/// Strategy an assembler uses to turn `PackageData` entries into `Package`s.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AssemblyMode {
    /// Related files found in the same directory (or nested below it) are
    /// merged into a single `Package`.
    SiblingMerge,
    /// Every `PackageData` entry yields its own independent `Package`. Used
    /// for files that describe many installed packages at once, such as the
    /// Alpine DB, the RPM DB, or the Debian status file.
    OnePerPackageData,
}
77
78pub struct AssemblerConfig {
79    pub datasource_ids: &'static [DatasourceId],
80    pub sibling_file_patterns: &'static [&'static str],
81    pub mode: AssemblyMode,
82}
83
84/// Run the assembly phase over all scanned files.
85///
86/// Groups files by parent directory, finds related manifests/lockfiles,
87/// merges them into top-level `Package` objects, and hoists dependencies.
88/// Updates each `FileInfo.for_packages` with the UIDs of packages it belongs to.
89pub fn assemble(files: &mut [FileInfo]) -> AssemblyResult {
90    let assembler_lookup = &*ASSEMBLER_LOOKUP;
91    let assembler_config_lookup = &*ASSEMBLER_CONFIG_LOOKUP;
92    let mut packages = Vec::new();
93    let mut dependencies = Vec::new();
94
95    let dir_files = group_files_by_directory(files);
96
97    for file_indices in dir_files.values() {
98        let mut groups: HashSet<DatasourceId> = HashSet::new();
99
100        for &idx in file_indices {
101            for pkg_data in &files[idx].package_data {
102                if let Some(dsid) = pkg_data.datasource_id
103                    && let Some(&config_key) = assembler_lookup.get(&dsid)
104                {
105                    groups.insert(config_key);
106                }
107            }
108        }
109
110        for &config_key in &groups {
111            let config = assembler_config_lookup
112                .get(&config_key)
113                .copied()
114                .expect("assembler config must exist");
115
116            if let Some(special_merger) = assemblers::special_directory_merger_for(config_key) {
117                let results = special_merger.run(files, file_indices);
118                apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
119                continue;
120            }
121
122            match config.mode {
123                AssemblyMode::SiblingMerge => {
124                    let results = sibling_merge::assemble_siblings(config, files, file_indices)
125                        .into_iter()
126                        .collect();
127                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
128                }
129                AssemblyMode::OnePerPackageData => {
130                    let results = assemble_one_per_package_data(config, files, file_indices)
131                        .into_iter()
132                        .map(|(pkg, deps, affected_idx)| (Some(pkg), deps, vec![affected_idx]))
133                        .collect();
134                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
135                }
136            }
137        }
138    }
139
140    for config in ASSEMBLERS {
141        if config.mode != AssemblyMode::SiblingMerge {
142            continue;
143        }
144        if let Some((pkg, deps, affected_indices)) =
145            nested_merge::assemble_nested_patterns(files, config)
146        {
147            let package_uid = pkg.package_uid.clone();
148            let purl = pkg.purl.clone();
149            let removed_package_uids: Vec<String> = packages
150                .iter()
151                .filter(|p| p.purl == purl)
152                .map(|p| p.package_uid.clone())
153                .collect();
154
155            packages.retain(|p| p.purl != purl);
156            dependencies.retain(|d| {
157                d.for_package_uid.as_ref() != Some(&package_uid)
158                    && !removed_package_uids
159                        .iter()
160                        .any(|old_uid| d.for_package_uid.as_ref() == Some(old_uid))
161            });
162
163            for idx in &affected_indices {
164                files[*idx].for_packages.clear();
165                files[*idx].for_packages.push(package_uid.clone());
166            }
167
168            packages.push(pkg);
169            dependencies.extend(deps);
170        }
171    }
172
173    assemblers::run_post_assembly_passes(files, &mut packages, &mut dependencies);
174    hoist_unassembled_file_dependencies(files, &mut dependencies);
175
176    for package in &mut packages {
177        package.datafile_paths.sort();
178        package.datafile_paths.dedup();
179        package.datasource_ids.sort_by_key(|left| left.to_string());
180        package.datasource_ids.dedup();
181    }
182
183    for file in files.iter_mut() {
184        file.for_packages
185            .sort_by(|left, right| stable_uid_key(left).cmp(stable_uid_key(right)));
186        file.for_packages.dedup();
187    }
188
189    packages
190        .sort_by(|left, right| stable_package_sort_key(left).cmp(&stable_package_sort_key(right)));
191    dependencies.sort_by(|left, right| {
192        left.purl
193            .as_deref()
194            .cmp(&right.purl.as_deref())
195            .then_with(|| {
196                left.extracted_requirement
197                    .as_deref()
198                    .cmp(&right.extracted_requirement.as_deref())
199            })
200            .then_with(|| left.scope.as_deref().cmp(&right.scope.as_deref()))
201            .then_with(|| left.datafile_path.cmp(&right.datafile_path))
202            .then_with(|| {
203                left.datasource_id
204                    .to_string()
205                    .cmp(&right.datasource_id.to_string())
206            })
207            .then_with(|| {
208                left.for_package_uid
209                    .as_deref()
210                    .map(stable_uid_key)
211                    .cmp(&right.for_package_uid.as_deref().map(stable_uid_key))
212            })
213    });
214
215    AssemblyResult {
216        packages,
217        dependencies,
218    }
219}
220
221fn apply_directory_merge_results(
222    files: &mut [FileInfo],
223    packages: &mut Vec<Package>,
224    dependencies: &mut Vec<TopLevelDependency>,
225    results: Vec<DirectoryMergeOutput>,
226) {
227    for (package, deps, affected_indices) in results {
228        if let Some(package) = package {
229            let package_uid = package.package_uid.clone();
230            for idx in &affected_indices {
231                if !files[*idx].for_packages.contains(&package_uid) {
232                    files[*idx].for_packages.push(package_uid.clone());
233                }
234            }
235            packages.push(package);
236        }
237        dependencies.extend(deps);
238    }
239}
240
241fn hoist_unassembled_file_dependencies(
242    files: &[FileInfo],
243    dependencies: &mut Vec<TopLevelDependency>,
244) {
245    for file in files {
246        if !file.for_packages.is_empty() {
247            continue;
248        }
249
250        for pkg_data in &file.package_data {
251            let Some(datasource_id) = pkg_data.datasource_id else {
252                continue;
253            };
254
255            if !should_hoist_unassembled_dependencies(datasource_id) {
256                continue;
257            }
258
259            dependencies.extend(pkg_data.dependencies.iter().map(|dep| {
260                TopLevelDependency::from_dependency(dep, file.path.clone(), datasource_id, None)
261            }));
262        }
263    }
264}
265
266fn should_hoist_unassembled_dependencies(datasource_id: DatasourceId) -> bool {
267    if !assemblers::UNASSEMBLED_DATASOURCE_IDS.contains(&datasource_id) {
268        return false;
269    }
270
271    !matches!(
272        datasource_id,
273        DatasourceId::NugetDirectoryBuildProps | DatasourceId::NugetDirectoryPackagesProps
274    )
275}
276
277fn stable_package_sort_key(package: &Package) -> (Option<&str>, Option<&str>, Option<&str>, &str) {
278    (
279        package.purl.as_deref(),
280        package.name.as_deref(),
281        package.version.as_deref(),
282        package
283            .datafile_paths
284            .first()
285            .map(String::as_str)
286            .unwrap_or(""),
287    )
288}
289
/// Returns the part of a package UID that precedes its `uuid` qualifier.
///
/// The UID is cut at the first `?uuid=`; if that marker is absent, at the
/// first `&uuid=`. A UID containing neither marker is returned unchanged.
fn stable_uid_key(uid: &str) -> &str {
    for marker in ["?uuid=", "&uuid="] {
        if let Some((head, _)) = uid.split_once(marker) {
            return head;
        }
    }
    uid
}
296
297fn assemble_one_per_package_data(
298    config: &AssemblerConfig,
299    files: &[FileInfo],
300    file_indices: &[usize],
301) -> Vec<(Package, Vec<TopLevelDependency>, usize)> {
302    let mut results = Vec::new();
303
304    for &idx in file_indices {
305        let file = &files[idx];
306        for pkg_data in &file.package_data {
307            let dsid_matches = pkg_data
308                .datasource_id
309                .is_some_and(|dsid| config.datasource_ids.contains(&dsid));
310
311            if !dsid_matches || pkg_data.purl.is_none() {
312                continue;
313            }
314
315            let datafile_path = file.path.clone();
316            let datasource_id = pkg_data.datasource_id.expect("datasource_id must be Some");
317            let pkg = Package::from_package_data(pkg_data, datafile_path.clone());
318            let for_package_uid = Some(pkg.package_uid.clone());
319
320            let deps: Vec<TopLevelDependency> = pkg_data
321                .dependencies
322                .iter()
323                .filter(|dep| dep.purl.is_some())
324                .map(|dep| {
325                    TopLevelDependency::from_dependency(
326                        dep,
327                        datafile_path.clone(),
328                        datasource_id,
329                        for_package_uid.clone(),
330                    )
331                })
332                .collect();
333
334            results.push((pkg, deps, idx));
335        }
336    }
337
338    results
339}
340
341/// Group file indices by their parent directory path.
342fn group_files_by_directory(files: &[FileInfo]) -> HashMap<PathBuf, Vec<usize>> {
343    let mut groups: HashMap<PathBuf, Vec<usize>> = HashMap::new();
344    for (idx, file) in files.iter().enumerate() {
345        if let Some(parent) = std::path::Path::new(&file.path).parent() {
346            groups.entry(parent.to_path_buf()).or_default().push(idx);
347        }
348    }
349    groups
350}