//! Assembly phase for `provenant` (provenant/assembly/mod.rs): merges
//! per-file package data into top-level packages and dependencies.
1mod assemblers;
2#[cfg(test)]
3mod assembly_golden_test;
4#[cfg(test)]
5mod assembly_test;
6mod cargo_resource_assign;
7mod cargo_workspace_merge;
8mod composer_resource_assign;
9mod conda_rootfs_merge;
10pub mod file_ref_resolve;
11mod hackage_merge;
12mod nested_merge;
13mod npm_resource_assign;
14mod npm_workspace_merge;
15mod nuget_cpm_resolve;
16mod ruby_resource_assign;
17mod sibling_merge;
18mod swift_merge;
19
20use std::collections::{HashMap, HashSet};
21use std::path::PathBuf;
22use std::sync::LazyLock;
23
24use crate::models::{DatasourceId, FileInfo, Package, TopLevelDependency};
25
26pub use assemblers::ASSEMBLERS;
27
/// Output of one directory-level merge: the merged package (if one was
/// produced), the dependencies hoisted from it, and the indices into the
/// `files` slice of the datafiles it covers.
type DirectoryMergeOutput = (Option<Package>, Vec<TopLevelDependency>, Vec<usize>);
29
30/// Pre-computed lookup: DatasourceId → config key (first DatasourceId in config).
31/// Built once on first use, avoiding HashMap allocation on every `assemble()` call.
32static ASSEMBLER_LOOKUP: LazyLock<HashMap<DatasourceId, DatasourceId>> = LazyLock::new(|| {
33    let mut lookup = HashMap::new();
34    for config in ASSEMBLERS {
35        let key = *config
36            .datasource_ids
37            .first()
38            .expect("assembler must have at least one datasource_id");
39        for &dsid in config.datasource_ids {
40            lookup.insert(dsid, key);
41        }
42    }
43    lookup
44});
45
46static ASSEMBLER_CONFIG_LOOKUP: LazyLock<HashMap<DatasourceId, &'static AssemblerConfig>> =
47    LazyLock::new(|| {
48        let mut lookup = HashMap::new();
49        for config in ASSEMBLERS {
50            let key = *config
51                .datasource_ids
52                .first()
53                .expect("assembler must have at least one datasource_id");
54            lookup.insert(key, config);
55        }
56        lookup
57    });
58
/// Result of the assembly phase: the top-level packages and dependencies.
///
/// File-to-package associations are not stored here; `assemble()` records
/// them in place on each `FileInfo.for_packages`.
#[derive(serde::Serialize)]
pub struct AssemblyResult {
    // Top-level packages produced by merging related package-data files.
    pub packages: Vec<Package>,
    // Hoisted dependencies, each optionally tied to a package by its UID.
    pub dependencies: Vec<TopLevelDependency>,
}
66
/// How an assembler groups PackageData into Packages.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AssemblyMode {
    /// Merge related files in the same directory (or nested) into one Package.
    /// Handled by `sibling_merge` within a directory and `nested_merge`
    /// across directories.
    SiblingMerge,
    /// Each PackageData becomes its own independent Package (e.g., database files
    /// containing many installed packages like Alpine DB, RPM DB, Debian status).
    OnePerPackageData,
}
76
/// Static description of one assembler: which datasources it handles, which
/// sibling files it merges, and how it groups package data into packages.
pub struct AssemblerConfig {
    /// Datasource ids handled by this assembler; the first entry is the
    /// canonical key used in the lookup tables.
    pub datasource_ids: &'static [DatasourceId],
    /// Filename patterns of related files to merge as siblings.
    // NOTE(review): matching semantics (glob vs literal) are defined by the
    // merge modules that consume these patterns — confirm there.
    pub sibling_file_patterns: &'static [&'static str],
    /// Grouping strategy (sibling merge vs one package per PackageData).
    pub mode: AssemblyMode,
}
82
/// Run the assembly phase over all scanned files.
///
/// Groups files by parent directory, finds related manifests/lockfiles,
/// merges them into top-level `Package` objects, and hoists dependencies.
/// Updates each `FileInfo.for_packages` with the UIDs of packages it belongs to.
pub fn assemble(files: &mut [FileInfo]) -> AssemblyResult {
    // Force both lazy tables once; subsequent access is a plain map lookup.
    let assembler_lookup = &*ASSEMBLER_LOOKUP;
    let assembler_config_lookup = &*ASSEMBLER_CONFIG_LOOKUP;
    let mut packages = Vec::new();
    let mut dependencies = Vec::new();

    let dir_files = group_files_by_directory(files);

    // Phase 1: directory-local assembly.
    for file_indices in dir_files.values() {
        // Canonical datasource keys (one per assembler) present in this directory.
        let mut groups: HashSet<DatasourceId> = HashSet::new();

        for &idx in file_indices {
            for pkg_data in &files[idx].package_data {
                if let Some(dsid) = pkg_data.datasource_id
                    && let Some(&config_key) = assembler_lookup.get(&dsid)
                {
                    groups.insert(config_key);
                }
            }
        }

        for &config_key in &groups {
            let config = assembler_config_lookup
                .get(&config_key)
                .copied()
                .expect("assembler config must exist");

            // Ecosystem-specific directory mergers take precedence over the
            // generic assembly modes below.
            if let Some(special_merger) = assemblers::special_directory_merger_for(config_key) {
                let results = special_merger.run(files, file_indices);
                apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
                continue;
            }

            match config.mode {
                AssemblyMode::SiblingMerge => {
                    let results = sibling_merge::assemble_siblings(config, files, file_indices)
                        .into_iter()
                        .collect();
                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
                }
                AssemblyMode::OnePerPackageData => {
                    // Adapt (pkg, deps, idx) triples to the common merge-output shape.
                    let results = assemble_one_per_package_data(config, files, file_indices)
                        .into_iter()
                        .map(|(pkg, deps, affected_idx)| (Some(pkg), deps, vec![affected_idx]))
                        .collect();
                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
                }
            }
        }
    }

    // Phase 2: cross-directory (nested) merges for sibling-merge assemblers.
    // A nested merge supersedes any directory-level packages with the same purl.
    for config in ASSEMBLERS {
        if config.mode != AssemblyMode::SiblingMerge {
            continue;
        }
        if let Some((pkg, deps, affected_indices)) =
            nested_merge::assemble_nested_patterns(files, config)
        {
            let package_uid = pkg.package_uid.clone();
            let purl = pkg.purl.clone();
            // UIDs of the packages being replaced, so their dependencies can
            // be dropped along with them.
            let removed_package_uids: Vec<String> = packages
                .iter()
                .filter(|p| p.purl == purl)
                .map(|p| p.package_uid.clone())
                .collect();

            packages.retain(|p| p.purl != purl);
            dependencies.retain(|d| {
                d.for_package_uid.as_ref() != Some(&package_uid)
                    && !removed_package_uids
                        .iter()
                        .any(|old_uid| d.for_package_uid.as_ref() == Some(old_uid))
            });

            // Re-point affected files exclusively at the merged package.
            for idx in &affected_indices {
                files[*idx].for_packages.clear();
                files[*idx].for_packages.push(package_uid.clone());
            }

            packages.push(pkg);
            dependencies.extend(deps);
        }
    }

    // Phase 3: post-assembly passes, then hoist dependencies from files that
    // ended up in no package.
    assemblers::run_post_assembly_passes(files, &mut packages, &mut dependencies);
    hoist_unassembled_file_dependencies(files, &mut dependencies);

    // Normalize per-package bookkeeping for deterministic serialization.
    for package in &mut packages {
        package.datafile_paths.sort();
        package.datafile_paths.dedup();
        package.datasource_ids.sort_by_key(|left| left.to_string());
        package.datasource_ids.dedup();
    }

    // Sort file→package links by the uuid-stripped UID so ordering does not
    // depend on freshly generated UUIDs.
    for file in files.iter_mut() {
        file.for_packages
            .sort_by(|left, right| stable_uid_key(left).cmp(stable_uid_key(right)));
        file.for_packages.dedup();
    }

    packages
        .sort_by(|left, right| stable_package_sort_key(left).cmp(&stable_package_sort_key(right)));
    dependencies.sort_by(|left, right| {
        left.purl
            .as_deref()
            .cmp(&right.purl.as_deref())
            .then_with(|| {
                left.extracted_requirement
                    .as_deref()
                    .cmp(&right.extracted_requirement.as_deref())
            })
            .then_with(|| left.scope.as_deref().cmp(&right.scope.as_deref()))
            .then_with(|| left.datafile_path.cmp(&right.datafile_path))
            .then_with(|| {
                left.datasource_id
                    .to_string()
                    .cmp(&right.datasource_id.to_string())
            })
            .then_with(|| {
                left.for_package_uid
                    .as_deref()
                    .map(stable_uid_key)
                    .cmp(&right.for_package_uid.as_deref().map(stable_uid_key))
            })
    });

    AssemblyResult {
        packages,
        dependencies,
    }
}
219
220fn apply_directory_merge_results(
221    files: &mut [FileInfo],
222    packages: &mut Vec<Package>,
223    dependencies: &mut Vec<TopLevelDependency>,
224    results: Vec<DirectoryMergeOutput>,
225) {
226    for (package, deps, affected_indices) in results {
227        if let Some(package) = package {
228            let package_uid = package.package_uid.clone();
229            for idx in &affected_indices {
230                if !files[*idx].for_packages.contains(&package_uid) {
231                    files[*idx].for_packages.push(package_uid.clone());
232                }
233            }
234            packages.push(package);
235        }
236        dependencies.extend(deps);
237    }
238}
239
240fn hoist_unassembled_file_dependencies(
241    files: &[FileInfo],
242    dependencies: &mut Vec<TopLevelDependency>,
243) {
244    for file in files {
245        if !file.for_packages.is_empty() {
246            continue;
247        }
248
249        for pkg_data in &file.package_data {
250            let Some(datasource_id) = pkg_data.datasource_id else {
251                continue;
252            };
253
254            if !should_hoist_unassembled_dependencies(datasource_id) {
255                continue;
256            }
257
258            dependencies.extend(pkg_data.dependencies.iter().map(|dep| {
259                TopLevelDependency::from_dependency(dep, file.path.clone(), datasource_id, None)
260            }));
261        }
262    }
263}
264
265fn should_hoist_unassembled_dependencies(datasource_id: DatasourceId) -> bool {
266    if !assemblers::UNASSEMBLED_DATASOURCE_IDS.contains(&datasource_id) {
267        return false;
268    }
269
270    !matches!(
271        datasource_id,
272        DatasourceId::NugetDirectoryBuildProps | DatasourceId::NugetDirectoryPackagesProps
273    )
274}
275
276fn stable_package_sort_key(package: &Package) -> (Option<&str>, Option<&str>, Option<&str>, &str) {
277    (
278        package.purl.as_deref(),
279        package.name.as_deref(),
280        package.version.as_deref(),
281        package
282            .datafile_paths
283            .first()
284            .map(String::as_str)
285            .unwrap_or(""),
286    )
287}
288
/// Strip the trailing `uuid=` query component from a package UID so ordering
/// is stable across runs with freshly generated UUIDs.
fn stable_uid_key(uid: &str) -> &str {
    // Check `?uuid=` before `&uuid=`, preserving the original precedence.
    for marker in ["?uuid=", "&uuid="] {
        if let Some(pos) = uid.find(marker) {
            return &uid[..pos];
        }
    }
    uid
}
295
296fn assemble_one_per_package_data(
297    config: &AssemblerConfig,
298    files: &[FileInfo],
299    file_indices: &[usize],
300) -> Vec<(Package, Vec<TopLevelDependency>, usize)> {
301    let mut results = Vec::new();
302
303    for &idx in file_indices {
304        let file = &files[idx];
305        for pkg_data in &file.package_data {
306            let dsid_matches = pkg_data
307                .datasource_id
308                .is_some_and(|dsid| config.datasource_ids.contains(&dsid));
309
310            if !dsid_matches || pkg_data.purl.is_none() {
311                continue;
312            }
313
314            let datafile_path = file.path.clone();
315            let datasource_id = pkg_data.datasource_id.expect("datasource_id must be Some");
316            let pkg = Package::from_package_data(pkg_data, datafile_path.clone());
317            let for_package_uid = Some(pkg.package_uid.clone());
318
319            let deps: Vec<TopLevelDependency> = pkg_data
320                .dependencies
321                .iter()
322                .filter(|dep| dep.purl.is_some())
323                .map(|dep| {
324                    TopLevelDependency::from_dependency(
325                        dep,
326                        datafile_path.clone(),
327                        datasource_id,
328                        for_package_uid.clone(),
329                    )
330                })
331                .collect();
332
333            results.push((pkg, deps, idx));
334        }
335    }
336
337    results
338}
339
340/// Group file indices by their parent directory path.
341fn group_files_by_directory(files: &[FileInfo]) -> HashMap<PathBuf, Vec<usize>> {
342    let mut groups: HashMap<PathBuf, Vec<usize>> = HashMap::new();
343    for (idx, file) in files.iter().enumerate() {
344        if let Some(parent) = std::path::Path::new(&file.path).parent() {
345            groups.entry(parent.to_path_buf()).or_default().push(idx);
346        }
347    }
348    groups
349}