Skip to main content

provenant/assembly/
mod.rs

1mod assemblers;
2#[cfg(test)]
3mod assembly_golden_test;
4#[cfg(test)]
5mod assembly_test;
6mod cargo_resource_assign;
7mod cargo_workspace_merge;
8mod composer_resource_assign;
9mod conda_rootfs_merge;
10pub mod file_ref_resolve;
11mod hackage_merge;
12mod nested_merge;
13mod npm_resource_assign;
14mod npm_workspace_merge;
15mod nuget_cpm_resolve;
16mod python_requirements_assign;
17mod ruby_resource_assign;
18mod sibling_merge;
19mod swift_merge;
20mod topology;
21
22use std::collections::{HashMap, HashSet};
23use std::path::PathBuf;
24use std::sync::LazyLock;
25
26use crate::models::{DatasourceId, FileInfo, Package, TopLevelDependency};
27
28pub use assemblers::ASSEMBLERS;
29
30type DirectoryMergeOutput = (Option<Package>, Vec<TopLevelDependency>, Vec<usize>);
31
32/// Pre-computed lookup: DatasourceId → config key (first DatasourceId in config).
33/// Built once on first use, avoiding HashMap allocation on every `assemble()` call.
34static ASSEMBLER_LOOKUP: LazyLock<HashMap<DatasourceId, DatasourceId>> = LazyLock::new(|| {
35    let mut lookup = HashMap::new();
36    for config in ASSEMBLERS {
37        let key = *config
38            .datasource_ids
39            .first()
40            .expect("assembler must have at least one datasource_id");
41        for &dsid in config.datasource_ids {
42            lookup.insert(dsid, key);
43        }
44    }
45    lookup
46});
47
48static ASSEMBLER_CONFIG_LOOKUP: LazyLock<HashMap<DatasourceId, &'static AssemblerConfig>> =
49    LazyLock::new(|| {
50        let mut lookup = HashMap::new();
51        for config in ASSEMBLERS {
52            let key = *config
53                .datasource_ids
54                .first()
55                .expect("assembler must have at least one datasource_id");
56            lookup.insert(key, config);
57        }
58        lookup
59    });
60
61/// Result of the assembly phase: top-level packages and dependencies,
62/// plus updated file-to-package associations.
63#[derive(serde::Serialize)]
64pub struct AssemblyResult {
65    pub packages: Vec<Package>,
66    pub dependencies: Vec<TopLevelDependency>,
67}
68
/// Strategy an assembler uses to turn `PackageData` entries into `Package`s.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AssemblyMode {
    /// Related files in the same directory (or nested below it) are merged
    /// into a single `Package`.
    SiblingMerge,
    /// Every `PackageData` yields its own independent `Package`. Used for
    /// database-style files that list many installed packages (e.g. Alpine DB,
    /// RPM DB, Debian status).
    OnePerPackageData,
}
78
79pub struct AssemblerConfig {
80    pub datasource_ids: &'static [DatasourceId],
81    pub sibling_file_patterns: &'static [&'static str],
82    pub mode: AssemblyMode,
83}
84
85/// Run the assembly phase over all scanned files.
86///
87/// Groups files by parent directory, finds related manifests/lockfiles,
88/// merges them into top-level `Package` objects, and hoists dependencies.
89/// Updates each `FileInfo.for_packages` with the UIDs of packages it belongs to.
90pub fn assemble(files: &mut [FileInfo]) -> AssemblyResult {
91    let assembler_lookup = &*ASSEMBLER_LOOKUP;
92    let assembler_config_lookup = &*ASSEMBLER_CONFIG_LOOKUP;
93    let mut packages = Vec::new();
94    let mut dependencies = Vec::new();
95
96    let dir_files = group_files_by_directory(files);
97    let topology_plan = topology::TopologyPlan::build(files, &dir_files);
98
99    for file_indices in dir_files.values() {
100        let mut groups: HashSet<DatasourceId> = HashSet::new();
101
102        for &idx in file_indices {
103            for pkg_data in &files[idx].package_data {
104                if let Some(dsid) = pkg_data.datasource_id
105                    && let Some(&config_key) = assembler_lookup.get(&dsid)
106                {
107                    groups.insert(config_key);
108                }
109            }
110        }
111
112        for &config_key in &groups {
113            let config = assembler_config_lookup
114                .get(&config_key)
115                .copied()
116                .expect("assembler config must exist");
117
118            if topology_plan.claims_directory_assembly(config, file_indices, files) {
119                continue;
120            }
121
122            if let Some(special_merger) = assemblers::special_directory_merger_for(config_key) {
123                let results = special_merger.run(files, file_indices);
124                apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
125                continue;
126            }
127
128            match config.mode {
129                AssemblyMode::SiblingMerge => {
130                    let results = sibling_merge::assemble_siblings(config, files, file_indices)
131                        .into_iter()
132                        .collect();
133                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
134                }
135                AssemblyMode::OnePerPackageData => {
136                    let results = assemble_one_per_package_data(config, files, file_indices)
137                        .into_iter()
138                        .map(|(pkg, deps, affected_idx)| (Some(pkg), deps, vec![affected_idx]))
139                        .collect();
140                    apply_directory_merge_results(files, &mut packages, &mut dependencies, results);
141                }
142            }
143        }
144    }
145
146    topology_plan.apply_directory_scoped_domains(files, &mut packages, &mut dependencies);
147
148    for config in ASSEMBLERS {
149        if config.mode != AssemblyMode::SiblingMerge {
150            continue;
151        }
152        if let Some((pkg, deps, affected_indices)) =
153            nested_merge::assemble_nested_patterns(files, config)
154        {
155            let package_uid = pkg.package_uid.clone();
156            let purl = pkg.purl.clone();
157            let removed_package_uids: Vec<String> = packages
158                .iter()
159                .filter(|p| p.purl == purl)
160                .map(|p| p.package_uid.clone())
161                .collect();
162
163            packages.retain(|p| p.purl != purl);
164            dependencies.retain(|d| {
165                d.for_package_uid.as_ref() != Some(&package_uid)
166                    && !removed_package_uids
167                        .iter()
168                        .any(|old_uid| d.for_package_uid.as_ref() == Some(old_uid))
169            });
170
171            for idx in &affected_indices {
172                files[*idx].for_packages.clear();
173                files[*idx].for_packages.push(package_uid.clone());
174            }
175
176            packages.push(pkg);
177            dependencies.extend(deps);
178        }
179    }
180
181    assemblers::run_post_assembly_passes(files, &mut packages, &mut dependencies, &topology_plan);
182    hoist_unassembled_file_dependencies(files, &mut dependencies);
183
184    for package in &mut packages {
185        package.datafile_paths.sort();
186        package.datafile_paths.dedup();
187        package.datasource_ids.sort_by_key(|left| left.to_string());
188        package.datasource_ids.dedup();
189    }
190
191    for file in files.iter_mut() {
192        file.for_packages
193            .sort_by(|left, right| stable_uid_key(left).cmp(stable_uid_key(right)));
194        file.for_packages.dedup();
195    }
196
197    packages
198        .sort_by(|left, right| stable_package_sort_key(left).cmp(&stable_package_sort_key(right)));
199    dependencies.sort_by(|left, right| {
200        left.purl
201            .as_deref()
202            .cmp(&right.purl.as_deref())
203            .then_with(|| {
204                left.extracted_requirement
205                    .as_deref()
206                    .cmp(&right.extracted_requirement.as_deref())
207            })
208            .then_with(|| left.scope.as_deref().cmp(&right.scope.as_deref()))
209            .then_with(|| left.datafile_path.cmp(&right.datafile_path))
210            .then_with(|| {
211                left.datasource_id
212                    .to_string()
213                    .cmp(&right.datasource_id.to_string())
214            })
215            .then_with(|| {
216                left.for_package_uid
217                    .as_deref()
218                    .map(stable_uid_key)
219                    .cmp(&right.for_package_uid.as_deref().map(stable_uid_key))
220            })
221    });
222
223    AssemblyResult {
224        packages,
225        dependencies,
226    }
227}
228
229fn apply_directory_merge_results(
230    files: &mut [FileInfo],
231    packages: &mut Vec<Package>,
232    dependencies: &mut Vec<TopLevelDependency>,
233    results: Vec<DirectoryMergeOutput>,
234) {
235    for (package, deps, affected_indices) in results {
236        if let Some(package) = package {
237            let package_uid = package.package_uid.clone();
238            for idx in &affected_indices {
239                if !files[*idx].for_packages.contains(&package_uid) {
240                    files[*idx].for_packages.push(package_uid.clone());
241                }
242            }
243            packages.push(package);
244        }
245        dependencies.extend(deps);
246    }
247}
248
249fn hoist_unassembled_file_dependencies(
250    files: &[FileInfo],
251    dependencies: &mut Vec<TopLevelDependency>,
252) {
253    for file in files {
254        if !file.for_packages.is_empty() {
255            continue;
256        }
257
258        for pkg_data in &file.package_data {
259            let Some(datasource_id) = pkg_data.datasource_id else {
260                continue;
261            };
262
263            if !should_hoist_unassembled_dependencies(datasource_id) {
264                continue;
265            }
266
267            dependencies.extend(pkg_data.dependencies.iter().map(|dep| {
268                TopLevelDependency::from_dependency(dep, file.path.clone(), datasource_id, None)
269            }));
270        }
271    }
272}
273
274fn should_hoist_unassembled_dependencies(datasource_id: DatasourceId) -> bool {
275    if !assemblers::UNASSEMBLED_DATASOURCE_IDS.contains(&datasource_id) {
276        return false;
277    }
278
279    !matches!(
280        datasource_id,
281        DatasourceId::NugetDirectoryBuildProps | DatasourceId::NugetDirectoryPackagesProps
282    )
283}
284
285fn stable_package_sort_key(package: &Package) -> (Option<&str>, Option<&str>, Option<&str>, &str) {
286    (
287        package.purl.as_deref(),
288        package.name.as_deref(),
289        package.version.as_deref(),
290        package
291            .datafile_paths
292            .first()
293            .map(String::as_str)
294            .unwrap_or(""),
295    )
296}
297
/// Strip the uuid qualifier from a package UID so sorting does not depend on it.
///
/// Returns everything before the first `?uuid=`; if that marker is absent,
/// everything before the first `&uuid=`; otherwise the whole input.
fn stable_uid_key(uid: &str) -> &str {
    let marker_at = uid.find("?uuid=").or_else(|| uid.find("&uuid="));

    match marker_at {
        Some(pos) => &uid[..pos],
        None => uid,
    }
}
304
305fn assemble_one_per_package_data(
306    config: &AssemblerConfig,
307    files: &[FileInfo],
308    file_indices: &[usize],
309) -> Vec<(Package, Vec<TopLevelDependency>, usize)> {
310    let mut results = Vec::new();
311
312    for &idx in file_indices {
313        let file = &files[idx];
314        for pkg_data in &file.package_data {
315            let dsid_matches = pkg_data
316                .datasource_id
317                .is_some_and(|dsid| config.datasource_ids.contains(&dsid));
318
319            if !dsid_matches || pkg_data.purl.is_none() {
320                continue;
321            }
322
323            let datafile_path = file.path.clone();
324            let datasource_id = pkg_data.datasource_id.expect("datasource_id must be Some");
325            let pkg = Package::from_package_data(pkg_data, datafile_path.clone());
326            let for_package_uid = Some(pkg.package_uid.clone());
327
328            let deps: Vec<TopLevelDependency> = pkg_data
329                .dependencies
330                .iter()
331                .filter(|dep| dep.purl.is_some())
332                .map(|dep| {
333                    TopLevelDependency::from_dependency(
334                        dep,
335                        datafile_path.clone(),
336                        datasource_id,
337                        for_package_uid.clone(),
338                    )
339                })
340                .collect();
341
342            results.push((pkg, deps, idx));
343        }
344    }
345
346    results
347}
348
349/// Group file indices by their parent directory path.
350fn group_files_by_directory(files: &[FileInfo]) -> HashMap<PathBuf, Vec<usize>> {
351    let mut groups: HashMap<PathBuf, Vec<usize>> = HashMap::new();
352    for (idx, file) in files.iter().enumerate() {
353        if let Some(parent) = std::path::Path::new(&file.path).parent() {
354            groups.entry(parent.to_path_buf()).or_default().push(idx);
355        }
356    }
357    groups
358}