Skip to main content

provenant/assembly/
assemblers.rs

1use std::collections::HashSet;
2
3use crate::models::PackageType;
4use crate::models::{DatasourceId, FileInfo, Package, TopLevelDependency};
5use strum::EnumIter;
6
7use super::{
8    AssemblerConfig, AssemblyMode, DirectoryMergeOutput, bazel_merge, bazel_prune,
9    cargo_resource_assign, composer_resource_assign, conda_rootfs_merge, file_ref_resolve,
10    hackage_merge, npm_resource_assign, nuget_cpm_resolve, python_requirements_assign,
11    ruby_resource_assign, swift_merge, topology,
12};
13
/// Identifies which special-case directory merger handles a datasource.
///
/// Values are produced by `special_directory_merger_for` and executed through
/// `SpecialDirectoryMergerKind::run`.
#[derive(Clone, Copy)]
pub(super) enum SpecialDirectoryMergerKind {
    /// Produce no directory-level output: `run` returns an empty vector.
    Skip,
    /// Delegate to `bazel_merge::assemble_bazel_packages`.
    Bazel,
    /// Delegate to `hackage_merge::assemble_hackage_packages`.
    Hackage,
}
20
/// Kinds of passes that `run_post_assembly_passes` can execute.
///
/// Each variant is gated by `should_run` and dispatched by `run`. The
/// `EnumIter` derive lets code enumerate every variant.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, EnumIter)]
pub(super) enum PostAssemblyPassKind {
    /// Calls `swift_merge::assemble_swift_packages`.
    SwiftMerge,
    /// Calls `conda_rootfs_merge::merge_conda_rootfs_metadata`.
    CondaRootfsMerge,
    /// Calls `npm_resource_assign::assign_npm_package_resources`.
    NpmResourceAssign,
    /// Calls `python_requirements_assign::assign_python_requirements_to_projects`.
    PythonRequirementsAssign,
    /// Calls `file_ref_resolve::resolve_file_references`.
    FileReferenceResolve,
    /// Calls `file_ref_resolve::merge_rpm_yumdb_metadata`.
    RpmYumdbMerge,
    /// Calls `TopologyPlan::apply_npm_workspace_domains` on the topology plan.
    NpmWorkspaceMerge,
    /// Calls `TopologyPlan::apply_cargo_workspace_domains` on the topology plan.
    CargoWorkspaceMerge,
    /// Calls `nuget_cpm_resolve::resolve_nuget_cpm_versions`.
    NugetCpmResolve,
    /// Calls `cargo_resource_assign::assign_cargo_package_resources`.
    CargoResourceAssign,
    /// Calls `composer_resource_assign::assign_composer_package_resources`.
    ComposerResourceAssign,
    /// Calls `ruby_resource_assign::assign_ruby_package_resources`.
    RubyResourceAssign,
    /// Calls `bazel_prune::prune_unused_bazel_packages`.
    BazelPrune,
}
37
38pub(super) fn special_directory_merger_for(
39    config_key: DatasourceId,
40) -> Option<SpecialDirectoryMergerKind> {
41    match config_key {
42        DatasourceId::BazelBuild => Some(SpecialDirectoryMergerKind::Bazel),
43        DatasourceId::HackageCabal => Some(SpecialDirectoryMergerKind::Hackage),
44        DatasourceId::SwiftPackageManifestJson => Some(SpecialDirectoryMergerKind::Skip),
45        _ => None,
46    }
47}
48
/// Post-assembly passes, in the order `run_post_assembly_passes` visits them.
///
/// A pass listed here only executes when its `should_run` gate accepts the
/// collected inputs.
pub(super) static POST_ASSEMBLY_PASSES: &[PostAssemblyPassKind] = &[
    PostAssemblyPassKind::SwiftMerge,
    PostAssemblyPassKind::CondaRootfsMerge,
    PostAssemblyPassKind::NpmResourceAssign,
    PostAssemblyPassKind::PythonRequirementsAssign,
    PostAssemblyPassKind::FileReferenceResolve,
    PostAssemblyPassKind::RpmYumdbMerge,
    PostAssemblyPassKind::NpmWorkspaceMerge,
    PostAssemblyPassKind::CargoWorkspaceMerge,
    PostAssemblyPassKind::NugetCpmResolve,
    PostAssemblyPassKind::CargoResourceAssign,
    PostAssemblyPassKind::ComposerResourceAssign,
    PostAssemblyPassKind::RubyResourceAssign,
    PostAssemblyPassKind::BazelPrune,
];
64
/// Datasource IDs that enable `PostAssemblyPassKind::SwiftMerge`; the pass
/// runs when any one of them was seen on a file.
const SWIFT_POST_ASSEMBLY_DATASOURCE_IDS: &[DatasourceId] = &[
    DatasourceId::SwiftPackageManifestJson,
    DatasourceId::SwiftPackageResolved,
    DatasourceId::SwiftPackageShowDependencies,
];

/// Datasource IDs required by `PostAssemblyPassKind::CondaRootfsMerge`; the
/// pass runs only when all of them were seen.
const CONDA_ROOTFS_POST_ASSEMBLY_DATASOURCE_IDS: &[DatasourceId] =
    &[DatasourceId::CondaMetaJson, DatasourceId::CondaMetaYaml];

/// RPM installed-database datasource IDs; `PostAssemblyPassKind::RpmYumdbMerge`
/// needs at least one of them together with `DatasourceId::RpmYumdb`.
const RPM_INSTALLED_DATABASE_DATASOURCE_IDS: &[DatasourceId] = &[
    DatasourceId::RpmInstalledDatabaseBdb,
    DatasourceId::RpmInstalledDatabaseNdb,
    DatasourceId::RpmInstalledDatabaseSqlite,
];

/// NuGet `Directory.*.props` datasource IDs; `PostAssemblyPassKind::NugetCpmResolve`
/// needs at least one of them plus one project datasource ID.
const NUGET_CPM_CONFIG_DATASOURCE_IDS: &[DatasourceId] = &[
    DatasourceId::NugetDirectoryBuildProps,
    DatasourceId::NugetDirectoryPackagesProps,
];

/// NuGet project-file datasource IDs; `PostAssemblyPassKind::NugetCpmResolve`
/// needs at least one of them plus one config datasource ID.
const NUGET_CPM_PROJECT_DATASOURCE_IDS: &[DatasourceId] = &[
    DatasourceId::NugetCsproj,
    DatasourceId::NugetFsproj,
    DatasourceId::NugetVbproj,
];
90
/// Facts gathered from the files and packages that decide which
/// post-assembly passes run.
///
/// Built by `PostAssemblyInputs::collect` and read by
/// `PostAssemblyPassKind::should_run`.
#[derive(Default)]
struct PostAssemblyInputs {
    /// Package types found on the packages passed to `collect`.
    package_types: HashSet<PackageType>,
    /// Every datasource ID found on any file's package data.
    file_datasource_ids: HashSet<DatasourceId>,
    /// True when an `NpmPackageJson` or `PnpmWorkspaceYaml` entry has a
    /// "workspaces" key in its extra data.
    has_npm_workspace_markers: bool,
    /// True when a `CargoToml` entry has a non-empty `workspace.members`
    /// array in its extra data.
    has_cargo_workspace_markers: bool,
}
98
99pub(super) fn run_post_assembly_passes(
100    files: &mut [FileInfo],
101    packages: &mut Vec<Package>,
102    dependencies: &mut Vec<TopLevelDependency>,
103    topology_plan: &topology::TopologyPlan,
104) {
105    let inputs = PostAssemblyInputs::collect(files, packages);
106
107    for pass in POST_ASSEMBLY_PASSES {
108        if !pass.should_run(&inputs) {
109            continue;
110        }
111
112        pass.run(files, packages, dependencies, topology_plan);
113    }
114}
115
116impl PostAssemblyInputs {
117    fn collect(files: &[FileInfo], packages: &[Package]) -> Self {
118        let mut inputs = Self {
119            package_types: packages
120                .iter()
121                .filter_map(|package| package.package_type)
122                .collect(),
123            ..Self::default()
124        };
125
126        for file in files {
127            for package_data in &file.package_data {
128                let Some(datasource_id) = package_data.datasource_id else {
129                    continue;
130                };
131
132                inputs.file_datasource_ids.insert(datasource_id);
133
134                if matches!(
135                    datasource_id,
136                    DatasourceId::NpmPackageJson | DatasourceId::PnpmWorkspaceYaml
137                ) && package_data
138                    .extra_data
139                    .as_ref()
140                    .is_some_and(|extra_data| extra_data.contains_key("workspaces"))
141                {
142                    inputs.has_npm_workspace_markers = true;
143                }
144
145                if datasource_id == DatasourceId::CargoToml
146                    && package_data
147                        .extra_data
148                        .as_ref()
149                        .and_then(|extra_data| extra_data.get("workspace"))
150                        .and_then(|workspace| workspace.get("members"))
151                        .and_then(|members| members.as_array())
152                        .is_some_and(|members| !members.is_empty())
153                {
154                    inputs.has_cargo_workspace_markers = true;
155                }
156            }
157        }
158
159        inputs
160    }
161
162    fn has_package_type(&self, package_type: PackageType) -> bool {
163        self.package_types.contains(&package_type)
164    }
165
166    fn has_any_file_datasource(&self, datasource_ids: &[DatasourceId]) -> bool {
167        datasource_ids
168            .iter()
169            .any(|datasource_id| self.file_datasource_ids.contains(datasource_id))
170    }
171
172    fn has_all_file_datasources(&self, datasource_ids: &[DatasourceId]) -> bool {
173        datasource_ids
174            .iter()
175            .all(|datasource_id| self.file_datasource_ids.contains(datasource_id))
176    }
177}
178
179impl SpecialDirectoryMergerKind {
180    pub(super) fn run(
181        self,
182        config: &AssemblerConfig,
183        files: &[FileInfo],
184        file_indices: &[usize],
185    ) -> Vec<DirectoryMergeOutput> {
186        match self {
187            Self::Skip => Vec::new(),
188            Self::Bazel => bazel_merge::assemble_bazel_packages(config, files, file_indices),
189            Self::Hackage => hackage_merge::assemble_hackage_packages(files, file_indices),
190        }
191    }
192}
193
194impl PostAssemblyPassKind {
195    fn should_run(self, inputs: &PostAssemblyInputs) -> bool {
196        match self {
197            Self::SwiftMerge => inputs.has_any_file_datasource(SWIFT_POST_ASSEMBLY_DATASOURCE_IDS),
198            Self::CondaRootfsMerge => {
199                inputs.has_all_file_datasources(CONDA_ROOTFS_POST_ASSEMBLY_DATASOURCE_IDS)
200            }
201            Self::NpmResourceAssign => inputs.has_package_type(PackageType::Npm),
202            Self::PythonRequirementsAssign => {
203                inputs.has_package_type(PackageType::Pypi)
204                    && inputs.has_any_file_datasource(&[DatasourceId::PipRequirements])
205            }
206            Self::FileReferenceResolve => {
207                file_ref_resolve::has_relevant_file_reference_datasource_ids(
208                    &inputs.file_datasource_ids,
209                )
210            }
211            Self::RpmYumdbMerge => {
212                inputs.has_any_file_datasource(&[DatasourceId::RpmYumdb])
213                    && inputs.has_any_file_datasource(RPM_INSTALLED_DATABASE_DATASOURCE_IDS)
214            }
215            Self::NpmWorkspaceMerge => inputs.has_npm_workspace_markers,
216            Self::CargoWorkspaceMerge => inputs.has_cargo_workspace_markers,
217            Self::NugetCpmResolve => {
218                inputs.has_any_file_datasource(NUGET_CPM_CONFIG_DATASOURCE_IDS)
219                    && inputs.has_any_file_datasource(NUGET_CPM_PROJECT_DATASOURCE_IDS)
220            }
221            Self::CargoResourceAssign => inputs.has_package_type(PackageType::Cargo),
222            Self::ComposerResourceAssign => inputs.has_package_type(PackageType::Composer),
223            Self::RubyResourceAssign => inputs.has_package_type(PackageType::Gem),
224            Self::BazelPrune => inputs.has_package_type(PackageType::Bazel),
225        }
226    }
227
228    fn run(
229        self,
230        files: &mut [FileInfo],
231        packages: &mut Vec<Package>,
232        dependencies: &mut Vec<TopLevelDependency>,
233        topology_plan: &topology::TopologyPlan,
234    ) {
235        match self {
236            Self::SwiftMerge => swift_merge::assemble_swift_packages(files, packages, dependencies),
237            Self::CondaRootfsMerge => {
238                conda_rootfs_merge::merge_conda_rootfs_metadata(files, packages, dependencies)
239            }
240            Self::NpmResourceAssign => {
241                npm_resource_assign::assign_npm_package_resources(files, packages)
242            }
243            Self::PythonRequirementsAssign => {
244                python_requirements_assign::assign_python_requirements_to_projects(
245                    files,
246                    packages,
247                    dependencies,
248                )
249            }
250            Self::FileReferenceResolve => {
251                file_ref_resolve::resolve_file_references(files, packages, dependencies)
252            }
253            Self::RpmYumdbMerge => file_ref_resolve::merge_rpm_yumdb_metadata(files, packages),
254            Self::NpmWorkspaceMerge => {
255                topology_plan.apply_npm_workspace_domains(files, packages, dependencies)
256            }
257            Self::CargoWorkspaceMerge => {
258                topology_plan.apply_cargo_workspace_domains(files, packages, dependencies)
259            }
260            Self::NugetCpmResolve => {
261                nuget_cpm_resolve::resolve_nuget_cpm_versions(files, dependencies)
262            }
263            Self::CargoResourceAssign => {
264                cargo_resource_assign::assign_cargo_package_resources(files, packages)
265            }
266            Self::ComposerResourceAssign => {
267                composer_resource_assign::assign_composer_package_resources(files, packages)
268            }
269            Self::RubyResourceAssign => {
270                ruby_resource_assign::assign_ruby_package_resources(files, packages)
271            }
272            Self::BazelPrune => {
273                bazel_prune::prune_unused_bazel_packages(files, packages, dependencies)
274            }
275        }
276    }
277}
278
/// Registry of assembler configurations.
///
/// Each entry names the datasource IDs it covers, the file-name patterns of
/// the files that belong together, and the `AssemblyMode` used to combine them.
/// The in-file tests require every `DatasourceId` to appear either here or in
/// `UNASSEMBLED_DATASOURCE_IDS`, and never in both.
pub static ASSEMBLERS: &[AssemblerConfig] = &[
    // ── Sibling-merge assemblers ──
    //
    // npm ecosystem: package.json + lockfiles in same directory.
    // NOTE: npm-shrinkwrap.json emits "npm_package_lock_json" as its datasource_id,
    // so "npm_shrinkwrap_json" is NOT a real datasource_id.
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::BunLock,
            DatasourceId::BunLockb,
            DatasourceId::NpmPackageJson,
            DatasourceId::NpmPackageLockJson,
            DatasourceId::YarnLock,
            DatasourceId::YarnLockV1,
            DatasourceId::YarnLockV2,
            DatasourceId::YarnPnpCjs,
            DatasourceId::PnpmLockYaml,
            DatasourceId::PnpmWorkspaceYaml,
        ],
        sibling_file_patterns: &[
            "package.json",
            "bun.lock",
            "bun.lockb",
            ".package-lock.json",
            "package-lock.json",
            ".npm-shrinkwrap.json",
            "npm-shrinkwrap.json",
            "yarn.lock",
            ".pnp.cjs",
            "pnpm-lock.yaml",
            "shrinkwrap.yaml",
            "pnpm-workspace.yaml",
        ],
        mode: AssemblyMode::SiblingMerge,
    },
    // Rust/Cargo ecosystem
    AssemblerConfig {
        datasource_ids: &[DatasourceId::CargoToml, DatasourceId::CargoLock],
        sibling_file_patterns: &["Cargo.toml", "Cargo.lock"],
        mode: AssemblyMode::SiblingMerge,
    },
    // CocoaPods ecosystem
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::CocoapodsPodspec,
            DatasourceId::CocoapodsPodspecJson,
            DatasourceId::CocoapodsPodfile,
            DatasourceId::CocoapodsPodfileLock,
        ],
        sibling_file_patterns: &["*.podspec", "*.podspec.json", "Podfile", "Podfile.lock"],
        mode: AssemblyMode::SiblingMerge,
    },
    // PHP Composer ecosystem
    AssemblerConfig {
        datasource_ids: &[DatasourceId::PhpComposerJson, DatasourceId::PhpComposerLock],
        sibling_file_patterns: &[
            "*composer.json",
            "composer.*.json",
            "*composer.lock",
            "composer.*.lock",
        ],
        mode: AssemblyMode::SiblingMerge,
    },
    // Go ecosystem (includes legacy Godeps)
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::GoMod,
            DatasourceId::GoModGraph,
            DatasourceId::GoSum,
            DatasourceId::GoWork,
            DatasourceId::Godeps,
        ],
        sibling_file_patterns: &[
            "go.mod",
            "go.work",
            "go.mod.graph",
            "go.modgraph",
            "go.sum",
            "Godeps.json",
        ],
        mode: AssemblyMode::SiblingMerge,
    },
    // Dart/Flutter ecosystem
    AssemblerConfig {
        datasource_ids: &[DatasourceId::PubspecYaml, DatasourceId::PubspecLock],
        sibling_file_patterns: &["pubspec.yaml", "pubspec.lock"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Pixi ecosystem
    AssemblerConfig {
        datasource_ids: &[DatasourceId::PixiToml, DatasourceId::PixiLock],
        sibling_file_patterns: &["pixi.toml", "pixi.lock"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Nix flakes (flake.nix + flake.lock)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::NixFlakeNix, DatasourceId::NixFlakeLock],
        sibling_file_patterns: &["flake.nix", "flake.lock"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Nix default.nix (one-per-PackageData mode)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::NixDefaultNix],
        sibling_file_patterns: &["default.nix"],
        mode: AssemblyMode::OnePerPackageData,
    },
    // Helm chart ecosystem
    AssemblerConfig {
        datasource_ids: &[DatasourceId::HelmChartYaml, DatasourceId::HelmChartLock],
        sibling_file_patterns: &["Chart.yaml", "Chart.lock"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Haskell/Hackage ecosystem (Cabal and Stack project files).
    // `special_directory_merger_for` maps HackageCabal to the Hackage merger.
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::HackageCabal,
            DatasourceId::HackageCabalProject,
            DatasourceId::HackageStackYaml,
        ],
        sibling_file_patterns: &["*.cabal", "cabal.project", "stack.yaml"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Chef ecosystem
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::ChefCookbookMetadataJson,
            DatasourceId::ChefCookbookMetadataRb,
        ],
        sibling_file_patterns: &["metadata.json", "metadata.rb"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Conan (C/C++) ecosystem
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::ConanConanFilePy,
            DatasourceId::ConanConanFileTxt,
            DatasourceId::ConanLock,
            DatasourceId::ConanConanDataYml,
        ],
        sibling_file_patterns: &[
            "conanfile.py",
            "conanfile.txt",
            "conan.lock",
            "conandata.yml",
        ],
        mode: AssemblyMode::SiblingMerge,
    },
    // Maven/Java ecosystem (nested merge via META-INF)
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::MavenPom,
            DatasourceId::MavenPomProperties,
            DatasourceId::JavaJarManifest,
            DatasourceId::JavaOsgiManifest,
        ],
        sibling_file_patterns: &[
            "pom.xml",
            "*.pom",
            "pom.properties",
            "**/META-INF/MANIFEST.MF",
        ],
        mode: AssemblyMode::SiblingMerge,
    },
    // PyPI wheel archives together with pip origin.json metadata.
    // NOTE(review): PypiWheel is also listed in the Python/PyPI config below —
    // confirm the double registration is intentional.
    AssemblerConfig {
        datasource_ids: &[DatasourceId::PypiWheel, DatasourceId::PypiPipOriginJson],
        sibling_file_patterns: &["*.whl", "origin.json"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Python/PyPI ecosystem
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::PypiPyprojectToml,
            DatasourceId::PypiPoetryPyprojectToml,
            DatasourceId::PypiSetupPy,
            DatasourceId::PypiSetupCfg,
            DatasourceId::PypiWheel,
            DatasourceId::PypiWheelMetadata,
            DatasourceId::PypiEgg,
            DatasourceId::PypiEggPkginfo,
            DatasourceId::PypiEditableEggPkginfo,
            DatasourceId::PypiJson,
            DatasourceId::PypiSdist,
            DatasourceId::PypiSdistPkginfo,
            DatasourceId::PypiInspectDeplock,
            DatasourceId::PipRequirements,
            DatasourceId::PypiPoetryLock,
            DatasourceId::PypiPylockToml,
            DatasourceId::PypiUvLock,
            DatasourceId::Pipfile,
            DatasourceId::PipfileLock,
        ],
        sibling_file_patterns: &[
            "pyproject.toml",
            "setup.py",
            "setup.cfg",
            "PKG-INFO",
            "METADATA",
            "pypi.json",
            "pip-inspect.deplock",
            "*.tar.gz",
            "*.tgz",
            "*.tar.bz2",
            "*.tar.xz",
            "*.zip",
            "requirements*.txt",
            "Pipfile",
            "Pipfile.lock",
            "poetry.lock",
            "pylock.toml",
            "pylock.*.toml",
            "uv.lock",
        ],
        mode: AssemblyMode::SiblingMerge,
    },
    // Deno ecosystem
    AssemblerConfig {
        datasource_ids: &[DatasourceId::DenoJson, DatasourceId::DenoLock],
        sibling_file_patterns: &["deno.json", "deno.jsonc", "deno.lock"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Ruby/RubyGems ecosystem
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::GemArchiveExtracted,
            DatasourceId::Gemspec,
            DatasourceId::GemspecExtracted,
            DatasourceId::Gemfile,
            DatasourceId::GemfileExtracted,
            DatasourceId::GemfileLock,
            DatasourceId::GemfileLockExtracted,
            DatasourceId::GemArchive,
        ],
        sibling_file_patterns: &[
            "metadata.gz-extract",
            "**/data.gz-extract/*.gemspec",
            "**/data.gz-extract/Gemfile",
            "**/data.gz-extract/Gemfile.lock",
            "*.gemspec",
            "Gemfile",
            "Gemfile.lock",
        ],
        mode: AssemblyMode::SiblingMerge,
    },
    // Conda ecosystem
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::CondaMetaYaml,
            DatasourceId::CondaYaml,
            DatasourceId::CondaMetaJson,
        ],
        sibling_file_patterns: &[
            "meta.yaml",
            "meta.yml",
            "environment.yml",
            "environment.yaml",
            "conda.yaml",
            "conda.yml",
            "*conda*.yaml",
            "*conda*.yml",
            "env.yaml",
            "env.yml",
            "*env*.yaml",
            "*env*.yml",
            "*environment*.yaml",
            "*environment*.yml",
            "*.json",
        ],
        mode: AssemblyMode::SiblingMerge,
    },
    // RPM specfile (source packages)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::RpmSpecfile],
        sibling_file_patterns: &["*.spec"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Debian source packages (nested merge via debian/ directory)
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::DebianControlInSource,
            DatasourceId::DebianCopyrightInSource,
        ],
        sibling_file_patterns: &["**/debian/control", "**/debian/copyright"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Gradle/Android ecosystem
    AssemblerConfig {
        datasource_ids: &[DatasourceId::BuildGradle, DatasourceId::GradleLockfile],
        sibling_file_patterns: &["build.gradle", "build.gradle.kts", "gradle.lockfile"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Gradle module metadata (*.module)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::GradleModule],
        sibling_file_patterns: &["*.module"],
        mode: AssemblyMode::OnePerPackageData,
    },
    // CPAN/Perl ecosystem
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::CpanMetaJson,
            DatasourceId::CpanMetaYml,
            DatasourceId::CpanManifest,
            DatasourceId::CpanDistIni,
            DatasourceId::CpanMakefile,
        ],
        sibling_file_patterns: &[
            "META.json",
            "META.yml",
            "MANIFEST",
            "dist.ini",
            "Makefile.PL",
        ],
        mode: AssemblyMode::SiblingMerge,
    },
    // NuGet/.NET ecosystem
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::NugetCsproj,
            DatasourceId::NugetFsproj,
            DatasourceId::NugetNuspec,
            DatasourceId::NugetNupkg,
            DatasourceId::NugetProjectJson,
            DatasourceId::NugetProjectLockJson,
            DatasourceId::NugetPackagesConfig,
            DatasourceId::NugetPackagesLock,
            DatasourceId::NugetVbproj,
        ],
        sibling_file_patterns: &[
            "*.csproj",
            "*.fsproj",
            "*.nuspec",
            "*.nupkg",
            "project.json",
            "project.lock.json",
            "packages.config",
            "packages.lock.json",
            "*.packages.lock.json",
            "*.vbproj",
        ],
        mode: AssemblyMode::SiblingMerge,
    },
    // NuGet *.deps.json files
    AssemblerConfig {
        datasource_ids: &[DatasourceId::NugetDepsJson],
        sibling_file_patterns: &["*.deps.json"],
        mode: AssemblyMode::OnePerPackageData,
    },
    // Swift/SPM ecosystem
    // `special_directory_merger_for` maps SwiftPackageManifestJson to the Skip
    // merger, and the SwiftMerge post-assembly pass is gated on these IDs.
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::SwiftPackageManifestJson,
            DatasourceId::SwiftPackageResolved,
            DatasourceId::SwiftPackageShowDependencies,
        ],
        sibling_file_patterns: &[
            "Package.swift.json",
            "Package.swift.deplock",
            "Package.resolved",
            ".package.resolved",
            "swift-show-dependencies.deplock",
        ],
        mode: AssemblyMode::SiblingMerge,
    },
    // ── Standalone assemblers (single file → single package) ──
    //
    // These ecosystems have only one manifest file type with no sibling merging.
    // They still need configs so their datasource_ids are recognized by the assembler.
    // NOTE(review): a few entries in this section use OnePerPackageData or list
    // more than one datasource ID — confirm they belong under this heading.
    //
    // Bower (JavaScript)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::BowerJson],
        sibling_file_patterns: &["bower.json"],
        mode: AssemblyMode::SiblingMerge,
    },
    // CRAN (R language)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::CranDescription],
        sibling_file_patterns: &["DESCRIPTION"],
        mode: AssemblyMode::SiblingMerge,
    },
    // FreeBSD packages
    AssemblerConfig {
        datasource_ids: &[DatasourceId::FreebsdCompactManifest],
        sibling_file_patterns: &["+COMPACT_MANIFEST"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Haxe ecosystem
    AssemblerConfig {
        datasource_ids: &[DatasourceId::HaxelibJson],
        sibling_file_patterns: &["haxelib.json"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Git submodules (.gitmodules)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::Gitmodules],
        sibling_file_patterns: &[".gitmodules"],
        mode: AssemblyMode::SiblingMerge,
    },
    // OCaml/opam ecosystem
    AssemblerConfig {
        datasource_ids: &[DatasourceId::OpamFile],
        sibling_file_patterns: &["opam", "*.opam"],
        mode: AssemblyMode::SiblingMerge,
    },
    // RPM Mariner manifest
    AssemblerConfig {
        datasource_ids: &[DatasourceId::RpmMarinerManifest],
        sibling_file_patterns: &["*.rpm.manifest"],
        mode: AssemblyMode::SiblingMerge,
    },
    // RPM yumdb entries (the RpmYumdbMerge post-assembly pass is gated on this ID)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::RpmYumdb],
        sibling_file_patterns: &["**/var/lib/yum/yumdb/*/*/from_repo"],
        mode: AssemblyMode::OnePerPackageData,
    },
    // Microsoft Update Manifest
    AssemblerConfig {
        datasource_ids: &[DatasourceId::MicrosoftUpdateManifestMum],
        sibling_file_patterns: &["*.mum"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Autotools (C/C++ build system)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::AutotoolsConfigure],
        sibling_file_patterns: &["configure", "configure.ac"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Bazel (build system)
    // `special_directory_merger_for` maps BazelBuild to the Bazel merger.
    AssemblerConfig {
        datasource_ids: &[DatasourceId::BazelBuild],
        sibling_file_patterns: &["BUILD"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Bazel module file (MODULE.bazel)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::BazelModule],
        sibling_file_patterns: &["MODULE.bazel"],
        mode: AssemblyMode::OnePerPackageData,
    },
    // Buck (build system)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::BuckFile, DatasourceId::BuckMetadata],
        sibling_file_patterns: &["BUCK", "METADATA.bzl", ".buckconfig"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Ant/Ivy (Java dependency management)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::AntIvyXml],
        sibling_file_patterns: &["ivy.xml"],
        mode: AssemblyMode::SiblingMerge,
    },
    // Meteor (JavaScript platform)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::MeteorPackage],
        sibling_file_patterns: &["package.js"],
        mode: AssemblyMode::SiblingMerge,
    },
    // ── One-per-PackageData assemblers (database files with many packages) ──
    // NOTE(review): the APKBUILD and extracted-.deb entries below use
    // SiblingMerge despite this heading — confirm the grouping.
    //
    // Alpine installed package database
    AssemblerConfig {
        datasource_ids: &[DatasourceId::AlpineInstalledDb],
        sibling_file_patterns: &["installed"],
        mode: AssemblyMode::OnePerPackageData,
    },
    // Alpine APKBUILD
    AssemblerConfig {
        datasource_ids: &[DatasourceId::AlpineApkbuild],
        sibling_file_patterns: &["APKBUILD"],
        mode: AssemblyMode::SiblingMerge,
    },
    // RPM installed package databases (BDB, NDB, SQLite)
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::RpmInstalledDatabaseBdb,
            DatasourceId::RpmInstalledDatabaseNdb,
            DatasourceId::RpmInstalledDatabaseSqlite,
        ],
        sibling_file_patterns: &["Packages", "Packages.db", "rpmdb.sqlite"],
        mode: AssemblyMode::OnePerPackageData,
    },
    // Debian installed package databases
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::DebianInstalledStatusDb,
            DatasourceId::DebianDistrolessInstalledDb,
        ],
        sibling_file_patterns: &["status"],
        mode: AssemblyMode::OnePerPackageData,
    },
    // Debian control and md5sums files from an extracted .deb
    AssemblerConfig {
        datasource_ids: &[
            DatasourceId::DebianControlExtractedDeb,
            DatasourceId::DebianMd5SumsInExtractedDeb,
        ],
        sibling_file_patterns: &["control", "md5sums"],
        mode: AssemblyMode::SiblingMerge,
    },
    // ABOUT files (*.ABOUT)
    AssemblerConfig {
        datasource_ids: &[DatasourceId::AboutFile],
        sibling_file_patterns: &["*.ABOUT"],
        mode: AssemblyMode::OnePerPackageData,
    },
];
774
/// Datasource IDs intentionally excluded from package assembly.
///
/// This list is runtime-significant: files with these datasource IDs may remain
/// unowned by any `Package`, while their dependencies are still eligible for
/// top-level hoisting. Tests also use it to enforce explicit assembly accounting:
/// every `DatasourceId` must appear either here or in `ASSEMBLERS`, never in both.
pub static UNASSEMBLED_DATASOURCE_IDS: &[DatasourceId] = &[
    // Non-package metadata
    DatasourceId::Readme,
    DatasourceId::EtcOsRelease,
    // Binary archives (require external extraction via ExtractCode before scanning)
    DatasourceId::AlpineApkArchive,
    DatasourceId::AndroidAarLibrary,
    DatasourceId::AndroidApk,
    DatasourceId::AppleDmg,
    DatasourceId::Axis2Mar,
    DatasourceId::ChromeCrx,
    DatasourceId::DebianDeb,
    DatasourceId::DebianOriginalSourceTarball,
    DatasourceId::DebianSourceMetadataTarball,
    DatasourceId::InstallshieldInstaller,
    DatasourceId::IosIpa,
    DatasourceId::IsoDiskImage,
    DatasourceId::JavaEarArchive,
    DatasourceId::JavaJar,
    DatasourceId::JavaWarArchive,
    DatasourceId::JbossSar,
    DatasourceId::MicrosoftCabinet,
    DatasourceId::MozillaXpi,
    DatasourceId::NsisInstaller,
    DatasourceId::RpmArchive,
    DatasourceId::SharShellArchive,
    DatasourceId::SquashfsDiskImage,
    // Supplementary metadata (not primary package definitions)
    // NOTE(review): this group also holds compiled-binary and build-file IDs
    // (e.g. GoBinary, RustBinary, Dockerfile) — confirm the heading still fits.
    DatasourceId::ArchAurinfo,
    DatasourceId::ArchPkginfo,
    DatasourceId::ArchSrcinfo,
    DatasourceId::Axis2ModuleXml,
    DatasourceId::ClojureDepsEdn,
    DatasourceId::ClojureProjectClj,
    DatasourceId::DebianInstalledFilesList,
    DatasourceId::DebianInstalledMd5Sums,
    DatasourceId::DebianCopyright,
    DatasourceId::DebianCopyrightInPackage,
    DatasourceId::DebianCopyrightStandalone,
    DatasourceId::GoBinary,
    DatasourceId::WindowsExecutable,
    DatasourceId::DebianSourceControlDsc,
    DatasourceId::Dockerfile,
    DatasourceId::HexMixLock,
    DatasourceId::JavaEarApplicationXml,
    DatasourceId::JavaWarWebXml,
    DatasourceId::JbossServiceXml,
    DatasourceId::MesonBuild,
    DatasourceId::GemGemspecInstalledSpecifications,
    // The two NuGet props IDs below also gate the NugetCpmResolve post-assembly pass.
    DatasourceId::NugetDirectoryBuildProps,
    DatasourceId::NugetDirectoryPackagesProps,
    DatasourceId::CitationCff,
    DatasourceId::PubliccodeYaml,
    DatasourceId::RpmPackageLicenses,
    DatasourceId::RustBinary,
    DatasourceId::SbtBuildSbt,
    DatasourceId::VcpkgJson,
];
838
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;
    use strum::IntoEnumIterator;

    /// Checks that each `DatasourceId` variant is listed either in some
    /// `ASSEMBLERS` entry or in `UNASSEMBLED_DATASOURCE_IDS`, and never in both.
    #[test]
    fn test_every_datasource_id_is_accounted_for() {
        // Flatten the IDs claimed by every assembler config into one set.
        let assembled: HashSet<DatasourceId> = ASSEMBLERS
            .iter()
            .flat_map(|config| config.datasource_ids.iter().copied())
            .collect();

        let unassembled = UNASSEMBLED_DATASOURCE_IDS
            .iter()
            .copied()
            .collect::<HashSet<DatasourceId>>();

        // An ID claimed by both lists is a registration mistake.
        let overlap: Vec<_> = assembled.intersection(&unassembled).collect();
        assert!(
            overlap.is_empty(),
            "Datasource IDs in BOTH ASSEMBLERS and UNASSEMBLED: {overlap:?}"
        );

        // Any variant that neither list knows about has been forgotten.
        let missing: Vec<_> = DatasourceId::iter()
            .filter(|id| !(assembled.contains(id) || unassembled.contains(id)))
            .collect();

        assert!(
            missing.is_empty(),
            "Datasource IDs in NEITHER ASSEMBLERS nor UNASSEMBLED: {missing:?}\n\
             Add each to an AssemblerConfig in ASSEMBLERS, or to UNASSEMBLED_DATASOURCE_IDS."
        );
    }

    /// Checks that `POST_ASSEMBLY_PASSES` lists no pass more than once.
    #[test]
    fn test_post_assembly_passes_are_unique() {
        // Deduplicate through a set; a shorter set means something repeated.
        let mut distinct: HashSet<PostAssemblyPassKind> = HashSet::new();
        for kind in POST_ASSEMBLY_PASSES.iter() {
            distinct.insert(*kind);
        }

        assert_eq!(
            distinct.len(),
            POST_ASSEMBLY_PASSES.len(),
            "POST_ASSEMBLY_PASSES contains duplicate entries"
        );
    }

    /// Checks that every `PostAssemblyPassKind` variant occurs exactly once in
    /// `POST_ASSEMBLY_PASSES`.
    #[test]
    fn test_every_post_assembly_pass_kind_is_registered_once() {
        let registered = POST_ASSEMBLY_PASSES
            .iter()
            .copied()
            .collect::<HashSet<PostAssemblyPassKind>>();

        // First report the full list of unregistered variants in one message.
        let mut missing = Vec::new();
        for kind in PostAssemblyPassKind::iter() {
            if !registered.contains(&kind) {
                missing.push(kind);
            }
        }

        assert!(
            missing.is_empty(),
            "Post-assembly pass variants not registered in POST_ASSEMBLY_PASSES: {missing:?}"
        );

        // Then verify the exact occurrence count of each variant.
        PostAssemblyPassKind::iter().for_each(|pass| {
            let occurrences = POST_ASSEMBLY_PASSES
                .iter()
                .filter(|&&entry| entry == pass)
                .count();
            assert_eq!(
                occurrences, 1,
                "Post-assembly pass {pass:?} should be registered exactly once"
            );
        });
    }

    /// Checks that no pass reports `should_run` for default (empty) inputs.
    #[test]
    fn test_post_assembly_passes_skip_irrelevant_inputs() {
        let empty_inputs = PostAssemblyInputs::default();

        PostAssemblyPassKind::iter().for_each(|pass| {
            assert!(
                !pass.should_run(&empty_inputs),
                "{pass:?} should skip when no relevant inputs are present"
            );
        });
    }

    /// Checks that npm-only inputs with workspace markers enable exactly the
    /// two npm passes.
    #[test]
    fn test_npm_workspace_inputs_only_run_npm_passes() {
        let expected = HashSet::from([
            PostAssemblyPassKind::NpmResourceAssign,
            PostAssemblyPassKind::NpmWorkspaceMerge,
        ]);

        let npm_inputs = PostAssemblyInputs {
            package_types: HashSet::from([PackageType::Npm]),
            file_datasource_ids: HashSet::from([DatasourceId::NpmPackageJson]),
            has_npm_workspace_markers: true,
            has_cargo_workspace_markers: false,
        };

        let runnable = PostAssemblyPassKind::iter()
            .filter(|kind| kind.should_run(&npm_inputs))
            .collect::<HashSet<_>>();

        assert_eq!(runnable, expected);
    }

    /// Checks that `CargoWorkspaceMerge` runs only when the Cargo workspace
    /// marker flag is set, with all other inputs held equal.
    #[test]
    fn test_cargo_workspace_merge_requires_workspace_markers() {
        // Builds Cargo-only inputs that differ solely in the marker flag.
        let cargo_inputs = |has_cargo_workspace_markers: bool| PostAssemblyInputs {
            package_types: HashSet::from([PackageType::Cargo]),
            file_datasource_ids: HashSet::from([DatasourceId::CargoToml]),
            has_npm_workspace_markers: false,
            has_cargo_workspace_markers,
        };

        let without_markers = cargo_inputs(false);
        assert!(!PostAssemblyPassKind::CargoWorkspaceMerge.should_run(&without_markers));

        let with_markers = cargo_inputs(true);
        assert!(PostAssemblyPassKind::CargoWorkspaceMerge.should_run(&with_markers));
    }
}