Skip to main content

provenant/models/
file_info.rs

1use derive_builder::Builder;
2use packageurl::PackageUrl;
3use serde::{Deserialize, Serialize};
4use std::str::FromStr;
5use uuid::Uuid;
6
7use super::DatasourceId;
8use super::PackageType;
9use crate::models::output::Tallies;
10use crate::utils::spdx::combine_license_expressions;
11
12#[derive(Debug, Builder, Serialize, Deserialize)]
13#[builder(build_fn(skip))]
14/// File-level scan result containing metadata and detected findings.
15pub struct FileInfo {
16    pub name: String,
17    pub base_name: String,
18    pub extension: String,
19    pub path: String,
20    #[serde(rename = "type")] // name used by ScanCode
21    pub file_type: FileType,
22    #[builder(default)]
23    pub mime_type: Option<String>,
24    pub size: u64,
25    #[builder(default)]
26    pub date: Option<String>,
27    #[builder(default)]
28    pub sha1: Option<String>,
29    #[builder(default)]
30    pub md5: Option<String>,
31    #[builder(default)]
32    pub sha256: Option<String>,
33    #[builder(default)]
34    pub programming_language: Option<String>,
35    #[builder(default)]
36    #[serde(default)]
37    pub package_data: Vec<PackageData>,
38    #[serde(rename = "detected_license_expression_spdx")] // name used by ScanCode
39    #[builder(default)]
40    pub license_expression: Option<String>,
41    #[builder(default)]
42    #[serde(default)]
43    pub license_detections: Vec<LicenseDetection>,
44    #[builder(default)]
45    #[serde(default)]
46    pub copyrights: Vec<Copyright>,
47    #[builder(default)]
48    #[serde(default)]
49    pub holders: Vec<Holder>,
50    #[builder(default)]
51    #[serde(default)]
52    pub authors: Vec<Author>,
53    #[builder(default)]
54    #[serde(skip_serializing_if = "Vec::is_empty", default)]
55    pub emails: Vec<OutputEmail>,
56    #[builder(default)]
57    #[serde(default)]
58    pub urls: Vec<OutputURL>,
59    #[builder(default)]
60    #[serde(default)]
61    pub for_packages: Vec<String>,
62    #[builder(default)]
63    #[serde(default)]
64    pub scan_errors: Vec<String>,
65    #[builder(default)]
66    #[serde(skip_serializing_if = "Option::is_none", default)]
67    pub is_generated: Option<bool>,
68    #[builder(default)]
69    #[serde(skip_serializing_if = "Option::is_none", default)]
70    pub is_source: Option<bool>,
71    #[builder(default)]
72    #[serde(skip_serializing_if = "Option::is_none", default)]
73    pub source_count: Option<usize>,
74    #[builder(default)]
75    #[serde(skip_serializing_if = "is_false", default)]
76    pub is_legal: bool,
77    #[builder(default)]
78    #[serde(skip_serializing_if = "is_false", default)]
79    pub is_manifest: bool,
80    #[builder(default)]
81    #[serde(skip_serializing_if = "is_false", default)]
82    pub is_readme: bool,
83    #[builder(default)]
84    #[serde(skip_serializing_if = "is_false", default)]
85    pub is_top_level: bool,
86    #[builder(default)]
87    #[serde(skip_serializing_if = "is_false", default)]
88    pub is_key_file: bool,
89    #[builder(default)]
90    #[serde(skip_serializing_if = "is_false", default)]
91    pub is_community: bool,
92    #[builder(default)]
93    #[serde(skip_serializing_if = "Vec::is_empty", default)]
94    pub facets: Vec<String>,
95    #[builder(default)]
96    #[serde(skip_serializing_if = "Option::is_none", default)]
97    pub tallies: Option<Tallies>,
98}
99
100impl FileInfoBuilder {
101    /// Build a [`FileInfo`] from the current builder state.
102    pub fn build(&self) -> Result<FileInfo, String> {
103        Ok(FileInfo::new(
104            self.name.clone().ok_or("Missing field: name")?,
105            self.base_name.clone().ok_or("Missing field: base_name")?,
106            self.extension.clone().ok_or("Missing field: extension")?,
107            self.path.clone().ok_or("Missing field: path")?,
108            self.file_type.clone().ok_or("Missing field: file_type")?,
109            self.mime_type.clone().flatten(),
110            self.size.ok_or("Missing field: size")?,
111            self.date.clone().flatten(),
112            self.sha1.clone().flatten(),
113            self.md5.clone().flatten(),
114            self.sha256.clone().flatten(),
115            self.programming_language.clone().flatten(),
116            self.package_data.clone().unwrap_or_default(),
117            self.license_expression.clone().flatten(),
118            self.license_detections.clone().unwrap_or_default(),
119            self.copyrights.clone().unwrap_or_default(),
120            self.holders.clone().unwrap_or_default(),
121            self.authors.clone().unwrap_or_default(),
122            self.emails.clone().unwrap_or_default(),
123            self.urls.clone().unwrap_or_default(),
124            self.for_packages.clone().unwrap_or_default(),
125            self.scan_errors.clone().unwrap_or_default(),
126        ))
127    }
128}
129
130impl FileInfo {
131    #[allow(clippy::too_many_arguments)]
132    /// Construct a [`FileInfo`] from fully resolved scanner fields.
133    pub fn new(
134        name: String,
135        base_name: String,
136        extension: String,
137        path: String,
138        file_type: FileType,
139        mime_type: Option<String>,
140        size: u64,
141        date: Option<String>,
142        sha1: Option<String>,
143        md5: Option<String>,
144        sha256: Option<String>,
145        programming_language: Option<String>,
146        package_data: Vec<PackageData>,
147        mut license_expression: Option<String>,
148        mut license_detections: Vec<LicenseDetection>,
149        copyrights: Vec<Copyright>,
150        holders: Vec<Holder>,
151        authors: Vec<Author>,
152        emails: Vec<OutputEmail>,
153        urls: Vec<OutputURL>,
154        for_packages: Vec<String>,
155        scan_errors: Vec<String>,
156    ) -> Self {
157        // Combine license expressions from package data if license_expression is None
158        license_expression = license_expression.or_else(|| {
159            let expressions = package_data
160                .iter()
161                .filter_map(|pkg| pkg.get_license_expression());
162            combine_license_expressions(expressions)
163        });
164
165        // Combine license detections from package data if none are provided
166        if license_detections.is_empty() {
167            for pkg in &package_data {
168                license_detections.extend(pkg.license_detections.clone());
169            }
170        }
171
172        // Combine license expressions from license detections if license_expression is still None
173        if license_expression.is_none() && !license_detections.is_empty() {
174            let expressions = license_detections
175                .iter()
176                .map(|detection| detection.license_expression.clone());
177            license_expression = combine_license_expressions(expressions);
178        }
179
180        FileInfo {
181            name,
182            base_name,
183            extension,
184            path,
185            file_type,
186            mime_type,
187            size,
188            date,
189            sha1,
190            md5,
191            sha256,
192            programming_language,
193            package_data,
194            license_expression,
195            license_detections,
196            copyrights,
197            holders,
198            authors,
199            emails,
200            urls,
201            for_packages,
202            scan_errors,
203            is_generated: None,
204            is_source: None,
205            source_count: None,
206            is_legal: false,
207            is_manifest: false,
208            is_readme: false,
209            is_top_level: false,
210            is_key_file: false,
211            is_community: false,
212            facets: vec![],
213            tallies: None,
214        }
215    }
216}
217
218/// Package metadata extracted from manifest files.
219///
220/// Compatible with ScanCode Toolkit output format. Contains standardized package
221/// information including name, version, dependencies, licenses, and other metadata.
222/// This is the primary data structure returned by all parsers.
223#[derive(Serialize, Deserialize, Debug, Clone, Default)]
224pub struct PackageData {
225    #[serde(rename = "type")] // name used by ScanCode
226    pub package_type: Option<PackageType>,
227    #[serde(skip_serializing_if = "Option::is_none")]
228    pub namespace: Option<String>,
229    #[serde(skip_serializing_if = "Option::is_none")]
230    pub name: Option<String>,
231    #[serde(skip_serializing_if = "Option::is_none")]
232    pub version: Option<String>,
233    #[serde(skip_serializing_if = "Option::is_none")]
234    pub qualifiers: Option<std::collections::HashMap<String, String>>,
235    #[serde(skip_serializing_if = "Option::is_none")]
236    pub subpath: Option<String>,
237    #[serde(skip_serializing_if = "Option::is_none")]
238    pub primary_language: Option<String>,
239    #[serde(skip_serializing_if = "Option::is_none")]
240    pub description: Option<String>,
241    #[serde(skip_serializing_if = "Option::is_none")]
242    pub release_date: Option<String>,
243    pub parties: Vec<Party>,
244    #[serde(skip_serializing_if = "Vec::is_empty", default)]
245    pub keywords: Vec<String>,
246    #[serde(skip_serializing_if = "Option::is_none")]
247    pub homepage_url: Option<String>,
248    #[serde(skip_serializing_if = "Option::is_none")]
249    pub download_url: Option<String>,
250    #[serde(skip_serializing_if = "Option::is_none")]
251    pub size: Option<u64>,
252    #[serde(skip_serializing_if = "Option::is_none")]
253    pub sha1: Option<String>,
254    #[serde(skip_serializing_if = "Option::is_none")]
255    pub md5: Option<String>,
256    #[serde(skip_serializing_if = "Option::is_none")]
257    pub sha256: Option<String>,
258    #[serde(skip_serializing_if = "Option::is_none")]
259    pub sha512: Option<String>,
260    #[serde(skip_serializing_if = "Option::is_none")]
261    pub bug_tracking_url: Option<String>,
262    #[serde(skip_serializing_if = "Option::is_none")]
263    pub code_view_url: Option<String>,
264    #[serde(skip_serializing_if = "Option::is_none")]
265    pub vcs_url: Option<String>,
266    #[serde(skip_serializing_if = "Option::is_none")]
267    pub copyright: Option<String>,
268    #[serde(skip_serializing_if = "Option::is_none")]
269    pub holder: Option<String>,
270    #[serde(skip_serializing_if = "Option::is_none")]
271    pub declared_license_expression: Option<String>,
272    #[serde(skip_serializing_if = "Option::is_none")]
273    pub declared_license_expression_spdx: Option<String>,
274    #[serde(skip_serializing_if = "Vec::is_empty", default)]
275    pub license_detections: Vec<LicenseDetection>,
276    #[serde(skip_serializing_if = "Option::is_none")]
277    pub other_license_expression: Option<String>,
278    #[serde(skip_serializing_if = "Option::is_none")]
279    pub other_license_expression_spdx: Option<String>,
280    #[serde(skip_serializing_if = "Vec::is_empty", default)]
281    pub other_license_detections: Vec<LicenseDetection>,
282    #[serde(skip_serializing_if = "Option::is_none")]
283    pub extracted_license_statement: Option<String>,
284    #[serde(skip_serializing_if = "Option::is_none")]
285    pub notice_text: Option<String>,
286    #[serde(skip_serializing_if = "Vec::is_empty", default)]
287    pub source_packages: Vec<String>,
288    #[serde(skip_serializing_if = "Vec::is_empty", default)]
289    pub file_references: Vec<FileReference>,
290    #[serde(skip_serializing_if = "is_false", default)]
291    pub is_private: bool,
292    #[serde(skip_serializing_if = "is_false", default)]
293    pub is_virtual: bool,
294    #[serde(skip_serializing_if = "Option::is_none")]
295    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
296    #[serde(skip_serializing_if = "Vec::is_empty", default)]
297    pub dependencies: Vec<Dependency>,
298    #[serde(skip_serializing_if = "Option::is_none")]
299    pub repository_homepage_url: Option<String>,
300    #[serde(skip_serializing_if = "Option::is_none")]
301    pub repository_download_url: Option<String>,
302    #[serde(skip_serializing_if = "Option::is_none")]
303    pub api_data_url: Option<String>,
304    #[serde(skip_serializing_if = "Option::is_none")]
305    pub datasource_id: Option<DatasourceId>,
306    #[serde(skip_serializing_if = "Option::is_none")]
307    pub purl: Option<String>,
308}
309
/// Helper for serde's `skip_serializing_if`: returns `true` when the flag is
/// `false`, so that unset boolean fields are left out of serialized output.
///
/// Takes `&bool` because serde passes the field by reference.
fn is_false(b: &bool) -> bool {
    !*b
}
314
315impl PackageData {
316    /// Extracts a single license expression from all license detections in this package.
317    /// Returns None if there are no license detections.
318    pub fn get_license_expression(&self) -> Option<String> {
319        if self.license_detections.is_empty() {
320            return None;
321        }
322
323        let expressions = self
324            .license_detections
325            .iter()
326            .map(|detection| detection.license_expression.clone());
327        combine_license_expressions(expressions)
328    }
329}
330
331/// License detection result containing matched license expressions.
332///
333/// Aggregates multiple license matches into a single SPDX license expression.
334#[derive(Serialize, Deserialize, Debug, Clone)]
335pub struct LicenseDetection {
336    pub license_expression: String,
337    pub license_expression_spdx: String,
338    pub matches: Vec<Match>,
339    #[serde(skip_serializing_if = "Option::is_none")]
340    pub identifier: Option<String>,
341}
342
343/// Individual license text match with location and confidence score.
344///
345/// Represents a specific region of text that matched a known license pattern.
346#[derive(Serialize, Deserialize, Debug, Clone)]
347pub struct Match {
348    pub license_expression: String,
349    pub license_expression_spdx: String,
350    #[serde(skip_serializing_if = "Option::is_none")]
351    pub from_file: Option<String>,
352    pub start_line: usize,
353    pub end_line: usize,
354    #[serde(skip_serializing_if = "Option::is_none")]
355    pub matcher: Option<String>,
356    pub score: f64,
357    #[serde(skip_serializing_if = "Option::is_none")]
358    pub matched_length: Option<usize>,
359    #[serde(skip_serializing_if = "Option::is_none")]
360    pub match_coverage: Option<f64>,
361    #[serde(skip_serializing_if = "Option::is_none")]
362    pub rule_relevance: Option<usize>,
363    #[serde(skip_serializing_if = "Option::is_none")]
364    pub rule_identifier: Option<String>,
365    #[serde(skip_serializing_if = "Option::is_none")]
366    pub rule_url: Option<String>,
367    #[serde(skip_serializing_if = "Option::is_none")]
368    pub matched_text: Option<String>,
369}
370
371#[derive(Serialize, Deserialize, Debug, Clone)]
372pub struct Copyright {
373    pub copyright: String,
374    pub start_line: usize,
375    pub end_line: usize,
376}
377
378#[derive(Serialize, Deserialize, Debug, Clone)]
379pub struct Holder {
380    pub holder: String,
381    pub start_line: usize,
382    pub end_line: usize,
383}
384
385#[derive(Serialize, Deserialize, Debug, Clone)]
386pub struct Author {
387    pub author: String,
388    pub start_line: usize,
389    pub end_line: usize,
390}
391
392/// Package dependency information with version constraints.
393///
394/// Represents a declared dependency with scope (e.g., runtime, dev, optional)
395/// and optional resolved package details.
396#[derive(Serialize, Deserialize, Debug, Clone)]
397pub struct Dependency {
398    pub purl: Option<String>,
399    #[serde(skip_serializing_if = "Option::is_none")]
400    pub extracted_requirement: Option<String>,
401    pub scope: Option<String>,
402    #[serde(skip_serializing_if = "Option::is_none")]
403    pub is_runtime: Option<bool>,
404    #[serde(skip_serializing_if = "Option::is_none")]
405    pub is_optional: Option<bool>,
406    #[serde(skip_serializing_if = "Option::is_none")]
407    pub is_pinned: Option<bool>,
408    #[serde(skip_serializing_if = "Option::is_none")]
409    pub is_direct: Option<bool>,
410    #[serde(skip_serializing_if = "Option::is_none")]
411    pub resolved_package: Option<Box<ResolvedPackage>>,
412    #[serde(skip_serializing_if = "Option::is_none")]
413    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
414}
415
416#[derive(Serialize, Deserialize, Debug, Clone)]
417pub struct ResolvedPackage {
418    #[serde(rename = "type")]
419    pub package_type: PackageType,
420    #[serde(skip_serializing_if = "String::is_empty")]
421    pub namespace: String,
422    pub name: String,
423    pub version: String,
424    #[serde(skip_serializing_if = "Option::is_none")]
425    pub primary_language: Option<String>,
426    #[serde(skip_serializing_if = "Option::is_none")]
427    pub download_url: Option<String>,
428    #[serde(skip_serializing_if = "Option::is_none")]
429    pub sha1: Option<String>,
430    #[serde(skip_serializing_if = "Option::is_none")]
431    pub sha256: Option<String>,
432    #[serde(skip_serializing_if = "Option::is_none")]
433    pub sha512: Option<String>,
434    #[serde(skip_serializing_if = "Option::is_none")]
435    pub md5: Option<String>,
436    pub is_virtual: bool,
437    #[serde(skip_serializing_if = "Option::is_none")]
438    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
439    pub dependencies: Vec<Dependency>,
440    #[serde(skip_serializing_if = "Option::is_none")]
441    pub repository_homepage_url: Option<String>,
442    #[serde(skip_serializing_if = "Option::is_none")]
443    pub repository_download_url: Option<String>,
444    #[serde(skip_serializing_if = "Option::is_none")]
445    pub api_data_url: Option<String>,
446    #[serde(skip_serializing_if = "Option::is_none")]
447    pub datasource_id: Option<DatasourceId>,
448    #[serde(skip_serializing_if = "Option::is_none")]
449    pub purl: Option<String>,
450}
451
452/// Author, maintainer, or contributor information.
453///
454/// Represents a person or organization associated with a package.
455#[derive(Serialize, Deserialize, Debug, Clone)]
456pub struct Party {
457    #[serde(skip_serializing_if = "Option::is_none")]
458    pub r#type: Option<String>,
459    #[serde(skip_serializing_if = "Option::is_none")]
460    pub role: Option<String>,
461    #[serde(skip_serializing_if = "Option::is_none")]
462    pub name: Option<String>,
463    #[serde(skip_serializing_if = "Option::is_none")]
464    pub email: Option<String>,
465    #[serde(skip_serializing_if = "Option::is_none")]
466    pub url: Option<String>,
467    #[serde(skip_serializing_if = "Option::is_none")]
468    pub organization: Option<String>,
469    #[serde(skip_serializing_if = "Option::is_none")]
470    pub organization_url: Option<String>,
471    #[serde(skip_serializing_if = "Option::is_none")]
472    pub timezone: Option<String>,
473}
474
475/// Reference to a file within a package archive with checksums.
476///
477/// Used in SBOM generation to track files within distribution archives.
478#[derive(Serialize, Deserialize, Debug, Clone)]
479pub struct FileReference {
480    pub path: String,
481    #[serde(skip_serializing_if = "Option::is_none")]
482    pub size: Option<u64>,
483    #[serde(skip_serializing_if = "Option::is_none")]
484    pub sha1: Option<String>,
485    #[serde(skip_serializing_if = "Option::is_none")]
486    pub md5: Option<String>,
487    #[serde(skip_serializing_if = "Option::is_none")]
488    pub sha256: Option<String>,
489    #[serde(skip_serializing_if = "Option::is_none")]
490    pub sha512: Option<String>,
491    #[serde(skip_serializing_if = "Option::is_none")]
492    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
493}
494
495/// Top-level assembled package, created by merging one or more `PackageData`
496/// objects from related manifest/lockfiles (e.g., package.json + package-lock.json).
497///
498/// Compatible with ScanCode Toolkit output format. The key differences from
499/// `PackageData` are:
500/// - `package_uid`: unique identifier (PURL with UUID qualifier)
501/// - `datafile_paths`: list of all contributing files
502/// - `datasource_ids`: list of all contributing parsers
503/// - Excludes `dependencies` and `file_references` (hoisted to top-level)
504#[derive(Serialize, Deserialize, Debug, Clone)]
505pub struct Package {
506    #[serde(rename = "type")]
507    pub package_type: Option<PackageType>,
508    #[serde(skip_serializing_if = "Option::is_none")]
509    pub namespace: Option<String>,
510    #[serde(skip_serializing_if = "Option::is_none")]
511    pub name: Option<String>,
512    #[serde(skip_serializing_if = "Option::is_none")]
513    pub version: Option<String>,
514    #[serde(skip_serializing_if = "Option::is_none")]
515    pub qualifiers: Option<std::collections::HashMap<String, String>>,
516    #[serde(skip_serializing_if = "Option::is_none")]
517    pub subpath: Option<String>,
518    #[serde(skip_serializing_if = "Option::is_none")]
519    pub primary_language: Option<String>,
520    #[serde(skip_serializing_if = "Option::is_none")]
521    pub description: Option<String>,
522    #[serde(skip_serializing_if = "Option::is_none")]
523    pub release_date: Option<String>,
524    pub parties: Vec<Party>,
525    #[serde(skip_serializing_if = "Vec::is_empty", default)]
526    pub keywords: Vec<String>,
527    #[serde(skip_serializing_if = "Option::is_none")]
528    pub homepage_url: Option<String>,
529    #[serde(skip_serializing_if = "Option::is_none")]
530    pub download_url: Option<String>,
531    #[serde(skip_serializing_if = "Option::is_none")]
532    pub size: Option<u64>,
533    #[serde(skip_serializing_if = "Option::is_none")]
534    pub sha1: Option<String>,
535    #[serde(skip_serializing_if = "Option::is_none")]
536    pub md5: Option<String>,
537    #[serde(skip_serializing_if = "Option::is_none")]
538    pub sha256: Option<String>,
539    #[serde(skip_serializing_if = "Option::is_none")]
540    pub sha512: Option<String>,
541    #[serde(skip_serializing_if = "Option::is_none")]
542    pub bug_tracking_url: Option<String>,
543    #[serde(skip_serializing_if = "Option::is_none")]
544    pub code_view_url: Option<String>,
545    #[serde(skip_serializing_if = "Option::is_none")]
546    pub vcs_url: Option<String>,
547    #[serde(skip_serializing_if = "Option::is_none")]
548    pub copyright: Option<String>,
549    #[serde(skip_serializing_if = "Option::is_none")]
550    pub holder: Option<String>,
551    #[serde(skip_serializing_if = "Option::is_none")]
552    pub declared_license_expression: Option<String>,
553    #[serde(skip_serializing_if = "Option::is_none")]
554    pub declared_license_expression_spdx: Option<String>,
555    #[serde(skip_serializing_if = "Vec::is_empty", default)]
556    pub license_detections: Vec<LicenseDetection>,
557    #[serde(skip_serializing_if = "Option::is_none")]
558    pub other_license_expression: Option<String>,
559    #[serde(skip_serializing_if = "Option::is_none")]
560    pub other_license_expression_spdx: Option<String>,
561    #[serde(skip_serializing_if = "Vec::is_empty", default)]
562    pub other_license_detections: Vec<LicenseDetection>,
563    #[serde(skip_serializing_if = "Option::is_none")]
564    pub extracted_license_statement: Option<String>,
565    #[serde(skip_serializing_if = "Option::is_none")]
566    pub notice_text: Option<String>,
567    #[serde(skip_serializing_if = "Vec::is_empty", default)]
568    pub source_packages: Vec<String>,
569    #[serde(skip_serializing_if = "is_false", default)]
570    pub is_private: bool,
571    #[serde(skip_serializing_if = "is_false", default)]
572    pub is_virtual: bool,
573    #[serde(skip_serializing_if = "Option::is_none")]
574    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
575    #[serde(skip_serializing_if = "Option::is_none")]
576    pub repository_homepage_url: Option<String>,
577    #[serde(skip_serializing_if = "Option::is_none")]
578    pub repository_download_url: Option<String>,
579    #[serde(skip_serializing_if = "Option::is_none")]
580    pub api_data_url: Option<String>,
581    #[serde(skip_serializing_if = "Option::is_none")]
582    pub purl: Option<String>,
583    /// Unique identifier for this package instance (PURL with UUID qualifier).
584    pub package_uid: String,
585    /// Paths to all datafiles that contributed to this package.
586    pub datafile_paths: Vec<String>,
587    /// Datasource identifiers for all parsers that contributed to this package.
588    pub datasource_ids: Vec<DatasourceId>,
589}
590
591impl Package {
592    /// Create a `Package` from a `PackageData` and its source file path.
593    ///
594    /// Generates a unique `package_uid` by appending a UUID qualifier to the PURL.
595    /// If the `PackageData` has no PURL, the package_uid will be an empty string.
596    pub fn from_package_data(package_data: &PackageData, datafile_path: String) -> Self {
597        let package_uid = package_data
598            .purl
599            .as_ref()
600            .map(|p| build_package_uid(p))
601            .unwrap_or_default();
602
603        Package {
604            package_type: package_data.package_type,
605            namespace: package_data.namespace.clone(),
606            name: package_data.name.clone(),
607            version: package_data.version.clone(),
608            qualifiers: package_data.qualifiers.clone(),
609            subpath: package_data.subpath.clone(),
610            primary_language: package_data.primary_language.clone(),
611            description: package_data.description.clone(),
612            release_date: package_data.release_date.clone(),
613            parties: package_data.parties.clone(),
614            keywords: package_data.keywords.clone(),
615            homepage_url: package_data.homepage_url.clone(),
616            download_url: package_data.download_url.clone(),
617            size: package_data.size,
618            sha1: package_data.sha1.clone(),
619            md5: package_data.md5.clone(),
620            sha256: package_data.sha256.clone(),
621            sha512: package_data.sha512.clone(),
622            bug_tracking_url: package_data.bug_tracking_url.clone(),
623            code_view_url: package_data.code_view_url.clone(),
624            vcs_url: package_data.vcs_url.clone(),
625            copyright: package_data.copyright.clone(),
626            holder: package_data.holder.clone(),
627            declared_license_expression: package_data.declared_license_expression.clone(),
628            declared_license_expression_spdx: package_data.declared_license_expression_spdx.clone(),
629            license_detections: package_data.license_detections.clone(),
630            other_license_expression: package_data.other_license_expression.clone(),
631            other_license_expression_spdx: package_data.other_license_expression_spdx.clone(),
632            other_license_detections: package_data.other_license_detections.clone(),
633            extracted_license_statement: package_data.extracted_license_statement.clone(),
634            notice_text: package_data.notice_text.clone(),
635            source_packages: package_data.source_packages.clone(),
636            is_private: package_data.is_private,
637            is_virtual: package_data.is_virtual,
638            extra_data: package_data.extra_data.clone(),
639            repository_homepage_url: package_data.repository_homepage_url.clone(),
640            repository_download_url: package_data.repository_download_url.clone(),
641            api_data_url: package_data.api_data_url.clone(),
642            purl: package_data.purl.clone(),
643            package_uid,
644            datafile_paths: vec![datafile_path],
645            datasource_ids: if let Some(dsid) = package_data.datasource_id {
646                vec![dsid]
647            } else {
648                vec![]
649            },
650        }
651    }
652
653    /// Update this package with data from another `PackageData`.
654    ///
655    /// Merges data from a related file (e.g., lockfile) into this package.
656    /// Existing non-empty values are preserved; empty fields are filled from
657    /// the new data. Lists (parties, license_detections) are merged.
658    pub fn update(&mut self, package_data: &PackageData, datafile_path: String) {
659        if let Some(dsid) = package_data.datasource_id {
660            self.datasource_ids.push(dsid);
661        }
662        self.datafile_paths.push(datafile_path);
663
664        macro_rules! fill_if_empty {
665            ($field:ident) => {
666                if self.$field.is_none() {
667                    self.$field = package_data.$field.clone();
668                }
669            };
670        }
671
672        fill_if_empty!(package_type);
673        fill_if_empty!(name);
674        fill_if_empty!(namespace);
675        fill_if_empty!(version);
676        fill_if_empty!(qualifiers);
677        fill_if_empty!(subpath);
678        fill_if_empty!(primary_language);
679        fill_if_empty!(description);
680        fill_if_empty!(release_date);
681        fill_if_empty!(homepage_url);
682        fill_if_empty!(download_url);
683        fill_if_empty!(size);
684        fill_if_empty!(sha1);
685        fill_if_empty!(md5);
686        fill_if_empty!(sha256);
687        fill_if_empty!(sha512);
688        fill_if_empty!(bug_tracking_url);
689        fill_if_empty!(code_view_url);
690        fill_if_empty!(vcs_url);
691        fill_if_empty!(copyright);
692        fill_if_empty!(holder);
693        fill_if_empty!(declared_license_expression);
694        fill_if_empty!(declared_license_expression_spdx);
695        fill_if_empty!(other_license_expression);
696        fill_if_empty!(other_license_expression_spdx);
697        fill_if_empty!(extracted_license_statement);
698        fill_if_empty!(notice_text);
699        match (&mut self.extra_data, &package_data.extra_data) {
700            (None, Some(extra_data)) => {
701                self.extra_data = Some(extra_data.clone());
702            }
703            (Some(existing), Some(incoming)) => {
704                for (key, value) in incoming {
705                    existing.entry(key.clone()).or_insert_with(|| value.clone());
706                }
707            }
708            _ => {}
709        }
710        fill_if_empty!(repository_homepage_url);
711        fill_if_empty!(repository_download_url);
712        fill_if_empty!(api_data_url);
713
714        for party in &package_data.parties {
715            if let Some(existing) = self.parties.iter_mut().find(|p| {
716                p.role == party.role
717                    && ((p.name.is_some() && p.name == party.name)
718                        || (p.email.is_some() && p.email == party.email))
719            }) {
720                if existing.name.is_none() {
721                    existing.name = party.name.clone();
722                }
723                if existing.email.is_none() {
724                    existing.email = party.email.clone();
725                }
726            } else {
727                self.parties.push(party.clone());
728            }
729        }
730
731        for keyword in &package_data.keywords {
732            if !self.keywords.contains(keyword) {
733                self.keywords.push(keyword.clone());
734            }
735        }
736
737        for detection in &package_data.license_detections {
738            self.license_detections.push(detection.clone());
739        }
740
741        for detection in &package_data.other_license_detections {
742            self.other_license_detections.push(detection.clone());
743        }
744
745        for source_pkg in &package_data.source_packages {
746            if !self.source_packages.contains(source_pkg) {
747                self.source_packages.push(source_pkg.clone());
748            }
749        }
750
751        self.refresh_identity();
752    }
753
754    fn refresh_identity(&mut self) {
755        let Some(next_purl) = self.build_current_purl() else {
756            return;
757        };
758
759        if self.purl.as_deref() != Some(next_purl.as_str()) || self.package_uid.is_empty() {
760            self.package_uid = build_package_uid(&next_purl);
761        }
762
763        self.purl = Some(next_purl);
764    }
765
766    fn build_current_purl(&self) -> Option<String> {
767        if let (Some(package_type), Some(name)) = (
768            self.package_type.as_ref(),
769            self.name
770                .as_deref()
771                .filter(|value| !value.trim().is_empty()),
772        ) {
773            let purl_type = match package_type {
774                PackageType::Deno => "generic",
775                _ => package_type.as_str(),
776            };
777
778            let mut purl = PackageUrl::new(purl_type, name).ok()?;
779
780            if let Some(namespace) = self
781                .namespace
782                .as_deref()
783                .filter(|value| !value.trim().is_empty())
784            {
785                purl.with_namespace(namespace).ok()?;
786            }
787
788            if let Some(version) = self
789                .version
790                .as_deref()
791                .filter(|value| !value.trim().is_empty())
792            {
793                purl.with_version(version).ok()?;
794            }
795
796            if let Some(qualifiers) = &self.qualifiers {
797                for (key, value) in qualifiers {
798                    purl.add_qualifier(key.as_str(), value.as_str()).ok()?;
799                }
800            }
801
802            if let Some(subpath) = self
803                .subpath
804                .as_deref()
805                .filter(|value| !value.trim().is_empty())
806            {
807                purl.with_subpath(subpath).ok()?;
808            }
809
810            return Some(purl.to_string());
811        }
812
813        let existing_purl = self.purl.as_deref()?;
814        let mut purl = PackageUrl::from_str(existing_purl).ok()?;
815
816        if let Some(version) = self
817            .version
818            .as_deref()
819            .filter(|value| !value.trim().is_empty())
820        {
821            purl.with_version(version).ok()?;
822        } else {
823            purl.without_version();
824        }
825
826        Some(purl.to_string())
827    }
828}
829
/// Top-level dependency instance, created during package assembly.
///
/// Extends the file-level `Dependency` with traceability fields that link
/// each dependency to its owning package and source datafile.
///
/// Serialization: `purl`, `scope`, `dependency_uid`, `datafile_path` and
/// `datasource_id` are always written; every other `Option` field is omitted
/// from the output when it is `None`.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct TopLevelDependency {
    /// PURL string of the dependency, copied from the file-level `Dependency`.
    /// Serialized even when `None`.
    pub purl: Option<String>,
    /// Requirement text copied from the file-level `Dependency`.
    /// NOTE(review): presumably the raw version constraint as written in the
    /// manifest — confirm against the parsers.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extracted_requirement: Option<String>,
    /// Dependency scope copied from the file-level `Dependency`.
    /// Serialized even when `None`.
    pub scope: Option<String>,
    // The `is_*` flags below are copied verbatim by `from_dependency`; a `None`
    // is preserved as-is rather than being replaced by a default.
    /// Runtime flag of the dependency, if known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub is_runtime: Option<bool>,
    /// Optional flag of the dependency, if known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub is_optional: Option<bool>,
    /// Pinned flag of the dependency, if known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub is_pinned: Option<bool>,
    /// Direct flag of the dependency, if known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub is_direct: Option<bool>,
    /// Resolved package information attached to the dependency, if any.
    /// Boxed, so the (potentially large) payload lives behind a pointer.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub resolved_package: Option<Box<ResolvedPackage>>,
    /// Free-form extra key/value data carried over from the file-level `Dependency`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
    /// Unique identifier for this dependency instance (PURL with UUID qualifier).
    ///
    /// `from_dependency` leaves this as an empty string when the dependency
    /// has no PURL.
    pub dependency_uid: String,
    /// The `package_uid` of the package this dependency belongs to.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub for_package_uid: Option<String>,
    /// Path to the datafile where this dependency was declared.
    pub datafile_path: String,
    /// Datasource identifier for the parser that extracted this dependency.
    pub datasource_id: DatasourceId,
    /// Namespace for the dependency (e.g., distribution name for RPM packages).
    ///
    /// `from_dependency` always initializes this to `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub namespace: Option<String>,
}
865
866impl TopLevelDependency {
867    /// Create a `TopLevelDependency` from a file-level `Dependency`.
868    pub fn from_dependency(
869        dep: &Dependency,
870        datafile_path: String,
871        datasource_id: DatasourceId,
872        for_package_uid: Option<String>,
873    ) -> Self {
874        let dependency_uid = dep
875            .purl
876            .as_ref()
877            .map(|p| build_package_uid(p))
878            .unwrap_or_default();
879
880        TopLevelDependency {
881            purl: dep.purl.clone(),
882            extracted_requirement: dep.extracted_requirement.clone(),
883            scope: dep.scope.clone(),
884            is_runtime: dep.is_runtime,
885            is_optional: dep.is_optional,
886            is_pinned: dep.is_pinned,
887            is_direct: dep.is_direct,
888            resolved_package: dep.resolved_package.clone(),
889            extra_data: dep.extra_data.clone(),
890            dependency_uid,
891            for_package_uid,
892            datafile_path,
893            datasource_id,
894            namespace: None,
895        }
896    }
897}
898
899/// Generate a unique package identifier by appending a UUID v4 qualifier to a PURL.
900///
901/// The format matches Python ScanCode: `pkg:type/name@version?uuid=<uuid-v4>`
902pub fn build_package_uid(purl: &str) -> String {
903    let uuid = Uuid::new_v4();
904    if purl.contains('?') {
905        format!("{}&uuid={}", purl, uuid)
906    } else {
907        format!("{}?uuid={}", purl, uuid)
908    }
909}
910
/// An email address found in a scanned file, together with the line span
/// where it was found. Stored in `FileInfo::emails`.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct OutputEmail {
    /// The email address text.
    pub email: String,
    /// Line on which the match starts.
    /// NOTE(review): presumably 1-based, as in ScanCode output — confirm.
    pub start_line: usize,
    /// Line on which the match ends.
    pub end_line: usize,
}
917
/// A URL found in a scanned file, together with the line span where it was
/// found. Stored in `FileInfo::urls`.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct OutputURL {
    /// The URL text.
    pub url: String,
    /// Line on which the match starts.
    /// NOTE(review): presumably 1-based, as in ScanCode output — confirm.
    pub start_line: usize,
    /// Line on which the match ends.
    pub end_line: usize,
}
924
/// Kind of filesystem entry described by a `FileInfo` (its `type` field in
/// serialized output).
///
/// Serde support is implemented by hand rather than derived: the variants are
/// written and read as the lowercase strings `"file"` and `"directory"`.
#[derive(Debug, Clone, PartialEq)]
pub enum FileType {
    /// A regular file; serialized as `"file"`.
    File,
    /// A directory; serialized as `"directory"`.
    Directory,
}
930
931impl Serialize for FileType {
932    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
933    where
934        S: serde::Serializer,
935    {
936        let value = match self {
937            FileType::File => "file",
938            FileType::Directory => "directory",
939        };
940        serializer.serialize_str(value)
941    }
942}
943
944impl<'de> Deserialize<'de> for FileType {
945    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
946    where
947        D: serde::Deserializer<'de>,
948    {
949        let value = String::deserialize(deserializer)?;
950        match value.as_str() {
951            "file" => Ok(FileType::File),
952            "directory" => Ok(FileType::Directory),
953            _ => Err(serde::de::Error::custom("invalid file type")),
954        }
955    }
956}