Skip to main content

provenant/models/
file_info.rs

1use derive_builder::Builder;
2use packageurl::PackageUrl;
3use serde::{Deserialize, Serialize};
4use std::str::FromStr;
5use uuid::Uuid;
6
7use super::DatasourceId;
8use super::PackageType;
9use crate::utils::spdx::combine_license_expressions;
10
#[derive(Debug, Builder, Serialize, Deserialize)]
#[builder(build_fn(skip))]
/// File-level scan result containing metadata and detected findings.
///
/// The builder's generated `build` function is skipped (`build_fn(skip)`);
/// `FileInfoBuilder::build` is written by hand in this module. Fields carrying
/// `#[serde(default)]` may be absent from deserialized input and then take
/// their type's default value.
pub struct FileInfo {
    pub name: String,
    pub base_name: String,
    pub extension: String,
    pub path: String,
    /// Serialized under the key `type`.
    #[serde(rename = "type")] // name used by ScanCode
    pub file_type: FileType,
    #[builder(default)]
    pub mime_type: Option<String>,
    pub size: u64,
    #[builder(default)]
    pub date: Option<String>,
    #[builder(default)]
    pub sha1: Option<String>,
    #[builder(default)]
    pub md5: Option<String>,
    #[builder(default)]
    pub sha256: Option<String>,
    #[builder(default)]
    pub programming_language: Option<String>,
    #[builder(default)]
    #[serde(default)]
    pub package_data: Vec<PackageData>,
    /// Serialized under the key `detected_license_expression_spdx`.
    #[serde(rename = "detected_license_expression_spdx")] // name used by ScanCode
    #[builder(default)]
    pub license_expression: Option<String>,
    #[builder(default)]
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    #[builder(default)]
    #[serde(default)]
    pub copyrights: Vec<Copyright>,
    #[builder(default)]
    #[serde(default)]
    pub holders: Vec<Holder>,
    #[builder(default)]
    #[serde(default)]
    pub authors: Vec<Author>,
    /// Left out of serialized output when empty.
    #[builder(default)]
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub emails: Vec<OutputEmail>,
    #[builder(default)]
    #[serde(default)]
    pub urls: Vec<OutputURL>,
    #[builder(default)]
    #[serde(default)]
    pub for_packages: Vec<String>,
    #[builder(default)]
    #[serde(default)]
    pub scan_errors: Vec<String>,
    /// Left out of serialized output when `None`.
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub is_source: Option<bool>,
    /// Left out of serialized output when `None`.
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub source_count: Option<usize>,
    // The boolean flags below are left out of serialized output when `false`
    // and default to `false` when absent from deserialized input.
    #[builder(default)]
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_legal: bool,
    #[builder(default)]
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_manifest: bool,
    #[builder(default)]
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_readme: bool,
    #[builder(default)]
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_top_level: bool,
    #[builder(default)]
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_key_file: bool,
}
86
87impl FileInfoBuilder {
88    /// Build a [`FileInfo`] from the current builder state.
89    pub fn build(&self) -> Result<FileInfo, String> {
90        Ok(FileInfo::new(
91            self.name.clone().ok_or("Missing field: name")?,
92            self.base_name.clone().ok_or("Missing field: base_name")?,
93            self.extension.clone().ok_or("Missing field: extension")?,
94            self.path.clone().ok_or("Missing field: path")?,
95            self.file_type.clone().ok_or("Missing field: file_type")?,
96            self.mime_type.clone().flatten(),
97            self.size.ok_or("Missing field: size")?,
98            self.date.clone().flatten(),
99            self.sha1.clone().flatten(),
100            self.md5.clone().flatten(),
101            self.sha256.clone().flatten(),
102            self.programming_language.clone().flatten(),
103            self.package_data.clone().unwrap_or_default(),
104            self.license_expression.clone().flatten(),
105            self.license_detections.clone().unwrap_or_default(),
106            self.copyrights.clone().unwrap_or_default(),
107            self.holders.clone().unwrap_or_default(),
108            self.authors.clone().unwrap_or_default(),
109            self.emails.clone().unwrap_or_default(),
110            self.urls.clone().unwrap_or_default(),
111            self.for_packages.clone().unwrap_or_default(),
112            self.scan_errors.clone().unwrap_or_default(),
113        ))
114    }
115}
116
117impl FileInfo {
118    #[allow(clippy::too_many_arguments)]
119    /// Construct a [`FileInfo`] from fully resolved scanner fields.
120    pub fn new(
121        name: String,
122        base_name: String,
123        extension: String,
124        path: String,
125        file_type: FileType,
126        mime_type: Option<String>,
127        size: u64,
128        date: Option<String>,
129        sha1: Option<String>,
130        md5: Option<String>,
131        sha256: Option<String>,
132        programming_language: Option<String>,
133        package_data: Vec<PackageData>,
134        mut license_expression: Option<String>,
135        mut license_detections: Vec<LicenseDetection>,
136        copyrights: Vec<Copyright>,
137        holders: Vec<Holder>,
138        authors: Vec<Author>,
139        emails: Vec<OutputEmail>,
140        urls: Vec<OutputURL>,
141        for_packages: Vec<String>,
142        scan_errors: Vec<String>,
143    ) -> Self {
144        // Combine license expressions from package data if license_expression is None
145        license_expression = license_expression.or_else(|| {
146            let expressions = package_data
147                .iter()
148                .filter_map(|pkg| pkg.get_license_expression());
149            combine_license_expressions(expressions)
150        });
151
152        // Combine license detections from package data if none are provided
153        if license_detections.is_empty() {
154            for pkg in &package_data {
155                license_detections.extend(pkg.license_detections.clone());
156            }
157        }
158
159        // Combine license expressions from license detections if license_expression is still None
160        if license_expression.is_none() && !license_detections.is_empty() {
161            let expressions = license_detections
162                .iter()
163                .map(|detection| detection.license_expression.clone());
164            license_expression = combine_license_expressions(expressions);
165        }
166
167        FileInfo {
168            name,
169            base_name,
170            extension,
171            path,
172            file_type,
173            mime_type,
174            size,
175            date,
176            sha1,
177            md5,
178            sha256,
179            programming_language,
180            package_data,
181            license_expression,
182            license_detections,
183            copyrights,
184            holders,
185            authors,
186            emails,
187            urls,
188            for_packages,
189            scan_errors,
190            is_source: None,
191            source_count: None,
192            is_legal: false,
193            is_manifest: false,
194            is_readme: false,
195            is_top_level: false,
196            is_key_file: false,
197        }
198    }
199}
200
/// Package metadata extracted from manifest files.
///
/// Compatible with ScanCode Toolkit output format. Contains standardized package
/// information including name, version, dependencies, licenses, and other metadata.
/// This is the primary data structure returned by all parsers.
///
/// Serialization notes: optional fields tagged `skip_serializing_if` are left
/// out of the output when `None`, list fields tagged the same way are left out
/// when empty, and the boolean flags are left out when `false`.
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
pub struct PackageData {
    /// Serialized under the key `type`; always written, even when `None`.
    #[serde(rename = "type")] // name used by ScanCode
    pub package_type: Option<PackageType>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub namespace: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub qualifiers: Option<std::collections::HashMap<String, String>>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub subpath: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub primary_language: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub release_date: Option<String>,
    /// Always written, even when empty (no `skip_serializing_if`).
    pub parties: Vec<Party>,
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub keywords: Vec<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub homepage_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub download_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub size: Option<u64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sha1: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub md5: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sha256: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sha512: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub bug_tracking_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub code_view_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub vcs_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub copyright: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub holder: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub declared_license_expression: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub declared_license_expression_spdx: Option<String>,
    /// Source of [`PackageData::get_license_expression`].
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub license_detections: Vec<LicenseDetection>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub other_license_expression: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub other_license_expression_spdx: Option<String>,
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub other_license_detections: Vec<LicenseDetection>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extracted_license_statement: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub notice_text: Option<String>,
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub source_packages: Vec<String>,
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub file_references: Vec<FileReference>,
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_private: bool,
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_virtual: bool,
    /// Free-form key/value data stored as arbitrary JSON values.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub dependencies: Vec<Dependency>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub repository_homepage_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub repository_download_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub api_data_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub datasource_id: Option<DatasourceId>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub purl: Option<String>,
}
292
// Predicate for `#[serde(skip_serializing_if = "is_false")]`: reports `true`
// exactly when the referenced flag is `false`, so unset flags are omitted.
fn is_false(b: &bool) -> bool {
    !*b
}
297
298impl PackageData {
299    /// Extracts a single license expression from all license detections in this package.
300    /// Returns None if there are no license detections.
301    pub fn get_license_expression(&self) -> Option<String> {
302        if self.license_detections.is_empty() {
303            return None;
304        }
305
306        let expressions = self
307            .license_detections
308            .iter()
309            .map(|detection| detection.license_expression.clone());
310        combine_license_expressions(expressions)
311    }
312}
313
/// License detection result containing matched license expressions.
///
/// Aggregates multiple license matches into a single SPDX license expression.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct LicenseDetection {
    /// Expression read by `FileInfo::new` and
    /// `PackageData::get_license_expression` when combining expressions.
    pub license_expression: String,
    pub license_expression_spdx: String,
    /// Individual matches that make up this detection.
    pub matches: Vec<Match>,
    /// Left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub identifier: Option<String>,
}
325
/// Individual license text match with location and confidence score.
///
/// Represents a specific region of text that matched a known license pattern.
/// Every `Option` field is left out of serialized output when `None`.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Match {
    pub license_expression: String,
    pub license_expression_spdx: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub from_file: Option<String>,
    // NOTE(review): whether line numbers are 0- or 1-based is not visible
    // here; confirm against the producer.
    pub start_line: usize,
    pub end_line: usize,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub matcher: Option<String>,
    // NOTE(review): the score scale (e.g. 0-1 vs 0-100) is not established in
    // this file; confirm against the producer.
    pub score: f64,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub matched_length: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub match_coverage: Option<f64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub rule_relevance: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub rule_identifier: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub rule_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub matched_text: Option<String>,
}
353
/// Copyright entry stored in [`FileInfo::copyrights`]: the statement text
/// together with the first and last line of the span it refers to.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Copyright {
    pub copyright: String,
    pub start_line: usize,
    pub end_line: usize,
}
360
/// Holder entry stored in [`FileInfo::holders`]: the holder text together
/// with the first and last line of the span it refers to.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Holder {
    pub holder: String,
    pub start_line: usize,
    pub end_line: usize,
}
367
/// Author entry stored in [`FileInfo::authors`]: the author text together
/// with the first and last line of the span it refers to.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Author {
    pub author: String,
    pub start_line: usize,
    pub end_line: usize,
}
374
/// Package dependency information with version constraints.
///
/// Represents a declared dependency with scope (e.g., runtime, dev, optional)
/// and optional resolved package details.
///
/// `purl` and `scope` are always written when serializing, even when `None`;
/// every other optional field is left out when `None`.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Dependency {
    pub purl: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extracted_requirement: Option<String>,
    pub scope: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub is_runtime: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub is_optional: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub is_pinned: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub is_direct: Option<bool>,
    /// Boxed because `ResolvedPackage` itself contains `Dependency` values,
    /// which makes the two types mutually recursive.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub resolved_package: Option<Box<ResolvedPackage>>,
    /// Free-form key/value data stored as arbitrary JSON values.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
}
398
399#[derive(Serialize, Deserialize, Debug, Clone)]
400pub struct ResolvedPackage {
401    #[serde(rename = "type")]
402    pub package_type: PackageType,
403    #[serde(skip_serializing_if = "String::is_empty")]
404    pub namespace: String,
405    pub name: String,
406    pub version: String,
407    #[serde(skip_serializing_if = "Option::is_none")]
408    pub primary_language: Option<String>,
409    #[serde(skip_serializing_if = "Option::is_none")]
410    pub download_url: Option<String>,
411    #[serde(skip_serializing_if = "Option::is_none")]
412    pub sha1: Option<String>,
413    #[serde(skip_serializing_if = "Option::is_none")]
414    pub sha256: Option<String>,
415    #[serde(skip_serializing_if = "Option::is_none")]
416    pub sha512: Option<String>,
417    #[serde(skip_serializing_if = "Option::is_none")]
418    pub md5: Option<String>,
419    pub is_virtual: bool,
420    #[serde(skip_serializing_if = "Option::is_none")]
421    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
422    pub dependencies: Vec<Dependency>,
423    #[serde(skip_serializing_if = "Option::is_none")]
424    pub repository_homepage_url: Option<String>,
425    #[serde(skip_serializing_if = "Option::is_none")]
426    pub repository_download_url: Option<String>,
427    #[serde(skip_serializing_if = "Option::is_none")]
428    pub api_data_url: Option<String>,
429    #[serde(skip_serializing_if = "Option::is_none")]
430    pub datasource_id: Option<DatasourceId>,
431    #[serde(skip_serializing_if = "Option::is_none")]
432    pub purl: Option<String>,
433}
434
/// Author, maintainer, or contributor information.
///
/// Represents a person or organization associated with a package.
/// Every field is optional and is left out of serialized output when `None`.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Party {
    // Written as a raw identifier because `type` is a Rust keyword.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub r#type: Option<String>,
    /// Compared by `Package::update` when looking for an existing party.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub role: Option<String>,
    /// Compared, and filled when missing, by `Package::update`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
    /// Compared, and filled when missing, by `Package::update`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub email: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub organization: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub organization_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timezone: Option<String>,
}
457
/// Reference to a file within a package archive with checksums.
///
/// Used in SBOM generation to track files within distribution archives.
/// Only `path` is mandatory; every other field is left out of serialized
/// output when `None`.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct FileReference {
    pub path: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub size: Option<u64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sha1: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub md5: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sha256: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sha512: Option<String>,
    /// Free-form key/value data stored as arbitrary JSON values.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
}
477
/// Top-level assembled package, created by merging one or more `PackageData`
/// objects from related manifest/lockfiles (e.g., package.json + package-lock.json).
///
/// Compatible with ScanCode Toolkit output format. The key differences from
/// `PackageData` are:
/// - `package_uid`: unique identifier (PURL with UUID qualifier)
/// - `datafile_paths`: list of all contributing files
/// - `datasource_ids`: list of all contributing parsers
/// - Excludes `dependencies` and `file_references` (hoisted to top-level)
///
/// Serialization notes: optional fields tagged `skip_serializing_if` are left
/// out of the output when `None`, list fields tagged the same way are left out
/// when empty, and the boolean flags are left out when `false`.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Package {
    /// Serialized under the key `type`; always written, even when `None`.
    #[serde(rename = "type")]
    pub package_type: Option<PackageType>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub namespace: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub qualifiers: Option<std::collections::HashMap<String, String>>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub subpath: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub primary_language: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub release_date: Option<String>,
    /// Always written, even when empty (no `skip_serializing_if`).
    pub parties: Vec<Party>,
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub keywords: Vec<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub homepage_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub download_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub size: Option<u64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sha1: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub md5: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sha256: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sha512: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub bug_tracking_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub code_view_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub vcs_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub copyright: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub holder: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub declared_license_expression: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub declared_license_expression_spdx: Option<String>,
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub license_detections: Vec<LicenseDetection>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub other_license_expression: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub other_license_expression_spdx: Option<String>,
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub other_license_detections: Vec<LicenseDetection>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extracted_license_statement: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub notice_text: Option<String>,
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub source_packages: Vec<String>,
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_private: bool,
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_virtual: bool,
    /// Free-form key/value data stored as arbitrary JSON values.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub repository_homepage_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub repository_download_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub api_data_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub purl: Option<String>,
    /// Unique identifier for this package instance (PURL with UUID qualifier).
    pub package_uid: String,
    /// Paths to all datafiles that contributed to this package.
    pub datafile_paths: Vec<String>,
    /// Datasource identifiers for all parsers that contributed to this package.
    pub datasource_ids: Vec<DatasourceId>,
}
573
574impl Package {
575    /// Create a `Package` from a `PackageData` and its source file path.
576    ///
577    /// Generates a unique `package_uid` by appending a UUID qualifier to the PURL.
578    /// If the `PackageData` has no PURL, the package_uid will be an empty string.
579    pub fn from_package_data(package_data: &PackageData, datafile_path: String) -> Self {
580        let package_uid = package_data
581            .purl
582            .as_ref()
583            .map(|p| build_package_uid(p))
584            .unwrap_or_default();
585
586        Package {
587            package_type: package_data.package_type,
588            namespace: package_data.namespace.clone(),
589            name: package_data.name.clone(),
590            version: package_data.version.clone(),
591            qualifiers: package_data.qualifiers.clone(),
592            subpath: package_data.subpath.clone(),
593            primary_language: package_data.primary_language.clone(),
594            description: package_data.description.clone(),
595            release_date: package_data.release_date.clone(),
596            parties: package_data.parties.clone(),
597            keywords: package_data.keywords.clone(),
598            homepage_url: package_data.homepage_url.clone(),
599            download_url: package_data.download_url.clone(),
600            size: package_data.size,
601            sha1: package_data.sha1.clone(),
602            md5: package_data.md5.clone(),
603            sha256: package_data.sha256.clone(),
604            sha512: package_data.sha512.clone(),
605            bug_tracking_url: package_data.bug_tracking_url.clone(),
606            code_view_url: package_data.code_view_url.clone(),
607            vcs_url: package_data.vcs_url.clone(),
608            copyright: package_data.copyright.clone(),
609            holder: package_data.holder.clone(),
610            declared_license_expression: package_data.declared_license_expression.clone(),
611            declared_license_expression_spdx: package_data.declared_license_expression_spdx.clone(),
612            license_detections: package_data.license_detections.clone(),
613            other_license_expression: package_data.other_license_expression.clone(),
614            other_license_expression_spdx: package_data.other_license_expression_spdx.clone(),
615            other_license_detections: package_data.other_license_detections.clone(),
616            extracted_license_statement: package_data.extracted_license_statement.clone(),
617            notice_text: package_data.notice_text.clone(),
618            source_packages: package_data.source_packages.clone(),
619            is_private: package_data.is_private,
620            is_virtual: package_data.is_virtual,
621            extra_data: package_data.extra_data.clone(),
622            repository_homepage_url: package_data.repository_homepage_url.clone(),
623            repository_download_url: package_data.repository_download_url.clone(),
624            api_data_url: package_data.api_data_url.clone(),
625            purl: package_data.purl.clone(),
626            package_uid,
627            datafile_paths: vec![datafile_path],
628            datasource_ids: if let Some(dsid) = package_data.datasource_id {
629                vec![dsid]
630            } else {
631                vec![]
632            },
633        }
634    }
635
636    /// Update this package with data from another `PackageData`.
637    ///
638    /// Merges data from a related file (e.g., lockfile) into this package.
639    /// Existing non-empty values are preserved; empty fields are filled from
640    /// the new data. Lists (parties, license_detections) are merged.
641    pub fn update(&mut self, package_data: &PackageData, datafile_path: String) {
642        if let Some(dsid) = package_data.datasource_id {
643            self.datasource_ids.push(dsid);
644        }
645        self.datafile_paths.push(datafile_path);
646
647        macro_rules! fill_if_empty {
648            ($field:ident) => {
649                if self.$field.is_none() {
650                    self.$field = package_data.$field.clone();
651                }
652            };
653        }
654
655        fill_if_empty!(package_type);
656        fill_if_empty!(name);
657        fill_if_empty!(namespace);
658        fill_if_empty!(version);
659        fill_if_empty!(qualifiers);
660        fill_if_empty!(subpath);
661        fill_if_empty!(primary_language);
662        fill_if_empty!(description);
663        fill_if_empty!(release_date);
664        fill_if_empty!(homepage_url);
665        fill_if_empty!(download_url);
666        fill_if_empty!(size);
667        fill_if_empty!(sha1);
668        fill_if_empty!(md5);
669        fill_if_empty!(sha256);
670        fill_if_empty!(sha512);
671        fill_if_empty!(bug_tracking_url);
672        fill_if_empty!(code_view_url);
673        fill_if_empty!(vcs_url);
674        fill_if_empty!(copyright);
675        fill_if_empty!(holder);
676        fill_if_empty!(declared_license_expression);
677        fill_if_empty!(declared_license_expression_spdx);
678        fill_if_empty!(other_license_expression);
679        fill_if_empty!(other_license_expression_spdx);
680        fill_if_empty!(extracted_license_statement);
681        fill_if_empty!(notice_text);
682        match (&mut self.extra_data, &package_data.extra_data) {
683            (None, Some(extra_data)) => {
684                self.extra_data = Some(extra_data.clone());
685            }
686            (Some(existing), Some(incoming)) => {
687                for (key, value) in incoming {
688                    existing.entry(key.clone()).or_insert_with(|| value.clone());
689                }
690            }
691            _ => {}
692        }
693        fill_if_empty!(repository_homepage_url);
694        fill_if_empty!(repository_download_url);
695        fill_if_empty!(api_data_url);
696
697        for party in &package_data.parties {
698            if let Some(existing) = self.parties.iter_mut().find(|p| {
699                p.role == party.role
700                    && ((p.name.is_some() && p.name == party.name)
701                        || (p.email.is_some() && p.email == party.email))
702            }) {
703                if existing.name.is_none() {
704                    existing.name = party.name.clone();
705                }
706                if existing.email.is_none() {
707                    existing.email = party.email.clone();
708                }
709            } else {
710                self.parties.push(party.clone());
711            }
712        }
713
714        for keyword in &package_data.keywords {
715            if !self.keywords.contains(keyword) {
716                self.keywords.push(keyword.clone());
717            }
718        }
719
720        for detection in &package_data.license_detections {
721            self.license_detections.push(detection.clone());
722        }
723
724        for detection in &package_data.other_license_detections {
725            self.other_license_detections.push(detection.clone());
726        }
727
728        for source_pkg in &package_data.source_packages {
729            if !self.source_packages.contains(source_pkg) {
730                self.source_packages.push(source_pkg.clone());
731            }
732        }
733
734        self.refresh_identity();
735    }
736
737    fn refresh_identity(&mut self) {
738        let Some(next_purl) = self.build_current_purl() else {
739            return;
740        };
741
742        if self.purl.as_deref() != Some(next_purl.as_str()) || self.package_uid.is_empty() {
743            self.package_uid = build_package_uid(&next_purl);
744        }
745
746        self.purl = Some(next_purl);
747    }
748
749    fn build_current_purl(&self) -> Option<String> {
750        if let (Some(package_type), Some(name)) = (
751            self.package_type.as_ref(),
752            self.name
753                .as_deref()
754                .filter(|value| !value.trim().is_empty()),
755        ) {
756            let purl_type = match package_type {
757                PackageType::Deno => "generic",
758                _ => package_type.as_str(),
759            };
760
761            let mut purl = PackageUrl::new(purl_type, name).ok()?;
762
763            if let Some(namespace) = self
764                .namespace
765                .as_deref()
766                .filter(|value| !value.trim().is_empty())
767            {
768                purl.with_namespace(namespace).ok()?;
769            }
770
771            if let Some(version) = self
772                .version
773                .as_deref()
774                .filter(|value| !value.trim().is_empty())
775            {
776                purl.with_version(version).ok()?;
777            }
778
779            if let Some(qualifiers) = &self.qualifiers {
780                for (key, value) in qualifiers {
781                    purl.add_qualifier(key.as_str(), value.as_str()).ok()?;
782                }
783            }
784
785            if let Some(subpath) = self
786                .subpath
787                .as_deref()
788                .filter(|value| !value.trim().is_empty())
789            {
790                purl.with_subpath(subpath).ok()?;
791            }
792
793            return Some(purl.to_string());
794        }
795
796        let existing_purl = self.purl.as_deref()?;
797        let mut purl = PackageUrl::from_str(existing_purl).ok()?;
798
799        if let Some(version) = self
800            .version
801            .as_deref()
802            .filter(|value| !value.trim().is_empty())
803        {
804            purl.with_version(version).ok()?;
805        } else {
806            purl.without_version();
807        }
808
809        Some(purl.to_string())
810    }
811}
812
813/// Top-level dependency instance, created during package assembly.
814///
815/// Extends the file-level `Dependency` with traceability fields that link
816/// each dependency to its owning package and source datafile.
817#[derive(Serialize, Deserialize, Debug, Clone)]
818pub struct TopLevelDependency {
819    pub purl: Option<String>,
820    #[serde(skip_serializing_if = "Option::is_none")]
821    pub extracted_requirement: Option<String>,
822    pub scope: Option<String>,
823    #[serde(skip_serializing_if = "Option::is_none")]
824    pub is_runtime: Option<bool>,
825    #[serde(skip_serializing_if = "Option::is_none")]
826    pub is_optional: Option<bool>,
827    #[serde(skip_serializing_if = "Option::is_none")]
828    pub is_pinned: Option<bool>,
829    #[serde(skip_serializing_if = "Option::is_none")]
830    pub is_direct: Option<bool>,
831    #[serde(skip_serializing_if = "Option::is_none")]
832    pub resolved_package: Option<Box<ResolvedPackage>>,
833    #[serde(skip_serializing_if = "Option::is_none")]
834    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
835    /// Unique identifier for this dependency instance (PURL with UUID qualifier).
836    pub dependency_uid: String,
837    /// The `package_uid` of the package this dependency belongs to.
838    #[serde(skip_serializing_if = "Option::is_none")]
839    pub for_package_uid: Option<String>,
840    /// Path to the datafile where this dependency was declared.
841    pub datafile_path: String,
842    /// Datasource identifier for the parser that extracted this dependency.
843    pub datasource_id: DatasourceId,
844    /// Namespace for the dependency (e.g., distribution name for RPM packages).
845    #[serde(skip_serializing_if = "Option::is_none")]
846    pub namespace: Option<String>,
847}
848
849impl TopLevelDependency {
850    /// Create a `TopLevelDependency` from a file-level `Dependency`.
851    pub fn from_dependency(
852        dep: &Dependency,
853        datafile_path: String,
854        datasource_id: DatasourceId,
855        for_package_uid: Option<String>,
856    ) -> Self {
857        let dependency_uid = dep
858            .purl
859            .as_ref()
860            .map(|p| build_package_uid(p))
861            .unwrap_or_default();
862
863        TopLevelDependency {
864            purl: dep.purl.clone(),
865            extracted_requirement: dep.extracted_requirement.clone(),
866            scope: dep.scope.clone(),
867            is_runtime: dep.is_runtime,
868            is_optional: dep.is_optional,
869            is_pinned: dep.is_pinned,
870            is_direct: dep.is_direct,
871            resolved_package: dep.resolved_package.clone(),
872            extra_data: dep.extra_data.clone(),
873            dependency_uid,
874            for_package_uid,
875            datafile_path,
876            datasource_id,
877            namespace: None,
878        }
879    }
880}
881
882/// Generate a unique package identifier by appending a UUID v4 qualifier to a PURL.
883///
884/// The format matches Python ScanCode: `pkg:type/name@version?uuid=<uuid-v4>`
885pub fn build_package_uid(purl: &str) -> String {
886    let uuid = Uuid::new_v4();
887    if purl.contains('?') {
888        format!("{}&uuid={}", purl, uuid)
889    } else {
890        format!("{}?uuid={}", purl, uuid)
891    }
892}
893
894#[derive(Serialize, Deserialize, Debug, Clone)]
895pub struct OutputEmail {
896    pub email: String,
897    pub start_line: usize,
898    pub end_line: usize,
899}
900
901#[derive(Serialize, Deserialize, Debug, Clone)]
902pub struct OutputURL {
903    pub url: String,
904    pub start_line: usize,
905    pub end_line: usize,
906}
907
/// Kind of entry a `FileInfo` describes.
///
/// Serialized as the lowercase strings `"file"` and `"directory"` through the
/// hand-written serde impls for this type.
#[derive(Clone, Debug, PartialEq)]
pub enum FileType {
    /// A regular file entry.
    File,
    /// A directory entry.
    Directory,
}
913
914impl Serialize for FileType {
915    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
916    where
917        S: serde::Serializer,
918    {
919        let value = match self {
920            FileType::File => "file",
921            FileType::Directory => "directory",
922        };
923        serializer.serialize_str(value)
924    }
925}
926
927impl<'de> Deserialize<'de> for FileType {
928    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
929    where
930        D: serde::Deserializer<'de>,
931    {
932        let value = String::deserialize(deserializer)?;
933        match value.as_str() {
934            "file" => Ok(FileType::File),
935            "directory" => Ok(FileType::Directory),
936            _ => Err(serde::de::Error::custom("invalid file type")),
937        }
938    }
939}