Skip to main content

provenant/models/
file_info.rs

1use derive_builder::Builder;
2use packageurl::PackageUrl;
3use serde::{Deserialize, Serialize};
4use std::str::FromStr;
5use uuid::Uuid;
6
7use sha1::{Digest, Sha1};
8
9use super::DatasourceId;
10use super::PackageType;
11use crate::license_detection::tokenize::tokenize_without_stopwords;
12use crate::models::output::Tallies;
13use crate::utils::spdx::combine_license_expressions;
14
15#[derive(Debug, Builder, Serialize, Deserialize)]
16#[builder(build_fn(skip))]
17/// File-level scan result containing metadata and detected findings.
18pub struct FileInfo {
19    pub name: String,
20    pub base_name: String,
21    pub extension: String,
22    pub path: String,
23    #[serde(rename = "type")] // name used by ScanCode
24    pub file_type: FileType,
25    #[builder(default)]
26    pub mime_type: Option<String>,
27    pub size: u64,
28    #[builder(default)]
29    pub date: Option<String>,
30    #[builder(default)]
31    pub sha1: Option<String>,
32    #[builder(default)]
33    pub md5: Option<String>,
34    #[builder(default)]
35    pub sha256: Option<String>,
36    #[builder(default)]
37    pub programming_language: Option<String>,
38    #[builder(default)]
39    #[serde(default)]
40    pub package_data: Vec<PackageData>,
41    #[serde(rename = "detected_license_expression_spdx")] // name used by ScanCode
42    #[builder(default)]
43    pub license_expression: Option<String>,
44    #[builder(default)]
45    #[serde(default)]
46    pub license_detections: Vec<LicenseDetection>,
47    #[builder(default)]
48    #[serde(skip_serializing_if = "Vec::is_empty", default)]
49    pub license_clues: Vec<Match>,
50    #[builder(default)]
51    #[serde(skip_serializing_if = "Option::is_none", default)]
52    pub percentage_of_license_text: Option<f64>,
53    #[builder(default)]
54    #[serde(default)]
55    pub copyrights: Vec<Copyright>,
56    #[builder(default)]
57    #[serde(default)]
58    pub holders: Vec<Holder>,
59    #[builder(default)]
60    #[serde(default)]
61    pub authors: Vec<Author>,
62    #[builder(default)]
63    #[serde(skip_serializing_if = "Vec::is_empty", default)]
64    pub emails: Vec<OutputEmail>,
65    #[builder(default)]
66    #[serde(default)]
67    pub urls: Vec<OutputURL>,
68    #[builder(default)]
69    #[serde(default)]
70    pub for_packages: Vec<String>,
71    #[builder(default)]
72    #[serde(default)]
73    pub scan_errors: Vec<String>,
74    #[builder(default)]
75    #[serde(skip_serializing_if = "Option::is_none", default)]
76    pub is_generated: Option<bool>,
77    #[builder(default)]
78    #[serde(skip_serializing_if = "Option::is_none", default)]
79    pub is_source: Option<bool>,
80    #[builder(default)]
81    #[serde(skip_serializing_if = "Option::is_none", default)]
82    pub source_count: Option<usize>,
83    #[builder(default)]
84    #[serde(skip_serializing_if = "is_false", default)]
85    pub is_legal: bool,
86    #[builder(default)]
87    #[serde(skip_serializing_if = "is_false", default)]
88    pub is_manifest: bool,
89    #[builder(default)]
90    #[serde(skip_serializing_if = "is_false", default)]
91    pub is_readme: bool,
92    #[builder(default)]
93    #[serde(skip_serializing_if = "is_false", default)]
94    pub is_top_level: bool,
95    #[builder(default)]
96    #[serde(skip_serializing_if = "is_false", default)]
97    pub is_key_file: bool,
98    #[builder(default)]
99    #[serde(skip_serializing_if = "is_false", default)]
100    pub is_community: bool,
101    #[builder(default)]
102    #[serde(skip_serializing_if = "Vec::is_empty", default)]
103    pub facets: Vec<String>,
104    #[builder(default)]
105    #[serde(skip_serializing_if = "Option::is_none", default)]
106    pub tallies: Option<Tallies>,
107}
108
109impl FileInfoBuilder {
110    /// Build a [`FileInfo`] from the current builder state.
111    pub fn build(&self) -> Result<FileInfo, String> {
112        Ok(FileInfo::new(
113            self.name.clone().ok_or("Missing field: name")?,
114            self.base_name.clone().ok_or("Missing field: base_name")?,
115            self.extension.clone().ok_or("Missing field: extension")?,
116            self.path.clone().ok_or("Missing field: path")?,
117            self.file_type.clone().ok_or("Missing field: file_type")?,
118            self.mime_type.clone().flatten(),
119            self.size.ok_or("Missing field: size")?,
120            self.date.clone().flatten(),
121            self.sha1.clone().flatten(),
122            self.md5.clone().flatten(),
123            self.sha256.clone().flatten(),
124            self.programming_language.clone().flatten(),
125            self.package_data.clone().unwrap_or_default(),
126            self.license_expression.clone().flatten(),
127            self.license_detections.clone().unwrap_or_default(),
128            self.license_clues.clone().unwrap_or_default(),
129            self.copyrights.clone().unwrap_or_default(),
130            self.holders.clone().unwrap_or_default(),
131            self.authors.clone().unwrap_or_default(),
132            self.emails.clone().unwrap_or_default(),
133            self.urls.clone().unwrap_or_default(),
134            self.for_packages.clone().unwrap_or_default(),
135            self.scan_errors.clone().unwrap_or_default(),
136        ))
137    }
138}
139
140impl FileInfo {
141    #[allow(clippy::too_many_arguments)]
142    /// Construct a [`FileInfo`] from fully resolved scanner fields.
143    pub fn new(
144        name: String,
145        base_name: String,
146        extension: String,
147        path: String,
148        file_type: FileType,
149        mime_type: Option<String>,
150        size: u64,
151        date: Option<String>,
152        sha1: Option<String>,
153        md5: Option<String>,
154        sha256: Option<String>,
155        programming_language: Option<String>,
156        package_data: Vec<PackageData>,
157        mut license_expression: Option<String>,
158        mut license_detections: Vec<LicenseDetection>,
159        license_clues: Vec<Match>,
160        copyrights: Vec<Copyright>,
161        holders: Vec<Holder>,
162        authors: Vec<Author>,
163        emails: Vec<OutputEmail>,
164        urls: Vec<OutputURL>,
165        for_packages: Vec<String>,
166        scan_errors: Vec<String>,
167    ) -> Self {
168        let mut package_data = package_data;
169        for package in &mut package_data {
170            enrich_package_data_license_provenance(package, &path);
171        }
172
173        // Combine license expressions from package data if license_expression is None
174        license_expression = license_expression.or_else(|| {
175            let expressions = package_data
176                .iter()
177                .filter_map(|pkg| pkg.get_license_expression());
178            combine_license_expressions(expressions)
179        });
180
181        // Combine license detections from package data if none are provided
182        if license_detections.is_empty() {
183            for pkg in &package_data {
184                license_detections.extend(pkg.license_detections.clone());
185            }
186        }
187
188        // Combine license expressions from license detections if license_expression is still None
189        if license_expression.is_none() && !license_detections.is_empty() {
190            let expressions = license_detections
191                .iter()
192                .map(|detection| detection.license_expression.clone());
193            license_expression = combine_license_expressions(expressions);
194        }
195
196        let mut file_info = FileInfo {
197            name,
198            base_name,
199            extension,
200            path,
201            file_type,
202            mime_type,
203            size,
204            date,
205            sha1,
206            md5,
207            sha256,
208            programming_language,
209            package_data,
210            license_expression,
211            license_detections,
212            license_clues,
213            percentage_of_license_text: None,
214            copyrights,
215            holders,
216            authors,
217            emails,
218            urls,
219            for_packages,
220            scan_errors,
221            is_generated: None,
222            is_source: None,
223            source_count: None,
224            is_legal: false,
225            is_manifest: false,
226            is_readme: false,
227            is_top_level: false,
228            is_key_file: false,
229            is_community: false,
230            facets: vec![],
231            tallies: None,
232        };
233        file_info.backfill_license_provenance();
234        file_info
235    }
236
237    pub fn backfill_license_provenance(&mut self) {
238        for detection in &mut self.license_detections {
239            enrich_license_detection_provenance(detection, &self.path);
240        }
241
242        for package in &mut self.package_data {
243            enrich_package_data_license_provenance(package, &self.path);
244        }
245    }
246}
247
248fn enrich_package_data_license_provenance(package_data: &mut PackageData, path: &str) {
249    for detection in &mut package_data.license_detections {
250        enrich_license_detection_provenance(detection, path);
251    }
252    for detection in &mut package_data.other_license_detections {
253        enrich_license_detection_provenance(detection, path);
254    }
255}
256
257pub(crate) fn enrich_license_detection_provenance(detection: &mut LicenseDetection, path: &str) {
258    for detection_match in &mut detection.matches {
259        if detection_match.from_file.is_none() {
260            detection_match.from_file = Some(path.to_string());
261        }
262    }
263
264    if detection.identifier.is_none() {
265        detection.identifier = Some(compute_public_detection_identifier(detection));
266    }
267}
268
269fn compute_public_detection_identifier(detection: &LicenseDetection) -> String {
270    let expression = python_safe_name(&detection.license_expression);
271    let mut hasher = Sha1::new();
272    hasher.update(format_public_detection_content(detection).as_bytes());
273    let hex_str = hex::encode(hasher.finalize());
274    let uuid_hex = &hex_str[..32];
275    let content_uuid = uuid::Uuid::parse_str(uuid_hex)
276        .map(|uuid| uuid.to_string())
277        .unwrap_or_else(|_| uuid_hex.to_string());
278
279    format!("{}-{}", expression, content_uuid)
280}
281
282fn format_public_detection_content(detection: &LicenseDetection) -> String {
283    let mut result = String::from("(");
284
285    for (index, detection_match) in detection.matches.iter().enumerate() {
286        if index > 0 {
287            result.push_str(", ");
288        }
289        result.push_str(&format!(
290            "({}, {}, {})",
291            python_str_repr(
292                detection_match
293                    .rule_identifier
294                    .as_deref()
295                    .or(detection_match.matcher.as_deref())
296                    .unwrap_or("parser-declared-license")
297            ),
298            detection_match.score as f32,
299            python_token_tuple_repr(&tokenize_without_stopwords(
300                detection_match.matched_text.as_deref().unwrap_or_default(),
301            )),
302        ));
303    }
304
305    if detection.matches.len() == 1 {
306        result.push(',');
307    }
308    result.push(')');
309    result
310}
311
/// Reduce `value` to alphanumeric runs joined by single underscores.
///
/// Every run of non-alphanumeric characters (including `_` itself) becomes
/// one `_`; leading and trailing separators are dropped. An input with no
/// alphanumeric characters yields the empty string.
fn python_safe_name(value: &str) -> String {
    value
        .split(|character: char| !character.is_alphanumeric())
        .filter(|segment| !segment.is_empty())
        .collect::<Vec<_>>()
        .join("_")
}
333
/// Quote `value` the way a Python string repr chooses its quotes.
///
/// Double quotes are used only when the text contains `'` and no `"`;
/// otherwise single quotes are used. Backslashes are doubled and the chosen
/// quote character is backslash-escaped. No other characters are escaped.
fn python_str_repr(value: &str) -> String {
    let prefers_double = value.contains('\'') && !value.contains('"');
    let (quote, escaped_quote) = if prefers_double {
        ('"', "\\\"")
    } else {
        ('\'', "\\'")
    };

    let escaped = value.replace('\\', "\\\\").replace(quote, escaped_quote);
    format!("{}{}{}", quote, escaped, quote)
}
341
342fn python_token_tuple_repr(tokens: &[String]) -> String {
343    if tokens.is_empty() {
344        return String::from("()");
345    }
346
347    let mut result = String::from("(");
348    for (index, token) in tokens.iter().enumerate() {
349        if index > 0 {
350            result.push_str(", ");
351        }
352        result.push_str(&python_str_repr(token));
353    }
354
355    if tokens.len() == 1 {
356        result.push(',');
357    }
358    result.push(')');
359    result
360}
361
362/// Package metadata extracted from manifest files.
363///
364/// Compatible with ScanCode Toolkit output format. Contains standardized package
365/// information including name, version, dependencies, licenses, and other metadata.
366/// This is the primary data structure returned by all parsers.
367#[derive(Serialize, Deserialize, Debug, Clone, Default)]
368pub struct PackageData {
369    #[serde(rename = "type")] // name used by ScanCode
370    pub package_type: Option<PackageType>,
371    #[serde(skip_serializing_if = "Option::is_none")]
372    pub namespace: Option<String>,
373    #[serde(skip_serializing_if = "Option::is_none")]
374    pub name: Option<String>,
375    #[serde(skip_serializing_if = "Option::is_none")]
376    pub version: Option<String>,
377    #[serde(skip_serializing_if = "Option::is_none")]
378    pub qualifiers: Option<std::collections::HashMap<String, String>>,
379    #[serde(skip_serializing_if = "Option::is_none")]
380    pub subpath: Option<String>,
381    #[serde(skip_serializing_if = "Option::is_none")]
382    pub primary_language: Option<String>,
383    #[serde(skip_serializing_if = "Option::is_none")]
384    pub description: Option<String>,
385    #[serde(skip_serializing_if = "Option::is_none")]
386    pub release_date: Option<String>,
387    pub parties: Vec<Party>,
388    #[serde(skip_serializing_if = "Vec::is_empty", default)]
389    pub keywords: Vec<String>,
390    #[serde(skip_serializing_if = "Option::is_none")]
391    pub homepage_url: Option<String>,
392    #[serde(skip_serializing_if = "Option::is_none")]
393    pub download_url: Option<String>,
394    #[serde(skip_serializing_if = "Option::is_none")]
395    pub size: Option<u64>,
396    #[serde(skip_serializing_if = "Option::is_none")]
397    pub sha1: Option<String>,
398    #[serde(skip_serializing_if = "Option::is_none")]
399    pub md5: Option<String>,
400    #[serde(skip_serializing_if = "Option::is_none")]
401    pub sha256: Option<String>,
402    #[serde(skip_serializing_if = "Option::is_none")]
403    pub sha512: Option<String>,
404    #[serde(skip_serializing_if = "Option::is_none")]
405    pub bug_tracking_url: Option<String>,
406    #[serde(skip_serializing_if = "Option::is_none")]
407    pub code_view_url: Option<String>,
408    #[serde(skip_serializing_if = "Option::is_none")]
409    pub vcs_url: Option<String>,
410    #[serde(skip_serializing_if = "Option::is_none")]
411    pub copyright: Option<String>,
412    #[serde(skip_serializing_if = "Option::is_none")]
413    pub holder: Option<String>,
414    #[serde(skip_serializing_if = "Option::is_none")]
415    pub declared_license_expression: Option<String>,
416    #[serde(skip_serializing_if = "Option::is_none")]
417    pub declared_license_expression_spdx: Option<String>,
418    #[serde(default)]
419    pub license_detections: Vec<LicenseDetection>,
420    #[serde(skip_serializing_if = "Option::is_none")]
421    pub other_license_expression: Option<String>,
422    #[serde(skip_serializing_if = "Option::is_none")]
423    pub other_license_expression_spdx: Option<String>,
424    #[serde(skip_serializing_if = "Vec::is_empty", default)]
425    pub other_license_detections: Vec<LicenseDetection>,
426    #[serde(skip_serializing_if = "Option::is_none")]
427    pub extracted_license_statement: Option<String>,
428    #[serde(skip_serializing_if = "Option::is_none")]
429    pub notice_text: Option<String>,
430    #[serde(skip_serializing_if = "Vec::is_empty", default)]
431    pub source_packages: Vec<String>,
432    #[serde(skip_serializing_if = "Vec::is_empty", default)]
433    pub file_references: Vec<FileReference>,
434    #[serde(skip_serializing_if = "is_false", default)]
435    pub is_private: bool,
436    #[serde(skip_serializing_if = "is_false", default)]
437    pub is_virtual: bool,
438    #[serde(skip_serializing_if = "Option::is_none")]
439    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
440    #[serde(default)]
441    pub dependencies: Vec<Dependency>,
442    #[serde(skip_serializing_if = "Option::is_none")]
443    pub repository_homepage_url: Option<String>,
444    #[serde(skip_serializing_if = "Option::is_none")]
445    pub repository_download_url: Option<String>,
446    #[serde(skip_serializing_if = "Option::is_none")]
447    pub api_data_url: Option<String>,
448    #[serde(skip_serializing_if = "Option::is_none")]
449    pub datasource_id: Option<DatasourceId>,
450    #[serde(skip_serializing_if = "Option::is_none")]
451    pub purl: Option<String>,
452}
453
/// Predicate for serde's `skip_serializing_if`: true when the flag is unset.
///
/// Takes `&bool` because serde passes the field by reference.
fn is_false(b: &bool) -> bool {
    !*b
}
458
459impl PackageData {
460    /// Extracts a single license expression from all license detections in this package.
461    /// Returns None if there are no license detections.
462    pub fn get_license_expression(&self) -> Option<String> {
463        if self.license_detections.is_empty() {
464            return None;
465        }
466
467        let expressions = self
468            .license_detections
469            .iter()
470            .map(|detection| detection.license_expression.clone());
471        combine_license_expressions(expressions)
472    }
473}
474
475/// License detection result containing matched license expressions.
476///
477/// Aggregates multiple license matches into a single SPDX license expression.
478#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
479pub struct LicenseDetection {
480    pub license_expression: String,
481    pub license_expression_spdx: String,
482    pub matches: Vec<Match>,
483    #[serde(skip_serializing_if = "Vec::is_empty", default)]
484    pub detection_log: Vec<String>,
485    #[serde(skip_serializing_if = "Option::is_none")]
486    pub identifier: Option<String>,
487}
488
489/// Individual license text match with location and confidence score.
490///
491/// Represents a specific region of text that matched a known license pattern.
492#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
493pub struct Match {
494    pub license_expression: String,
495    pub license_expression_spdx: String,
496    #[serde(skip_serializing_if = "Option::is_none")]
497    pub from_file: Option<String>,
498    pub start_line: usize,
499    pub end_line: usize,
500    #[serde(skip_serializing_if = "Option::is_none")]
501    pub matcher: Option<String>,
502    pub score: f64,
503    #[serde(skip_serializing_if = "Option::is_none")]
504    pub matched_length: Option<usize>,
505    #[serde(skip_serializing_if = "Option::is_none")]
506    pub match_coverage: Option<f64>,
507    #[serde(skip_serializing_if = "Option::is_none")]
508    pub rule_relevance: Option<usize>,
509    #[serde(skip_serializing_if = "Option::is_none")]
510    pub rule_identifier: Option<String>,
511    pub rule_url: Option<String>,
512    #[serde(skip_serializing_if = "Option::is_none")]
513    pub matched_text: Option<String>,
514    #[serde(skip_serializing_if = "Option::is_none")]
515    pub matched_text_diagnostics: Option<String>,
516    #[serde(skip_serializing_if = "Option::is_none", default)]
517    pub referenced_filenames: Option<Vec<String>>,
518}
519
520#[derive(Serialize, Deserialize, Debug, Clone)]
521pub struct Copyright {
522    pub copyright: String,
523    pub start_line: usize,
524    pub end_line: usize,
525}
526
527#[derive(Serialize, Deserialize, Debug, Clone)]
528pub struct Holder {
529    pub holder: String,
530    pub start_line: usize,
531    pub end_line: usize,
532}
533
534#[derive(Serialize, Deserialize, Debug, Clone)]
535pub struct Author {
536    pub author: String,
537    pub start_line: usize,
538    pub end_line: usize,
539}
540
541/// Package dependency information with version constraints.
542///
543/// Represents a declared dependency with scope (e.g., runtime, dev, optional)
544/// and optional resolved package details.
545#[derive(Serialize, Deserialize, Debug, Clone)]
546pub struct Dependency {
547    pub purl: Option<String>,
548    #[serde(skip_serializing_if = "Option::is_none")]
549    pub extracted_requirement: Option<String>,
550    pub scope: Option<String>,
551    #[serde(skip_serializing_if = "Option::is_none")]
552    pub is_runtime: Option<bool>,
553    #[serde(skip_serializing_if = "Option::is_none")]
554    pub is_optional: Option<bool>,
555    #[serde(skip_serializing_if = "Option::is_none")]
556    pub is_pinned: Option<bool>,
557    #[serde(skip_serializing_if = "Option::is_none")]
558    pub is_direct: Option<bool>,
559    #[serde(skip_serializing_if = "Option::is_none")]
560    pub resolved_package: Option<Box<ResolvedPackage>>,
561    #[serde(skip_serializing_if = "Option::is_none")]
562    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
563}
564
565#[derive(Serialize, Deserialize, Debug, Clone)]
566pub struct ResolvedPackage {
567    #[serde(rename = "type")]
568    pub package_type: PackageType,
569    #[serde(skip_serializing_if = "String::is_empty")]
570    pub namespace: String,
571    pub name: String,
572    pub version: String,
573    #[serde(skip_serializing_if = "Option::is_none")]
574    pub primary_language: Option<String>,
575    #[serde(skip_serializing_if = "Option::is_none")]
576    pub download_url: Option<String>,
577    #[serde(skip_serializing_if = "Option::is_none")]
578    pub sha1: Option<String>,
579    #[serde(skip_serializing_if = "Option::is_none")]
580    pub sha256: Option<String>,
581    #[serde(skip_serializing_if = "Option::is_none")]
582    pub sha512: Option<String>,
583    #[serde(skip_serializing_if = "Option::is_none")]
584    pub md5: Option<String>,
585    pub is_virtual: bool,
586    #[serde(skip_serializing_if = "Option::is_none")]
587    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
588    pub dependencies: Vec<Dependency>,
589    #[serde(skip_serializing_if = "Option::is_none")]
590    pub repository_homepage_url: Option<String>,
591    #[serde(skip_serializing_if = "Option::is_none")]
592    pub repository_download_url: Option<String>,
593    #[serde(skip_serializing_if = "Option::is_none")]
594    pub api_data_url: Option<String>,
595    #[serde(skip_serializing_if = "Option::is_none")]
596    pub datasource_id: Option<DatasourceId>,
597    #[serde(skip_serializing_if = "Option::is_none")]
598    pub purl: Option<String>,
599}
600
601/// Author, maintainer, or contributor information.
602///
603/// Represents a person or organization associated with a package.
604#[derive(Serialize, Deserialize, Debug, Clone)]
605pub struct Party {
606    #[serde(skip_serializing_if = "Option::is_none")]
607    pub r#type: Option<String>,
608    #[serde(skip_serializing_if = "Option::is_none")]
609    pub role: Option<String>,
610    #[serde(skip_serializing_if = "Option::is_none")]
611    pub name: Option<String>,
612    #[serde(skip_serializing_if = "Option::is_none")]
613    pub email: Option<String>,
614    #[serde(skip_serializing_if = "Option::is_none")]
615    pub url: Option<String>,
616    #[serde(skip_serializing_if = "Option::is_none")]
617    pub organization: Option<String>,
618    #[serde(skip_serializing_if = "Option::is_none")]
619    pub organization_url: Option<String>,
620    #[serde(skip_serializing_if = "Option::is_none")]
621    pub timezone: Option<String>,
622}
623
624/// Reference to a file within a package archive with checksums.
625///
626/// Used in SBOM generation to track files within distribution archives.
627#[derive(Serialize, Deserialize, Debug, Clone)]
628pub struct FileReference {
629    pub path: String,
630    #[serde(skip_serializing_if = "Option::is_none")]
631    pub size: Option<u64>,
632    #[serde(skip_serializing_if = "Option::is_none")]
633    pub sha1: Option<String>,
634    #[serde(skip_serializing_if = "Option::is_none")]
635    pub md5: Option<String>,
636    #[serde(skip_serializing_if = "Option::is_none")]
637    pub sha256: Option<String>,
638    #[serde(skip_serializing_if = "Option::is_none")]
639    pub sha512: Option<String>,
640    #[serde(skip_serializing_if = "Option::is_none")]
641    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
642}
643
644/// Top-level assembled package, created by merging one or more `PackageData`
645/// objects from related manifest/lockfiles (e.g., package.json + package-lock.json).
646///
647/// Compatible with ScanCode Toolkit output format. The key differences from
648/// `PackageData` are:
649/// - `package_uid`: unique identifier (PURL with UUID qualifier)
650/// - `datafile_paths`: list of all contributing files
651/// - `datasource_ids`: list of all contributing parsers
652/// - Excludes `dependencies` and `file_references` (hoisted to top-level)
653#[derive(Serialize, Deserialize, Debug, Clone)]
654pub struct Package {
655    #[serde(rename = "type")]
656    pub package_type: Option<PackageType>,
657    #[serde(skip_serializing_if = "Option::is_none")]
658    pub namespace: Option<String>,
659    #[serde(skip_serializing_if = "Option::is_none")]
660    pub name: Option<String>,
661    #[serde(skip_serializing_if = "Option::is_none")]
662    pub version: Option<String>,
663    #[serde(skip_serializing_if = "Option::is_none")]
664    pub qualifiers: Option<std::collections::HashMap<String, String>>,
665    #[serde(skip_serializing_if = "Option::is_none")]
666    pub subpath: Option<String>,
667    #[serde(skip_serializing_if = "Option::is_none")]
668    pub primary_language: Option<String>,
669    #[serde(skip_serializing_if = "Option::is_none")]
670    pub description: Option<String>,
671    #[serde(skip_serializing_if = "Option::is_none")]
672    pub release_date: Option<String>,
673    pub parties: Vec<Party>,
674    #[serde(skip_serializing_if = "Vec::is_empty", default)]
675    pub keywords: Vec<String>,
676    #[serde(skip_serializing_if = "Option::is_none")]
677    pub homepage_url: Option<String>,
678    #[serde(skip_serializing_if = "Option::is_none")]
679    pub download_url: Option<String>,
680    #[serde(skip_serializing_if = "Option::is_none")]
681    pub size: Option<u64>,
682    #[serde(skip_serializing_if = "Option::is_none")]
683    pub sha1: Option<String>,
684    #[serde(skip_serializing_if = "Option::is_none")]
685    pub md5: Option<String>,
686    #[serde(skip_serializing_if = "Option::is_none")]
687    pub sha256: Option<String>,
688    #[serde(skip_serializing_if = "Option::is_none")]
689    pub sha512: Option<String>,
690    #[serde(skip_serializing_if = "Option::is_none")]
691    pub bug_tracking_url: Option<String>,
692    #[serde(skip_serializing_if = "Option::is_none")]
693    pub code_view_url: Option<String>,
694    #[serde(skip_serializing_if = "Option::is_none")]
695    pub vcs_url: Option<String>,
696    #[serde(skip_serializing_if = "Option::is_none")]
697    pub copyright: Option<String>,
698    #[serde(skip_serializing_if = "Option::is_none")]
699    pub holder: Option<String>,
700    #[serde(skip_serializing_if = "Option::is_none")]
701    pub declared_license_expression: Option<String>,
702    #[serde(skip_serializing_if = "Option::is_none")]
703    pub declared_license_expression_spdx: Option<String>,
704    #[serde(skip_serializing_if = "Vec::is_empty", default)]
705    pub license_detections: Vec<LicenseDetection>,
706    #[serde(skip_serializing_if = "Option::is_none")]
707    pub other_license_expression: Option<String>,
708    #[serde(skip_serializing_if = "Option::is_none")]
709    pub other_license_expression_spdx: Option<String>,
710    #[serde(skip_serializing_if = "Vec::is_empty", default)]
711    pub other_license_detections: Vec<LicenseDetection>,
712    #[serde(skip_serializing_if = "Option::is_none")]
713    pub extracted_license_statement: Option<String>,
714    #[serde(skip_serializing_if = "Option::is_none")]
715    pub notice_text: Option<String>,
716    #[serde(skip_serializing_if = "Vec::is_empty", default)]
717    pub source_packages: Vec<String>,
718    #[serde(skip_serializing_if = "is_false", default)]
719    pub is_private: bool,
720    #[serde(skip_serializing_if = "is_false", default)]
721    pub is_virtual: bool,
722    #[serde(skip_serializing_if = "Option::is_none")]
723    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
724    #[serde(skip_serializing_if = "Option::is_none")]
725    pub repository_homepage_url: Option<String>,
726    #[serde(skip_serializing_if = "Option::is_none")]
727    pub repository_download_url: Option<String>,
728    #[serde(skip_serializing_if = "Option::is_none")]
729    pub api_data_url: Option<String>,
730    #[serde(skip_serializing_if = "Option::is_none")]
731    pub purl: Option<String>,
732    /// Unique identifier for this package instance (PURL with UUID qualifier).
733    pub package_uid: String,
734    /// Paths to all datafiles that contributed to this package.
735    pub datafile_paths: Vec<String>,
736    /// Datasource identifiers for all parsers that contributed to this package.
737    pub datasource_ids: Vec<DatasourceId>,
738}
739
740impl Package {
741    /// Create a `Package` from a `PackageData` and its source file path.
742    ///
743    /// Generates a unique `package_uid` by appending a UUID qualifier to the PURL.
744    /// If the `PackageData` has no PURL, the package_uid will be an empty string.
745    pub fn from_package_data(package_data: &PackageData, datafile_path: String) -> Self {
746        let mut package_data = package_data.clone();
747        enrich_package_data_license_provenance(&mut package_data, &datafile_path);
748
749        let package_uid = package_data
750            .purl
751            .as_ref()
752            .map(|p| build_package_uid(p))
753            .unwrap_or_default();
754
755        Package {
756            package_type: package_data.package_type,
757            namespace: package_data.namespace.clone(),
758            name: package_data.name.clone(),
759            version: package_data.version.clone(),
760            qualifiers: package_data.qualifiers.clone(),
761            subpath: package_data.subpath.clone(),
762            primary_language: package_data.primary_language.clone(),
763            description: package_data.description.clone(),
764            release_date: package_data.release_date.clone(),
765            parties: package_data.parties.clone(),
766            keywords: package_data.keywords.clone(),
767            homepage_url: package_data.homepage_url.clone(),
768            download_url: package_data.download_url.clone(),
769            size: package_data.size,
770            sha1: package_data.sha1.clone(),
771            md5: package_data.md5.clone(),
772            sha256: package_data.sha256.clone(),
773            sha512: package_data.sha512.clone(),
774            bug_tracking_url: package_data.bug_tracking_url.clone(),
775            code_view_url: package_data.code_view_url.clone(),
776            vcs_url: package_data.vcs_url.clone(),
777            copyright: package_data.copyright.clone(),
778            holder: package_data.holder.clone(),
779            declared_license_expression: package_data.declared_license_expression.clone(),
780            declared_license_expression_spdx: package_data.declared_license_expression_spdx.clone(),
781            license_detections: package_data.license_detections.clone(),
782            other_license_expression: package_data.other_license_expression.clone(),
783            other_license_expression_spdx: package_data.other_license_expression_spdx.clone(),
784            other_license_detections: package_data.other_license_detections.clone(),
785            extracted_license_statement: package_data.extracted_license_statement.clone(),
786            notice_text: package_data.notice_text.clone(),
787            source_packages: package_data.source_packages.clone(),
788            is_private: package_data.is_private,
789            is_virtual: package_data.is_virtual,
790            extra_data: package_data.extra_data.clone(),
791            repository_homepage_url: package_data.repository_homepage_url.clone(),
792            repository_download_url: package_data.repository_download_url.clone(),
793            api_data_url: package_data.api_data_url.clone(),
794            purl: package_data.purl.clone(),
795            package_uid,
796            datafile_paths: vec![datafile_path],
797            datasource_ids: if let Some(dsid) = package_data.datasource_id {
798                vec![dsid]
799            } else {
800                vec![]
801            },
802        }
803    }
804
805    /// Update this package with data from another `PackageData`.
806    ///
807    /// Merges data from a related file (e.g., lockfile) into this package.
808    /// Existing non-empty values are preserved; empty fields are filled from
809    /// the new data. Lists (parties, license_detections) are merged.
810    pub fn update(&mut self, package_data: &PackageData, datafile_path: String) {
811        let mut package_data = package_data.clone();
812        enrich_package_data_license_provenance(&mut package_data, &datafile_path);
813
814        if let Some(dsid) = package_data.datasource_id {
815            self.datasource_ids.push(dsid);
816        }
817        self.datafile_paths.push(datafile_path);
818
819        macro_rules! fill_if_empty {
820            ($field:ident) => {
821                if self.$field.is_none() {
822                    self.$field = package_data.$field.clone();
823                }
824            };
825        }
826
827        fill_if_empty!(package_type);
828        fill_if_empty!(name);
829        fill_if_empty!(namespace);
830        fill_if_empty!(version);
831        fill_if_empty!(qualifiers);
832        fill_if_empty!(subpath);
833        fill_if_empty!(primary_language);
834        fill_if_empty!(description);
835        fill_if_empty!(release_date);
836        fill_if_empty!(homepage_url);
837        fill_if_empty!(download_url);
838        fill_if_empty!(size);
839        fill_if_empty!(sha1);
840        fill_if_empty!(md5);
841        fill_if_empty!(sha256);
842        fill_if_empty!(sha512);
843        fill_if_empty!(bug_tracking_url);
844        fill_if_empty!(code_view_url);
845        fill_if_empty!(vcs_url);
846        fill_if_empty!(copyright);
847        fill_if_empty!(holder);
848        fill_if_empty!(declared_license_expression);
849        fill_if_empty!(declared_license_expression_spdx);
850        fill_if_empty!(other_license_expression);
851        fill_if_empty!(other_license_expression_spdx);
852        fill_if_empty!(extracted_license_statement);
853        fill_if_empty!(notice_text);
854        match (&mut self.extra_data, &package_data.extra_data) {
855            (None, Some(extra_data)) => {
856                self.extra_data = Some(extra_data.clone());
857            }
858            (Some(existing), Some(incoming)) => {
859                for (key, value) in incoming {
860                    existing.entry(key.clone()).or_insert_with(|| value.clone());
861                }
862            }
863            _ => {}
864        }
865        fill_if_empty!(repository_homepage_url);
866        fill_if_empty!(repository_download_url);
867        fill_if_empty!(api_data_url);
868
869        for party in &package_data.parties {
870            if let Some(existing) = self.parties.iter_mut().find(|p| {
871                p.role == party.role
872                    && ((p.name.is_some() && p.name == party.name)
873                        || (p.email.is_some() && p.email == party.email))
874            }) {
875                if existing.name.is_none() {
876                    existing.name = party.name.clone();
877                }
878                if existing.email.is_none() {
879                    existing.email = party.email.clone();
880                }
881            } else {
882                self.parties.push(party.clone());
883            }
884        }
885
886        for keyword in &package_data.keywords {
887            if !self.keywords.contains(keyword) {
888                self.keywords.push(keyword.clone());
889            }
890        }
891
892        for detection in &package_data.license_detections {
893            self.license_detections.push(detection.clone());
894        }
895
896        for detection in &package_data.other_license_detections {
897            self.other_license_detections.push(detection.clone());
898        }
899
900        for source_pkg in &package_data.source_packages {
901            if !self.source_packages.contains(source_pkg) {
902                self.source_packages.push(source_pkg.clone());
903            }
904        }
905
906        self.refresh_identity();
907    }
908
909    pub fn backfill_license_provenance(&mut self) {
910        let Some(datafile_path) = self.datafile_paths.first().cloned() else {
911            return;
912        };
913
914        for detection in &mut self.license_detections {
915            enrich_license_detection_provenance(detection, &datafile_path);
916        }
917        for detection in &mut self.other_license_detections {
918            enrich_license_detection_provenance(detection, &datafile_path);
919        }
920    }
921
922    fn refresh_identity(&mut self) {
923        let Some(next_purl) = self.build_current_purl() else {
924            return;
925        };
926
927        if self.purl.as_deref() != Some(next_purl.as_str()) || self.package_uid.is_empty() {
928            self.package_uid = build_package_uid(&next_purl);
929        }
930
931        self.purl = Some(next_purl);
932    }
933
934    fn build_current_purl(&self) -> Option<String> {
935        if let (Some(package_type), Some(name)) = (
936            self.package_type.as_ref(),
937            self.name
938                .as_deref()
939                .filter(|value| !value.trim().is_empty()),
940        ) {
941            let purl_type = match package_type {
942                PackageType::Deno => "generic",
943                _ => package_type.as_str(),
944            };
945
946            let mut purl = PackageUrl::new(purl_type, name).ok()?;
947
948            if let Some(namespace) = self
949                .namespace
950                .as_deref()
951                .filter(|value| !value.trim().is_empty())
952            {
953                purl.with_namespace(namespace).ok()?;
954            }
955
956            if let Some(version) = self
957                .version
958                .as_deref()
959                .filter(|value| !value.trim().is_empty())
960            {
961                purl.with_version(version).ok()?;
962            }
963
964            if let Some(qualifiers) = &self.qualifiers {
965                for (key, value) in qualifiers {
966                    purl.add_qualifier(key.as_str(), value.as_str()).ok()?;
967                }
968            }
969
970            if let Some(subpath) = self
971                .subpath
972                .as_deref()
973                .filter(|value| !value.trim().is_empty())
974            {
975                purl.with_subpath(subpath).ok()?;
976            }
977
978            return Some(purl.to_string());
979        }
980
981        let existing_purl = self.purl.as_deref()?;
982        let mut purl = PackageUrl::from_str(existing_purl).ok()?;
983
984        if let Some(version) = self
985            .version
986            .as_deref()
987            .filter(|value| !value.trim().is_empty())
988        {
989            purl.with_version(version).ok()?;
990        } else {
991            purl.without_version();
992        }
993
994        Some(purl.to_string())
995    }
996}
997
#[cfg(test)]
mod tests {
    use super::*;

    /// Path used as the datafile location in every test below.
    const DATAFILE_PATH: &str = "project/package.json";

    /// Npm package data carrying one parser-declared MIT detection whose match
    /// has no `from_file` and whose detection has no `identifier` yet.
    fn npm_package_data_without_provenance() -> PackageData {
        let declared_match = Match {
            license_expression: "mit".to_string(),
            license_expression_spdx: "MIT".to_string(),
            from_file: None,
            start_line: 1,
            end_line: 1,
            matcher: Some("parser-declared-license".to_string()),
            score: 100.0,
            matched_length: Some(1),
            match_coverage: Some(100.0),
            rule_relevance: Some(100),
            rule_identifier: None,
            rule_url: None,
            matched_text: Some("MIT".to_string()),
            referenced_filenames: None,
            matched_text_diagnostics: None,
        };

        let detection = LicenseDetection {
            license_expression: "mit".to_string(),
            license_expression_spdx: "MIT".to_string(),
            matches: vec![declared_match],
            detection_log: vec![],
            identifier: None,
        };

        PackageData {
            package_type: Some(PackageType::Npm),
            license_detections: vec![detection],
            ..PackageData::default()
        }
    }

    #[test]
    fn file_info_new_backfills_package_detection_provenance() {
        let file_info = FileInfo::new(
            "package.json".to_string(),
            "package".to_string(),
            ".json".to_string(),
            DATAFILE_PATH.to_string(),
            FileType::File,
            None,
            1,
            None,
            None,
            None,
            None,
            None,
            vec![npm_package_data_without_provenance()],
            None,
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
        );

        // File-level detections are backfilled...
        assert_eq!(file_info.license_detections.len(), 1);
        let file_detection = &file_info.license_detections[0];
        assert_eq!(
            file_detection.matches[0].from_file.as_deref(),
            Some(DATAFILE_PATH)
        );
        assert!(file_detection.identifier.is_some());

        // ...and so are the detections nested inside the package data.
        let package_detection = &file_info.package_data[0].license_detections[0];
        assert_eq!(
            package_detection.matches[0].from_file.as_deref(),
            Some(DATAFILE_PATH)
        );
        assert!(package_detection.identifier.is_some());
    }

    #[test]
    fn package_from_package_data_backfills_detection_provenance() {
        let package_data = npm_package_data_without_provenance();

        let package = Package::from_package_data(&package_data, DATAFILE_PATH.to_string());

        let detection = &package.license_detections[0];
        assert_eq!(
            detection.matches[0].from_file.as_deref(),
            Some(DATAFILE_PATH)
        );
        assert!(detection.identifier.is_some());
    }
}
1120
1121/// Top-level dependency instance, created during package assembly.
1122///
1123/// Extends the file-level `Dependency` with traceability fields that link
1124/// each dependency to its owning package and source datafile.
1125#[derive(Serialize, Deserialize, Debug, Clone)]
1126pub struct TopLevelDependency {
1127    pub purl: Option<String>,
1128    #[serde(skip_serializing_if = "Option::is_none")]
1129    pub extracted_requirement: Option<String>,
1130    pub scope: Option<String>,
1131    #[serde(skip_serializing_if = "Option::is_none")]
1132    pub is_runtime: Option<bool>,
1133    #[serde(skip_serializing_if = "Option::is_none")]
1134    pub is_optional: Option<bool>,
1135    #[serde(skip_serializing_if = "Option::is_none")]
1136    pub is_pinned: Option<bool>,
1137    #[serde(skip_serializing_if = "Option::is_none")]
1138    pub is_direct: Option<bool>,
1139    #[serde(skip_serializing_if = "Option::is_none")]
1140    pub resolved_package: Option<Box<ResolvedPackage>>,
1141    #[serde(skip_serializing_if = "Option::is_none")]
1142    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
1143    /// Unique identifier for this dependency instance (PURL with UUID qualifier).
1144    pub dependency_uid: String,
1145    /// The `package_uid` of the package this dependency belongs to.
1146    #[serde(skip_serializing_if = "Option::is_none")]
1147    pub for_package_uid: Option<String>,
1148    /// Path to the datafile where this dependency was declared.
1149    pub datafile_path: String,
1150    /// Datasource identifier for the parser that extracted this dependency.
1151    pub datasource_id: DatasourceId,
1152    /// Namespace for the dependency (e.g., distribution name for RPM packages).
1153    #[serde(skip_serializing_if = "Option::is_none")]
1154    pub namespace: Option<String>,
1155}
1156
1157impl TopLevelDependency {
1158    /// Create a `TopLevelDependency` from a file-level `Dependency`.
1159    pub fn from_dependency(
1160        dep: &Dependency,
1161        datafile_path: String,
1162        datasource_id: DatasourceId,
1163        for_package_uid: Option<String>,
1164    ) -> Self {
1165        let dependency_uid = dep
1166            .purl
1167            .as_ref()
1168            .map(|p| build_package_uid(p))
1169            .unwrap_or_default();
1170
1171        TopLevelDependency {
1172            purl: dep.purl.clone(),
1173            extracted_requirement: dep.extracted_requirement.clone(),
1174            scope: dep.scope.clone(),
1175            is_runtime: dep.is_runtime,
1176            is_optional: dep.is_optional,
1177            is_pinned: dep.is_pinned,
1178            is_direct: dep.is_direct,
1179            resolved_package: dep.resolved_package.clone(),
1180            extra_data: dep.extra_data.clone(),
1181            dependency_uid,
1182            for_package_uid,
1183            datafile_path,
1184            datasource_id,
1185            namespace: None,
1186        }
1187    }
1188}
1189
1190/// Generate a unique package identifier by appending a UUID v4 qualifier to a PURL.
1191///
1192/// The format matches Python ScanCode: `pkg:type/name@version?uuid=<uuid-v4>`
1193pub fn build_package_uid(purl: &str) -> String {
1194    let uuid = Uuid::new_v4();
1195    if purl.contains('?') {
1196        format!("{}&uuid={}", purl, uuid)
1197    } else {
1198        format!("{}?uuid={}", purl, uuid)
1199    }
1200}
1201
1202#[derive(Serialize, Deserialize, Debug, Clone)]
1203pub struct OutputEmail {
1204    pub email: String,
1205    pub start_line: usize,
1206    pub end_line: usize,
1207}
1208
1209#[derive(Serialize, Deserialize, Debug, Clone)]
1210pub struct OutputURL {
1211    pub url: String,
1212    pub start_line: usize,
1213    pub end_line: usize,
1214}
1215
/// Kind of filesystem entry described by a scan record.
///
/// Equality on this enum is total, so `Eq` and `Hash` are derived alongside
/// `PartialEq`; this lets the type be used as a map/set key and inside other
/// types that derive `Eq`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FileType {
    /// A regular file.
    File,
    /// A directory.
    Directory,
}
1221
1222impl Serialize for FileType {
1223    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
1224    where
1225        S: serde::Serializer,
1226    {
1227        let value = match self {
1228            FileType::File => "file",
1229            FileType::Directory => "directory",
1230        };
1231        serializer.serialize_str(value)
1232    }
1233}
1234
1235impl<'de> Deserialize<'de> for FileType {
1236    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
1237    where
1238        D: serde::Deserializer<'de>,
1239    {
1240        let value = String::deserialize(deserializer)?;
1241        match value.as_str() {
1242            "file" => Ok(FileType::File),
1243            "directory" => Ok(FileType::Directory),
1244            _ => Err(serde::de::Error::custom("invalid file type")),
1245        }
1246    }
1247}