Skip to main content

provenant/models/
file_info.rs

1use derive_builder::Builder;
2use packageurl::PackageUrl;
3use serde::{Deserialize, Serialize};
4use sha1::{Digest, Sha1};
5use std::collections::HashMap;
6use std::str::FromStr;
7
8use super::DatasourceId;
9use super::DependencyUid;
10use super::GitSha1;
11use super::LineNumber;
12use super::MatchScore;
13use super::Md5Digest;
14use super::PackageType;
15use super::PackageUid;
16use super::Sha1Digest;
17use super::Sha256Digest;
18use super::Sha512Digest;
19use crate::license_detection::tokenize::tokenize_without_stopwords;
20use crate::models::output::Tallies;
21use crate::utils::spdx::combine_license_expressions;
22
#[derive(Debug, Builder, Serialize, Deserialize, Clone)]
#[builder(build_fn(skip))]
/// File-level scan result containing metadata and detected findings.
///
/// The builder's generated `build` is skipped (`build_fn(skip)`); the
/// hand-written `FileInfoBuilder::build` in this file is used instead.
pub struct FileInfo {
    // `name`, `base_name`, `extension`, `path`, `file_type` and `size` are
    // required by `FileInfoBuilder::build`, which returns
    // `Err("Missing field: …")` when one of them was never set.
    pub name: String,
    pub base_name: String,
    pub extension: String,
    pub path: String,
    /// Serialized under the key `type`.
    #[serde(rename = "type")] // name used by ScanCode
    pub file_type: FileType,
    #[builder(default)]
    #[serde(default)]
    pub mime_type: Option<String>,
    /// Serialized under the key `file_type` (not to be confused with the
    /// Rust field `file_type`, which is serialized as `type`).
    #[builder(default)]
    #[serde(rename = "file_type", default)]
    pub file_type_label: Option<String>,
    pub size: u64,
    #[builder(default)]
    #[serde(default)]
    pub date: Option<String>,
    #[builder(default)]
    #[serde(default)]
    pub sha1: Option<Sha1Digest>,
    #[builder(default)]
    #[serde(default)]
    pub md5: Option<Md5Digest>,
    #[builder(default)]
    #[serde(default)]
    pub sha256: Option<Sha256Digest>,
    /// Not a parameter of `FileInfo::new`, which always starts it at `None`.
    #[builder(default)]
    #[serde(default)]
    pub sha1_git: Option<GitSha1>,
    #[builder(default)]
    #[serde(default)]
    pub programming_language: Option<String>,
    #[builder(default)]
    #[serde(default)]
    pub package_data: Vec<PackageData>,
    /// Serialized under the key `detected_license_expression_spdx`.
    ///
    /// When `FileInfo::new` receives `None`, it derives this value from the
    /// package data expressions and, failing that, from the license detections.
    #[serde(rename = "detected_license_expression_spdx")] // name used by ScanCode
    #[builder(default)]
    pub license_expression: Option<String>,
    /// When `FileInfo::new` receives an empty list, it copies the license
    /// detections of every entry in `package_data` into this field.
    #[builder(default)]
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    #[builder(default)]
    #[serde(default)]
    pub license_clues: Vec<Match>,
    #[builder(default)]
    #[serde(default)]
    pub percentage_of_license_text: Option<f64>,
    #[builder(default)]
    #[serde(default)]
    pub copyrights: Vec<Copyright>,
    #[builder(default)]
    #[serde(default)]
    pub holders: Vec<Holder>,
    #[builder(default)]
    #[serde(default)]
    pub authors: Vec<Author>,
    #[builder(default)]
    #[serde(default)]
    pub emails: Vec<OutputEmail>,
    #[builder(default)]
    #[serde(default)]
    pub urls: Vec<OutputURL>,
    #[builder(default)]
    #[serde(default)]
    pub for_packages: Vec<PackageUid>,
    #[builder(default)]
    #[serde(default)]
    pub scan_errors: Vec<String>,
    #[builder(default)]
    #[serde(default)]
    pub license_policy: Option<Vec<LicensePolicyEntry>>,
    // The remaining fields are not parameters of `FileInfo::new`; it
    // initializes the optional ones to `None`, the booleans to `false` and
    // `facets` to an empty list.
    #[builder(default)]
    #[serde(default)]
    pub is_generated: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_binary: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_text: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_archive: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_media: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_source: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_script: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub files_count: Option<usize>,
    #[builder(default)]
    #[serde(default)]
    pub dirs_count: Option<usize>,
    #[builder(default)]
    #[serde(default)]
    pub size_count: Option<u64>,
    #[builder(default)]
    #[serde(default)]
    pub source_count: Option<usize>,
    // Boolean flags below deserialize to `false` when the key is absent.
    #[builder(default)]
    #[serde(default)]
    pub is_legal: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_manifest: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_readme: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_top_level: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_key_file: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_community: bool,
    #[builder(default)]
    #[serde(default)]
    pub facets: Vec<String>,
    #[builder(default)]
    #[serde(default)]
    pub tallies: Option<Tallies>,
}
155
156impl FileInfoBuilder {
157    /// Build a [`FileInfo`] from the current builder state.
158    pub fn build(&self) -> Result<FileInfo, String> {
159        let mut file_info = FileInfo::new(
160            self.name.clone().ok_or("Missing field: name")?,
161            self.base_name.clone().ok_or("Missing field: base_name")?,
162            self.extension.clone().ok_or("Missing field: extension")?,
163            self.path.clone().ok_or("Missing field: path")?,
164            self.file_type.clone().ok_or("Missing field: file_type")?,
165            self.mime_type.clone().flatten(),
166            self.file_type_label.clone().flatten(),
167            self.size.ok_or("Missing field: size")?,
168            self.date.clone().flatten(),
169            self.sha1.flatten(),
170            self.md5.flatten(),
171            self.sha256.flatten(),
172            self.programming_language.clone().flatten(),
173            self.package_data.clone().unwrap_or_default(),
174            self.license_expression.clone().flatten(),
175            self.license_detections.clone().unwrap_or_default(),
176            self.license_clues.clone().unwrap_or_default(),
177            self.copyrights.clone().unwrap_or_default(),
178            self.holders.clone().unwrap_or_default(),
179            self.authors.clone().unwrap_or_default(),
180            self.emails.clone().unwrap_or_default(),
181            self.urls.clone().unwrap_or_default(),
182            self.for_packages.clone().unwrap_or_default(),
183            self.scan_errors.clone().unwrap_or_default(),
184        );
185        file_info.license_policy = self.license_policy.clone().flatten();
186        file_info.sha1_git = self.sha1_git.flatten();
187        file_info.is_binary = self.is_binary.flatten();
188        file_info.is_text = self.is_text.flatten();
189        file_info.is_archive = self.is_archive.flatten();
190        file_info.is_media = self.is_media.flatten();
191        file_info.is_script = self.is_script.flatten();
192        file_info.files_count = self.files_count.flatten();
193        file_info.dirs_count = self.dirs_count.flatten();
194        file_info.size_count = self.size_count.flatten();
195        Ok(file_info)
196    }
197}
198
199impl FileInfo {
200    #[allow(clippy::too_many_arguments)]
201    /// Construct a [`FileInfo`] from fully resolved scanner fields.
202    pub fn new(
203        name: String,
204        base_name: String,
205        extension: String,
206        path: String,
207        file_type: FileType,
208        mime_type: Option<String>,
209        file_type_label: Option<String>,
210        size: u64,
211        date: Option<String>,
212        sha1: Option<Sha1Digest>,
213        md5: Option<Md5Digest>,
214        sha256: Option<Sha256Digest>,
215        programming_language: Option<String>,
216        package_data: Vec<PackageData>,
217        mut license_expression: Option<String>,
218        mut license_detections: Vec<LicenseDetection>,
219        license_clues: Vec<Match>,
220        copyrights: Vec<Copyright>,
221        holders: Vec<Holder>,
222        authors: Vec<Author>,
223        emails: Vec<OutputEmail>,
224        urls: Vec<OutputURL>,
225        for_packages: Vec<PackageUid>,
226        scan_errors: Vec<String>,
227    ) -> Self {
228        let mut package_data = package_data;
229        for package in &mut package_data {
230            enrich_package_data_license_provenance(package, &path);
231        }
232
233        // Combine license expressions from package data if license_expression is None
234        license_expression = license_expression.or_else(|| {
235            let expressions = package_data
236                .iter()
237                .filter_map(|pkg| pkg.get_license_expression());
238            combine_license_expressions(expressions)
239        });
240
241        // Combine license detections from package data if none are provided
242        if license_detections.is_empty() {
243            for pkg in &package_data {
244                license_detections.extend(pkg.license_detections.clone());
245            }
246        }
247
248        // Combine license expressions from license detections if license_expression is still None
249        if license_expression.is_none() && !license_detections.is_empty() {
250            let expressions = license_detections
251                .iter()
252                .map(|detection| detection.license_expression.clone());
253            license_expression = combine_license_expressions(expressions);
254        }
255
256        let mut file_info = FileInfo {
257            name,
258            base_name,
259            extension,
260            path,
261            file_type,
262            mime_type,
263            file_type_label,
264            size,
265            date,
266            sha1,
267            md5,
268            sha256,
269            sha1_git: None,
270            programming_language,
271            package_data,
272            license_expression,
273            license_detections,
274            license_clues,
275            percentage_of_license_text: None,
276            copyrights,
277            holders,
278            authors,
279            emails,
280            urls,
281            for_packages,
282            scan_errors,
283            license_policy: None,
284            is_generated: None,
285            is_binary: None,
286            is_text: None,
287            is_archive: None,
288            is_media: None,
289            is_source: None,
290            is_script: None,
291            files_count: None,
292            dirs_count: None,
293            size_count: None,
294            source_count: None,
295            is_legal: false,
296            is_manifest: false,
297            is_readme: false,
298            is_top_level: false,
299            is_key_file: false,
300            is_community: false,
301            facets: vec![],
302            tallies: None,
303        };
304        file_info.backfill_license_provenance();
305        file_info
306    }
307
308    pub fn backfill_license_provenance(&mut self) {
309        for detection in &mut self.license_detections {
310            enrich_license_detection_provenance(detection, &self.path);
311        }
312
313        for package in &mut self.package_data {
314            enrich_package_data_license_provenance(package, &self.path);
315        }
316    }
317}
318
319fn enrich_package_data_license_provenance(package_data: &mut PackageData, path: &str) {
320    for detection in &mut package_data.license_detections {
321        enrich_license_detection_provenance(detection, path);
322    }
323    for detection in &mut package_data.other_license_detections {
324        enrich_license_detection_provenance(detection, path);
325    }
326}
327
328pub(crate) fn enrich_license_detection_provenance(detection: &mut LicenseDetection, path: &str) {
329    for detection_match in &mut detection.matches {
330        if detection_match.from_file.is_none() {
331            detection_match.from_file = Some(path.to_string());
332        }
333    }
334
335    if detection.identifier.is_none() {
336        detection.identifier = Some(compute_public_detection_identifier(detection));
337    }
338}
339
340fn compute_public_detection_identifier(detection: &LicenseDetection) -> String {
341    let expression = python_safe_name(&detection.license_expression);
342    let mut hasher = Sha1::new();
343    hasher.update(format_public_detection_content(detection).as_bytes());
344    let hex_str = hex::encode(hasher.finalize());
345    let uuid_hex = &hex_str[..32];
346    let content_uuid = uuid::Uuid::parse_str(uuid_hex)
347        .map(|uuid| uuid.to_string())
348        .unwrap_or_else(|_| uuid_hex.to_string());
349
350    format!("{}-{}", expression, content_uuid)
351}
352
353fn format_public_detection_content(detection: &LicenseDetection) -> String {
354    let mut result = String::from("(");
355
356    for (index, detection_match) in detection.matches.iter().enumerate() {
357        if index > 0 {
358            result.push_str(", ");
359        }
360        result.push_str(&format!(
361            "({}, {}, {})",
362            python_str_repr(
363                detection_match
364                    .rule_identifier
365                    .as_deref()
366                    .or(detection_match.matcher.as_deref())
367                    .unwrap_or("parser-declared-license")
368            ),
369            detection_match.score.value() as f32,
370            python_token_tuple_repr(&tokenize_without_stopwords(
371                detection_match.matched_text.as_deref().unwrap_or_default(),
372            )),
373        ));
374    }
375
376    if detection.matches.len() == 1 {
377        result.push(',');
378    }
379    result.push(')');
380    result
381}
382
/// Reduce `value` to alphanumeric runs joined by single underscores.
///
/// Every run of non-alphanumeric characters (underscores included) collapses
/// into one `_`, and leading/trailing separators are dropped. Returns an empty
/// string when `value` contains no alphanumeric character.
fn python_safe_name(value: &str) -> String {
    value
        .split(|character: char| !character.is_alphanumeric())
        .filter(|segment| !segment.is_empty())
        .collect::<Vec<_>>()
        .join("_")
}
404
/// Render `value` the way Python's `repr()` renders a `str`.
///
/// Quoting follows Python: double quotes are used only when the value contains
/// a single quote and no double quote; otherwise single quotes are used.
/// Backslashes and the chosen quote character are backslash-escaped. Newline,
/// carriage return and tab become `\n`, `\r` and `\t`, and any other control
/// character becomes `\xNN` (lowercase hex), matching Python's output.
///
/// Limitation: Python also escapes non-printable characters outside the
/// control range (e.g. format characters); those are emitted unchanged here
/// because the standard library exposes no printable-character table.
fn python_str_repr(value: &str) -> String {
    let quote = if value.contains('\'') && !value.contains('"') {
        '"'
    } else {
        '\''
    };

    let mut result = String::with_capacity(value.len() + 2);
    result.push(quote);
    for character in value.chars() {
        match character {
            '\\' => result.push_str("\\\\"),
            '\n' => result.push_str("\\n"),
            '\r' => result.push_str("\\r"),
            '\t' => result.push_str("\\t"),
            c if c == quote => {
                result.push('\\');
                result.push(c);
            }
            // All control characters are <= U+009F, so two hex digits suffice.
            c if c.is_control() => result.push_str(&format!("\\x{:02x}", u32::from(c))),
            c => result.push(c),
        }
    }
    result.push(quote);
    result
}
412
413fn python_token_tuple_repr(tokens: &[String]) -> String {
414    if tokens.is_empty() {
415        return String::from("()");
416    }
417
418    let mut result = String::from("(");
419    for (index, token) in tokens.iter().enumerate() {
420        if index > 0 {
421            result.push_str(", ");
422        }
423        result.push_str(&python_str_repr(token));
424    }
425
426    if tokens.len() == 1 {
427        result.push(',');
428    }
429    result.push(')');
430    result
431}
432
/// Package metadata extracted from manifest files.
///
/// Compatible with ScanCode Toolkit output format. Contains standardized package
/// information including name, version, dependencies, licenses, and other metadata.
/// This is the primary data structure returned by all parsers.
///
/// Derives `Default`, so every field starts as `None`, `false` or empty.
/// Collection and boolean fields carry `#[serde(default)]` and may therefore
/// be absent from serialized input.
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
pub struct PackageData {
    /// Serialized under the key `type`.
    #[serde(rename = "type")] // name used by ScanCode
    pub package_type: Option<PackageType>,
    pub namespace: Option<String>,
    pub name: Option<String>,
    pub version: Option<String>,
    #[serde(default)]
    pub qualifiers: Option<HashMap<String, String>>,
    pub subpath: Option<String>,
    pub primary_language: Option<String>,
    pub description: Option<String>,
    pub release_date: Option<String>,
    #[serde(default)]
    pub parties: Vec<Party>,
    #[serde(default)]
    pub keywords: Vec<String>,
    pub homepage_url: Option<String>,
    pub download_url: Option<String>,
    pub size: Option<u64>,
    pub sha1: Option<Sha1Digest>,
    pub md5: Option<Md5Digest>,
    pub sha256: Option<Sha256Digest>,
    pub sha512: Option<Sha512Digest>,
    pub bug_tracking_url: Option<String>,
    pub code_view_url: Option<String>,
    pub vcs_url: Option<String>,
    pub copyright: Option<String>,
    pub holder: Option<String>,
    pub declared_license_expression: Option<String>,
    pub declared_license_expression_spdx: Option<String>,
    /// Source of `get_license_expression`; provenance on these detections is
    /// backfilled by `enrich_package_data_license_provenance`.
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    pub other_license_expression: Option<String>,
    pub other_license_expression_spdx: Option<String>,
    /// Also backfilled by `enrich_package_data_license_provenance`.
    #[serde(default)]
    pub other_license_detections: Vec<LicenseDetection>,
    pub extracted_license_statement: Option<String>,
    pub notice_text: Option<String>,
    #[serde(default)]
    pub source_packages: Vec<String>,
    #[serde(default)]
    pub file_references: Vec<FileReference>,
    #[serde(default)]
    pub is_private: bool,
    #[serde(default)]
    pub is_virtual: bool,
    #[serde(default)]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
    #[serde(default)]
    pub dependencies: Vec<Dependency>,
    pub repository_homepage_url: Option<String>,
    pub repository_download_url: Option<String>,
    pub api_data_url: Option<String>,
    pub datasource_id: Option<DatasourceId>,
    pub purl: Option<String>,
}
495
496impl PackageData {
497    /// Extracts a single license expression from all license detections in this package.
498    /// Returns None if there are no license detections.
499    pub fn get_license_expression(&self) -> Option<String> {
500        if self.license_detections.is_empty() {
501            return None;
502        }
503
504        let expressions = self
505            .license_detections
506            .iter()
507            .map(|detection| detection.license_expression.clone());
508        combine_license_expressions(expressions)
509    }
510}
511
/// License detection result containing matched license expressions.
///
/// Aggregates multiple license matches into a single SPDX license expression.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct LicenseDetection {
    /// Expression used (after `python_safe_name`) as the prefix of a computed
    /// `identifier`.
    pub license_expression: String,
    pub license_expression_spdx: String,
    /// Individual matches; their rule identifier/matcher, score and matched
    /// text feed the hashed part of a computed `identifier`.
    pub matches: Vec<Match>,
    /// Defaults to an empty list when absent from serialized input.
    #[serde(default)]
    pub detection_log: Vec<String>,
    /// When `None`, `enrich_license_detection_provenance` fills it in with
    /// `<safe expression>-<uuid derived from the match content>`.
    pub identifier: Option<String>,
}
524
/// Individual license text match with location and confidence score.
///
/// Represents a specific region of text that matched a known license pattern.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct Match {
    pub license_expression: String,
    pub license_expression_spdx: String,
    /// When `None`, `enrich_license_detection_provenance` sets it to the path
    /// it was called with.
    pub from_file: Option<String>,
    pub start_line: LineNumber,
    pub end_line: LineNumber,
    /// Used as the match label in a computed detection identifier when
    /// `rule_identifier` is `None`.
    pub matcher: Option<String>,
    /// Included in the content hashed for a computed detection identifier.
    pub score: MatchScore,
    pub matched_length: Option<usize>,
    pub match_coverage: Option<f64>,
    pub rule_relevance: Option<u8>,
    /// Preferred match label in a computed detection identifier.
    pub rule_identifier: Option<String>,
    pub rule_url: Option<String>,
    /// Tokenized (stopwords removed) into the content hashed for a computed
    /// detection identifier; treated as empty when `None`.
    pub matched_text: Option<String>,
    pub matched_text_diagnostics: Option<String>,
    #[serde(default)]
    pub referenced_filenames: Option<Vec<String>>,
}
547
/// Copyright entry stored in `FileInfo::copyrights`: the statement text
/// together with its start and end line numbers.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Copyright {
    pub copyright: String,
    pub start_line: LineNumber,
    pub end_line: LineNumber,
}
554
/// Holder entry stored in `FileInfo::holders`: the holder text together with
/// its start and end line numbers.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Holder {
    pub holder: String,
    pub start_line: LineNumber,
    pub end_line: LineNumber,
}
561
/// Author entry stored in `FileInfo::authors`: the author text together with
/// its start and end line numbers.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Author {
    pub author: String,
    pub start_line: LineNumber,
    pub end_line: LineNumber,
}
568
/// Package dependency information with version constraints.
///
/// Represents a declared dependency with scope (e.g., runtime, dev, optional)
/// and optional resolved package details.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Dependency {
    pub purl: Option<String>,
    pub extracted_requirement: Option<String>,
    pub scope: Option<String>,
    // The four flags are tri-state: `None` is distinct from `Some(false)`.
    pub is_runtime: Option<bool>,
    pub is_optional: Option<bool>,
    pub is_pinned: Option<bool>,
    pub is_direct: Option<bool>,
    /// Boxed because `ResolvedPackage` itself holds a `Vec<Dependency>`,
    /// making the two types mutually recursive.
    pub resolved_package: Option<Box<ResolvedPackage>>,
    #[serde(default)]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
}
586
/// Package details attached to a [`Dependency`] through `resolved_package`.
///
/// Carries the same fields as `PackageData`, except that `package_type`,
/// `namespace`, `name` and `version` are required values here rather than
/// `Option`s. Collection and boolean fields carry `#[serde(default)]` and may
/// be absent from serialized input.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ResolvedPackage {
    /// Serialized under the key `type`.
    #[serde(rename = "type")]
    pub package_type: PackageType,
    pub namespace: String,
    pub name: String,
    pub version: String,
    #[serde(default)]
    pub qualifiers: Option<HashMap<String, String>>,
    pub subpath: Option<String>,
    pub primary_language: Option<String>,
    pub description: Option<String>,
    pub release_date: Option<String>,
    #[serde(default)]
    pub parties: Vec<Party>,
    #[serde(default)]
    pub keywords: Vec<String>,
    pub homepage_url: Option<String>,
    pub download_url: Option<String>,
    pub size: Option<u64>,
    pub sha1: Option<Sha1Digest>,
    pub md5: Option<Md5Digest>,
    pub sha256: Option<Sha256Digest>,
    pub sha512: Option<Sha512Digest>,
    pub bug_tracking_url: Option<String>,
    pub code_view_url: Option<String>,
    pub vcs_url: Option<String>,
    pub copyright: Option<String>,
    pub holder: Option<String>,
    pub declared_license_expression: Option<String>,
    pub declared_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    pub other_license_expression: Option<String>,
    pub other_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub other_license_detections: Vec<LicenseDetection>,
    pub extracted_license_statement: Option<String>,
    pub notice_text: Option<String>,
    #[serde(default)]
    pub source_packages: Vec<String>,
    #[serde(default)]
    pub file_references: Vec<FileReference>,
    #[serde(default)]
    pub is_private: bool,
    #[serde(default)]
    pub is_virtual: bool,
    #[serde(default)]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
    /// Nested dependencies of the resolved package.
    #[serde(default)]
    pub dependencies: Vec<Dependency>,
    pub repository_homepage_url: Option<String>,
    pub repository_download_url: Option<String>,
    pub api_data_url: Option<String>,
    pub datasource_id: Option<DatasourceId>,
    pub purl: Option<String>,
}
644
645impl ResolvedPackage {
646    pub fn new(
647        package_type: PackageType,
648        namespace: String,
649        name: String,
650        version: String,
651    ) -> Self {
652        Self {
653            package_type,
654            namespace,
655            name,
656            version,
657            qualifiers: None,
658            subpath: None,
659            primary_language: None,
660            description: None,
661            release_date: None,
662            parties: vec![],
663            keywords: vec![],
664            homepage_url: None,
665            download_url: None,
666            size: None,
667            sha1: None,
668            md5: None,
669            sha256: None,
670            sha512: None,
671            bug_tracking_url: None,
672            code_view_url: None,
673            vcs_url: None,
674            copyright: None,
675            holder: None,
676            declared_license_expression: None,
677            declared_license_expression_spdx: None,
678            license_detections: vec![],
679            other_license_expression: None,
680            other_license_expression_spdx: None,
681            other_license_detections: vec![],
682            extracted_license_statement: None,
683            notice_text: None,
684            source_packages: vec![],
685            file_references: vec![],
686            is_private: false,
687            is_virtual: false,
688            extra_data: None,
689            dependencies: vec![],
690            repository_homepage_url: None,
691            repository_download_url: None,
692            api_data_url: None,
693            datasource_id: None,
694            purl: None,
695        }
696    }
697
698    pub fn from_package_data(package_data: &PackageData, fallback_type: PackageType) -> Self {
699        Self {
700            package_type: package_data.package_type.unwrap_or(fallback_type),
701            namespace: package_data.namespace.clone().unwrap_or_default(),
702            name: package_data.name.clone().unwrap_or_default(),
703            version: package_data.version.clone().unwrap_or_default(),
704            qualifiers: package_data.qualifiers.clone(),
705            subpath: package_data.subpath.clone(),
706            primary_language: package_data.primary_language.clone(),
707            description: package_data.description.clone(),
708            release_date: package_data.release_date.clone(),
709            parties: package_data.parties.clone(),
710            keywords: package_data.keywords.clone(),
711            homepage_url: package_data.homepage_url.clone(),
712            download_url: package_data.download_url.clone(),
713            size: package_data.size,
714            sha1: package_data.sha1,
715            md5: package_data.md5,
716            sha256: package_data.sha256,
717            sha512: package_data.sha512,
718            bug_tracking_url: package_data.bug_tracking_url.clone(),
719            code_view_url: package_data.code_view_url.clone(),
720            vcs_url: package_data.vcs_url.clone(),
721            copyright: package_data.copyright.clone(),
722            holder: package_data.holder.clone(),
723            declared_license_expression: package_data.declared_license_expression.clone(),
724            declared_license_expression_spdx: package_data.declared_license_expression_spdx.clone(),
725            license_detections: package_data.license_detections.clone(),
726            other_license_expression: package_data.other_license_expression.clone(),
727            other_license_expression_spdx: package_data.other_license_expression_spdx.clone(),
728            other_license_detections: package_data.other_license_detections.clone(),
729            extracted_license_statement: package_data.extracted_license_statement.clone(),
730            notice_text: package_data.notice_text.clone(),
731            source_packages: package_data.source_packages.clone(),
732            file_references: package_data.file_references.clone(),
733            is_private: package_data.is_private,
734            is_virtual: package_data.is_virtual,
735            extra_data: package_data.extra_data.clone(),
736            dependencies: package_data.dependencies.clone(),
737            repository_homepage_url: package_data.repository_homepage_url.clone(),
738            repository_download_url: package_data.repository_download_url.clone(),
739            api_data_url: package_data.api_data_url.clone(),
740            datasource_id: package_data.datasource_id,
741            purl: package_data.purl.clone(),
742        }
743    }
744}
745
/// Author, maintainer, or contributor information.
///
/// Represents a person or organization associated with a package.
/// Every field is optional.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Party {
    /// Raw identifier because `type` is a Rust keyword; serde strips the
    /// `r#` prefix, so the serialized key is `type`.
    pub r#type: Option<String>,
    pub role: Option<String>,
    pub name: Option<String>,
    pub email: Option<String>,
    pub url: Option<String>,
    pub organization: Option<String>,
    pub organization_url: Option<String>,
    pub timezone: Option<String>,
}
760
/// Reference to a file within a package archive with checksums.
///
/// Used in SBOM generation to track files within distribution archives.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct FileReference {
    /// The only required field; size and all digests are optional.
    pub path: String,
    pub size: Option<u64>,
    pub sha1: Option<Sha1Digest>,
    pub md5: Option<Md5Digest>,
    pub sha256: Option<Sha256Digest>,
    pub sha512: Option<Sha512Digest>,
    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
}
774
/// Top-level assembled package, created by merging one or more `PackageData`
/// objects from related manifest/lockfiles (e.g., package.json + package-lock.json).
///
/// Compatible with ScanCode Toolkit output format. The key differences from
/// `PackageData` are:
/// - `package_uid`: unique identifier (PURL with UUID qualifier)
/// - `datafile_paths`: list of all contributing files
/// - `datasource_ids`: list of all contributing parsers
/// - Excludes `dependencies` and `file_references` (hoisted to top-level)
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Package {
    /// Serialized under the key `type`.
    #[serde(rename = "type")]
    pub package_type: Option<PackageType>,
    pub namespace: Option<String>,
    pub name: Option<String>,
    pub version: Option<String>,
    #[serde(default)]
    pub qualifiers: Option<HashMap<String, String>>,
    pub subpath: Option<String>,
    pub primary_language: Option<String>,
    pub description: Option<String>,
    pub release_date: Option<String>,
    #[serde(default)]
    pub parties: Vec<Party>,
    #[serde(default)]
    pub keywords: Vec<String>,
    pub homepage_url: Option<String>,
    pub download_url: Option<String>,
    pub size: Option<u64>,
    pub sha1: Option<Sha1Digest>,
    pub md5: Option<Md5Digest>,
    pub sha256: Option<Sha256Digest>,
    pub sha512: Option<Sha512Digest>,
    pub bug_tracking_url: Option<String>,
    pub code_view_url: Option<String>,
    pub vcs_url: Option<String>,
    pub copyright: Option<String>,
    pub holder: Option<String>,
    pub declared_license_expression: Option<String>,
    pub declared_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    pub other_license_expression: Option<String>,
    pub other_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub other_license_detections: Vec<LicenseDetection>,
    pub extracted_license_statement: Option<String>,
    pub notice_text: Option<String>,
    #[serde(default)]
    pub source_packages: Vec<String>,
    #[serde(default)]
    pub is_private: bool,
    #[serde(default)]
    pub is_virtual: bool,
    #[serde(default)]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
    pub repository_homepage_url: Option<String>,
    pub repository_download_url: Option<String>,
    pub api_data_url: Option<String>,
    pub purl: Option<String>,
    // The three fields below have no `#[serde(default)]`, so they must be
    // present in serialized input.
    /// Unique identifier for this package instance (PURL with UUID qualifier).
    pub package_uid: PackageUid,
    /// Paths to all datafiles that contributed to this package.
    pub datafile_paths: Vec<String>,
    /// Datasource identifiers for all parsers that contributed to this package.
    pub datasource_ids: Vec<DatasourceId>,
}
842
843impl Package {
844    /// Create a `Package` from a `PackageData` and its source file path.
845    ///
846    /// Generates a unique `package_uid` by appending a UUID qualifier to the PURL.
847    /// If the `PackageData` has no PURL, the package_uid will be an empty string.
848    pub fn from_package_data(package_data: &PackageData, datafile_path: String) -> Self {
849        let mut package_data = package_data.clone();
850        enrich_package_data_license_provenance(&mut package_data, &datafile_path);
851
852        let package_uid = package_data
853            .purl
854            .as_ref()
855            .map(|p| PackageUid::new(p))
856            .unwrap_or_else(PackageUid::empty);
857
858        Package {
859            package_type: package_data.package_type,
860            namespace: package_data.namespace.clone(),
861            name: package_data.name.clone(),
862            version: package_data.version.clone(),
863            qualifiers: package_data.qualifiers.clone(),
864            subpath: package_data.subpath.clone(),
865            primary_language: package_data.primary_language.clone(),
866            description: package_data.description.clone(),
867            release_date: package_data.release_date.clone(),
868            parties: package_data.parties.clone(),
869            keywords: package_data.keywords.clone(),
870            homepage_url: package_data.homepage_url.clone(),
871            download_url: package_data.download_url.clone(),
872            size: package_data.size,
873            sha1: package_data.sha1,
874            md5: package_data.md5,
875            sha256: package_data.sha256,
876            sha512: package_data.sha512,
877            bug_tracking_url: package_data.bug_tracking_url.clone(),
878            code_view_url: package_data.code_view_url.clone(),
879            vcs_url: package_data.vcs_url.clone(),
880            copyright: package_data.copyright.clone(),
881            holder: package_data.holder.clone(),
882            declared_license_expression: package_data.declared_license_expression.clone(),
883            declared_license_expression_spdx: package_data.declared_license_expression_spdx.clone(),
884            license_detections: package_data.license_detections.clone(),
885            other_license_expression: package_data.other_license_expression.clone(),
886            other_license_expression_spdx: package_data.other_license_expression_spdx.clone(),
887            other_license_detections: package_data.other_license_detections.clone(),
888            extracted_license_statement: package_data.extracted_license_statement.clone(),
889            notice_text: package_data.notice_text.clone(),
890            source_packages: package_data.source_packages.clone(),
891            is_private: package_data.is_private,
892            is_virtual: package_data.is_virtual,
893            extra_data: package_data.extra_data.clone(),
894            repository_homepage_url: package_data.repository_homepage_url.clone(),
895            repository_download_url: package_data.repository_download_url.clone(),
896            api_data_url: package_data.api_data_url.clone(),
897            purl: package_data.purl.clone(),
898            package_uid,
899            datafile_paths: vec![datafile_path],
900            datasource_ids: if let Some(dsid) = package_data.datasource_id {
901                vec![dsid]
902            } else {
903                vec![]
904            },
905        }
906    }
907
908    /// Update this package with data from another `PackageData`.
909    ///
910    /// Merges data from a related file (e.g., lockfile) into this package.
911    /// Existing non-empty values are preserved; empty fields are filled from
912    /// the new data. Lists (parties, license_detections) are merged.
913    pub fn update(&mut self, package_data: &PackageData, datafile_path: String) {
914        let mut package_data = package_data.clone();
915        enrich_package_data_license_provenance(&mut package_data, &datafile_path);
916
917        if let Some(dsid) = package_data.datasource_id {
918            self.datasource_ids.push(dsid);
919        }
920        self.datafile_paths.push(datafile_path);
921
922        macro_rules! fill_if_empty {
923            ($field:ident) => {
924                if self.$field.is_none() {
925                    self.$field = package_data.$field;
926                }
927            };
928        }
929
930        fill_if_empty!(package_type);
931        fill_if_empty!(name);
932        fill_if_empty!(namespace);
933        fill_if_empty!(version);
934        fill_if_empty!(qualifiers);
935        fill_if_empty!(subpath);
936        fill_if_empty!(primary_language);
937        fill_if_empty!(description);
938        fill_if_empty!(release_date);
939        fill_if_empty!(homepage_url);
940        fill_if_empty!(download_url);
941        fill_if_empty!(size);
942        fill_if_empty!(sha1);
943        fill_if_empty!(md5);
944        fill_if_empty!(sha256);
945        fill_if_empty!(sha512);
946        fill_if_empty!(bug_tracking_url);
947        fill_if_empty!(code_view_url);
948        fill_if_empty!(vcs_url);
949        fill_if_empty!(copyright);
950        fill_if_empty!(holder);
951        fill_if_empty!(declared_license_expression);
952        fill_if_empty!(declared_license_expression_spdx);
953        fill_if_empty!(other_license_expression);
954        fill_if_empty!(other_license_expression_spdx);
955        fill_if_empty!(extracted_license_statement);
956        fill_if_empty!(notice_text);
957        match (&mut self.extra_data, &package_data.extra_data) {
958            (None, Some(extra_data)) => {
959                self.extra_data = Some(extra_data.clone());
960            }
961            (Some(existing), Some(incoming)) => {
962                for (key, value) in incoming {
963                    existing.entry(key.clone()).or_insert_with(|| value.clone());
964                }
965            }
966            _ => {}
967        }
968        fill_if_empty!(repository_homepage_url);
969        fill_if_empty!(repository_download_url);
970        fill_if_empty!(api_data_url);
971
972        for party in &package_data.parties {
973            if let Some(existing) = self.parties.iter_mut().find(|p| {
974                p.role == party.role
975                    && ((p.name.is_some() && p.name == party.name)
976                        || (p.email.is_some() && p.email == party.email))
977            }) {
978                if existing.name.is_none() {
979                    existing.name = party.name.clone();
980                }
981                if existing.email.is_none() {
982                    existing.email = party.email.clone();
983                }
984            } else {
985                self.parties.push(party.clone());
986            }
987        }
988
989        for keyword in &package_data.keywords {
990            if !self.keywords.contains(keyword) {
991                self.keywords.push(keyword.clone());
992            }
993        }
994
995        for detection in &package_data.license_detections {
996            self.license_detections.push(detection.clone());
997        }
998
999        for detection in &package_data.other_license_detections {
1000            self.other_license_detections.push(detection.clone());
1001        }
1002
1003        for source_pkg in &package_data.source_packages {
1004            if !self.source_packages.contains(source_pkg) {
1005                self.source_packages.push(source_pkg.clone());
1006            }
1007        }
1008
1009        self.refresh_identity();
1010    }
1011
1012    pub fn backfill_license_provenance(&mut self) {
1013        let Some(datafile_path) = self.datafile_paths.first().cloned() else {
1014            return;
1015        };
1016
1017        for detection in &mut self.license_detections {
1018            enrich_license_detection_provenance(detection, &datafile_path);
1019        }
1020        for detection in &mut self.other_license_detections {
1021            enrich_license_detection_provenance(detection, &datafile_path);
1022        }
1023    }
1024
1025    fn refresh_identity(&mut self) {
1026        let Some(next_purl) = self.build_current_purl() else {
1027            return;
1028        };
1029
1030        if self.purl.as_deref() != Some(next_purl.as_str()) || self.package_uid.is_empty() {
1031            self.package_uid = PackageUid::new(&next_purl);
1032        }
1033
1034        self.purl = Some(next_purl);
1035    }
1036
1037    fn build_current_purl(&self) -> Option<String> {
1038        if let (Some(package_type), Some(name)) = (
1039            self.package_type.as_ref(),
1040            self.name
1041                .as_deref()
1042                .filter(|value| !value.trim().is_empty()),
1043        ) {
1044            let purl_type = match package_type {
1045                PackageType::Deno => "generic",
1046                _ => package_type.as_str(),
1047            };
1048
1049            let mut purl = PackageUrl::new(purl_type, name).ok()?;
1050
1051            if let Some(namespace) = self
1052                .namespace
1053                .as_deref()
1054                .filter(|value| !value.trim().is_empty())
1055            {
1056                purl.with_namespace(namespace).ok()?;
1057            }
1058
1059            if let Some(version) = self
1060                .version
1061                .as_deref()
1062                .filter(|value| !value.trim().is_empty())
1063            {
1064                purl.with_version(version).ok()?;
1065            }
1066
1067            if let Some(qualifiers) = &self.qualifiers {
1068                for (key, value) in qualifiers {
1069                    purl.add_qualifier(key.as_str(), value.as_str()).ok()?;
1070                }
1071            }
1072
1073            if let Some(subpath) = self
1074                .subpath
1075                .as_deref()
1076                .filter(|value| !value.trim().is_empty())
1077            {
1078                purl.with_subpath(subpath).ok()?;
1079            }
1080
1081            return Some(purl.to_string());
1082        }
1083
1084        let existing_purl = self.purl.as_deref()?;
1085        let mut purl = PackageUrl::from_str(existing_purl).ok()?;
1086
1087        if let Some(version) = self
1088            .version
1089            .as_deref()
1090            .filter(|value| !value.trim().is_empty())
1091        {
1092            purl.with_version(version).ok()?;
1093        } else {
1094            purl.without_version();
1095        }
1096
1097        Some(purl.to_string())
1098    }
1099}
1100
#[cfg(test)]
mod tests {
    use super::*;

    /// Datafile path used by both provenance tests.
    const DATAFILE_PATH: &str = "project/package.json";

    /// Npm `PackageData` with one parser-declared MIT detection whose match
    /// has no `from_file` and whose detection has no `identifier` yet.
    fn npm_package_data_with_declared_mit() -> PackageData {
        let declared_match = Match {
            license_expression: String::from("mit"),
            license_expression_spdx: String::from("MIT"),
            from_file: None,
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
            matcher: Some(String::from("parser-declared-license")),
            score: MatchScore::MAX,
            matched_length: Some(1),
            match_coverage: Some(100.0),
            rule_relevance: Some(100),
            rule_identifier: None,
            rule_url: None,
            matched_text: Some(String::from("MIT")),
            referenced_filenames: None,
            matched_text_diagnostics: None,
        };

        let detection = LicenseDetection {
            license_expression: String::from("mit"),
            license_expression_spdx: String::from("MIT"),
            matches: vec![declared_match],
            detection_log: Vec::new(),
            identifier: None,
        };

        PackageData {
            package_type: Some(PackageType::Npm),
            license_detections: vec![detection],
            ..PackageData::default()
        }
    }

    #[test]
    fn file_info_new_backfills_package_detection_provenance() {
        let file_info = FileInfo::new(
            String::from("package.json"),
            String::from("package"),
            String::from(".json"),
            DATAFILE_PATH.to_string(),
            FileType::File,
            None,
            None,
            1,
            None,
            None,
            None,
            None,
            None,
            vec![npm_package_data_with_declared_mit()],
            None,
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
        );

        // File-level detections are populated and carry provenance.
        assert_eq!(file_info.license_detections.len(), 1);
        let file_detection = &file_info.license_detections[0];
        assert_eq!(
            file_detection.matches[0].from_file.as_deref(),
            Some(DATAFILE_PATH)
        );
        assert!(file_detection.identifier.is_some());

        // The detections kept on the package data are enriched as well.
        let package_detection = &file_info.package_data[0].license_detections[0];
        assert_eq!(
            package_detection.matches[0].from_file.as_deref(),
            Some(DATAFILE_PATH)
        );
        assert!(package_detection.identifier.is_some());
    }

    #[test]
    fn package_from_package_data_backfills_detection_provenance() {
        let package_data = npm_package_data_with_declared_mit();

        let package = Package::from_package_data(&package_data, DATAFILE_PATH.to_string());

        let detection = &package.license_detections[0];
        assert_eq!(
            detection.matches[0].from_file.as_deref(),
            Some(DATAFILE_PATH)
        );
        assert!(detection.identifier.is_some());
    }
}
1224
/// Top-level dependency instance, created during package assembly.
///
/// Extends the file-level `Dependency` with traceability fields that link
/// each dependency to its owning package and source datafile.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct TopLevelDependency {
    /// PURL of the dependency, copied from the file-level `Dependency`;
    /// `None` when the source dependency had none.
    pub purl: Option<String>,
    /// Copied from `Dependency::extracted_requirement` (presumably the
    /// requirement string as written in the datafile — confirm against `Dependency`).
    pub extracted_requirement: Option<String>,
    /// Copied from `Dependency::scope`.
    pub scope: Option<String>,
    /// Copied from `Dependency::is_runtime`.
    pub is_runtime: Option<bool>,
    /// Copied from `Dependency::is_optional`.
    pub is_optional: Option<bool>,
    /// Copied from `Dependency::is_pinned`.
    pub is_pinned: Option<bool>,
    /// Copied from `Dependency::is_direct`.
    pub is_direct: Option<bool>,
    /// Cloned from `Dependency::resolved_package`.
    pub resolved_package: Option<Box<ResolvedPackage>>,
    /// Free-form extra values cloned from `Dependency::extra_data`.
    #[serde(default)]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
    /// Unique identifier for this dependency instance (PURL with UUID qualifier).
    pub dependency_uid: DependencyUid,
    /// The `package_uid` of the package this dependency belongs to.
    pub for_package_uid: Option<PackageUid>,
    /// Path to the datafile where this dependency was declared.
    pub datafile_path: String,
    /// Datasource identifier for the parser that extracted this dependency.
    pub datasource_id: DatasourceId,
    /// Namespace for the dependency (e.g., distribution name for RPM packages).
    /// `from_dependency` always initialises this to `None`.
    pub namespace: Option<String>,
}
1252
1253impl TopLevelDependency {
1254    /// Create a `TopLevelDependency` from a file-level `Dependency`.
1255    pub fn from_dependency(
1256        dep: &Dependency,
1257        datafile_path: String,
1258        datasource_id: DatasourceId,
1259        for_package_uid: Option<PackageUid>,
1260    ) -> Self {
1261        let dependency_uid = dep
1262            .purl
1263            .as_ref()
1264            .map(|p| DependencyUid::new(p))
1265            .unwrap_or_else(DependencyUid::empty);
1266
1267        TopLevelDependency {
1268            purl: dep.purl.clone(),
1269            extracted_requirement: dep.extracted_requirement.clone(),
1270            scope: dep.scope.clone(),
1271            is_runtime: dep.is_runtime,
1272            is_optional: dep.is_optional,
1273            is_pinned: dep.is_pinned,
1274            is_direct: dep.is_direct,
1275            resolved_package: dep.resolved_package.clone(),
1276            extra_data: dep.extra_data.clone(),
1277            dependency_uid,
1278            for_package_uid,
1279            datafile_path,
1280            datasource_id,
1281            namespace: None,
1282        }
1283    }
1284}
1285
/// Serializable record pairing an email string with a line range.
///
/// NOTE(review): presumably one email address found in a scanned file and
/// the lines it spans — confirm against the code that builds these records.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct OutputEmail {
    /// The email string.
    pub email: String,
    /// First line of the range.
    pub start_line: LineNumber,
    /// Last line of the range.
    pub end_line: LineNumber,
}
1292
/// Serializable record pairing a URL string with a line range.
///
/// NOTE(review): presumably one URL found in a scanned file and the lines it
/// spans — confirm against the code that builds these records.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct OutputURL {
    /// The URL string.
    pub url: String,
    /// First line of the range.
    pub start_line: LineNumber,
    /// Last line of the range.
    pub end_line: LineNumber,
}
1299
/// One license-policy record; all four fields are plain strings and entries
/// compare by value (`PartialEq`/`Eq`).
///
/// NOTE(review): the field names suggest a policy label plus display hints
/// attached to a license key — confirm against the policy loader.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub struct LicensePolicyEntry {
    /// Key of the license this entry applies to.
    pub license_key: String,
    /// Label of the policy entry.
    pub label: String,
    /// Color code associated with the entry (format not visible here).
    pub color_code: String,
    /// Icon associated with the entry (format not visible here).
    pub icon: String,
}
1307
/// Kind of filesystem entry a scan result describes.
///
/// Serialized to and from the lowercase strings `"file"` and `"directory"`
/// by the hand-written serde impls in this file.
///
/// Equality on this fieldless enum is total, so `Eq` is derived alongside
/// `PartialEq`; `Hash` is derived so values can be used as set or map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FileType {
    /// A regular file.
    File,
    /// A directory.
    Directory,
}
1313
1314impl serde::Serialize for FileType {
1315    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
1316    where
1317        S: serde::Serializer,
1318    {
1319        match self {
1320            FileType::File => serializer.serialize_str("file"),
1321            FileType::Directory => serializer.serialize_str("directory"),
1322        }
1323    }
1324}
1325
1326impl<'de> Deserialize<'de> for FileType {
1327    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
1328    where
1329        D: serde::Deserializer<'de>,
1330    {
1331        let value = String::deserialize(deserializer)?;
1332        match value.as_str() {
1333            "file" => Ok(FileType::File),
1334            "directory" => Ok(FileType::Directory),
1335            _ => Err(serde::de::Error::custom("invalid file type")),
1336        }
1337    }
1338}