Skip to main content

provenant/models/
file_info.rs

1use derive_builder::Builder;
2use packageurl::PackageUrl;
3use serde::{Deserialize, Serialize};
4use sha1::{Digest, Sha1};
5use std::collections::HashMap;
6use std::str::FromStr;
7
8use super::DatasourceId;
9use super::DependencyUid;
10use super::GitSha1;
11use super::LineNumber;
12use super::MatchScore;
13use super::Md5Digest;
14use super::PackageType;
15use super::PackageUid;
16use super::Sha1Digest;
17use super::Sha256Digest;
18use super::Sha512Digest;
19use crate::license_detection::tokenize::tokenize_without_stopwords;
20use crate::models::output::Tallies;
21use crate::utils::spdx::combine_license_expressions;
22
#[derive(Debug, Builder, Serialize, Deserialize, Clone)]
#[builder(build_fn(skip))]
/// File-level scan result containing metadata and detected findings.
///
/// The derive-generated `build` function is skipped (`build_fn(skip)`); the
/// hand-written `FileInfoBuilder::build` is used instead and routes construction
/// through [`FileInfo::new`].
///
/// Note the serde names of the two "type" fields: `file_type` is serialized as
/// `"type"`, while `file_type_label` is serialized as `"file_type"`.
pub struct FileInfo {
    pub name: String,
    pub base_name: String,
    pub extension: String,
    pub path: String,
    // Serialized under the key "type".
    #[serde(rename = "type")] // name used by ScanCode
    pub file_type: FileType,
    #[builder(default)]
    #[serde(default)]
    pub mime_type: Option<String>,
    // Serialized under the key "file_type" (not to be confused with the
    // `file_type` Rust field above, which is serialized as "type").
    #[builder(default)]
    #[serde(rename = "file_type", default)]
    pub file_type_label: Option<String>,
    pub size: u64,
    #[builder(default)]
    #[serde(default)]
    pub date: Option<String>,
    #[builder(default)]
    #[serde(default)]
    pub sha1: Option<Sha1Digest>,
    #[builder(default)]
    #[serde(default)]
    pub md5: Option<Md5Digest>,
    #[builder(default)]
    #[serde(default)]
    pub sha256: Option<Sha256Digest>,
    // Not a parameter of `FileInfo::new`; `new` initializes it to `None`.
    #[builder(default)]
    #[serde(default)]
    pub sha1_git: Option<GitSha1>,
    #[builder(default)]
    #[serde(default)]
    pub programming_language: Option<String>,
    #[builder(default)]
    #[serde(default)]
    pub package_data: Vec<PackageData>,
    // Serialized under the key "detected_license_expression_spdx". When `None`
    // is passed to `FileInfo::new`, it is derived from the package data or the
    // license detections.
    #[serde(rename = "detected_license_expression_spdx")] // name used by ScanCode
    #[builder(default)]
    pub license_expression: Option<String>,
    // When empty on entry to `FileInfo::new`, populated from the detections of
    // every entry in `package_data`.
    #[builder(default)]
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    #[builder(default)]
    #[serde(default)]
    pub license_clues: Vec<Match>,
    #[builder(default)]
    #[serde(default)]
    pub percentage_of_license_text: Option<f64>,
    #[builder(default)]
    #[serde(default)]
    pub copyrights: Vec<Copyright>,
    #[builder(default)]
    #[serde(default)]
    pub holders: Vec<Holder>,
    #[builder(default)]
    #[serde(default)]
    pub authors: Vec<Author>,
    #[builder(default)]
    #[serde(default)]
    pub emails: Vec<OutputEmail>,
    #[builder(default)]
    #[serde(default)]
    pub urls: Vec<OutputURL>,
    #[builder(default)]
    #[serde(default)]
    pub for_packages: Vec<PackageUid>,
    #[builder(default)]
    #[serde(default)]
    pub scan_errors: Vec<String>,
    // The fields below are not parameters of `FileInfo::new`; `new` sets them
    // to `None` / `false` / empty and callers assign them afterwards.
    #[builder(default)]
    #[serde(default)]
    pub license_policy: Option<Vec<LicensePolicyEntry>>,
    #[builder(default)]
    #[serde(default)]
    pub is_generated: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_binary: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_text: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_archive: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_media: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_source: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_script: Option<bool>,
    // NOTE(review): the `*_count` fields look like directory-level aggregates —
    // confirm against the code that populates them.
    #[builder(default)]
    #[serde(default)]
    pub files_count: Option<usize>,
    #[builder(default)]
    #[serde(default)]
    pub dirs_count: Option<usize>,
    #[builder(default)]
    #[serde(default)]
    pub size_count: Option<u64>,
    #[builder(default)]
    #[serde(default)]
    pub source_count: Option<usize>,
    #[builder(default)]
    #[serde(default)]
    pub is_legal: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_manifest: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_readme: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_top_level: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_key_file: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_community: bool,
    #[builder(default)]
    #[serde(default)]
    pub facets: Vec<String>,
    #[builder(default)]
    #[serde(default)]
    pub tallies: Option<Tallies>,
}
155
156impl FileInfoBuilder {
157    /// Build a [`FileInfo`] from the current builder state.
158    pub fn build(&self) -> Result<FileInfo, String> {
159        let mut file_info = FileInfo::new(
160            self.name.clone().ok_or("Missing field: name")?,
161            self.base_name.clone().ok_or("Missing field: base_name")?,
162            self.extension.clone().ok_or("Missing field: extension")?,
163            self.path.clone().ok_or("Missing field: path")?,
164            self.file_type.clone().ok_or("Missing field: file_type")?,
165            self.mime_type.clone().flatten(),
166            self.file_type_label.clone().flatten(),
167            self.size.ok_or("Missing field: size")?,
168            self.date.clone().flatten(),
169            self.sha1.flatten(),
170            self.md5.flatten(),
171            self.sha256.flatten(),
172            self.programming_language.clone().flatten(),
173            self.package_data.clone().unwrap_or_default(),
174            self.license_expression.clone().flatten(),
175            self.license_detections.clone().unwrap_or_default(),
176            self.license_clues.clone().unwrap_or_default(),
177            self.copyrights.clone().unwrap_or_default(),
178            self.holders.clone().unwrap_or_default(),
179            self.authors.clone().unwrap_or_default(),
180            self.emails.clone().unwrap_or_default(),
181            self.urls.clone().unwrap_or_default(),
182            self.for_packages.clone().unwrap_or_default(),
183            self.scan_errors.clone().unwrap_or_default(),
184        );
185        file_info.license_policy = self.license_policy.clone().flatten();
186        file_info.sha1_git = self.sha1_git.flatten();
187        file_info.is_binary = self.is_binary.flatten();
188        file_info.is_text = self.is_text.flatten();
189        file_info.is_archive = self.is_archive.flatten();
190        file_info.is_media = self.is_media.flatten();
191        file_info.is_script = self.is_script.flatten();
192        file_info.files_count = self.files_count.flatten();
193        file_info.dirs_count = self.dirs_count.flatten();
194        file_info.size_count = self.size_count.flatten();
195        Ok(file_info)
196    }
197}
198
199impl FileInfo {
200    #[allow(clippy::too_many_arguments)]
201    /// Construct a [`FileInfo`] from fully resolved scanner fields.
202    pub fn new(
203        name: String,
204        base_name: String,
205        extension: String,
206        path: String,
207        file_type: FileType,
208        mime_type: Option<String>,
209        file_type_label: Option<String>,
210        size: u64,
211        date: Option<String>,
212        sha1: Option<Sha1Digest>,
213        md5: Option<Md5Digest>,
214        sha256: Option<Sha256Digest>,
215        programming_language: Option<String>,
216        package_data: Vec<PackageData>,
217        mut license_expression: Option<String>,
218        mut license_detections: Vec<LicenseDetection>,
219        license_clues: Vec<Match>,
220        copyrights: Vec<Copyright>,
221        holders: Vec<Holder>,
222        authors: Vec<Author>,
223        emails: Vec<OutputEmail>,
224        urls: Vec<OutputURL>,
225        for_packages: Vec<PackageUid>,
226        scan_errors: Vec<String>,
227    ) -> Self {
228        let mut package_data = package_data;
229        for package in &mut package_data {
230            enrich_package_data_license_provenance(package, &path);
231        }
232
233        // Combine license expressions from package data if license_expression is None
234        license_expression = license_expression.or_else(|| {
235            let expressions = package_data
236                .iter()
237                .filter_map(|pkg| pkg.get_license_expression());
238            combine_license_expressions(expressions)
239        });
240
241        // Combine license detections from package data if none are provided
242        if license_detections.is_empty() {
243            for pkg in &package_data {
244                license_detections.extend(pkg.license_detections.clone());
245            }
246        }
247
248        // Combine license expressions from license detections if license_expression is still None
249        if license_expression.is_none() && !license_detections.is_empty() {
250            let expressions = license_detections
251                .iter()
252                .map(|detection| detection.license_expression.clone());
253            license_expression = combine_license_expressions(expressions);
254        }
255
256        let mut file_info = FileInfo {
257            name,
258            base_name,
259            extension,
260            path,
261            file_type,
262            mime_type,
263            file_type_label,
264            size,
265            date,
266            sha1,
267            md5,
268            sha256,
269            sha1_git: None,
270            programming_language,
271            package_data,
272            license_expression,
273            license_detections,
274            license_clues,
275            percentage_of_license_text: None,
276            copyrights,
277            holders,
278            authors,
279            emails,
280            urls,
281            for_packages,
282            scan_errors,
283            license_policy: None,
284            is_generated: None,
285            is_binary: None,
286            is_text: None,
287            is_archive: None,
288            is_media: None,
289            is_source: None,
290            is_script: None,
291            files_count: None,
292            dirs_count: None,
293            size_count: None,
294            source_count: None,
295            is_legal: false,
296            is_manifest: false,
297            is_readme: false,
298            is_top_level: false,
299            is_key_file: false,
300            is_community: false,
301            facets: vec![],
302            tallies: None,
303        };
304        file_info.backfill_license_provenance();
305        file_info
306    }
307
308    pub fn backfill_license_provenance(&mut self) {
309        for detection in &mut self.license_detections {
310            enrich_license_detection_provenance(detection, &self.path);
311        }
312
313        for package in &mut self.package_data {
314            enrich_package_data_license_provenance(package, &self.path);
315        }
316    }
317}
318
319fn enrich_package_data_license_provenance(package_data: &mut PackageData, path: &str) {
320    for detection in &mut package_data.license_detections {
321        enrich_license_detection_provenance(detection, path);
322    }
323    for detection in &mut package_data.other_license_detections {
324        enrich_license_detection_provenance(detection, path);
325    }
326}
327
328pub(crate) fn enrich_license_detection_provenance(detection: &mut LicenseDetection, path: &str) {
329    for detection_match in &mut detection.matches {
330        if detection_match.from_file.is_none() {
331            detection_match.from_file = Some(path.to_string());
332        }
333
334        if detection_match.rule_identifier.is_none() {
335            detection_match.rule_identifier = detection_match.matcher.clone();
336        }
337    }
338
339    if detection.identifier.is_none() {
340        detection.identifier = Some(compute_public_detection_identifier(detection));
341    }
342}
343
344fn compute_public_detection_identifier(detection: &LicenseDetection) -> String {
345    let expression = python_safe_name(&detection.license_expression);
346    let mut hasher = Sha1::new();
347    hasher.update(format_public_detection_content(detection).as_bytes());
348    let hex_str = hex::encode(hasher.finalize());
349    let uuid_hex = &hex_str[..32];
350    let content_uuid = uuid::Uuid::parse_str(uuid_hex)
351        .map(|uuid| uuid.to_string())
352        .unwrap_or_else(|_| uuid_hex.to_string());
353
354    format!("{}-{}", expression, content_uuid)
355}
356
357fn format_public_detection_content(detection: &LicenseDetection) -> String {
358    let mut result = String::from("(");
359
360    for (index, detection_match) in detection.matches.iter().enumerate() {
361        if index > 0 {
362            result.push_str(", ");
363        }
364        result.push_str(&format!(
365            "({}, {}, {})",
366            python_str_repr(
367                detection_match
368                    .rule_identifier
369                    .as_deref()
370                    .or(detection_match.matcher.as_deref())
371                    .unwrap_or("parser-declared-license")
372            ),
373            detection_match.score.value() as f32,
374            python_token_tuple_repr(&tokenize_without_stopwords(
375                detection_match.matched_text.as_deref().unwrap_or_default(),
376            )),
377        ));
378    }
379
380    if detection.matches.len() == 1 {
381        result.push(',');
382    }
383    result.push(')');
384    result
385}
386
/// Reduce `value` to alphanumeric runs joined by single underscores.
///
/// Every run of non-alphanumeric characters (underscores included) collapses to
/// one `_`, and no underscore is kept at the start or end. An input with no
/// alphanumeric characters yields an empty string.
fn python_safe_name(value: &str) -> String {
    value
        .split(|character: char| !character.is_alphanumeric())
        .filter(|segment| !segment.is_empty())
        .collect::<Vec<_>>()
        .join("_")
}
408
/// Quote `value` the way Python's `repr` chooses quotes for a string.
///
/// Double quotes are used only when the text contains a `'` and no `"`;
/// otherwise single quotes are used. Backslashes and any occurrence of the
/// chosen quote character are backslash-escaped. No other characters are
/// escaped.
fn python_str_repr(value: &str) -> String {
    let prefers_double = value.contains('\'') && !value.contains('"');
    let quote = if prefers_double { '"' } else { '\'' };

    let mut rendered = String::with_capacity(value.len() + 2);
    rendered.push(quote);
    for character in value.chars() {
        if character == '\\' || character == quote {
            rendered.push('\\');
        }
        rendered.push(character);
    }
    rendered.push(quote);
    rendered
}
416
417fn python_token_tuple_repr(tokens: &[String]) -> String {
418    if tokens.is_empty() {
419        return String::from("()");
420    }
421
422    let mut result = String::from("(");
423    for (index, token) in tokens.iter().enumerate() {
424        if index > 0 {
425            result.push_str(", ");
426        }
427        result.push_str(&python_str_repr(token));
428    }
429
430    if tokens.len() == 1 {
431        result.push(',');
432    }
433    result.push(')');
434    result
435}
436
/// Package metadata extracted from manifest files.
///
/// Compatible with ScanCode Toolkit output format. Contains standardized package
/// information including name, version, dependencies, licenses, and other metadata.
/// This is the primary data structure returned by all parsers.
///
/// Derives `Default`, so an all-empty value (`None` / empty / `false` for every
/// field) can be created with `PackageData::default()`.
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
pub struct PackageData {
    // Serialized under the key "type".
    #[serde(rename = "type")] // name used by ScanCode
    pub package_type: Option<PackageType>,
    pub namespace: Option<String>,
    pub name: Option<String>,
    pub version: Option<String>,
    #[serde(default)]
    pub qualifiers: Option<HashMap<String, String>>,
    pub subpath: Option<String>,
    pub primary_language: Option<String>,
    pub description: Option<String>,
    pub release_date: Option<String>,
    #[serde(default)]
    pub parties: Vec<Party>,
    #[serde(default)]
    pub keywords: Vec<String>,
    pub homepage_url: Option<String>,
    pub download_url: Option<String>,
    pub size: Option<u64>,
    pub sha1: Option<Sha1Digest>,
    pub md5: Option<Md5Digest>,
    pub sha256: Option<Sha256Digest>,
    pub sha512: Option<Sha512Digest>,
    pub bug_tracking_url: Option<String>,
    pub code_view_url: Option<String>,
    pub vcs_url: Option<String>,
    pub copyright: Option<String>,
    pub holder: Option<String>,
    pub declared_license_expression: Option<String>,
    pub declared_license_expression_spdx: Option<String>,
    // These are the detections whose expressions
    // `PackageData::get_license_expression` combines.
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    pub other_license_expression: Option<String>,
    pub other_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub other_license_detections: Vec<LicenseDetection>,
    pub extracted_license_statement: Option<String>,
    pub notice_text: Option<String>,
    #[serde(default)]
    pub source_packages: Vec<String>,
    #[serde(default)]
    pub file_references: Vec<FileReference>,
    #[serde(default)]
    pub is_private: bool,
    #[serde(default)]
    pub is_virtual: bool,
    #[serde(default)]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
    #[serde(default)]
    pub dependencies: Vec<Dependency>,
    pub repository_homepage_url: Option<String>,
    pub repository_download_url: Option<String>,
    pub api_data_url: Option<String>,
    pub datasource_id: Option<DatasourceId>,
    // Package URL string; `Package::from_package_data` derives the package UID from it.
    pub purl: Option<String>,
}
499
500impl PackageData {
501    /// Extracts a single license expression from all license detections in this package.
502    /// Returns None if there are no license detections.
503    pub fn get_license_expression(&self) -> Option<String> {
504        if self.license_detections.is_empty() {
505            return None;
506        }
507
508        let expressions = self
509            .license_detections
510            .iter()
511            .map(|detection| detection.license_expression.clone());
512        combine_license_expressions(expressions)
513    }
514}
515
/// License detection result containing matched license expressions.
///
/// Aggregates multiple license matches into a single SPDX license expression.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct LicenseDetection {
    pub license_expression: String,
    pub license_expression_spdx: String,
    /// Individual matches that make up this detection.
    pub matches: Vec<Match>,
    #[serde(default)]
    pub detection_log: Vec<String>,
    /// Identifier of the form `<safe-expression>-<uuid>`; computed by
    /// `enrich_license_detection_provenance` when left as `None`.
    pub identifier: Option<String>,
}
528
/// Individual license text match with location and confidence score.
///
/// Represents a specific region of text that matched a known license pattern.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct Match {
    pub license_expression: String,
    pub license_expression_spdx: String,
    /// Path of the file the match came from; set to the containing file's path
    /// by `enrich_license_detection_provenance` when `None`.
    pub from_file: Option<String>,
    pub start_line: LineNumber,
    pub end_line: LineNumber,
    pub matcher: Option<String>,
    pub score: MatchScore,
    pub matched_length: Option<usize>,
    pub match_coverage: Option<f64>,
    pub rule_relevance: Option<u8>,
    /// Falls back to a copy of `matcher` during provenance enrichment when `None`.
    pub rule_identifier: Option<String>,
    pub rule_url: Option<String>,
    /// Matched text; its tokens feed the detection identifier hash.
    pub matched_text: Option<String>,
    pub matched_text_diagnostics: Option<String>,
    #[serde(default)]
    pub referenced_filenames: Option<Vec<String>>,
}
551
/// Copyright statement text together with its start and end line numbers.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Copyright {
    pub copyright: String,
    pub start_line: LineNumber,
    pub end_line: LineNumber,
}
558
/// Copyright holder name together with its start and end line numbers.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Holder {
    pub holder: String,
    pub start_line: LineNumber,
    pub end_line: LineNumber,
}
565
/// Author text together with its start and end line numbers.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Author {
    pub author: String,
    pub start_line: LineNumber,
    pub end_line: LineNumber,
}
572
/// Package dependency information with version constraints.
///
/// Represents a declared dependency with scope (e.g., runtime, dev, optional)
/// and optional resolved package details.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Dependency {
    pub purl: Option<String>,
    pub extracted_requirement: Option<String>,
    pub scope: Option<String>,
    pub is_runtime: Option<bool>,
    pub is_optional: Option<bool>,
    pub is_pinned: Option<bool>,
    pub is_direct: Option<bool>,
    /// Boxed resolved package details. `ResolvedPackage` in turn holds a
    /// `Vec<Dependency>`, so the two types are mutually recursive.
    pub resolved_package: Option<Box<ResolvedPackage>>,
    #[serde(default)]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
}
590
/// Package details attached to a [`Dependency`] via `resolved_package`.
///
/// Carries the same set of fields as [`PackageData`], except that
/// `package_type`, `namespace`, `name`, and `version` are required here rather
/// than optional.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ResolvedPackage {
    // Serialized under the key "type".
    #[serde(rename = "type")]
    pub package_type: PackageType,
    pub namespace: String,
    pub name: String,
    pub version: String,
    #[serde(default)]
    pub qualifiers: Option<HashMap<String, String>>,
    pub subpath: Option<String>,
    pub primary_language: Option<String>,
    pub description: Option<String>,
    pub release_date: Option<String>,
    #[serde(default)]
    pub parties: Vec<Party>,
    #[serde(default)]
    pub keywords: Vec<String>,
    pub homepage_url: Option<String>,
    pub download_url: Option<String>,
    pub size: Option<u64>,
    pub sha1: Option<Sha1Digest>,
    pub md5: Option<Md5Digest>,
    pub sha256: Option<Sha256Digest>,
    pub sha512: Option<Sha512Digest>,
    pub bug_tracking_url: Option<String>,
    pub code_view_url: Option<String>,
    pub vcs_url: Option<String>,
    pub copyright: Option<String>,
    pub holder: Option<String>,
    pub declared_license_expression: Option<String>,
    pub declared_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    pub other_license_expression: Option<String>,
    pub other_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub other_license_detections: Vec<LicenseDetection>,
    pub extracted_license_statement: Option<String>,
    pub notice_text: Option<String>,
    #[serde(default)]
    pub source_packages: Vec<String>,
    #[serde(default)]
    pub file_references: Vec<FileReference>,
    #[serde(default)]
    pub is_private: bool,
    #[serde(default)]
    pub is_virtual: bool,
    #[serde(default)]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
    // Nested dependencies of the resolved package itself.
    #[serde(default)]
    pub dependencies: Vec<Dependency>,
    pub repository_homepage_url: Option<String>,
    pub repository_download_url: Option<String>,
    pub api_data_url: Option<String>,
    pub datasource_id: Option<DatasourceId>,
    pub purl: Option<String>,
}
648
649impl ResolvedPackage {
650    pub fn new(
651        package_type: PackageType,
652        namespace: String,
653        name: String,
654        version: String,
655    ) -> Self {
656        Self {
657            package_type,
658            namespace,
659            name,
660            version,
661            qualifiers: None,
662            subpath: None,
663            primary_language: None,
664            description: None,
665            release_date: None,
666            parties: vec![],
667            keywords: vec![],
668            homepage_url: None,
669            download_url: None,
670            size: None,
671            sha1: None,
672            md5: None,
673            sha256: None,
674            sha512: None,
675            bug_tracking_url: None,
676            code_view_url: None,
677            vcs_url: None,
678            copyright: None,
679            holder: None,
680            declared_license_expression: None,
681            declared_license_expression_spdx: None,
682            license_detections: vec![],
683            other_license_expression: None,
684            other_license_expression_spdx: None,
685            other_license_detections: vec![],
686            extracted_license_statement: None,
687            notice_text: None,
688            source_packages: vec![],
689            file_references: vec![],
690            is_private: false,
691            is_virtual: false,
692            extra_data: None,
693            dependencies: vec![],
694            repository_homepage_url: None,
695            repository_download_url: None,
696            api_data_url: None,
697            datasource_id: None,
698            purl: None,
699        }
700    }
701
702    pub fn from_package_data(package_data: &PackageData, fallback_type: PackageType) -> Self {
703        Self {
704            package_type: package_data.package_type.unwrap_or(fallback_type),
705            namespace: package_data.namespace.clone().unwrap_or_default(),
706            name: package_data.name.clone().unwrap_or_default(),
707            version: package_data.version.clone().unwrap_or_default(),
708            qualifiers: package_data.qualifiers.clone(),
709            subpath: package_data.subpath.clone(),
710            primary_language: package_data.primary_language.clone(),
711            description: package_data.description.clone(),
712            release_date: package_data.release_date.clone(),
713            parties: package_data.parties.clone(),
714            keywords: package_data.keywords.clone(),
715            homepage_url: package_data.homepage_url.clone(),
716            download_url: package_data.download_url.clone(),
717            size: package_data.size,
718            sha1: package_data.sha1,
719            md5: package_data.md5,
720            sha256: package_data.sha256,
721            sha512: package_data.sha512,
722            bug_tracking_url: package_data.bug_tracking_url.clone(),
723            code_view_url: package_data.code_view_url.clone(),
724            vcs_url: package_data.vcs_url.clone(),
725            copyright: package_data.copyright.clone(),
726            holder: package_data.holder.clone(),
727            declared_license_expression: package_data.declared_license_expression.clone(),
728            declared_license_expression_spdx: package_data.declared_license_expression_spdx.clone(),
729            license_detections: package_data.license_detections.clone(),
730            other_license_expression: package_data.other_license_expression.clone(),
731            other_license_expression_spdx: package_data.other_license_expression_spdx.clone(),
732            other_license_detections: package_data.other_license_detections.clone(),
733            extracted_license_statement: package_data.extracted_license_statement.clone(),
734            notice_text: package_data.notice_text.clone(),
735            source_packages: package_data.source_packages.clone(),
736            file_references: package_data.file_references.clone(),
737            is_private: package_data.is_private,
738            is_virtual: package_data.is_virtual,
739            extra_data: package_data.extra_data.clone(),
740            dependencies: package_data.dependencies.clone(),
741            repository_homepage_url: package_data.repository_homepage_url.clone(),
742            repository_download_url: package_data.repository_download_url.clone(),
743            api_data_url: package_data.api_data_url.clone(),
744            datasource_id: package_data.datasource_id,
745            purl: package_data.purl.clone(),
746        }
747    }
748}
749
/// Author, maintainer, or contributor information.
///
/// Represents a person or organization associated with a package.
/// Every field is optional.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Party {
    // Raw identifier because `type` is a Rust keyword; serde emits it as "type".
    pub r#type: Option<String>,
    pub role: Option<String>,
    pub name: Option<String>,
    pub email: Option<String>,
    pub url: Option<String>,
    pub organization: Option<String>,
    pub organization_url: Option<String>,
    pub timezone: Option<String>,
}
764
/// Reference to a file within a package archive with checksums.
///
/// Used in SBOM generation to track files within distribution archives.
/// Only `path` is required; size and all checksums are optional.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct FileReference {
    pub path: String,
    pub size: Option<u64>,
    pub sha1: Option<Sha1Digest>,
    pub md5: Option<Md5Digest>,
    pub sha256: Option<Sha256Digest>,
    pub sha512: Option<Sha512Digest>,
    // Same type as the `HashMap` imported at the top of the file, spelled with its full path.
    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
}
778
/// Top-level assembled package, created by merging one or more `PackageData`
/// objects from related manifest/lockfiles (e.g., package.json + package-lock.json).
///
/// Compatible with ScanCode Toolkit output format. The key differences from
/// `PackageData` are:
/// - `package_uid`: unique identifier (PURL with UUID qualifier)
/// - `datafile_paths`: list of all contributing files
/// - `datasource_ids`: list of all contributing parsers
/// - Excludes `dependencies` and `file_references` (hoisted to top-level)
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Package {
    // Serialized under the key "type".
    #[serde(rename = "type")]
    pub package_type: Option<PackageType>,
    pub namespace: Option<String>,
    pub name: Option<String>,
    pub version: Option<String>,
    #[serde(default)]
    pub qualifiers: Option<HashMap<String, String>>,
    pub subpath: Option<String>,
    pub primary_language: Option<String>,
    pub description: Option<String>,
    pub release_date: Option<String>,
    #[serde(default)]
    pub parties: Vec<Party>,
    #[serde(default)]
    pub keywords: Vec<String>,
    pub homepage_url: Option<String>,
    pub download_url: Option<String>,
    pub size: Option<u64>,
    pub sha1: Option<Sha1Digest>,
    pub md5: Option<Md5Digest>,
    pub sha256: Option<Sha256Digest>,
    pub sha512: Option<Sha512Digest>,
    pub bug_tracking_url: Option<String>,
    pub code_view_url: Option<String>,
    pub vcs_url: Option<String>,
    pub copyright: Option<String>,
    pub holder: Option<String>,
    pub declared_license_expression: Option<String>,
    pub declared_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    pub other_license_expression: Option<String>,
    pub other_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub other_license_detections: Vec<LicenseDetection>,
    pub extracted_license_statement: Option<String>,
    pub notice_text: Option<String>,
    #[serde(default)]
    pub source_packages: Vec<String>,
    #[serde(default)]
    pub is_private: bool,
    #[serde(default)]
    pub is_virtual: bool,
    #[serde(default)]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
    pub repository_homepage_url: Option<String>,
    pub repository_download_url: Option<String>,
    pub api_data_url: Option<String>,
    pub purl: Option<String>,
    /// Unique identifier for this package instance (PURL with UUID qualifier).
    ///
    /// These last three fields carry no `#[serde(default)]`, so they must be
    /// present when deserializing.
    pub package_uid: PackageUid,
    /// Paths to all datafiles that contributed to this package.
    pub datafile_paths: Vec<String>,
    /// Datasource identifiers for all parsers that contributed to this package.
    pub datasource_ids: Vec<DatasourceId>,
}
846
847impl Package {
848    /// Create a `Package` from a `PackageData` and its source file path.
849    ///
850    /// Generates a unique `package_uid` by appending a UUID qualifier to the PURL.
851    /// If the `PackageData` has no PURL, the package_uid will be an empty string.
852    pub fn from_package_data(package_data: &PackageData, datafile_path: String) -> Self {
853        let mut package_data = package_data.clone();
854        enrich_package_data_license_provenance(&mut package_data, &datafile_path);
855
856        let package_uid = package_data
857            .purl
858            .as_ref()
859            .map(|p| PackageUid::new(p))
860            .unwrap_or_else(PackageUid::empty);
861
862        Package {
863            package_type: package_data.package_type,
864            namespace: package_data.namespace.clone(),
865            name: package_data.name.clone(),
866            version: package_data.version.clone(),
867            qualifiers: package_data.qualifiers.clone(),
868            subpath: package_data.subpath.clone(),
869            primary_language: package_data.primary_language.clone(),
870            description: package_data.description.clone(),
871            release_date: package_data.release_date.clone(),
872            parties: package_data.parties.clone(),
873            keywords: package_data.keywords.clone(),
874            homepage_url: package_data.homepage_url.clone(),
875            download_url: package_data.download_url.clone(),
876            size: package_data.size,
877            sha1: package_data.sha1,
878            md5: package_data.md5,
879            sha256: package_data.sha256,
880            sha512: package_data.sha512,
881            bug_tracking_url: package_data.bug_tracking_url.clone(),
882            code_view_url: package_data.code_view_url.clone(),
883            vcs_url: package_data.vcs_url.clone(),
884            copyright: package_data.copyright.clone(),
885            holder: package_data.holder.clone(),
886            declared_license_expression: package_data.declared_license_expression.clone(),
887            declared_license_expression_spdx: package_data.declared_license_expression_spdx.clone(),
888            license_detections: package_data.license_detections.clone(),
889            other_license_expression: package_data.other_license_expression.clone(),
890            other_license_expression_spdx: package_data.other_license_expression_spdx.clone(),
891            other_license_detections: package_data.other_license_detections.clone(),
892            extracted_license_statement: package_data.extracted_license_statement.clone(),
893            notice_text: package_data.notice_text.clone(),
894            source_packages: package_data.source_packages.clone(),
895            is_private: package_data.is_private,
896            is_virtual: package_data.is_virtual,
897            extra_data: package_data.extra_data.clone(),
898            repository_homepage_url: package_data.repository_homepage_url.clone(),
899            repository_download_url: package_data.repository_download_url.clone(),
900            api_data_url: package_data.api_data_url.clone(),
901            purl: package_data.purl.clone(),
902            package_uid,
903            datafile_paths: vec![datafile_path],
904            datasource_ids: if let Some(dsid) = package_data.datasource_id {
905                vec![dsid]
906            } else {
907                vec![]
908            },
909        }
910    }
911
912    /// Update this package with data from another `PackageData`.
913    ///
914    /// Merges data from a related file (e.g., lockfile) into this package.
915    /// Existing non-empty values are preserved; empty fields are filled from
916    /// the new data. Lists (parties, license_detections) are merged.
917    pub fn update(&mut self, package_data: &PackageData, datafile_path: String) {
918        let mut package_data = package_data.clone();
919        enrich_package_data_license_provenance(&mut package_data, &datafile_path);
920
921        if let Some(dsid) = package_data.datasource_id {
922            self.datasource_ids.push(dsid);
923        }
924        self.datafile_paths.push(datafile_path);
925
926        macro_rules! fill_if_empty {
927            ($field:ident) => {
928                if self.$field.is_none() {
929                    self.$field = package_data.$field;
930                }
931            };
932        }
933
934        fill_if_empty!(package_type);
935        fill_if_empty!(name);
936        fill_if_empty!(namespace);
937        fill_if_empty!(version);
938        fill_if_empty!(qualifiers);
939        fill_if_empty!(subpath);
940        fill_if_empty!(primary_language);
941        fill_if_empty!(description);
942        fill_if_empty!(release_date);
943        fill_if_empty!(homepage_url);
944        fill_if_empty!(download_url);
945        fill_if_empty!(size);
946        fill_if_empty!(sha1);
947        fill_if_empty!(md5);
948        fill_if_empty!(sha256);
949        fill_if_empty!(sha512);
950        fill_if_empty!(bug_tracking_url);
951        fill_if_empty!(code_view_url);
952        fill_if_empty!(vcs_url);
953        fill_if_empty!(copyright);
954        fill_if_empty!(holder);
955        fill_if_empty!(declared_license_expression);
956        fill_if_empty!(declared_license_expression_spdx);
957        fill_if_empty!(other_license_expression);
958        fill_if_empty!(other_license_expression_spdx);
959        fill_if_empty!(extracted_license_statement);
960        fill_if_empty!(notice_text);
961        match (&mut self.extra_data, &package_data.extra_data) {
962            (None, Some(extra_data)) => {
963                self.extra_data = Some(extra_data.clone());
964            }
965            (Some(existing), Some(incoming)) => {
966                for (key, value) in incoming {
967                    existing.entry(key.clone()).or_insert_with(|| value.clone());
968                }
969            }
970            _ => {}
971        }
972        fill_if_empty!(repository_homepage_url);
973        fill_if_empty!(repository_download_url);
974        fill_if_empty!(api_data_url);
975
976        for party in &package_data.parties {
977            if let Some(existing) = self.parties.iter_mut().find(|p| {
978                p.role == party.role
979                    && ((p.name.is_some() && p.name == party.name)
980                        || (p.email.is_some() && p.email == party.email))
981            }) {
982                if existing.name.is_none() {
983                    existing.name = party.name.clone();
984                }
985                if existing.email.is_none() {
986                    existing.email = party.email.clone();
987                }
988            } else {
989                self.parties.push(party.clone());
990            }
991        }
992
993        for keyword in &package_data.keywords {
994            if !self.keywords.contains(keyword) {
995                self.keywords.push(keyword.clone());
996            }
997        }
998
999        for detection in &package_data.license_detections {
1000            self.license_detections.push(detection.clone());
1001        }
1002
1003        for detection in &package_data.other_license_detections {
1004            self.other_license_detections.push(detection.clone());
1005        }
1006
1007        for source_pkg in &package_data.source_packages {
1008            if !self.source_packages.contains(source_pkg) {
1009                self.source_packages.push(source_pkg.clone());
1010            }
1011        }
1012
1013        self.refresh_identity();
1014    }
1015
1016    pub fn backfill_license_provenance(&mut self) {
1017        let Some(datafile_path) = self.datafile_paths.first().cloned() else {
1018            return;
1019        };
1020
1021        for detection in &mut self.license_detections {
1022            enrich_license_detection_provenance(detection, &datafile_path);
1023        }
1024        for detection in &mut self.other_license_detections {
1025            enrich_license_detection_provenance(detection, &datafile_path);
1026        }
1027    }
1028
1029    fn refresh_identity(&mut self) {
1030        let Some(next_purl) = self.build_current_purl() else {
1031            return;
1032        };
1033
1034        if self.purl.as_deref() != Some(next_purl.as_str()) || self.package_uid.is_empty() {
1035            self.package_uid = PackageUid::new(&next_purl);
1036        }
1037
1038        self.purl = Some(next_purl);
1039    }
1040
1041    fn build_current_purl(&self) -> Option<String> {
1042        if let (Some(package_type), Some(name)) = (
1043            self.package_type.as_ref(),
1044            self.name
1045                .as_deref()
1046                .filter(|value| !value.trim().is_empty()),
1047        ) {
1048            let purl_type = match package_type {
1049                PackageType::Deno => "generic",
1050                _ => package_type.as_str(),
1051            };
1052
1053            let mut purl = PackageUrl::new(purl_type, name).ok()?;
1054
1055            if let Some(namespace) = self
1056                .namespace
1057                .as_deref()
1058                .filter(|value| !value.trim().is_empty())
1059            {
1060                purl.with_namespace(namespace).ok()?;
1061            }
1062
1063            if let Some(version) = self
1064                .version
1065                .as_deref()
1066                .filter(|value| !value.trim().is_empty())
1067            {
1068                purl.with_version(version).ok()?;
1069            }
1070
1071            if let Some(qualifiers) = &self.qualifiers {
1072                for (key, value) in qualifiers {
1073                    purl.add_qualifier(key.as_str(), value.as_str()).ok()?;
1074                }
1075            }
1076
1077            if let Some(subpath) = self
1078                .subpath
1079                .as_deref()
1080                .filter(|value| !value.trim().is_empty())
1081            {
1082                purl.with_subpath(subpath).ok()?;
1083            }
1084
1085            return Some(purl.to_string());
1086        }
1087
1088        let existing_purl = self.purl.as_deref()?;
1089        let mut purl = PackageUrl::from_str(existing_purl).ok()?;
1090
1091        if let Some(version) = self
1092            .version
1093            .as_deref()
1094            .filter(|value| !value.trim().is_empty())
1095        {
1096            purl.with_version(version).ok()?;
1097        } else {
1098            purl.without_version();
1099        }
1100
1101        Some(purl.to_string())
1102    }
1103}
1104
#[cfg(test)]
mod tests {
    use super::*;

    /// Datafile path every test expects to see recorded as provenance.
    const DATAFILE_PATH: &str = "project/package.json";

    /// Build npm `PackageData` carrying a single parser-declared MIT detection
    /// whose `from_file`, `rule_identifier` and `identifier` are all unset.
    fn npm_package_data_with_bare_mit_detection() -> PackageData {
        let bare_match = Match {
            license_expression: "mit".to_string(),
            license_expression_spdx: "MIT".to_string(),
            from_file: None,
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
            matcher: Some("parser-declared-license".to_string()),
            score: MatchScore::MAX,
            matched_length: Some(1),
            match_coverage: Some(100.0),
            rule_relevance: Some(100),
            rule_identifier: None,
            rule_url: None,
            matched_text: Some("MIT".to_string()),
            referenced_filenames: None,
            matched_text_diagnostics: None,
        };

        let bare_detection = LicenseDetection {
            license_expression: "mit".to_string(),
            license_expression_spdx: "MIT".to_string(),
            matches: vec![bare_match],
            detection_log: vec![],
            identifier: None,
        };

        PackageData {
            package_type: Some(PackageType::Npm),
            license_detections: vec![bare_detection],
            ..PackageData::default()
        }
    }

    #[test]
    fn file_info_new_backfills_package_detection_provenance() {
        let package_data = npm_package_data_with_bare_mit_detection();

        let file_info = FileInfo::new(
            "package.json".to_string(),
            "package".to_string(),
            ".json".to_string(),
            DATAFILE_PATH.to_string(),
            FileType::File,
            None,
            None,
            1,
            None,
            None,
            None,
            None,
            None,
            vec![package_data],
            None,
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
        );

        // File-level detections are backfilled from the package data.
        assert_eq!(file_info.license_detections.len(), 1);
        let file_detection = &file_info.license_detections[0];
        assert_eq!(
            file_detection.matches[0].from_file.as_deref(),
            Some(DATAFILE_PATH)
        );
        assert!(file_detection.identifier.is_some());

        // The package data stored on the file carries the same provenance.
        let package_detection = &file_info.package_data[0].license_detections[0];
        let package_match = &package_detection.matches[0];
        assert_eq!(package_match.from_file.as_deref(), Some(DATAFILE_PATH));
        assert_eq!(
            package_match.rule_identifier.as_deref(),
            Some("parser-declared-license")
        );
        assert!(package_detection.identifier.is_some());
    }

    #[test]
    fn package_from_package_data_backfills_detection_provenance() {
        let package_data = npm_package_data_with_bare_mit_detection();

        let package = Package::from_package_data(&package_data, DATAFILE_PATH.to_string());

        let detection = &package.license_detections[0];
        let first_match = &detection.matches[0];
        assert_eq!(first_match.from_file.as_deref(), Some(DATAFILE_PATH));
        assert_eq!(
            first_match.rule_identifier.as_deref(),
            Some("parser-declared-license")
        );
        assert!(detection.identifier.is_some());
    }
}
1240
1241/// Top-level dependency instance, created during package assembly.
1242///
1243/// Extends the file-level `Dependency` with traceability fields that link
1244/// each dependency to its owning package and source datafile.
1245#[derive(Serialize, Deserialize, Debug, Clone)]
1246pub struct TopLevelDependency {
1247    pub purl: Option<String>,
1248    pub extracted_requirement: Option<String>,
1249    pub scope: Option<String>,
1250    pub is_runtime: Option<bool>,
1251    pub is_optional: Option<bool>,
1252    pub is_pinned: Option<bool>,
1253    pub is_direct: Option<bool>,
1254    pub resolved_package: Option<Box<ResolvedPackage>>,
1255    #[serde(default)]
1256    pub extra_data: Option<HashMap<String, serde_json::Value>>,
1257    /// Unique identifier for this dependency instance (PURL with UUID qualifier).
1258    pub dependency_uid: DependencyUid,
1259    /// The `package_uid` of the package this dependency belongs to.
1260    pub for_package_uid: Option<PackageUid>,
1261    /// Path to the datafile where this dependency was declared.
1262    pub datafile_path: String,
1263    /// Datasource identifier for the parser that extracted this dependency.
1264    pub datasource_id: DatasourceId,
1265    /// Namespace for the dependency (e.g., distribution name for RPM packages).
1266    pub namespace: Option<String>,
1267}
1268
1269impl TopLevelDependency {
1270    /// Create a `TopLevelDependency` from a file-level `Dependency`.
1271    pub fn from_dependency(
1272        dep: &Dependency,
1273        datafile_path: String,
1274        datasource_id: DatasourceId,
1275        for_package_uid: Option<PackageUid>,
1276    ) -> Self {
1277        let dependency_uid = dep
1278            .purl
1279            .as_ref()
1280            .map(|p| DependencyUid::new(p))
1281            .unwrap_or_else(DependencyUid::empty);
1282
1283        TopLevelDependency {
1284            purl: dep.purl.clone(),
1285            extracted_requirement: dep.extracted_requirement.clone(),
1286            scope: dep.scope.clone(),
1287            is_runtime: dep.is_runtime,
1288            is_optional: dep.is_optional,
1289            is_pinned: dep.is_pinned,
1290            is_direct: dep.is_direct,
1291            resolved_package: dep.resolved_package.clone(),
1292            extra_data: dep.extra_data.clone(),
1293            dependency_uid,
1294            for_package_uid,
1295            datafile_path,
1296            datasource_id,
1297            namespace: None,
1298        }
1299    }
1300}
1301
1302#[derive(Serialize, Deserialize, Debug, Clone)]
1303pub struct OutputEmail {
1304    pub email: String,
1305    pub start_line: LineNumber,
1306    pub end_line: LineNumber,
1307}
1308
1309#[derive(Serialize, Deserialize, Debug, Clone)]
1310pub struct OutputURL {
1311    pub url: String,
1312    pub start_line: LineNumber,
1313    pub end_line: LineNumber,
1314}
1315
1316#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
1317pub struct LicensePolicyEntry {
1318    pub license_key: String,
1319    pub label: String,
1320    pub color_code: String,
1321    pub icon: String,
1322}
1323
/// Kind of filesystem entry a scan result describes.
///
/// Serialized as the lowercase strings `"file"` and `"directory"` by the
/// hand-written serde implementations for this type.
///
/// Equality on this fieldless enum is total, so `Eq` is derived alongside
/// `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FileType {
    /// A regular file.
    File,
    /// A directory.
    Directory,
}
1329
1330impl serde::Serialize for FileType {
1331    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
1332    where
1333        S: serde::Serializer,
1334    {
1335        match self {
1336            FileType::File => serializer.serialize_str("file"),
1337            FileType::Directory => serializer.serialize_str("directory"),
1338        }
1339    }
1340}
1341
1342impl<'de> Deserialize<'de> for FileType {
1343    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
1344    where
1345        D: serde::Deserializer<'de>,
1346    {
1347        let value = String::deserialize(deserializer)?;
1348        match value.as_str() {
1349            "file" => Ok(FileType::File),
1350            "directory" => Ok(FileType::Directory),
1351            _ => Err(serde::de::Error::custom("invalid file type")),
1352        }
1353    }
1354}