Skip to main content

provenant/models/
file_info.rs

1use derive_builder::Builder;
2use packageurl::PackageUrl;
3use serde::{Deserialize, Serialize};
4use sha1::{Digest, Sha1};
5use std::collections::HashMap;
6use std::str::FromStr;
7
8use super::DatasourceId;
9use super::DependencyUid;
10use super::GitSha1;
11use super::LineNumber;
12use super::MatchScore;
13use super::Md5Digest;
14use super::PackageType;
15use super::PackageUid;
16use super::Sha1Digest;
17use super::Sha256Digest;
18use super::Sha512Digest;
19use crate::license_detection::tokenize::tokenize_without_stopwords;
20use crate::models::output::Tallies;
21use crate::utils::spdx::combine_license_expressions;
22
#[derive(Debug, Builder, Serialize, Deserialize, Clone)]
#[builder(build_fn(skip))]
/// File-level scan result containing metadata and detected findings.
///
/// The derived builder's `build` method is skipped (`build_fn(skip)`); a
/// hand-written `FileInfoBuilder::build` is provided in this file instead.
///
/// Fields marked `#[builder(default)]` may be left unset on the builder, and
/// fields marked `#[serde(default)]` fall back to their `Default` value when
/// the key is absent from the deserialized input.
pub struct FileInfo {
    pub name: String,
    pub base_name: String,
    pub extension: String,
    pub path: String,
    // Serialized under the key `type`, not `file_type`.
    #[serde(rename = "type")] // name used by ScanCode
    pub file_type: FileType,
    #[builder(default)]
    #[serde(default)]
    pub mime_type: Option<String>,
    // Serialized under the key `file_type`; do not confuse it with the Rust
    // field `file_type` above, which is serialized as `type`.
    #[builder(default)]
    #[serde(rename = "file_type", default)]
    pub file_type_label: Option<String>,
    pub size: u64,
    #[builder(default)]
    #[serde(default)]
    pub date: Option<String>,
    #[builder(default)]
    #[serde(default)]
    pub sha1: Option<Sha1Digest>,
    #[builder(default)]
    #[serde(default)]
    pub md5: Option<Md5Digest>,
    #[builder(default)]
    #[serde(default)]
    pub sha256: Option<Sha256Digest>,
    // Not a `FileInfo::new` parameter; `new` initializes it to `None`.
    #[builder(default)]
    #[serde(default)]
    pub sha1_git: Option<GitSha1>,
    #[builder(default)]
    #[serde(default)]
    pub programming_language: Option<String>,
    #[builder(default)]
    #[serde(default)]
    pub package_data: Vec<PackageData>,
    // When passed as `None`, `FileInfo::new` tries to derive this from the
    // package data and then from the license detections.
    #[serde(rename = "detected_license_expression_spdx")] // name used by ScanCode
    #[builder(default)]
    pub license_expression: Option<String>,
    // When passed empty, `FileInfo::new` copies the detections found in
    // `package_data`.
    #[builder(default)]
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    #[builder(default)]
    #[serde(default)]
    pub license_clues: Vec<Match>,
    #[builder(default)]
    #[serde(default)]
    pub percentage_of_license_text: Option<f64>,
    #[builder(default)]
    #[serde(default)]
    pub copyrights: Vec<Copyright>,
    #[builder(default)]
    #[serde(default)]
    pub holders: Vec<Holder>,
    #[builder(default)]
    #[serde(default)]
    pub authors: Vec<Author>,
    #[builder(default)]
    #[serde(default)]
    pub emails: Vec<OutputEmail>,
    #[builder(default)]
    #[serde(default)]
    pub urls: Vec<OutputURL>,
    #[builder(default)]
    #[serde(default)]
    pub for_packages: Vec<PackageUid>,
    #[builder(default)]
    #[serde(default)]
    pub scan_errors: Vec<String>,
    // Every field from here on is not a `FileInfo::new` parameter; `new`
    // initializes each of them to `None`, `false`, or an empty vector.
    #[builder(default)]
    #[serde(default)]
    pub license_policy: Option<Vec<LicensePolicyEntry>>,
    #[builder(default)]
    #[serde(default)]
    pub is_generated: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_binary: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_text: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_archive: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_media: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_source: Option<bool>,
    #[builder(default)]
    #[serde(default)]
    pub is_script: Option<bool>,
    // NOTE(review): the `*_count` fields look like aggregate totals for
    // directory entries — confirm against the code that populates them.
    #[builder(default)]
    #[serde(default)]
    pub files_count: Option<usize>,
    #[builder(default)]
    #[serde(default)]
    pub dirs_count: Option<usize>,
    #[builder(default)]
    #[serde(default)]
    pub size_count: Option<u64>,
    #[builder(default)]
    #[serde(default)]
    pub source_count: Option<usize>,
    #[builder(default)]
    #[serde(default)]
    pub is_legal: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_manifest: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_readme: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_top_level: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_key_file: bool,
    #[builder(default)]
    #[serde(default)]
    pub is_community: bool,
    #[builder(default)]
    #[serde(default)]
    pub facets: Vec<String>,
    #[builder(default)]
    #[serde(default)]
    pub tallies: Option<Tallies>,
}
155
156impl FileInfoBuilder {
157    /// Build a [`FileInfo`] from the current builder state.
158    pub fn build(&self) -> Result<FileInfo, String> {
159        let mut file_info = FileInfo::new(
160            self.name.clone().ok_or("Missing field: name")?,
161            self.base_name.clone().ok_or("Missing field: base_name")?,
162            self.extension.clone().ok_or("Missing field: extension")?,
163            self.path.clone().ok_or("Missing field: path")?,
164            self.file_type.clone().ok_or("Missing field: file_type")?,
165            self.mime_type.clone().flatten(),
166            self.file_type_label.clone().flatten(),
167            self.size.ok_or("Missing field: size")?,
168            self.date.clone().flatten(),
169            self.sha1.flatten(),
170            self.md5.flatten(),
171            self.sha256.flatten(),
172            self.programming_language.clone().flatten(),
173            self.package_data.clone().unwrap_or_default(),
174            self.license_expression.clone().flatten(),
175            self.license_detections.clone().unwrap_or_default(),
176            self.license_clues.clone().unwrap_or_default(),
177            self.copyrights.clone().unwrap_or_default(),
178            self.holders.clone().unwrap_or_default(),
179            self.authors.clone().unwrap_or_default(),
180            self.emails.clone().unwrap_or_default(),
181            self.urls.clone().unwrap_or_default(),
182            self.for_packages.clone().unwrap_or_default(),
183            self.scan_errors.clone().unwrap_or_default(),
184        );
185        file_info.license_policy = self.license_policy.clone().flatten();
186        file_info.sha1_git = self.sha1_git.flatten();
187        file_info.is_binary = self.is_binary.flatten();
188        file_info.is_text = self.is_text.flatten();
189        file_info.is_archive = self.is_archive.flatten();
190        file_info.is_media = self.is_media.flatten();
191        file_info.is_script = self.is_script.flatten();
192        file_info.files_count = self.files_count.flatten();
193        file_info.dirs_count = self.dirs_count.flatten();
194        file_info.size_count = self.size_count.flatten();
195        Ok(file_info)
196    }
197}
198
199impl FileInfo {
200    #[allow(clippy::too_many_arguments)]
201    /// Construct a [`FileInfo`] from fully resolved scanner fields.
202    pub fn new(
203        name: String,
204        base_name: String,
205        extension: String,
206        path: String,
207        file_type: FileType,
208        mime_type: Option<String>,
209        file_type_label: Option<String>,
210        size: u64,
211        date: Option<String>,
212        sha1: Option<Sha1Digest>,
213        md5: Option<Md5Digest>,
214        sha256: Option<Sha256Digest>,
215        programming_language: Option<String>,
216        package_data: Vec<PackageData>,
217        mut license_expression: Option<String>,
218        mut license_detections: Vec<LicenseDetection>,
219        license_clues: Vec<Match>,
220        copyrights: Vec<Copyright>,
221        holders: Vec<Holder>,
222        authors: Vec<Author>,
223        emails: Vec<OutputEmail>,
224        urls: Vec<OutputURL>,
225        for_packages: Vec<PackageUid>,
226        scan_errors: Vec<String>,
227    ) -> Self {
228        let mut package_data = package_data;
229        for package in &mut package_data {
230            enrich_package_data_license_provenance(package, &path);
231        }
232
233        // Combine license expressions from package data if license_expression is None
234        license_expression = license_expression.or_else(|| {
235            let expressions = package_data
236                .iter()
237                .filter_map(|pkg| pkg.get_license_expression());
238            combine_license_expressions(expressions)
239        });
240
241        // Combine license detections from package data if none are provided
242        if license_detections.is_empty() {
243            for pkg in &package_data {
244                license_detections.extend(pkg.license_detections.clone());
245            }
246        }
247
248        // Combine license expressions from license detections if license_expression is still None
249        if license_expression.is_none() && !license_detections.is_empty() {
250            let expressions = license_detections
251                .iter()
252                .map(|detection| detection.license_expression.clone());
253            license_expression = combine_license_expressions(expressions);
254        }
255
256        let mut file_info = FileInfo {
257            name,
258            base_name,
259            extension,
260            path,
261            file_type,
262            mime_type,
263            file_type_label,
264            size,
265            date,
266            sha1,
267            md5,
268            sha256,
269            sha1_git: None,
270            programming_language,
271            package_data,
272            license_expression,
273            license_detections,
274            license_clues,
275            percentage_of_license_text: None,
276            copyrights,
277            holders,
278            authors,
279            emails,
280            urls,
281            for_packages,
282            scan_errors,
283            license_policy: None,
284            is_generated: None,
285            is_binary: None,
286            is_text: None,
287            is_archive: None,
288            is_media: None,
289            is_source: None,
290            is_script: None,
291            files_count: None,
292            dirs_count: None,
293            size_count: None,
294            source_count: None,
295            is_legal: false,
296            is_manifest: false,
297            is_readme: false,
298            is_top_level: false,
299            is_key_file: false,
300            is_community: false,
301            facets: vec![],
302            tallies: None,
303        };
304        file_info.backfill_license_provenance();
305        file_info
306    }
307
308    pub fn backfill_license_provenance(&mut self) {
309        for detection in &mut self.license_detections {
310            enrich_license_detection_provenance(detection, &self.path);
311        }
312
313        for package in &mut self.package_data {
314            enrich_package_data_license_provenance(package, &self.path);
315        }
316    }
317}
318
319fn enrich_package_data_license_provenance(package_data: &mut PackageData, path: &str) {
320    for detection in &mut package_data.license_detections {
321        enrich_license_detection_provenance(detection, path);
322    }
323    for detection in &mut package_data.other_license_detections {
324        enrich_license_detection_provenance(detection, path);
325    }
326}
327
328pub(crate) fn enrich_license_detection_provenance(detection: &mut LicenseDetection, path: &str) {
329    for detection_match in &mut detection.matches {
330        if detection_match.from_file.is_none() {
331            detection_match.from_file = Some(path.to_string());
332        }
333
334        if detection_match.rule_identifier.is_none() {
335            detection_match.rule_identifier = detection_match.matcher.clone();
336        }
337    }
338
339    if detection.identifier.is_none() {
340        detection.identifier = Some(compute_public_detection_identifier(detection));
341    }
342}
343
344fn compute_public_detection_identifier(detection: &LicenseDetection) -> String {
345    let expression = python_safe_name(&detection.license_expression);
346    let mut hasher = Sha1::new();
347    hasher.update(format_public_detection_content(detection).as_bytes());
348    let hex_str = hex::encode(hasher.finalize());
349    let uuid_hex = &hex_str[..32];
350    let content_uuid = uuid::Uuid::parse_str(uuid_hex)
351        .map(|uuid| uuid.to_string())
352        .unwrap_or_else(|_| uuid_hex.to_string());
353
354    format!("{}-{}", expression, content_uuid)
355}
356
357fn format_public_detection_content(detection: &LicenseDetection) -> String {
358    let mut result = String::from("(");
359
360    for (index, detection_match) in detection.matches.iter().enumerate() {
361        if index > 0 {
362            result.push_str(", ");
363        }
364        result.push_str(&format!(
365            "({}, {}, {})",
366            python_str_repr(
367                detection_match
368                    .rule_identifier
369                    .as_deref()
370                    .or(detection_match.matcher.as_deref())
371                    .unwrap_or("parser-declared-license")
372            ),
373            detection_match.score.value() as f32,
374            python_token_tuple_repr(&tokenize_without_stopwords(
375                detection_match.matched_text.as_deref().unwrap_or_default(),
376            )),
377        ));
378    }
379
380    if detection.matches.len() == 1 {
381        result.push(',');
382    }
383    result.push(')');
384    result
385}
386
/// Reduce `value` to its alphanumeric runs joined by single underscores.
///
/// Every run of non-alphanumeric characters (as judged by
/// `char::is_alphanumeric`) acts as one separator; leading and trailing
/// separators are dropped, so the result never starts or ends with `_`.
fn python_safe_name(value: &str) -> String {
    value
        .split(|character: char| !character.is_alphanumeric())
        .filter(|piece| !piece.is_empty())
        .collect::<Vec<_>>()
        .join("_")
}
408
/// Quote `value` as a string literal.
///
/// Double quotes are used only when `value` contains a single quote and no
/// double quote; otherwise single quotes are used. Backslashes and any
/// occurrence of the chosen quote character are escaped with a backslash.
fn python_str_repr(value: &str) -> String {
    let quote = if value.contains('\'') && !value.contains('"') {
        '"'
    } else {
        '\''
    };

    let mut literal = String::with_capacity(value.len() + 2);
    literal.push(quote);
    for character in value.chars() {
        if character == '\\' || character == quote {
            literal.push('\\');
        }
        literal.push(character);
    }
    literal.push(quote);
    literal
}
416
417fn python_token_tuple_repr(tokens: &[String]) -> String {
418    if tokens.is_empty() {
419        return String::from("()");
420    }
421
422    let mut result = String::from("(");
423    for (index, token) in tokens.iter().enumerate() {
424        if index > 0 {
425            result.push_str(", ");
426        }
427        result.push_str(&python_str_repr(token));
428    }
429
430    if tokens.len() == 1 {
431        result.push(',');
432    }
433    result.push(')');
434    result
435}
436
/// Package metadata extracted from manifest files.
///
/// Compatible with ScanCode Toolkit output format. Contains standardized package
/// information including name, version, dependencies, licenses, and other metadata.
/// This is the primary data structure returned by all parsers.
///
/// Derives `Default`, so a value can be built with struct-update syntax
/// (`..Default::default()`); every field is an `Option`, a `Vec`, or a `bool`.
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
pub struct PackageData {
    // Serialized under the key `type`.
    #[serde(rename = "type")] // name used by ScanCode
    pub package_type: Option<PackageType>,
    pub namespace: Option<String>,
    pub name: Option<String>,
    pub version: Option<String>,
    #[serde(default)]
    pub qualifiers: Option<HashMap<String, String>>,
    pub subpath: Option<String>,
    pub primary_language: Option<String>,
    pub description: Option<String>,
    pub release_date: Option<String>,
    #[serde(default)]
    pub parties: Vec<Party>,
    #[serde(default)]
    pub keywords: Vec<String>,
    pub homepage_url: Option<String>,
    pub download_url: Option<String>,
    pub size: Option<u64>,
    pub sha1: Option<Sha1Digest>,
    pub md5: Option<Md5Digest>,
    pub sha256: Option<Sha256Digest>,
    pub sha512: Option<Sha512Digest>,
    pub bug_tracking_url: Option<String>,
    pub code_view_url: Option<String>,
    pub vcs_url: Option<String>,
    pub copyright: Option<String>,
    pub holder: Option<String>,
    pub declared_license_expression: Option<String>,
    pub declared_license_expression_spdx: Option<String>,
    // Source of `get_license_expression`; also enriched with provenance by
    // `enrich_package_data_license_provenance`.
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    pub other_license_expression: Option<String>,
    pub other_license_expression_spdx: Option<String>,
    // Enriched with provenance like `license_detections`, but not consulted
    // by `get_license_expression`.
    #[serde(default)]
    pub other_license_detections: Vec<LicenseDetection>,
    pub extracted_license_statement: Option<String>,
    pub notice_text: Option<String>,
    #[serde(default)]
    pub source_packages: Vec<String>,
    #[serde(default)]
    pub file_references: Vec<FileReference>,
    #[serde(default)]
    pub is_private: bool,
    #[serde(default)]
    pub is_virtual: bool,
    #[serde(default)]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
    #[serde(default)]
    pub dependencies: Vec<Dependency>,
    pub repository_homepage_url: Option<String>,
    pub repository_download_url: Option<String>,
    pub api_data_url: Option<String>,
    pub datasource_id: Option<DatasourceId>,
    pub purl: Option<String>,
}
499
500impl PackageData {
501    /// Extracts a single license expression from all license detections in this package.
502    /// Returns None if there are no license detections.
503    pub fn get_license_expression(&self) -> Option<String> {
504        if self.license_detections.is_empty() {
505            return None;
506        }
507
508        let expressions = self
509            .license_detections
510            .iter()
511            .map(|detection| detection.license_expression.clone());
512        combine_license_expressions(expressions)
513    }
514}
515
/// License detection result containing matched license expressions.
///
/// Aggregates multiple license matches into a single SPDX license expression.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct LicenseDetection {
    // Used for the prefix of the computed `identifier` and when combining
    // expressions for a file or package.
    pub license_expression: String,
    pub license_expression_spdx: String,
    // Individual matches; their rule, score, and matched text feed the
    // computed `identifier`.
    pub matches: Vec<Match>,
    #[serde(default)]
    pub detection_log: Vec<String>,
    // When `None`, `enrich_license_detection_provenance` computes and stores
    // an identifier; an existing value is never overwritten there.
    pub identifier: Option<String>,
}
528
/// Individual license text match with location and confidence score.
///
/// Represents a specific region of text that matched a known license pattern.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct Match {
    pub license_expression: String,
    pub license_expression_spdx: String,
    // Backfilled with the owning file's path by
    // `enrich_license_detection_provenance` when `None`.
    pub from_file: Option<String>,
    pub start_line: LineNumber,
    pub end_line: LineNumber,
    // Serves as the fallback for `rule_identifier` during enrichment.
    pub matcher: Option<String>,
    pub score: MatchScore,
    pub matched_length: Option<usize>,
    pub match_coverage: Option<f64>,
    pub rule_relevance: Option<u8>,
    // Backfilled from `matcher` by `enrich_license_detection_provenance`
    // when `None`.
    pub rule_identifier: Option<String>,
    pub rule_url: Option<String>,
    // Tokenized when computing the detection identifier; a missing value is
    // treated as an empty string there.
    pub matched_text: Option<String>,
    pub matched_text_diagnostics: Option<String>,
    #[serde(default)]
    pub referenced_filenames: Option<Vec<String>>,
}
551
/// Copyright entry stored in `FileInfo::copyrights`.
///
/// NOTE(review): `start_line`/`end_line` presumably delimit where the text
/// was found in the file — confirm against the producing scanner.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Copyright {
    pub copyright: String,
    pub start_line: LineNumber,
    pub end_line: LineNumber,
}
558
/// Holder entry stored in `FileInfo::holders`.
///
/// NOTE(review): `start_line`/`end_line` presumably delimit where the text
/// was found in the file — confirm against the producing scanner.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Holder {
    pub holder: String,
    pub start_line: LineNumber,
    pub end_line: LineNumber,
}
565
/// Author entry stored in `FileInfo::authors`.
///
/// NOTE(review): `start_line`/`end_line` presumably delimit where the text
/// was found in the file — confirm against the producing scanner.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Author {
    pub author: String,
    pub start_line: LineNumber,
    pub end_line: LineNumber,
}
572
/// Package dependency information with version constraints.
///
/// Represents a declared dependency with scope (e.g., runtime, dev, optional)
/// and optional resolved package details.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Dependency {
    pub purl: Option<String>,
    pub extracted_requirement: Option<String>,
    pub scope: Option<String>,
    // The flags are tri-state: `None` means the value is not recorded.
    pub is_runtime: Option<bool>,
    pub is_optional: Option<bool>,
    pub is_pinned: Option<bool>,
    pub is_direct: Option<bool>,
    // Boxed, which keeps `Dependency` small even though `ResolvedPackage`
    // has many fields (including its own `Vec<Dependency>`).
    pub resolved_package: Option<Box<ResolvedPackage>>,
    #[serde(default)]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
}
590
/// Package record attached to a [`Dependency`] through `resolved_package`.
///
/// Carries the same fields as [`PackageData`], except that `package_type`,
/// `namespace`, `name`, and `version` are required here rather than optional.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ResolvedPackage {
    // Serialized under the key `type`.
    #[serde(rename = "type")]
    pub package_type: PackageType,
    pub namespace: String,
    pub name: String,
    pub version: String,
    #[serde(default)]
    pub qualifiers: Option<HashMap<String, String>>,
    pub subpath: Option<String>,
    pub primary_language: Option<String>,
    pub description: Option<String>,
    pub release_date: Option<String>,
    #[serde(default)]
    pub parties: Vec<Party>,
    #[serde(default)]
    pub keywords: Vec<String>,
    pub homepage_url: Option<String>,
    pub download_url: Option<String>,
    pub size: Option<u64>,
    pub sha1: Option<Sha1Digest>,
    pub md5: Option<Md5Digest>,
    pub sha256: Option<Sha256Digest>,
    pub sha512: Option<Sha512Digest>,
    pub bug_tracking_url: Option<String>,
    pub code_view_url: Option<String>,
    pub vcs_url: Option<String>,
    pub copyright: Option<String>,
    pub holder: Option<String>,
    pub declared_license_expression: Option<String>,
    pub declared_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    pub other_license_expression: Option<String>,
    pub other_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub other_license_detections: Vec<LicenseDetection>,
    pub extracted_license_statement: Option<String>,
    pub notice_text: Option<String>,
    #[serde(default)]
    pub source_packages: Vec<String>,
    #[serde(default)]
    pub file_references: Vec<FileReference>,
    #[serde(default)]
    pub is_private: bool,
    #[serde(default)]
    pub is_virtual: bool,
    #[serde(default)]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
    // Nested dependencies of the resolved package itself.
    #[serde(default)]
    pub dependencies: Vec<Dependency>,
    pub repository_homepage_url: Option<String>,
    pub repository_download_url: Option<String>,
    pub api_data_url: Option<String>,
    pub datasource_id: Option<DatasourceId>,
    pub purl: Option<String>,
}
648
649impl ResolvedPackage {
650    pub fn new(
651        package_type: PackageType,
652        namespace: String,
653        name: String,
654        version: String,
655    ) -> Self {
656        Self {
657            package_type,
658            namespace,
659            name,
660            version,
661            qualifiers: None,
662            subpath: None,
663            primary_language: None,
664            description: None,
665            release_date: None,
666            parties: vec![],
667            keywords: vec![],
668            homepage_url: None,
669            download_url: None,
670            size: None,
671            sha1: None,
672            md5: None,
673            sha256: None,
674            sha512: None,
675            bug_tracking_url: None,
676            code_view_url: None,
677            vcs_url: None,
678            copyright: None,
679            holder: None,
680            declared_license_expression: None,
681            declared_license_expression_spdx: None,
682            license_detections: vec![],
683            other_license_expression: None,
684            other_license_expression_spdx: None,
685            other_license_detections: vec![],
686            extracted_license_statement: None,
687            notice_text: None,
688            source_packages: vec![],
689            file_references: vec![],
690            is_private: false,
691            is_virtual: false,
692            extra_data: None,
693            dependencies: vec![],
694            repository_homepage_url: None,
695            repository_download_url: None,
696            api_data_url: None,
697            datasource_id: None,
698            purl: None,
699        }
700    }
701
702    pub fn from_package_data(package_data: &PackageData, fallback_type: PackageType) -> Self {
703        Self {
704            package_type: package_data.package_type.unwrap_or(fallback_type),
705            namespace: package_data.namespace.clone().unwrap_or_default(),
706            name: package_data.name.clone().unwrap_or_default(),
707            version: package_data.version.clone().unwrap_or_default(),
708            qualifiers: package_data.qualifiers.clone(),
709            subpath: package_data.subpath.clone(),
710            primary_language: package_data.primary_language.clone(),
711            description: package_data.description.clone(),
712            release_date: package_data.release_date.clone(),
713            parties: package_data.parties.clone(),
714            keywords: package_data.keywords.clone(),
715            homepage_url: package_data.homepage_url.clone(),
716            download_url: package_data.download_url.clone(),
717            size: package_data.size,
718            sha1: package_data.sha1,
719            md5: package_data.md5,
720            sha256: package_data.sha256,
721            sha512: package_data.sha512,
722            bug_tracking_url: package_data.bug_tracking_url.clone(),
723            code_view_url: package_data.code_view_url.clone(),
724            vcs_url: package_data.vcs_url.clone(),
725            copyright: package_data.copyright.clone(),
726            holder: package_data.holder.clone(),
727            declared_license_expression: package_data.declared_license_expression.clone(),
728            declared_license_expression_spdx: package_data.declared_license_expression_spdx.clone(),
729            license_detections: package_data.license_detections.clone(),
730            other_license_expression: package_data.other_license_expression.clone(),
731            other_license_expression_spdx: package_data.other_license_expression_spdx.clone(),
732            other_license_detections: package_data.other_license_detections.clone(),
733            extracted_license_statement: package_data.extracted_license_statement.clone(),
734            notice_text: package_data.notice_text.clone(),
735            source_packages: package_data.source_packages.clone(),
736            file_references: package_data.file_references.clone(),
737            is_private: package_data.is_private,
738            is_virtual: package_data.is_virtual,
739            extra_data: package_data.extra_data.clone(),
740            dependencies: package_data.dependencies.clone(),
741            repository_homepage_url: package_data.repository_homepage_url.clone(),
742            repository_download_url: package_data.repository_download_url.clone(),
743            api_data_url: package_data.api_data_url.clone(),
744            datasource_id: package_data.datasource_id,
745            purl: package_data.purl.clone(),
746        }
747    }
748}
749
/// Author, maintainer, or contributor information.
///
/// Represents a person or organization associated with a package.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct Party {
    // Raw identifier because `type` is a Rust keyword; `Party::person` sets
    // it to `"person"`.
    pub r#type: Option<String>,
    pub role: Option<String>,
    pub name: Option<String>,
    pub email: Option<String>,
    pub url: Option<String>,
    pub organization: Option<String>,
    pub organization_url: Option<String>,
    pub timezone: Option<String>,
}
764
765impl Party {
766    pub(crate) fn person(role: &str, name: Option<String>, email: Option<String>) -> Self {
767        Self {
768            r#type: Some("person".to_string()),
769            role: Some(role.to_string()),
770            name,
771            email,
772            url: None,
773            organization: None,
774            organization_url: None,
775            timezone: None,
776        }
777    }
778}
779
/// Reference to a file within a package archive with checksums.
///
/// Used in SBOM generation to track files within distribution archives.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct FileReference {
    // The only field set by `FileReference::from_path`; everything else
    // starts as `None` there.
    pub path: String,
    pub size: Option<u64>,
    pub sha1: Option<Sha1Digest>,
    pub md5: Option<Md5Digest>,
    pub sha256: Option<Sha256Digest>,
    pub sha512: Option<Sha512Digest>,
    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
}
793
794impl FileReference {
795    pub(crate) fn from_path(path: String) -> Self {
796        Self {
797            path,
798            size: None,
799            sha1: None,
800            md5: None,
801            sha256: None,
802            sha512: None,
803            extra_data: None,
804        }
805    }
806}
807
/// Top-level assembled package, created by merging one or more `PackageData`
/// objects from related manifest/lockfiles (e.g., package.json + package-lock.json).
///
/// Compatible with ScanCode Toolkit output format. The key differences from
/// `PackageData` are:
/// - `package_uid`: unique identifier (PURL with UUID qualifier)
/// - `datafile_paths`: list of all contributing files
/// - `datasource_ids`: list of all contributing parsers (replaces the single
///   `datasource_id` of `PackageData`)
/// - Excludes `dependencies` and `file_references` (hoisted to top-level)
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Package {
    // Serialized under the key `type`.
    #[serde(rename = "type")]
    pub package_type: Option<PackageType>,
    pub namespace: Option<String>,
    pub name: Option<String>,
    pub version: Option<String>,
    #[serde(default)]
    pub qualifiers: Option<HashMap<String, String>>,
    pub subpath: Option<String>,
    pub primary_language: Option<String>,
    pub description: Option<String>,
    pub release_date: Option<String>,
    #[serde(default)]
    pub parties: Vec<Party>,
    #[serde(default)]
    pub keywords: Vec<String>,
    pub homepage_url: Option<String>,
    pub download_url: Option<String>,
    pub size: Option<u64>,
    pub sha1: Option<Sha1Digest>,
    pub md5: Option<Md5Digest>,
    pub sha256: Option<Sha256Digest>,
    pub sha512: Option<Sha512Digest>,
    pub bug_tracking_url: Option<String>,
    pub code_view_url: Option<String>,
    pub vcs_url: Option<String>,
    pub copyright: Option<String>,
    pub holder: Option<String>,
    pub declared_license_expression: Option<String>,
    pub declared_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    pub other_license_expression: Option<String>,
    pub other_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub other_license_detections: Vec<LicenseDetection>,
    pub extracted_license_statement: Option<String>,
    pub notice_text: Option<String>,
    #[serde(default)]
    pub source_packages: Vec<String>,
    #[serde(default)]
    pub is_private: bool,
    #[serde(default)]
    pub is_virtual: bool,
    #[serde(default)]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
    pub repository_homepage_url: Option<String>,
    pub repository_download_url: Option<String>,
    pub api_data_url: Option<String>,
    pub purl: Option<String>,
    // The three fields below have no `#[serde(default)]`, so they must be
    // present when deserializing.
    /// Unique identifier for this package instance (PURL with UUID qualifier).
    pub package_uid: PackageUid,
    /// Paths to all datafiles that contributed to this package.
    pub datafile_paths: Vec<String>,
    /// Datasource identifiers for all parsers that contributed to this package.
    pub datasource_ids: Vec<DatasourceId>,
}
875
876impl Package {
877    /// Create a `Package` from a `PackageData` and its source file path.
878    ///
879    /// Generates a unique `package_uid` by appending a UUID qualifier to the PURL.
880    /// If the `PackageData` has no PURL, the package_uid will be an empty string.
881    pub fn from_package_data(package_data: &PackageData, datafile_path: String) -> Self {
882        let mut package_data = package_data.clone();
883        enrich_package_data_license_provenance(&mut package_data, &datafile_path);
884
885        let package_uid = package_data
886            .purl
887            .as_ref()
888            .map(|p| PackageUid::new(p))
889            .unwrap_or_else(PackageUid::empty);
890
891        Package {
892            package_type: package_data.package_type,
893            namespace: package_data.namespace.clone(),
894            name: package_data.name.clone(),
895            version: package_data.version.clone(),
896            qualifiers: package_data.qualifiers.clone(),
897            subpath: package_data.subpath.clone(),
898            primary_language: package_data.primary_language.clone(),
899            description: package_data.description.clone(),
900            release_date: package_data.release_date.clone(),
901            parties: package_data.parties.clone(),
902            keywords: package_data.keywords.clone(),
903            homepage_url: package_data.homepage_url.clone(),
904            download_url: package_data.download_url.clone(),
905            size: package_data.size,
906            sha1: package_data.sha1,
907            md5: package_data.md5,
908            sha256: package_data.sha256,
909            sha512: package_data.sha512,
910            bug_tracking_url: package_data.bug_tracking_url.clone(),
911            code_view_url: package_data.code_view_url.clone(),
912            vcs_url: package_data.vcs_url.clone(),
913            copyright: package_data.copyright.clone(),
914            holder: package_data.holder.clone(),
915            declared_license_expression: package_data.declared_license_expression.clone(),
916            declared_license_expression_spdx: package_data.declared_license_expression_spdx.clone(),
917            license_detections: package_data.license_detections.clone(),
918            other_license_expression: package_data.other_license_expression.clone(),
919            other_license_expression_spdx: package_data.other_license_expression_spdx.clone(),
920            other_license_detections: package_data.other_license_detections.clone(),
921            extracted_license_statement: package_data.extracted_license_statement.clone(),
922            notice_text: package_data.notice_text.clone(),
923            source_packages: package_data.source_packages.clone(),
924            is_private: package_data.is_private,
925            is_virtual: package_data.is_virtual,
926            extra_data: package_data.extra_data.clone(),
927            repository_homepage_url: package_data.repository_homepage_url.clone(),
928            repository_download_url: package_data.repository_download_url.clone(),
929            api_data_url: package_data.api_data_url.clone(),
930            purl: package_data.purl.clone(),
931            package_uid,
932            datafile_paths: vec![datafile_path],
933            datasource_ids: if let Some(dsid) = package_data.datasource_id {
934                vec![dsid]
935            } else {
936                vec![]
937            },
938        }
939    }
940
941    /// Update this package with data from another `PackageData`.
942    ///
943    /// Merges data from a related file (e.g., lockfile) into this package.
944    /// Existing non-empty values are preserved; empty fields are filled from
945    /// the new data. Lists (parties, license_detections) are merged.
946    pub fn update(&mut self, package_data: &PackageData, datafile_path: String) {
947        let mut package_data = package_data.clone();
948        enrich_package_data_license_provenance(&mut package_data, &datafile_path);
949
950        if let Some(dsid) = package_data.datasource_id {
951            self.datasource_ids.push(dsid);
952        }
953        self.datafile_paths.push(datafile_path);
954
955        macro_rules! fill_if_empty {
956            ($field:ident) => {
957                if self.$field.is_none() {
958                    self.$field = package_data.$field;
959                }
960            };
961        }
962
963        fill_if_empty!(package_type);
964        fill_if_empty!(name);
965        fill_if_empty!(namespace);
966        fill_if_empty!(version);
967        fill_if_empty!(qualifiers);
968        fill_if_empty!(subpath);
969        fill_if_empty!(primary_language);
970        fill_if_empty!(description);
971        fill_if_empty!(release_date);
972        fill_if_empty!(homepage_url);
973        fill_if_empty!(download_url);
974        fill_if_empty!(size);
975        fill_if_empty!(sha1);
976        fill_if_empty!(md5);
977        fill_if_empty!(sha256);
978        fill_if_empty!(sha512);
979        fill_if_empty!(bug_tracking_url);
980        fill_if_empty!(code_view_url);
981        fill_if_empty!(vcs_url);
982        fill_if_empty!(copyright);
983        fill_if_empty!(holder);
984        fill_if_empty!(declared_license_expression);
985        fill_if_empty!(declared_license_expression_spdx);
986        fill_if_empty!(other_license_expression);
987        fill_if_empty!(other_license_expression_spdx);
988        fill_if_empty!(extracted_license_statement);
989        fill_if_empty!(notice_text);
990        match (&mut self.extra_data, &package_data.extra_data) {
991            (None, Some(extra_data)) => {
992                self.extra_data = Some(extra_data.clone());
993            }
994            (Some(existing), Some(incoming)) => {
995                for (key, value) in incoming {
996                    existing.entry(key.clone()).or_insert_with(|| value.clone());
997                }
998            }
999            _ => {}
1000        }
1001        fill_if_empty!(repository_homepage_url);
1002        fill_if_empty!(repository_download_url);
1003        fill_if_empty!(api_data_url);
1004
1005        for party in &package_data.parties {
1006            if let Some(existing) = self.parties.iter_mut().find(|p| {
1007                p.role == party.role
1008                    && ((p.name.is_some() && p.name == party.name)
1009                        || (p.email.is_some() && p.email == party.email))
1010            }) {
1011                if existing.name.is_none() {
1012                    existing.name = party.name.clone();
1013                }
1014                if existing.email.is_none() {
1015                    existing.email = party.email.clone();
1016                }
1017            } else {
1018                self.parties.push(party.clone());
1019            }
1020        }
1021
1022        for keyword in &package_data.keywords {
1023            if !self.keywords.contains(keyword) {
1024                self.keywords.push(keyword.clone());
1025            }
1026        }
1027
1028        for detection in &package_data.license_detections {
1029            self.license_detections.push(detection.clone());
1030        }
1031
1032        for detection in &package_data.other_license_detections {
1033            self.other_license_detections.push(detection.clone());
1034        }
1035
1036        for source_pkg in &package_data.source_packages {
1037            if !self.source_packages.contains(source_pkg) {
1038                self.source_packages.push(source_pkg.clone());
1039            }
1040        }
1041
1042        self.refresh_identity();
1043    }
1044
1045    pub fn backfill_license_provenance(&mut self) {
1046        let Some(datafile_path) = self.datafile_paths.first().cloned() else {
1047            return;
1048        };
1049
1050        for detection in &mut self.license_detections {
1051            enrich_license_detection_provenance(detection, &datafile_path);
1052        }
1053        for detection in &mut self.other_license_detections {
1054            enrich_license_detection_provenance(detection, &datafile_path);
1055        }
1056    }
1057
1058    fn refresh_identity(&mut self) {
1059        let Some(next_purl) = self.build_current_purl() else {
1060            return;
1061        };
1062
1063        if self.purl.as_deref() != Some(next_purl.as_str()) || self.package_uid.is_empty() {
1064            self.package_uid = PackageUid::new(&next_purl);
1065        }
1066
1067        self.purl = Some(next_purl);
1068    }
1069
1070    fn build_current_purl(&self) -> Option<String> {
1071        if let (Some(package_type), Some(name)) = (
1072            self.package_type.as_ref(),
1073            self.name
1074                .as_deref()
1075                .filter(|value| !value.trim().is_empty()),
1076        ) {
1077            let purl_type = match package_type {
1078                PackageType::Deno => "generic",
1079                _ => package_type.as_str(),
1080            };
1081
1082            let mut purl = PackageUrl::new(purl_type, name).ok()?;
1083
1084            if let Some(namespace) = self
1085                .namespace
1086                .as_deref()
1087                .filter(|value| !value.trim().is_empty())
1088            {
1089                purl.with_namespace(namespace).ok()?;
1090            }
1091
1092            if let Some(version) = self
1093                .version
1094                .as_deref()
1095                .filter(|value| !value.trim().is_empty())
1096            {
1097                purl.with_version(version).ok()?;
1098            }
1099
1100            if let Some(qualifiers) = &self.qualifiers {
1101                for (key, value) in qualifiers {
1102                    purl.add_qualifier(key.as_str(), value.as_str()).ok()?;
1103                }
1104            }
1105
1106            if let Some(subpath) = self
1107                .subpath
1108                .as_deref()
1109                .filter(|value| !value.trim().is_empty())
1110            {
1111                purl.with_subpath(subpath).ok()?;
1112            }
1113
1114            return Some(purl.to_string());
1115        }
1116
1117        let existing_purl = self.purl.as_deref()?;
1118        let mut purl = PackageUrl::from_str(existing_purl).ok()?;
1119
1120        if let Some(version) = self
1121            .version
1122            .as_deref()
1123            .filter(|value| !value.trim().is_empty())
1124        {
1125            purl.with_version(version).ok()?;
1126        } else {
1127            purl.without_version();
1128        }
1129
1130        Some(purl.to_string())
1131    }
1132}
1133
#[cfg(test)]
mod tests {
    use super::*;

    /// Datafile path every test attributes the package data to.
    const DATAFILE_PATH: &str = "project/package.json";
    /// Matcher name carried by the fixture's single license match.
    const PARSER_MATCHER: &str = "parser-declared-license";

    /// Npm package data with one MIT detection whose provenance fields
    /// (`from_file`, `rule_identifier`, detection `identifier`) are all unset.
    fn npm_package_data_without_provenance() -> PackageData {
        let declared_match = Match {
            license_expression: "mit".to_string(),
            license_expression_spdx: "MIT".to_string(),
            from_file: None,
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
            matcher: Some(PARSER_MATCHER.to_string()),
            score: MatchScore::MAX,
            matched_length: Some(1),
            match_coverage: Some(100.0),
            rule_relevance: Some(100),
            rule_identifier: None,
            rule_url: None,
            matched_text: Some("MIT".to_string()),
            referenced_filenames: None,
            matched_text_diagnostics: None,
        };

        let detection = LicenseDetection {
            license_expression: "mit".to_string(),
            license_expression_spdx: "MIT".to_string(),
            matches: vec![declared_match],
            detection_log: vec![],
            identifier: None,
        };

        PackageData {
            package_type: Some(PackageType::Npm),
            license_detections: vec![detection],
            ..PackageData::default()
        }
    }

    #[test]
    fn file_info_new_backfills_package_detection_provenance() {
        let package_data = npm_package_data_without_provenance();

        let file_info = FileInfo::new(
            "package.json".to_string(),
            "package".to_string(),
            ".json".to_string(),
            "project/package.json".to_string(),
            FileType::File,
            None,
            None,
            1,
            None,
            None,
            None,
            None,
            None,
            vec![package_data],
            None,
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
        );

        // File-level detections are backfilled.
        assert_eq!(file_info.license_detections.len(), 1);
        let file_detection = &file_info.license_detections[0];
        assert_eq!(
            file_detection.matches[0].from_file.as_deref(),
            Some(DATAFILE_PATH)
        );
        assert!(file_detection.identifier.is_some());

        // So are the detections nested in the package data.
        let package_detection = &file_info.package_data[0].license_detections[0];
        assert_eq!(
            package_detection.matches[0].from_file.as_deref(),
            Some(DATAFILE_PATH)
        );
        assert_eq!(
            package_detection.matches[0].rule_identifier.as_deref(),
            Some(PARSER_MATCHER)
        );
        assert!(package_detection.identifier.is_some());
    }

    #[test]
    fn package_from_package_data_backfills_detection_provenance() {
        let package_data = npm_package_data_without_provenance();

        let package = Package::from_package_data(&package_data, "project/package.json".to_string());

        let detection = &package.license_detections[0];
        assert_eq!(
            detection.matches[0].from_file.as_deref(),
            Some(DATAFILE_PATH)
        );
        assert_eq!(
            detection.matches[0].rule_identifier.as_deref(),
            Some(PARSER_MATCHER)
        );
        assert!(detection.identifier.is_some());
    }
}
1269
1270/// Top-level dependency instance, created during package assembly.
1271///
1272/// Extends the file-level `Dependency` with traceability fields that link
1273/// each dependency to its owning package and source datafile.
1274#[derive(Serialize, Deserialize, Debug, Clone)]
1275pub struct TopLevelDependency {
1276    pub purl: Option<String>,
1277    pub extracted_requirement: Option<String>,
1278    pub scope: Option<String>,
1279    pub is_runtime: Option<bool>,
1280    pub is_optional: Option<bool>,
1281    pub is_pinned: Option<bool>,
1282    pub is_direct: Option<bool>,
1283    pub resolved_package: Option<Box<ResolvedPackage>>,
1284    #[serde(default)]
1285    pub extra_data: Option<HashMap<String, serde_json::Value>>,
1286    /// Unique identifier for this dependency instance (PURL with UUID qualifier).
1287    pub dependency_uid: DependencyUid,
1288    /// The `package_uid` of the package this dependency belongs to.
1289    pub for_package_uid: Option<PackageUid>,
1290    /// Path to the datafile where this dependency was declared.
1291    pub datafile_path: String,
1292    /// Datasource identifier for the parser that extracted this dependency.
1293    pub datasource_id: DatasourceId,
1294    /// Namespace for the dependency (e.g., distribution name for RPM packages).
1295    pub namespace: Option<String>,
1296}
1297
1298impl TopLevelDependency {
1299    /// Create a `TopLevelDependency` from a file-level `Dependency`.
1300    pub fn from_dependency(
1301        dep: &Dependency,
1302        datafile_path: String,
1303        datasource_id: DatasourceId,
1304        for_package_uid: Option<PackageUid>,
1305    ) -> Self {
1306        let dependency_uid = dep
1307            .purl
1308            .as_ref()
1309            .map(|p| DependencyUid::new(p))
1310            .unwrap_or_else(DependencyUid::empty);
1311
1312        TopLevelDependency {
1313            purl: dep.purl.clone(),
1314            extracted_requirement: dep.extracted_requirement.clone(),
1315            scope: dep.scope.clone(),
1316            is_runtime: dep.is_runtime,
1317            is_optional: dep.is_optional,
1318            is_pinned: dep.is_pinned,
1319            is_direct: dep.is_direct,
1320            resolved_package: dep.resolved_package.clone(),
1321            extra_data: dep.extra_data.clone(),
1322            dependency_uid,
1323            for_package_uid,
1324            datafile_path,
1325            datasource_id,
1326            namespace: None,
1327        }
1328    }
1329}
1330
1331#[derive(Serialize, Deserialize, Debug, Clone)]
1332pub struct OutputEmail {
1333    pub email: String,
1334    pub start_line: LineNumber,
1335    pub end_line: LineNumber,
1336}
1337
1338#[derive(Serialize, Deserialize, Debug, Clone)]
1339pub struct OutputURL {
1340    pub url: String,
1341    pub start_line: LineNumber,
1342    pub end_line: LineNumber,
1343}
1344
1345#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
1346pub struct LicensePolicyEntry {
1347    pub license_key: String,
1348    pub label: String,
1349    pub color_code: String,
1350    pub icon: String,
1351}
1352
/// Kind of filesystem entry a scan result describes.
///
/// Equality is total, so `Eq` and `Hash` are derived alongside `PartialEq`;
/// this lets the type be used as a map/set key and inside types deriving `Eq`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FileType {
    /// A regular file.
    File,
    /// A directory.
    Directory,
}
1358
1359impl serde::Serialize for FileType {
1360    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
1361    where
1362        S: serde::Serializer,
1363    {
1364        match self {
1365            FileType::File => serializer.serialize_str("file"),
1366            FileType::Directory => serializer.serialize_str("directory"),
1367        }
1368    }
1369}
1370
1371impl<'de> Deserialize<'de> for FileType {
1372    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
1373    where
1374        D: serde::Deserializer<'de>,
1375    {
1376        let value = String::deserialize(deserializer)?;
1377        match value.as_str() {
1378            "file" => Ok(FileType::File),
1379            "directory" => Ok(FileType::Directory),
1380            _ => Err(serde::de::Error::custom("invalid file type")),
1381        }
1382    }
1383}