Skip to main content

provenant/models/
file_info.rs

1use derive_builder::Builder;
2use packageurl::PackageUrl;
3use serde::ser::Error as SerError;
4use serde::{Deserialize, Serialize, Serializer};
5use serde_json::{Map, Value};
6use std::collections::HashMap;
7use std::str::FromStr;
8use uuid::Uuid;
9
10use sha1::{Digest, Sha1};
11
12use super::DatasourceId;
13use super::PackageType;
14use crate::license_detection::tokenize::tokenize_without_stopwords;
15use crate::models::output::Tallies;
16use crate::utils::spdx::combine_license_expressions;
17
#[derive(Debug, Builder, Deserialize, Clone)]
#[builder(build_fn(skip))]
/// File-level scan result containing metadata and detected findings.
///
/// `Deserialize` and the builder type are derived; `Serialize` is implemented
/// by hand further down in this file, and the derived builder `build` function
/// is skipped in favour of the hand-written `FileInfoBuilder::build`.
///
/// NOTE(review): because `Serialize` is not derived here, the
/// `skip_serializing_if` attributes on the fields below look inert; the
/// manual `Serialize` impl decides which keys are emitted. Confirm before
/// relying on either one as the source of truth.
pub struct FileInfo {
    // `path`, `type`, `name`, `base_name`, `extension` and `size` are always
    // written by the manual `Serialize` impl.
    pub name: String,
    pub base_name: String,
    pub extension: String,
    pub path: String,
    #[serde(rename = "type")] // name used by ScanCode
    pub file_type: FileType,
    // The optional fields from `mime_type` through `size_count` that feed
    // `should_serialize_info_surface` are emitted as a group: once any of them
    // is `Some`, the manual `Serialize` impl writes all of them.
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub mime_type: Option<String>,
    /// Exchanged under the JSON key `file_type` (not to be confused with the
    /// `file_type` field above, which uses the key `type`).
    #[builder(default)]
    #[serde(rename = "file_type", skip_serializing_if = "Option::is_none", default)]
    pub file_type_label: Option<String>,
    pub size: u64,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub date: Option<String>,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub sha1: Option<String>,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub md5: Option<String>,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub sha256: Option<String>,
    /// Not a `FileInfo::new` argument; `new` initialises it to `None`.
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub sha1_git: Option<String>,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub programming_language: Option<String>,
    /// Package records attached to this file. `FileInfo::new` fills in license
    /// provenance on them and may derive the file-level license fields from them.
    #[builder(default)]
    #[serde(default)]
    pub package_data: Vec<PackageData>,
    /// Exchanged under the JSON key `detected_license_expression_spdx`.
    #[serde(rename = "detected_license_expression_spdx")] // name used by ScanCode
    #[builder(default)]
    pub license_expression: Option<String>,
    #[builder(default)]
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    /// Written by the manual `Serialize` impl only when non-empty.
    #[builder(default)]
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub license_clues: Vec<Match>,
    /// Written by the manual `Serialize` impl only when `Some`.
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub percentage_of_license_text: Option<f64>,
    #[builder(default)]
    #[serde(default)]
    pub copyrights: Vec<Copyright>,
    #[builder(default)]
    #[serde(default)]
    pub holders: Vec<Holder>,
    #[builder(default)]
    #[serde(default)]
    pub authors: Vec<Author>,
    /// Written by the manual `Serialize` impl only when non-empty.
    #[builder(default)]
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub emails: Vec<OutputEmail>,
    #[builder(default)]
    #[serde(default)]
    pub urls: Vec<OutputURL>,
    #[builder(default)]
    #[serde(default)]
    pub for_packages: Vec<String>,
    #[builder(default)]
    #[serde(default)]
    pub scan_errors: Vec<String>,
    /// Written by the manual `Serialize` impl only when `Some`.
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub license_policy: Option<Vec<LicensePolicyEntry>>,
    /// Written by the manual `Serialize` impl only when `Some`.
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub is_generated: Option<bool>,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub is_binary: Option<bool>,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub is_text: Option<bool>,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub is_archive: Option<bool>,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub is_media: Option<bool>,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub is_source: Option<bool>,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub is_script: Option<bool>,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub files_count: Option<usize>,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub dirs_count: Option<usize>,
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub size_count: Option<u64>,
    /// Written by the manual `Serialize` impl only when `Some`; unlike the
    /// other `*_count` fields it is not part of the info-surface group.
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub source_count: Option<usize>,
    // The boolean classification flags below are written by the manual
    // `Serialize` impl only when `true`, and default to `false` on input.
    #[builder(default)]
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_legal: bool,
    #[builder(default)]
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_manifest: bool,
    #[builder(default)]
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_readme: bool,
    #[builder(default)]
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_top_level: bool,
    #[builder(default)]
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_key_file: bool,
    #[builder(default)]
    #[serde(skip_serializing_if = "is_false", default)]
    pub is_community: bool,
    /// Written by the manual `Serialize` impl only when non-empty.
    #[builder(default)]
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub facets: Vec<String>,
    /// Written by the manual `Serialize` impl only when `Some`.
    #[builder(default)]
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub tallies: Option<Tallies>,
}
150
151impl FileInfoBuilder {
152    /// Build a [`FileInfo`] from the current builder state.
153    pub fn build(&self) -> Result<FileInfo, String> {
154        let mut file_info = FileInfo::new(
155            self.name.clone().ok_or("Missing field: name")?,
156            self.base_name.clone().ok_or("Missing field: base_name")?,
157            self.extension.clone().ok_or("Missing field: extension")?,
158            self.path.clone().ok_or("Missing field: path")?,
159            self.file_type.clone().ok_or("Missing field: file_type")?,
160            self.mime_type.clone().flatten(),
161            self.file_type_label.clone().flatten(),
162            self.size.ok_or("Missing field: size")?,
163            self.date.clone().flatten(),
164            self.sha1.clone().flatten(),
165            self.md5.clone().flatten(),
166            self.sha256.clone().flatten(),
167            self.programming_language.clone().flatten(),
168            self.package_data.clone().unwrap_or_default(),
169            self.license_expression.clone().flatten(),
170            self.license_detections.clone().unwrap_or_default(),
171            self.license_clues.clone().unwrap_or_default(),
172            self.copyrights.clone().unwrap_or_default(),
173            self.holders.clone().unwrap_or_default(),
174            self.authors.clone().unwrap_or_default(),
175            self.emails.clone().unwrap_or_default(),
176            self.urls.clone().unwrap_or_default(),
177            self.for_packages.clone().unwrap_or_default(),
178            self.scan_errors.clone().unwrap_or_default(),
179        );
180        file_info.license_policy = self.license_policy.clone().flatten();
181        file_info.sha1_git = self.sha1_git.clone().flatten();
182        file_info.is_binary = self.is_binary.flatten();
183        file_info.is_text = self.is_text.flatten();
184        file_info.is_archive = self.is_archive.flatten();
185        file_info.is_media = self.is_media.flatten();
186        file_info.is_script = self.is_script.flatten();
187        file_info.files_count = self.files_count.flatten();
188        file_info.dirs_count = self.dirs_count.flatten();
189        file_info.size_count = self.size_count.flatten();
190        Ok(file_info)
191    }
192}
193
194impl Serialize for FileInfo {
195    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
196    where
197        S: Serializer,
198    {
199        let mut map = Map::new();
200        insert_json(&mut map, "path", &self.path)?;
201        insert_json(&mut map, "type", &self.file_type)?;
202        insert_json(&mut map, "name", &self.name)?;
203        insert_json(&mut map, "base_name", &self.base_name)?;
204        insert_json(&mut map, "extension", &self.extension)?;
205        insert_json(&mut map, "size", self.size)?;
206
207        if self.should_serialize_info_surface() {
208            insert_json(&mut map, "date", &self.date)?;
209            insert_json(&mut map, "sha1", &self.sha1)?;
210            insert_json(&mut map, "md5", &self.md5)?;
211            insert_json(&mut map, "sha256", &self.sha256)?;
212            insert_json(&mut map, "sha1_git", &self.sha1_git)?;
213            insert_json(&mut map, "mime_type", &self.mime_type)?;
214            insert_json(&mut map, "file_type", &self.file_type_label)?;
215            insert_json(&mut map, "programming_language", &self.programming_language)?;
216            insert_json(&mut map, "is_binary", self.is_binary)?;
217            insert_json(&mut map, "is_text", self.is_text)?;
218            insert_json(&mut map, "is_archive", self.is_archive)?;
219            insert_json(&mut map, "is_media", self.is_media)?;
220            insert_json(&mut map, "is_source", self.is_source)?;
221            insert_json(&mut map, "is_script", self.is_script)?;
222            insert_json(&mut map, "files_count", self.files_count)?;
223            insert_json(&mut map, "dirs_count", self.dirs_count)?;
224            insert_json(&mut map, "size_count", self.size_count)?;
225        }
226
227        insert_json(&mut map, "package_data", &self.package_data)?;
228        insert_json(
229            &mut map,
230            "detected_license_expression_spdx",
231            &self.license_expression,
232        )?;
233        insert_json(&mut map, "license_detections", &self.license_detections)?;
234        if !self.license_clues.is_empty() {
235            insert_json(&mut map, "license_clues", &self.license_clues)?;
236        }
237        if self.percentage_of_license_text.is_some() {
238            insert_json(
239                &mut map,
240                "percentage_of_license_text",
241                self.percentage_of_license_text,
242            )?;
243        }
244        insert_json(&mut map, "copyrights", &self.copyrights)?;
245        insert_json(&mut map, "holders", &self.holders)?;
246        insert_json(&mut map, "authors", &self.authors)?;
247        if !self.emails.is_empty() {
248            insert_json(&mut map, "emails", &self.emails)?;
249        }
250        insert_json(&mut map, "urls", &self.urls)?;
251        insert_json(&mut map, "for_packages", &self.for_packages)?;
252        insert_json(&mut map, "scan_errors", &self.scan_errors)?;
253        if self.license_policy.is_some() {
254            insert_json(&mut map, "license_policy", &self.license_policy)?;
255        }
256        if self.is_generated.is_some() {
257            insert_json(&mut map, "is_generated", self.is_generated)?;
258        }
259        if self.source_count.is_some() {
260            insert_json(&mut map, "source_count", self.source_count)?;
261        }
262        if self.is_legal {
263            insert_json(&mut map, "is_legal", self.is_legal)?;
264        }
265        if self.is_manifest {
266            insert_json(&mut map, "is_manifest", self.is_manifest)?;
267        }
268        if self.is_readme {
269            insert_json(&mut map, "is_readme", self.is_readme)?;
270        }
271        if self.is_top_level {
272            insert_json(&mut map, "is_top_level", self.is_top_level)?;
273        }
274        if self.is_key_file {
275            insert_json(&mut map, "is_key_file", self.is_key_file)?;
276        }
277        if self.is_community {
278            insert_json(&mut map, "is_community", self.is_community)?;
279        }
280        if !self.facets.is_empty() {
281            insert_json(&mut map, "facets", &self.facets)?;
282        }
283        if self.tallies.is_some() {
284            insert_json(&mut map, "tallies", &self.tallies)?;
285        }
286
287        map.serialize(serializer)
288    }
289}
290
291impl FileInfo {
292    fn should_serialize_info_surface(&self) -> bool {
293        self.date.is_some()
294            || self.sha1.is_some()
295            || self.md5.is_some()
296            || self.sha256.is_some()
297            || self.sha1_git.is_some()
298            || self.mime_type.is_some()
299            || self.file_type_label.is_some()
300            || self.programming_language.is_some()
301            || self.is_binary.is_some()
302            || self.is_text.is_some()
303            || self.is_archive.is_some()
304            || self.is_media.is_some()
305            || self.is_source.is_some()
306            || self.is_script.is_some()
307            || self.files_count.is_some()
308            || self.dirs_count.is_some()
309            || self.size_count.is_some()
310    }
311}
312
313fn insert_json<S: Serialize, E: SerError>(
314    map: &mut Map<String, Value>,
315    key: &str,
316    value: S,
317) -> Result<(), E> {
318    map.insert(
319        key.to_string(),
320        serde_json::to_value(value).map_err(E::custom)?,
321    );
322    Ok(())
323}
324
325impl FileInfo {
326    #[allow(clippy::too_many_arguments)]
327    /// Construct a [`FileInfo`] from fully resolved scanner fields.
328    pub fn new(
329        name: String,
330        base_name: String,
331        extension: String,
332        path: String,
333        file_type: FileType,
334        mime_type: Option<String>,
335        file_type_label: Option<String>,
336        size: u64,
337        date: Option<String>,
338        sha1: Option<String>,
339        md5: Option<String>,
340        sha256: Option<String>,
341        programming_language: Option<String>,
342        package_data: Vec<PackageData>,
343        mut license_expression: Option<String>,
344        mut license_detections: Vec<LicenseDetection>,
345        license_clues: Vec<Match>,
346        copyrights: Vec<Copyright>,
347        holders: Vec<Holder>,
348        authors: Vec<Author>,
349        emails: Vec<OutputEmail>,
350        urls: Vec<OutputURL>,
351        for_packages: Vec<String>,
352        scan_errors: Vec<String>,
353    ) -> Self {
354        let mut package_data = package_data;
355        for package in &mut package_data {
356            enrich_package_data_license_provenance(package, &path);
357        }
358
359        // Combine license expressions from package data if license_expression is None
360        license_expression = license_expression.or_else(|| {
361            let expressions = package_data
362                .iter()
363                .filter_map(|pkg| pkg.get_license_expression());
364            combine_license_expressions(expressions)
365        });
366
367        // Combine license detections from package data if none are provided
368        if license_detections.is_empty() {
369            for pkg in &package_data {
370                license_detections.extend(pkg.license_detections.clone());
371            }
372        }
373
374        // Combine license expressions from license detections if license_expression is still None
375        if license_expression.is_none() && !license_detections.is_empty() {
376            let expressions = license_detections
377                .iter()
378                .map(|detection| detection.license_expression.clone());
379            license_expression = combine_license_expressions(expressions);
380        }
381
382        let mut file_info = FileInfo {
383            name,
384            base_name,
385            extension,
386            path,
387            file_type,
388            mime_type,
389            file_type_label,
390            size,
391            date,
392            sha1,
393            md5,
394            sha256,
395            sha1_git: None,
396            programming_language,
397            package_data,
398            license_expression,
399            license_detections,
400            license_clues,
401            percentage_of_license_text: None,
402            copyrights,
403            holders,
404            authors,
405            emails,
406            urls,
407            for_packages,
408            scan_errors,
409            license_policy: None,
410            is_generated: None,
411            is_binary: None,
412            is_text: None,
413            is_archive: None,
414            is_media: None,
415            is_source: None,
416            is_script: None,
417            files_count: None,
418            dirs_count: None,
419            size_count: None,
420            source_count: None,
421            is_legal: false,
422            is_manifest: false,
423            is_readme: false,
424            is_top_level: false,
425            is_key_file: false,
426            is_community: false,
427            facets: vec![],
428            tallies: None,
429        };
430        file_info.backfill_license_provenance();
431        file_info
432    }
433
434    pub fn backfill_license_provenance(&mut self) {
435        for detection in &mut self.license_detections {
436            enrich_license_detection_provenance(detection, &self.path);
437        }
438
439        for package in &mut self.package_data {
440            enrich_package_data_license_provenance(package, &self.path);
441        }
442    }
443}
444
445fn enrich_package_data_license_provenance(package_data: &mut PackageData, path: &str) {
446    for detection in &mut package_data.license_detections {
447        enrich_license_detection_provenance(detection, path);
448    }
449    for detection in &mut package_data.other_license_detections {
450        enrich_license_detection_provenance(detection, path);
451    }
452}
453
454pub(crate) fn enrich_license_detection_provenance(detection: &mut LicenseDetection, path: &str) {
455    for detection_match in &mut detection.matches {
456        if detection_match.from_file.is_none() {
457            detection_match.from_file = Some(path.to_string());
458        }
459    }
460
461    if detection.identifier.is_none() {
462        detection.identifier = Some(compute_public_detection_identifier(detection));
463    }
464}
465
466fn compute_public_detection_identifier(detection: &LicenseDetection) -> String {
467    let expression = python_safe_name(&detection.license_expression);
468    let mut hasher = Sha1::new();
469    hasher.update(format_public_detection_content(detection).as_bytes());
470    let hex_str = hex::encode(hasher.finalize());
471    let uuid_hex = &hex_str[..32];
472    let content_uuid = uuid::Uuid::parse_str(uuid_hex)
473        .map(|uuid| uuid.to_string())
474        .unwrap_or_else(|_| uuid_hex.to_string());
475
476    format!("{}-{}", expression, content_uuid)
477}
478
479fn format_public_detection_content(detection: &LicenseDetection) -> String {
480    let mut result = String::from("(");
481
482    for (index, detection_match) in detection.matches.iter().enumerate() {
483        if index > 0 {
484            result.push_str(", ");
485        }
486        result.push_str(&format!(
487            "({}, {}, {})",
488            python_str_repr(
489                detection_match
490                    .rule_identifier
491                    .as_deref()
492                    .or(detection_match.matcher.as_deref())
493                    .unwrap_or("parser-declared-license")
494            ),
495            detection_match.score as f32,
496            python_token_tuple_repr(&tokenize_without_stopwords(
497                detection_match.matched_text.as_deref().unwrap_or_default(),
498            )),
499        ));
500    }
501
502    if detection.matches.len() == 1 {
503        result.push(',');
504    }
505    result.push(')');
506    result
507}
508
/// Reduce `value` to its alphanumeric words joined by single underscores.
///
/// Every run of non-alphanumeric characters becomes one `_`, and no
/// underscore is left at either end. Input without any alphanumeric
/// character yields an empty string. Letter case is preserved.
fn python_safe_name(value: &str) -> String {
    value
        .split(|character: char| !character.is_alphanumeric())
        .filter(|word| !word.is_empty())
        .collect::<Vec<_>>()
        .join("_")
}
530
/// Quote `value` the way a Python string literal would be shown.
///
/// Single quotes are used unless the text contains a `'` and no `"`, in
/// which case double quotes are used. Backslashes and occurrences of the
/// chosen quote character are backslash-escaped; nothing else is escaped.
fn python_str_repr(value: &str) -> String {
    let quote = if value.contains('\'') && !value.contains('"') {
        '"'
    } else {
        '\''
    };

    let mut repr = String::with_capacity(value.len() + 2);
    repr.push(quote);
    for character in value.chars() {
        if character == '\\' || character == quote {
            repr.push('\\');
        }
        repr.push(character);
    }
    repr.push(quote);
    repr
}
538
539fn python_token_tuple_repr(tokens: &[String]) -> String {
540    if tokens.is_empty() {
541        return String::from("()");
542    }
543
544    let mut result = String::from("(");
545    for (index, token) in tokens.iter().enumerate() {
546        if index > 0 {
547            result.push_str(", ");
548        }
549        result.push_str(&python_str_repr(token));
550    }
551
552    if tokens.len() == 1 {
553        result.push(',');
554    }
555    result.push(')');
556    result
557}
558
/// Package metadata extracted from manifest files.
///
/// Compatible with ScanCode Toolkit output format. Contains standardized package
/// information including name, version, dependencies, licenses, and other metadata.
/// This is the primary data structure returned by all parsers.
///
/// Every field is optional or has a default, and the type derives `Default`,
/// so a record can be created empty and filled in incrementally.
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
pub struct PackageData {
    /// Exchanged under the JSON key `type`.
    #[serde(rename = "type")] // name used by ScanCode
    pub package_type: Option<PackageType>,
    pub namespace: Option<String>,
    pub name: Option<String>,
    pub version: Option<String>,
    /// Serialized as an empty object rather than `null` when `None`.
    #[serde(default, serialize_with = "serialize_optional_map_as_object")]
    pub qualifiers: Option<HashMap<String, String>>,
    pub subpath: Option<String>,
    pub primary_language: Option<String>,
    pub description: Option<String>,
    pub release_date: Option<String>,
    #[serde(default)]
    pub parties: Vec<Party>,
    #[serde(default)]
    pub keywords: Vec<String>,
    pub homepage_url: Option<String>,
    pub download_url: Option<String>,
    pub size: Option<u64>,
    pub sha1: Option<String>,
    pub md5: Option<String>,
    pub sha256: Option<String>,
    pub sha512: Option<String>,
    pub bug_tracking_url: Option<String>,
    pub code_view_url: Option<String>,
    pub vcs_url: Option<String>,
    pub copyright: Option<String>,
    pub holder: Option<String>,
    pub declared_license_expression: Option<String>,
    pub declared_license_expression_spdx: Option<String>,
    /// Detections that `get_license_expression` combines into one expression.
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    pub other_license_expression: Option<String>,
    pub other_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub other_license_detections: Vec<LicenseDetection>,
    pub extracted_license_statement: Option<String>,
    pub notice_text: Option<String>,
    #[serde(default)]
    pub source_packages: Vec<String>,
    #[serde(default)]
    pub file_references: Vec<FileReference>,
    #[serde(default)]
    pub is_private: bool,
    #[serde(default)]
    pub is_virtual: bool,
    /// Serialized as an empty object rather than `null` when `None`.
    #[serde(default, serialize_with = "serialize_optional_map_as_object")]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
    #[serde(default)]
    pub dependencies: Vec<Dependency>,
    pub repository_homepage_url: Option<String>,
    pub repository_download_url: Option<String>,
    pub api_data_url: Option<String>,
    pub datasource_id: Option<DatasourceId>,
    pub purl: Option<String>,
}
621
622fn serialize_optional_map_as_object<S, T>(
623    value: &Option<HashMap<String, T>>,
624    serializer: S,
625) -> Result<S::Ok, S::Error>
626where
627    S: Serializer,
628    T: Serialize,
629{
630    match value {
631        Some(map) => map.serialize(serializer),
632        None => HashMap::<String, T>::new().serialize(serializer),
633    }
634}
635
636impl PackageData {
637    /// Extracts a single license expression from all license detections in this package.
638    /// Returns None if there are no license detections.
639    pub fn get_license_expression(&self) -> Option<String> {
640        if self.license_detections.is_empty() {
641            return None;
642        }
643
644        let expressions = self
645            .license_detections
646            .iter()
647            .map(|detection| detection.license_expression.clone());
648        combine_license_expressions(expressions)
649    }
650}
651
/// License detection result containing matched license expressions.
///
/// Aggregates multiple license matches into a single SPDX license expression.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct LicenseDetection {
    pub license_expression: String,
    pub license_expression_spdx: String,
    /// The individual matches that make up this detection.
    pub matches: Vec<Match>,
    /// Omitted from serialized output when empty; defaults to empty on input.
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub detection_log: Vec<String>,
    /// Omitted from serialized output when `None`. When missing,
    /// `enrich_license_detection_provenance` computes and stores one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub identifier: Option<String>,
}
665
/// Individual license text match with location and confidence score.
///
/// Represents a specific region of text that matched a known license pattern.
/// Optional fields marked `skip_serializing_if` are left out of the output
/// when `None`.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct Match {
    pub license_expression: String,
    pub license_expression_spdx: String,
    /// Path of the file the match is attributed to. When missing,
    /// `enrich_license_detection_provenance` sets it to the path it is given.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub from_file: Option<String>,
    pub start_line: usize,
    pub end_line: usize,
    /// Used as the match label in detection identifiers when
    /// `rule_identifier` is `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub matcher: Option<String>,
    pub score: f64,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub matched_length: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub match_coverage: Option<f64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub rule_relevance: Option<usize>,
    /// Preferred match label in detection identifiers.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub rule_identifier: Option<String>,
    // Unlike its neighbours, this field has no `skip_serializing_if`.
    pub rule_url: Option<String>,
    /// Text that is tokenized when computing a detection identifier.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub matched_text: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub matched_text_diagnostics: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub referenced_filenames: Option<Vec<String>>,
}
696
/// A detected copyright statement together with the line span it was found on.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Copyright {
    /// The copyright statement text.
    pub copyright: String,
    pub start_line: usize,
    pub end_line: usize,
}
703
/// A detected copyright holder together with the line span it was found on.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Holder {
    /// The holder text.
    pub holder: String,
    pub start_line: usize,
    pub end_line: usize,
}
710
/// A detected author together with the line span it was found on.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Author {
    /// The author text.
    pub author: String,
    pub start_line: usize,
    pub end_line: usize,
}
717
/// Package dependency information with version constraints.
///
/// Represents a declared dependency with scope (e.g., runtime, dev, optional)
/// and optional resolved package details.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Dependency {
    pub purl: Option<String>,
    pub extracted_requirement: Option<String>,
    pub scope: Option<String>,
    pub is_runtime: Option<bool>,
    pub is_optional: Option<bool>,
    pub is_pinned: Option<bool>,
    pub is_direct: Option<bool>,
    /// Boxed because `ResolvedPackage` itself holds a list of `Dependency`
    /// values, which makes the two types mutually recursive.
    pub resolved_package: Option<Box<ResolvedPackage>>,
    /// Serialized as an empty object rather than `null` when `None`.
    #[serde(default, serialize_with = "serialize_optional_map_as_object")]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
}
735
/// Package details attached to a [`Dependency`] as its resolved package.
///
/// Carries the same set of fields as [`PackageData`], except that the type,
/// namespace, name and version are required here instead of optional.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ResolvedPackage {
    /// Exchanged under the JSON key `type`.
    #[serde(rename = "type")]
    pub package_type: PackageType,
    pub namespace: String,
    pub name: String,
    pub version: String,
    /// Serialized as an empty object rather than `null` when `None`.
    #[serde(default, serialize_with = "serialize_optional_map_as_object")]
    pub qualifiers: Option<HashMap<String, String>>,
    pub subpath: Option<String>,
    pub primary_language: Option<String>,
    pub description: Option<String>,
    pub release_date: Option<String>,
    #[serde(default)]
    pub parties: Vec<Party>,
    #[serde(default)]
    pub keywords: Vec<String>,
    pub homepage_url: Option<String>,
    pub download_url: Option<String>,
    pub size: Option<u64>,
    pub sha1: Option<String>,
    pub md5: Option<String>,
    pub sha256: Option<String>,
    pub sha512: Option<String>,
    pub bug_tracking_url: Option<String>,
    pub code_view_url: Option<String>,
    pub vcs_url: Option<String>,
    pub copyright: Option<String>,
    pub holder: Option<String>,
    pub declared_license_expression: Option<String>,
    pub declared_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub license_detections: Vec<LicenseDetection>,
    pub other_license_expression: Option<String>,
    pub other_license_expression_spdx: Option<String>,
    #[serde(default)]
    pub other_license_detections: Vec<LicenseDetection>,
    pub extracted_license_statement: Option<String>,
    pub notice_text: Option<String>,
    #[serde(default)]
    pub source_packages: Vec<String>,
    #[serde(default)]
    pub file_references: Vec<FileReference>,
    #[serde(default)]
    pub is_private: bool,
    #[serde(default)]
    pub is_virtual: bool,
    /// Serialized as an empty object rather than `null` when `None`.
    #[serde(default, serialize_with = "serialize_optional_map_as_object")]
    pub extra_data: Option<HashMap<String, serde_json::Value>>,
    #[serde(default)]
    pub dependencies: Vec<Dependency>,
    pub repository_homepage_url: Option<String>,
    pub repository_download_url: Option<String>,
    pub api_data_url: Option<String>,
    pub datasource_id: Option<DatasourceId>,
    pub purl: Option<String>,
}
793
794impl ResolvedPackage {
795    pub fn new(
796        package_type: PackageType,
797        namespace: String,
798        name: String,
799        version: String,
800    ) -> Self {
801        Self {
802            package_type,
803            namespace,
804            name,
805            version,
806            qualifiers: None,
807            subpath: None,
808            primary_language: None,
809            description: None,
810            release_date: None,
811            parties: vec![],
812            keywords: vec![],
813            homepage_url: None,
814            download_url: None,
815            size: None,
816            sha1: None,
817            md5: None,
818            sha256: None,
819            sha512: None,
820            bug_tracking_url: None,
821            code_view_url: None,
822            vcs_url: None,
823            copyright: None,
824            holder: None,
825            declared_license_expression: None,
826            declared_license_expression_spdx: None,
827            license_detections: vec![],
828            other_license_expression: None,
829            other_license_expression_spdx: None,
830            other_license_detections: vec![],
831            extracted_license_statement: None,
832            notice_text: None,
833            source_packages: vec![],
834            file_references: vec![],
835            is_private: false,
836            is_virtual: false,
837            extra_data: None,
838            dependencies: vec![],
839            repository_homepage_url: None,
840            repository_download_url: None,
841            api_data_url: None,
842            datasource_id: None,
843            purl: None,
844        }
845    }
846
847    pub fn from_package_data(package_data: &PackageData, fallback_type: PackageType) -> Self {
848        Self {
849            package_type: package_data.package_type.unwrap_or(fallback_type),
850            namespace: package_data.namespace.clone().unwrap_or_default(),
851            name: package_data.name.clone().unwrap_or_default(),
852            version: package_data.version.clone().unwrap_or_default(),
853            qualifiers: package_data.qualifiers.clone(),
854            subpath: package_data.subpath.clone(),
855            primary_language: package_data.primary_language.clone(),
856            description: package_data.description.clone(),
857            release_date: package_data.release_date.clone(),
858            parties: package_data.parties.clone(),
859            keywords: package_data.keywords.clone(),
860            homepage_url: package_data.homepage_url.clone(),
861            download_url: package_data.download_url.clone(),
862            size: package_data.size,
863            sha1: package_data.sha1.clone(),
864            md5: package_data.md5.clone(),
865            sha256: package_data.sha256.clone(),
866            sha512: package_data.sha512.clone(),
867            bug_tracking_url: package_data.bug_tracking_url.clone(),
868            code_view_url: package_data.code_view_url.clone(),
869            vcs_url: package_data.vcs_url.clone(),
870            copyright: package_data.copyright.clone(),
871            holder: package_data.holder.clone(),
872            declared_license_expression: package_data.declared_license_expression.clone(),
873            declared_license_expression_spdx: package_data.declared_license_expression_spdx.clone(),
874            license_detections: package_data.license_detections.clone(),
875            other_license_expression: package_data.other_license_expression.clone(),
876            other_license_expression_spdx: package_data.other_license_expression_spdx.clone(),
877            other_license_detections: package_data.other_license_detections.clone(),
878            extracted_license_statement: package_data.extracted_license_statement.clone(),
879            notice_text: package_data.notice_text.clone(),
880            source_packages: package_data.source_packages.clone(),
881            file_references: package_data.file_references.clone(),
882            is_private: package_data.is_private,
883            is_virtual: package_data.is_virtual,
884            extra_data: package_data.extra_data.clone(),
885            dependencies: package_data.dependencies.clone(),
886            repository_homepage_url: package_data.repository_homepage_url.clone(),
887            repository_download_url: package_data.repository_download_url.clone(),
888            api_data_url: package_data.api_data_url.clone(),
889            datasource_id: package_data.datasource_id,
890            purl: package_data.purl.clone(),
891        }
892    }
893}
894
895/// Author, maintainer, or contributor information.
896///
897/// Represents a person or organization associated with a package.
898#[derive(Serialize, Deserialize, Debug, Clone)]
899pub struct Party {
900    #[serde(skip_serializing_if = "Option::is_none")]
901    pub r#type: Option<String>,
902    #[serde(skip_serializing_if = "Option::is_none")]
903    pub role: Option<String>,
904    #[serde(skip_serializing_if = "Option::is_none")]
905    pub name: Option<String>,
906    #[serde(skip_serializing_if = "Option::is_none")]
907    pub email: Option<String>,
908    #[serde(skip_serializing_if = "Option::is_none")]
909    pub url: Option<String>,
910    #[serde(skip_serializing_if = "Option::is_none")]
911    pub organization: Option<String>,
912    #[serde(skip_serializing_if = "Option::is_none")]
913    pub organization_url: Option<String>,
914    #[serde(skip_serializing_if = "Option::is_none")]
915    pub timezone: Option<String>,
916}
917
918/// Reference to a file within a package archive with checksums.
919///
920/// Used in SBOM generation to track files within distribution archives.
921#[derive(Serialize, Deserialize, Debug, Clone)]
922pub struct FileReference {
923    pub path: String,
924    #[serde(skip_serializing_if = "Option::is_none")]
925    pub size: Option<u64>,
926    #[serde(skip_serializing_if = "Option::is_none")]
927    pub sha1: Option<String>,
928    #[serde(skip_serializing_if = "Option::is_none")]
929    pub md5: Option<String>,
930    #[serde(skip_serializing_if = "Option::is_none")]
931    pub sha256: Option<String>,
932    #[serde(skip_serializing_if = "Option::is_none")]
933    pub sha512: Option<String>,
934    #[serde(skip_serializing_if = "Option::is_none")]
935    pub extra_data: Option<std::collections::HashMap<String, serde_json::Value>>,
936}
937
938/// Top-level assembled package, created by merging one or more `PackageData`
939/// objects from related manifest/lockfiles (e.g., package.json + package-lock.json).
940///
941/// Compatible with ScanCode Toolkit output format. The key differences from
942/// `PackageData` are:
943/// - `package_uid`: unique identifier (PURL with UUID qualifier)
944/// - `datafile_paths`: list of all contributing files
945/// - `datasource_ids`: list of all contributing parsers
946/// - Excludes `dependencies` and `file_references` (hoisted to top-level)
947#[derive(Serialize, Deserialize, Debug, Clone)]
948pub struct Package {
949    #[serde(rename = "type")]
950    pub package_type: Option<PackageType>,
951    pub namespace: Option<String>,
952    pub name: Option<String>,
953    pub version: Option<String>,
954    #[serde(default, serialize_with = "serialize_optional_map_as_object")]
955    pub qualifiers: Option<HashMap<String, String>>,
956    pub subpath: Option<String>,
957    pub primary_language: Option<String>,
958    pub description: Option<String>,
959    pub release_date: Option<String>,
960    #[serde(default)]
961    pub parties: Vec<Party>,
962    #[serde(default)]
963    pub keywords: Vec<String>,
964    pub homepage_url: Option<String>,
965    pub download_url: Option<String>,
966    pub size: Option<u64>,
967    pub sha1: Option<String>,
968    pub md5: Option<String>,
969    pub sha256: Option<String>,
970    pub sha512: Option<String>,
971    pub bug_tracking_url: Option<String>,
972    pub code_view_url: Option<String>,
973    pub vcs_url: Option<String>,
974    pub copyright: Option<String>,
975    pub holder: Option<String>,
976    pub declared_license_expression: Option<String>,
977    pub declared_license_expression_spdx: Option<String>,
978    #[serde(default)]
979    pub license_detections: Vec<LicenseDetection>,
980    pub other_license_expression: Option<String>,
981    pub other_license_expression_spdx: Option<String>,
982    #[serde(default)]
983    pub other_license_detections: Vec<LicenseDetection>,
984    pub extracted_license_statement: Option<String>,
985    pub notice_text: Option<String>,
986    #[serde(default)]
987    pub source_packages: Vec<String>,
988    #[serde(default)]
989    pub is_private: bool,
990    #[serde(default)]
991    pub is_virtual: bool,
992    #[serde(default, serialize_with = "serialize_optional_map_as_object")]
993    pub extra_data: Option<HashMap<String, serde_json::Value>>,
994    pub repository_homepage_url: Option<String>,
995    pub repository_download_url: Option<String>,
996    pub api_data_url: Option<String>,
997    pub purl: Option<String>,
998    /// Unique identifier for this package instance (PURL with UUID qualifier).
999    pub package_uid: String,
1000    /// Paths to all datafiles that contributed to this package.
1001    pub datafile_paths: Vec<String>,
1002    /// Datasource identifiers for all parsers that contributed to this package.
1003    pub datasource_ids: Vec<DatasourceId>,
1004}
1005
1006impl Package {
1007    /// Create a `Package` from a `PackageData` and its source file path.
1008    ///
1009    /// Generates a unique `package_uid` by appending a UUID qualifier to the PURL.
1010    /// If the `PackageData` has no PURL, the package_uid will be an empty string.
1011    pub fn from_package_data(package_data: &PackageData, datafile_path: String) -> Self {
1012        let mut package_data = package_data.clone();
1013        enrich_package_data_license_provenance(&mut package_data, &datafile_path);
1014
1015        let package_uid = package_data
1016            .purl
1017            .as_ref()
1018            .map(|p| build_package_uid(p))
1019            .unwrap_or_default();
1020
1021        Package {
1022            package_type: package_data.package_type,
1023            namespace: package_data.namespace.clone(),
1024            name: package_data.name.clone(),
1025            version: package_data.version.clone(),
1026            qualifiers: package_data.qualifiers.clone(),
1027            subpath: package_data.subpath.clone(),
1028            primary_language: package_data.primary_language.clone(),
1029            description: package_data.description.clone(),
1030            release_date: package_data.release_date.clone(),
1031            parties: package_data.parties.clone(),
1032            keywords: package_data.keywords.clone(),
1033            homepage_url: package_data.homepage_url.clone(),
1034            download_url: package_data.download_url.clone(),
1035            size: package_data.size,
1036            sha1: package_data.sha1.clone(),
1037            md5: package_data.md5.clone(),
1038            sha256: package_data.sha256.clone(),
1039            sha512: package_data.sha512.clone(),
1040            bug_tracking_url: package_data.bug_tracking_url.clone(),
1041            code_view_url: package_data.code_view_url.clone(),
1042            vcs_url: package_data.vcs_url.clone(),
1043            copyright: package_data.copyright.clone(),
1044            holder: package_data.holder.clone(),
1045            declared_license_expression: package_data.declared_license_expression.clone(),
1046            declared_license_expression_spdx: package_data.declared_license_expression_spdx.clone(),
1047            license_detections: package_data.license_detections.clone(),
1048            other_license_expression: package_data.other_license_expression.clone(),
1049            other_license_expression_spdx: package_data.other_license_expression_spdx.clone(),
1050            other_license_detections: package_data.other_license_detections.clone(),
1051            extracted_license_statement: package_data.extracted_license_statement.clone(),
1052            notice_text: package_data.notice_text.clone(),
1053            source_packages: package_data.source_packages.clone(),
1054            is_private: package_data.is_private,
1055            is_virtual: package_data.is_virtual,
1056            extra_data: package_data.extra_data.clone(),
1057            repository_homepage_url: package_data.repository_homepage_url.clone(),
1058            repository_download_url: package_data.repository_download_url.clone(),
1059            api_data_url: package_data.api_data_url.clone(),
1060            purl: package_data.purl.clone(),
1061            package_uid,
1062            datafile_paths: vec![datafile_path],
1063            datasource_ids: if let Some(dsid) = package_data.datasource_id {
1064                vec![dsid]
1065            } else {
1066                vec![]
1067            },
1068        }
1069    }
1070
1071    /// Update this package with data from another `PackageData`.
1072    ///
1073    /// Merges data from a related file (e.g., lockfile) into this package.
1074    /// Existing non-empty values are preserved; empty fields are filled from
1075    /// the new data. Lists (parties, license_detections) are merged.
1076    pub fn update(&mut self, package_data: &PackageData, datafile_path: String) {
1077        let mut package_data = package_data.clone();
1078        enrich_package_data_license_provenance(&mut package_data, &datafile_path);
1079
1080        if let Some(dsid) = package_data.datasource_id {
1081            self.datasource_ids.push(dsid);
1082        }
1083        self.datafile_paths.push(datafile_path);
1084
1085        macro_rules! fill_if_empty {
1086            ($field:ident) => {
1087                if self.$field.is_none() {
1088                    self.$field = package_data.$field.clone();
1089                }
1090            };
1091        }
1092
1093        fill_if_empty!(package_type);
1094        fill_if_empty!(name);
1095        fill_if_empty!(namespace);
1096        fill_if_empty!(version);
1097        fill_if_empty!(qualifiers);
1098        fill_if_empty!(subpath);
1099        fill_if_empty!(primary_language);
1100        fill_if_empty!(description);
1101        fill_if_empty!(release_date);
1102        fill_if_empty!(homepage_url);
1103        fill_if_empty!(download_url);
1104        fill_if_empty!(size);
1105        fill_if_empty!(sha1);
1106        fill_if_empty!(md5);
1107        fill_if_empty!(sha256);
1108        fill_if_empty!(sha512);
1109        fill_if_empty!(bug_tracking_url);
1110        fill_if_empty!(code_view_url);
1111        fill_if_empty!(vcs_url);
1112        fill_if_empty!(copyright);
1113        fill_if_empty!(holder);
1114        fill_if_empty!(declared_license_expression);
1115        fill_if_empty!(declared_license_expression_spdx);
1116        fill_if_empty!(other_license_expression);
1117        fill_if_empty!(other_license_expression_spdx);
1118        fill_if_empty!(extracted_license_statement);
1119        fill_if_empty!(notice_text);
1120        match (&mut self.extra_data, &package_data.extra_data) {
1121            (None, Some(extra_data)) => {
1122                self.extra_data = Some(extra_data.clone());
1123            }
1124            (Some(existing), Some(incoming)) => {
1125                for (key, value) in incoming {
1126                    existing.entry(key.clone()).or_insert_with(|| value.clone());
1127                }
1128            }
1129            _ => {}
1130        }
1131        fill_if_empty!(repository_homepage_url);
1132        fill_if_empty!(repository_download_url);
1133        fill_if_empty!(api_data_url);
1134
1135        for party in &package_data.parties {
1136            if let Some(existing) = self.parties.iter_mut().find(|p| {
1137                p.role == party.role
1138                    && ((p.name.is_some() && p.name == party.name)
1139                        || (p.email.is_some() && p.email == party.email))
1140            }) {
1141                if existing.name.is_none() {
1142                    existing.name = party.name.clone();
1143                }
1144                if existing.email.is_none() {
1145                    existing.email = party.email.clone();
1146                }
1147            } else {
1148                self.parties.push(party.clone());
1149            }
1150        }
1151
1152        for keyword in &package_data.keywords {
1153            if !self.keywords.contains(keyword) {
1154                self.keywords.push(keyword.clone());
1155            }
1156        }
1157
1158        for detection in &package_data.license_detections {
1159            self.license_detections.push(detection.clone());
1160        }
1161
1162        for detection in &package_data.other_license_detections {
1163            self.other_license_detections.push(detection.clone());
1164        }
1165
1166        for source_pkg in &package_data.source_packages {
1167            if !self.source_packages.contains(source_pkg) {
1168                self.source_packages.push(source_pkg.clone());
1169            }
1170        }
1171
1172        self.refresh_identity();
1173    }
1174
1175    pub fn backfill_license_provenance(&mut self) {
1176        let Some(datafile_path) = self.datafile_paths.first().cloned() else {
1177            return;
1178        };
1179
1180        for detection in &mut self.license_detections {
1181            enrich_license_detection_provenance(detection, &datafile_path);
1182        }
1183        for detection in &mut self.other_license_detections {
1184            enrich_license_detection_provenance(detection, &datafile_path);
1185        }
1186    }
1187
1188    fn refresh_identity(&mut self) {
1189        let Some(next_purl) = self.build_current_purl() else {
1190            return;
1191        };
1192
1193        if self.purl.as_deref() != Some(next_purl.as_str()) || self.package_uid.is_empty() {
1194            self.package_uid = build_package_uid(&next_purl);
1195        }
1196
1197        self.purl = Some(next_purl);
1198    }
1199
1200    fn build_current_purl(&self) -> Option<String> {
1201        if let (Some(package_type), Some(name)) = (
1202            self.package_type.as_ref(),
1203            self.name
1204                .as_deref()
1205                .filter(|value| !value.trim().is_empty()),
1206        ) {
1207            let purl_type = match package_type {
1208                PackageType::Deno => "generic",
1209                _ => package_type.as_str(),
1210            };
1211
1212            let mut purl = PackageUrl::new(purl_type, name).ok()?;
1213
1214            if let Some(namespace) = self
1215                .namespace
1216                .as_deref()
1217                .filter(|value| !value.trim().is_empty())
1218            {
1219                purl.with_namespace(namespace).ok()?;
1220            }
1221
1222            if let Some(version) = self
1223                .version
1224                .as_deref()
1225                .filter(|value| !value.trim().is_empty())
1226            {
1227                purl.with_version(version).ok()?;
1228            }
1229
1230            if let Some(qualifiers) = &self.qualifiers {
1231                for (key, value) in qualifiers {
1232                    purl.add_qualifier(key.as_str(), value.as_str()).ok()?;
1233                }
1234            }
1235
1236            if let Some(subpath) = self
1237                .subpath
1238                .as_deref()
1239                .filter(|value| !value.trim().is_empty())
1240            {
1241                purl.with_subpath(subpath).ok()?;
1242            }
1243
1244            return Some(purl.to_string());
1245        }
1246
1247        let existing_purl = self.purl.as_deref()?;
1248        let mut purl = PackageUrl::from_str(existing_purl).ok()?;
1249
1250        if let Some(version) = self
1251            .version
1252            .as_deref()
1253            .filter(|value| !value.trim().is_empty())
1254        {
1255            purl.with_version(version).ok()?;
1256        } else {
1257            purl.without_version();
1258        }
1259
1260        Some(purl.to_string())
1261    }
1262}
1263
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Path used as the datafile location throughout these tests.
    const PACKAGE_JSON_PATH: &str = "project/package.json";

    /// Fail unless `value` is a JSON object that contains `key`.
    fn assert_has_key(value: &Value, key: &str) {
        let present = value
            .as_object()
            .is_some_and(|object| object.contains_key(key));
        assert!(present, "missing key {key} in {value:#?}");
    }

    /// Assert that indexing `value` with each of `keys` yields `expected`.
    fn assert_each_eq(value: &Value, keys: &[&str], expected: &Value) {
        for key in keys {
            assert_eq!(&value[*key], expected, "unexpected value for key {key}");
        }
    }

    /// npm `PackageData` with one parser-declared MIT detection that has no
    /// `from_file` and no `identifier` yet.
    fn npm_package_data_with_mit_detection() -> PackageData {
        let mit_match = Match {
            license_expression: "mit".to_string(),
            license_expression_spdx: "MIT".to_string(),
            from_file: None,
            start_line: 1,
            end_line: 1,
            matcher: Some("parser-declared-license".to_string()),
            score: 100.0,
            matched_length: Some(1),
            match_coverage: Some(100.0),
            rule_relevance: Some(100),
            rule_identifier: None,
            rule_url: None,
            matched_text: Some("MIT".to_string()),
            referenced_filenames: None,
            matched_text_diagnostics: None,
        };

        PackageData {
            package_type: Some(PackageType::Npm),
            license_detections: vec![LicenseDetection {
                license_expression: "mit".to_string(),
                license_expression_spdx: "MIT".to_string(),
                matches: vec![mit_match],
                detection_log: vec![],
                identifier: None,
            }],
            ..PackageData::default()
        }
    }

    /// Minimal npm `PackageData` for `left-pad` coming from a package.json.
    fn left_pad_package_data() -> PackageData {
        PackageData {
            package_type: Some(PackageType::Npm),
            name: Some("left-pad".to_string()),
            datasource_id: Some(DatasourceId::NpmPackageJson),
            ..PackageData::default()
        }
    }

    #[test]
    fn file_info_new_backfills_package_detection_provenance() {
        let file_info = FileInfo::new(
            "package.json".to_string(),
            "package".to_string(),
            ".json".to_string(),
            PACKAGE_JSON_PATH.to_string(),
            FileType::File,
            None,
            None,
            1,
            None,
            None,
            None,
            None,
            None,
            vec![npm_package_data_with_mit_detection()],
            None,
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
            vec![],
        );

        // The detection is surfaced on the file itself ...
        assert_eq!(file_info.license_detections.len(), 1);
        let file_detection = &file_info.license_detections[0];
        assert_eq!(
            file_detection.matches[0].from_file.as_deref(),
            Some(PACKAGE_JSON_PATH)
        );
        assert!(file_detection.identifier.is_some());

        // ... and the copy kept inside the package data is enriched as well.
        let package_detection = &file_info.package_data[0].license_detections[0];
        assert_eq!(
            package_detection.matches[0].from_file.as_deref(),
            Some(PACKAGE_JSON_PATH)
        );
        assert!(package_detection.identifier.is_some());
    }

    #[test]
    fn package_from_package_data_backfills_detection_provenance() {
        let package = Package::from_package_data(
            &npm_package_data_with_mit_detection(),
            PACKAGE_JSON_PATH.to_string(),
        );

        let detection = &package.license_detections[0];
        assert_eq!(
            detection.matches[0].from_file.as_deref(),
            Some(PACKAGE_JSON_PATH)
        );
        assert!(detection.identifier.is_some());
    }

    #[test]
    fn package_data_serialization_includes_scancode_style_defaults() {
        let value = serde_json::to_value(left_pad_package_data())
            .expect("package data should serialize");

        assert_has_key(&value, "namespace");
        assert_has_key(&value, "datasource_id");
        assert_each_eq(
            &value,
            &[
                "namespace",
                "subpath",
                "description",
                "repository_homepage_url",
                "repository_download_url",
                "api_data_url",
                "purl",
            ],
            &Value::Null,
        );
        assert_each_eq(&value, &["qualifiers", "extra_data"], &json!({}));
        assert_each_eq(
            &value,
            &[
                "license_detections",
                "other_license_detections",
                "source_packages",
                "file_references",
            ],
            &json!([]),
        );
        assert_each_eq(&value, &["is_private", "is_virtual"], &json!(false));
    }

    #[test]
    fn package_serialization_includes_scancode_style_defaults() {
        let package =
            Package::from_package_data(&left_pad_package_data(), PACKAGE_JSON_PATH.to_string());

        let value = serde_json::to_value(&package).expect("package should serialize");

        assert_each_eq(
            &value,
            &[
                "namespace",
                "subpath",
                "repository_homepage_url",
                "repository_download_url",
                "api_data_url",
                "purl",
            ],
            &Value::Null,
        );
        assert_each_eq(&value, &["qualifiers", "extra_data"], &json!({}));
        assert_each_eq(
            &value,
            &[
                "keywords",
                "license_detections",
                "other_license_detections",
                "source_packages",
            ],
            &json!([]),
        );
        assert_each_eq(&value, &["is_private", "is_virtual"], &json!(false));
    }

    #[test]
    fn dependency_shapes_serialize_with_explicit_nulls_and_empty_objects() {
        let dependency = Dependency {
            purl: None,
            extracted_requirement: None,
            scope: None,
            is_runtime: None,
            is_optional: None,
            is_pinned: None,
            is_direct: None,
            resolved_package: None,
            extra_data: None,
        };

        let dependency_value =
            serde_json::to_value(&dependency).expect("dependency should serialize");
        assert_each_eq(
            &dependency_value,
            &[
                "extracted_requirement",
                "is_runtime",
                "is_optional",
                "is_pinned",
                "is_direct",
                "resolved_package",
            ],
            &Value::Null,
        );
        assert_eq!(dependency_value["extra_data"], json!({}));

        let top_level = TopLevelDependency::from_dependency(
            &dependency,
            "project/package-lock.json".to_string(),
            DatasourceId::NpmPackageLockJson,
            None,
        );
        let top_level_value =
            serde_json::to_value(&top_level).expect("top-level dependency should serialize");

        assert_each_eq(
            &top_level_value,
            &["resolved_package", "for_package_uid", "namespace"],
            &Value::Null,
        );
        assert_eq!(top_level_value["extra_data"], json!({}));
    }

    #[test]
    fn nested_resolved_package_serialization_uses_full_package_shape() {
        let left_pad = ResolvedPackage {
            primary_language: Some("JavaScript".to_string()),
            datasource_id: Some(DatasourceId::NpmPackageLockJson),
            purl: Some("pkg:npm/left-pad@1.3.0".to_string()),
            ..ResolvedPackage::new(
                PackageType::Npm,
                String::new(),
                "left-pad".to_string(),
                "1.3.0".to_string(),
            )
        };
        let dependency = Dependency {
            purl: Some("pkg:npm/left-pad@1.3.0".to_string()),
            extracted_requirement: Some("1.3.0".to_string()),
            scope: Some("dependencies".to_string()),
            is_runtime: Some(true),
            is_optional: Some(false),
            is_pinned: Some(true),
            is_direct: Some(true),
            resolved_package: Some(Box::new(left_pad)),
            extra_data: None,
        };

        let value = serde_json::to_value(&dependency).expect("dependency should serialize");
        let resolved_package = &value["resolved_package"];

        assert_eq!(resolved_package["namespace"], json!(""));
        assert_each_eq(resolved_package, &["qualifiers", "extra_data"], &json!({}));
        assert_each_eq(
            resolved_package,
            &[
                "subpath",
                "description",
                "repository_homepage_url",
                "repository_download_url",
                "api_data_url",
            ],
            &Value::Null,
        );
        assert_each_eq(
            resolved_package,
            &[
                "license_detections",
                "other_license_detections",
                "source_packages",
                "file_references",
            ],
            &json!([]),
        );
        assert_each_eq(
            resolved_package,
            &["is_private", "is_virtual"],
            &json!(false),
        );
        assert_has_key(resolved_package, "datasource_id");
        assert_has_key(resolved_package, "purl");
    }
}
1541
1542/// Top-level dependency instance, created during package assembly.
1543///
1544/// Extends the file-level `Dependency` with traceability fields that link
1545/// each dependency to its owning package and source datafile.
1546#[derive(Serialize, Deserialize, Debug, Clone)]
1547pub struct TopLevelDependency {
1548    pub purl: Option<String>,
1549    pub extracted_requirement: Option<String>,
1550    pub scope: Option<String>,
1551    pub is_runtime: Option<bool>,
1552    pub is_optional: Option<bool>,
1553    pub is_pinned: Option<bool>,
1554    pub is_direct: Option<bool>,
1555    pub resolved_package: Option<Box<ResolvedPackage>>,
1556    #[serde(default, serialize_with = "serialize_optional_map_as_object")]
1557    pub extra_data: Option<HashMap<String, serde_json::Value>>,
1558    /// Unique identifier for this dependency instance (PURL with UUID qualifier).
1559    pub dependency_uid: String,
1560    /// The `package_uid` of the package this dependency belongs to.
1561    pub for_package_uid: Option<String>,
1562    /// Path to the datafile where this dependency was declared.
1563    pub datafile_path: String,
1564    /// Datasource identifier for the parser that extracted this dependency.
1565    pub datasource_id: DatasourceId,
1566    /// Namespace for the dependency (e.g., distribution name for RPM packages).
1567    pub namespace: Option<String>,
1568}
1569
1570impl TopLevelDependency {
1571    /// Create a `TopLevelDependency` from a file-level `Dependency`.
1572    pub fn from_dependency(
1573        dep: &Dependency,
1574        datafile_path: String,
1575        datasource_id: DatasourceId,
1576        for_package_uid: Option<String>,
1577    ) -> Self {
1578        let dependency_uid = dep
1579            .purl
1580            .as_ref()
1581            .map(|p| build_package_uid(p))
1582            .unwrap_or_default();
1583
1584        TopLevelDependency {
1585            purl: dep.purl.clone(),
1586            extracted_requirement: dep.extracted_requirement.clone(),
1587            scope: dep.scope.clone(),
1588            is_runtime: dep.is_runtime,
1589            is_optional: dep.is_optional,
1590            is_pinned: dep.is_pinned,
1591            is_direct: dep.is_direct,
1592            resolved_package: dep.resolved_package.clone(),
1593            extra_data: dep.extra_data.clone(),
1594            dependency_uid,
1595            for_package_uid,
1596            datafile_path,
1597            datasource_id,
1598            namespace: None,
1599        }
1600    }
1601}
1602
1603/// Generate a unique package identifier by appending a UUID v4 qualifier to a PURL.
1604///
1605/// The format matches Python ScanCode: `pkg:type/name@version?uuid=<uuid-v4>`
1606pub fn build_package_uid(purl: &str) -> String {
1607    let uuid = Uuid::new_v4();
1608    if purl.contains('?') {
1609        format!("{}&uuid={}", purl, uuid)
1610    } else {
1611        format!("{}?uuid={}", purl, uuid)
1612    }
1613}
1614
1615#[derive(Serialize, Deserialize, Debug, Clone)]
1616pub struct OutputEmail {
1617    pub email: String,
1618    pub start_line: usize,
1619    pub end_line: usize,
1620}
1621
1622#[derive(Serialize, Deserialize, Debug, Clone)]
1623pub struct OutputURL {
1624    pub url: String,
1625    pub start_line: usize,
1626    pub end_line: usize,
1627}
1628
1629#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
1630pub struct LicensePolicyEntry {
1631    pub license_key: String,
1632    pub label: String,
1633    pub color_code: String,
1634    pub icon: String,
1635}
1636
/// Kind of filesystem entry a scan result describes.
///
/// Serialized as the lowercase strings `"file"` and `"directory"` by the
/// hand-written `Serialize`/`Deserialize` impls for this type.
#[derive(Clone, Debug, PartialEq)]
pub enum FileType {
    /// A regular file entry (`"file"`).
    File,
    /// A directory entry (`"directory"`).
    Directory,
}
1642
1643impl Serialize for FileType {
1644    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
1645    where
1646        S: serde::Serializer,
1647    {
1648        let value = match self {
1649            FileType::File => "file",
1650            FileType::Directory => "directory",
1651        };
1652        serializer.serialize_str(value)
1653    }
1654}
1655
1656impl<'de> Deserialize<'de> for FileType {
1657    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
1658    where
1659        D: serde::Deserializer<'de>,
1660    {
1661        let value = String::deserialize(deserializer)?;
1662        match value.as_str() {
1663            "file" => Ok(FileType::File),
1664            "directory" => Ok(FileType::Directory),
1665            _ => Err(serde::de::Error::custom("invalid file type")),
1666        }
1667    }
1668}