Skip to main content

provenant/output_schema/
file_info.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use serde::{Deserialize, Serialize, Serializer};
5use serde_json::Map;
6
7use super::author::OutputAuthor;
8use super::copyright::OutputCopyright;
9use super::email::OutputEmail;
10use super::holder::OutputHolder;
11use super::license_detection::OutputLicenseDetection;
12use super::license_match::OutputMatch;
13use super::license_policy_entry::OutputLicensePolicyEntry;
14use super::package_data::OutputPackageData;
15use super::serde_helpers::insert_json;
16use super::tallies::OutputTallies;
17use super::url::OutputURL;
18
19#[derive(Debug, Clone, Deserialize)]
20pub struct OutputFileInfo {
21    #[serde(default)]
22    pub name: String,
23    #[serde(default)]
24    pub base_name: String,
25    #[serde(default)]
26    pub extension: String,
27    pub path: String,
28    #[serde(rename = "type")]
29    pub file_type: crate::models::FileType,
30    pub mime_type: Option<String>,
31    pub file_type_label: Option<String>,
32    #[serde(default)]
33    pub size: u64,
34    pub date: Option<String>,
35    pub sha1: Option<String>,
36    pub md5: Option<String>,
37    pub sha256: Option<String>,
38    pub sha1_git: Option<String>,
39    pub programming_language: Option<String>,
40    #[serde(default)]
41    pub package_data: Vec<OutputPackageData>,
42    #[serde(rename = "detected_license_expression_spdx")]
43    pub license_expression: Option<String>,
44    #[serde(default)]
45    pub license_detections: Vec<OutputLicenseDetection>,
46    #[serde(default, skip_serializing_if = "Vec::is_empty")]
47    pub license_clues: Vec<OutputMatch>,
48    pub percentage_of_license_text: Option<f64>,
49    #[serde(default)]
50    pub copyrights: Vec<OutputCopyright>,
51    #[serde(default)]
52    pub holders: Vec<OutputHolder>,
53    #[serde(default)]
54    pub authors: Vec<OutputAuthor>,
55    #[serde(default, skip_serializing_if = "Vec::is_empty")]
56    pub emails: Vec<OutputEmail>,
57    #[serde(default)]
58    pub urls: Vec<OutputURL>,
59    #[serde(default)]
60    pub for_packages: Vec<String>,
61    #[serde(default)]
62    pub scan_errors: Vec<String>,
63    pub license_policy: Option<Vec<OutputLicensePolicyEntry>>,
64    pub is_generated: Option<bool>,
65    pub is_binary: Option<bool>,
66    pub is_text: Option<bool>,
67    pub is_archive: Option<bool>,
68    pub is_media: Option<bool>,
69    pub is_source: Option<bool>,
70    pub is_script: Option<bool>,
71    pub files_count: Option<usize>,
72    pub dirs_count: Option<usize>,
73    pub size_count: Option<u64>,
74    pub source_count: Option<usize>,
75    #[serde(default, skip_serializing_if = "is_false")]
76    pub is_legal: bool,
77    #[serde(default, skip_serializing_if = "is_false")]
78    pub is_manifest: bool,
79    #[serde(default, skip_serializing_if = "is_false")]
80    pub is_readme: bool,
81    #[serde(default, skip_serializing_if = "is_false")]
82    pub is_top_level: bool,
83    #[serde(default, skip_serializing_if = "is_false")]
84    pub is_key_file: bool,
85    #[serde(default, skip_serializing_if = "is_false")]
86    pub is_community: bool,
87    #[serde(default, skip_serializing_if = "Vec::is_empty")]
88    pub facets: Vec<String>,
89    pub tallies: Option<OutputTallies>,
90}
91
92impl OutputFileInfo {
93    pub(crate) fn should_serialize_info_surface(&self) -> bool {
94        self.date.is_some()
95            || self.sha1.is_some()
96            || self.md5.is_some()
97            || self.sha256.is_some()
98            || self.sha1_git.is_some()
99            || self.mime_type.is_some()
100            || self.file_type_label.is_some()
101            || self.programming_language.is_some()
102            || self.is_binary.is_some()
103            || self.is_text.is_some()
104            || self.is_archive.is_some()
105            || self.is_media.is_some()
106            || self.is_source.is_some()
107            || self.is_script.is_some()
108            || self.files_count.is_some()
109            || self.dirs_count.is_some()
110            || self.size_count.is_some()
111    }
112
113    pub(crate) fn should_serialize_license_surface(&self) -> bool {
114        self.license_expression.is_some()
115            || !self.license_detections.is_empty()
116            || !self.license_clues.is_empty()
117            || self.percentage_of_license_text.is_some()
118    }
119
120    pub(crate) fn detected_license_expression_spdx(&self) -> Option<String> {
121        {
122            let expressions: Option<Vec<String>> = self
123                .license_detections
124                .iter()
125                .map(|detection| {
126                    (!detection.license_expression_spdx.is_empty())
127                        .then(|| detection.license_expression_spdx.clone())
128                })
129                .collect();
130            expressions.and_then(|expressions| {
131                crate::utils::spdx::select_primary_license_expression_strict(expressions.clone())
132                    .or_else(|| {
133                        crate::utils::spdx::combine_license_expressions_preserving_structure_strict(
134                            expressions,
135                        )
136                    })
137            })
138        }
139        .or_else(|| {
140            let expressions: Option<Vec<String>> = self
141                .package_data
142                .iter()
143                .flat_map(|package_data| package_data.license_detections.iter())
144                .map(|detection| {
145                    (!detection.license_expression_spdx.is_empty())
146                        .then(|| detection.license_expression_spdx.clone())
147                })
148                .collect();
149            expressions.and_then(|expressions| {
150                crate::utils::spdx::select_primary_license_expression_strict(expressions.clone())
151                    .or_else(|| {
152                        crate::utils::spdx::combine_license_expressions_preserving_structure_strict(
153                            expressions,
154                        )
155                    })
156            })
157        })
158        .or_else(|| {
159            self.license_expression
160                .clone()
161                .filter(|expression| !expression.is_empty())
162                .and_then(|expression| {
163                    crate::utils::spdx::combine_license_expressions_preserving_structure_strict([
164                        expression,
165                    ])
166                })
167        })
168    }
169}
170
/// Hand-written serializer: fills a `serde_json::Map` key by key and then
/// serializes that map. The order of the `insert_json` calls below is the
/// order in which keys are inserted into the map.
impl Serialize for OutputFileInfo {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let mut map = Map::new();
        // Identity fields are inserted unconditionally.
        insert_json(&mut map, "path", &self.path)?;
        insert_json(&mut map, "type", &self.file_type)?;
        insert_json(&mut map, "name", &self.name)?;
        insert_json(&mut map, "base_name", &self.base_name)?;
        insert_json(&mut map, "extension", &self.extension)?;
        insert_json(&mut map, "size", self.size)?;

        // The info group is all-or-nothing: once any member is set, every
        // member is inserted, including those that are `None`.
        // NOTE(review): presumably `None` ends up as JSON null — confirm in `insert_json`.
        if self.should_serialize_info_surface() {
            insert_json(&mut map, "date", &self.date)?;
            insert_json(&mut map, "sha1", self.sha1.as_ref())?;
            insert_json(&mut map, "md5", self.md5.as_ref())?;
            insert_json(&mut map, "sha256", self.sha256.as_ref())?;
            insert_json(&mut map, "sha1_git", self.sha1_git.as_ref())?;
            insert_json(&mut map, "mime_type", &self.mime_type)?;
            // `file_type_label` is written under the key `file_type`.
            insert_json(&mut map, "file_type", &self.file_type_label)?;
            insert_json(&mut map, "programming_language", &self.programming_language)?;
            insert_json(&mut map, "is_binary", self.is_binary)?;
            insert_json(&mut map, "is_text", self.is_text)?;
            insert_json(&mut map, "is_archive", self.is_archive)?;
            insert_json(&mut map, "is_media", self.is_media)?;
            insert_json(&mut map, "is_source", self.is_source)?;
            insert_json(&mut map, "is_script", self.is_script)?;
            insert_json(&mut map, "files_count", self.files_count)?;
            insert_json(&mut map, "dirs_count", self.dirs_count)?;
            insert_json(&mut map, "size_count", self.size_count)?;
        }

        insert_json(&mut map, "package_data", &self.package_data)?;
        // The expression is computed at serialization time rather than copied
        // from the stored `license_expression` field.
        insert_json(
            &mut map,
            "detected_license_expression_spdx",
            self.detected_license_expression_spdx(),
        )?;
        insert_json(&mut map, "license_detections", &self.license_detections)?;
        // `license_clues` is inserted (even when empty) whenever any
        // license-related value is present on the file.
        if self.should_serialize_license_surface() {
            insert_json(&mut map, "license_clues", &self.license_clues)?;
        }
        if self.percentage_of_license_text.is_some() {
            insert_json(
                &mut map,
                "percentage_of_license_text",
                self.percentage_of_license_text,
            )?;
        }
        insert_json(&mut map, "copyrights", &self.copyrights)?;
        insert_json(&mut map, "holders", &self.holders)?;
        insert_json(&mut map, "authors", &self.authors)?;
        // Unlike the neighbouring lists, `emails` is omitted when empty.
        if !self.emails.is_empty() {
            insert_json(&mut map, "emails", &self.emails)?;
        }
        insert_json(&mut map, "urls", &self.urls)?;
        insert_json(&mut map, "for_packages", &self.for_packages)?;
        insert_json(&mut map, "scan_errors", &self.scan_errors)?;
        // The remaining keys are optional: each is inserted only when it
        // carries a value (`Some`, `true`, or a non-empty list).
        if self.license_policy.is_some() {
            insert_json(&mut map, "license_policy", &self.license_policy)?;
        }
        if self.is_generated.is_some() {
            insert_json(&mut map, "is_generated", self.is_generated)?;
        }
        if self.source_count.is_some() {
            insert_json(&mut map, "source_count", self.source_count)?;
        }
        if self.is_legal {
            insert_json(&mut map, "is_legal", self.is_legal)?;
        }
        if self.is_manifest {
            insert_json(&mut map, "is_manifest", self.is_manifest)?;
        }
        if self.is_readme {
            insert_json(&mut map, "is_readme", self.is_readme)?;
        }
        if self.is_top_level {
            insert_json(&mut map, "is_top_level", self.is_top_level)?;
        }
        if self.is_key_file {
            insert_json(&mut map, "is_key_file", self.is_key_file)?;
        }
        if self.is_community {
            insert_json(&mut map, "is_community", self.is_community)?;
        }
        if !self.facets.is_empty() {
            insert_json(&mut map, "facets", &self.facets)?;
        }
        if self.tallies.is_some() {
            insert_json(&mut map, "tallies", &self.tallies)?;
        }

        map.serialize(serializer)
    }
}
267
268impl From<&crate::models::FileInfo> for OutputFileInfo {
269    fn from(value: &crate::models::FileInfo) -> Self {
270        Self::from_with_compat_mode(value, crate::cli::CompatibilityMode::Native)
271    }
272}
273
274impl OutputFileInfo {
275    pub fn from_with_compat_mode(
276        value: &crate::models::FileInfo,
277        mode: crate::cli::CompatibilityMode,
278    ) -> Self {
279        Self {
280            name: value.name.clone(),
281            base_name: value.base_name.clone(),
282            extension: value.extension.clone(),
283            path: value.path.clone(),
284            file_type: value.file_type.clone(),
285            mime_type: value.mime_type.clone(),
286            file_type_label: value.file_type_label.clone(),
287            size: value.size,
288            date: value.date.clone(),
289            sha1: value.sha1.as_ref().map(|d| d.as_hex()),
290            md5: value.md5.as_ref().map(|d| d.as_hex()),
291            sha256: value.sha256.as_ref().map(|d| d.as_hex()),
292            sha1_git: value.sha1_git.as_ref().map(|d| d.as_hex()),
293            programming_language: value.programming_language.clone(),
294            package_data: value
295                .package_data
296                .iter()
297                .map(OutputPackageData::from)
298                .collect(),
299            license_expression: value.license_expression.clone(),
300            license_detections: value
301                .license_detections
302                .iter()
303                .map(OutputLicenseDetection::from)
304                .collect(),
305            license_clues: value.license_clues.iter().map(OutputMatch::from).collect(),
306            percentage_of_license_text: value.percentage_of_license_text,
307            copyrights: value
308                .copyrights
309                .iter()
310                .map(|copyright| OutputCopyright::from_with_compat_mode(copyright, mode))
311                .collect(),
312            holders: value.holders.iter().map(OutputHolder::from).collect(),
313            authors: value.authors.iter().map(OutputAuthor::from).collect(),
314            emails: value.emails.iter().map(OutputEmail::from).collect(),
315            urls: value.urls.iter().map(OutputURL::from).collect(),
316            for_packages: value
317                .for_packages
318                .iter()
319                .map(|uid| uid.to_string())
320                .collect(),
321            scan_errors: value.scan_errors.clone(),
322            license_policy: value
323                .license_policy
324                .as_ref()
325                .map(|v| v.iter().map(OutputLicensePolicyEntry::from).collect()),
326            is_generated: value.is_generated,
327            is_binary: value.is_binary,
328            is_text: value.is_text,
329            is_archive: value.is_archive,
330            is_media: value.is_media,
331            is_source: value.is_source,
332            is_script: value.is_script,
333            files_count: value.files_count,
334            dirs_count: value.dirs_count,
335            size_count: value.size_count,
336            source_count: value.source_count,
337            is_legal: value.is_legal,
338            is_manifest: value.is_manifest,
339            is_readme: value.is_readme,
340            is_top_level: value.is_top_level,
341            is_key_file: value.is_key_file,
342            is_community: value.is_community,
343            facets: value.facets.clone(),
344            tallies: value.tallies.as_ref().map(OutputTallies::from),
345        }
346    }
347}
348
349impl TryFrom<&OutputFileInfo> for crate::models::FileInfo {
350    type Error = String;
351    fn try_from(value: &OutputFileInfo) -> Result<Self, Self::Error> {
352        let mut package_data = Vec::with_capacity(value.package_data.len());
353        for p in &value.package_data {
354            package_data.push(crate::models::PackageData::try_from(p)?);
355        }
356        let mut license_detections = Vec::with_capacity(value.license_detections.len());
357        for d in &value.license_detections {
358            license_detections.push(crate::models::LicenseDetection::try_from(d)?);
359        }
360        let mut license_clues = Vec::with_capacity(value.license_clues.len());
361        for m in &value.license_clues {
362            license_clues.push(crate::models::Match::try_from(m)?);
363        }
364        let mut copyrights = Vec::with_capacity(value.copyrights.len());
365        for c in &value.copyrights {
366            copyrights.push(crate::models::Copyright::try_from(c)?);
367        }
368        let mut holders = Vec::with_capacity(value.holders.len());
369        for h in &value.holders {
370            holders.push(crate::models::Holder::try_from(h)?);
371        }
372        let mut authors = Vec::with_capacity(value.authors.len());
373        for a in &value.authors {
374            authors.push(crate::models::Author::try_from(a)?);
375        }
376        let mut emails = Vec::with_capacity(value.emails.len());
377        for e in &value.emails {
378            emails.push(crate::models::OutputEmail::try_from(e)?);
379        }
380        let mut urls = Vec::with_capacity(value.urls.len());
381        for u in &value.urls {
382            urls.push(crate::models::OutputURL::try_from(u)?);
383        }
384        let license_policy = value
385            .license_policy
386            .as_ref()
387            .map(|v| {
388                v.iter()
389                    .map(crate::models::LicensePolicyEntry::try_from)
390                    .collect::<Result<Vec<_>, _>>()
391            })
392            .transpose()?;
393        Ok(Self {
394            name: value.name.clone(),
395            base_name: value.base_name.clone(),
396            extension: value.extension.clone(),
397            path: value.path.clone(),
398            file_type: value.file_type.clone(),
399            mime_type: value.mime_type.clone(),
400            file_type_label: value.file_type_label.clone(),
401            size: value.size,
402            date: value.date.clone(),
403            sha1: value
404                .sha1
405                .as_ref()
406                .map(|s| crate::models::Sha1Digest::from_hex(s))
407                .transpose()
408                .map_err(|e| format!("invalid sha1: {}", e))?,
409            md5: value
410                .md5
411                .as_ref()
412                .map(|s| crate::models::Md5Digest::from_hex(s))
413                .transpose()
414                .map_err(|e| format!("invalid md5: {}", e))?,
415            sha256: value
416                .sha256
417                .as_ref()
418                .map(|s| crate::models::Sha256Digest::from_hex(s))
419                .transpose()
420                .map_err(|e| format!("invalid sha256: {}", e))?,
421            sha1_git: value
422                .sha1_git
423                .as_ref()
424                .map(|s| crate::models::GitSha1::from_hex(s))
425                .transpose()
426                .map_err(|e| format!("invalid sha1_git: {}", e))?,
427            programming_language: value.programming_language.clone(),
428            package_data,
429            license_expression: value.license_expression.clone(),
430            license_detections,
431            license_clues,
432            percentage_of_license_text: value.percentage_of_license_text,
433            copyrights,
434            holders,
435            authors,
436            emails,
437            urls,
438            for_packages: value
439                .for_packages
440                .iter()
441                .map(|s| crate::models::PackageUid::from_raw(s.clone()))
442                .collect(),
443            scan_errors: value.scan_errors.clone(),
444            scan_diagnostics: crate::models::diagnostics_from_legacy_scan_errors(
445                &value.scan_errors,
446            ),
447            license_policy,
448            is_generated: value.is_generated,
449            is_binary: value.is_binary,
450            is_text: value.is_text,
451            is_archive: value.is_archive,
452            is_media: value.is_media,
453            is_source: value.is_source,
454            is_script: value.is_script,
455            files_count: value.files_count,
456            dirs_count: value.dirs_count,
457            size_count: value.size_count,
458            source_count: value.source_count,
459            is_legal: value.is_legal,
460            is_manifest: value.is_manifest,
461            is_readme: value.is_readme,
462            is_top_level: value.is_top_level,
463            is_key_file: value.is_key_file,
464            is_community: value.is_community,
465            facets: value.facets.clone(),
466            tallies: value
467                .tallies
468                .as_ref()
469                .map(crate::models::Tallies::try_from)
470                .transpose()?,
471        })
472    }
473}
474
#[cfg(test)]
mod tests {
    use super::OutputFileInfo;
    use crate::models::FileType;
    use crate::output_schema::license_detection::OutputLicenseDetection;

    /// Minimal fixture: a zero-byte `mod.rs` with every optional value unset,
    /// every list empty and every flag off.
    fn base_output_file_info() -> OutputFileInfo {
        OutputFileInfo {
            // Identity
            path: "mod.rs".to_string(),
            name: "mod.rs".to_string(),
            base_name: "mod".to_string(),
            extension: ".rs".to_string(),
            file_type: FileType::File,
            size: 0,
            // Info group
            date: None,
            sha1: None,
            md5: None,
            sha256: None,
            sha1_git: None,
            mime_type: None,
            file_type_label: None,
            programming_language: None,
            is_binary: None,
            is_text: None,
            is_archive: None,
            is_media: None,
            is_source: None,
            is_script: None,
            files_count: None,
            dirs_count: None,
            size_count: None,
            // License data
            license_expression: None,
            license_detections: Vec::new(),
            license_clues: Vec::new(),
            percentage_of_license_text: None,
            license_policy: None,
            // Other scan results
            package_data: Vec::new(),
            copyrights: Vec::new(),
            holders: Vec::new(),
            authors: Vec::new(),
            emails: Vec::new(),
            urls: Vec::new(),
            for_packages: Vec::new(),
            scan_errors: Vec::new(),
            // Classification and summary
            is_generated: None,
            source_count: None,
            is_legal: false,
            is_manifest: false,
            is_readme: false,
            is_top_level: false,
            is_key_file: false,
            is_community: false,
            facets: Vec::new(),
            tallies: None,
        }
    }

    /// Detection carrying only the two expression strings.
    fn detection(license_key: &str, spdx: &str) -> OutputLicenseDetection {
        OutputLicenseDetection {
            license_expression: license_key.to_string(),
            license_expression_spdx: spdx.to_string(),
            matches: Vec::new(),
            detection_log: Vec::new(),
            identifier: None,
        }
    }

    #[test]
    fn detected_license_expression_spdx_does_not_recombine_partial_detection_spdx() {
        // The second detection has no SPDX form, so the detection list must be
        // skipped and the stored expression used instead.
        let mut fixture = base_output_file_info();
        fixture.license_expression = Some("Apache-2.0 AND MIT".to_string());
        fixture.license_detections =
            vec![detection("apache-2.0", "Apache-2.0"), detection("mit", "")];

        let resolved = fixture.detected_license_expression_spdx();

        assert_eq!(resolved.as_deref(), Some("Apache-2.0 AND MIT"));
    }

    #[test]
    fn detected_license_expression_spdx_rejects_invalid_fallback_expression() {
        let mut fixture = base_output_file_info();
        fixture.license_expression = Some("MIT\" or malformed".to_string());

        let resolved = fixture.detected_license_expression_spdx();

        assert_eq!(resolved, None);
    }
}