Skip to main content

provenant/license_detection/
dataset.rs

1use std::fmt::Write as _;
2use std::path::Path;
3
4use anyhow::{Context, Result, anyhow};
5use serde::{Deserialize, Serialize};
6
7use crate::cache::write_bytes_atomically;
8use crate::license_detection::embedded::index::load_loader_snapshot_from_bytes;
9use crate::license_detection::embedded::schema::EmbeddedArtifactMetadata;
10use crate::license_detection::license_cache::compute_rules_fingerprint;
11use crate::license_detection::models::{LoadedLicense, LoadedRule, RuleKind};
12use crate::license_detection::rules::{parse_license_to_loaded, parse_rule_to_loaded};
13use crate::models::Sha256Digest;
14use crate::version::BUILD_VERSION;
15
/// Layout version written into every exported manifest; `load_license_dataset_from_root`
/// rejects manifests that carry any other value.
const LICENSE_DATASET_SCHEMA_VERSION: u32 = 1;

/// Name of the dataset subdirectory that holds the `.RULE` files.
pub const LICENSE_DATASET_RULES_DIR: &str = "rules";
/// Name of the dataset subdirectory that holds the `.LICENSE` files.
pub const LICENSE_DATASET_LICENSES_DIR: &str = "licenses";

/// File name of the JSON manifest stored at the dataset root.
pub const LICENSE_DATASET_MANIFEST_FILE: &str = "manifest.json";
/// File name of the human-readable README stored at the dataset root.
pub const LICENSE_DATASET_README_FILE: &str = "README.md";

/// Source label for custom datasets.
// NOTE(review): not referenced in this file; presumably used as the provenance `source`
// when a user-supplied dataset directory is loaded — confirm at the call sites.
pub const CUSTOM_LICENSE_DATASET_SOURCE: &str = "custom-license-dataset";
22
23#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
24pub struct LicenseDatasetManifest {
25    pub schema_version: u32,
26    pub spdx_license_list_version: String,
27    pub dataset_fingerprint: String,
28    pub exported_from_source: String,
29    pub exported_by_version: String,
30}
31
32#[derive(Debug, Clone)]
33pub struct LoadedLicenseDataset {
34    pub manifest: LicenseDatasetManifest,
35    pub rules: Vec<LoadedRule>,
36    pub licenses: Vec<LoadedLicense>,
37}
38
39pub fn export_embedded_license_dataset(target_root: &Path) -> Result<LicenseDatasetManifest> {
40    let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
41    let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
42        .map_err(|error| anyhow!("Failed to load embedded license dataset: {}", error))?;
43
44    export_license_dataset_to_root(
45        target_root,
46        &snapshot.rules,
47        &snapshot.licenses,
48        &snapshot.metadata,
49    )
50}
51
52pub fn export_license_dataset_to_root(
53    target_root: &Path,
54    rules: &[LoadedRule],
55    licenses: &[LoadedLicense],
56    metadata: &EmbeddedArtifactMetadata,
57) -> Result<LicenseDatasetManifest> {
58    ensure_export_target_is_empty(target_root)?;
59
60    let manifest = LicenseDatasetManifest {
61        schema_version: LICENSE_DATASET_SCHEMA_VERSION,
62        spdx_license_list_version: metadata.spdx_license_list_version.clone(),
63        dataset_fingerprint: compute_dataset_fingerprint_string(rules, licenses)?,
64        exported_from_source: metadata.license_index_provenance.source.clone(),
65        exported_by_version: BUILD_VERSION.to_string(),
66    };
67
68    write_dataset_manifest(target_root, &manifest)?;
69    write_dataset_readme(target_root, &manifest)?;
70    write_rule_files(target_root, rules)?;
71    write_license_files(target_root, licenses)?;
72
73    Ok(manifest)
74}
75
76pub fn load_license_dataset_from_root(root: &Path) -> Result<LoadedLicenseDataset> {
77    let rules_dir = root.join(LICENSE_DATASET_RULES_DIR);
78    let licenses_dir = root.join(LICENSE_DATASET_LICENSES_DIR);
79
80    if !root.is_dir() {
81        return Err(anyhow!(
82            "License dataset root does not exist or is not a directory: {}",
83            root.display()
84        ));
85    }
86    if !rules_dir.is_dir() {
87        return Err(anyhow!(
88            "License dataset is missing required rules/ directory: {}",
89            rules_dir.display()
90        ));
91    }
92    if !licenses_dir.is_dir() {
93        return Err(anyhow!(
94            "License dataset is missing required licenses/ directory: {}",
95            licenses_dir.display()
96        ));
97    }
98
99    let manifest_path = root.join(LICENSE_DATASET_MANIFEST_FILE);
100    let manifest_text = std::fs::read_to_string(&manifest_path).with_context(|| {
101        format!(
102            "License dataset is missing required manifest.json at {}",
103            manifest_path.display()
104        )
105    })?;
106    let manifest: LicenseDatasetManifest =
107        serde_json::from_str(&manifest_text).with_context(|| {
108            format!(
109                "Failed to parse license dataset manifest at {}",
110                manifest_path.display()
111            )
112        })?;
113
114    if manifest.schema_version != LICENSE_DATASET_SCHEMA_VERSION {
115        return Err(anyhow!(
116            "Unsupported license dataset schema version {} in {} (expected {})",
117            manifest.schema_version,
118            manifest_path.display(),
119            LICENSE_DATASET_SCHEMA_VERSION
120        ));
121    }
122
123    let rules = load_strict_loaded_rules_from_directory(&rules_dir)?;
124    let licenses = load_strict_loaded_licenses_from_directory(&licenses_dir)?;
125
126    Ok(LoadedLicenseDataset {
127        manifest,
128        rules,
129        licenses,
130    })
131}
132
133pub fn compute_dataset_fingerprint_string(
134    rules: &[LoadedRule],
135    licenses: &[LoadedLicense],
136) -> Result<String> {
137    Ok(Sha256Digest::from_bytes(compute_rules_fingerprint(rules, licenses)?).to_string())
138}
139
140fn ensure_export_target_is_empty(target_root: &Path) -> Result<()> {
141    if target_root.exists() {
142        let mut entries = std::fs::read_dir(target_root)
143            .with_context(|| format!("Failed to read export target {}", target_root.display()))?;
144        if entries.next().is_some() {
145            return Err(anyhow!(
146                "Refusing to export into non-empty directory {}",
147                target_root.display()
148            ));
149        }
150    } else {
151        std::fs::create_dir_all(target_root)
152            .with_context(|| format!("Failed to create export target {}", target_root.display()))?;
153    }
154
155    Ok(())
156}
157
158fn write_dataset_manifest(root: &Path, manifest: &LicenseDatasetManifest) -> Result<()> {
159    let payload = serde_json::to_vec_pretty(manifest).context("Serialize dataset manifest")?;
160    write_bytes_atomically(&root.join(LICENSE_DATASET_MANIFEST_FILE), &payload)
161        .context("Write dataset manifest")?;
162    Ok(())
163}
164
165fn write_dataset_readme(root: &Path, manifest: &LicenseDatasetManifest) -> Result<()> {
166    let text = format!(
167        "# Exported Provenant license dataset\n\nThis directory contains the effective `.RULE` and `.LICENSE` files used by Provenant.\n\n- Reuse it with `provenant --license-dataset-path <DIR> --license ...`\n- Edit files under `rules/` and `licenses/` to customize scan behavior\n- `manifest.json` records the exported dataset fingerprint and SPDX license list version\n- The fingerprint in `manifest.json` is informational; if you edit files, Provenant computes the active dataset fingerprint from current file contents\n\nExport metadata:\n\n- schema_version: {}\n- spdx_license_list_version: {}\n- dataset_fingerprint: {}\n- exported_from_source: {}\n- exported_by_version: {}\n",
168        manifest.schema_version,
169        manifest.spdx_license_list_version,
170        manifest.dataset_fingerprint,
171        manifest.exported_from_source,
172        manifest.exported_by_version,
173    );
174    write_bytes_atomically(&root.join(LICENSE_DATASET_README_FILE), text.as_bytes())
175        .context("Write dataset README")?;
176    Ok(())
177}
178
179fn write_rule_files(root: &Path, rules: &[LoadedRule]) -> Result<()> {
180    let mut sorted = rules.iter().collect::<Vec<_>>();
181    sorted.sort_by_key(|rule| &rule.identifier);
182
183    for rule in sorted {
184        validate_dataset_filename_component(&rule.identifier, "rule identifier")?;
185        let rendered = render_rule(rule)?;
186        let output_path = root.join(LICENSE_DATASET_RULES_DIR).join(&rule.identifier);
187        write_bytes_atomically(&output_path, rendered.as_bytes())
188            .with_context(|| format!("Write rule dataset file {}", output_path.display()))?;
189    }
190
191    Ok(())
192}
193
194fn write_license_files(root: &Path, licenses: &[LoadedLicense]) -> Result<()> {
195    let mut sorted = licenses.iter().collect::<Vec<_>>();
196    sorted.sort_by_key(|license| &license.key);
197
198    for license in sorted {
199        validate_dataset_filename_component(&license.key, "license key")?;
200        let rendered = render_license(license)?;
201        let output_path = root
202            .join(LICENSE_DATASET_LICENSES_DIR)
203            .join(format!("{}.LICENSE", license.key));
204        write_bytes_atomically(&output_path, rendered.as_bytes())
205            .with_context(|| format!("Write license dataset file {}", output_path.display()))?;
206    }
207
208    Ok(())
209}
210
211fn load_strict_loaded_rules_from_directory(dir: &Path) -> Result<Vec<LoadedRule>> {
212    let mut rules = Vec::new();
213    let entries = std::fs::read_dir(dir)
214        .with_context(|| format!("Failed to read rules directory: {}", dir.display()))?;
215
216    for entry in entries {
217        let entry = entry
218            .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
219        let path = entry.path();
220        if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("RULE") {
221            rules.push(parse_rule_to_loaded(&path).with_context(|| {
222                format!("Failed to parse dataset rule file {}", path.display())
223            })?);
224        }
225    }
226
227    Ok(rules)
228}
229
230fn load_strict_loaded_licenses_from_directory(dir: &Path) -> Result<Vec<LoadedLicense>> {
231    let mut licenses = Vec::new();
232    let entries = std::fs::read_dir(dir)
233        .with_context(|| format!("Failed to read licenses directory: {}", dir.display()))?;
234
235    for entry in entries {
236        let entry = entry
237            .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
238        let path = entry.path();
239        if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("LICENSE") {
240            licenses.push(parse_license_to_loaded(&path).with_context(|| {
241                format!("Failed to parse dataset license file {}", path.display())
242            })?);
243        }
244    }
245
246    Ok(licenses)
247}
248
249fn validate_dataset_filename_component(value: &str, kind: &str) -> Result<()> {
250    if value.is_empty()
251        || value.contains('/')
252        || value.contains('\\')
253        || value.contains("..")
254        || Path::new(value).components().count() != 1
255    {
256        return Err(anyhow!(
257            "Invalid {} for exported license dataset: {}",
258            kind,
259            value
260        ));
261    }
262
263    Ok(())
264}
265
266fn render_rule(rule: &LoadedRule) -> Result<String> {
267    let mut rendered = String::from("---\n");
268    push_yaml_string(
269        &mut rendered,
270        "license_expression",
271        Some(&rule.license_expression),
272    )?;
273    push_rule_kind(&mut rendered, rule.rule_kind);
274    push_yaml_bool(&mut rendered, "is_false_positive", rule.is_false_positive);
275    push_yaml_bool(&mut rendered, "is_required_phrase", rule.is_required_phrase);
276    push_yaml_bool(
277        &mut rendered,
278        "skip_for_required_phrase_generation",
279        rule.skip_for_required_phrase_generation,
280    );
281    push_yaml_u8(&mut rendered, "relevance", rule.relevance);
282    if rule.has_stored_minimum_coverage {
283        push_yaml_u8(&mut rendered, "minimum_coverage", rule.minimum_coverage);
284    }
285    push_yaml_bool(&mut rendered, "is_continuous", rule.is_continuous);
286    push_yaml_bool(&mut rendered, "is_deprecated", rule.is_deprecated);
287    push_yaml_list(
288        &mut rendered,
289        "referenced_filenames",
290        rule.referenced_filenames.as_deref(),
291    )?;
292    push_yaml_list(&mut rendered, "replaced_by", Some(&rule.replaced_by))?;
293    push_yaml_list(
294        &mut rendered,
295        "ignorable_urls",
296        rule.ignorable_urls.as_deref(),
297    )?;
298    push_yaml_list(
299        &mut rendered,
300        "ignorable_emails",
301        rule.ignorable_emails.as_deref(),
302    )?;
303    push_yaml_string(&mut rendered, "notes", rule.notes.as_deref())?;
304    push_yaml_list(
305        &mut rendered,
306        "ignorable_copyrights",
307        rule.ignorable_copyrights.as_deref(),
308    )?;
309    push_yaml_list(
310        &mut rendered,
311        "ignorable_holders",
312        rule.ignorable_holders.as_deref(),
313    )?;
314    push_yaml_list(
315        &mut rendered,
316        "ignorable_authors",
317        rule.ignorable_authors.as_deref(),
318    )?;
319    push_yaml_string(&mut rendered, "language", rule.language.as_deref())?;
320    rendered.push_str("---\n\n");
321    rendered.push_str(&rule.text);
322    rendered.push('\n');
323    Ok(rendered)
324}
325
326fn render_license(license: &LoadedLicense) -> Result<String> {
327    let mut rendered = String::from("---\n");
328    push_yaml_string(&mut rendered, "key", Some(&license.key))?;
329    push_yaml_string(&mut rendered, "short_name", license.short_name.as_deref())?;
330    push_yaml_string(&mut rendered, "name", Some(&license.name))?;
331    push_yaml_string(
332        &mut rendered,
333        "spdx_license_key",
334        license.spdx_license_key.as_deref(),
335    )?;
336    push_yaml_list(
337        &mut rendered,
338        "other_spdx_license_keys",
339        Some(&license.other_spdx_license_keys),
340    )?;
341    push_yaml_string(&mut rendered, "category", license.category.as_deref())?;
342    push_yaml_string(&mut rendered, "owner", license.owner.as_deref())?;
343    push_yaml_string(
344        &mut rendered,
345        "homepage_url",
346        license.homepage_url.as_deref(),
347    )?;
348    push_yaml_string(
349        &mut rendered,
350        "osi_license_key",
351        license.osi_license_key.as_deref(),
352    )?;
353    push_yaml_list(&mut rendered, "text_urls", Some(&license.text_urls))?;
354    push_yaml_string(&mut rendered, "osi_url", license.osi_url.as_deref())?;
355    push_yaml_string(&mut rendered, "faq_url", license.faq_url.as_deref())?;
356    push_yaml_list(&mut rendered, "other_urls", Some(&license.other_urls))?;
357    push_yaml_string(&mut rendered, "notes", license.notes.as_deref())?;
358    push_yaml_bool(&mut rendered, "is_deprecated", license.is_deprecated);
359    push_yaml_bool(&mut rendered, "is_exception", license.is_exception);
360    push_yaml_bool(&mut rendered, "is_unknown", license.is_unknown);
361    push_yaml_bool(&mut rendered, "is_generic", license.is_generic);
362    push_yaml_list(&mut rendered, "replaced_by", Some(&license.replaced_by))?;
363    push_yaml_u8(&mut rendered, "minimum_coverage", license.minimum_coverage);
364    push_yaml_string(
365        &mut rendered,
366        "standard_notice",
367        license.standard_notice.as_deref(),
368    )?;
369    push_yaml_list(
370        &mut rendered,
371        "ignorable_copyrights",
372        license.ignorable_copyrights.as_deref(),
373    )?;
374    push_yaml_list(
375        &mut rendered,
376        "ignorable_holders",
377        license.ignorable_holders.as_deref(),
378    )?;
379    push_yaml_list(
380        &mut rendered,
381        "ignorable_authors",
382        license.ignorable_authors.as_deref(),
383    )?;
384    push_yaml_list(
385        &mut rendered,
386        "ignorable_urls",
387        license.ignorable_urls.as_deref(),
388    )?;
389    push_yaml_list(
390        &mut rendered,
391        "ignorable_emails",
392        license.ignorable_emails.as_deref(),
393    )?;
394    rendered.push_str("---\n\n");
395    rendered.push_str(&license.text);
396    rendered.push('\n');
397    Ok(rendered)
398}
399
400fn push_rule_kind(rendered: &mut String, rule_kind: RuleKind) {
401    let key = match rule_kind {
402        RuleKind::None => return,
403        RuleKind::Text => "is_license_text",
404        RuleKind::Notice => "is_license_notice",
405        RuleKind::Reference => "is_license_reference",
406        RuleKind::Tag => "is_license_tag",
407        RuleKind::Intro => "is_license_intro",
408        RuleKind::Clue => "is_license_clue",
409    };
410    let _ = writeln!(rendered, "{key}: true");
411}
412
/// Appends `key: true` followed by a newline when `value` is `true`; a `false` value appends
/// nothing.
fn push_yaml_bool(rendered: &mut String, key: &str, value: bool) {
    if !value {
        return;
    }
    // Formatting into a `String` cannot fail, so the result is deliberately discarded.
    let _ = writeln!(rendered, "{key}: true");
}
418
/// Appends `key: <number>` followed by a newline when `value` is present; `None` appends
/// nothing.
fn push_yaml_u8(rendered: &mut String, key: &str, value: Option<u8>) {
    let Some(value) = value else {
        return;
    };
    // Formatting into a `String` cannot fail, so the result is deliberately discarded.
    let _ = writeln!(rendered, "{key}: {value}");
}
424
425fn push_yaml_string(rendered: &mut String, key: &str, value: Option<&str>) -> Result<()> {
426    let Some(value) = value else {
427        return Ok(());
428    };
429    let quoted = serde_json::to_string(value).context("serialize yaml string")?;
430    let _ = writeln!(rendered, "{key}: {quoted}");
431    Ok(())
432}
433
434fn push_yaml_list(rendered: &mut String, key: &str, values: Option<&[String]>) -> Result<()> {
435    let Some(values) = values else {
436        return Ok(());
437    };
438    if values.is_empty() {
439        return Ok(());
440    }
441
442    let _ = writeln!(rendered, "{key}:");
443    for value in values {
444        let quoted = serde_json::to_string(value).context("serialize yaml list entry")?;
445        let _ = writeln!(rendered, "  - {quoted}");
446    }
447    Ok(())
448}
449
#[cfg(test)]
mod tests {
    use super::*;
    use crate::license_detection::models::RuleKind;
    use crate::license_detection::rules::{parse_license_str_to_loaded, parse_rule_str_to_loaded};
    use tempfile::TempDir;

    /// Turns string literals into the owned list form used by the loaded models.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    /// Creates the two subdirectories every dataset root is required to have.
    fn create_dataset_dirs(root: &Path) {
        std::fs::create_dir_all(root.join("rules")).expect("rules dir");
        std::fs::create_dir_all(root.join("licenses")).expect("licenses dir");
    }

    /// A rule with every optional field populated, so rendering exercises each key.
    fn sample_rule() -> LoadedRule {
        LoadedRule {
            identifier: String::from("example.RULE"),
            license_expression: String::from("mit OR apache-2.0"),
            text: String::from("Example rule text"),
            rule_kind: RuleKind::Notice,
            is_false_positive: false,
            is_required_phrase: true,
            skip_for_required_phrase_generation: true,
            relevance: Some(100),
            minimum_coverage: Some(75),
            has_stored_minimum_coverage: true,
            is_continuous: true,
            referenced_filenames: Some(owned(&["LICENSE"])),
            ignorable_urls: Some(owned(&["https://example.com"])),
            ignorable_emails: Some(owned(&["legal@example.com"])),
            ignorable_copyrights: Some(owned(&["Copyright Example"])),
            ignorable_holders: Some(owned(&["Example Org"])),
            ignorable_authors: Some(owned(&["Jane Doe"])),
            language: Some(String::from("en")),
            notes: Some(String::from("Example note")),
            is_deprecated: true,
            replaced_by: owned(&["replacement.RULE"]),
        }
    }

    /// A license with every optional field populated, so rendering exercises each key.
    fn sample_license() -> LoadedLicense {
        LoadedLicense {
            key: String::from("example-license"),
            short_name: Some(String::from("Example")),
            name: String::from("Example License"),
            language: Some(String::from("en")),
            spdx_license_key: Some(String::from("MIT")),
            other_spdx_license_keys: owned(&["Apache-2.0"]),
            category: Some(String::from("Permissive")),
            owner: Some(String::from("Example Org")),
            homepage_url: Some(String::from("https://example.com")),
            text: String::from("Example license text"),
            reference_urls: owned(&[
                "https://example.com/text",
                "https://example.com/other",
                "https://opensource.org/licenses/MIT",
                "https://example.com/faq",
                "https://example.com",
            ]),
            osi_license_key: Some(String::from("MIT")),
            text_urls: owned(&["https://example.com/text"]),
            osi_url: Some(String::from("https://opensource.org/licenses/MIT")),
            faq_url: Some(String::from("https://example.com/faq")),
            other_urls: owned(&["https://example.com/other"]),
            notes: Some(String::from("Example note")),
            is_deprecated: true,
            is_exception: true,
            is_unknown: true,
            is_generic: true,
            replaced_by: owned(&["replacement"]),
            minimum_coverage: Some(55),
            standard_notice: Some(String::from("Standard notice")),
            ignorable_copyrights: Some(owned(&["Copyright Example"])),
            ignorable_holders: Some(owned(&["Example Org"])),
            ignorable_authors: Some(owned(&["Jane Doe"])),
            ignorable_urls: Some(owned(&["https://example.com"])),
            ignorable_emails: Some(owned(&["legal@example.com"])),
        }
    }

    #[test]
    fn render_rule_roundtrips_through_loader() {
        let original = sample_rule();
        let text = render_rule(&original).expect("render rule");

        let reloaded = parse_rule_str_to_loaded(&original.identifier, &text).expect("reparse rule");

        assert_eq!(reloaded, original);
    }

    #[test]
    fn render_license_roundtrips_through_loader() {
        let original = sample_license();
        let text = render_license(&original).expect("render license");

        let reloaded =
            parse_license_str_to_loaded("example-license.LICENSE", &text).expect("reparse");

        assert_eq!(reloaded, original);
    }

    #[test]
    fn load_license_dataset_requires_manifest_and_expected_dirs() {
        let workspace = TempDir::new().expect("temp dir");
        create_dataset_dirs(workspace.path());

        let message = load_license_dataset_from_root(workspace.path())
            .expect_err("missing manifest")
            .to_string();

        assert!(message.contains("manifest.json"));
    }

    #[test]
    fn load_license_dataset_fails_on_invalid_rule_file() {
        let workspace = TempDir::new().expect("temp dir");
        let root = workspace.path();
        create_dataset_dirs(root);

        let manifest_json = serde_json::json!({
            "schema_version": 1,
            "spdx_license_list_version": "3.27",
            "dataset_fingerprint": "abc",
            "exported_from_source": "embedded-artifact",
            "exported_by_version": "test",
        });
        std::fs::write(root.join("manifest.json"), manifest_json.to_string()).expect("manifest");

        // The rule lacks front matter and must make the whole load fail.
        std::fs::write(root.join("rules").join("broken.RULE"), "not-frontmatter")
            .expect("broken rule");
        std::fs::write(
            root.join("licenses").join("mit.LICENSE"),
            "---\nkey: \"mit\"\nname: \"MIT License\"\n---\n\nMIT text\n",
        )
        .expect("license");

        let message = load_license_dataset_from_root(root)
            .expect_err("invalid rule should fail")
            .to_string();

        assert!(message.contains("Failed to parse dataset rule file"));
    }

    #[test]
    fn export_license_dataset_rejects_path_like_rule_identifier() {
        let metadata = EmbeddedArtifactMetadata {
            spdx_license_list_version: String::from("3.27"),
            license_index_provenance: crate::models::LicenseIndexProvenance {
                source: String::from("embedded-artifact"),
                dataset_fingerprint: String::from("abc123"),
                ignored_rules: vec![],
                ignored_licenses: vec![],
                ignored_rules_due_to_licenses: vec![],
                added_rules: vec![],
                replaced_rules: vec![],
                added_licenses: vec![],
                replaced_licenses: vec![],
            },
        };
        let path_like_rule = LoadedRule {
            identifier: String::from("nested/path.RULE"),
            ..sample_rule()
        };
        let workspace = TempDir::new().expect("temp dir");

        let message = export_license_dataset_to_root(
            workspace.path(),
            &[path_like_rule],
            &[sample_license()],
            &metadata,
        )
        .expect_err("path-like identifiers should be rejected")
        .to_string();

        assert!(message.contains("Invalid rule identifier for exported license dataset"));
    }
}