1use std::fmt::Write as _;
2use std::path::Path;
3
4use anyhow::{Context, Result, anyhow};
5use serde::{Deserialize, Serialize};
6
7use crate::cache::write_bytes_atomically;
8use crate::license_detection::embedded::index::load_loader_snapshot_from_bytes;
9use crate::license_detection::embedded::schema::EmbeddedArtifactMetadata;
10use crate::license_detection::license_cache::compute_rules_fingerprint;
11use crate::license_detection::models::{LoadedLicense, LoadedRule, RuleKind};
12use crate::license_detection::rules::{parse_license_to_loaded, parse_rule_to_loaded};
13use crate::models::Sha256Digest;
14use crate::version::BUILD_VERSION;
15
/// Name of the dataset subdirectory that rule files are written to and loaded from.
pub const LICENSE_DATASET_RULES_DIR: &str = "rules";
/// Name of the dataset subdirectory that `.LICENSE` files are written to and loaded from.
pub const LICENSE_DATASET_LICENSES_DIR: &str = "licenses";
/// File name of the JSON manifest stored directly under the dataset root.
pub const LICENSE_DATASET_MANIFEST_FILE: &str = "manifest.json";
/// File name of the human-readable README stored directly under the dataset root.
pub const LICENSE_DATASET_README_FILE: &str = "README.md";
/// Source label for a custom license dataset.
// NOTE(review): not referenced in this file; presumably used by callers as a
// provenance `source` value — confirm against the call sites.
pub const CUSTOM_LICENSE_DATASET_SOURCE: &str = "custom-license-dataset";
/// Manifest schema version written on export; loading rejects any other value.
const LICENSE_DATASET_SCHEMA_VERSION: u32 = 1;
22
/// Contents of the dataset's `manifest.json`, serialized and deserialized with serde.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct LicenseDatasetManifest {
    /// Layout version of the dataset; export writes `LICENSE_DATASET_SCHEMA_VERSION`.
    pub schema_version: u32,
    /// SPDX license list version copied from the source artifact's metadata on export.
    pub spdx_license_list_version: String,
    /// Fingerprint of the exported rules and licenses, as produced by
    /// `compute_dataset_fingerprint_string` at export time.
    pub dataset_fingerprint: String,
    /// Provenance `source` string of the license index the dataset was exported from.
    pub exported_from_source: String,
    /// `BUILD_VERSION` of the binary that performed the export.
    pub exported_by_version: String,
}
31
/// A dataset read back from disk by `load_license_dataset_from_root`.
#[derive(Debug, Clone)]
pub struct LoadedLicenseDataset {
    /// Manifest parsed from the dataset's `manifest.json`.
    pub manifest: LicenseDatasetManifest,
    /// Rules parsed from the `.RULE` files in the dataset's rules directory.
    pub rules: Vec<LoadedRule>,
    /// Licenses parsed from the `.LICENSE` files in the dataset's licenses directory.
    pub licenses: Vec<LoadedLicense>,
}
38
39pub fn export_embedded_license_dataset(target_root: &Path) -> Result<LicenseDatasetManifest> {
40 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
41 let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
42 .map_err(|error| anyhow!("Failed to load embedded license dataset: {}", error))?;
43
44 export_license_dataset_to_root(
45 target_root,
46 &snapshot.rules,
47 &snapshot.licenses,
48 &snapshot.metadata,
49 )
50}
51
52pub fn export_license_dataset_to_root(
53 target_root: &Path,
54 rules: &[LoadedRule],
55 licenses: &[LoadedLicense],
56 metadata: &EmbeddedArtifactMetadata,
57) -> Result<LicenseDatasetManifest> {
58 ensure_export_target_is_empty(target_root)?;
59
60 let manifest = LicenseDatasetManifest {
61 schema_version: LICENSE_DATASET_SCHEMA_VERSION,
62 spdx_license_list_version: metadata.spdx_license_list_version.clone(),
63 dataset_fingerprint: compute_dataset_fingerprint_string(rules, licenses)?,
64 exported_from_source: metadata.license_index_provenance.source.clone(),
65 exported_by_version: BUILD_VERSION.to_string(),
66 };
67
68 write_dataset_manifest(target_root, &manifest)?;
69 write_dataset_readme(target_root, &manifest)?;
70 write_rule_files(target_root, rules)?;
71 write_license_files(target_root, licenses)?;
72
73 Ok(manifest)
74}
75
76pub fn load_license_dataset_from_root(root: &Path) -> Result<LoadedLicenseDataset> {
77 let rules_dir = root.join(LICENSE_DATASET_RULES_DIR);
78 let licenses_dir = root.join(LICENSE_DATASET_LICENSES_DIR);
79
80 if !root.is_dir() {
81 return Err(anyhow!(
82 "License dataset root does not exist or is not a directory: {}",
83 root.display()
84 ));
85 }
86 if !rules_dir.is_dir() {
87 return Err(anyhow!(
88 "License dataset is missing required rules/ directory: {}",
89 rules_dir.display()
90 ));
91 }
92 if !licenses_dir.is_dir() {
93 return Err(anyhow!(
94 "License dataset is missing required licenses/ directory: {}",
95 licenses_dir.display()
96 ));
97 }
98
99 let manifest_path = root.join(LICENSE_DATASET_MANIFEST_FILE);
100 let manifest_text = std::fs::read_to_string(&manifest_path).with_context(|| {
101 format!(
102 "License dataset is missing required manifest.json at {}",
103 manifest_path.display()
104 )
105 })?;
106 let manifest: LicenseDatasetManifest =
107 serde_json::from_str(&manifest_text).with_context(|| {
108 format!(
109 "Failed to parse license dataset manifest at {}",
110 manifest_path.display()
111 )
112 })?;
113
114 if manifest.schema_version != LICENSE_DATASET_SCHEMA_VERSION {
115 return Err(anyhow!(
116 "Unsupported license dataset schema version {} in {} (expected {})",
117 manifest.schema_version,
118 manifest_path.display(),
119 LICENSE_DATASET_SCHEMA_VERSION
120 ));
121 }
122
123 let rules = load_strict_loaded_rules_from_directory(&rules_dir)?;
124 let licenses = load_strict_loaded_licenses_from_directory(&licenses_dir)?;
125
126 Ok(LoadedLicenseDataset {
127 manifest,
128 rules,
129 licenses,
130 })
131}
132
133pub fn compute_dataset_fingerprint_string(
134 rules: &[LoadedRule],
135 licenses: &[LoadedLicense],
136) -> Result<String> {
137 Ok(Sha256Digest::from_bytes(compute_rules_fingerprint(rules, licenses)?).to_string())
138}
139
140fn ensure_export_target_is_empty(target_root: &Path) -> Result<()> {
141 if target_root.exists() {
142 let mut entries = std::fs::read_dir(target_root)
143 .with_context(|| format!("Failed to read export target {}", target_root.display()))?;
144 if entries.next().is_some() {
145 return Err(anyhow!(
146 "Refusing to export into non-empty directory {}",
147 target_root.display()
148 ));
149 }
150 } else {
151 std::fs::create_dir_all(target_root)
152 .with_context(|| format!("Failed to create export target {}", target_root.display()))?;
153 }
154
155 Ok(())
156}
157
158fn write_dataset_manifest(root: &Path, manifest: &LicenseDatasetManifest) -> Result<()> {
159 let payload = serde_json::to_vec_pretty(manifest).context("Serialize dataset manifest")?;
160 write_bytes_atomically(&root.join(LICENSE_DATASET_MANIFEST_FILE), &payload)
161 .context("Write dataset manifest")?;
162 Ok(())
163}
164
165fn write_dataset_readme(root: &Path, manifest: &LicenseDatasetManifest) -> Result<()> {
166 let text = format!(
167 "# Exported Provenant license dataset\n\nThis directory contains the effective `.RULE` and `.LICENSE` files used by Provenant.\n\n- Reuse it with `provenant --license-dataset-path <DIR> --license ...`\n- Edit files under `rules/` and `licenses/` to customize scan behavior\n- `manifest.json` records the exported dataset fingerprint and SPDX license list version\n- The fingerprint in `manifest.json` is informational; if you edit files, Provenant computes the active dataset fingerprint from current file contents\n\nExport metadata:\n\n- schema_version: {}\n- spdx_license_list_version: {}\n- dataset_fingerprint: {}\n- exported_from_source: {}\n- exported_by_version: {}\n",
168 manifest.schema_version,
169 manifest.spdx_license_list_version,
170 manifest.dataset_fingerprint,
171 manifest.exported_from_source,
172 manifest.exported_by_version,
173 );
174 write_bytes_atomically(&root.join(LICENSE_DATASET_README_FILE), text.as_bytes())
175 .context("Write dataset README")?;
176 Ok(())
177}
178
179fn write_rule_files(root: &Path, rules: &[LoadedRule]) -> Result<()> {
180 let mut sorted = rules.iter().collect::<Vec<_>>();
181 sorted.sort_by_key(|rule| &rule.identifier);
182
183 for rule in sorted {
184 validate_dataset_filename_component(&rule.identifier, "rule identifier")?;
185 let rendered = render_rule(rule)?;
186 let output_path = root.join(LICENSE_DATASET_RULES_DIR).join(&rule.identifier);
187 write_bytes_atomically(&output_path, rendered.as_bytes())
188 .with_context(|| format!("Write rule dataset file {}", output_path.display()))?;
189 }
190
191 Ok(())
192}
193
194fn write_license_files(root: &Path, licenses: &[LoadedLicense]) -> Result<()> {
195 let mut sorted = licenses.iter().collect::<Vec<_>>();
196 sorted.sort_by_key(|license| &license.key);
197
198 for license in sorted {
199 validate_dataset_filename_component(&license.key, "license key")?;
200 let rendered = render_license(license)?;
201 let output_path = root
202 .join(LICENSE_DATASET_LICENSES_DIR)
203 .join(format!("{}.LICENSE", license.key));
204 write_bytes_atomically(&output_path, rendered.as_bytes())
205 .with_context(|| format!("Write license dataset file {}", output_path.display()))?;
206 }
207
208 Ok(())
209}
210
211fn load_strict_loaded_rules_from_directory(dir: &Path) -> Result<Vec<LoadedRule>> {
212 let mut rules = Vec::new();
213 let entries = std::fs::read_dir(dir)
214 .with_context(|| format!("Failed to read rules directory: {}", dir.display()))?;
215
216 for entry in entries {
217 let entry = entry
218 .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
219 let path = entry.path();
220 if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("RULE") {
221 rules.push(parse_rule_to_loaded(&path).with_context(|| {
222 format!("Failed to parse dataset rule file {}", path.display())
223 })?);
224 }
225 }
226
227 Ok(rules)
228}
229
230fn load_strict_loaded_licenses_from_directory(dir: &Path) -> Result<Vec<LoadedLicense>> {
231 let mut licenses = Vec::new();
232 let entries = std::fs::read_dir(dir)
233 .with_context(|| format!("Failed to read licenses directory: {}", dir.display()))?;
234
235 for entry in entries {
236 let entry = entry
237 .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
238 let path = entry.path();
239 if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("LICENSE") {
240 licenses.push(parse_license_to_loaded(&path).with_context(|| {
241 format!("Failed to parse dataset license file {}", path.display())
242 })?);
243 }
244 }
245
246 Ok(licenses)
247}
248
249fn validate_dataset_filename_component(value: &str, kind: &str) -> Result<()> {
250 if value.is_empty()
251 || value.contains('/')
252 || value.contains('\\')
253 || value.contains("..")
254 || Path::new(value).components().count() != 1
255 {
256 return Err(anyhow!(
257 "Invalid {} for exported license dataset: {}",
258 kind,
259 value
260 ));
261 }
262
263 Ok(())
264}
265
266fn render_rule(rule: &LoadedRule) -> Result<String> {
267 let mut rendered = String::from("---\n");
268 push_yaml_string(
269 &mut rendered,
270 "license_expression",
271 Some(&rule.license_expression),
272 )?;
273 push_rule_kind(&mut rendered, rule.rule_kind);
274 push_yaml_bool(&mut rendered, "is_false_positive", rule.is_false_positive);
275 push_yaml_bool(&mut rendered, "is_required_phrase", rule.is_required_phrase);
276 push_yaml_bool(
277 &mut rendered,
278 "skip_for_required_phrase_generation",
279 rule.skip_for_required_phrase_generation,
280 );
281 push_yaml_u8(&mut rendered, "relevance", rule.relevance);
282 if rule.has_stored_minimum_coverage {
283 push_yaml_u8(&mut rendered, "minimum_coverage", rule.minimum_coverage);
284 }
285 push_yaml_bool(&mut rendered, "is_continuous", rule.is_continuous);
286 push_yaml_bool(&mut rendered, "is_deprecated", rule.is_deprecated);
287 push_yaml_list(
288 &mut rendered,
289 "referenced_filenames",
290 rule.referenced_filenames.as_deref(),
291 )?;
292 push_yaml_list(&mut rendered, "replaced_by", Some(&rule.replaced_by))?;
293 push_yaml_list(
294 &mut rendered,
295 "ignorable_urls",
296 rule.ignorable_urls.as_deref(),
297 )?;
298 push_yaml_list(
299 &mut rendered,
300 "ignorable_emails",
301 rule.ignorable_emails.as_deref(),
302 )?;
303 push_yaml_string(&mut rendered, "notes", rule.notes.as_deref())?;
304 push_yaml_list(
305 &mut rendered,
306 "ignorable_copyrights",
307 rule.ignorable_copyrights.as_deref(),
308 )?;
309 push_yaml_list(
310 &mut rendered,
311 "ignorable_holders",
312 rule.ignorable_holders.as_deref(),
313 )?;
314 push_yaml_list(
315 &mut rendered,
316 "ignorable_authors",
317 rule.ignorable_authors.as_deref(),
318 )?;
319 push_yaml_string(&mut rendered, "language", rule.language.as_deref())?;
320 rendered.push_str("---\n\n");
321 rendered.push_str(&rule.text);
322 rendered.push('\n');
323 Ok(rendered)
324}
325
326fn render_license(license: &LoadedLicense) -> Result<String> {
327 let mut rendered = String::from("---\n");
328 push_yaml_string(&mut rendered, "key", Some(&license.key))?;
329 push_yaml_string(&mut rendered, "short_name", license.short_name.as_deref())?;
330 push_yaml_string(&mut rendered, "name", Some(&license.name))?;
331 push_yaml_string(
332 &mut rendered,
333 "spdx_license_key",
334 license.spdx_license_key.as_deref(),
335 )?;
336 push_yaml_list(
337 &mut rendered,
338 "other_spdx_license_keys",
339 Some(&license.other_spdx_license_keys),
340 )?;
341 push_yaml_string(&mut rendered, "category", license.category.as_deref())?;
342 push_yaml_string(&mut rendered, "owner", license.owner.as_deref())?;
343 push_yaml_string(
344 &mut rendered,
345 "homepage_url",
346 license.homepage_url.as_deref(),
347 )?;
348 push_yaml_string(
349 &mut rendered,
350 "osi_license_key",
351 license.osi_license_key.as_deref(),
352 )?;
353 push_yaml_list(&mut rendered, "text_urls", Some(&license.text_urls))?;
354 push_yaml_string(&mut rendered, "osi_url", license.osi_url.as_deref())?;
355 push_yaml_string(&mut rendered, "faq_url", license.faq_url.as_deref())?;
356 push_yaml_list(&mut rendered, "other_urls", Some(&license.other_urls))?;
357 push_yaml_string(&mut rendered, "notes", license.notes.as_deref())?;
358 push_yaml_bool(&mut rendered, "is_deprecated", license.is_deprecated);
359 push_yaml_bool(&mut rendered, "is_exception", license.is_exception);
360 push_yaml_bool(&mut rendered, "is_unknown", license.is_unknown);
361 push_yaml_bool(&mut rendered, "is_generic", license.is_generic);
362 push_yaml_list(&mut rendered, "replaced_by", Some(&license.replaced_by))?;
363 push_yaml_u8(&mut rendered, "minimum_coverage", license.minimum_coverage);
364 push_yaml_string(
365 &mut rendered,
366 "standard_notice",
367 license.standard_notice.as_deref(),
368 )?;
369 push_yaml_list(
370 &mut rendered,
371 "ignorable_copyrights",
372 license.ignorable_copyrights.as_deref(),
373 )?;
374 push_yaml_list(
375 &mut rendered,
376 "ignorable_holders",
377 license.ignorable_holders.as_deref(),
378 )?;
379 push_yaml_list(
380 &mut rendered,
381 "ignorable_authors",
382 license.ignorable_authors.as_deref(),
383 )?;
384 push_yaml_list(
385 &mut rendered,
386 "ignorable_urls",
387 license.ignorable_urls.as_deref(),
388 )?;
389 push_yaml_list(
390 &mut rendered,
391 "ignorable_emails",
392 license.ignorable_emails.as_deref(),
393 )?;
394 rendered.push_str("---\n\n");
395 rendered.push_str(&license.text);
396 rendered.push('\n');
397 Ok(rendered)
398}
399
400fn push_rule_kind(rendered: &mut String, rule_kind: RuleKind) {
401 let key = match rule_kind {
402 RuleKind::None => return,
403 RuleKind::Text => "is_license_text",
404 RuleKind::Notice => "is_license_notice",
405 RuleKind::Reference => "is_license_reference",
406 RuleKind::Tag => "is_license_tag",
407 RuleKind::Intro => "is_license_intro",
408 RuleKind::Clue => "is_license_clue",
409 };
410 let _ = writeln!(rendered, "{key}: true");
411}
412
/// Appends `key: true` followed by a newline when `value` is set.
///
/// A `false` value appends nothing, so only enabled flags appear in the output.
fn push_yaml_bool(rendered: &mut String, key: &str, value: bool) {
    if !value {
        return;
    }
    // The result is discarded: formatting into a `String` is not expected to fail.
    let _ = writeln!(rendered, "{key}: true");
}
418
/// Appends `key: <number>` followed by a newline when `value` is present.
///
/// `None` appends nothing.
fn push_yaml_u8(rendered: &mut String, key: &str, value: Option<u8>) {
    let Some(value) = value else {
        return;
    };
    // The result is discarded: formatting into a `String` is not expected to fail.
    let _ = writeln!(rendered, "{key}: {value}");
}
424
425fn push_yaml_string(rendered: &mut String, key: &str, value: Option<&str>) -> Result<()> {
426 let Some(value) = value else {
427 return Ok(());
428 };
429 let quoted = serde_json::to_string(value).context("serialize yaml string")?;
430 let _ = writeln!(rendered, "{key}: {quoted}");
431 Ok(())
432}
433
434fn push_yaml_list(rendered: &mut String, key: &str, values: Option<&[String]>) -> Result<()> {
435 let Some(values) = values else {
436 return Ok(());
437 };
438 if values.is_empty() {
439 return Ok(());
440 }
441
442 let _ = writeln!(rendered, "{key}:");
443 for value in values {
444 let quoted = serde_json::to_string(value).context("serialize yaml list entry")?;
445 let _ = writeln!(rendered, " - {quoted}");
446 }
447 Ok(())
448}
449
// Unit tests for rendering, loading and exporting license datasets.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::license_detection::models::RuleKind;
    use crate::license_detection::rules::{parse_license_str_to_loaded, parse_rule_str_to_loaded};
    use tempfile::TempDir;

    // Builds a rule fixture with every optional field populated and every flag that
    // `render_rule` can emit switched on, so the round-trip test covers each key.
    fn create_loaded_rule() -> LoadedRule {
        LoadedRule {
            identifier: "example.RULE".to_string(),
            license_expression: "mit OR apache-2.0".to_string(),
            text: "Example rule text".to_string(),
            rule_kind: RuleKind::Notice,
            is_false_positive: false,
            is_required_phrase: true,
            skip_for_required_phrase_generation: true,
            relevance: Some(100),
            minimum_coverage: Some(75),
            has_stored_minimum_coverage: true,
            is_continuous: true,
            referenced_filenames: Some(vec!["LICENSE".to_string()]),
            ignorable_urls: Some(vec!["https://example.com".to_string()]),
            ignorable_emails: Some(vec!["legal@example.com".to_string()]),
            ignorable_copyrights: Some(vec!["Copyright Example".to_string()]),
            ignorable_holders: Some(vec!["Example Org".to_string()]),
            ignorable_authors: Some(vec!["Jane Doe".to_string()]),
            language: Some("en".to_string()),
            notes: Some("Example note".to_string()),
            is_deprecated: true,
            replaced_by: vec!["replacement.RULE".to_string()],
        }
    }

    // Builds a license fixture with every optional field populated.
    // NOTE(review): `language` and `reference_urls` are not written by `render_license`;
    // presumably the parser derives them (default language, URLs collected from the other
    // URL fields) — confirm against `parse_license_str_to_loaded`.
    fn create_loaded_license() -> LoadedLicense {
        LoadedLicense {
            key: "example-license".to_string(),
            short_name: Some("Example".to_string()),
            name: "Example License".to_string(),
            language: Some("en".to_string()),
            spdx_license_key: Some("MIT".to_string()),
            other_spdx_license_keys: vec!["Apache-2.0".to_string()],
            category: Some("Permissive".to_string()),
            owner: Some("Example Org".to_string()),
            homepage_url: Some("https://example.com".to_string()),
            text: "Example license text".to_string(),
            reference_urls: vec![
                "https://example.com/text".to_string(),
                "https://example.com/other".to_string(),
                "https://opensource.org/licenses/MIT".to_string(),
                "https://example.com/faq".to_string(),
                "https://example.com".to_string(),
            ],
            osi_license_key: Some("MIT".to_string()),
            text_urls: vec!["https://example.com/text".to_string()],
            osi_url: Some("https://opensource.org/licenses/MIT".to_string()),
            faq_url: Some("https://example.com/faq".to_string()),
            other_urls: vec!["https://example.com/other".to_string()],
            notes: Some("Example note".to_string()),
            is_deprecated: true,
            is_exception: true,
            is_unknown: true,
            is_generic: true,
            replaced_by: vec!["replacement".to_string()],
            minimum_coverage: Some(55),
            standard_notice: Some("Standard notice".to_string()),
            ignorable_copyrights: Some(vec!["Copyright Example".to_string()]),
            ignorable_holders: Some(vec!["Example Org".to_string()]),
            ignorable_authors: Some(vec!["Jane Doe".to_string()]),
            ignorable_urls: Some(vec!["https://example.com".to_string()]),
            ignorable_emails: Some(vec!["legal@example.com".to_string()]),
        }
    }

    // Rendering a rule and parsing the text back must yield an equal rule.
    #[test]
    fn render_rule_roundtrips_through_loader() {
        let rule = create_loaded_rule();
        let rendered = render_rule(&rule).expect("render rule");
        let reparsed = parse_rule_str_to_loaded(&rule.identifier, &rendered).expect("reparse rule");
        assert_eq!(reparsed, rule);
    }

    // Rendering a license and parsing the text back must yield an equal license.
    #[test]
    fn render_license_roundtrips_through_loader() {
        let license = create_loaded_license();
        let rendered = render_license(&license).expect("render license");
        let reparsed =
            parse_license_str_to_loaded("example-license.LICENSE", &rendered).expect("reparse");
        assert_eq!(reparsed, license);
    }

    // Both directories exist but there is no manifest, so loading must fail with an
    // error that mentions `manifest.json`.
    #[test]
    fn load_license_dataset_requires_manifest_and_expected_dirs() {
        let temp = TempDir::new().expect("temp dir");
        std::fs::create_dir_all(temp.path().join("rules")).expect("rules dir");
        std::fs::create_dir_all(temp.path().join("licenses")).expect("licenses dir");

        let error = load_license_dataset_from_root(temp.path()).expect_err("missing manifest");
        assert!(error.to_string().contains("manifest.json"));
    }

    // A dataset with a valid manifest and license but an unparseable rule file must be
    // rejected rather than loaded with the broken rule skipped.
    #[test]
    fn load_license_dataset_fails_on_invalid_rule_file() {
        let temp = TempDir::new().expect("temp dir");
        let root = temp.path();
        std::fs::create_dir_all(root.join("rules")).expect("rules dir");
        std::fs::create_dir_all(root.join("licenses")).expect("licenses dir");
        std::fs::write(
            root.join("manifest.json"),
            serde_json::json!({
                "schema_version": 1,
                "spdx_license_list_version": "3.27",
                "dataset_fingerprint": "abc",
                "exported_from_source": "embedded-artifact",
                "exported_by_version": "test",
            })
            .to_string(),
        )
        .expect("manifest");
        // The rule file has no front matter, which the rule parser is expected to reject.
        std::fs::write(root.join("rules").join("broken.RULE"), "not-frontmatter")
            .expect("broken rule");
        std::fs::write(
            root.join("licenses").join("mit.LICENSE"),
            "---\nkey: \"mit\"\nname: \"MIT License\"\n---\n\nMIT text\n",
        )
        .expect("license");

        let error = load_license_dataset_from_root(root).expect_err("invalid rule should fail");
        assert!(
            error
                .to_string()
                .contains("Failed to parse dataset rule file")
        );
    }

    // A rule identifier containing a path separator must be refused by the export.
    #[test]
    fn export_license_dataset_rejects_path_like_rule_identifier() {
        // Despite its name, this value is the artifact metadata passed to the export,
        // not a dataset manifest.
        let manifest = EmbeddedArtifactMetadata {
            spdx_license_list_version: "3.27".to_string(),
            license_index_provenance: crate::models::LicenseIndexProvenance {
                source: "embedded-artifact".to_string(),
                dataset_fingerprint: "abc123".to_string(),
                ignored_rules: vec![],
                ignored_licenses: vec![],
                ignored_rules_due_to_licenses: vec![],
                added_rules: vec![],
                replaced_rules: vec![],
                added_licenses: vec![],
                replaced_licenses: vec![],
            },
        };
        let temp = TempDir::new().expect("temp dir");

        let error = export_license_dataset_to_root(
            temp.path(),
            &[LoadedRule {
                identifier: "nested/path.RULE".to_string(),
                ..create_loaded_rule()
            }],
            &[create_loaded_license()],
            &manifest,
        )
        .expect_err("path-like identifiers should be rejected");

        assert!(
            error
                .to_string()
                .contains("Invalid rule identifier for exported license dataset")
        );
    }
}