1use std::fmt::Write as _;
5use std::path::Path;
6
7use anyhow::{Context, Result, anyhow};
8use serde::{Deserialize, Serialize};
9
10use crate::cache::write_bytes_atomically;
11use crate::license_detection::embedded::index::load_loader_snapshot_from_bytes;
12use crate::license_detection::embedded::schema::EmbeddedArtifactMetadata;
13use crate::license_detection::license_cache::compute_rules_fingerprint;
14use crate::license_detection::models::{LoadedLicense, LoadedRule, RuleKind};
15use crate::license_detection::rules::{parse_license_to_loaded, parse_rule_to_loaded};
16use crate::models::Sha256Digest;
17use crate::version::BUILD_VERSION;
18
/// Name of the sub-directory, relative to the dataset root, that holds the rule files.
pub const LICENSE_DATASET_RULES_DIR: &str = "rules";
/// Name of the sub-directory, relative to the dataset root, that holds the `.LICENSE` files.
pub const LICENSE_DATASET_LICENSES_DIR: &str = "licenses";
/// File name of the JSON manifest stored directly under the dataset root.
pub const LICENSE_DATASET_MANIFEST_FILE: &str = "manifest.json";
/// File name of the human-readable README written next to the manifest on export.
pub const LICENSE_DATASET_README_FILE: &str = "README.md";
/// Label for a custom license dataset.
///
/// NOTE(review): not referenced in the visible part of this file; presumably used as a
/// provenance `source` value by callers — confirm at the use sites.
pub const CUSTOM_LICENSE_DATASET_SOURCE: &str = "custom-license-dataset";
/// Manifest schema version written on export; loading rejects manifests with any other value.
const LICENSE_DATASET_SCHEMA_VERSION: u32 = 1;
25
/// Metadata stored as `manifest.json` at the root of an exported license dataset.
///
/// The struct is serialized with `serde`, so the field order below is also the key order of
/// the written JSON document.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct LicenseDatasetManifest {
    /// Layout version of the dataset; must equal `LICENSE_DATASET_SCHEMA_VERSION` when loading.
    pub schema_version: u32,
    /// SPDX license list version, copied from the exporting artifact's metadata.
    pub spdx_license_list_version: String,
    /// Fingerprint of the rules and licenses as computed at export time.
    pub dataset_fingerprint: String,
    /// Provenance `source` of the license index the dataset was exported from.
    pub exported_from_source: String,
    /// `BUILD_VERSION` of the binary that performed the export.
    pub exported_by_version: String,
}
34
/// Result of reading a license dataset directory: its manifest plus every parsed rule and
/// license.
#[derive(Debug, Clone)]
pub struct LoadedLicenseDataset {
    /// Manifest parsed from `manifest.json`.
    pub manifest: LicenseDatasetManifest,
    /// Rules parsed from the `.RULE` files in the `rules/` directory.
    pub rules: Vec<LoadedRule>,
    /// Licenses parsed from the `.LICENSE` files in the `licenses/` directory.
    pub licenses: Vec<LoadedLicense>,
}
41
42pub fn export_embedded_license_dataset(target_root: &Path) -> Result<LicenseDatasetManifest> {
43 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
44 let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
45 .map_err(|error| anyhow!("Failed to load embedded license dataset: {}", error))?;
46
47 export_license_dataset_to_root(
48 target_root,
49 &snapshot.rules,
50 &snapshot.licenses,
51 &snapshot.metadata,
52 )
53}
54
55pub fn export_license_dataset_to_root(
56 target_root: &Path,
57 rules: &[LoadedRule],
58 licenses: &[LoadedLicense],
59 metadata: &EmbeddedArtifactMetadata,
60) -> Result<LicenseDatasetManifest> {
61 ensure_export_target_is_empty(target_root)?;
62
63 let manifest = LicenseDatasetManifest {
64 schema_version: LICENSE_DATASET_SCHEMA_VERSION,
65 spdx_license_list_version: metadata.spdx_license_list_version.clone(),
66 dataset_fingerprint: compute_dataset_fingerprint_string(rules, licenses)?,
67 exported_from_source: metadata.license_index_provenance.source.clone(),
68 exported_by_version: BUILD_VERSION.to_string(),
69 };
70
71 write_dataset_manifest(target_root, &manifest)?;
72 write_dataset_readme(target_root, &manifest)?;
73 write_rule_files(target_root, rules)?;
74 write_license_files(target_root, licenses)?;
75
76 Ok(manifest)
77}
78
79pub fn load_license_dataset_from_root(root: &Path) -> Result<LoadedLicenseDataset> {
80 let rules_dir = root.join(LICENSE_DATASET_RULES_DIR);
81 let licenses_dir = root.join(LICENSE_DATASET_LICENSES_DIR);
82
83 if !root.is_dir() {
84 return Err(anyhow!(
85 "License dataset root does not exist or is not a directory: {}",
86 root.display()
87 ));
88 }
89 if !rules_dir.is_dir() {
90 return Err(anyhow!(
91 "License dataset is missing required rules/ directory: {}",
92 rules_dir.display()
93 ));
94 }
95 if !licenses_dir.is_dir() {
96 return Err(anyhow!(
97 "License dataset is missing required licenses/ directory: {}",
98 licenses_dir.display()
99 ));
100 }
101
102 let manifest_path = root.join(LICENSE_DATASET_MANIFEST_FILE);
103 let manifest_text = std::fs::read_to_string(&manifest_path).with_context(|| {
104 format!(
105 "License dataset is missing required manifest.json at {}",
106 manifest_path.display()
107 )
108 })?;
109 let manifest: LicenseDatasetManifest =
110 serde_json::from_str(&manifest_text).with_context(|| {
111 format!(
112 "Failed to parse license dataset manifest at {}",
113 manifest_path.display()
114 )
115 })?;
116
117 if manifest.schema_version != LICENSE_DATASET_SCHEMA_VERSION {
118 return Err(anyhow!(
119 "Unsupported license dataset schema version {} in {} (expected {})",
120 manifest.schema_version,
121 manifest_path.display(),
122 LICENSE_DATASET_SCHEMA_VERSION
123 ));
124 }
125
126 let rules = load_strict_loaded_rules_from_directory(&rules_dir)?;
127 let licenses = load_strict_loaded_licenses_from_directory(&licenses_dir)?;
128
129 Ok(LoadedLicenseDataset {
130 manifest,
131 rules,
132 licenses,
133 })
134}
135
136pub fn compute_dataset_fingerprint_string(
137 rules: &[LoadedRule],
138 licenses: &[LoadedLicense],
139) -> Result<String> {
140 Ok(Sha256Digest::from_bytes(compute_rules_fingerprint(rules, licenses)?).to_string())
141}
142
143fn ensure_export_target_is_empty(target_root: &Path) -> Result<()> {
144 if target_root.exists() {
145 let mut entries = std::fs::read_dir(target_root)
146 .with_context(|| format!("Failed to read export target {}", target_root.display()))?;
147 if entries.next().is_some() {
148 return Err(anyhow!(
149 "Refusing to export into non-empty directory {}",
150 target_root.display()
151 ));
152 }
153 } else {
154 std::fs::create_dir_all(target_root)
155 .with_context(|| format!("Failed to create export target {}", target_root.display()))?;
156 }
157
158 Ok(())
159}
160
161fn write_dataset_manifest(root: &Path, manifest: &LicenseDatasetManifest) -> Result<()> {
162 let payload = serde_json::to_vec_pretty(manifest).context("Serialize dataset manifest")?;
163 write_bytes_atomically(&root.join(LICENSE_DATASET_MANIFEST_FILE), &payload)
164 .context("Write dataset manifest")?;
165 Ok(())
166}
167
168fn write_dataset_readme(root: &Path, manifest: &LicenseDatasetManifest) -> Result<()> {
169 let text = format!(
170 "# Exported Provenant license dataset\n\nThis directory contains the effective `.RULE` and `.LICENSE` files used by Provenant.\n\n- Reuse it with `provenant --license-dataset-path <DIR> --license ...`\n- Edit files under `rules/` and `licenses/` to customize scan behavior\n- `manifest.json` records the exported dataset fingerprint and SPDX license list version\n- The fingerprint in `manifest.json` is informational; if you edit files, Provenant computes the active dataset fingerprint from current file contents\n\nExport metadata:\n\n- schema_version: {}\n- spdx_license_list_version: {}\n- dataset_fingerprint: {}\n- exported_from_source: {}\n- exported_by_version: {}\n",
171 manifest.schema_version,
172 manifest.spdx_license_list_version,
173 manifest.dataset_fingerprint,
174 manifest.exported_from_source,
175 manifest.exported_by_version,
176 );
177 write_bytes_atomically(&root.join(LICENSE_DATASET_README_FILE), text.as_bytes())
178 .context("Write dataset README")?;
179 Ok(())
180}
181
182fn write_rule_files(root: &Path, rules: &[LoadedRule]) -> Result<()> {
183 let mut sorted = rules.iter().collect::<Vec<_>>();
184 sorted.sort_by_key(|rule| &rule.identifier);
185
186 for rule in sorted {
187 validate_dataset_filename_component(&rule.identifier, "rule identifier")?;
188 let rendered = render_rule(rule)?;
189 let output_path = root.join(LICENSE_DATASET_RULES_DIR).join(&rule.identifier);
190 write_bytes_atomically(&output_path, rendered.as_bytes())
191 .with_context(|| format!("Write rule dataset file {}", output_path.display()))?;
192 }
193
194 Ok(())
195}
196
197fn write_license_files(root: &Path, licenses: &[LoadedLicense]) -> Result<()> {
198 let mut sorted = licenses.iter().collect::<Vec<_>>();
199 sorted.sort_by_key(|license| &license.key);
200
201 for license in sorted {
202 validate_dataset_filename_component(&license.key, "license key")?;
203 let rendered = render_license(license)?;
204 let output_path = root
205 .join(LICENSE_DATASET_LICENSES_DIR)
206 .join(format!("{}.LICENSE", license.key));
207 write_bytes_atomically(&output_path, rendered.as_bytes())
208 .with_context(|| format!("Write license dataset file {}", output_path.display()))?;
209 }
210
211 Ok(())
212}
213
214fn load_strict_loaded_rules_from_directory(dir: &Path) -> Result<Vec<LoadedRule>> {
215 let mut rules = Vec::new();
216 let entries = std::fs::read_dir(dir)
217 .with_context(|| format!("Failed to read rules directory: {}", dir.display()))?;
218
219 for entry in entries {
220 let entry = entry
221 .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
222 let path = entry.path();
223 if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("RULE") {
224 rules.push(parse_rule_to_loaded(&path).with_context(|| {
225 format!("Failed to parse dataset rule file {}", path.display())
226 })?);
227 }
228 }
229
230 Ok(rules)
231}
232
233fn load_strict_loaded_licenses_from_directory(dir: &Path) -> Result<Vec<LoadedLicense>> {
234 let mut licenses = Vec::new();
235 let entries = std::fs::read_dir(dir)
236 .with_context(|| format!("Failed to read licenses directory: {}", dir.display()))?;
237
238 for entry in entries {
239 let entry = entry
240 .with_context(|| format!("Failed to read directory entry in: {}", dir.display()))?;
241 let path = entry.path();
242 if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("LICENSE") {
243 licenses.push(parse_license_to_loaded(&path).with_context(|| {
244 format!("Failed to parse dataset license file {}", path.display())
245 })?);
246 }
247 }
248
249 Ok(licenses)
250}
251
252fn validate_dataset_filename_component(value: &str, kind: &str) -> Result<()> {
253 if value.is_empty()
254 || value.contains('/')
255 || value.contains('\\')
256 || value.contains("..")
257 || Path::new(value).components().count() != 1
258 {
259 return Err(anyhow!(
260 "Invalid {} for exported license dataset: {}",
261 kind,
262 value
263 ));
264 }
265
266 Ok(())
267}
268
269fn render_rule(rule: &LoadedRule) -> Result<String> {
270 let mut rendered = String::from("---\n");
271 push_yaml_string(
272 &mut rendered,
273 "license_expression",
274 Some(&rule.license_expression),
275 )?;
276 push_rule_kind(&mut rendered, rule.rule_kind);
277 push_yaml_bool(&mut rendered, "is_false_positive", rule.is_false_positive);
278 push_yaml_bool(&mut rendered, "is_required_phrase", rule.is_required_phrase);
279 push_yaml_bool(
280 &mut rendered,
281 "skip_for_required_phrase_generation",
282 rule.skip_for_required_phrase_generation,
283 );
284 push_yaml_u8(&mut rendered, "relevance", rule.relevance);
285 if rule.has_stored_minimum_coverage {
286 push_yaml_u8(&mut rendered, "minimum_coverage", rule.minimum_coverage);
287 }
288 push_yaml_bool(&mut rendered, "is_continuous", rule.is_continuous);
289 push_yaml_bool(&mut rendered, "is_deprecated", rule.is_deprecated);
290 push_yaml_list(
291 &mut rendered,
292 "referenced_filenames",
293 rule.referenced_filenames.as_deref(),
294 )?;
295 push_yaml_list(&mut rendered, "replaced_by", Some(&rule.replaced_by))?;
296 push_yaml_list(
297 &mut rendered,
298 "ignorable_urls",
299 rule.ignorable_urls.as_deref(),
300 )?;
301 push_yaml_list(
302 &mut rendered,
303 "ignorable_emails",
304 rule.ignorable_emails.as_deref(),
305 )?;
306 push_yaml_string(&mut rendered, "notes", rule.notes.as_deref())?;
307 push_yaml_list(
308 &mut rendered,
309 "ignorable_copyrights",
310 rule.ignorable_copyrights.as_deref(),
311 )?;
312 push_yaml_list(
313 &mut rendered,
314 "ignorable_holders",
315 rule.ignorable_holders.as_deref(),
316 )?;
317 push_yaml_list(
318 &mut rendered,
319 "ignorable_authors",
320 rule.ignorable_authors.as_deref(),
321 )?;
322 push_yaml_string(&mut rendered, "language", rule.language.as_deref())?;
323 rendered.push_str("---\n\n");
324 rendered.push_str(&rule.text);
325 rendered.push('\n');
326 Ok(rendered)
327}
328
329fn render_license(license: &LoadedLicense) -> Result<String> {
330 let mut rendered = String::from("---\n");
331 push_yaml_string(&mut rendered, "key", Some(&license.key))?;
332 push_yaml_string(&mut rendered, "short_name", license.short_name.as_deref())?;
333 push_yaml_string(&mut rendered, "name", Some(&license.name))?;
334 push_yaml_string(
335 &mut rendered,
336 "spdx_license_key",
337 license.spdx_license_key.as_deref(),
338 )?;
339 push_yaml_list(
340 &mut rendered,
341 "other_spdx_license_keys",
342 Some(&license.other_spdx_license_keys),
343 )?;
344 push_yaml_string(&mut rendered, "category", license.category.as_deref())?;
345 push_yaml_string(&mut rendered, "owner", license.owner.as_deref())?;
346 push_yaml_string(
347 &mut rendered,
348 "homepage_url",
349 license.homepage_url.as_deref(),
350 )?;
351 push_yaml_string(
352 &mut rendered,
353 "osi_license_key",
354 license.osi_license_key.as_deref(),
355 )?;
356 push_yaml_list(&mut rendered, "text_urls", Some(&license.text_urls))?;
357 push_yaml_string(&mut rendered, "osi_url", license.osi_url.as_deref())?;
358 push_yaml_string(&mut rendered, "faq_url", license.faq_url.as_deref())?;
359 push_yaml_list(&mut rendered, "other_urls", Some(&license.other_urls))?;
360 push_yaml_string(&mut rendered, "notes", license.notes.as_deref())?;
361 push_yaml_bool(&mut rendered, "is_deprecated", license.is_deprecated);
362 push_yaml_bool(&mut rendered, "is_exception", license.is_exception);
363 push_yaml_bool(&mut rendered, "is_unknown", license.is_unknown);
364 push_yaml_bool(&mut rendered, "is_generic", license.is_generic);
365 push_yaml_list(&mut rendered, "replaced_by", Some(&license.replaced_by))?;
366 push_yaml_u8(&mut rendered, "minimum_coverage", license.minimum_coverage);
367 push_yaml_string(
368 &mut rendered,
369 "standard_notice",
370 license.standard_notice.as_deref(),
371 )?;
372 push_yaml_list(
373 &mut rendered,
374 "ignorable_copyrights",
375 license.ignorable_copyrights.as_deref(),
376 )?;
377 push_yaml_list(
378 &mut rendered,
379 "ignorable_holders",
380 license.ignorable_holders.as_deref(),
381 )?;
382 push_yaml_list(
383 &mut rendered,
384 "ignorable_authors",
385 license.ignorable_authors.as_deref(),
386 )?;
387 push_yaml_list(
388 &mut rendered,
389 "ignorable_urls",
390 license.ignorable_urls.as_deref(),
391 )?;
392 push_yaml_list(
393 &mut rendered,
394 "ignorable_emails",
395 license.ignorable_emails.as_deref(),
396 )?;
397 rendered.push_str("---\n\n");
398 rendered.push_str(&license.text);
399 rendered.push('\n');
400 Ok(rendered)
401}
402
403fn push_rule_kind(rendered: &mut String, rule_kind: RuleKind) {
404 let key = match rule_kind {
405 RuleKind::None => return,
406 RuleKind::Text => "is_license_text",
407 RuleKind::Notice => "is_license_notice",
408 RuleKind::Reference => "is_license_reference",
409 RuleKind::Tag => "is_license_tag",
410 RuleKind::Intro => "is_license_intro",
411 RuleKind::Clue => "is_license_clue",
412 };
413 let _ = writeln!(rendered, "{key}: true");
414}
415
/// Appends `key: true` followed by a newline when `value` is set; `false` appends nothing.
fn push_yaml_bool(rendered: &mut String, key: &str, value: bool) {
    if !value {
        return;
    }
    // The `fmt::Result` of writing into a `String` is deliberately discarded.
    let _ = writeln!(rendered, "{key}: true");
}
421
/// Appends `key: <number>` followed by a newline when `value` is present; `None` appends
/// nothing.
fn push_yaml_u8(rendered: &mut String, key: &str, value: Option<u8>) {
    let Some(value) = value else {
        return;
    };
    // The `fmt::Result` of writing into a `String` is deliberately discarded.
    let _ = writeln!(rendered, "{key}: {value}");
}
427
428fn push_yaml_string(rendered: &mut String, key: &str, value: Option<&str>) -> Result<()> {
429 let Some(value) = value else {
430 return Ok(());
431 };
432 let quoted = serde_json::to_string(value).context("serialize yaml string")?;
433 let _ = writeln!(rendered, "{key}: {quoted}");
434 Ok(())
435}
436
437fn push_yaml_list(rendered: &mut String, key: &str, values: Option<&[String]>) -> Result<()> {
438 let Some(values) = values else {
439 return Ok(());
440 };
441 if values.is_empty() {
442 return Ok(());
443 }
444
445 let _ = writeln!(rendered, "{key}:");
446 for value in values {
447 let quoted = serde_json::to_string(value).context("serialize yaml list entry")?;
448 let _ = writeln!(rendered, " - {quoted}");
449 }
450 Ok(())
451}
452
#[cfg(test)]
mod tests {
    use super::*;
    use crate::license_detection::models::RuleKind;
    use crate::license_detection::rules::{parse_license_str_to_loaded, parse_rule_str_to_loaded};
    use tempfile::TempDir;

    /// Turns string literals into the owned `Vec<String>` used by the fixtures.
    fn strings(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| String::from(*item)).collect()
    }

    /// Rule fixture with every optional field populated.
    fn create_loaded_rule() -> LoadedRule {
        LoadedRule {
            identifier: String::from("example.RULE"),
            license_expression: String::from("mit OR apache-2.0"),
            text: String::from("Example rule text"),
            rule_kind: RuleKind::Notice,
            is_false_positive: false,
            is_required_phrase: true,
            skip_for_required_phrase_generation: true,
            relevance: Some(100),
            minimum_coverage: Some(75),
            has_stored_minimum_coverage: true,
            is_continuous: true,
            referenced_filenames: Some(strings(&["LICENSE"])),
            ignorable_urls: Some(strings(&["https://example.com"])),
            ignorable_emails: Some(strings(&["legal@example.com"])),
            ignorable_copyrights: Some(strings(&["Copyright Example"])),
            ignorable_holders: Some(strings(&["Example Org"])),
            ignorable_authors: Some(strings(&["Jane Doe"])),
            language: Some(String::from("en")),
            notes: Some(String::from("Example note")),
            is_deprecated: true,
            replaced_by: strings(&["replacement.RULE"]),
        }
    }

    /// License fixture with every optional field populated.
    fn create_loaded_license() -> LoadedLicense {
        LoadedLicense {
            key: String::from("example-license"),
            short_name: Some(String::from("Example")),
            name: String::from("Example License"),
            language: Some(String::from("en")),
            spdx_license_key: Some(String::from("MIT")),
            other_spdx_license_keys: strings(&["Apache-2.0"]),
            category: Some(String::from("Permissive")),
            owner: Some(String::from("Example Org")),
            homepage_url: Some(String::from("https://example.com")),
            text: String::from("Example license text"),
            reference_urls: strings(&[
                "https://example.com/text",
                "https://example.com/other",
                "https://opensource.org/licenses/MIT",
                "https://example.com/faq",
                "https://example.com",
            ]),
            osi_license_key: Some(String::from("MIT")),
            text_urls: strings(&["https://example.com/text"]),
            osi_url: Some(String::from("https://opensource.org/licenses/MIT")),
            faq_url: Some(String::from("https://example.com/faq")),
            other_urls: strings(&["https://example.com/other"]),
            notes: Some(String::from("Example note")),
            is_deprecated: true,
            is_exception: true,
            is_unknown: true,
            is_generic: true,
            replaced_by: strings(&["replacement"]),
            minimum_coverage: Some(55),
            standard_notice: Some(String::from("Standard notice")),
            ignorable_copyrights: Some(strings(&["Copyright Example"])),
            ignorable_holders: Some(strings(&["Example Org"])),
            ignorable_authors: Some(strings(&["Jane Doe"])),
            ignorable_urls: Some(strings(&["https://example.com"])),
            ignorable_emails: Some(strings(&["legal@example.com"])),
        }
    }

    /// A rendered rule must parse back into an identical `LoadedRule`.
    #[test]
    fn render_rule_roundtrips_through_loader() {
        let original = create_loaded_rule();
        let document = render_rule(&original).expect("render rule");

        let reparsed =
            parse_rule_str_to_loaded(&original.identifier, &document).expect("reparse rule");

        assert_eq!(reparsed, original);
    }

    /// A rendered license must parse back into an identical `LoadedLicense`.
    #[test]
    fn render_license_roundtrips_through_loader() {
        let original = create_loaded_license();
        let document = render_license(&original).expect("render license");

        let reparsed =
            parse_license_str_to_loaded("example-license.LICENSE", &document).expect("reparse");

        assert_eq!(reparsed, original);
    }

    /// With both sub-directories present but no manifest, loading must report the manifest.
    #[test]
    fn load_license_dataset_requires_manifest_and_expected_dirs() {
        let workspace = TempDir::new().expect("temp dir");
        let root = workspace.path();
        std::fs::create_dir_all(root.join("rules")).expect("rules dir");
        std::fs::create_dir_all(root.join("licenses")).expect("licenses dir");

        let failure = load_license_dataset_from_root(root).expect_err("missing manifest");

        assert!(failure.to_string().contains("manifest.json"));
    }

    /// A rule file without frontmatter must fail the whole load instead of being skipped.
    #[test]
    fn load_license_dataset_fails_on_invalid_rule_file() {
        let workspace = TempDir::new().expect("temp dir");
        let root = workspace.path();
        let rules_dir = root.join("rules");
        let licenses_dir = root.join("licenses");
        std::fs::create_dir_all(&rules_dir).expect("rules dir");
        std::fs::create_dir_all(&licenses_dir).expect("licenses dir");

        let manifest_json = serde_json::json!({
            "schema_version": 1,
            "spdx_license_list_version": "3.27",
            "dataset_fingerprint": "abc",
            "exported_from_source": "embedded-artifact",
            "exported_by_version": "test",
        })
        .to_string();
        std::fs::write(root.join("manifest.json"), manifest_json).expect("manifest");

        std::fs::write(rules_dir.join("broken.RULE"), "not-frontmatter").expect("broken rule");
        std::fs::write(
            licenses_dir.join("mit.LICENSE"),
            "---\nkey: \"mit\"\nname: \"MIT License\"\n---\n\nMIT text\n",
        )
        .expect("license");

        let failure = load_license_dataset_from_root(root).expect_err("invalid rule should fail");

        assert!(
            failure
                .to_string()
                .contains("Failed to parse dataset rule file")
        );
    }

    /// Identifiers containing a path separator must be refused by the export.
    #[test]
    fn export_license_dataset_rejects_path_like_rule_identifier() {
        let provenance = crate::models::LicenseIndexProvenance {
            source: String::from("embedded-artifact"),
            dataset_fingerprint: String::from("abc123"),
            ignored_rules: vec![],
            ignored_licenses: vec![],
            ignored_rules_due_to_licenses: vec![],
            added_rules: vec![],
            replaced_rules: vec![],
            added_licenses: vec![],
            replaced_licenses: vec![],
        };
        let metadata = EmbeddedArtifactMetadata {
            spdx_license_list_version: String::from("3.27"),
            license_index_provenance: provenance,
        };
        let path_like_rule = LoadedRule {
            identifier: String::from("nested/path.RULE"),
            ..create_loaded_rule()
        };
        let workspace = TempDir::new().expect("temp dir");

        let failure = export_license_dataset_to_root(
            workspace.path(),
            &[path_like_rule],
            &[create_loaded_license()],
            &metadata,
        )
        .expect_err("path-like identifiers should be rejected");

        assert!(
            failure
                .to_string()
                .contains("Invalid rule identifier for exported license dataset")
        );
    }
}