Skip to main content

provenant/license_detection/
build_policy.rs

1use std::collections::{HashMap, HashSet};
2use std::sync::LazyLock;
3
4use anyhow::{Result, anyhow};
5use serde::Deserialize;
6
7use crate::license_detection::expression::parse_expression;
8use crate::license_detection::models::{LoadedLicense, LoadedRule, RuleKind};
9use crate::license_detection::rules::{parse_license_str_to_loaded, parse_rule_str_to_loaded};
10use crate::models::LicenseIndexProvenance;
11
12pub const DEFAULT_INDEX_BUILD_POLICY_PATH: &str =
13    "resources/license_detection/index_build_policy.toml";
14pub const DEFAULT_INDEX_BUILD_OVERLAY_ROOT: &str = "resources/license_detection/overlay";
15pub const EMBEDDED_LICENSE_INDEX_SOURCE: &str = "embedded-artifact";
16
17const DEFAULT_INDEX_BUILD_POLICY_TEXT: &str =
18    include_str!("../../resources/license_detection/index_build_policy.toml");
19
/// One overlay file embedded into the binary at build time.
///
/// Instances are listed in the build-script-generated manifest and are parsed into
/// loaded rules or licenses by the `load_default_overlay_*` functions.
pub(crate) struct BundledOverlayFile {
    /// Identifier handed to the overlay parser and quoted in parse-error messages.
    pub identifier: &'static str,
    /// Raw text of the overlay file.
    pub contents: &'static str,
}
24
25mod bundled_overlay_manifest {
26    use super::BundledOverlayFile;
27
28    include!(concat!(env!("OUT_DIR"), "/bundled_license_overlays.rs"));
29}
30
31use bundled_overlay_manifest::{BUNDLED_LICENSE_OVERLAY_FILES, BUNDLED_RULE_OVERLAY_FILES};
32
33static DEFAULT_INDEX_BUILD_POLICY: LazyLock<IndexBuildPolicy> = LazyLock::new(|| {
34    toml::from_str(DEFAULT_INDEX_BUILD_POLICY_TEXT).unwrap_or_else(|error| {
35        panic!(
36            "Failed to parse bundled license index build policy at {}: {}",
37            DEFAULT_INDEX_BUILD_POLICY_PATH, error
38        )
39    })
40});
41
42#[derive(Debug, Clone, Default, PartialEq, Eq, Deserialize)]
43pub struct IndexBuildPolicy {
44    #[serde(default)]
45    pub ignored_rules: Vec<String>,
46    #[serde(default)]
47    pub ignored_licenses: Vec<String>,
48}
49
50impl IndexBuildPolicy {
51    pub fn is_empty(&self) -> bool {
52        self.ignored_rules.is_empty() && self.ignored_licenses.is_empty()
53    }
54
55    fn ignored_rule_set(&self) -> HashSet<String> {
56        self.ignored_rules
57            .iter()
58            .map(|identifier| identifier.trim())
59            .filter(|identifier| !identifier.is_empty())
60            .map(ToOwned::to_owned)
61            .collect()
62    }
63
64    fn ignored_license_set(&self) -> HashSet<String> {
65        self.ignored_licenses
66            .iter()
67            .map(|key| normalize_license_key(key))
68            .filter(|key| !key.is_empty())
69            .collect()
70    }
71}
72
/// Report of what applying an index build policy and its overlays actually changed.
///
/// `apply_index_build_policy` sorts and deduplicates every list before returning the report.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct AppliedIndexBuildPolicy {
    /// Identifiers of rules dropped because the policy names them directly.
    pub ignored_rules: Vec<String>,
    /// Keys of licenses dropped because the policy names them.
    pub ignored_licenses: Vec<String>,
    /// Identifiers of rules dropped because they reference an ignored license.
    pub ignored_rules_due_to_licenses: Vec<String>,
    /// Identifiers of overlay rules that had no upstream counterpart.
    pub added_rules: Vec<String>,
    /// Identifiers of upstream rules replaced by an overlay rule.
    pub replaced_rules: Vec<String>,
    /// Keys of overlay licenses that had no upstream counterpart.
    pub added_licenses: Vec<String>,
    /// Keys of upstream licenses replaced by an overlay license.
    pub replaced_licenses: Vec<String>,
}
83
84impl AppliedIndexBuildPolicy {
85    pub fn is_empty(&self) -> bool {
86        self.ignored_rules.is_empty()
87            && self.ignored_licenses.is_empty()
88            && self.ignored_rules_due_to_licenses.is_empty()
89            && self.added_rules.is_empty()
90            && self.replaced_rules.is_empty()
91            && self.added_licenses.is_empty()
92            && self.replaced_licenses.is_empty()
93    }
94
95    fn sort_and_dedup(&mut self) {
96        for values in [
97            &mut self.ignored_rules,
98            &mut self.ignored_licenses,
99            &mut self.ignored_rules_due_to_licenses,
100            &mut self.added_rules,
101            &mut self.replaced_rules,
102            &mut self.added_licenses,
103            &mut self.replaced_licenses,
104        ] {
105            values.sort();
106            values.dedup();
107        }
108    }
109
110    pub fn to_license_index_provenance(
111        &self,
112        source: &str,
113        dataset_fingerprint: String,
114    ) -> LicenseIndexProvenance {
115        LicenseIndexProvenance {
116            source: source.to_string(),
117            dataset_fingerprint,
118            ignored_rules: self.ignored_rules.clone(),
119            ignored_licenses: self.ignored_licenses.clone(),
120            ignored_rules_due_to_licenses: self.ignored_rules_due_to_licenses.clone(),
121            added_rules: self.added_rules.clone(),
122            replaced_rules: self.replaced_rules.clone(),
123            added_licenses: self.added_licenses.clone(),
124            replaced_licenses: self.replaced_licenses.clone(),
125        }
126    }
127}
128
129pub fn default_index_build_policy() -> &'static IndexBuildPolicy {
130    &DEFAULT_INDEX_BUILD_POLICY
131}
132
133pub fn apply_default_index_build_policy(
134    loaded_rules: Vec<LoadedRule>,
135    loaded_licenses: Vec<LoadedLicense>,
136) -> Result<(Vec<LoadedRule>, Vec<LoadedLicense>, AppliedIndexBuildPolicy)> {
137    let overlay_rules = load_default_overlay_rules()?;
138    let overlay_licenses = load_default_overlay_licenses()?;
139    let (loaded_rules, loaded_licenses, report) = apply_index_build_policy(
140        loaded_rules,
141        loaded_licenses,
142        default_index_build_policy(),
143        &overlay_rules,
144        &overlay_licenses,
145    )?;
146    Ok((loaded_rules, loaded_licenses, report))
147}
148
149pub fn apply_index_build_policy(
150    loaded_rules: Vec<LoadedRule>,
151    loaded_licenses: Vec<LoadedLicense>,
152    policy: &IndexBuildPolicy,
153    overlay_rules: &[LoadedRule],
154    overlay_licenses: &[LoadedLicense],
155) -> Result<(Vec<LoadedRule>, Vec<LoadedLicense>, AppliedIndexBuildPolicy)> {
156    if policy.is_empty() && overlay_rules.is_empty() && overlay_licenses.is_empty() {
157        return Ok((
158            loaded_rules,
159            loaded_licenses,
160            AppliedIndexBuildPolicy::default(),
161        ));
162    }
163
164    let ignored_rule_identifiers = policy.ignored_rule_set();
165    let ignored_license_keys = policy.ignored_license_set();
166    let mut report = AppliedIndexBuildPolicy::default();
167
168    let mut filtered_licenses: Vec<_> = loaded_licenses
169        .into_iter()
170        .filter_map(|license| {
171            if ignored_license_keys.contains(&normalize_license_key(&license.key)) {
172                report.ignored_licenses.push(license.key.clone());
173                None
174            } else {
175                Some(license)
176            }
177        })
178        .collect();
179
180    let mut filtered_rules: Vec<_> = loaded_rules
181        .into_iter()
182        .filter_map(|rule| {
183            if ignored_rule_identifiers.contains(rule.identifier.as_str()) {
184                report.ignored_rules.push(rule.identifier.clone());
185                return None;
186            }
187
188            if rule_references_ignored_license(&rule, &ignored_license_keys) {
189                report
190                    .ignored_rules_due_to_licenses
191                    .push(rule.identifier.clone());
192                return None;
193            }
194
195            Some(rule)
196        })
197        .collect();
198
199    ensure_all_ignored_entries_exist(&ignored_rule_identifiers, &ignored_license_keys, &report)?;
200
201    apply_license_overlays(
202        &mut filtered_licenses,
203        overlay_licenses,
204        &ignored_license_keys,
205        &mut report,
206    )?;
207    apply_rule_overlays(
208        &mut filtered_rules,
209        overlay_rules,
210        &ignored_rule_identifiers,
211        &ignored_license_keys,
212        &filtered_licenses,
213        &mut report,
214    )?;
215
216    report.sort_and_dedup();
217
218    Ok((filtered_rules, filtered_licenses, report))
219}
220
221fn load_default_overlay_rules() -> Result<Vec<LoadedRule>> {
222    BUNDLED_RULE_OVERLAY_FILES
223        .iter()
224        .map(|overlay| {
225            parse_rule_str_to_loaded(overlay.identifier, overlay.contents).map_err(|error| {
226                anyhow!(
227                    "Failed to parse bundled overlay rule {} from {}: {}",
228                    overlay.identifier,
229                    DEFAULT_INDEX_BUILD_OVERLAY_ROOT,
230                    error
231                )
232            })
233        })
234        .collect()
235}
236
237fn load_default_overlay_licenses() -> Result<Vec<LoadedLicense>> {
238    BUNDLED_LICENSE_OVERLAY_FILES
239        .iter()
240        .map(|overlay| {
241            parse_license_str_to_loaded(overlay.identifier, overlay.contents).map_err(|error| {
242                anyhow!(
243                    "Failed to parse bundled overlay license {} from {}: {}",
244                    overlay.identifier,
245                    DEFAULT_INDEX_BUILD_OVERLAY_ROOT,
246                    error
247                )
248            })
249        })
250        .collect()
251}
252
253fn ensure_all_ignored_entries_exist(
254    ignored_rule_identifiers: &HashSet<String>,
255    ignored_license_keys: &HashSet<String>,
256    report: &AppliedIndexBuildPolicy,
257) -> Result<()> {
258    let applied_ignored_rules = report.ignored_rules.iter().cloned().collect::<HashSet<_>>();
259    let missing_rules = ignored_rule_identifiers
260        .difference(&applied_ignored_rules)
261        .cloned()
262        .collect::<Vec<_>>();
263
264    let applied_ignored_licenses = report
265        .ignored_licenses
266        .iter()
267        .map(|key| normalize_license_key(key))
268        .collect::<HashSet<_>>();
269    let missing_licenses = ignored_license_keys
270        .difference(&applied_ignored_licenses)
271        .cloned()
272        .collect::<Vec<_>>();
273
274    if missing_rules.is_empty() && missing_licenses.is_empty() {
275        Ok(())
276    } else {
277        let mut problems = Vec::new();
278        if !missing_rules.is_empty() {
279            problems.push(format!(
280                "ignored rule identifiers not found upstream: {}",
281                missing_rules.join(", ")
282            ));
283        }
284        if !missing_licenses.is_empty() {
285            problems.push(format!(
286                "ignored license keys not found upstream: {}",
287                missing_licenses.join(", ")
288            ));
289        }
290        Err(anyhow!(
291            "stale index-build policy entries detected; remove or update them: {}",
292            problems.join("; ")
293        ))
294    }
295}
296
297fn apply_license_overlays(
298    licenses: &mut Vec<LoadedLicense>,
299    overlays: &[LoadedLicense],
300    ignored_license_keys: &HashSet<String>,
301    report: &mut AppliedIndexBuildPolicy,
302) -> Result<()> {
303    let mut indices = build_license_index_map(licenses)?;
304    let mut seen_overlay_keys = HashSet::new();
305
306    for overlay in overlays {
307        let key = normalize_license_key(&overlay.key);
308
309        if !seen_overlay_keys.insert(key.clone()) {
310            return Err(anyhow!(
311                "bundled overlay contains duplicate license key '{}'",
312                overlay.key
313            ));
314        }
315
316        if ignored_license_keys.contains(&key) {
317            return Err(anyhow!(
318                "overlay license '{}' conflicts with ignored_licenses",
319                overlay.key
320            ));
321        }
322
323        if let Some(index) = indices.get(&key).copied() {
324            if licenses[index] == *overlay {
325                return Err(anyhow!(
326                    "overlay license '{}' is now identical to upstream; remove the local overlay file",
327                    overlay.key
328                ));
329            }
330            report.replaced_licenses.push(overlay.key.clone());
331            licenses[index] = overlay.clone();
332        } else {
333            report.added_licenses.push(overlay.key.clone());
334            licenses.push(overlay.clone());
335            indices.insert(key, licenses.len() - 1);
336        }
337    }
338
339    Ok(())
340}
341
342fn apply_rule_overlays(
343    rules: &mut Vec<LoadedRule>,
344    overlays: &[LoadedRule],
345    ignored_rule_identifiers: &HashSet<String>,
346    ignored_license_keys: &HashSet<String>,
347    licenses: &[LoadedLicense],
348    report: &mut AppliedIndexBuildPolicy,
349) -> Result<()> {
350    let mut indices = build_rule_index_map(rules)?;
351    let mut seen_overlay_identifiers = HashSet::new();
352    let available_license_keys = licenses
353        .iter()
354        .map(|license| normalize_license_key(&license.key))
355        .collect::<HashSet<_>>();
356
357    for overlay in overlays {
358        let identifier = overlay.identifier.clone();
359
360        if !seen_overlay_identifiers.insert(identifier.clone()) {
361            return Err(anyhow!(
362                "bundled overlay contains duplicate rule identifier '{}'",
363                identifier
364            ));
365        }
366
367        if ignored_rule_identifiers.contains(identifier.as_str()) {
368            return Err(anyhow!(
369                "overlay rule '{}' conflicts with ignored_rules",
370                identifier
371            ));
372        }
373
374        if rule_references_ignored_license(overlay, ignored_license_keys) {
375            return Err(anyhow!(
376                "overlay rule '{}' references an ignored license key",
377                identifier
378            ));
379        }
380
381        ensure_rule_references_known_licenses(overlay, &available_license_keys)?;
382
383        if let Some(index) = indices.get(identifier.as_str()).copied() {
384            if rules[index] == *overlay {
385                return Err(anyhow!(
386                    "overlay rule '{}' is now identical to upstream; remove the local overlay file",
387                    identifier
388                ));
389            }
390            report.replaced_rules.push(identifier.clone());
391            rules[index] = overlay.clone();
392        } else {
393            report.added_rules.push(identifier.clone());
394            rules.push(overlay.clone());
395            indices.insert(identifier, rules.len() - 1);
396        }
397    }
398
399    Ok(())
400}
401
402fn build_rule_index_map(rules: &[LoadedRule]) -> Result<HashMap<String, usize>> {
403    let mut indices = HashMap::new();
404    for (index, rule) in rules.iter().enumerate() {
405        if indices.insert(rule.identifier.clone(), index).is_some() {
406            return Err(anyhow!(
407                "cannot apply overlay because duplicate rule identifier '{}' is already present",
408                rule.identifier
409            ));
410        }
411    }
412    Ok(indices)
413}
414
415fn build_license_index_map(licenses: &[LoadedLicense]) -> Result<HashMap<String, usize>> {
416    let mut indices = HashMap::new();
417    for (index, license) in licenses.iter().enumerate() {
418        let normalized_key = normalize_license_key(&license.key);
419        if indices.insert(normalized_key, index).is_some() {
420            return Err(anyhow!(
421                "cannot apply overlay because duplicate license key '{}' is already present",
422                license.key
423            ));
424        }
425    }
426    Ok(indices)
427}
428
429fn ensure_rule_references_known_licenses(
430    rule: &LoadedRule,
431    available_license_keys: &HashSet<String>,
432) -> Result<()> {
433    if rule.rule_kind == RuleKind::None && rule.is_false_positive {
434        return Ok(());
435    }
436
437    let expression = parse_expression(&rule.license_expression).map_err(|error| {
438        anyhow!(
439            "overlay rule '{}' has an invalid license expression '{}': {}",
440            rule.identifier,
441            rule.license_expression,
442            error
443        )
444    })?;
445
446    let missing_keys = expression
447        .license_keys()
448        .into_iter()
449        .map(|key| normalize_license_key(&key))
450        .filter(|key| !available_license_keys.contains(key))
451        .collect::<Vec<_>>();
452
453    if missing_keys.is_empty() {
454        Ok(())
455    } else {
456        Err(anyhow!(
457            "overlay rule '{}' references unknown license keys: {}",
458            rule.identifier,
459            missing_keys.join(", ")
460        ))
461    }
462}
463
/// Canonical form of a license key for comparisons: surrounding whitespace removed, then
/// lowercased.
fn normalize_license_key(key: &str) -> String {
    let trimmed = key.trim();
    trimmed.to_lowercase()
}
467
468fn rule_references_ignored_license(
469    rule: &LoadedRule,
470    ignored_license_keys: &HashSet<String>,
471) -> bool {
472    if ignored_license_keys.is_empty() {
473        return false;
474    }
475
476    let normalized_expression = normalize_license_key(&rule.license_expression);
477    if ignored_license_keys.contains(&normalized_expression) {
478        return true;
479    }
480
481    if rule.rule_kind == RuleKind::None && rule.is_false_positive {
482        return false;
483    }
484
485    parse_expression(&rule.license_expression)
486        .map(|expression| {
487            expression
488                .license_keys()
489                .into_iter()
490                .map(|key| normalize_license_key(&key))
491                .any(|key| ignored_license_keys.contains(&key))
492        })
493        .unwrap_or(false)
494}
495
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a plain text rule with relevance 100 and every optional field unset.
    fn create_loaded_rule(identifier: &str, expression: &str) -> LoadedRule {
        LoadedRule {
            identifier: identifier.to_owned(),
            license_expression: expression.to_owned(),
            text: format!("{identifier} text"),
            rule_kind: RuleKind::Text,
            is_false_positive: false,
            is_required_phrase: false,
            skip_for_required_phrase_generation: false,
            relevance: Some(100),
            minimum_coverage: None,
            has_stored_minimum_coverage: false,
            is_continuous: false,
            referenced_filenames: None,
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: None,
            is_deprecated: false,
            replaced_by: Vec::new(),
        }
    }

    /// Builds a permissive English license whose names and text are derived from `key`.
    fn create_loaded_license(key: &str) -> LoadedLicense {
        LoadedLicense {
            key: key.to_owned(),
            short_name: Some(key.to_uppercase()),
            name: format!("{key} license"),
            language: Some("en".to_owned()),
            spdx_license_key: Some(key.to_uppercase()),
            other_spdx_license_keys: Vec::new(),
            category: Some("Permissive".to_owned()),
            owner: None,
            homepage_url: None,
            text: format!("{key} text"),
            reference_urls: Vec::new(),
            osi_license_key: None,
            text_urls: Vec::new(),
            osi_url: None,
            faq_url: None,
            other_urls: Vec::new(),
            notes: None,
            is_deprecated: false,
            is_exception: false,
            is_unknown: false,
            is_generic: false,
            replaced_by: Vec::new(),
            minimum_coverage: None,
            standard_notice: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            ignorable_urls: None,
            ignorable_emails: None,
        }
    }

    /// Identifiers of `rules`, in order.
    fn rule_identifiers(rules: &[LoadedRule]) -> Vec<&str> {
        rules.iter().map(|rule| rule.identifier.as_str()).collect()
    }

    /// Keys of `licenses`, in order.
    fn license_keys(licenses: &[LoadedLicense]) -> Vec<&str> {
        licenses
            .iter()
            .map(|license| license.key.as_str())
            .collect()
    }

    #[test]
    fn test_apply_index_build_policy_filters_direct_and_dependent_entries() {
        let upstream_rules = vec![
            create_loaded_rule("keep.RULE", "mit"),
            create_loaded_rule("direct.RULE", "mit"),
            create_loaded_rule("dependent.RULE", "mit OR apache-2.0"),
        ];
        let upstream_licenses = vec![
            create_loaded_license("mit"),
            create_loaded_license("apache-2.0"),
        ];
        let policy = IndexBuildPolicy {
            ignored_rules: vec!["direct.RULE".to_owned()],
            ignored_licenses: vec!["apache-2.0".to_owned()],
        };

        let (kept_rules, kept_licenses, report) =
            apply_index_build_policy(upstream_rules, upstream_licenses, &policy, &[], &[])
                .expect("policy application");

        assert_eq!(rule_identifiers(&kept_rules), vec!["keep.RULE"]);
        assert_eq!(license_keys(&kept_licenses), vec!["mit"]);
        assert_eq!(report.ignored_rules, vec!["direct.RULE".to_owned()]);
        assert_eq!(report.ignored_licenses, vec!["apache-2.0".to_owned()]);
        assert_eq!(
            report.ignored_rules_due_to_licenses,
            vec!["dependent.RULE".to_owned()]
        );
    }

    #[test]
    fn test_apply_index_build_policy_fails_for_stale_ignored_entries() {
        let policy = IndexBuildPolicy {
            ignored_rules: vec!["missing.RULE".to_owned()],
            ignored_licenses: Vec::new(),
        };
        let upstream_rules = vec![create_loaded_rule("keep.RULE", "mit")];
        let upstream_licenses = vec![create_loaded_license("mit")];

        let message =
            apply_index_build_policy(upstream_rules, upstream_licenses, &policy, &[], &[])
                .expect_err("missing ignored rule should fail")
                .to_string();

        assert!(message.contains("ignored rule identifiers not found upstream: missing.RULE"));
    }

    #[test]
    fn test_apply_index_build_policy_infers_add_from_new_overlay_entries() {
        let upstream_rules = vec![create_loaded_rule("keep.RULE", "mit")];
        let upstream_licenses = vec![create_loaded_license("mit")];
        let new_rules = vec![create_loaded_rule("custom-rule.RULE", "mit")];
        let new_licenses = vec![create_loaded_license("custom-license")];
        let policy = IndexBuildPolicy::default();

        let (merged_rules, merged_licenses, report) = apply_index_build_policy(
            upstream_rules,
            upstream_licenses,
            &policy,
            &new_rules,
            &new_licenses,
        )
        .expect("policy application");

        assert!(rule_identifiers(&merged_rules).contains(&"custom-rule.RULE"));
        assert!(license_keys(&merged_licenses).contains(&"custom-license"));
        assert_eq!(report.added_rules, vec!["custom-rule.RULE".to_owned()]);
        assert_eq!(report.added_licenses, vec!["custom-license".to_owned()]);
    }

    #[test]
    fn test_apply_index_build_policy_infers_replace_from_colliding_overlay_entries() {
        let upstream_rules = vec![create_loaded_rule("replace.RULE", "mit")];
        let upstream_licenses = vec![create_loaded_license("mit")];
        let changed_rule = LoadedRule {
            text: "updated rule text".to_owned(),
            ..create_loaded_rule("replace.RULE", "mit")
        };
        let changed_license = LoadedLicense {
            name: "MIT Updated".to_owned(),
            text: "updated license text".to_owned(),
            ..create_loaded_license("mit")
        };
        let policy = IndexBuildPolicy::default();

        let (merged_rules, merged_licenses, report) = apply_index_build_policy(
            upstream_rules,
            upstream_licenses,
            &policy,
            &[changed_rule],
            &[changed_license],
        )
        .expect("policy application");

        assert_eq!(merged_rules[0].text, "updated rule text");
        assert_eq!(merged_licenses[0].name, "MIT Updated");
        assert_eq!(report.replaced_rules, vec!["replace.RULE".to_owned()]);
        assert_eq!(report.replaced_licenses, vec!["mit".to_owned()]);
    }

    #[test]
    fn test_apply_index_build_policy_rejects_redundant_rule_overlay() {
        let upstream_rule = create_loaded_rule("replace.RULE", "mit");
        let identical_overlay = upstream_rule.clone();
        let policy = IndexBuildPolicy::default();

        let message = apply_index_build_policy(
            vec![upstream_rule],
            vec![create_loaded_license("mit")],
            &policy,
            &[identical_overlay],
            &[],
        )
        .expect_err("redundant overlay should fail")
        .to_string();

        assert!(message.contains("overlay rule 'replace.RULE' is now identical to upstream"));
    }

    #[test]
    fn test_apply_index_build_policy_rejects_redundant_license_overlay() {
        let upstream_license = create_loaded_license("mit");
        let identical_overlay = upstream_license.clone();
        let policy = IndexBuildPolicy::default();

        let message = apply_index_build_policy(
            vec![create_loaded_rule("keep.RULE", "mit")],
            vec![upstream_license],
            &policy,
            &[],
            &[identical_overlay],
        )
        .expect_err("redundant overlay should fail")
        .to_string();

        assert!(message.contains("overlay license 'mit' is now identical to upstream"));
    }
}
712                .to_string()
713                .contains("overlay license 'mit' is now identical to upstream")
714        );
715    }
716}