Skip to main content

provenant/license_detection/
build_policy.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use std::collections::{HashMap, HashSet};
5use std::sync::LazyLock;
6
7use anyhow::{Result, anyhow};
8use serde::Deserialize;
9
10use crate::license_detection::expression::parse_expression;
11use crate::license_detection::models::{LoadedLicense, LoadedRule, RuleKind};
12use crate::license_detection::rules::{parse_license_str_to_loaded, parse_rule_str_to_loaded};
13use crate::models::LicenseIndexProvenance;
14
15pub const DEFAULT_INDEX_BUILD_POLICY_PATH: &str =
16    "resources/license_detection/index_build_policy.toml";
17pub const DEFAULT_INDEX_BUILD_OVERLAY_ROOT: &str = "resources/license_detection/overlay";
18pub const EMBEDDED_LICENSE_INDEX_SOURCE: &str = "embedded-artifact";
19
20const DEFAULT_INDEX_BUILD_POLICY_TEXT: &str =
21    include_str!("../../resources/license_detection/index_build_policy.toml");
22
23pub(crate) struct BundledOverlayFile {
24    pub identifier: &'static str,
25    pub contents: &'static str,
26}
27
28mod bundled_overlay_manifest {
29    use super::BundledOverlayFile;
30
31    include!(concat!(env!("OUT_DIR"), "/bundled_license_overlays.rs"));
32}
33
34use bundled_overlay_manifest::{BUNDLED_LICENSE_OVERLAY_FILES, BUNDLED_RULE_OVERLAY_FILES};
35
36static DEFAULT_INDEX_BUILD_POLICY: LazyLock<IndexBuildPolicy> = LazyLock::new(|| {
37    toml::from_str(DEFAULT_INDEX_BUILD_POLICY_TEXT).unwrap_or_else(|error| {
38        panic!(
39            "Failed to parse bundled license index build policy at {}: {}",
40            DEFAULT_INDEX_BUILD_POLICY_PATH, error
41        )
42    })
43});
44
45#[derive(Debug, Clone, Default, PartialEq, Eq, Deserialize)]
46pub struct IndexBuildPolicy {
47    #[serde(default)]
48    pub ignored_rules: Vec<String>,
49    #[serde(default)]
50    pub ignored_licenses: Vec<String>,
51}
52
53impl IndexBuildPolicy {
54    pub fn is_empty(&self) -> bool {
55        self.ignored_rules.is_empty() && self.ignored_licenses.is_empty()
56    }
57
58    fn ignored_rule_set(&self) -> HashSet<String> {
59        self.ignored_rules
60            .iter()
61            .map(|identifier| identifier.trim())
62            .filter(|identifier| !identifier.is_empty())
63            .map(ToOwned::to_owned)
64            .collect()
65    }
66
67    fn ignored_license_set(&self) -> HashSet<String> {
68        self.ignored_licenses
69            .iter()
70            .map(|key| normalize_license_key(key))
71            .filter(|key| !key.is_empty())
72            .collect()
73    }
74}
75
76#[derive(Debug, Clone, Default, PartialEq, Eq)]
77pub struct AppliedIndexBuildPolicy {
78    pub ignored_rules: Vec<String>,
79    pub ignored_licenses: Vec<String>,
80    pub ignored_rules_due_to_licenses: Vec<String>,
81    pub added_rules: Vec<String>,
82    pub replaced_rules: Vec<String>,
83    pub added_licenses: Vec<String>,
84    pub replaced_licenses: Vec<String>,
85}
86
87impl AppliedIndexBuildPolicy {
88    pub fn is_empty(&self) -> bool {
89        self.ignored_rules.is_empty()
90            && self.ignored_licenses.is_empty()
91            && self.ignored_rules_due_to_licenses.is_empty()
92            && self.added_rules.is_empty()
93            && self.replaced_rules.is_empty()
94            && self.added_licenses.is_empty()
95            && self.replaced_licenses.is_empty()
96    }
97
98    fn sort_and_dedup(&mut self) {
99        for values in [
100            &mut self.ignored_rules,
101            &mut self.ignored_licenses,
102            &mut self.ignored_rules_due_to_licenses,
103            &mut self.added_rules,
104            &mut self.replaced_rules,
105            &mut self.added_licenses,
106            &mut self.replaced_licenses,
107        ] {
108            values.sort();
109            values.dedup();
110        }
111    }
112
113    pub fn to_license_index_provenance(
114        &self,
115        source: &str,
116        dataset_fingerprint: String,
117    ) -> LicenseIndexProvenance {
118        LicenseIndexProvenance {
119            source: source.to_string(),
120            dataset_fingerprint,
121            ignored_rules: self.ignored_rules.clone(),
122            ignored_licenses: self.ignored_licenses.clone(),
123            ignored_rules_due_to_licenses: self.ignored_rules_due_to_licenses.clone(),
124            added_rules: self.added_rules.clone(),
125            replaced_rules: self.replaced_rules.clone(),
126            added_licenses: self.added_licenses.clone(),
127            replaced_licenses: self.replaced_licenses.clone(),
128        }
129    }
130}
131
132pub fn default_index_build_policy() -> &'static IndexBuildPolicy {
133    &DEFAULT_INDEX_BUILD_POLICY
134}
135
136pub fn apply_default_index_build_policy(
137    loaded_rules: Vec<LoadedRule>,
138    loaded_licenses: Vec<LoadedLicense>,
139) -> Result<(Vec<LoadedRule>, Vec<LoadedLicense>, AppliedIndexBuildPolicy)> {
140    let overlay_rules = load_default_overlay_rules()?;
141    let overlay_licenses = load_default_overlay_licenses()?;
142    let (loaded_rules, loaded_licenses, report) = apply_index_build_policy(
143        loaded_rules,
144        loaded_licenses,
145        default_index_build_policy(),
146        &overlay_rules,
147        &overlay_licenses,
148    )?;
149    Ok((loaded_rules, loaded_licenses, report))
150}
151
152pub fn apply_index_build_policy(
153    loaded_rules: Vec<LoadedRule>,
154    loaded_licenses: Vec<LoadedLicense>,
155    policy: &IndexBuildPolicy,
156    overlay_rules: &[LoadedRule],
157    overlay_licenses: &[LoadedLicense],
158) -> Result<(Vec<LoadedRule>, Vec<LoadedLicense>, AppliedIndexBuildPolicy)> {
159    if policy.is_empty() && overlay_rules.is_empty() && overlay_licenses.is_empty() {
160        return Ok((
161            loaded_rules,
162            loaded_licenses,
163            AppliedIndexBuildPolicy::default(),
164        ));
165    }
166
167    let ignored_rule_identifiers = policy.ignored_rule_set();
168    let ignored_license_keys = policy.ignored_license_set();
169    let mut report = AppliedIndexBuildPolicy::default();
170
171    let mut filtered_licenses: Vec<_> = loaded_licenses
172        .into_iter()
173        .filter_map(|license| {
174            if ignored_license_keys.contains(&normalize_license_key(&license.key)) {
175                report.ignored_licenses.push(license.key.clone());
176                None
177            } else {
178                Some(license)
179            }
180        })
181        .collect();
182
183    let mut filtered_rules: Vec<_> = loaded_rules
184        .into_iter()
185        .filter_map(|rule| {
186            if ignored_rule_identifiers.contains(rule.identifier.as_str()) {
187                report.ignored_rules.push(rule.identifier.clone());
188                return None;
189            }
190
191            if rule_references_ignored_license(&rule, &ignored_license_keys) {
192                report
193                    .ignored_rules_due_to_licenses
194                    .push(rule.identifier.clone());
195                return None;
196            }
197
198            Some(rule)
199        })
200        .collect();
201
202    ensure_all_ignored_entries_exist(&ignored_rule_identifiers, &ignored_license_keys, &report)?;
203
204    apply_license_overlays(
205        &mut filtered_licenses,
206        overlay_licenses,
207        &ignored_license_keys,
208        &mut report,
209    )?;
210    apply_rule_overlays(
211        &mut filtered_rules,
212        overlay_rules,
213        &ignored_rule_identifiers,
214        &ignored_license_keys,
215        &filtered_licenses,
216        &mut report,
217    )?;
218
219    report.sort_and_dedup();
220
221    Ok((filtered_rules, filtered_licenses, report))
222}
223
224fn load_default_overlay_rules() -> Result<Vec<LoadedRule>> {
225    BUNDLED_RULE_OVERLAY_FILES
226        .iter()
227        .map(|overlay| {
228            parse_rule_str_to_loaded(overlay.identifier, overlay.contents).map_err(|error| {
229                anyhow!(
230                    "Failed to parse bundled overlay rule {} from {}: {}",
231                    overlay.identifier,
232                    DEFAULT_INDEX_BUILD_OVERLAY_ROOT,
233                    error
234                )
235            })
236        })
237        .collect()
238}
239
240fn load_default_overlay_licenses() -> Result<Vec<LoadedLicense>> {
241    BUNDLED_LICENSE_OVERLAY_FILES
242        .iter()
243        .map(|overlay| {
244            parse_license_str_to_loaded(overlay.identifier, overlay.contents).map_err(|error| {
245                anyhow!(
246                    "Failed to parse bundled overlay license {} from {}: {}",
247                    overlay.identifier,
248                    DEFAULT_INDEX_BUILD_OVERLAY_ROOT,
249                    error
250                )
251            })
252        })
253        .collect()
254}
255
256fn ensure_all_ignored_entries_exist(
257    ignored_rule_identifiers: &HashSet<String>,
258    ignored_license_keys: &HashSet<String>,
259    report: &AppliedIndexBuildPolicy,
260) -> Result<()> {
261    let applied_ignored_rules = report.ignored_rules.iter().cloned().collect::<HashSet<_>>();
262    let missing_rules = ignored_rule_identifiers
263        .difference(&applied_ignored_rules)
264        .cloned()
265        .collect::<Vec<_>>();
266
267    let applied_ignored_licenses = report
268        .ignored_licenses
269        .iter()
270        .map(|key| normalize_license_key(key))
271        .collect::<HashSet<_>>();
272    let missing_licenses = ignored_license_keys
273        .difference(&applied_ignored_licenses)
274        .cloned()
275        .collect::<Vec<_>>();
276
277    if missing_rules.is_empty() && missing_licenses.is_empty() {
278        Ok(())
279    } else {
280        let mut problems = Vec::new();
281        if !missing_rules.is_empty() {
282            problems.push(format!(
283                "ignored rule identifiers not found upstream: {}",
284                missing_rules.join(", ")
285            ));
286        }
287        if !missing_licenses.is_empty() {
288            problems.push(format!(
289                "ignored license keys not found upstream: {}",
290                missing_licenses.join(", ")
291            ));
292        }
293        Err(anyhow!(
294            "stale index-build policy entries detected; remove or update them: {}",
295            problems.join("; ")
296        ))
297    }
298}
299
300fn apply_license_overlays(
301    licenses: &mut Vec<LoadedLicense>,
302    overlays: &[LoadedLicense],
303    ignored_license_keys: &HashSet<String>,
304    report: &mut AppliedIndexBuildPolicy,
305) -> Result<()> {
306    let mut indices = build_license_index_map(licenses)?;
307    let mut seen_overlay_keys = HashSet::new();
308
309    for overlay in overlays {
310        let key = normalize_license_key(&overlay.key);
311
312        if !seen_overlay_keys.insert(key.clone()) {
313            return Err(anyhow!(
314                "bundled overlay contains duplicate license key '{}'",
315                overlay.key
316            ));
317        }
318
319        if ignored_license_keys.contains(&key) {
320            return Err(anyhow!(
321                "overlay license '{}' conflicts with ignored_licenses",
322                overlay.key
323            ));
324        }
325
326        if let Some(index) = indices.get(&key).copied() {
327            if licenses[index] == *overlay {
328                return Err(anyhow!(
329                    "overlay license '{}' is now identical to upstream; remove the local overlay file",
330                    overlay.key
331                ));
332            }
333            report.replaced_licenses.push(overlay.key.clone());
334            licenses[index] = overlay.clone();
335        } else {
336            report.added_licenses.push(overlay.key.clone());
337            licenses.push(overlay.clone());
338            indices.insert(key, licenses.len() - 1);
339        }
340    }
341
342    Ok(())
343}
344
345fn apply_rule_overlays(
346    rules: &mut Vec<LoadedRule>,
347    overlays: &[LoadedRule],
348    ignored_rule_identifiers: &HashSet<String>,
349    ignored_license_keys: &HashSet<String>,
350    licenses: &[LoadedLicense],
351    report: &mut AppliedIndexBuildPolicy,
352) -> Result<()> {
353    let mut indices = build_rule_index_map(rules)?;
354    let mut seen_overlay_identifiers = HashSet::new();
355    let available_license_keys = licenses
356        .iter()
357        .map(|license| normalize_license_key(&license.key))
358        .collect::<HashSet<_>>();
359
360    for overlay in overlays {
361        let identifier = overlay.identifier.clone();
362
363        if !seen_overlay_identifiers.insert(identifier.clone()) {
364            return Err(anyhow!(
365                "bundled overlay contains duplicate rule identifier '{}'",
366                identifier
367            ));
368        }
369
370        if ignored_rule_identifiers.contains(identifier.as_str()) {
371            return Err(anyhow!(
372                "overlay rule '{}' conflicts with ignored_rules",
373                identifier
374            ));
375        }
376
377        if rule_references_ignored_license(overlay, ignored_license_keys) {
378            return Err(anyhow!(
379                "overlay rule '{}' references an ignored license key",
380                identifier
381            ));
382        }
383
384        ensure_rule_references_known_licenses(overlay, &available_license_keys)?;
385
386        if let Some(index) = indices.get(identifier.as_str()).copied() {
387            if rules[index] == *overlay {
388                return Err(anyhow!(
389                    "overlay rule '{}' is now identical to upstream; remove the local overlay file",
390                    identifier
391                ));
392            }
393            report.replaced_rules.push(identifier.clone());
394            rules[index] = overlay.clone();
395        } else {
396            report.added_rules.push(identifier.clone());
397            rules.push(overlay.clone());
398            indices.insert(identifier, rules.len() - 1);
399        }
400    }
401
402    Ok(())
403}
404
405fn build_rule_index_map(rules: &[LoadedRule]) -> Result<HashMap<String, usize>> {
406    let mut indices = HashMap::new();
407    for (index, rule) in rules.iter().enumerate() {
408        if indices.insert(rule.identifier.clone(), index).is_some() {
409            return Err(anyhow!(
410                "cannot apply overlay because duplicate rule identifier '{}' is already present",
411                rule.identifier
412            ));
413        }
414    }
415    Ok(indices)
416}
417
418fn build_license_index_map(licenses: &[LoadedLicense]) -> Result<HashMap<String, usize>> {
419    let mut indices = HashMap::new();
420    for (index, license) in licenses.iter().enumerate() {
421        let normalized_key = normalize_license_key(&license.key);
422        if indices.insert(normalized_key, index).is_some() {
423            return Err(anyhow!(
424                "cannot apply overlay because duplicate license key '{}' is already present",
425                license.key
426            ));
427        }
428    }
429    Ok(indices)
430}
431
432fn ensure_rule_references_known_licenses(
433    rule: &LoadedRule,
434    available_license_keys: &HashSet<String>,
435) -> Result<()> {
436    if rule.rule_kind == RuleKind::None && rule.is_false_positive {
437        return Ok(());
438    }
439
440    let expression = parse_expression(&rule.license_expression).map_err(|error| {
441        anyhow!(
442            "overlay rule '{}' has an invalid license expression '{}': {}",
443            rule.identifier,
444            rule.license_expression,
445            error
446        )
447    })?;
448
449    let missing_keys = expression
450        .license_keys()
451        .into_iter()
452        .map(|key| normalize_license_key(&key))
453        .filter(|key| !available_license_keys.contains(key))
454        .collect::<Vec<_>>();
455
456    if missing_keys.is_empty() {
457        Ok(())
458    } else {
459        Err(anyhow!(
460            "overlay rule '{}' references unknown license keys: {}",
461            rule.identifier,
462            missing_keys.join(", ")
463        ))
464    }
465}
466
/// Returns `key` with surrounding whitespace removed and all letters lowercased, so
/// that license keys compare case-insensitively.
fn normalize_license_key(key: &str) -> String {
    let trimmed = key.trim();
    trimmed.to_lowercase()
}
470
471fn rule_references_ignored_license(
472    rule: &LoadedRule,
473    ignored_license_keys: &HashSet<String>,
474) -> bool {
475    if ignored_license_keys.is_empty() {
476        return false;
477    }
478
479    let normalized_expression = normalize_license_key(&rule.license_expression);
480    if ignored_license_keys.contains(&normalized_expression) {
481        return true;
482    }
483
484    if rule.rule_kind == RuleKind::None && rule.is_false_positive {
485        return false;
486    }
487
488    parse_expression(&rule.license_expression)
489        .map(|expression| {
490            expression
491                .license_keys()
492                .into_iter()
493                .map(|key| normalize_license_key(&key))
494                .any(|key| ignored_license_keys.contains(&key))
495        })
496        .unwrap_or(false)
497}
498
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a text rule fixture with the given identifier and license expression.
    fn create_loaded_rule(identifier: &str, expression: &str) -> LoadedRule {
        LoadedRule {
            identifier: identifier.to_owned(),
            license_expression: expression.to_owned(),
            text: format!("{identifier} text"),
            rule_kind: RuleKind::Text,
            is_false_positive: false,
            is_required_phrase: false,
            skip_for_required_phrase_generation: false,
            relevance: Some(100),
            minimum_coverage: None,
            has_stored_minimum_coverage: false,
            is_continuous: false,
            referenced_filenames: None,
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: None,
            is_deprecated: false,
            replaced_by: Vec::new(),
        }
    }

    /// Builds a license fixture whose names and text are derived from `key`.
    fn create_loaded_license(key: &str) -> LoadedLicense {
        let upper_key = key.to_uppercase();

        LoadedLicense {
            key: key.to_owned(),
            short_name: Some(upper_key.clone()),
            name: format!("{key} license"),
            language: Some("en".to_string()),
            spdx_license_key: Some(upper_key),
            other_spdx_license_keys: Vec::new(),
            category: Some("Permissive".to_string()),
            owner: None,
            homepage_url: None,
            text: format!("{key} text"),
            reference_urls: Vec::new(),
            osi_license_key: None,
            text_urls: Vec::new(),
            osi_url: None,
            faq_url: None,
            other_urls: Vec::new(),
            notes: None,
            is_deprecated: false,
            is_exception: false,
            is_unknown: false,
            is_generic: false,
            replaced_by: Vec::new(),
            minimum_coverage: None,
            standard_notice: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            ignorable_urls: None,
            ignorable_emails: None,
        }
    }

    #[test]
    fn test_apply_index_build_policy_filters_direct_and_dependent_entries() {
        let policy = IndexBuildPolicy {
            ignored_rules: vec!["direct.RULE".to_string()],
            ignored_licenses: vec!["apache-2.0".to_string()],
        };
        let input_rules = vec![
            create_loaded_rule("keep.RULE", "mit"),
            create_loaded_rule("direct.RULE", "mit"),
            create_loaded_rule("dependent.RULE", "mit OR apache-2.0"),
        ];
        let input_licenses = vec![
            create_loaded_license("mit"),
            create_loaded_license("apache-2.0"),
        ];

        let (kept_rules, kept_licenses, report) =
            apply_index_build_policy(input_rules, input_licenses, &policy, &[], &[])
                .expect("policy application");

        let kept_rule_identifiers: Vec<&str> = kept_rules
            .iter()
            .map(|rule| rule.identifier.as_str())
            .collect();
        let kept_license_keys: Vec<&str> = kept_licenses
            .iter()
            .map(|license| license.key.as_str())
            .collect();

        assert_eq!(kept_rule_identifiers, vec!["keep.RULE"]);
        assert_eq!(kept_license_keys, vec!["mit"]);
        assert_eq!(report.ignored_rules, vec!["direct.RULE".to_string()]);
        assert_eq!(report.ignored_licenses, vec!["apache-2.0".to_string()]);
        assert_eq!(
            report.ignored_rules_due_to_licenses,
            vec!["dependent.RULE".to_string()]
        );
    }

    #[test]
    fn test_apply_index_build_policy_fails_for_stale_ignored_entries() {
        let policy = IndexBuildPolicy {
            ignored_rules: vec!["missing.RULE".to_string()],
            ignored_licenses: Vec::new(),
        };
        let input_rules = vec![create_loaded_rule("keep.RULE", "mit")];
        let input_licenses = vec![create_loaded_license("mit")];

        let error = apply_index_build_policy(input_rules, input_licenses, &policy, &[], &[])
            .expect_err("missing ignored rule should fail");

        let message = error.to_string();
        assert!(message.contains("ignored rule identifiers not found upstream: missing.RULE"));
    }

    #[test]
    fn test_apply_index_build_policy_infers_add_from_new_overlay_entries() {
        let policy = IndexBuildPolicy::default();
        let overlay_rules = vec![create_loaded_rule("custom-rule.RULE", "mit")];
        let overlay_licenses = vec![create_loaded_license("custom-license")];
        let input_rules = vec![create_loaded_rule("keep.RULE", "mit")];
        let input_licenses = vec![create_loaded_license("mit")];

        let (merged_rules, merged_licenses, report) = apply_index_build_policy(
            input_rules,
            input_licenses,
            &policy,
            &overlay_rules,
            &overlay_licenses,
        )
        .expect("policy application");

        let rule_was_added = merged_rules
            .iter()
            .any(|rule| rule.identifier == "custom-rule.RULE");
        let license_was_added = merged_licenses
            .iter()
            .any(|license| license.key == "custom-license");

        assert!(rule_was_added);
        assert!(license_was_added);
        assert_eq!(report.added_rules, vec!["custom-rule.RULE".to_string()]);
        assert_eq!(report.added_licenses, vec!["custom-license".to_string()]);
    }

    #[test]
    fn test_apply_index_build_policy_infers_replace_from_colliding_overlay_entries() {
        let policy = IndexBuildPolicy::default();

        let mut replacement_rule = create_loaded_rule("replace.RULE", "mit");
        replacement_rule.text = "updated rule text".to_string();
        let overlay_rules = vec![replacement_rule];

        let mut replacement_license = create_loaded_license("mit");
        replacement_license.name = "MIT Updated".to_string();
        replacement_license.text = "updated license text".to_string();
        let overlay_licenses = vec![replacement_license];

        let input_rules = vec![create_loaded_rule("replace.RULE", "mit")];
        let input_licenses = vec![create_loaded_license("mit")];

        let (merged_rules, merged_licenses, report) = apply_index_build_policy(
            input_rules,
            input_licenses,
            &policy,
            &overlay_rules,
            &overlay_licenses,
        )
        .expect("policy application");

        assert_eq!(merged_rules[0].text, "updated rule text");
        assert_eq!(merged_licenses[0].name, "MIT Updated");
        assert_eq!(report.replaced_rules, vec!["replace.RULE".to_string()]);
        assert_eq!(report.replaced_licenses, vec!["mit".to_string()]);
    }

    #[test]
    fn test_apply_index_build_policy_rejects_redundant_rule_overlay() {
        let policy = IndexBuildPolicy::default();
        let upstream_rule = create_loaded_rule("replace.RULE", "mit");
        let input_rules = vec![upstream_rule.clone()];
        let input_licenses = vec![create_loaded_license("mit")];

        let error =
            apply_index_build_policy(input_rules, input_licenses, &policy, &[upstream_rule], &[])
                .expect_err("redundant overlay should fail");

        let message = error.to_string();
        assert!(message.contains("overlay rule 'replace.RULE' is now identical to upstream"));
    }

    #[test]
    fn test_apply_index_build_policy_rejects_redundant_license_overlay() {
        let policy = IndexBuildPolicy::default();
        let upstream_license = create_loaded_license("mit");
        let input_rules = vec![create_loaded_rule("keep.RULE", "mit")];
        let input_licenses = vec![upstream_license.clone()];

        let error = apply_index_build_policy(
            input_rules,
            input_licenses,
            &policy,
            &[],
            &[upstream_license],
        )
        .expect_err("redundant overlay should fail");

        let message = error.to_string();
        assert!(message.contains("overlay license 'mit' is now identical to upstream"));
    }
}
719}