Skip to main content

provenant/license_detection/models/
loaded_rule.rs

1//! Loader-stage rule type.
2//!
3//! This module defines `LoadedRule`, which represents a parsed and normalized
4//! rule file (.RULE or .LICENSE) before it is converted to a runtime `Rule`.
5//!
6//! Loader-stage responsibilities include:
7//! - Text trimming and normalization
8//! - Fallback/default handling derived only from one file
9//! - Empty-vector to `None` cleanup
10//! - File-local validation
11//! - False-positive handling for missing `license_expression`
12
13use serde::{Deserialize, Serialize};
14
15use super::RuleKind;
16
17/// Loader-stage representation of a rule.
18///
19/// This struct contains parsed and normalized data from a .RULE or .LICENSE file.
20/// It is serialized at build time and deserialized at runtime, then converted
21/// to a runtime `Rule` during the build stage.
22#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
23pub struct LoadedRule {
24    /// Unique identifier derived from the filename (e.g., "mit.LICENSE").
25    pub identifier: String,
26
27    /// License expression string using SPDX syntax and ScanCode license keys.
28    /// For false-positive rules with no source expression, this is set to "unknown".
29    pub license_expression: String,
30
31    /// Pattern text to match, trimmed and normalized.
32    pub text: String,
33
34    /// Classification of this rule, derived from source rule-kind booleans.
35    pub rule_kind: RuleKind,
36
37    /// True if exact matches to this rule are false positives.
38    pub is_false_positive: bool,
39
40    /// True if this rule text is a required phrase.
41    pub is_required_phrase: bool,
42
43    /// Relevance score 0-100 (100 is most relevant).
44    /// Stored as Option to distinguish between explicit 100 and default 100.
45    pub relevance: Option<u8>,
46
47    /// Minimum match coverage percentage (0-100) if specified.
48    pub minimum_coverage: Option<u8>,
49
50    /// True if minimum_coverage was explicitly stored in source frontmatter.
51    pub has_stored_minimum_coverage: bool,
52
53    /// Tokens must appear in order if true.
54    pub is_continuous: bool,
55
56    /// Filenames where this rule should be considered.
57    pub referenced_filenames: Option<Vec<String>>,
58
59    /// URLs that should be ignored when found in this rule text.
60    pub ignorable_urls: Option<Vec<String>>,
61
62    /// Emails that should be ignored when found in this rule text.
63    pub ignorable_emails: Option<Vec<String>>,
64
65    /// Copyrights that should be ignored when found in this rule text.
66    pub ignorable_copyrights: Option<Vec<String>>,
67
68    /// Holder names that should be ignored when found in this rule text.
69    pub ignorable_holders: Option<Vec<String>>,
70
71    /// Author names that should be ignored when found in this rule text.
72    pub ignorable_authors: Option<Vec<String>>,
73
74    /// Programming language for the rule if specified.
75    pub language: Option<String>,
76
77    /// Free text notes.
78    pub notes: Option<String>,
79
80    /// Whether this rule is deprecated.
81    pub is_deprecated: bool,
82}
83
84/// Loader-stage normalization functions for rule data.
85impl LoadedRule {
86    /// Derive identifier from filename.
87    ///
88    /// Returns the filename as-is, which serves as the unique identifier.
89    pub fn derive_identifier(filename: &str) -> String {
90        filename.to_string()
91    }
92
93    /// Derive rule kind from source rule-kind booleans.
94    ///
95    /// Returns an error if multiple flags are set.
96    pub fn derive_rule_kind(
97        is_license_text: bool,
98        is_license_notice: bool,
99        is_license_reference: bool,
100        is_license_tag: bool,
101        is_license_intro: bool,
102        is_license_clue: bool,
103    ) -> Result<RuleKind, RuleKindError> {
104        RuleKind::from_rule_flags(
105            is_license_text,
106            is_license_notice,
107            is_license_reference,
108            is_license_tag,
109            is_license_intro,
110            is_license_clue,
111        )
112        .map_err(|_| RuleKindError::MultipleFlagsSet)
113    }
114
115    /// Normalize license expression.
116    ///
117    /// - Strips trivial outer parentheses
118    /// - For false-positive rules with no expression, returns "unknown"
119    /// - For non-false-positive rules with no expression, returns an error
120    pub fn normalize_license_expression(
121        expression: Option<&str>,
122        is_false_positive: bool,
123    ) -> Result<String, LicenseExpressionError> {
124        match expression {
125            Some(expr) if !expr.trim().is_empty() => {
126                Ok(normalize_trivial_outer_parens(expr.trim()))
127            }
128            Some(_) => {
129                if is_false_positive {
130                    Ok("unknown".to_string())
131                } else {
132                    Err(LicenseExpressionError::EmptyExpression)
133                }
134            }
135            None => {
136                if is_false_positive {
137                    Ok("unknown".to_string())
138                } else {
139                    Err(LicenseExpressionError::MissingExpression)
140                }
141            }
142        }
143    }
144
145    /// Normalize optional string field.
146    ///
147    /// Returns `None` for empty strings, `Some(trimmed)` otherwise.
148    pub fn normalize_optional_string(s: Option<&str>) -> Option<String> {
149        s.map(|s| s.trim().to_string()).filter(|s| !s.is_empty())
150    }
151
152    /// Normalize optional string list.
153    ///
154    /// Returns `None` for empty lists, `Some(list)` with trimmed strings otherwise.
155    pub fn normalize_optional_list(list: Option<&[String]>) -> Option<Vec<String>> {
156        list.map(|l| {
157            l.iter()
158                .map(|s| s.trim().to_string())
159                .filter(|s| !s.is_empty())
160                .collect::<Vec<_>>()
161        })
162        .filter(|l: &Vec<String>| !l.is_empty())
163    }
164
165    /// Validate rule-kind flags against false_positive flag.
166    ///
167    /// - False-positive rules must NOT have any is_license_* flags set
168    /// - Non-false-positive rules MUST have exactly one is_license_* flag set
169    pub fn validate_rule_kind_flags(
170        rule_kind: RuleKind,
171        is_false_positive: bool,
172    ) -> Result<(), RuleKindError> {
173        if is_false_positive && rule_kind != RuleKind::None {
174            return Err(RuleKindError::FalsePositiveWithFlags);
175        }
176        if !is_false_positive && rule_kind == RuleKind::None {
177            return Err(RuleKindError::NoFlagsSet);
178        }
179        Ok(())
180    }
181}
182
/// Reasons a rule's `is_license_*` flags can be rejected.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RuleKindError {
    /// More than one `is_license_*` flag is set.
    MultipleFlagsSet,
    /// A non-false-positive rule has no `is_license_*` flag set.
    NoFlagsSet,
    /// A false-positive rule has an `is_license_*` flag set.
    FalsePositiveWithFlags,
}

impl std::fmt::Display for RuleKindError {
    /// Write the human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::MultipleFlagsSet => "rule has multiple is_license_* flags set",
            Self::NoFlagsSet => "non-false-positive rule has no is_license_* flags set",
            Self::FalsePositiveWithFlags => {
                "false-positive rule cannot have is_license_* flags set"
            }
        };
        f.write_str(message)
    }
}

impl std::error::Error for RuleKindError {}
204
/// Reasons a rule's `license_expression` can be rejected.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum LicenseExpressionError {
    /// No expression was provided at all.
    MissingExpression,
    /// An expression was provided but it is empty.
    EmptyExpression,
}

impl std::fmt::Display for LicenseExpressionError {
    /// Write the human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::MissingExpression => {
                "license_expression is required for non-false-positive rules"
            }
            Self::EmptyExpression => {
                "license_expression cannot be empty for non-false-positive rules"
            }
        };
        f.write_str(message)
    }
}

impl std::error::Error for LicenseExpressionError {}
228
/// Report whether `s` is wrapped in one pair of parentheses that encloses the
/// whole expression.
///
/// After trimming, the string must start with `(` and end with `)`, and the
/// opening parenthesis must be closed by that final `)`, not by an earlier
/// one. `"(a OR b)"` qualifies; `"(a) OR (b)"` does not, because the first
/// parenthesis closes before the end. Unbalanced input yields `false`.
fn has_trivial_outer_parens(s: &str) -> bool {
    let candidate = s.trim();
    if !(candidate.starts_with('(') && candidate.ends_with(')')) {
        return false;
    }

    // Byte offset of the final ')'. It is a one-byte character, so comparing
    // byte offsets against it is equivalent to comparing character positions.
    let last = candidate.len() - 1;
    let mut open = 0;
    for (offset, ch) in candidate.char_indices() {
        match ch {
            '(' => open += 1,
            ')' => {
                open -= 1;
                // The outermost group closed before the end of the string, so
                // the leading '(' does not wrap everything.
                if open == 0 && offset < last {
                    return false;
                }
            }
            _ => {}
        }
    }
    open == 0
}
252
253/// Normalize license expression by removing trivial outer parentheses.
254///
255/// This recursively strips outer parens that wrap the entire expression.
256fn normalize_trivial_outer_parens(expr: &str) -> String {
257    let trimmed = expr.trim();
258    if has_trivial_outer_parens(trimmed) {
259        let inner = &trimmed[1..trimmed.len() - 1];
260        normalize_trivial_outer_parens(inner)
261    } else {
262        trimmed.to_string()
263    }
264}
265
266#[cfg(test)]
267mod tests {
268    use super::*;
269
270    #[test]
271    fn test_derive_identifier() {
272        assert_eq!(LoadedRule::derive_identifier("mit.LICENSE"), "mit.LICENSE");
273        assert_eq!(
274            LoadedRule::derive_identifier("gpl-2.0_12.RULE"),
275            "gpl-2.0_12.RULE"
276        );
277    }
278
279    #[test]
280    fn test_derive_rule_kind_single_flag() {
281        assert_eq!(
282            LoadedRule::derive_rule_kind(true, false, false, false, false, false),
283            Ok(RuleKind::Text)
284        );
285        assert_eq!(
286            LoadedRule::derive_rule_kind(false, true, false, false, false, false),
287            Ok(RuleKind::Notice)
288        );
289        assert_eq!(
290            LoadedRule::derive_rule_kind(false, false, true, false, false, false),
291            Ok(RuleKind::Reference)
292        );
293        assert_eq!(
294            LoadedRule::derive_rule_kind(false, false, false, true, false, false),
295            Ok(RuleKind::Tag)
296        );
297        assert_eq!(
298            LoadedRule::derive_rule_kind(false, false, false, false, true, false),
299            Ok(RuleKind::Intro)
300        );
301        assert_eq!(
302            LoadedRule::derive_rule_kind(false, false, false, false, false, true),
303            Ok(RuleKind::Clue)
304        );
305    }
306
307    #[test]
308    fn test_derive_rule_kind_none() {
309        assert_eq!(
310            LoadedRule::derive_rule_kind(false, false, false, false, false, false),
311            Ok(RuleKind::None)
312        );
313    }
314
315    #[test]
316    fn test_derive_rule_kind_multiple_flags() {
317        assert_eq!(
318            LoadedRule::derive_rule_kind(true, true, false, false, false, false),
319            Err(RuleKindError::MultipleFlagsSet)
320        );
321    }
322
323    #[test]
324    fn test_normalize_license_expression_with_value() {
325        assert_eq!(
326            LoadedRule::normalize_license_expression(Some("mit"), false),
327            Ok("mit".to_string())
328        );
329    }
330
331    #[test]
332    fn test_normalize_license_expression_false_positive_fallback() {
333        assert_eq!(
334            LoadedRule::normalize_license_expression(None, true),
335            Ok("unknown".to_string())
336        );
337        assert_eq!(
338            LoadedRule::normalize_license_expression(Some(""), true),
339            Ok("unknown".to_string())
340        );
341        assert_eq!(
342            LoadedRule::normalize_license_expression(Some("   "), true),
343            Ok("unknown".to_string())
344        );
345    }
346
347    #[test]
348    fn test_normalize_license_expression_missing_error() {
349        assert_eq!(
350            LoadedRule::normalize_license_expression(None, false),
351            Err(LicenseExpressionError::MissingExpression)
352        );
353    }
354
355    #[test]
356    fn test_normalize_license_expression_empty_error() {
357        assert_eq!(
358            LoadedRule::normalize_license_expression(Some(""), false),
359            Err(LicenseExpressionError::EmptyExpression)
360        );
361    }
362
363    #[test]
364    fn test_normalize_trivial_outer_parens() {
365        assert_eq!(normalize_trivial_outer_parens("mit"), "mit");
366        assert_eq!(normalize_trivial_outer_parens("(mit)"), "mit");
367        assert_eq!(normalize_trivial_outer_parens("((mit))"), "mit");
368        assert_eq!(
369            normalize_trivial_outer_parens("(mit OR apache-2.0)"),
370            "mit OR apache-2.0"
371        );
372        assert_eq!(
373            normalize_trivial_outer_parens("(mit) OR (apache-2.0)"),
374            "(mit) OR (apache-2.0)"
375        );
376    }
377
378    #[test]
379    fn test_normalize_optional_string() {
380        assert_eq!(LoadedRule::normalize_optional_string(None), None);
381        assert_eq!(LoadedRule::normalize_optional_string(Some("")), None);
382        assert_eq!(LoadedRule::normalize_optional_string(Some("   ")), None);
383        assert_eq!(
384            LoadedRule::normalize_optional_string(Some("hello")),
385            Some("hello".to_string())
386        );
387        assert_eq!(
388            LoadedRule::normalize_optional_string(Some("  hello  ")),
389            Some("hello".to_string())
390        );
391    }
392
393    #[test]
394    fn test_normalize_optional_list() {
395        assert_eq!(LoadedRule::normalize_optional_list(None), None);
396        assert_eq!(LoadedRule::normalize_optional_list(Some(&[])), None);
397        assert_eq!(
398            LoadedRule::normalize_optional_list(Some(&["a".to_string(), "b".to_string()])),
399            Some(vec!["a".to_string(), "b".to_string()])
400        );
401        assert_eq!(
402            LoadedRule::normalize_optional_list(Some(&["  a  ".to_string(), "  b  ".to_string()])),
403            Some(vec!["a".to_string(), "b".to_string()])
404        );
405        assert_eq!(
406            LoadedRule::normalize_optional_list(Some(&["".to_string(), "  ".to_string()])),
407            None
408        );
409    }
410
411    #[test]
412    fn test_validate_rule_kind_flags() {
413        assert!(LoadedRule::validate_rule_kind_flags(RuleKind::Text, false).is_ok());
414        assert_eq!(
415            LoadedRule::validate_rule_kind_flags(RuleKind::None, false),
416            Err(RuleKindError::NoFlagsSet)
417        );
418        assert!(LoadedRule::validate_rule_kind_flags(RuleKind::None, true).is_ok());
419        assert_eq!(
420            LoadedRule::validate_rule_kind_flags(RuleKind::Text, true),
421            Err(RuleKindError::FalsePositiveWithFlags)
422        );
423    }
424
425    #[test]
426    fn test_serde_roundtrip() {
427        let rule = LoadedRule {
428            identifier: "mit.LICENSE".to_string(),
429            license_expression: "mit".to_string(),
430            text: "MIT License".to_string(),
431            rule_kind: RuleKind::Text,
432            is_false_positive: false,
433            is_required_phrase: false,
434            relevance: Some(100),
435            minimum_coverage: Some(90),
436            has_stored_minimum_coverage: true,
437            is_continuous: false,
438            referenced_filenames: Some(vec!["MIT.txt".to_string()]),
439            ignorable_urls: None,
440            ignorable_emails: None,
441            ignorable_copyrights: None,
442            ignorable_holders: None,
443            ignorable_authors: None,
444            language: None,
445            notes: Some("Test note".to_string()),
446            is_deprecated: false,
447        };
448
449        let json = serde_json::to_string(&rule).unwrap();
450        let deserialized: LoadedRule = serde_json::from_str(&json).unwrap();
451        assert_eq!(rule, deserialized);
452    }
453}