Skip to main content

provenant/license_detection/models/
loaded_rule.rs

1//! Loader-stage rule type.
2//!
3//! This module defines `LoadedRule`, which represents a parsed and normalized
4//! rule file (.RULE or .LICENSE) before it is converted to a runtime `Rule`.
5//!
6//! Loader-stage responsibilities include:
7//! - Text trimming and normalization
8//! - Fallback/default handling derived only from one file
9//! - Empty-vector to `None` cleanup
10//! - File-local validation
11//! - False-positive handling for missing `license_expression`
12
13use serde::{Deserialize, Serialize};
14
15use super::RuleKind;
16
/// Loader-stage representation of a rule.
///
/// This struct contains parsed and normalized data from a .RULE or .LICENSE file.
/// It is serialized at build time and deserialized at runtime, then converted
/// to a runtime `Rule` during the build stage.
///
/// Fields marked `#[serde(default)]` may be absent from serialized data; every
/// other field is required when deserializing.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct LoadedRule {
    /// Unique identifier derived from the filename (e.g., "mit.LICENSE").
    pub identifier: String,

    /// License expression string using SPDX syntax and ScanCode license keys.
    /// For false-positive rules with no source expression, this is set to "unknown".
    pub license_expression: String,

    /// Pattern text to match, trimmed and normalized.
    pub text: String,

    /// Classification of this rule, derived from source rule-kind booleans.
    pub rule_kind: RuleKind,

    /// True if exact matches to this rule are false positives.
    pub is_false_positive: bool,

    /// True if this rule text is a required phrase.
    pub is_required_phrase: bool,

    /// NOTE(review): presumably excludes this rule from required-phrase
    /// generation when true — confirm against the loader/build stage.
    /// Falls back to `false` when the field is missing from serialized data.
    #[serde(default)]
    pub skip_for_required_phrase_generation: bool,

    /// Relevance score 0-100 (100 is most relevant).
    /// Stored as Option to distinguish between explicit 100 and default 100.
    pub relevance: Option<u8>,

    /// Minimum match coverage percentage (0-100) if specified.
    pub minimum_coverage: Option<u8>,

    /// True if minimum_coverage was explicitly stored in source frontmatter.
    pub has_stored_minimum_coverage: bool,

    /// Tokens must appear in order if true.
    pub is_continuous: bool,

    /// Filenames where this rule should be considered.
    pub referenced_filenames: Option<Vec<String>>,

    /// URLs that should be ignored when found in this rule text.
    pub ignorable_urls: Option<Vec<String>>,

    /// Emails that should be ignored when found in this rule text.
    pub ignorable_emails: Option<Vec<String>>,

    /// Copyrights that should be ignored when found in this rule text.
    pub ignorable_copyrights: Option<Vec<String>>,

    /// Holder names that should be ignored when found in this rule text.
    pub ignorable_holders: Option<Vec<String>>,

    /// Author names that should be ignored when found in this rule text.
    pub ignorable_authors: Option<Vec<String>>,

    /// Programming language for the rule if specified.
    pub language: Option<String>,

    /// Free text notes.
    pub notes: Option<String>,

    /// Whether this rule is deprecated.
    pub is_deprecated: bool,

    /// NOTE(review): presumably the identifiers or license keys that supersede
    /// this rule once it is deprecated — confirm against the loader.
    /// Falls back to an empty list when the field is missing from serialized data.
    #[serde(default)]
    pub replaced_by: Vec<String>,
}
89
90/// Loader-stage normalization functions for rule data.
91impl LoadedRule {
92    /// Derive identifier from filename.
93    ///
94    /// Returns the filename as-is, which serves as the unique identifier.
95    pub fn derive_identifier(filename: &str) -> String {
96        filename.to_string()
97    }
98
99    /// Derive rule kind from source rule-kind booleans.
100    ///
101    /// Returns an error if multiple flags are set.
102    pub fn derive_rule_kind(
103        is_license_text: bool,
104        is_license_notice: bool,
105        is_license_reference: bool,
106        is_license_tag: bool,
107        is_license_intro: bool,
108        is_license_clue: bool,
109    ) -> Result<RuleKind, RuleKindError> {
110        RuleKind::from_rule_flags(
111            is_license_text,
112            is_license_notice,
113            is_license_reference,
114            is_license_tag,
115            is_license_intro,
116            is_license_clue,
117        )
118        .map_err(|_| RuleKindError::MultipleFlagsSet)
119    }
120
121    /// Normalize license expression.
122    ///
123    /// - Strips trivial outer parentheses
124    /// - For false-positive rules with no expression, returns "unknown"
125    /// - For non-false-positive rules with no expression, returns an error
126    pub fn normalize_license_expression(
127        expression: Option<&str>,
128        is_false_positive: bool,
129    ) -> Result<String, LicenseExpressionError> {
130        match expression {
131            Some(expr) if !expr.trim().is_empty() => {
132                Ok(normalize_trivial_outer_parens(expr.trim()))
133            }
134            Some(_) => {
135                if is_false_positive {
136                    Ok("unknown".to_string())
137                } else {
138                    Err(LicenseExpressionError::EmptyExpression)
139                }
140            }
141            None => {
142                if is_false_positive {
143                    Ok("unknown".to_string())
144                } else {
145                    Err(LicenseExpressionError::MissingExpression)
146                }
147            }
148        }
149    }
150
151    /// Normalize optional string field.
152    ///
153    /// Returns `None` for empty strings, `Some(trimmed)` otherwise.
154    pub fn normalize_optional_string(s: Option<&str>) -> Option<String> {
155        s.map(|s| s.trim().to_string()).filter(|s| !s.is_empty())
156    }
157
158    /// Normalize optional string list.
159    ///
160    /// Returns `None` for empty lists, `Some(list)` with trimmed strings otherwise.
161    pub fn normalize_optional_list(list: Option<&[String]>) -> Option<Vec<String>> {
162        list.map(|l| {
163            l.iter()
164                .map(|s| s.trim().to_string())
165                .filter(|s| !s.is_empty())
166                .collect::<Vec<_>>()
167        })
168        .filter(|l: &Vec<String>| !l.is_empty())
169    }
170
171    /// Validate rule-kind flags against false_positive flag.
172    ///
173    /// - False-positive rules must NOT have any is_license_* flags set
174    /// - Non-false-positive rules MUST have exactly one is_license_* flag set
175    pub fn validate_rule_kind_flags(
176        rule_kind: RuleKind,
177        is_false_positive: bool,
178    ) -> Result<(), RuleKindError> {
179        if is_false_positive && rule_kind != RuleKind::None {
180            return Err(RuleKindError::FalsePositiveWithFlags);
181        }
182        if !is_false_positive && rule_kind == RuleKind::None {
183            return Err(RuleKindError::NoFlagsSet);
184        }
185        Ok(())
186    }
187}
188
/// Reasons a rule's `is_license_*` flags can be rejected.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RuleKindError {
    /// More than one `is_license_*` flag is set on the rule.
    MultipleFlagsSet,
    /// A non-false-positive rule has no `is_license_*` flag set.
    NoFlagsSet,
    /// A false-positive rule has an `is_license_*` flag set.
    FalsePositiveWithFlags,
}

impl std::fmt::Display for RuleKindError {
    /// Writes the fixed, human-readable message for the variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::MultipleFlagsSet => "rule has multiple is_license_* flags set",
            Self::NoFlagsSet => "non-false-positive rule has no is_license_* flags set",
            Self::FalsePositiveWithFlags => {
                "false-positive rule cannot have is_license_* flags set"
            }
        };
        f.write_str(message)
    }
}

impl std::error::Error for RuleKindError {}
210
/// Reasons a rule's `license_expression` can be rejected.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LicenseExpressionError {
    /// A non-false-positive rule has no `license_expression` at all.
    MissingExpression,
    /// A non-false-positive rule has an empty `license_expression`.
    EmptyExpression,
}

impl std::fmt::Display for LicenseExpressionError {
    /// Writes the fixed, human-readable message for the variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::MissingExpression => {
                "license_expression is required for non-false-positive rules"
            }
            Self::EmptyExpression => {
                "license_expression cannot be empty for non-false-positive rules"
            }
        };
        f.write_str(message)
    }
}

impl std::error::Error for LicenseExpressionError {}
234
/// Report whether `s` is wrapped in one redundant pair of parentheses.
///
/// After trimming, the text must start with `(` and end with `)`, and the
/// opening parenthesis must be closed only by the very last character. So
/// `"(a OR b)"` qualifies while `"(a) OR (b)"` does not.
fn has_trivial_outer_parens(s: &str) -> bool {
    let candidate = s.trim();
    if !(candidate.starts_with('(') && candidate.ends_with(')')) {
        return false;
    }

    // The final character is a one-byte ')', so its byte offset is len - 1.
    let last_offset = candidate.len() - 1;
    let mut open = 0;
    for (offset, ch) in candidate.char_indices() {
        match ch {
            '(' => open += 1,
            ')' => {
                open -= 1;
                // The first group closed before the end: the outer pair does
                // not span the whole expression.
                if open == 0 && offset < last_offset {
                    return false;
                }
            }
            _ => {}
        }
    }
    open == 0
}
258
259/// Normalize license expression by removing trivial outer parentheses.
260///
261/// This recursively strips outer parens that wrap the entire expression.
262fn normalize_trivial_outer_parens(expr: &str) -> String {
263    let trimmed = expr.trim();
264    if has_trivial_outer_parens(trimmed) {
265        let inner = &trimmed[1..trimmed.len() - 1];
266        normalize_trivial_outer_parens(inner)
267    } else {
268        trimmed.to_string()
269    }
270}
271
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an owned string list from literals.
    fn strings(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    #[test]
    fn test_derive_identifier() {
        for filename in ["mit.LICENSE", "gpl-2.0_12.RULE"] {
            assert_eq!(LoadedRule::derive_identifier(filename), filename);
        }
    }

    #[test]
    fn test_derive_rule_kind_single_flag() {
        // One row per flag position, paired with the kind it must produce.
        let cases = [
            ([true, false, false, false, false, false], RuleKind::Text),
            ([false, true, false, false, false, false], RuleKind::Notice),
            ([false, false, true, false, false, false], RuleKind::Reference),
            ([false, false, false, true, false, false], RuleKind::Tag),
            ([false, false, false, false, true, false], RuleKind::Intro),
            ([false, false, false, false, false, true], RuleKind::Clue),
        ];

        for ([text, notice, reference, tag, intro, clue], expected) in cases {
            assert_eq!(
                LoadedRule::derive_rule_kind(text, notice, reference, tag, intro, clue),
                Ok(expected)
            );
        }
    }

    #[test]
    fn test_derive_rule_kind_none() {
        let derived = LoadedRule::derive_rule_kind(false, false, false, false, false, false);
        assert_eq!(derived, Ok(RuleKind::None));
    }

    #[test]
    fn test_derive_rule_kind_multiple_flags() {
        let derived = LoadedRule::derive_rule_kind(true, true, false, false, false, false);
        assert_eq!(derived, Err(RuleKindError::MultipleFlagsSet));
    }

    #[test]
    fn test_normalize_license_expression_with_value() {
        let normalized = LoadedRule::normalize_license_expression(Some("mit"), false);
        assert_eq!(normalized, Ok("mit".to_string()));
    }

    #[test]
    fn test_normalize_license_expression_false_positive_fallback() {
        for input in [None, Some(""), Some("   ")] {
            assert_eq!(
                LoadedRule::normalize_license_expression(input, true),
                Ok("unknown".to_string())
            );
        }
    }

    #[test]
    fn test_normalize_license_expression_missing_error() {
        let outcome = LoadedRule::normalize_license_expression(None, false);
        assert_eq!(outcome, Err(LicenseExpressionError::MissingExpression));
    }

    #[test]
    fn test_normalize_license_expression_empty_error() {
        let outcome = LoadedRule::normalize_license_expression(Some(""), false);
        assert_eq!(outcome, Err(LicenseExpressionError::EmptyExpression));
    }

    #[test]
    fn test_normalize_trivial_outer_parens() {
        let cases = [
            ("mit", "mit"),
            ("(mit)", "mit"),
            ("((mit))", "mit"),
            ("(mit OR apache-2.0)", "mit OR apache-2.0"),
            ("(mit) OR (apache-2.0)", "(mit) OR (apache-2.0)"),
        ];

        for (input, expected) in cases {
            assert_eq!(normalize_trivial_outer_parens(input), expected);
        }
    }

    #[test]
    fn test_normalize_optional_string() {
        let cases = [
            (None, None),
            (Some(""), None),
            (Some("   "), None),
            (Some("hello"), Some("hello")),
            (Some("  hello  "), Some("hello")),
        ];

        for (input, expected) in cases {
            assert_eq!(
                LoadedRule::normalize_optional_string(input),
                expected.map(str::to_string)
            );
        }
    }

    #[test]
    fn test_normalize_optional_list() {
        assert_eq!(LoadedRule::normalize_optional_list(None), None);
        assert_eq!(LoadedRule::normalize_optional_list(Some(&strings(&[]))), None);
        assert_eq!(
            LoadedRule::normalize_optional_list(Some(&strings(&["a", "b"]))),
            Some(strings(&["a", "b"]))
        );
        assert_eq!(
            LoadedRule::normalize_optional_list(Some(&strings(&["  a  ", "  b  "]))),
            Some(strings(&["a", "b"]))
        );
        assert_eq!(
            LoadedRule::normalize_optional_list(Some(&strings(&["", "  "]))),
            None
        );
    }

    #[test]
    fn test_validate_rule_kind_flags() {
        // Regular rule: needs a kind.
        assert_eq!(
            LoadedRule::validate_rule_kind_flags(RuleKind::Text, false),
            Ok(())
        );
        assert_eq!(
            LoadedRule::validate_rule_kind_flags(RuleKind::None, false),
            Err(RuleKindError::NoFlagsSet)
        );

        // False-positive rule: must not have a kind.
        assert_eq!(
            LoadedRule::validate_rule_kind_flags(RuleKind::None, true),
            Ok(())
        );
        assert_eq!(
            LoadedRule::validate_rule_kind_flags(RuleKind::Text, true),
            Err(RuleKindError::FalsePositiveWithFlags)
        );
    }

    #[test]
    fn test_serde_roundtrip() {
        let original = LoadedRule {
            identifier: "mit.LICENSE".to_string(),
            license_expression: "mit".to_string(),
            text: "MIT License".to_string(),
            rule_kind: RuleKind::Text,
            is_false_positive: false,
            is_required_phrase: false,
            skip_for_required_phrase_generation: false,
            relevance: Some(100),
            minimum_coverage: Some(90),
            has_stored_minimum_coverage: true,
            is_continuous: false,
            referenced_filenames: Some(vec!["MIT.txt".to_string()]),
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: Some("Test note".to_string()),
            is_deprecated: false,
            replaced_by: vec![],
        };

        // Serialize to JSON and back; the value must survive unchanged.
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: LoadedRule = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }
}