Skip to main content

provenant/license_detection/models/
loaded_rule.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Loader-stage rule type.
5//!
6//! This module defines `LoadedRule`, which represents a parsed and normalized
7//! rule file (.RULE or .LICENSE) before it is converted to a runtime `Rule`.
8//!
9//! Loader-stage responsibilities include:
10//! - Text trimming and normalization
11//! - Fallback/default handling derived only from one file
12//! - Empty-vector to `None` cleanup
13//! - File-local validation
14//! - False-positive handling for missing `license_expression`
15
16use serde::{Deserialize, Serialize};
17
18use super::RuleKind;
19
/// Loader-stage representation of a rule.
///
/// This struct contains parsed and normalized data from a .RULE or .LICENSE file.
/// It is serialized at build time and deserialized at runtime, then converted
/// to a runtime `Rule` during the build stage.
///
/// All fields are plain owned data so the whole value can be round-tripped
/// through serde without borrowing from the source file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct LoadedRule {
    /// Unique identifier derived from the filename (e.g., "mit.LICENSE").
    pub identifier: String,

    /// License expression string using SPDX syntax and ScanCode license keys.
    /// For false-positive rules with no source expression, this is set to "unknown".
    pub license_expression: String,

    /// Pattern text to match, trimmed and normalized.
    pub text: String,

    /// Classification of this rule, derived from source rule-kind booleans.
    pub rule_kind: RuleKind,

    /// True if exact matches to this rule are false positives.
    pub is_false_positive: bool,

    /// True if this rule text is a required phrase.
    pub is_required_phrase: bool,

    /// NOTE(review): presumably excludes this rule from required-phrase
    /// generation when true — confirm against the consumer of this flag.
    ///
    /// Falls back to `false` when the field is absent from the serialized data.
    #[serde(default)]
    pub skip_for_required_phrase_generation: bool,

    /// Relevance score 0-100 (100 is most relevant).
    /// Stored as Option to distinguish between explicit 100 and default 100.
    pub relevance: Option<u8>,

    /// Minimum match coverage percentage (0-100) if specified.
    pub minimum_coverage: Option<u8>,

    /// True if minimum_coverage was explicitly stored in source frontmatter.
    pub has_stored_minimum_coverage: bool,

    /// Tokens must appear in order if true.
    pub is_continuous: bool,

    /// Filenames where this rule should be considered.
    pub referenced_filenames: Option<Vec<String>>,

    /// URLs that should be ignored when found in this rule text.
    pub ignorable_urls: Option<Vec<String>>,

    /// Emails that should be ignored when found in this rule text.
    pub ignorable_emails: Option<Vec<String>>,

    /// Copyrights that should be ignored when found in this rule text.
    pub ignorable_copyrights: Option<Vec<String>>,

    /// Holder names that should be ignored when found in this rule text.
    pub ignorable_holders: Option<Vec<String>>,

    /// Author names that should be ignored when found in this rule text.
    pub ignorable_authors: Option<Vec<String>>,

    /// Programming language for the rule if specified.
    pub language: Option<String>,

    /// Free text notes.
    pub notes: Option<String>,

    /// Whether this rule is deprecated.
    pub is_deprecated: bool,

    /// NOTE(review): presumably names the rules or licenses that supersede
    /// this one once it is deprecated — confirm against the loader.
    ///
    /// Falls back to an empty list when the field is absent from the
    /// serialized data.
    #[serde(default)]
    pub replaced_by: Vec<String>,
}
92
93/// Loader-stage normalization functions for rule data.
94impl LoadedRule {
95    /// Derive identifier from filename.
96    ///
97    /// Returns the filename as-is, which serves as the unique identifier.
98    pub fn derive_identifier(filename: &str) -> String {
99        filename.to_string()
100    }
101
102    /// Derive rule kind from source rule-kind booleans.
103    ///
104    /// Returns an error if multiple flags are set.
105    pub fn derive_rule_kind(
106        is_license_text: bool,
107        is_license_notice: bool,
108        is_license_reference: bool,
109        is_license_tag: bool,
110        is_license_intro: bool,
111        is_license_clue: bool,
112    ) -> Result<RuleKind, RuleKindError> {
113        RuleKind::from_rule_flags(
114            is_license_text,
115            is_license_notice,
116            is_license_reference,
117            is_license_tag,
118            is_license_intro,
119            is_license_clue,
120        )
121        .map_err(|_| RuleKindError::MultipleFlagsSet)
122    }
123
124    /// Normalize license expression.
125    ///
126    /// - Strips trivial outer parentheses
127    /// - For false-positive rules with no expression, returns "unknown"
128    /// - For non-false-positive rules with no expression, returns an error
129    pub fn normalize_license_expression(
130        expression: Option<&str>,
131        is_false_positive: bool,
132    ) -> Result<String, LicenseExpressionError> {
133        match expression {
134            Some(expr) if !expr.trim().is_empty() => {
135                Ok(normalize_trivial_outer_parens(expr.trim()))
136            }
137            Some(_) => {
138                if is_false_positive {
139                    Ok("unknown".to_string())
140                } else {
141                    Err(LicenseExpressionError::EmptyExpression)
142                }
143            }
144            None => {
145                if is_false_positive {
146                    Ok("unknown".to_string())
147                } else {
148                    Err(LicenseExpressionError::MissingExpression)
149                }
150            }
151        }
152    }
153
154    /// Normalize optional string field.
155    ///
156    /// Returns `None` for empty strings, `Some(trimmed)` otherwise.
157    pub fn normalize_optional_string(s: Option<&str>) -> Option<String> {
158        s.map(|s| s.trim().to_string()).filter(|s| !s.is_empty())
159    }
160
161    /// Normalize optional string list.
162    ///
163    /// Returns `None` for empty lists, `Some(list)` with trimmed strings otherwise.
164    pub fn normalize_optional_list(list: Option<&[String]>) -> Option<Vec<String>> {
165        list.map(|l| {
166            l.iter()
167                .map(|s| s.trim().to_string())
168                .filter(|s| !s.is_empty())
169                .collect::<Vec<_>>()
170        })
171        .filter(|l: &Vec<String>| !l.is_empty())
172    }
173
174    /// Validate rule-kind flags against false_positive flag.
175    ///
176    /// - False-positive rules must NOT have any is_license_* flags set
177    /// - Non-false-positive rules MUST have exactly one is_license_* flag set
178    pub fn validate_rule_kind_flags(
179        rule_kind: RuleKind,
180        is_false_positive: bool,
181    ) -> Result<(), RuleKindError> {
182        if is_false_positive && rule_kind != RuleKind::None {
183            return Err(RuleKindError::FalsePositiveWithFlags);
184        }
185        if !is_false_positive && rule_kind == RuleKind::None {
186            return Err(RuleKindError::NoFlagsSet);
187        }
188        Ok(())
189    }
190}
191
/// Reasons the rule-kind flags of a rule can be rejected.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RuleKindError {
    /// More than one `is_license_*` flag was set on the rule.
    MultipleFlagsSet,
    /// A non-false-positive rule set none of the `is_license_*` flags.
    NoFlagsSet,
    /// A false-positive rule set an `is_license_*` flag.
    FalsePositiveWithFlags,
}

impl std::fmt::Display for RuleKindError {
    /// Writes the fixed human-readable message for the variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::MultipleFlagsSet => "rule has multiple is_license_* flags set",
            Self::NoFlagsSet => "non-false-positive rule has no is_license_* flags set",
            Self::FalsePositiveWithFlags => {
                "false-positive rule cannot have is_license_* flags set"
            }
        };
        f.write_str(message)
    }
}

impl std::error::Error for RuleKindError {}
213
/// Reasons a rule's `license_expression` can be rejected.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LicenseExpressionError {
    /// No expression was provided for a non-false-positive rule.
    MissingExpression,
    /// An expression was provided for a non-false-positive rule but it is empty.
    EmptyExpression,
}

impl std::fmt::Display for LicenseExpressionError {
    /// Writes the fixed human-readable message for the variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::MissingExpression => {
                "license_expression is required for non-false-positive rules"
            }
            Self::EmptyExpression => {
                "license_expression cannot be empty for non-false-positive rules"
            }
        };
        f.write_str(message)
    }
}

impl std::error::Error for LicenseExpressionError {}
237
/// Report whether the whole (trimmed) string is wrapped by one matching pair
/// of parentheses.
///
/// The first `(` must be closed by the very last character. If the nesting
/// depth returns to zero anywhere earlier, the outer parentheses do not wrap
/// the entire expression and the answer is `false`. Unbalanced input also
/// yields `false`.
fn has_trivial_outer_parens(s: &str) -> bool {
    let candidate = s.trim();
    if !(candidate.starts_with('(') && candidate.ends_with(')')) {
        return false;
    }

    // Byte offset of the final ')'. Every other character starts before it.
    let last = candidate.len() - 1;
    let mut open = 0;
    for (pos, ch) in candidate.char_indices() {
        match ch {
            '(' => open += 1,
            ')' => {
                open -= 1;
                // Closed the opening paren before reaching the end.
                if open == 0 && pos < last {
                    return false;
                }
            }
            _ => {}
        }
    }
    open == 0
}
261
262/// Normalize license expression by removing trivial outer parentheses.
263///
264/// This recursively strips outer parens that wrap the entire expression.
265fn normalize_trivial_outer_parens(expr: &str) -> String {
266    let trimmed = expr.trim();
267    if has_trivial_outer_parens(trimmed) {
268        let inner = &trimmed[1..trimmed.len() - 1];
269        normalize_trivial_outer_parens(inner)
270    } else {
271        trimmed.to_string()
272    }
273}
274
// Unit tests for the loader-stage helpers and for the serde round-trip of
// `LoadedRule`.
#[cfg(test)]
mod tests {
    use super::*;

    // The identifier is the filename, returned unchanged.
    #[test]
    fn test_derive_identifier() {
        assert_eq!(LoadedRule::derive_identifier("mit.LICENSE"), "mit.LICENSE");
        assert_eq!(
            LoadedRule::derive_identifier("gpl-2.0_12.RULE"),
            "gpl-2.0_12.RULE"
        );
    }

    // Each flag, set on its own, maps to its matching `RuleKind` variant.
    #[test]
    fn test_derive_rule_kind_single_flag() {
        assert_eq!(
            LoadedRule::derive_rule_kind(true, false, false, false, false, false),
            Ok(RuleKind::Text)
        );
        assert_eq!(
            LoadedRule::derive_rule_kind(false, true, false, false, false, false),
            Ok(RuleKind::Notice)
        );
        assert_eq!(
            LoadedRule::derive_rule_kind(false, false, true, false, false, false),
            Ok(RuleKind::Reference)
        );
        assert_eq!(
            LoadedRule::derive_rule_kind(false, false, false, true, false, false),
            Ok(RuleKind::Tag)
        );
        assert_eq!(
            LoadedRule::derive_rule_kind(false, false, false, false, true, false),
            Ok(RuleKind::Intro)
        );
        assert_eq!(
            LoadedRule::derive_rule_kind(false, false, false, false, false, true),
            Ok(RuleKind::Clue)
        );
    }

    // No flag set is not an error at derivation time; it yields `RuleKind::None`.
    #[test]
    fn test_derive_rule_kind_none() {
        assert_eq!(
            LoadedRule::derive_rule_kind(false, false, false, false, false, false),
            Ok(RuleKind::None)
        );
    }

    // Two flags set at once are rejected.
    #[test]
    fn test_derive_rule_kind_multiple_flags() {
        assert_eq!(
            LoadedRule::derive_rule_kind(true, true, false, false, false, false),
            Err(RuleKindError::MultipleFlagsSet)
        );
    }

    // A plain expression passes through unchanged.
    #[test]
    fn test_normalize_license_expression_with_value() {
        assert_eq!(
            LoadedRule::normalize_license_expression(Some("mit"), false),
            Ok("mit".to_string())
        );
    }

    // False-positive rules fall back to "unknown" for missing, empty, and
    // whitespace-only expressions.
    #[test]
    fn test_normalize_license_expression_false_positive_fallback() {
        assert_eq!(
            LoadedRule::normalize_license_expression(None, true),
            Ok("unknown".to_string())
        );
        assert_eq!(
            LoadedRule::normalize_license_expression(Some(""), true),
            Ok("unknown".to_string())
        );
        assert_eq!(
            LoadedRule::normalize_license_expression(Some("   "), true),
            Ok("unknown".to_string())
        );
    }

    // A missing expression on a regular rule is an error.
    #[test]
    fn test_normalize_license_expression_missing_error() {
        assert_eq!(
            LoadedRule::normalize_license_expression(None, false),
            Err(LicenseExpressionError::MissingExpression)
        );
    }

    // An empty expression on a regular rule is a distinct error from a missing one.
    #[test]
    fn test_normalize_license_expression_empty_error() {
        assert_eq!(
            LoadedRule::normalize_license_expression(Some(""), false),
            Err(LicenseExpressionError::EmptyExpression)
        );
    }

    // Wrapping parentheses are removed, including nested layers, while
    // parentheses that only group sub-expressions are kept.
    #[test]
    fn test_normalize_trivial_outer_parens() {
        assert_eq!(normalize_trivial_outer_parens("mit"), "mit");
        assert_eq!(normalize_trivial_outer_parens("(mit)"), "mit");
        assert_eq!(normalize_trivial_outer_parens("((mit))"), "mit");
        assert_eq!(
            normalize_trivial_outer_parens("(mit OR apache-2.0)"),
            "mit OR apache-2.0"
        );
        assert_eq!(
            normalize_trivial_outer_parens("(mit) OR (apache-2.0)"),
            "(mit) OR (apache-2.0)"
        );
    }

    // Missing, empty, and blank strings become `None`; other values are trimmed.
    #[test]
    fn test_normalize_optional_string() {
        assert_eq!(LoadedRule::normalize_optional_string(None), None);
        assert_eq!(LoadedRule::normalize_optional_string(Some("")), None);
        assert_eq!(LoadedRule::normalize_optional_string(Some("   ")), None);
        assert_eq!(
            LoadedRule::normalize_optional_string(Some("hello")),
            Some("hello".to_string())
        );
        assert_eq!(
            LoadedRule::normalize_optional_string(Some("  hello  ")),
            Some("hello".to_string())
        );
    }

    // Missing and empty lists become `None`, entries are trimmed, and a list
    // made up only of blank entries also becomes `None`.
    #[test]
    fn test_normalize_optional_list() {
        assert_eq!(LoadedRule::normalize_optional_list(None), None);
        assert_eq!(LoadedRule::normalize_optional_list(Some(&[])), None);
        assert_eq!(
            LoadedRule::normalize_optional_list(Some(&["a".to_string(), "b".to_string()])),
            Some(vec!["a".to_string(), "b".to_string()])
        );
        assert_eq!(
            LoadedRule::normalize_optional_list(Some(&["  a  ".to_string(), "  b  ".to_string()])),
            Some(vec!["a".to_string(), "b".to_string()])
        );
        assert_eq!(
            LoadedRule::normalize_optional_list(Some(&["".to_string(), "  ".to_string()])),
            None
        );
    }

    // Covers all four combinations of "has a rule kind" and "is false positive".
    #[test]
    fn test_validate_rule_kind_flags() {
        assert!(LoadedRule::validate_rule_kind_flags(RuleKind::Text, false).is_ok());
        assert_eq!(
            LoadedRule::validate_rule_kind_flags(RuleKind::None, false),
            Err(RuleKindError::NoFlagsSet)
        );
        assert!(LoadedRule::validate_rule_kind_flags(RuleKind::None, true).is_ok());
        assert_eq!(
            LoadedRule::validate_rule_kind_flags(RuleKind::Text, true),
            Err(RuleKindError::FalsePositiveWithFlags)
        );
    }

    // A fully populated rule survives JSON serialization and deserialization
    // without any field changing.
    #[test]
    fn test_serde_roundtrip() {
        let rule = LoadedRule {
            identifier: "mit.LICENSE".to_string(),
            license_expression: "mit".to_string(),
            text: "MIT License".to_string(),
            rule_kind: RuleKind::Text,
            is_false_positive: false,
            is_required_phrase: false,
            skip_for_required_phrase_generation: false,
            relevance: Some(100),
            minimum_coverage: Some(90),
            has_stored_minimum_coverage: true,
            is_continuous: false,
            referenced_filenames: Some(vec!["MIT.txt".to_string()]),
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: Some("Test note".to_string()),
            is_deprecated: false,
            replaced_by: vec![],
        };

        let json = serde_json::to_string(&rule).unwrap();
        let deserialized: LoadedRule = serde_json::from_str(&json).unwrap();
        assert_eq!(rule, deserialized);
    }
}