Skip to main content

provenant/license_detection/models/
loaded_rule.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Loader-stage rule type.
5//!
6//! This module defines `LoadedRule`, which represents a parsed and normalized
7//! rule file (.RULE or .LICENSE) before it is converted to a runtime `Rule`.
8//!
9//! Loader-stage responsibilities include:
10//! - Text trimming and normalization
11//! - Fallback/default handling derived only from one file
12//! - Empty-vector to `None` cleanup
13//! - File-local validation
14//! - False-positive handling for missing `license_expression`
15
16use serde::{Deserialize, Serialize};
17
18use super::RuleKind;
19
/// Loader-stage representation of a rule.
///
/// This struct contains parsed and normalized data from a .RULE or .LICENSE file.
/// It is serialized at build time and deserialized at runtime, then converted
/// to a runtime `Rule` during the build stage.
///
/// Fields marked `#[serde(default)]` may be absent from serialized data; every
/// other field must be present for deserialization to succeed.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct LoadedRule {
    /// Unique identifier derived from the filename (e.g., "mit.LICENSE").
    pub identifier: String,

    /// License expression string using SPDX syntax and ScanCode license keys.
    /// For false-positive rules with no source expression, this is set to "unknown".
    pub license_expression: String,

    /// Pattern text to match, trimmed and normalized.
    pub text: String,

    /// Classification of this rule, derived from source rule-kind booleans.
    pub rule_kind: RuleKind,

    /// True if exact matches to this rule are false positives.
    pub is_false_positive: bool,

    /// True if this rule text is a required phrase.
    pub is_required_phrase: bool,

    /// Flag related to required-phrase generation; presumably excludes this
    /// rule from that step when true (TODO confirm against the generator).
    /// Falls back to `false` when absent from serialized data.
    #[serde(default)]
    pub skip_for_required_phrase_generation: bool,

    /// Relevance score 0-100 (100 is most relevant).
    /// Stored as Option to distinguish between explicit 100 and default 100.
    pub relevance: Option<u8>,

    /// Minimum match coverage percentage (0-100) if specified.
    pub minimum_coverage: Option<u8>,

    /// True if minimum_coverage was explicitly stored in source frontmatter.
    pub has_stored_minimum_coverage: bool,

    /// Tokens must appear in order if true.
    pub is_continuous: bool,

    /// Filenames where this rule should be considered.
    pub referenced_filenames: Option<Vec<String>>,

    /// URLs that should be ignored when found in this rule text.
    pub ignorable_urls: Option<Vec<String>>,

    /// Emails that should be ignored when found in this rule text.
    pub ignorable_emails: Option<Vec<String>>,

    /// Copyrights that should be ignored when found in this rule text.
    pub ignorable_copyrights: Option<Vec<String>>,

    /// Holder names that should be ignored when found in this rule text.
    pub ignorable_holders: Option<Vec<String>>,

    /// Author names that should be ignored when found in this rule text.
    pub ignorable_authors: Option<Vec<String>>,

    /// Programming language for the rule if specified.
    pub language: Option<String>,

    /// Free text notes.
    pub notes: Option<String>,

    /// Whether this rule is deprecated.
    pub is_deprecated: bool,

    /// Replacement entries for this rule; presumably the keys or identifiers
    /// that supersede it once deprecated (TODO confirm against the loader).
    /// Falls back to an empty list when absent from serialized data.
    #[serde(default)]
    pub replaced_by: Vec<String>,
}
92
93/// Loader-stage normalization functions for rule data.
94impl LoadedRule {
95    /// Derive identifier from filename.
96    ///
97    /// Returns the filename as-is, which serves as the unique identifier.
98    pub fn derive_identifier(filename: &str) -> String {
99        filename.to_string()
100    }
101
102    /// Derive rule kind from source rule-kind booleans.
103    ///
104    /// Returns an error if multiple flags are set.
105    pub fn derive_rule_kind(
106        is_license_text: bool,
107        is_license_notice: bool,
108        is_license_reference: bool,
109        is_license_tag: bool,
110        is_license_intro: bool,
111        is_license_clue: bool,
112    ) -> Result<RuleKind, RuleKindError> {
113        RuleKind::from_rule_flags(
114            is_license_text,
115            is_license_notice,
116            is_license_reference,
117            is_license_tag,
118            is_license_intro,
119            is_license_clue,
120        )
121        .map_err(|_| RuleKindError::MultipleFlagsSet)
122    }
123
124    /// Normalize license expression.
125    ///
126    /// - Strips trivial outer parentheses
127    /// - For false-positive rules with no expression, returns "unknown"
128    /// - For non-false-positive rules with no expression, returns an error
129    pub fn normalize_license_expression(
130        expression: Option<&str>,
131        is_false_positive: bool,
132    ) -> Result<String, LicenseExpressionError> {
133        match expression {
134            Some(expr) if !expr.trim().is_empty() => {
135                Ok(normalize_trivial_outer_parens(expr.trim()))
136            }
137            Some(_) => {
138                if is_false_positive {
139                    Ok("unknown".to_string())
140                } else {
141                    Err(LicenseExpressionError::EmptyExpression)
142                }
143            }
144            None => {
145                if is_false_positive {
146                    Ok("unknown".to_string())
147                } else {
148                    Err(LicenseExpressionError::MissingExpression)
149                }
150            }
151        }
152    }
153
154    /// Normalize optional string field.
155    ///
156    /// Returns `None` for empty strings, `Some(trimmed)` otherwise.
157    pub fn normalize_optional_string(s: Option<&str>) -> Option<String> {
158        s.map(|s| s.trim().to_string()).filter(|s| !s.is_empty())
159    }
160
161    /// Normalize optional string list.
162    ///
163    /// Returns `None` for empty lists, `Some(list)` with trimmed strings otherwise.
164    pub fn normalize_optional_list(list: Option<&[String]>) -> Option<Vec<String>> {
165        list.map(|l| {
166            l.iter()
167                .map(|s| s.trim().to_string())
168                .filter(|s| !s.is_empty())
169                .collect::<Vec<_>>()
170        })
171        .filter(|l: &Vec<String>| !l.is_empty())
172    }
173
174    /// Validate rule-kind flags against false_positive flag.
175    ///
176    /// - False-positive rules must NOT have any is_license_* flags set
177    /// - Non-false-positive rules MUST have exactly one is_license_* flag set
178    pub fn validate_rule_kind_flags(
179        rule_kind: RuleKind,
180        is_false_positive: bool,
181    ) -> Result<(), RuleKindError> {
182        if is_false_positive && rule_kind != RuleKind::None {
183            return Err(RuleKindError::FalsePositiveWithFlags);
184        }
185        if !is_false_positive && rule_kind == RuleKind::None {
186            return Err(RuleKindError::NoFlagsSet);
187        }
188        Ok(())
189    }
190}
191
/// Error type for rule-kind validation failures.
///
/// Produced by `LoadedRule::derive_rule_kind` and
/// `LoadedRule::validate_rule_kind_flags`.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
pub enum RuleKindError {
    /// Reported by `LoadedRule::derive_rule_kind` whenever
    /// `RuleKind::from_rule_flags` fails.
    #[error("rule has multiple is_license_* flags set")]
    MultipleFlagsSet,
    /// A rule that is not a false positive resolved to `RuleKind::None`.
    #[error("non-false-positive rule has no is_license_* flags set")]
    NoFlagsSet,
    /// A false-positive rule resolved to a kind other than `RuleKind::None`.
    #[error("false-positive rule cannot have is_license_* flags set")]
    FalsePositiveWithFlags,
}
202
/// Error type for license expression validation failures.
///
/// Produced by `LoadedRule::normalize_license_expression`, and only for rules
/// that are not false positives (false-positive rules fall back to "unknown").
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
pub enum LicenseExpressionError {
    /// No expression was supplied at all (`None`).
    #[error("license_expression is required for non-false-positive rules")]
    MissingExpression,
    /// An expression was supplied but has no content
    /// (e.g. it is empty or whitespace-only).
    #[error("license_expression cannot be empty for non-false-positive rules")]
    EmptyExpression,
}
211
/// Report whether `s`, ignoring surrounding whitespace, is wrapped by one
/// matching pair of parentheses.
///
/// The opening paren must be the first character and its partner must be the
/// last one: if nesting returns to depth zero anywhere before the final
/// character (as in `(a) OR (b)`), the outer parens are not trivial.
/// Unbalanced input yields `false`.
fn has_trivial_outer_parens(s: &str) -> bool {
    let candidate = s.trim();
    if !(candidate.starts_with('(') && candidate.ends_with(')')) {
        return false;
    }

    // `(` and `)` are single-byte ASCII and never occur inside a multi-byte
    // UTF-8 sequence, so scanning bytes sees the same parens as scanning chars.
    // `candidate` is non-empty here, so the subtraction cannot underflow.
    let last = candidate.len() - 1;
    let mut open = 0;
    for (pos, byte) in candidate.bytes().enumerate() {
        match byte {
            b'(' => open += 1,
            b')' => {
                open -= 1;
                // The first paren closed before the end: it does not wrap everything.
                if open == 0 && pos != last {
                    return false;
                }
            }
            _ => {}
        }
    }
    open == 0
}
235
236/// Normalize license expression by removing trivial outer parentheses.
237///
238/// This recursively strips outer parens that wrap the entire expression.
239fn normalize_trivial_outer_parens(expr: &str) -> String {
240    let trimmed = expr.trim();
241    if has_trivial_outer_parens(trimmed) {
242        let inner = &trimmed[1..trimmed.len() - 1];
243        normalize_trivial_outer_parens(inner)
244    } else {
245        trimmed.to_string()
246    }
247}
248
#[cfg(test)]
mod tests {
    use super::*;

    /// Calls `LoadedRule::derive_rule_kind` with the flags given in parameter
    /// order: text, notice, reference, tag, intro, clue.
    fn derive_with_flags(flags: [bool; 6]) -> Result<RuleKind, RuleKindError> {
        let [text, notice, reference, tag, intro, clue] = flags;
        LoadedRule::derive_rule_kind(text, notice, reference, tag, intro, clue)
    }

    /// Builds an owned string list from literals.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| (*item).to_owned()).collect()
    }

    /// Rule fixture shared by the serde tests. The two `#[serde(default)]`
    /// fields deliberately hold their default values.
    fn sample_rule() -> LoadedRule {
        LoadedRule {
            identifier: "mit.LICENSE".to_string(),
            license_expression: "mit".to_string(),
            text: "MIT License".to_string(),
            rule_kind: RuleKind::Text,
            is_false_positive: false,
            is_required_phrase: false,
            skip_for_required_phrase_generation: false,
            relevance: Some(100),
            minimum_coverage: Some(90),
            has_stored_minimum_coverage: true,
            is_continuous: false,
            referenced_filenames: Some(vec!["MIT.txt".to_string()]),
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: Some("Test note".to_string()),
            is_deprecated: false,
            replaced_by: vec![],
        }
    }

    #[test]
    fn test_derive_identifier() {
        for &name in &["mit.LICENSE", "gpl-2.0_12.RULE"] {
            assert_eq!(LoadedRule::derive_identifier(name), name);
        }
    }

    #[test]
    fn test_derive_rule_kind_single_flag() {
        assert_eq!(
            derive_with_flags([true, false, false, false, false, false]),
            Ok(RuleKind::Text)
        );
        assert_eq!(
            derive_with_flags([false, true, false, false, false, false]),
            Ok(RuleKind::Notice)
        );
        assert_eq!(
            derive_with_flags([false, false, true, false, false, false]),
            Ok(RuleKind::Reference)
        );
        assert_eq!(
            derive_with_flags([false, false, false, true, false, false]),
            Ok(RuleKind::Tag)
        );
        assert_eq!(
            derive_with_flags([false, false, false, false, true, false]),
            Ok(RuleKind::Intro)
        );
        assert_eq!(
            derive_with_flags([false, false, false, false, false, true]),
            Ok(RuleKind::Clue)
        );
    }

    #[test]
    fn test_derive_rule_kind_none() {
        assert_eq!(derive_with_flags([false; 6]), Ok(RuleKind::None));
    }

    #[test]
    fn test_derive_rule_kind_multiple_flags() {
        assert_eq!(
            derive_with_flags([true, true, false, false, false, false]),
            Err(RuleKindError::MultipleFlagsSet)
        );
    }

    #[test]
    fn test_normalize_license_expression_with_value() {
        assert_eq!(
            LoadedRule::normalize_license_expression(Some("mit"), false),
            Ok("mit".to_string())
        );
        // Surrounding whitespace and trivial outer parens are removed.
        assert_eq!(
            LoadedRule::normalize_license_expression(Some("  (mit)  "), false),
            Ok("mit".to_string())
        );
        // A real expression wins over the false-positive fallback.
        assert_eq!(
            LoadedRule::normalize_license_expression(Some("mit"), true),
            Ok("mit".to_string())
        );
    }

    #[test]
    fn test_normalize_license_expression_false_positive_fallback() {
        for &input in &[None, Some(""), Some("   ")] {
            assert_eq!(
                LoadedRule::normalize_license_expression(input, true),
                Ok("unknown".to_string())
            );
        }
    }

    #[test]
    fn test_normalize_license_expression_missing_error() {
        assert_eq!(
            LoadedRule::normalize_license_expression(None, false),
            Err(LicenseExpressionError::MissingExpression)
        );
    }

    #[test]
    fn test_normalize_license_expression_empty_error() {
        for &blank in &["", "   ", "\t\n"] {
            assert_eq!(
                LoadedRule::normalize_license_expression(Some(blank), false),
                Err(LicenseExpressionError::EmptyExpression)
            );
        }
    }

    #[test]
    fn test_has_trivial_outer_parens() {
        for &wrapped in &["(mit)", "((mit))", "  (mit OR gpl)  ", "(mit AND (gpl OR bsd))"] {
            assert!(has_trivial_outer_parens(wrapped), "{:?}", wrapped);
        }
        for &plain in &["", "mit", "(mit", "mit)", "((mit)", "(mit) OR (apache-2.0)"] {
            assert!(!has_trivial_outer_parens(plain), "{:?}", plain);
        }
    }

    #[test]
    fn test_normalize_trivial_outer_parens() {
        let cases = [
            ("mit", "mit"),
            ("(mit)", "mit"),
            ("((mit))", "mit"),
            ("(mit OR apache-2.0)", "mit OR apache-2.0"),
            ("(mit) OR (apache-2.0)", "(mit) OR (apache-2.0)"),
            ("  ( (mit OR gpl) )  ", "mit OR gpl"),
            ("(mit AND (gpl OR bsd))", "mit AND (gpl OR bsd)"),
            ("((mit)", "((mit)"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(normalize_trivial_outer_parens(input), expected);
        }
    }

    #[test]
    fn test_normalize_optional_string() {
        for &blank in &[None, Some(""), Some("   ")] {
            assert_eq!(LoadedRule::normalize_optional_string(blank), None);
        }
        for &padded in &["hello", "  hello  "] {
            assert_eq!(
                LoadedRule::normalize_optional_string(Some(padded)),
                Some("hello".to_string())
            );
        }
    }

    #[test]
    fn test_normalize_optional_list() {
        assert_eq!(LoadedRule::normalize_optional_list(None), None);
        assert_eq!(
            LoadedRule::normalize_optional_list(Some(owned(&[]).as_slice())),
            None
        );
        assert_eq!(
            LoadedRule::normalize_optional_list(Some(owned(&["a", "b"]).as_slice())),
            Some(owned(&["a", "b"]))
        );
        assert_eq!(
            LoadedRule::normalize_optional_list(Some(owned(&["  a  ", "  b  "]).as_slice())),
            Some(owned(&["a", "b"]))
        );
        assert_eq!(
            LoadedRule::normalize_optional_list(Some(owned(&["", "  "]).as_slice())),
            None
        );
        // Blank entries are dropped while the rest keep their order.
        assert_eq!(
            LoadedRule::normalize_optional_list(Some(owned(&[" a ", "", "b", "  "]).as_slice())),
            Some(owned(&["a", "b"]))
        );
    }

    #[test]
    fn test_validate_rule_kind_flags() {
        assert!(LoadedRule::validate_rule_kind_flags(RuleKind::Text, false).is_ok());
        assert_eq!(
            LoadedRule::validate_rule_kind_flags(RuleKind::None, false),
            Err(RuleKindError::NoFlagsSet)
        );
        assert!(LoadedRule::validate_rule_kind_flags(RuleKind::None, true).is_ok());
        assert_eq!(
            LoadedRule::validate_rule_kind_flags(RuleKind::Text, true),
            Err(RuleKindError::FalsePositiveWithFlags)
        );
    }

    #[test]
    fn test_serde_roundtrip() {
        let rule = sample_rule();

        let json = serde_json::to_string(&rule).unwrap();
        let deserialized: LoadedRule = serde_json::from_str(&json).unwrap();
        assert_eq!(rule, deserialized);
    }

    #[test]
    fn test_serde_defaults_for_missing_fields() {
        let rule = sample_rule();

        let mut value = serde_json::to_value(&rule).unwrap();
        let fields = value
            .as_object_mut()
            .expect("LoadedRule serializes as a JSON object");
        // Dropping the `#[serde(default)]` fields must not break deserialization.
        for &key in &["skip_for_required_phrase_generation", "replaced_by"] {
            assert!(fields.remove(key).is_some(), "field {} was not serialized", key);
        }

        let restored: LoadedRule = serde_json::from_value(value).unwrap();
        assert_eq!(rule, restored);
    }
}