Skip to main content

provenant/license_detection/models/
rule.rs

1//! Rule metadata loaded from .LICENSE and .RULE files.
2
3use std::collections::HashMap;
4use std::ops::Range;
5
6use serde::{Deserialize, Serialize};
7
8use crate::license_detection::index::dictionary::TokenId;
9
/// Classification of a rule; a value holds exactly one kind, or `None` for no kind.
///
/// The variants mirror the boolean flags accepted by [`RuleKind::from_rule_flags`].
/// Variant declaration order is significant: the derived `PartialOrd`/`Ord`
/// compare variants in the order they are listed here.
#[derive(
    Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default, Serialize, Deserialize,
)]
pub enum RuleKind {
    /// No kind flag is set; this is the `Default` value.
    #[default]
    None,
    /// Kind selected by the `is_license_text` flag.
    Text,
    /// Kind selected by the `is_license_notice` flag.
    Notice,
    /// Kind selected by the `is_license_reference` flag.
    Reference,
    /// Kind selected by the `is_license_tag` flag.
    Tag,
    /// Kind selected by the `is_license_intro` flag.
    Intro,
    /// Kind selected by the `is_license_clue` flag.
    Clue,
}
23
24impl RuleKind {
25    pub fn from_rule_flags(
26        is_license_text: bool,
27        is_license_notice: bool,
28        is_license_reference: bool,
29        is_license_tag: bool,
30        is_license_intro: bool,
31        is_license_clue: bool,
32    ) -> Result<Self, &'static str> {
33        let mut active = None;
34
35        for (enabled, kind) in [
36            (is_license_text, Self::Text),
37            (is_license_notice, Self::Notice),
38            (is_license_reference, Self::Reference),
39            (is_license_tag, Self::Tag),
40            (is_license_intro, Self::Intro),
41            (is_license_clue, Self::Clue),
42        ] {
43            if !enabled {
44                continue;
45            }
46
47            if active.replace(kind).is_some() {
48                return Err("rule has multiple rule kinds set");
49            }
50        }
51
52        Ok(active.unwrap_or(Self::None))
53    }
54
55    pub fn from_match_flags(
56        is_license_text: bool,
57        is_license_reference: bool,
58        is_license_tag: bool,
59        is_license_intro: bool,
60        is_license_clue: bool,
61    ) -> Result<Self, &'static str> {
62        Self::from_rule_flags(
63            is_license_text,
64            false,
65            is_license_reference,
66            is_license_tag,
67            is_license_intro,
68            is_license_clue,
69        )
70        .map_err(|_| "license match has multiple rule kinds set")
71    }
72
73    pub const fn is_license_text(self) -> bool {
74        matches!(self, Self::Text)
75    }
76
77    pub const fn is_license_notice(self) -> bool {
78        matches!(self, Self::Notice)
79    }
80
81    pub const fn is_license_reference(self) -> bool {
82        matches!(self, Self::Reference)
83    }
84
85    pub const fn is_license_tag(self) -> bool {
86        matches!(self, Self::Tag)
87    }
88
89    pub const fn is_license_intro(self) -> bool {
90        matches!(self, Self::Intro)
91    }
92
93    pub const fn is_license_clue(self) -> bool {
94        matches!(self, Self::Clue)
95    }
96}
97
/// Rule metadata loaded from .LICENSE and .RULE files.
///
/// Equality is derived and therefore compares every field, whereas the `Ord`
/// implementation for this type orders rules by `identifier` alone.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Rule {
    /// Unique identifier for this rule (e.g., "mit.LICENSE", "gpl-2.0_12.RULE")
    /// Used for sorting to match Python's attr.s field order.
    /// This is the primary sort key after rid (which is None at sort time in Python).
    pub identifier: String,

    /// License expression string using SPDX syntax and ScanCode license keys
    pub license_expression: String,

    /// Pattern text to match
    pub text: String,

    /// Token IDs for the text (assigned during indexing)
    pub tokens: Vec<TokenId>,

    /// Classification of this rule.
    pub rule_kind: RuleKind,

    /// True if exact matches to this rule are false positives
    pub is_false_positive: bool,

    /// True if this rule text is a required phrase.
    /// A required phrase is an essential section of the rule text which must be
    /// present in the case of partial matches.
    pub is_required_phrase: bool,

    /// True if this rule was created from a license file (not a .RULE file)
    pub is_from_license: bool,

    /// Relevance score 0-100 (100 is most relevant)
    pub relevance: u8,

    /// Minimum match coverage percentage (0-100) if specified
    pub minimum_coverage: Option<u8>,

    /// True if minimum_coverage was explicitly stored in source frontmatter
    pub has_stored_minimum_coverage: bool,

    /// Tokens must appear in order if true
    pub is_continuous: bool,

    /// Token position spans for required phrases parsed from {{...}} markers.
    /// Each span represents positions in the rule text that MUST be matched.
    pub required_phrase_spans: Vec<Range<usize>>,

    /// Mapping from token position to count of stopwords at that position.
    /// Used for required phrase validation.
    pub stopwords_by_pos: HashMap<usize, usize>,

    /// Filenames where this rule should be considered
    pub referenced_filenames: Option<Vec<String>>,

    /// URLs that should be ignored when found in this rule text
    pub ignorable_urls: Option<Vec<String>>,

    /// Emails that should be ignored when found in this rule text
    pub ignorable_emails: Option<Vec<String>>,

    /// Copyrights that should be ignored when found in this rule text
    pub ignorable_copyrights: Option<Vec<String>>,

    /// Holder names that should be ignored when found in this rule text
    pub ignorable_holders: Option<Vec<String>>,

    /// Author names that should be ignored when found in this rule text
    pub ignorable_authors: Option<Vec<String>>,

    /// Programming language for the rule if specified
    ///
    /// NOTE(review): upstream ScanCode appears to use this attribute for the
    /// natural language of the rule text (a language code), not a programming
    /// language — confirm against the loader and adjust this description.
    pub language: Option<String>,

    /// Free text notes
    pub notes: Option<String>,

    /// Count of unique token IDs in the rule (computed during indexing)
    pub length_unique: usize,

    /// Count of unique legalese token IDs (tokens with ID < len_legalese)
    pub high_length_unique: usize,

    /// Total count of legalese token occurrences (with duplicates)
    pub high_length: usize,

    /// Minimum matched length threshold (occurrences-based)
    pub min_matched_length: usize,

    /// Minimum high-value token matched length threshold (occurrences-based)
    pub min_high_matched_length: usize,

    /// Minimum matched length threshold (unique tokens)
    pub min_matched_length_unique: usize,

    /// Minimum high-value token matched length threshold (unique tokens)
    pub min_high_matched_length_unique: usize,

    /// True if rule length < SMALL_RULE (15 tokens)
    pub is_small: bool,

    /// True if rule length < TINY_RULE (6 tokens)
    pub is_tiny: bool,

    /// True if the rule's first token is "license", "licence", or "licensed"
    pub starts_with_license: bool,

    /// True if the rule's last token is "license", "licence", or "licensed"
    pub ends_with_license: bool,

    /// Whether this rule is deprecated
    pub is_deprecated: bool,

    /// SPDX license identifier if available
    pub spdx_license_key: Option<String>,

    /// Alternative SPDX license identifiers (aliases)
    pub other_spdx_license_keys: Vec<String>,
}
215
216impl PartialOrd for Rule {
217    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
218        Some(self.cmp(other))
219    }
220}
221
222impl Ord for Rule {
223    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
224        self.identifier.cmp(&other.identifier)
225    }
226}
227
228impl Rule {
229    pub const fn kind(&self) -> RuleKind {
230        self.rule_kind
231    }
232
233    pub const fn is_license_text(&self) -> bool {
234        self.rule_kind.is_license_text()
235    }
236
237    /// Returns true if this rule is a license notice pattern.
238    ///
239    /// Note: This method is kept for API completeness and potential future use.
240    /// License matches cannot have `is_license_notice` - only rules can.
241    #[allow(dead_code)]
242    pub const fn is_license_notice(&self) -> bool {
243        self.rule_kind.is_license_notice()
244    }
245
246    pub const fn is_license_reference(&self) -> bool {
247        self.rule_kind.is_license_reference()
248    }
249
250    pub const fn is_license_tag(&self) -> bool {
251        self.rule_kind.is_license_tag()
252    }
253
254    /// Returns true if this rule is a license introduction pattern.
255    ///
256    /// Note: This method is kept for API completeness and potential future use.
257    #[allow(dead_code)]
258    pub const fn is_license_intro(&self) -> bool {
259        self.rule_kind.is_license_intro()
260    }
261
262    pub const fn is_license_clue(&self) -> bool {
263        self.rule_kind.is_license_clue()
264    }
265}