// provenant/license_detection/models/rule.rs

//! Rule metadata loaded from .LICENSE and .RULE files.
2
3use std::collections::HashMap;
4use std::ops::Range;
5
6use serde::{Deserialize, Serialize};
7
8use crate::license_detection::index::dictionary::TokenId;
9
/// Base URL of the ScanCode license data directory.
///
/// `Rule::rule_url` appends `/<license_expression>.LICENSE` to this value for
/// rules that were created from a license file.
const SCANCODE_LICENSE_URL_BASE: &str =
    "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses";

/// Base URL of the ScanCode rule data directory.
///
/// `Rule::rule_url` appends `/<identifier>` to this value for rules that were
/// not created from a license file.
const SCANCODE_RULE_URL_BASE: &str =
    "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules";
14
15#[derive(
16    Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default, Serialize, Deserialize,
17)]
18pub enum RuleKind {
19    #[default]
20    None,
21    Text,
22    Notice,
23    Reference,
24    Tag,
25    Intro,
26    Clue,
27}
28
29impl RuleKind {
30    pub fn from_rule_flags(
31        is_license_text: bool,
32        is_license_notice: bool,
33        is_license_reference: bool,
34        is_license_tag: bool,
35        is_license_intro: bool,
36        is_license_clue: bool,
37    ) -> Result<Self, &'static str> {
38        let mut active = None;
39
40        for (enabled, kind) in [
41            (is_license_text, Self::Text),
42            (is_license_notice, Self::Notice),
43            (is_license_reference, Self::Reference),
44            (is_license_tag, Self::Tag),
45            (is_license_intro, Self::Intro),
46            (is_license_clue, Self::Clue),
47        ] {
48            if !enabled {
49                continue;
50            }
51
52            if active.replace(kind).is_some() {
53                return Err("rule has multiple rule kinds set");
54            }
55        }
56
57        Ok(active.unwrap_or(Self::None))
58    }
59
60    pub fn from_match_flags(
61        is_license_text: bool,
62        is_license_reference: bool,
63        is_license_tag: bool,
64        is_license_intro: bool,
65        is_license_clue: bool,
66    ) -> Result<Self, &'static str> {
67        Self::from_rule_flags(
68            is_license_text,
69            false,
70            is_license_reference,
71            is_license_tag,
72            is_license_intro,
73            is_license_clue,
74        )
75        .map_err(|_| "license match has multiple rule kinds set")
76    }
77
78    pub const fn is_license_text(self) -> bool {
79        matches!(self, Self::Text)
80    }
81
82    pub const fn is_license_notice(self) -> bool {
83        matches!(self, Self::Notice)
84    }
85
86    pub const fn is_license_reference(self) -> bool {
87        matches!(self, Self::Reference)
88    }
89
90    pub const fn is_license_tag(self) -> bool {
91        matches!(self, Self::Tag)
92    }
93
94    pub const fn is_license_intro(self) -> bool {
95        matches!(self, Self::Intro)
96    }
97
98    pub const fn is_license_clue(self) -> bool {
99        matches!(self, Self::Clue)
100    }
101}
102
103/// Rule metadata loaded from .LICENSE and .RULE files.
104#[derive(Debug, Clone, PartialEq, Eq)]
105pub struct Rule {
106    /// Unique identifier for this rule (e.g., "mit.LICENSE", "gpl-2.0_12.RULE")
107    /// Used for sorting to match Python's attr.s field order.
108    /// This is the primary sort key after rid (which is None at sort time in Python).
109    pub identifier: String,
110
111    /// License expression string using SPDX syntax and ScanCode license keys
112    pub license_expression: String,
113
114    /// Pattern text to match
115    pub text: String,
116
117    /// Token IDs for the text (assigned during indexing)
118    pub tokens: Vec<TokenId>,
119
120    /// Classification of this rule.
121    pub rule_kind: RuleKind,
122
123    /// True if exact matches to this rule are false positives
124    pub is_false_positive: bool,
125
126    /// True if this rule text is a required phrase.
127    /// A required phrase is an essential section of the rule text which must be
128    /// present in the case of partial matches.
129    pub is_required_phrase: bool,
130
131    /// True if this rule was created from a license file (not a .RULE file)
132    pub is_from_license: bool,
133
134    /// Relevance score 0-100 (100 is most relevant)
135    pub relevance: u8,
136
137    /// Minimum match coverage percentage (0-100) if specified
138    pub minimum_coverage: Option<u8>,
139
140    /// True if minimum_coverage was explicitly stored in source frontmatter
141    pub has_stored_minimum_coverage: bool,
142
143    /// Tokens must appear in order if true
144    pub is_continuous: bool,
145
146    /// Token position spans for required phrases parsed from {{...}} markers.
147    /// Each span represents positions in the rule text that MUST be matched.
148    pub required_phrase_spans: Vec<Range<usize>>,
149
150    /// Mapping from token position to count of stopwords at that position.
151    /// Used for required phrase validation.
152    pub stopwords_by_pos: HashMap<usize, usize>,
153
154    /// Filenames where this rule should be considered
155    pub referenced_filenames: Option<Vec<String>>,
156
157    /// URLs that should be ignored when found in this rule text
158    pub ignorable_urls: Option<Vec<String>>,
159
160    /// Emails that should be ignored when found in this rule text
161    pub ignorable_emails: Option<Vec<String>>,
162
163    /// Copyrights that should be ignored when found in this rule text
164    pub ignorable_copyrights: Option<Vec<String>>,
165
166    /// Holder names that should be ignored when found in this rule text
167    pub ignorable_holders: Option<Vec<String>>,
168
169    /// Author names that should be ignored when found in this rule text
170    pub ignorable_authors: Option<Vec<String>>,
171
172    /// Programming language for the rule if specified
173    pub language: Option<String>,
174
175    /// Free text notes
176    pub notes: Option<String>,
177
178    /// Count of unique token IDs in the rule (computed during indexing)
179    pub length_unique: usize,
180
181    /// Count of unique legalese token IDs (tokens with ID < len_legalese)
182    pub high_length_unique: usize,
183
184    /// Total count of legalese token occurrences (with duplicates)
185    pub high_length: usize,
186
187    /// Minimum matched length threshold (occurrences-based)
188    pub min_matched_length: usize,
189
190    /// Minimum high-value token matched length threshold (occurrences-based)
191    pub min_high_matched_length: usize,
192
193    /// Minimum matched length threshold (unique tokens)
194    pub min_matched_length_unique: usize,
195
196    /// Minimum high-value token matched length threshold (unique tokens)
197    pub min_high_matched_length_unique: usize,
198
199    /// True if rule length < SMALL_RULE (15 tokens)
200    pub is_small: bool,
201
202    /// True if rule length < TINY_RULE (6 tokens)
203    pub is_tiny: bool,
204
205    /// True if the rule's first token is "license", "licence", or "licensed"
206    pub starts_with_license: bool,
207
208    /// True if the rule's last token is "license", "licence", or "licensed"
209    pub ends_with_license: bool,
210
211    /// Whether this rule is deprecated
212    pub is_deprecated: bool,
213
214    /// SPDX license identifier if available
215    pub spdx_license_key: Option<String>,
216
217    /// Alternative SPDX license identifiers (aliases)
218    pub other_spdx_license_keys: Vec<String>,
219}
220
221impl PartialOrd for Rule {
222    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
223        Some(self.cmp(other))
224    }
225}
226
227impl Ord for Rule {
228    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
229        self.identifier.cmp(&other.identifier)
230    }
231}
232
233impl Rule {
234    pub fn rule_url(&self) -> Option<String> {
235        if self.is_from_license {
236            return (!self.license_expression.is_empty()).then(|| {
237                format!(
238                    "{SCANCODE_LICENSE_URL_BASE}/{}.LICENSE",
239                    self.license_expression
240                )
241            });
242        }
243
244        (!self.identifier.is_empty())
245            .then(|| format!("{SCANCODE_RULE_URL_BASE}/{}", self.identifier))
246    }
247
248    pub const fn kind(&self) -> RuleKind {
249        self.rule_kind
250    }
251
252    pub const fn is_license_text(&self) -> bool {
253        self.rule_kind.is_license_text()
254    }
255
256    /// Returns true if this rule is a license notice pattern.
257    ///
258    /// Note: This method is kept for API completeness and potential future use.
259    /// License matches cannot have `is_license_notice` - only rules can.
260    #[allow(dead_code)]
261    pub const fn is_license_notice(&self) -> bool {
262        self.rule_kind.is_license_notice()
263    }
264
265    pub const fn is_license_reference(&self) -> bool {
266        self.rule_kind.is_license_reference()
267    }
268
269    pub const fn is_license_tag(&self) -> bool {
270        self.rule_kind.is_license_tag()
271    }
272
273    /// Returns true if this rule is a license introduction pattern.
274    ///
275    /// Note: This method is kept for API completeness and potential future use.
276    #[allow(dead_code)]
277    pub const fn is_license_intro(&self) -> bool {
278        self.rule_kind.is_license_intro()
279    }
280
281    pub const fn is_license_clue(&self) -> bool {
282        self.rule_kind.is_license_clue()
283    }
284}