Skip to main content

provenant/license_detection/models/
rule.rs

1//! Rule metadata loaded from .LICENSE and .RULE files.
2
3use std::collections::HashMap;
4use std::ops::Range;
5
6use serde::{Deserialize, Serialize};
7
8use crate::license_detection::index::dictionary::TokenId;
9
/// Base URL of the ScanCode license data directory.
///
/// `Rule::rule_url` appends `/{license_expression}.LICENSE` to this value for
/// rules that were created from a license file.
const SCANCODE_LICENSE_URL_BASE: &str =
    "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses";
/// Base URL of the ScanCode rule data directory.
///
/// `Rule::rule_url` appends `/{identifier}` to this value for rules that were
/// not created from a license file.
const SCANCODE_RULE_URL_BASE: &str =
    "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/rules";
14
15mod range_serde {
16    use serde::{Deserialize, Deserializer, Serialize, Serializer};
17    use std::ops::Range;
18
19    pub fn serialize<S>(ranges: &[Range<usize>], serializer: S) -> Result<S::Ok, S::Error>
20    where
21        S: Serializer,
22    {
23        let tuples: Vec<(usize, usize)> = ranges.iter().map(|r| (r.start, r.end)).collect();
24        tuples.serialize(serializer)
25    }
26
27    pub fn deserialize<'de, D>(deserializer: D) -> Result<Vec<Range<usize>>, D::Error>
28    where
29        D: Deserializer<'de>,
30    {
31        let tuples: Vec<(usize, usize)> = Vec::deserialize(deserializer)?;
32        Ok(tuples
33            .into_iter()
34            .map(|(start, end)| Range { start, end })
35            .collect())
36    }
37}
38
39mod stopwords_serde {
40    use serde::{Deserialize, Deserializer, Serialize, Serializer};
41    use std::collections::HashMap;
42
43    pub fn serialize<S>(
44        map: &HashMap<Option<usize>, usize>,
45        serializer: S,
46    ) -> Result<S::Ok, S::Error>
47    where
48        S: Serializer,
49    {
50        let mut entries: Vec<(Option<usize>, usize)> = map.iter().map(|(k, v)| (*k, *v)).collect();
51        entries.sort_by_key(|(k, _)| *k);
52        entries.serialize(serializer)
53    }
54
55    pub fn deserialize<'de, D>(deserializer: D) -> Result<HashMap<Option<usize>, usize>, D::Error>
56    where
57        D: Deserializer<'de>,
58    {
59        let entries: Vec<(Option<usize>, usize)> = Vec::deserialize(deserializer)?;
60        Ok(entries.into_iter().collect())
61    }
62}
63
/// Classification of a rule; a rule has at most one kind.
///
/// The derived `PartialOrd`/`Ord` follow the declaration order of the
/// variants, so reordering them changes comparison results.
#[derive(
    Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default, Serialize, Deserialize,
)]
pub enum RuleKind {
    /// No kind flag is set. This is the default and is what
    /// `RuleKind::from_rule_flags` returns when every flag is false.
    #[default]
    None,
    /// Selected by the `is_license_text` flag.
    Text,
    /// Selected by the `is_license_notice` flag.
    Notice,
    /// Selected by the `is_license_reference` flag.
    Reference,
    /// Selected by the `is_license_tag` flag.
    Tag,
    /// Selected by the `is_license_intro` flag.
    Intro,
    /// Selected by the `is_license_clue` flag.
    Clue,
}
77
78impl RuleKind {
79    pub fn from_rule_flags(
80        is_license_text: bool,
81        is_license_notice: bool,
82        is_license_reference: bool,
83        is_license_tag: bool,
84        is_license_intro: bool,
85        is_license_clue: bool,
86    ) -> Result<Self, &'static str> {
87        let mut active = None;
88
89        for (enabled, kind) in [
90            (is_license_text, Self::Text),
91            (is_license_notice, Self::Notice),
92            (is_license_reference, Self::Reference),
93            (is_license_tag, Self::Tag),
94            (is_license_intro, Self::Intro),
95            (is_license_clue, Self::Clue),
96        ] {
97            if !enabled {
98                continue;
99            }
100
101            if active.replace(kind).is_some() {
102                return Err("rule has multiple rule kinds set");
103            }
104        }
105
106        Ok(active.unwrap_or(Self::None))
107    }
108
109    pub const fn is_license_text(self) -> bool {
110        matches!(self, Self::Text)
111    }
112
113    pub const fn is_license_notice(self) -> bool {
114        matches!(self, Self::Notice)
115    }
116
117    pub const fn is_license_reference(self) -> bool {
118        matches!(self, Self::Reference)
119    }
120
121    pub const fn is_license_tag(self) -> bool {
122        matches!(self, Self::Tag)
123    }
124
125    pub const fn is_license_intro(self) -> bool {
126        matches!(self, Self::Intro)
127    }
128
129    pub const fn is_license_clue(self) -> bool {
130        matches!(self, Self::Clue)
131    }
132}
133
/// Rule metadata loaded from .LICENSE and .RULE files.
///
/// Equality is derived and compares every field, whereas the `Ord` and
/// `PartialOrd` implementations for this type compare `identifier` only.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Rule {
    /// Unique identifier for this rule (e.g., "mit.LICENSE", "gpl-2.0_12.RULE")
    /// Used for sorting to match Python's attr.s field order.
    /// This is the primary sort key after rid (which is None at sort time in Python).
    pub identifier: String,

    /// License expression string using SPDX syntax and ScanCode license keys
    pub license_expression: String,

    /// Pattern text to match
    pub text: String,

    /// Token IDs for the text (assigned during indexing).
    /// Serialized as a sequence of raw `u16` values.
    #[serde(
        serialize_with = "serialize_token_ids",
        deserialize_with = "deserialize_token_ids"
    )]
    pub tokens: Vec<TokenId>,

    /// Classification of this rule.
    pub rule_kind: RuleKind,

    /// True if exact matches to this rule are false positives
    pub is_false_positive: bool,

    /// True if this rule text is a required phrase.
    /// A required phrase is an essential section of the rule text which must be
    /// present in the case of partial matches.
    pub is_required_phrase: bool,

    /// True if this rule was created from a license file (not a .RULE file)
    pub is_from_license: bool,

    /// Relevance score 0-100 (100 is most relevant)
    pub relevance: u8,

    /// Minimum match coverage percentage (0-100) if specified
    pub minimum_coverage: Option<u8>,

    /// True if minimum_coverage was explicitly stored in source frontmatter
    pub has_stored_minimum_coverage: bool,

    /// Tokens must appear in order if true
    ///
    /// NOTE(review): upstream ScanCode appears to describe this flag as
    /// disallowing gaps inside the matched range rather than as an ordering
    /// constraint; confirm the intended wording.
    pub is_continuous: bool,

    /// Token position spans for required phrases parsed from {{...}} markers.
    /// Each span represents positions in the rule text that MUST be matched.
    /// Serialized as `(start, end)` pairs; defaults to empty when the field is
    /// absent on deserialization.
    #[serde(with = "range_serde", default)]
    pub required_phrase_spans: Vec<Range<usize>>,

    /// Mapping from token position to count of stopwords at that position.
    /// Used for required phrase validation.
    /// Serialized as a list of `(position, count)` entries sorted by position;
    /// defaults to empty when the field is absent on deserialization.
    ///
    /// NOTE(review): the meaning of the `None` key is not visible here;
    /// presumably stopwords that precede the first token. Verify against the
    /// code that fills this map.
    #[serde(with = "stopwords_serde", default)]
    pub stopwords_by_pos: HashMap<Option<usize>, usize>,

    /// Filenames where this rule should be considered
    pub referenced_filenames: Option<Vec<String>>,

    /// URLs that should be ignored when found in this rule text
    pub ignorable_urls: Option<Vec<String>>,

    /// Emails that should be ignored when found in this rule text
    pub ignorable_emails: Option<Vec<String>>,

    /// Copyrights that should be ignored when found in this rule text
    pub ignorable_copyrights: Option<Vec<String>>,

    /// Holder names that should be ignored when found in this rule text
    pub ignorable_holders: Option<Vec<String>>,

    /// Author names that should be ignored when found in this rule text
    pub ignorable_authors: Option<Vec<String>>,

    /// Programming language for the rule if specified
    ///
    /// NOTE(review): upstream ScanCode appears to use `language` for the
    /// natural language of the rule text (e.g. an ISO 639-1 code), not a
    /// programming language; confirm and adjust this description.
    pub language: Option<String>,

    /// Free text notes
    pub notes: Option<String>,

    /// Count of unique token IDs in the rule (computed during indexing)
    pub length_unique: usize,

    /// Count of unique legalese token IDs (tokens with ID < len_legalese)
    pub high_length_unique: usize,

    /// Total count of legalese token occurrences (with duplicates)
    pub high_length: usize,

    /// Minimum matched length threshold (occurrences-based)
    pub min_matched_length: usize,

    /// Minimum high-value token matched length threshold (occurrences-based)
    pub min_high_matched_length: usize,

    /// Minimum matched length threshold (unique tokens)
    pub min_matched_length_unique: usize,

    /// Minimum high-value token matched length threshold (unique tokens)
    pub min_high_matched_length_unique: usize,

    /// True if rule length < SMALL_RULE (15 tokens)
    pub is_small: bool,

    /// True if rule length < TINY_RULE (6 tokens)
    pub is_tiny: bool,

    /// True if the rule's first token is "license", "licence", or "licensed"
    pub starts_with_license: bool,

    /// True if the rule's last token is "license", "licence", or "licensed"
    pub ends_with_license: bool,

    /// Whether this rule is deprecated
    pub is_deprecated: bool,

    /// SPDX license identifier if available
    pub spdx_license_key: Option<String>,

    /// Alternative SPDX license identifiers (aliases)
    pub other_spdx_license_keys: Vec<String>,
}
257
258fn serialize_token_ids<S>(token_ids: &[TokenId], serializer: S) -> Result<S::Ok, S::Error>
259where
260    S: serde::Serializer,
261{
262    let raw_ids: Vec<u16> = token_ids.iter().map(|id| id.raw()).collect();
263    <Vec<u16> as serde::Serialize>::serialize(&raw_ids, serializer)
264}
265
266fn deserialize_token_ids<'de, D>(deserializer: D) -> Result<Vec<TokenId>, D::Error>
267where
268    D: serde::Deserializer<'de>,
269{
270    let raw_ids: Vec<u16> = Vec::deserialize(deserializer)?;
271    Ok(raw_ids.into_iter().map(TokenId::new).collect())
272}
273
/// Partial ordering for rules; always defined and identical to the total
/// order given by `Ord::cmp`.
impl PartialOrd for Rule {
    /// Delegates to `cmp`, so the result is never `None`.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
279
280impl Ord for Rule {
281    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
282        self.identifier.cmp(&other.identifier)
283    }
284}
285
286impl Rule {
287    pub fn rule_url(&self) -> Option<String> {
288        if self.is_from_license {
289            return (!self.license_expression.is_empty()).then(|| {
290                format!(
291                    "{SCANCODE_LICENSE_URL_BASE}/{}.LICENSE",
292                    self.license_expression
293                )
294            });
295        }
296
297        (!self.identifier.is_empty())
298            .then(|| format!("{SCANCODE_RULE_URL_BASE}/{}", self.identifier))
299    }
300
301    pub const fn kind(&self) -> RuleKind {
302        self.rule_kind
303    }
304
305    pub const fn is_license_text(&self) -> bool {
306        self.rule_kind.is_license_text()
307    }
308
309    /// Returns true if this rule is a license notice pattern.
310    ///
311    /// Note: This method is kept for API completeness and potential future use.
312    /// License matches cannot have `is_license_notice` - only rules can.
313    #[allow(dead_code)]
314    pub const fn is_license_notice(&self) -> bool {
315        self.rule_kind.is_license_notice()
316    }
317
318    pub const fn is_license_reference(&self) -> bool {
319        self.rule_kind.is_license_reference()
320    }
321
322    pub const fn is_license_tag(&self) -> bool {
323        self.rule_kind.is_license_tag()
324    }
325
326    /// Returns true if this rule is a license introduction pattern.
327    ///
328    /// Note: This method is kept for API completeness and potential future use.
329    #[allow(dead_code)]
330    pub const fn is_license_intro(&self) -> bool {
331        self.rule_kind.is_license_intro()
332    }
333
334    pub const fn is_license_clue(&self) -> bool {
335        self.rule_kind.is_license_clue()
336    }
337}