Skip to main content

provenant/license_detection/models/
rule.rs

1//! Rule metadata loaded from .LICENSE and .RULE files.
2
3use std::collections::HashMap;
4use std::ops::Range;
5
6use serde::{Deserialize, Serialize};
7
8use crate::license_detection::index::dictionary::TokenId;
9
/// Base URL of the ScanCode license data directory.
///
/// `Rule::rule_url` appends `/<license_expression>.LICENSE` to this value for
/// rules that were created from a license file.
const SCANCODE_LICENSE_URL_BASE: &str =
    "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses";
/// Base URL of the ScanCode rule data directory.
///
/// `Rule::rule_url` appends `/<identifier>` to this value for rules that were
/// not created from a license file.
const SCANCODE_RULE_URL_BASE: &str =
    "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules";
14
15mod range_serde {
16    use serde::{Deserialize, Deserializer, Serialize, Serializer};
17    use std::ops::Range;
18
19    pub fn serialize<S>(ranges: &[Range<usize>], serializer: S) -> Result<S::Ok, S::Error>
20    where
21        S: Serializer,
22    {
23        let tuples: Vec<(usize, usize)> = ranges.iter().map(|r| (r.start, r.end)).collect();
24        tuples.serialize(serializer)
25    }
26
27    pub fn deserialize<'de, D>(deserializer: D) -> Result<Vec<Range<usize>>, D::Error>
28    where
29        D: Deserializer<'de>,
30    {
31        let tuples: Vec<(usize, usize)> = Vec::deserialize(deserializer)?;
32        Ok(tuples
33            .into_iter()
34            .map(|(start, end)| Range { start, end })
35            .collect())
36    }
37}
38
39mod stopwords_serde {
40    use serde::{Deserialize, Deserializer, Serialize, Serializer};
41    use std::collections::HashMap;
42
43    pub fn serialize<S>(map: &HashMap<usize, usize>, serializer: S) -> Result<S::Ok, S::Error>
44    where
45        S: Serializer,
46    {
47        let mut entries: Vec<(usize, usize)> = map.iter().map(|(k, v)| (*k, *v)).collect();
48        entries.sort_by_key(|(k, _)| *k);
49        entries.serialize(serializer)
50    }
51
52    pub fn deserialize<'de, D>(deserializer: D) -> Result<HashMap<usize, usize>, D::Error>
53    where
54        D: Deserializer<'de>,
55    {
56        let entries: Vec<(usize, usize)> = Vec::deserialize(deserializer)?;
57        Ok(entries.into_iter().collect())
58    }
59}
60
/// Classification of a rule, replacing a set of mutually exclusive boolean flags.
///
/// `RuleKind::from_rule_flags` maps each `is_license_*` flag to the variant of
/// the same name. The derived `PartialOrd`/`Ord` follow the declaration order
/// below, so reordering variants changes comparison results.
#[derive(
    Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default, Serialize, Deserialize,
)]
pub enum RuleKind {
    /// No kind flag is set; this is the `Default` value.
    #[default]
    None,
    /// Selected by the `is_license_text` flag.
    Text,
    /// Selected by the `is_license_notice` flag.
    Notice,
    /// Selected by the `is_license_reference` flag.
    Reference,
    /// Selected by the `is_license_tag` flag.
    Tag,
    /// Selected by the `is_license_intro` flag.
    Intro,
    /// Selected by the `is_license_clue` flag.
    Clue,
}
74
75impl RuleKind {
76    pub fn from_rule_flags(
77        is_license_text: bool,
78        is_license_notice: bool,
79        is_license_reference: bool,
80        is_license_tag: bool,
81        is_license_intro: bool,
82        is_license_clue: bool,
83    ) -> Result<Self, &'static str> {
84        let mut active = None;
85
86        for (enabled, kind) in [
87            (is_license_text, Self::Text),
88            (is_license_notice, Self::Notice),
89            (is_license_reference, Self::Reference),
90            (is_license_tag, Self::Tag),
91            (is_license_intro, Self::Intro),
92            (is_license_clue, Self::Clue),
93        ] {
94            if !enabled {
95                continue;
96            }
97
98            if active.replace(kind).is_some() {
99                return Err("rule has multiple rule kinds set");
100            }
101        }
102
103        Ok(active.unwrap_or(Self::None))
104    }
105
106    pub fn from_match_flags(
107        is_license_text: bool,
108        is_license_reference: bool,
109        is_license_tag: bool,
110        is_license_intro: bool,
111        is_license_clue: bool,
112    ) -> Result<Self, &'static str> {
113        Self::from_rule_flags(
114            is_license_text,
115            false,
116            is_license_reference,
117            is_license_tag,
118            is_license_intro,
119            is_license_clue,
120        )
121        .map_err(|_| "license match has multiple rule kinds set")
122    }
123
124    pub const fn is_license_text(self) -> bool {
125        matches!(self, Self::Text)
126    }
127
128    pub const fn is_license_notice(self) -> bool {
129        matches!(self, Self::Notice)
130    }
131
132    pub const fn is_license_reference(self) -> bool {
133        matches!(self, Self::Reference)
134    }
135
136    pub const fn is_license_tag(self) -> bool {
137        matches!(self, Self::Tag)
138    }
139
140    pub const fn is_license_intro(self) -> bool {
141        matches!(self, Self::Intro)
142    }
143
144    pub const fn is_license_clue(self) -> bool {
145        matches!(self, Self::Clue)
146    }
147}
148
/// Rule metadata loaded from .LICENSE and .RULE files.
///
/// The derived `PartialEq`/`Eq` compare every field. Fields are serialized in
/// declaration order, so reordering them changes the output of formats that
/// are not self-describing.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Rule {
    /// Unique identifier for this rule (e.g., "mit.LICENSE", "gpl-2.0_12.RULE")
    /// Used for sorting to match Python's attr.s field order.
    /// This is the primary sort key after rid (which is None at sort time in Python).
    pub identifier: String,

    /// License expression string using SPDX syntax and ScanCode license keys
    pub license_expression: String,

    /// Pattern text to match
    pub text: String,

    /// Token IDs for the text (assigned during indexing).
    ///
    /// Serialized as a sequence of the raw `u16` value of each `TokenId`.
    #[serde(
        serialize_with = "serialize_token_ids",
        deserialize_with = "deserialize_token_ids"
    )]
    pub tokens: Vec<TokenId>,

    /// Classification of this rule.
    pub rule_kind: RuleKind,

    /// True if exact matches to this rule are false positives
    pub is_false_positive: bool,

    /// True if this rule text is a required phrase.
    /// A required phrase is an essential section of the rule text which must be
    /// present in the case of partial matches.
    pub is_required_phrase: bool,

    /// True if this rule was created from a license file (not a .RULE file)
    pub is_from_license: bool,

    /// Relevance score 0-100 (100 is most relevant)
    pub relevance: u8,

    /// Minimum match coverage percentage (0-100) if specified
    pub minimum_coverage: Option<u8>,

    /// True if minimum_coverage was explicitly stored in source frontmatter
    pub has_stored_minimum_coverage: bool,

    /// Tokens must appear in order if true
    pub is_continuous: bool,

    /// Token position spans for required phrases parsed from {{...}} markers.
    /// Each span represents positions in the rule text that MUST be matched.
    ///
    /// Serialized as `(start, end)` pairs; a missing field deserializes to an
    /// empty list.
    #[serde(with = "range_serde", default)]
    pub required_phrase_spans: Vec<Range<usize>>,

    /// Mapping from token position to count of stopwords at that position.
    /// Used for required phrase validation.
    ///
    /// Serialized as `(position, count)` pairs sorted by position; a missing
    /// field deserializes to an empty map.
    #[serde(with = "stopwords_serde", default)]
    pub stopwords_by_pos: HashMap<usize, usize>,

    /// Filenames where this rule should be considered
    pub referenced_filenames: Option<Vec<String>>,

    /// URLs that should be ignored when found in this rule text
    pub ignorable_urls: Option<Vec<String>>,

    /// Emails that should be ignored when found in this rule text
    pub ignorable_emails: Option<Vec<String>>,

    /// Copyrights that should be ignored when found in this rule text
    pub ignorable_copyrights: Option<Vec<String>>,

    /// Holder names that should be ignored when found in this rule text
    pub ignorable_holders: Option<Vec<String>>,

    /// Author names that should be ignored when found in this rule text
    pub ignorable_authors: Option<Vec<String>>,

    /// Programming language for the rule if specified
    // NOTE(review): this may actually be the natural language of the rule text
    // (e.g. a language code) rather than a programming language — confirm
    // against the rule loader and the upstream data format.
    pub language: Option<String>,

    /// Free text notes
    pub notes: Option<String>,

    /// Count of unique token IDs in the rule (computed during indexing)
    pub length_unique: usize,

    /// Count of unique legalese token IDs (tokens with ID < len_legalese)
    pub high_length_unique: usize,

    /// Total count of legalese token occurrences (with duplicates)
    pub high_length: usize,

    /// Minimum matched length threshold (occurrences-based)
    pub min_matched_length: usize,

    /// Minimum high-value token matched length threshold (occurrences-based)
    pub min_high_matched_length: usize,

    /// Minimum matched length threshold (unique tokens)
    pub min_matched_length_unique: usize,

    /// Minimum high-value token matched length threshold (unique tokens)
    pub min_high_matched_length_unique: usize,

    /// True if rule length < SMALL_RULE (15 tokens)
    pub is_small: bool,

    /// True if rule length < TINY_RULE (6 tokens)
    pub is_tiny: bool,

    /// True if the rule's first token is "license", "licence", or "licensed"
    pub starts_with_license: bool,

    /// True if the rule's last token is "license", "licence", or "licensed"
    pub ends_with_license: bool,

    /// Whether this rule is deprecated
    pub is_deprecated: bool,

    /// SPDX license identifier if available
    pub spdx_license_key: Option<String>,

    /// Alternative SPDX license identifiers (aliases)
    pub other_spdx_license_keys: Vec<String>,
}
272
273fn serialize_token_ids<S>(token_ids: &[TokenId], serializer: S) -> Result<S::Ok, S::Error>
274where
275    S: serde::Serializer,
276{
277    let raw_ids: Vec<u16> = token_ids.iter().map(|id| id.raw()).collect();
278    <Vec<u16> as serde::Serialize>::serialize(&raw_ids, serializer)
279}
280
281fn deserialize_token_ids<'de, D>(deserializer: D) -> Result<Vec<TokenId>, D::Error>
282where
283    D: serde::Deserializer<'de>,
284{
285    let raw_ids: Vec<u16> = Vec::deserialize(deserializer)?;
286    Ok(raw_ids.into_iter().map(TokenId::new).collect())
287}
288
/// Partial ordering for rules; always defined and identical to the `Ord` impl.
impl PartialOrd for Rule {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        // Delegate to `Ord::cmp` so both orderings can never disagree.
        Some(self.cmp(other))
    }
}
294
295impl Ord for Rule {
296    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
297        self.identifier.cmp(&other.identifier)
298    }
299}
300
301impl Rule {
302    pub fn rule_url(&self) -> Option<String> {
303        if self.is_from_license {
304            return (!self.license_expression.is_empty()).then(|| {
305                format!(
306                    "{SCANCODE_LICENSE_URL_BASE}/{}.LICENSE",
307                    self.license_expression
308                )
309            });
310        }
311
312        (!self.identifier.is_empty())
313            .then(|| format!("{SCANCODE_RULE_URL_BASE}/{}", self.identifier))
314    }
315
316    pub const fn kind(&self) -> RuleKind {
317        self.rule_kind
318    }
319
320    pub const fn is_license_text(&self) -> bool {
321        self.rule_kind.is_license_text()
322    }
323
324    /// Returns true if this rule is a license notice pattern.
325    ///
326    /// Note: This method is kept for API completeness and potential future use.
327    /// License matches cannot have `is_license_notice` - only rules can.
328    #[allow(dead_code)]
329    pub const fn is_license_notice(&self) -> bool {
330        self.rule_kind.is_license_notice()
331    }
332
333    pub const fn is_license_reference(&self) -> bool {
334        self.rule_kind.is_license_reference()
335    }
336
337    pub const fn is_license_tag(&self) -> bool {
338        self.rule_kind.is_license_tag()
339    }
340
341    /// Returns true if this rule is a license introduction pattern.
342    ///
343    /// Note: This method is kept for API completeness and potential future use.
344    #[allow(dead_code)]
345    pub const fn is_license_intro(&self) -> bool {
346        self.rule_kind.is_license_intro()
347    }
348
349    pub const fn is_license_clue(&self) -> bool {
350        self.rule_kind.is_license_clue()
351    }
352}