Skip to main content

provenant/license_detection/models/
rule.rs

1//! Rule metadata loaded from .LICENSE and .RULE files.
2
3use std::collections::HashMap;
4use std::ops::Range;
5
6use rkyv::Archive;
7use serde::{Deserialize, Serialize};
8
9use crate::license_detection::index::dictionary::TokenId;
10
/// Base URL of the ScanCode license data directory.
///
/// `Rule::rule_url` appends `/<license_expression>.LICENSE` to this value for
/// rules that were created from a license file.
const SCANCODE_LICENSE_URL_BASE: &str =
    "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses";

/// Base URL of the ScanCode rule data directory.
///
/// `Rule::rule_url` appends `/<identifier>` to this value for every rule that
/// was not created from a license file.
const SCANCODE_RULE_URL_BASE: &str =
    "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/rules";
15
16mod range_serde {
17    use serde::{Deserialize, Deserializer, Serialize, Serializer};
18    use std::ops::Range;
19
20    pub fn serialize<S>(ranges: &[Range<usize>], serializer: S) -> Result<S::Ok, S::Error>
21    where
22        S: Serializer,
23    {
24        let tuples: Vec<(usize, usize)> = ranges.iter().map(|r| (r.start, r.end)).collect();
25        tuples.serialize(serializer)
26    }
27
28    pub fn deserialize<'de, D>(deserializer: D) -> Result<Vec<Range<usize>>, D::Error>
29    where
30        D: Deserializer<'de>,
31    {
32        let tuples: Vec<(usize, usize)> = Vec::deserialize(deserializer)?;
33        Ok(tuples
34            .into_iter()
35            .map(|(start, end)| Range { start, end })
36            .collect())
37    }
38}
39
40mod stopwords_serde {
41    use serde::{Deserialize, Deserializer, Serialize, Serializer};
42    use std::collections::HashMap;
43
44    pub fn serialize<S>(
45        map: &HashMap<Option<usize>, usize>,
46        serializer: S,
47    ) -> Result<S::Ok, S::Error>
48    where
49        S: Serializer,
50    {
51        let mut entries: Vec<(Option<usize>, usize)> = map.iter().map(|(k, v)| (*k, *v)).collect();
52        entries.sort_by_key(|(k, _)| *k);
53        entries.serialize(serializer)
54    }
55
56    pub fn deserialize<'de, D>(deserializer: D) -> Result<HashMap<Option<usize>, usize>, D::Error>
57    where
58        D: Deserializer<'de>,
59    {
60        let entries: Vec<(Option<usize>, usize)> = Vec::deserialize(deserializer)?;
61        Ok(entries.into_iter().collect())
62    }
63}
64
65#[derive(
66    Debug,
67    Clone,
68    Copy,
69    PartialEq,
70    Eq,
71    Hash,
72    PartialOrd,
73    Ord,
74    Default,
75    Serialize,
76    Deserialize,
77    Archive,
78    rkyv::Serialize,
79    rkyv::Deserialize,
80)]
81#[rkyv(derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord))]
82pub enum RuleKind {
83    #[default]
84    None,
85    Text,
86    Notice,
87    Reference,
88    Tag,
89    Intro,
90    Clue,
91}
92
93impl RuleKind {
94    pub fn from_rule_flags(
95        is_license_text: bool,
96        is_license_notice: bool,
97        is_license_reference: bool,
98        is_license_tag: bool,
99        is_license_intro: bool,
100        is_license_clue: bool,
101    ) -> Result<Self, &'static str> {
102        let mut active = None;
103
104        for (enabled, kind) in [
105            (is_license_text, Self::Text),
106            (is_license_notice, Self::Notice),
107            (is_license_reference, Self::Reference),
108            (is_license_tag, Self::Tag),
109            (is_license_intro, Self::Intro),
110            (is_license_clue, Self::Clue),
111        ] {
112            if !enabled {
113                continue;
114            }
115
116            if active.replace(kind).is_some() {
117                return Err("rule has multiple rule kinds set");
118            }
119        }
120
121        Ok(active.unwrap_or(Self::None))
122    }
123
124    pub const fn is_license_text(self) -> bool {
125        matches!(self, Self::Text)
126    }
127
128    pub const fn is_license_notice(self) -> bool {
129        matches!(self, Self::Notice)
130    }
131
132    pub const fn is_license_reference(self) -> bool {
133        matches!(self, Self::Reference)
134    }
135
136    pub const fn is_license_tag(self) -> bool {
137        matches!(self, Self::Tag)
138    }
139
140    pub const fn is_license_intro(self) -> bool {
141        matches!(self, Self::Intro)
142    }
143
144    pub const fn is_license_clue(self) -> bool {
145        matches!(self, Self::Clue)
146    }
147}
148
149/// Rule metadata loaded from .LICENSE and .RULE files.
150#[derive(
151    Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Archive, rkyv::Serialize, rkyv::Deserialize,
152)]
153pub struct Rule {
154    /// Unique identifier for this rule (e.g., "mit.LICENSE", "gpl-2.0_12.RULE")
155    /// Used for sorting to match Python's attr.s field order.
156    /// This is the primary sort key after rid (which is None at sort time in Python).
157    pub identifier: String,
158
159    /// License expression string using SPDX syntax and ScanCode license keys
160    pub license_expression: String,
161
162    /// Pattern text to match
163    pub text: String,
164
165    /// Token IDs for the text (assigned during indexing)
166    #[serde(
167        serialize_with = "serialize_token_ids",
168        deserialize_with = "deserialize_token_ids"
169    )]
170    pub tokens: Vec<TokenId>,
171
172    /// Classification of this rule.
173    pub rule_kind: RuleKind,
174
175    /// True if exact matches to this rule are false positives
176    pub is_false_positive: bool,
177
178    /// True if this rule text is a required phrase.
179    /// A required phrase is an essential section of the rule text which must be
180    /// present in the case of partial matches.
181    pub is_required_phrase: bool,
182
183    /// True if this rule was created from a license file (not a .RULE file)
184    pub is_from_license: bool,
185
186    /// Relevance score 0-100 (100 is most relevant)
187    pub relevance: u8,
188
189    /// Minimum match coverage percentage (0-100) if specified
190    pub minimum_coverage: Option<u8>,
191
192    /// True if minimum_coverage was explicitly stored in source frontmatter
193    pub has_stored_minimum_coverage: bool,
194
195    /// Tokens must appear in order if true
196    pub is_continuous: bool,
197
198    /// Token position spans for required phrases parsed from {{...}} markers.
199    /// Each span represents positions in the rule text that MUST be matched.
200    #[serde(with = "range_serde", default)]
201    pub required_phrase_spans: Vec<Range<usize>>,
202
203    /// Mapping from token position to count of stopwords at that position.
204    /// Used for required phrase validation.
205    #[serde(with = "stopwords_serde", default)]
206    pub stopwords_by_pos: HashMap<Option<usize>, usize>,
207
208    /// Filenames where this rule should be considered
209    pub referenced_filenames: Option<Vec<String>>,
210
211    /// URLs that should be ignored when found in this rule text
212    pub ignorable_urls: Option<Vec<String>>,
213
214    /// Emails that should be ignored when found in this rule text
215    pub ignorable_emails: Option<Vec<String>>,
216
217    /// Copyrights that should be ignored when found in this rule text
218    pub ignorable_copyrights: Option<Vec<String>>,
219
220    /// Holder names that should be ignored when found in this rule text
221    pub ignorable_holders: Option<Vec<String>>,
222
223    /// Author names that should be ignored when found in this rule text
224    pub ignorable_authors: Option<Vec<String>>,
225
226    /// Programming language for the rule if specified
227    pub language: Option<String>,
228
229    /// Free text notes
230    pub notes: Option<String>,
231
232    /// Count of unique token IDs in the rule (computed during indexing)
233    pub length_unique: usize,
234
235    /// Count of unique legalese token IDs (tokens with ID < len_legalese)
236    pub high_length_unique: usize,
237
238    /// Total count of legalese token occurrences (with duplicates)
239    pub high_length: usize,
240
241    /// Minimum matched length threshold (occurrences-based)
242    pub min_matched_length: usize,
243
244    /// Minimum high-value token matched length threshold (occurrences-based)
245    pub min_high_matched_length: usize,
246
247    /// Minimum matched length threshold (unique tokens)
248    pub min_matched_length_unique: usize,
249
250    /// Minimum high-value token matched length threshold (unique tokens)
251    pub min_high_matched_length_unique: usize,
252
253    /// True if rule length < SMALL_RULE (15 tokens)
254    pub is_small: bool,
255
256    /// True if rule length < TINY_RULE (6 tokens)
257    pub is_tiny: bool,
258
259    /// True if the rule's first token is "license", "licence", or "licensed"
260    pub starts_with_license: bool,
261
262    /// True if the rule's last token is "license", "licence", or "licensed"
263    pub ends_with_license: bool,
264
265    /// Whether this rule is deprecated
266    pub is_deprecated: bool,
267
268    /// SPDX license identifier if available
269    pub spdx_license_key: Option<String>,
270
271    /// Alternative SPDX license identifiers (aliases)
272    pub other_spdx_license_keys: Vec<String>,
273}
274
275fn serialize_token_ids<S>(token_ids: &[TokenId], serializer: S) -> Result<S::Ok, S::Error>
276where
277    S: serde::Serializer,
278{
279    let raw_ids: Vec<u16> = token_ids.iter().map(|id| id.raw()).collect();
280    <Vec<u16> as serde::Serialize>::serialize(&raw_ids, serializer)
281}
282
283fn deserialize_token_ids<'de, D>(deserializer: D) -> Result<Vec<TokenId>, D::Error>
284where
285    D: serde::Deserializer<'de>,
286{
287    let raw_ids: Vec<u16> = Vec::deserialize(deserializer)?;
288    Ok(raw_ids.into_iter().map(TokenId::new).collect())
289}
290
291impl PartialOrd for Rule {
292    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
293        Some(self.cmp(other))
294    }
295}
296
297impl Ord for Rule {
298    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
299        self.identifier.cmp(&other.identifier)
300    }
301}
302
303impl Rule {
304    pub fn rule_url(&self) -> Option<String> {
305        if self.is_from_license {
306            return (!self.license_expression.is_empty()).then(|| {
307                format!(
308                    "{SCANCODE_LICENSE_URL_BASE}/{}.LICENSE",
309                    self.license_expression
310                )
311            });
312        }
313
314        (!self.identifier.is_empty())
315            .then(|| format!("{SCANCODE_RULE_URL_BASE}/{}", self.identifier))
316    }
317
318    pub const fn kind(&self) -> RuleKind {
319        self.rule_kind
320    }
321
322    pub const fn is_license_text(&self) -> bool {
323        self.rule_kind.is_license_text()
324    }
325
326    /// Returns true if this rule is a license notice pattern.
327    ///
328    /// Note: This method is kept for API completeness and potential future use.
329    /// License matches cannot have `is_license_notice` - only rules can.
330    #[allow(dead_code)]
331    pub const fn is_license_notice(&self) -> bool {
332        self.rule_kind.is_license_notice()
333    }
334
335    pub const fn is_license_reference(&self) -> bool {
336        self.rule_kind.is_license_reference()
337    }
338
339    pub const fn is_license_tag(&self) -> bool {
340        self.rule_kind.is_license_tag()
341    }
342
343    /// Returns true if this rule is a license introduction pattern.
344    ///
345    /// Note: This method is kept for API completeness and potential future use.
346    #[allow(dead_code)]
347    pub const fn is_license_intro(&self) -> bool {
348        self.rule_kind.is_license_intro()
349    }
350
351    pub const fn is_license_clue(&self) -> bool {
352        self.rule_kind.is_license_clue()
353    }
354}