provenant/license_detection/models/
rule.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Rule metadata loaded from .LICENSE and .RULE files.
5
6use std::collections::HashMap;
7use std::ops::Range;
8
9use rkyv::Archive;
10use serde::{Deserialize, Serialize};
11
12use crate::license_detection::index::dictionary::TokenId;
13
/// Base URL of the license data directory in the ScanCode toolkit repository
/// (`develop` branch). `Rule::rule_url` appends `/<license_expression>.LICENSE` to it.
const SCANCODE_LICENSE_URL_BASE: &str =
    "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses";
/// Base URL of the rule data directory in the ScanCode toolkit repository
/// (`develop` branch). `Rule::rule_url` appends `/<identifier>` to it.
const SCANCODE_RULE_URL_BASE: &str =
    "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/rules";
18
19mod range_serde {
20    use serde::{Deserialize, Deserializer, Serialize, Serializer};
21    use std::ops::Range;
22
23    pub fn serialize<S>(ranges: &[Range<usize>], serializer: S) -> Result<S::Ok, S::Error>
24    where
25        S: Serializer,
26    {
27        let tuples: Vec<(usize, usize)> = ranges.iter().map(|r| (r.start, r.end)).collect();
28        tuples.serialize(serializer)
29    }
30
31    pub fn deserialize<'de, D>(deserializer: D) -> Result<Vec<Range<usize>>, D::Error>
32    where
33        D: Deserializer<'de>,
34    {
35        let tuples: Vec<(usize, usize)> = Vec::deserialize(deserializer)?;
36        Ok(tuples
37            .into_iter()
38            .map(|(start, end)| Range { start, end })
39            .collect())
40    }
41}
42
43mod stopwords_serde {
44    use serde::{Deserialize, Deserializer, Serialize, Serializer};
45    use std::collections::HashMap;
46
47    pub fn serialize<S>(
48        map: &HashMap<Option<usize>, usize>,
49        serializer: S,
50    ) -> Result<S::Ok, S::Error>
51    where
52        S: Serializer,
53    {
54        let mut entries: Vec<(Option<usize>, usize)> = map.iter().map(|(k, v)| (*k, *v)).collect();
55        entries.sort_by_key(|(k, _)| *k);
56        entries.serialize(serializer)
57    }
58
59    pub fn deserialize<'de, D>(deserializer: D) -> Result<HashMap<Option<usize>, usize>, D::Error>
60    where
61        D: Deserializer<'de>,
62    {
63        let entries: Vec<(Option<usize>, usize)> = Vec::deserialize(deserializer)?;
64        Ok(entries.into_iter().collect())
65    }
66}
67
/// The single classification a rule can carry.
///
/// `RuleKind::from_rule_flags` builds a value from the six boolean rule flags and
/// rejects inputs with more than one flag set. Variant order is significant: the
/// derived `PartialOrd`/`Ord` compare variants in declaration order.
#[derive(
    Debug,
    Clone,
    Copy,
    PartialEq,
    Eq,
    Hash,
    PartialOrd,
    Ord,
    Default,
    Serialize,
    Deserialize,
    Archive,
    rkyv::Serialize,
    rkyv::Deserialize,
)]
#[rkyv(derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord))]
pub enum RuleKind {
    /// No rule-kind flag is set; this is the `Default` value.
    #[default]
    None,
    /// Selected when only the `is_license_text` flag is set.
    Text,
    /// Selected when only the `is_license_notice` flag is set.
    Notice,
    /// Selected when only the `is_license_reference` flag is set.
    Reference,
    /// Selected when only the `is_license_tag` flag is set.
    Tag,
    /// Selected when only the `is_license_intro` flag is set.
    Intro,
    /// Selected when only the `is_license_clue` flag is set.
    Clue,
}
95
96impl RuleKind {
97    pub fn from_rule_flags(
98        is_license_text: bool,
99        is_license_notice: bool,
100        is_license_reference: bool,
101        is_license_tag: bool,
102        is_license_intro: bool,
103        is_license_clue: bool,
104    ) -> Result<Self, &'static str> {
105        let mut active = None;
106
107        for (enabled, kind) in [
108            (is_license_text, Self::Text),
109            (is_license_notice, Self::Notice),
110            (is_license_reference, Self::Reference),
111            (is_license_tag, Self::Tag),
112            (is_license_intro, Self::Intro),
113            (is_license_clue, Self::Clue),
114        ] {
115            if !enabled {
116                continue;
117            }
118
119            if active.replace(kind).is_some() {
120                return Err("rule has multiple rule kinds set");
121            }
122        }
123
124        Ok(active.unwrap_or(Self::None))
125    }
126
127    pub const fn is_license_text(self) -> bool {
128        matches!(self, Self::Text)
129    }
130
131    pub const fn is_license_notice(self) -> bool {
132        matches!(self, Self::Notice)
133    }
134
135    pub const fn is_license_reference(self) -> bool {
136        matches!(self, Self::Reference)
137    }
138
139    pub const fn is_license_tag(self) -> bool {
140        matches!(self, Self::Tag)
141    }
142
143    pub const fn is_license_intro(self) -> bool {
144        matches!(self, Self::Intro)
145    }
146
147    pub const fn is_license_clue(self) -> bool {
148        matches!(self, Self::Clue)
149    }
150}
151
/// Rule metadata loaded from .LICENSE and .RULE files.
///
/// Equality is derived and compares every field, whereas the `Ord`/`PartialOrd`
/// implementations for this type compare `identifier` only.
#[derive(
    Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Archive, rkyv::Serialize, rkyv::Deserialize,
)]
pub struct Rule {
    /// Unique identifier for this rule (e.g., "mit.LICENSE", "gpl-2.0_12.RULE")
    /// Used for sorting to match Python's attr.s field order.
    /// This is the primary sort key after rid (which is None at sort time in Python).
    /// `rule_url` appends it to the rule URL base for rules not created from a license file.
    pub identifier: String,

    /// License expression string using SPDX syntax and ScanCode license keys.
    /// `rule_url` uses it as the file stem (`<expression>.LICENSE`) for rules
    /// created from a license file.
    pub license_expression: String,

    /// Pattern text to match
    pub text: String,

    /// Token IDs for the text (assigned during indexing).
    /// With serde, each id is written and read as its raw `u16` value.
    #[serde(
        serialize_with = "serialize_token_ids",
        deserialize_with = "deserialize_token_ids"
    )]
    pub tokens: Vec<TokenId>,

    /// Classification of this rule.
    pub rule_kind: RuleKind,

    /// True if exact matches to this rule are false positives
    pub is_false_positive: bool,

    /// True if this rule text is a required phrase.
    /// A required phrase is an essential section of the rule text which must be
    /// present in the case of partial matches.
    pub is_required_phrase: bool,

    /// True if this rule was created from a license file (not a .RULE file).
    /// Selects which URL base `rule_url` uses.
    pub is_from_license: bool,

    /// Relevance score 0-100 (100 is most relevant)
    pub relevance: u8,

    /// Minimum match coverage percentage (0-100) if specified
    pub minimum_coverage: Option<u8>,

    /// True if minimum_coverage was explicitly stored in source frontmatter
    pub has_stored_minimum_coverage: bool,

    /// Tokens must appear in order if true
    pub is_continuous: bool,

    /// Token position spans for required phrases parsed from {{...}} markers.
    /// Each span represents positions in the rule text that MUST be matched.
    /// With serde, stored as `(start, end)` pairs; defaults to empty when the
    /// field is missing from the input.
    #[serde(with = "range_serde", default)]
    pub required_phrase_spans: Vec<Range<usize>>,

    /// Mapping from token position to count of stopwords at that position.
    /// Used for required phrase validation.
    /// With serde, stored as a key-sorted list of pairs; defaults to empty when
    /// the field is missing from the input.
    /// NOTE(review): the key is `Option<usize>`; presumably `None` stands for
    /// stopwords that precede the first token — confirm against the tokenizer.
    #[serde(with = "stopwords_serde", default)]
    pub stopwords_by_pos: HashMap<Option<usize>, usize>,

    /// Filenames where this rule should be considered
    pub referenced_filenames: Option<Vec<String>>,

    /// URLs that should be ignored when found in this rule text
    pub ignorable_urls: Option<Vec<String>>,

    /// Emails that should be ignored when found in this rule text
    pub ignorable_emails: Option<Vec<String>>,

    /// Copyrights that should be ignored when found in this rule text
    pub ignorable_copyrights: Option<Vec<String>>,

    /// Holder names that should be ignored when found in this rule text
    pub ignorable_holders: Option<Vec<String>>,

    /// Author names that should be ignored when found in this rule text
    pub ignorable_authors: Option<Vec<String>>,

    /// Programming language for the rule if specified
    /// NOTE(review): upstream ScanCode appears to use `language` for the natural
    /// language of the text (an ISO 639-1 code), not a programming language —
    /// confirm and reword.
    pub language: Option<String>,

    /// Free text notes
    pub notes: Option<String>,

    /// Count of unique token IDs in the rule (computed during indexing)
    pub length_unique: usize,

    /// Count of unique legalese token IDs (tokens with ID < len_legalese)
    pub high_length_unique: usize,

    /// Total count of legalese token occurrences (with duplicates)
    pub high_length: usize,

    /// Minimum matched length threshold (occurrences-based)
    pub min_matched_length: usize,

    /// Minimum high-value token matched length threshold (occurrences-based)
    pub min_high_matched_length: usize,

    /// Minimum matched length threshold (unique tokens)
    pub min_matched_length_unique: usize,

    /// Minimum high-value token matched length threshold (unique tokens)
    pub min_high_matched_length_unique: usize,

    /// True if rule length < SMALL_RULE (15 tokens)
    pub is_small: bool,

    /// True if rule length < TINY_RULE (6 tokens)
    pub is_tiny: bool,

    /// True if the rule's first token is "license", "licence", or "licensed"
    pub starts_with_license: bool,

    /// True if the rule's last token is "license", "licence", or "licensed"
    pub ends_with_license: bool,

    /// Whether this rule is deprecated
    pub is_deprecated: bool,

    /// SPDX license identifier if available
    pub spdx_license_key: Option<String>,

    /// Alternative SPDX license identifiers (aliases)
    pub other_spdx_license_keys: Vec<String>,
}
277
278fn serialize_token_ids<S>(token_ids: &[TokenId], serializer: S) -> Result<S::Ok, S::Error>
279where
280    S: serde::Serializer,
281{
282    let raw_ids: Vec<u16> = token_ids.iter().map(|id| id.raw()).collect();
283    <Vec<u16> as serde::Serialize>::serialize(&raw_ids, serializer)
284}
285
286fn deserialize_token_ids<'de, D>(deserializer: D) -> Result<Vec<TokenId>, D::Error>
287where
288    D: serde::Deserializer<'de>,
289{
290    let raw_ids: Vec<u16> = Vec::deserialize(deserializer)?;
291    Ok(raw_ids.into_iter().map(TokenId::new).collect())
292}
293
294impl PartialOrd for Rule {
295    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
296        Some(self.cmp(other))
297    }
298}
299
300impl Ord for Rule {
301    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
302        self.identifier.cmp(&other.identifier)
303    }
304}
305
306impl Rule {
307    pub fn rule_url(&self) -> Option<String> {
308        if self.is_from_license {
309            return (!self.license_expression.is_empty()).then(|| {
310                format!(
311                    "{SCANCODE_LICENSE_URL_BASE}/{}.LICENSE",
312                    self.license_expression
313                )
314            });
315        }
316
317        (!self.identifier.is_empty())
318            .then(|| format!("{SCANCODE_RULE_URL_BASE}/{}", self.identifier))
319    }
320
321    pub const fn kind(&self) -> RuleKind {
322        self.rule_kind
323    }
324
325    pub const fn is_license_text(&self) -> bool {
326        self.rule_kind.is_license_text()
327    }
328
329    /// Returns true if this rule is a license notice pattern.
330    ///
331    /// Note: This method is kept for API completeness and potential future use.
332    /// License matches cannot have `is_license_notice` - only rules can.
333    #[allow(dead_code)]
334    pub const fn is_license_notice(&self) -> bool {
335        self.rule_kind.is_license_notice()
336    }
337
338    pub const fn is_license_reference(&self) -> bool {
339        self.rule_kind.is_license_reference()
340    }
341
342    pub const fn is_license_tag(&self) -> bool {
343        self.rule_kind.is_license_tag()
344    }
345
346    /// Returns true if this rule is a license introduction pattern.
347    ///
348    /// Note: This method is kept for API completeness and potential future use.
349    #[allow(dead_code)]
350    pub const fn is_license_intro(&self) -> bool {
351        self.rule_kind.is_license_intro()
352    }
353
354    pub const fn is_license_clue(&self) -> bool {
355        self.rule_kind.is_license_clue()
356    }
357}
provenant/license_detection/models/rule.rs

provenant/license_detection/models/
rule.rs