provenant/license_detection/models/
rule.rs1use std::collections::HashMap;
4use std::ops::Range;
5
6use rkyv::Archive;
7use serde::{Deserialize, Serialize};
8
9use crate::license_detection::index::dictionary::TokenId;
10
/// Base URL that `Rule::rule_url` prefixes to `<license_expression>.LICENSE`
/// for rules flagged as coming from a license.
const SCANCODE_LICENSE_URL_BASE: &str =
    "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses";

/// Base URL that `Rule::rule_url` prefixes to the rule identifier for every
/// rule that is not flagged as coming from a license.
const SCANCODE_RULE_URL_BASE: &str =
    "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/rules";
15
16mod range_serde {
17 use serde::{Deserialize, Deserializer, Serialize, Serializer};
18 use std::ops::Range;
19
20 pub fn serialize<S>(ranges: &[Range<usize>], serializer: S) -> Result<S::Ok, S::Error>
21 where
22 S: Serializer,
23 {
24 let tuples: Vec<(usize, usize)> = ranges.iter().map(|r| (r.start, r.end)).collect();
25 tuples.serialize(serializer)
26 }
27
28 pub fn deserialize<'de, D>(deserializer: D) -> Result<Vec<Range<usize>>, D::Error>
29 where
30 D: Deserializer<'de>,
31 {
32 let tuples: Vec<(usize, usize)> = Vec::deserialize(deserializer)?;
33 Ok(tuples
34 .into_iter()
35 .map(|(start, end)| Range { start, end })
36 .collect())
37 }
38}
39
40mod stopwords_serde {
41 use serde::{Deserialize, Deserializer, Serialize, Serializer};
42 use std::collections::HashMap;
43
44 pub fn serialize<S>(
45 map: &HashMap<Option<usize>, usize>,
46 serializer: S,
47 ) -> Result<S::Ok, S::Error>
48 where
49 S: Serializer,
50 {
51 let mut entries: Vec<(Option<usize>, usize)> = map.iter().map(|(k, v)| (*k, *v)).collect();
52 entries.sort_by_key(|(k, _)| *k);
53 entries.serialize(serializer)
54 }
55
56 pub fn deserialize<'de, D>(deserializer: D) -> Result<HashMap<Option<usize>, usize>, D::Error>
57 where
58 D: Deserializer<'de>,
59 {
60 let entries: Vec<(Option<usize>, usize)> = Vec::deserialize(deserializer)?;
61 Ok(entries.into_iter().collect())
62 }
63}
64
65#[derive(
66 Debug,
67 Clone,
68 Copy,
69 PartialEq,
70 Eq,
71 Hash,
72 PartialOrd,
73 Ord,
74 Default,
75 Serialize,
76 Deserialize,
77 Archive,
78 rkyv::Serialize,
79 rkyv::Deserialize,
80)]
81#[rkyv(derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord))]
82pub enum RuleKind {
83 #[default]
84 None,
85 Text,
86 Notice,
87 Reference,
88 Tag,
89 Intro,
90 Clue,
91}
92
93impl RuleKind {
94 pub fn from_rule_flags(
95 is_license_text: bool,
96 is_license_notice: bool,
97 is_license_reference: bool,
98 is_license_tag: bool,
99 is_license_intro: bool,
100 is_license_clue: bool,
101 ) -> Result<Self, &'static str> {
102 let mut active = None;
103
104 for (enabled, kind) in [
105 (is_license_text, Self::Text),
106 (is_license_notice, Self::Notice),
107 (is_license_reference, Self::Reference),
108 (is_license_tag, Self::Tag),
109 (is_license_intro, Self::Intro),
110 (is_license_clue, Self::Clue),
111 ] {
112 if !enabled {
113 continue;
114 }
115
116 if active.replace(kind).is_some() {
117 return Err("rule has multiple rule kinds set");
118 }
119 }
120
121 Ok(active.unwrap_or(Self::None))
122 }
123
124 pub const fn is_license_text(self) -> bool {
125 matches!(self, Self::Text)
126 }
127
128 pub const fn is_license_notice(self) -> bool {
129 matches!(self, Self::Notice)
130 }
131
132 pub const fn is_license_reference(self) -> bool {
133 matches!(self, Self::Reference)
134 }
135
136 pub const fn is_license_tag(self) -> bool {
137 matches!(self, Self::Tag)
138 }
139
140 pub const fn is_license_intro(self) -> bool {
141 matches!(self, Self::Intro)
142 }
143
144 pub const fn is_license_clue(self) -> bool {
145 matches!(self, Self::Clue)
146 }
147}
148
149#[derive(
151 Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Archive, rkyv::Serialize, rkyv::Deserialize,
152)]
153pub struct Rule {
154 pub identifier: String,
158
159 pub license_expression: String,
161
162 pub text: String,
164
165 #[serde(
167 serialize_with = "serialize_token_ids",
168 deserialize_with = "deserialize_token_ids"
169 )]
170 pub tokens: Vec<TokenId>,
171
172 pub rule_kind: RuleKind,
174
175 pub is_false_positive: bool,
177
178 pub is_required_phrase: bool,
182
183 pub is_from_license: bool,
185
186 pub relevance: u8,
188
189 pub minimum_coverage: Option<u8>,
191
192 pub has_stored_minimum_coverage: bool,
194
195 pub is_continuous: bool,
197
198 #[serde(with = "range_serde", default)]
201 pub required_phrase_spans: Vec<Range<usize>>,
202
203 #[serde(with = "stopwords_serde", default)]
206 pub stopwords_by_pos: HashMap<Option<usize>, usize>,
207
208 pub referenced_filenames: Option<Vec<String>>,
210
211 pub ignorable_urls: Option<Vec<String>>,
213
214 pub ignorable_emails: Option<Vec<String>>,
216
217 pub ignorable_copyrights: Option<Vec<String>>,
219
220 pub ignorable_holders: Option<Vec<String>>,
222
223 pub ignorable_authors: Option<Vec<String>>,
225
226 pub language: Option<String>,
228
229 pub notes: Option<String>,
231
232 pub length_unique: usize,
234
235 pub high_length_unique: usize,
237
238 pub high_length: usize,
240
241 pub min_matched_length: usize,
243
244 pub min_high_matched_length: usize,
246
247 pub min_matched_length_unique: usize,
249
250 pub min_high_matched_length_unique: usize,
252
253 pub is_small: bool,
255
256 pub is_tiny: bool,
258
259 pub starts_with_license: bool,
261
262 pub ends_with_license: bool,
264
265 pub is_deprecated: bool,
267
268 pub spdx_license_key: Option<String>,
270
271 pub other_spdx_license_keys: Vec<String>,
273}
274
275fn serialize_token_ids<S>(token_ids: &[TokenId], serializer: S) -> Result<S::Ok, S::Error>
276where
277 S: serde::Serializer,
278{
279 let raw_ids: Vec<u16> = token_ids.iter().map(|id| id.raw()).collect();
280 <Vec<u16> as serde::Serialize>::serialize(&raw_ids, serializer)
281}
282
283fn deserialize_token_ids<'de, D>(deserializer: D) -> Result<Vec<TokenId>, D::Error>
284where
285 D: serde::Deserializer<'de>,
286{
287 let raw_ids: Vec<u16> = Vec::deserialize(deserializer)?;
288 Ok(raw_ids.into_iter().map(TokenId::new).collect())
289}
290
291impl PartialOrd for Rule {
292 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
293 Some(self.cmp(other))
294 }
295}
296
297impl Ord for Rule {
298 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
299 self.identifier.cmp(&other.identifier)
300 }
301}
302
303impl Rule {
304 pub fn rule_url(&self) -> Option<String> {
305 if self.is_from_license {
306 return (!self.license_expression.is_empty()).then(|| {
307 format!(
308 "{SCANCODE_LICENSE_URL_BASE}/{}.LICENSE",
309 self.license_expression
310 )
311 });
312 }
313
314 (!self.identifier.is_empty())
315 .then(|| format!("{SCANCODE_RULE_URL_BASE}/{}", self.identifier))
316 }
317
318 pub const fn kind(&self) -> RuleKind {
319 self.rule_kind
320 }
321
322 pub const fn is_license_text(&self) -> bool {
323 self.rule_kind.is_license_text()
324 }
325
326 #[allow(dead_code)]
331 pub const fn is_license_notice(&self) -> bool {
332 self.rule_kind.is_license_notice()
333 }
334
335 pub const fn is_license_reference(&self) -> bool {
336 self.rule_kind.is_license_reference()
337 }
338
339 pub const fn is_license_tag(&self) -> bool {
340 self.rule_kind.is_license_tag()
341 }
342
343 #[allow(dead_code)]
347 pub const fn is_license_intro(&self) -> bool {
348 self.rule_kind.is_license_intro()
349 }
350
351 pub const fn is_license_clue(&self) -> bool {
352 self.rule_kind.is_license_clue()
353 }
354}