provenant/license_detection/models/
rule.rs1use std::collections::HashMap;
7use std::ops::Range;
8
9use rkyv::Archive;
10use serde::{Deserialize, Serialize};
11
12use crate::license_detection::index::dictionary::TokenId;
13
/// Base URL of the license data directory in the ScanCode Toolkit repository.
///
/// `Rule::rule_url` appends `/<license_expression>.LICENSE` to this prefix for
/// rules that have `is_from_license` set.
const SCANCODE_LICENSE_URL_BASE: &str =
    "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses";
/// Base URL of the rule data directory in the ScanCode Toolkit repository.
///
/// `Rule::rule_url` appends `/<identifier>` to this prefix for rules that do
/// not have `is_from_license` set.
const SCANCODE_RULE_URL_BASE: &str =
    "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/rules";
18
19mod range_serde {
20 use serde::{Deserialize, Deserializer, Serialize, Serializer};
21 use std::ops::Range;
22
23 pub fn serialize<S>(ranges: &[Range<usize>], serializer: S) -> Result<S::Ok, S::Error>
24 where
25 S: Serializer,
26 {
27 let tuples: Vec<(usize, usize)> = ranges.iter().map(|r| (r.start, r.end)).collect();
28 tuples.serialize(serializer)
29 }
30
31 pub fn deserialize<'de, D>(deserializer: D) -> Result<Vec<Range<usize>>, D::Error>
32 where
33 D: Deserializer<'de>,
34 {
35 let tuples: Vec<(usize, usize)> = Vec::deserialize(deserializer)?;
36 Ok(tuples
37 .into_iter()
38 .map(|(start, end)| Range { start, end })
39 .collect())
40 }
41}
42
43mod stopwords_serde {
44 use serde::{Deserialize, Deserializer, Serialize, Serializer};
45 use std::collections::HashMap;
46
47 pub fn serialize<S>(
48 map: &HashMap<Option<usize>, usize>,
49 serializer: S,
50 ) -> Result<S::Ok, S::Error>
51 where
52 S: Serializer,
53 {
54 let mut entries: Vec<(Option<usize>, usize)> = map.iter().map(|(k, v)| (*k, *v)).collect();
55 entries.sort_by_key(|(k, _)| *k);
56 entries.serialize(serializer)
57 }
58
59 pub fn deserialize<'de, D>(deserializer: D) -> Result<HashMap<Option<usize>, usize>, D::Error>
60 where
61 D: Deserializer<'de>,
62 {
63 let entries: Vec<(Option<usize>, usize)> = Vec::deserialize(deserializer)?;
64 Ok(entries.into_iter().collect())
65 }
66}
67
/// Classification of a rule, holding at most one of the mutually exclusive
/// `is_license_*` flags accepted by `RuleKind::from_rule_flags`.
///
/// The default value is `RuleKind::None`.
#[derive(
    Debug,
    Clone,
    Copy,
    PartialEq,
    Eq,
    Hash,
    PartialOrd,
    Ord,
    Default,
    Serialize,
    Deserialize,
    Archive,
    rkyv::Serialize,
    rkyv::Deserialize,
)]
#[rkyv(derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord))]
pub enum RuleKind {
    /// No kind flag is set.
    #[default]
    None,
    /// Chosen when the `is_license_text` flag is set.
    Text,
    /// Chosen when the `is_license_notice` flag is set.
    Notice,
    /// Chosen when the `is_license_reference` flag is set.
    Reference,
    /// Chosen when the `is_license_tag` flag is set.
    Tag,
    /// Chosen when the `is_license_intro` flag is set.
    Intro,
    /// Chosen when the `is_license_clue` flag is set.
    Clue,
}
95
96impl RuleKind {
97 pub fn from_rule_flags(
98 is_license_text: bool,
99 is_license_notice: bool,
100 is_license_reference: bool,
101 is_license_tag: bool,
102 is_license_intro: bool,
103 is_license_clue: bool,
104 ) -> Result<Self, &'static str> {
105 let mut active = None;
106
107 for (enabled, kind) in [
108 (is_license_text, Self::Text),
109 (is_license_notice, Self::Notice),
110 (is_license_reference, Self::Reference),
111 (is_license_tag, Self::Tag),
112 (is_license_intro, Self::Intro),
113 (is_license_clue, Self::Clue),
114 ] {
115 if !enabled {
116 continue;
117 }
118
119 if active.replace(kind).is_some() {
120 return Err("rule has multiple rule kinds set");
121 }
122 }
123
124 Ok(active.unwrap_or(Self::None))
125 }
126
127 pub const fn is_license_text(self) -> bool {
128 matches!(self, Self::Text)
129 }
130
131 pub const fn is_license_notice(self) -> bool {
132 matches!(self, Self::Notice)
133 }
134
135 pub const fn is_license_reference(self) -> bool {
136 matches!(self, Self::Reference)
137 }
138
139 pub const fn is_license_tag(self) -> bool {
140 matches!(self, Self::Tag)
141 }
142
143 pub const fn is_license_intro(self) -> bool {
144 matches!(self, Self::Intro)
145 }
146
147 pub const fn is_license_clue(self) -> bool {
148 matches!(self, Self::Clue)
149 }
150}
151
/// A license detection rule together with its precomputed matching metadata.
///
/// Equality (`PartialEq`/`Eq`) is derived and compares every field. The type
/// supports both serde and rkyv serialization.
#[derive(
    Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Archive, rkyv::Serialize, rkyv::Deserialize,
)]
pub struct Rule {
    /// Identifier of the rule. It is the sole key used by `Ord for Rule` and
    /// the final URL segment produced by `rule_url` for rules that are not
    /// derived from a license.
    pub identifier: String,

    /// License expression of the rule. For rules with `is_from_license` set,
    /// `rule_url` uses it as the stem of the `.LICENSE` file name.
    pub license_expression: String,

    /// Text of the rule.
    pub text: String,

    /// Token ids of the rule; serde stores them as a sequence of raw `u16`
    /// values.
    #[serde(
        serialize_with = "serialize_token_ids",
        deserialize_with = "deserialize_token_ids"
    )]
    pub tokens: Vec<TokenId>,

    /// Kind of the rule, exposed through `Rule::kind` and the `is_license_*`
    /// accessors.
    pub rule_kind: RuleKind,

    // NOTE(review): the semantics of the flags and thresholds below are not
    // visible in this file; confirm against the code that builds rules.
    pub is_false_positive: bool,

    pub is_required_phrase: bool,

    /// When `true`, `rule_url` links into the license data directory instead
    /// of the rule data directory.
    pub is_from_license: bool,

    pub relevance: u8,

    pub minimum_coverage: Option<u8>,

    pub has_stored_minimum_coverage: bool,

    pub is_continuous: bool,

    /// Spans stored by serde as `(start, end)` tuples; an absent field
    /// deserializes to an empty list.
    #[serde(with = "range_serde", default)]
    pub required_phrase_spans: Vec<Range<usize>>,

    /// Map stored by serde as a key-sorted list of `(key, value)` pairs; an
    /// absent field deserializes to an empty map.
    #[serde(with = "stopwords_serde", default)]
    pub stopwords_by_pos: HashMap<Option<usize>, usize>,

    pub referenced_filenames: Option<Vec<String>>,

    pub ignorable_urls: Option<Vec<String>>,

    pub ignorable_emails: Option<Vec<String>>,

    pub ignorable_copyrights: Option<Vec<String>>,

    pub ignorable_holders: Option<Vec<String>>,

    pub ignorable_authors: Option<Vec<String>>,

    pub language: Option<String>,

    pub notes: Option<String>,

    pub length_unique: usize,

    pub high_length_unique: usize,

    pub high_length: usize,

    pub min_matched_length: usize,

    pub min_high_matched_length: usize,

    pub min_matched_length_unique: usize,

    pub min_high_matched_length_unique: usize,

    pub is_small: bool,

    pub is_tiny: bool,

    pub starts_with_license: bool,

    pub ends_with_license: bool,

    pub is_deprecated: bool,

    pub spdx_license_key: Option<String>,

    pub other_spdx_license_keys: Vec<String>,
}
277
278fn serialize_token_ids<S>(token_ids: &[TokenId], serializer: S) -> Result<S::Ok, S::Error>
279where
280 S: serde::Serializer,
281{
282 let raw_ids: Vec<u16> = token_ids.iter().map(|id| id.raw()).collect();
283 <Vec<u16> as serde::Serialize>::serialize(&raw_ids, serializer)
284}
285
286fn deserialize_token_ids<'de, D>(deserializer: D) -> Result<Vec<TokenId>, D::Error>
287where
288 D: serde::Deserializer<'de>,
289{
290 let raw_ids: Vec<u16> = Vec::deserialize(deserializer)?;
291 Ok(raw_ids.into_iter().map(TokenId::new).collect())
292}
293
294impl PartialOrd for Rule {
295 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
296 Some(self.cmp(other))
297 }
298}
299
300impl Ord for Rule {
301 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
302 self.identifier.cmp(&other.identifier)
303 }
304}
305
306impl Rule {
307 pub fn rule_url(&self) -> Option<String> {
308 if self.is_from_license {
309 return (!self.license_expression.is_empty()).then(|| {
310 format!(
311 "{SCANCODE_LICENSE_URL_BASE}/{}.LICENSE",
312 self.license_expression
313 )
314 });
315 }
316
317 (!self.identifier.is_empty())
318 .then(|| format!("{SCANCODE_RULE_URL_BASE}/{}", self.identifier))
319 }
320
321 pub const fn kind(&self) -> RuleKind {
322 self.rule_kind
323 }
324
325 pub const fn is_license_text(&self) -> bool {
326 self.rule_kind.is_license_text()
327 }
328
329 #[allow(dead_code)]
334 pub const fn is_license_notice(&self) -> bool {
335 self.rule_kind.is_license_notice()
336 }
337
338 pub const fn is_license_reference(&self) -> bool {
339 self.rule_kind.is_license_reference()
340 }
341
342 pub const fn is_license_tag(&self) -> bool {
343 self.rule_kind.is_license_tag()
344 }
345
346 #[allow(dead_code)]
350 pub const fn is_license_intro(&self) -> bool {
351 self.rule_kind.is_license_intro()
352 }
353
354 pub const fn is_license_clue(&self) -> bool {
355 self.rule_kind.is_license_clue()
356 }
357}