provenant/license_detection/models/
rule.rs1use std::collections::HashMap;
4use std::ops::Range;
5
6use serde::{Deserialize, Serialize};
7
8use crate::license_detection::index::dictionary::TokenId;
9
/// Base URL of the ScanCode license data directory.
///
/// `Rule::rule_url` appends `/<license_expression>.LICENSE` to this value for rules that
/// have `is_from_license` set.
const SCANCODE_LICENSE_URL_BASE: &str =
    "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses";

/// Base URL of the ScanCode rule data directory.
///
/// `Rule::rule_url` appends `/<identifier>` to this value for rules that do not have
/// `is_from_license` set.
const SCANCODE_RULE_URL_BASE: &str =
    "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules";
14
15mod range_serde {
16 use serde::{Deserialize, Deserializer, Serialize, Serializer};
17 use std::ops::Range;
18
19 pub fn serialize<S>(ranges: &[Range<usize>], serializer: S) -> Result<S::Ok, S::Error>
20 where
21 S: Serializer,
22 {
23 let tuples: Vec<(usize, usize)> = ranges.iter().map(|r| (r.start, r.end)).collect();
24 tuples.serialize(serializer)
25 }
26
27 pub fn deserialize<'de, D>(deserializer: D) -> Result<Vec<Range<usize>>, D::Error>
28 where
29 D: Deserializer<'de>,
30 {
31 let tuples: Vec<(usize, usize)> = Vec::deserialize(deserializer)?;
32 Ok(tuples
33 .into_iter()
34 .map(|(start, end)| Range { start, end })
35 .collect())
36 }
37}
38
39mod stopwords_serde {
40 use serde::{Deserialize, Deserializer, Serialize, Serializer};
41 use std::collections::HashMap;
42
43 pub fn serialize<S>(map: &HashMap<usize, usize>, serializer: S) -> Result<S::Ok, S::Error>
44 where
45 S: Serializer,
46 {
47 let mut entries: Vec<(usize, usize)> = map.iter().map(|(k, v)| (*k, *v)).collect();
48 entries.sort_by_key(|(k, _)| *k);
49 entries.serialize(serializer)
50 }
51
52 pub fn deserialize<'de, D>(deserializer: D) -> Result<HashMap<usize, usize>, D::Error>
53 where
54 D: Deserializer<'de>,
55 {
56 let entries: Vec<(usize, usize)> = Vec::deserialize(deserializer)?;
57 Ok(entries.into_iter().collect())
58 }
59}
60
61#[derive(
62 Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default, Serialize, Deserialize,
63)]
64pub enum RuleKind {
65 #[default]
66 None,
67 Text,
68 Notice,
69 Reference,
70 Tag,
71 Intro,
72 Clue,
73}
74
75impl RuleKind {
76 pub fn from_rule_flags(
77 is_license_text: bool,
78 is_license_notice: bool,
79 is_license_reference: bool,
80 is_license_tag: bool,
81 is_license_intro: bool,
82 is_license_clue: bool,
83 ) -> Result<Self, &'static str> {
84 let mut active = None;
85
86 for (enabled, kind) in [
87 (is_license_text, Self::Text),
88 (is_license_notice, Self::Notice),
89 (is_license_reference, Self::Reference),
90 (is_license_tag, Self::Tag),
91 (is_license_intro, Self::Intro),
92 (is_license_clue, Self::Clue),
93 ] {
94 if !enabled {
95 continue;
96 }
97
98 if active.replace(kind).is_some() {
99 return Err("rule has multiple rule kinds set");
100 }
101 }
102
103 Ok(active.unwrap_or(Self::None))
104 }
105
106 pub fn from_match_flags(
107 is_license_text: bool,
108 is_license_reference: bool,
109 is_license_tag: bool,
110 is_license_intro: bool,
111 is_license_clue: bool,
112 ) -> Result<Self, &'static str> {
113 Self::from_rule_flags(
114 is_license_text,
115 false,
116 is_license_reference,
117 is_license_tag,
118 is_license_intro,
119 is_license_clue,
120 )
121 .map_err(|_| "license match has multiple rule kinds set")
122 }
123
124 pub const fn is_license_text(self) -> bool {
125 matches!(self, Self::Text)
126 }
127
128 pub const fn is_license_notice(self) -> bool {
129 matches!(self, Self::Notice)
130 }
131
132 pub const fn is_license_reference(self) -> bool {
133 matches!(self, Self::Reference)
134 }
135
136 pub const fn is_license_tag(self) -> bool {
137 matches!(self, Self::Tag)
138 }
139
140 pub const fn is_license_intro(self) -> bool {
141 matches!(self, Self::Intro)
142 }
143
144 pub const fn is_license_clue(self) -> bool {
145 matches!(self, Self::Clue)
146 }
147}
148
149#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
151pub struct Rule {
152 pub identifier: String,
156
157 pub license_expression: String,
159
160 pub text: String,
162
163 #[serde(
165 serialize_with = "serialize_token_ids",
166 deserialize_with = "deserialize_token_ids"
167 )]
168 pub tokens: Vec<TokenId>,
169
170 pub rule_kind: RuleKind,
172
173 pub is_false_positive: bool,
175
176 pub is_required_phrase: bool,
180
181 pub is_from_license: bool,
183
184 pub relevance: u8,
186
187 pub minimum_coverage: Option<u8>,
189
190 pub has_stored_minimum_coverage: bool,
192
193 pub is_continuous: bool,
195
196 #[serde(with = "range_serde", default)]
199 pub required_phrase_spans: Vec<Range<usize>>,
200
201 #[serde(with = "stopwords_serde", default)]
204 pub stopwords_by_pos: HashMap<usize, usize>,
205
206 pub referenced_filenames: Option<Vec<String>>,
208
209 pub ignorable_urls: Option<Vec<String>>,
211
212 pub ignorable_emails: Option<Vec<String>>,
214
215 pub ignorable_copyrights: Option<Vec<String>>,
217
218 pub ignorable_holders: Option<Vec<String>>,
220
221 pub ignorable_authors: Option<Vec<String>>,
223
224 pub language: Option<String>,
226
227 pub notes: Option<String>,
229
230 pub length_unique: usize,
232
233 pub high_length_unique: usize,
235
236 pub high_length: usize,
238
239 pub min_matched_length: usize,
241
242 pub min_high_matched_length: usize,
244
245 pub min_matched_length_unique: usize,
247
248 pub min_high_matched_length_unique: usize,
250
251 pub is_small: bool,
253
254 pub is_tiny: bool,
256
257 pub starts_with_license: bool,
259
260 pub ends_with_license: bool,
262
263 pub is_deprecated: bool,
265
266 pub spdx_license_key: Option<String>,
268
269 pub other_spdx_license_keys: Vec<String>,
271}
272
273fn serialize_token_ids<S>(token_ids: &[TokenId], serializer: S) -> Result<S::Ok, S::Error>
274where
275 S: serde::Serializer,
276{
277 let raw_ids: Vec<u16> = token_ids.iter().map(|id| id.raw()).collect();
278 <Vec<u16> as serde::Serialize>::serialize(&raw_ids, serializer)
279}
280
281fn deserialize_token_ids<'de, D>(deserializer: D) -> Result<Vec<TokenId>, D::Error>
282where
283 D: serde::Deserializer<'de>,
284{
285 let raw_ids: Vec<u16> = Vec::deserialize(deserializer)?;
286 Ok(raw_ids.into_iter().map(TokenId::new).collect())
287}
288
289impl PartialOrd for Rule {
290 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
291 Some(self.cmp(other))
292 }
293}
294
295impl Ord for Rule {
296 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
297 self.identifier.cmp(&other.identifier)
298 }
299}
300
301impl Rule {
302 pub fn rule_url(&self) -> Option<String> {
303 if self.is_from_license {
304 return (!self.license_expression.is_empty()).then(|| {
305 format!(
306 "{SCANCODE_LICENSE_URL_BASE}/{}.LICENSE",
307 self.license_expression
308 )
309 });
310 }
311
312 (!self.identifier.is_empty())
313 .then(|| format!("{SCANCODE_RULE_URL_BASE}/{}", self.identifier))
314 }
315
316 pub const fn kind(&self) -> RuleKind {
317 self.rule_kind
318 }
319
320 pub const fn is_license_text(&self) -> bool {
321 self.rule_kind.is_license_text()
322 }
323
324 #[allow(dead_code)]
329 pub const fn is_license_notice(&self) -> bool {
330 self.rule_kind.is_license_notice()
331 }
332
333 pub const fn is_license_reference(&self) -> bool {
334 self.rule_kind.is_license_reference()
335 }
336
337 pub const fn is_license_tag(&self) -> bool {
338 self.rule_kind.is_license_tag()
339 }
340
341 #[allow(dead_code)]
345 pub const fn is_license_intro(&self) -> bool {
346 self.rule_kind.is_license_intro()
347 }
348
349 pub const fn is_license_clue(&self) -> bool {
350 self.rule_kind.is_license_clue()
351 }
352}