provenant/license_detection/models/
rule.rs1use std::collections::HashMap;
4use std::ops::Range;
5
6use serde::{Deserialize, Serialize};
7
8use crate::license_detection::index::dictionary::TokenId;
9
/// Base URL of the license data directory in the ScanCode Toolkit GitHub repository.
///
/// `Rule::rule_url` appends `/<license_expression>.LICENSE` to this value.
const SCANCODE_LICENSE_URL_BASE: &str =
    "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses";
/// Base URL of the rule data directory in the ScanCode Toolkit GitHub repository.
///
/// `Rule::rule_url` appends `/<identifier>` to this value.
const SCANCODE_RULE_URL_BASE: &str =
    "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/rules";
14
15mod range_serde {
16 use serde::{Deserialize, Deserializer, Serialize, Serializer};
17 use std::ops::Range;
18
19 pub fn serialize<S>(ranges: &[Range<usize>], serializer: S) -> Result<S::Ok, S::Error>
20 where
21 S: Serializer,
22 {
23 let tuples: Vec<(usize, usize)> = ranges.iter().map(|r| (r.start, r.end)).collect();
24 tuples.serialize(serializer)
25 }
26
27 pub fn deserialize<'de, D>(deserializer: D) -> Result<Vec<Range<usize>>, D::Error>
28 where
29 D: Deserializer<'de>,
30 {
31 let tuples: Vec<(usize, usize)> = Vec::deserialize(deserializer)?;
32 Ok(tuples
33 .into_iter()
34 .map(|(start, end)| Range { start, end })
35 .collect())
36 }
37}
38
39mod stopwords_serde {
40 use serde::{Deserialize, Deserializer, Serialize, Serializer};
41 use std::collections::HashMap;
42
43 pub fn serialize<S>(
44 map: &HashMap<Option<usize>, usize>,
45 serializer: S,
46 ) -> Result<S::Ok, S::Error>
47 where
48 S: Serializer,
49 {
50 let mut entries: Vec<(Option<usize>, usize)> = map.iter().map(|(k, v)| (*k, *v)).collect();
51 entries.sort_by_key(|(k, _)| *k);
52 entries.serialize(serializer)
53 }
54
55 pub fn deserialize<'de, D>(deserializer: D) -> Result<HashMap<Option<usize>, usize>, D::Error>
56 where
57 D: Deserializer<'de>,
58 {
59 let entries: Vec<(Option<usize>, usize)> = Vec::deserialize(deserializer)?;
60 Ok(entries.into_iter().collect())
61 }
62}
63
64#[derive(
65 Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default, Serialize, Deserialize,
66)]
67pub enum RuleKind {
68 #[default]
69 None,
70 Text,
71 Notice,
72 Reference,
73 Tag,
74 Intro,
75 Clue,
76}
77
78impl RuleKind {
79 pub fn from_rule_flags(
80 is_license_text: bool,
81 is_license_notice: bool,
82 is_license_reference: bool,
83 is_license_tag: bool,
84 is_license_intro: bool,
85 is_license_clue: bool,
86 ) -> Result<Self, &'static str> {
87 let mut active = None;
88
89 for (enabled, kind) in [
90 (is_license_text, Self::Text),
91 (is_license_notice, Self::Notice),
92 (is_license_reference, Self::Reference),
93 (is_license_tag, Self::Tag),
94 (is_license_intro, Self::Intro),
95 (is_license_clue, Self::Clue),
96 ] {
97 if !enabled {
98 continue;
99 }
100
101 if active.replace(kind).is_some() {
102 return Err("rule has multiple rule kinds set");
103 }
104 }
105
106 Ok(active.unwrap_or(Self::None))
107 }
108
109 pub const fn is_license_text(self) -> bool {
110 matches!(self, Self::Text)
111 }
112
113 pub const fn is_license_notice(self) -> bool {
114 matches!(self, Self::Notice)
115 }
116
117 pub const fn is_license_reference(self) -> bool {
118 matches!(self, Self::Reference)
119 }
120
121 pub const fn is_license_tag(self) -> bool {
122 matches!(self, Self::Tag)
123 }
124
125 pub const fn is_license_intro(self) -> bool {
126 matches!(self, Self::Intro)
127 }
128
129 pub const fn is_license_clue(self) -> bool {
130 matches!(self, Self::Clue)
131 }
132}
133
134#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
136pub struct Rule {
137 pub identifier: String,
141
142 pub license_expression: String,
144
145 pub text: String,
147
148 #[serde(
150 serialize_with = "serialize_token_ids",
151 deserialize_with = "deserialize_token_ids"
152 )]
153 pub tokens: Vec<TokenId>,
154
155 pub rule_kind: RuleKind,
157
158 pub is_false_positive: bool,
160
161 pub is_required_phrase: bool,
165
166 pub is_from_license: bool,
168
169 pub relevance: u8,
171
172 pub minimum_coverage: Option<u8>,
174
175 pub has_stored_minimum_coverage: bool,
177
178 pub is_continuous: bool,
180
181 #[serde(with = "range_serde", default)]
184 pub required_phrase_spans: Vec<Range<usize>>,
185
186 #[serde(with = "stopwords_serde", default)]
189 pub stopwords_by_pos: HashMap<Option<usize>, usize>,
190
191 pub referenced_filenames: Option<Vec<String>>,
193
194 pub ignorable_urls: Option<Vec<String>>,
196
197 pub ignorable_emails: Option<Vec<String>>,
199
200 pub ignorable_copyrights: Option<Vec<String>>,
202
203 pub ignorable_holders: Option<Vec<String>>,
205
206 pub ignorable_authors: Option<Vec<String>>,
208
209 pub language: Option<String>,
211
212 pub notes: Option<String>,
214
215 pub length_unique: usize,
217
218 pub high_length_unique: usize,
220
221 pub high_length: usize,
223
224 pub min_matched_length: usize,
226
227 pub min_high_matched_length: usize,
229
230 pub min_matched_length_unique: usize,
232
233 pub min_high_matched_length_unique: usize,
235
236 pub is_small: bool,
238
239 pub is_tiny: bool,
241
242 pub starts_with_license: bool,
244
245 pub ends_with_license: bool,
247
248 pub is_deprecated: bool,
250
251 pub spdx_license_key: Option<String>,
253
254 pub other_spdx_license_keys: Vec<String>,
256}
257
258fn serialize_token_ids<S>(token_ids: &[TokenId], serializer: S) -> Result<S::Ok, S::Error>
259where
260 S: serde::Serializer,
261{
262 let raw_ids: Vec<u16> = token_ids.iter().map(|id| id.raw()).collect();
263 <Vec<u16> as serde::Serialize>::serialize(&raw_ids, serializer)
264}
265
266fn deserialize_token_ids<'de, D>(deserializer: D) -> Result<Vec<TokenId>, D::Error>
267where
268 D: serde::Deserializer<'de>,
269{
270 let raw_ids: Vec<u16> = Vec::deserialize(deserializer)?;
271 Ok(raw_ids.into_iter().map(TokenId::new).collect())
272}
273
274impl PartialOrd for Rule {
275 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
276 Some(self.cmp(other))
277 }
278}
279
280impl Ord for Rule {
281 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
282 self.identifier.cmp(&other.identifier)
283 }
284}
285
286impl Rule {
287 pub fn rule_url(&self) -> Option<String> {
288 if self.is_from_license {
289 return (!self.license_expression.is_empty()).then(|| {
290 format!(
291 "{SCANCODE_LICENSE_URL_BASE}/{}.LICENSE",
292 self.license_expression
293 )
294 });
295 }
296
297 (!self.identifier.is_empty())
298 .then(|| format!("{SCANCODE_RULE_URL_BASE}/{}", self.identifier))
299 }
300
301 pub const fn kind(&self) -> RuleKind {
302 self.rule_kind
303 }
304
305 pub const fn is_license_text(&self) -> bool {
306 self.rule_kind.is_license_text()
307 }
308
309 #[allow(dead_code)]
314 pub const fn is_license_notice(&self) -> bool {
315 self.rule_kind.is_license_notice()
316 }
317
318 pub const fn is_license_reference(&self) -> bool {
319 self.rule_kind.is_license_reference()
320 }
321
322 pub const fn is_license_tag(&self) -> bool {
323 self.rule_kind.is_license_tag()
324 }
325
326 #[allow(dead_code)]
330 pub const fn is_license_intro(&self) -> bool {
331 self.rule_kind.is_license_intro()
332 }
333
334 pub const fn is_license_clue(&self) -> bool {
335 self.rule_kind.is_license_clue()
336 }
337}