/// Fixed minimum number of matched tokens used for mid-sized rules.
///
/// `compute_thresholds_occurrences` returns it for rules of 30 to 199 tokens
/// and `compute_thresholds_unique` returns it for rules of 20 to 200 tokens.
pub const MIN_MATCH_LENGTH: usize = 4;

/// Upper bound on the minimum number of matched "high" tokens.
///
/// The threshold functions clamp their high-token requirement to this value
/// for mid-sized rules, so such rules never demand more than this many.
pub const MIN_MATCH_HIGH_LENGTH: usize = 3;

/// Token-count limit for small rules: a rule with strictly fewer tokens than
/// this is flagged `is_small`.
pub const SMALL_RULE: usize = 15;

/// Token-count limit for tiny rules: a rule with strictly fewer tokens than
/// this is flagged `is_tiny`.
pub const TINY_RULE: usize = 6;
17
18pub fn compute_thresholds_occurrences(
33 minimum_coverage: Option<u8>,
34 length: usize,
35 high_length: usize,
36) -> (Option<u8>, usize, usize) {
37 if minimum_coverage == Some(100) {
38 return (minimum_coverage, length, high_length);
39 }
40
41 let (min_matched_length, min_high_matched_length, updated_coverage) = if length < 3 {
42 (length, high_length, Some(100))
43 } else if length < 10 {
44 (length, high_length, Some(80))
45 } else if length < 30 {
46 (length / 2, high_length.min(MIN_MATCH_HIGH_LENGTH), Some(50))
47 } else if length < 200 {
48 (
49 MIN_MATCH_LENGTH,
50 high_length.min(MIN_MATCH_HIGH_LENGTH),
51 minimum_coverage,
52 )
53 } else {
54 (length / 10, high_length / 10, minimum_coverage)
55 };
56
57 (
58 updated_coverage,
59 min_matched_length,
60 min_high_matched_length,
61 )
62}
63
64pub fn compute_thresholds_unique(
80 minimum_coverage: Option<u8>,
81 length: usize,
82 length_unique: usize,
83 high_length_unique: usize,
84) -> (usize, usize) {
85 if minimum_coverage == Some(100) {
86 return (length_unique, high_length_unique);
87 }
88
89 if length > 200 {
90 (length / 10, high_length_unique / 10)
91 } else if length < 5 {
92 (length_unique, high_length_unique)
93 } else if length < 10 {
94 let min_matched = if length_unique < 2 {
95 length_unique
96 } else {
97 length_unique - 1
98 };
99 (min_matched, high_length_unique)
100 } else if length < 20 {
101 (high_length_unique, high_length_unique)
102 } else {
103 let half = high_length_unique / 2;
104 let high_u = if half > 0 { half } else { high_length_unique };
105 (MIN_MATCH_LENGTH, high_u.min(MIN_MATCH_HIGH_LENGTH))
106 }
107}
108
#[cfg(test)]
mod tests {
    //! Unit tests for the threshold helpers; each test pins one length bucket
    //! by comparing the whole returned tuple at once.

    use super::*;

    #[test]
    fn test_compute_thresholds_occurrences_100_coverage() {
        // Explicit 100% coverage hands the inputs back untouched.
        assert_eq!(
            compute_thresholds_occurrences(Some(100), 50, 20),
            (Some(100), 50, 20)
        );
    }

    #[test]
    fn test_compute_thresholds_occurrences_tiny_rule() {
        // length < 3: full coverage is forced.
        assert_eq!(
            compute_thresholds_occurrences(None, 2, 1),
            (Some(100), 2, 1)
        );
    }

    #[test]
    fn test_compute_thresholds_occurrences_small_rule() {
        // 3 <= length < 10: 80% coverage, lengths unchanged.
        assert_eq!(
            compute_thresholds_occurrences(None, 8, 3),
            (Some(80), 8, 3)
        );
    }

    #[test]
    fn test_compute_thresholds_occurrences_medium_rule() {
        // 10 <= length < 30: half the length, high length capped at 3.
        assert_eq!(
            compute_thresholds_occurrences(None, 25, 10),
            (Some(50), 12, 3)
        );
    }

    #[test]
    fn test_compute_thresholds_occurrences_large_rule() {
        // 30 <= length < 200: fixed minimums, coverage passed through.
        assert_eq!(compute_thresholds_occurrences(None, 100, 40), (None, 4, 3));
    }

    #[test]
    fn test_compute_thresholds_occurrences_very_large_rule() {
        // length >= 200: a tenth of each input.
        assert_eq!(
            compute_thresholds_occurrences(None, 500, 200),
            (None, 50, 20)
        );
    }

    #[test]
    fn test_compute_thresholds_unique_100_coverage() {
        // Explicit 100% coverage hands the unique counts back untouched.
        assert_eq!(compute_thresholds_unique(Some(100), 50, 30, 15), (30, 15));
    }

    #[test]
    fn test_compute_thresholds_unique_very_large() {
        // length > 200: a tenth of `length` and of `high_length_unique`.
        assert_eq!(compute_thresholds_unique(None, 500, 300, 150), (50, 15));
    }

    #[test]
    fn test_compute_thresholds_unique_tiny() {
        // length < 5: unique counts unchanged.
        assert_eq!(compute_thresholds_unique(None, 3, 2, 1), (2, 1));
    }

    #[test]
    fn test_compute_thresholds_unique_small() {
        // 5 <= length < 10: one distinct token may be missing.
        assert_eq!(compute_thresholds_unique(None, 8, 5, 3), (4, 3));
    }

    #[test]
    fn test_compute_thresholds_unique_medium() {
        // 10 <= length < 20: both thresholds equal `high_length_unique`.
        assert_eq!(compute_thresholds_unique(None, 15, 10, 5), (5, 5));
    }

    #[test]
    fn test_compute_thresholds_unique_large() {
        // 20 <= length <= 200: fixed minimum, halved-then-capped high count.
        assert_eq!(compute_thresholds_unique(None, 100, 40, 20), (4, 3));
    }

    #[test]
    fn test_constants() {
        assert_eq!(
            (MIN_MATCH_LENGTH, MIN_MATCH_HIGH_LENGTH, SMALL_RULE, TINY_RULE),
            (4, 3, 15, 6)
        );
    }
}
211
#[cfg(test)]
mod integration_tests {
    //! End-to-end checks that run both threshold helpers against a `Rule`
    //! whose token statistics are derived from real token sets.

    use super::super::super::index::dictionary::{TokenDictionary, TokenId};
    use super::super::super::models::Rule;
    use super::*;
    use crate::license_detection::{TokenMultiset, TokenSet};
    use std::collections::HashMap;

    /// Builds a `Rule` from raw token ids and fills in every derived
    /// length, threshold and size flag.
    ///
    /// * `text` - rule text stored on the rule; not used for the computation.
    /// * `tokens` - raw token ids making up the rule, in order.
    /// * `minimum_coverage` - coverage initially attached to the rule.
    /// * `len_legalese` - ids `0..len_legalese` are registered as legalese
    ///   entries in the dictionary used to extract the "high" subsets.
    ///
    /// # Panics
    ///
    /// Panics if `len_legalese` exceeds the `u16` id range, instead of
    /// silently wrapping ids and registering colliding entries.
    fn create_rule_with_thresholds(
        text: String,
        tokens: Vec<u16>,
        minimum_coverage: Option<u8>,
        len_legalese: usize,
    ) -> Rule {
        let tokens: Vec<TokenId> = tokens.into_iter().map(TokenId::new).collect();
        let mut rule = Rule {
            identifier: "test.RULE".to_string(),
            license_expression: "mit".to_string(),
            text,
            tokens: tokens.clone(),
            rule_kind: crate::license_detection::models::RuleKind::None,
            is_false_positive: false,
            is_required_phrase: false,
            is_from_license: false,
            relevance: 100,
            minimum_coverage,
            has_stored_minimum_coverage: false,
            is_continuous: false,
            required_phrase_spans: vec![],
            stopwords_by_pos: HashMap::new(),
            referenced_filenames: None,
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: None,
            length_unique: 0,
            high_length_unique: 0,
            high_length: 0,
            min_matched_length: 0,
            min_high_matched_length: 0,
            min_matched_length_unique: 0,
            min_high_matched_length_unique: 0,
            is_small: false,
            is_tiny: false,
            starts_with_license: false,
            ends_with_license: false,
            is_deprecated: false,
            spdx_license_key: None,
            other_spdx_license_keys: vec![],
        };

        // One synthetic legalese word per id in 0..len_legalese. The id is
        // converted with a checked conversion so an oversized fixture fails
        // loudly rather than wrapping around.
        let legalese_entries: Vec<(String, u16)> = (0..len_legalese)
            .map(|i| {
                let id = u16::try_from(i).expect("legalese id must fit in u16");
                (format!("legalese-{i}"), id)
            })
            .collect();
        let dictionary = TokenDictionary::new_with_legalese_pairs(
            &legalese_entries
                .iter()
                .map(|(token, id)| (token.as_str(), *id))
                .collect::<Vec<_>>(),
        );
        let tids_set = TokenSet::from_token_ids(tokens.iter().copied());
        let tids_mset = TokenMultiset::from_token_ids(&tokens);
        // Presumably the subset of ids registered as legalese above; the
        // assertions in the tests below are consistent with that reading.
        let tids_set_high = tids_set.high_subset(&dictionary);
        let tids_mset_high = tids_mset.high_subset(&dictionary);

        rule.length_unique = tids_set.len();
        rule.high_length_unique = tids_set_high.len();
        rule.high_length = tids_mset_high.total_count();

        let (updated_coverage, min_len, min_high_len) =
            compute_thresholds_occurrences(rule.minimum_coverage, tokens.len(), rule.high_length);
        rule.minimum_coverage = updated_coverage;
        rule.min_matched_length = min_len;
        rule.min_high_matched_length = min_high_len;

        // The unique thresholds see the coverage as updated just above.
        let (min_len_unique, min_high_len_unique) = compute_thresholds_unique(
            rule.minimum_coverage,
            tokens.len(),
            rule.length_unique,
            rule.high_length_unique,
        );
        rule.min_matched_length_unique = min_len_unique;
        rule.min_high_matched_length_unique = min_high_len_unique;

        rule.is_tiny = tokens.len() < TINY_RULE;
        rule.is_small = tokens.len() < SMALL_RULE;

        rule
    }

    #[test]
    fn test_threshold_computation_with_explicit_coverage() {
        // Ten distinct tokens, nine of them below the legalese limit of 10.
        let tokens = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 11];
        let len_legalese = 10;
        let rule = create_rule_with_thresholds(
            "MIT License text".to_string(),
            tokens,
            Some(100),
            len_legalese,
        );

        // Explicit 100% coverage keeps every length as its own threshold.
        assert_eq!(rule.minimum_coverage, Some(100));
        assert_eq!(rule.min_matched_length, 10);
        assert_eq!(rule.min_high_matched_length, 9);
        assert_eq!(rule.min_matched_length_unique, 10);
        assert_eq!(rule.min_high_matched_length_unique, 9);
    }

    #[test]
    fn test_threshold_computation_full_pipeline_small_rule() {
        // Eight distinct tokens, six of them below the legalese limit of 10.
        let tokens = vec![1, 2, 3, 4, 6, 7, 12, 15];
        let len_legalese = 10;
        let rule = create_rule_with_thresholds(
            "MIT License text here".to_string(),
            tokens,
            None,
            len_legalese,
        );

        assert!(!rule.is_tiny);
        assert!(rule.is_small);
        assert_eq!(rule.length_unique, 8);
        assert_eq!(rule.high_length_unique, 6);
        assert_eq!(rule.high_length, 6);
        assert_eq!(rule.minimum_coverage, Some(80));
        assert_eq!(rule.min_matched_length, 8);
        assert_eq!(rule.min_high_matched_length, 6);
    }

    #[test]
    fn test_threshold_computation_full_pipeline_medium_rule() {
        // Ids 0..25: all distinct, the first ten below the legalese limit.
        let tokens: Vec<u16> = (0..25).collect();
        let len_legalese = 10;
        let rule = create_rule_with_thresholds(
            "MIT License text here with more words".to_string(),
            tokens,
            None,
            len_legalese,
        );

        assert!(!rule.is_tiny);
        assert!(!rule.is_small);
        assert_eq!(rule.length_unique, 25);
        assert_eq!(rule.high_length_unique, 10);
        assert_eq!(rule.high_length, 10);
        assert_eq!(rule.minimum_coverage, Some(50));
        assert_eq!(rule.min_matched_length, 12);
        assert_eq!(rule.min_high_matched_length, 3);
    }

    #[test]
    fn test_threshold_computation_full_pipeline_tiny_rule() {
        let tokens = vec![1, 2, 3];
        let len_legalese = 10;
        let rule =
            create_rule_with_thresholds("MIT License".to_string(), tokens, None, len_legalese);

        assert!(rule.is_tiny);
        assert!(rule.is_small);
        assert_eq!(rule.length_unique, 3);
        assert_eq!(rule.high_length_unique, 3);
        assert_eq!(rule.high_length, 3);
        // A length of exactly 3 is outside the `length < 3` bucket, so the
        // coverage is 80 rather than 100.
        assert_eq!(rule.minimum_coverage, Some(80));
        assert_eq!(rule.min_matched_length, 3);
        assert_eq!(rule.min_high_matched_length, 3);
    }

    #[test]
    fn test_threshold_computation_unique_token_counts() {
        // Three distinct ids repeated four times each.
        let tokens = vec![1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3];
        let len_legalese = 10;
        let rule = create_rule_with_thresholds(
            "MIT License MIT License MIT".to_string(),
            tokens,
            None,
            len_legalese,
        );

        // Unique counts collapse repeats; the occurrence count does not.
        assert_eq!(rule.length_unique, 3);
        assert_eq!(rule.high_length_unique, 3);
        assert_eq!(rule.high_length, 12);
    }

    #[test]
    fn test_threshold_computation_no_high_tokens() {
        // Ids 10..20 all sit at or above the legalese limit of 10.
        let tokens: Vec<u16> = (10..20).collect();
        let len_legalese = 10;
        let rule = create_rule_with_thresholds(
            "Some text without legal words".to_string(),
            tokens,
            None,
            len_legalese,
        );

        assert_eq!(rule.high_length_unique, 0);
        assert_eq!(rule.high_length, 0);
        assert_eq!(rule.min_high_matched_length, 0);
    }
}