/// Fixed minimum matched-token threshold.
///
/// `compute_thresholds_occurrences` returns it as the minimum matched length
/// for rules with `30 <= length < 200`, and `compute_thresholds_unique`
/// returns it as the minimum unique matched length for `20 <= length <= 200`.
pub const MIN_MATCH_LENGTH: usize = 4;

/// Upper bound applied to the "high" token thresholds of mid-sized rules.
///
/// Both threshold functions clamp their high-token result with
/// `.min(MIN_MATCH_HIGH_LENGTH)` in the branches that do not scale with the
/// rule length.
pub const MIN_MATCH_HIGH_LENGTH: usize = 3;

/// Token-count bound for "small" rules.
///
/// In this file it is only used by the test helper, which flags a rule as
/// small when its token count is strictly below this value.
/// NOTE(review): presumably the production rule loader applies the same
/// comparison -- confirm against the caller.
pub const SMALL_RULE: usize = 15;

/// Token-count bound for "tiny" rules.
///
/// In this file it is only used by the test helper, which flags a rule as
/// tiny when its token count is strictly below this value.
/// NOTE(review): presumably the production rule loader applies the same
/// comparison -- confirm against the caller.
pub const TINY_RULE: usize = 6;
14
15pub fn compute_thresholds_occurrences(
30 minimum_coverage: Option<u8>,
31 length: usize,
32 high_length: usize,
33) -> (Option<u8>, usize, usize) {
34 if minimum_coverage == Some(100) {
35 return (minimum_coverage, length, high_length);
36 }
37
38 let (min_matched_length, min_high_matched_length, updated_coverage) = if length < 3 {
39 (length, high_length, Some(100))
40 } else if length < 10 {
41 (length, high_length, Some(80))
42 } else if length < 30 {
43 (length / 2, high_length.min(MIN_MATCH_HIGH_LENGTH), Some(50))
44 } else if length < 200 {
45 (
46 MIN_MATCH_LENGTH,
47 high_length.min(MIN_MATCH_HIGH_LENGTH),
48 minimum_coverage,
49 )
50 } else {
51 (length / 10, high_length / 10, minimum_coverage)
52 };
53
54 (
55 updated_coverage,
56 min_matched_length,
57 min_high_matched_length,
58 )
59}
60
61pub fn compute_thresholds_unique(
77 minimum_coverage: Option<u8>,
78 length: usize,
79 length_unique: usize,
80 high_length_unique: usize,
81) -> (usize, usize) {
82 if minimum_coverage == Some(100) {
83 return (length_unique, high_length_unique);
84 }
85
86 if length > 200 {
87 (length / 10, high_length_unique / 10)
88 } else if length < 5 {
89 (length_unique, high_length_unique)
90 } else if length < 10 {
91 let min_matched = if length_unique < 2 {
92 length_unique
93 } else {
94 length_unique - 1
95 };
96 (min_matched, high_length_unique)
97 } else if length < 20 {
98 (high_length_unique, high_length_unique)
99 } else {
100 let half = high_length_unique / 2;
101 let high_u = if half > 0 { half } else { high_length_unique };
102 (MIN_MATCH_LENGTH, high_u.min(MIN_MATCH_HIGH_LENGTH))
103 }
104}
105
#[cfg(test)]
mod tests {
    use super::*;

    // Each test pins one size tier of the threshold functions by comparing the
    // whole returned tuple at once.

    #[test]
    fn test_compute_thresholds_occurrences_100_coverage() {
        // Full coverage returns the inputs untouched.
        assert_eq!(
            compute_thresholds_occurrences(Some(100), 50, 20),
            (Some(100), 50, 20)
        );
    }

    #[test]
    fn test_compute_thresholds_occurrences_tiny_rule() {
        // Fewer than 3 tokens: coverage is forced to 100.
        assert_eq!(
            compute_thresholds_occurrences(None, 2, 1),
            (Some(100), 2, 1)
        );
    }

    #[test]
    fn test_compute_thresholds_occurrences_small_rule() {
        // Fewer than 10 tokens: coverage is forced to 80, lengths unchanged.
        assert_eq!(
            compute_thresholds_occurrences(None, 8, 3),
            (Some(80), 8, 3)
        );
    }

    #[test]
    fn test_compute_thresholds_occurrences_medium_rule() {
        // Fewer than 30 tokens: half the length, high length capped.
        assert_eq!(
            compute_thresholds_occurrences(None, 25, 10),
            (Some(50), 12, 3)
        );
    }

    #[test]
    fn test_compute_thresholds_occurrences_large_rule() {
        // Fewer than 200 tokens: fixed minimums, coverage left as given.
        assert_eq!(
            compute_thresholds_occurrences(None, 100, 40),
            (None, 4, 3)
        );
    }

    #[test]
    fn test_compute_thresholds_occurrences_very_large_rule() {
        // 200 tokens or more: a tenth of each length.
        assert_eq!(
            compute_thresholds_occurrences(None, 500, 200),
            (None, 50, 20)
        );
    }

    #[test]
    fn test_compute_thresholds_unique_100_coverage() {
        // Full coverage returns the unique counts untouched.
        assert_eq!(compute_thresholds_unique(Some(100), 50, 30, 15), (30, 15));
    }

    #[test]
    fn test_compute_thresholds_unique_very_large() {
        // More than 200 tokens: a tenth of the length and of the high count.
        assert_eq!(compute_thresholds_unique(None, 500, 300, 150), (50, 15));
    }

    #[test]
    fn test_compute_thresholds_unique_tiny() {
        // Fewer than 5 tokens: unique counts unchanged.
        assert_eq!(compute_thresholds_unique(None, 3, 2, 1), (2, 1));
    }

    #[test]
    fn test_compute_thresholds_unique_small() {
        // Fewer than 10 tokens: one distinct token may be missing.
        assert_eq!(compute_thresholds_unique(None, 8, 5, 3), (4, 3));
    }

    #[test]
    fn test_compute_thresholds_unique_medium() {
        // Fewer than 20 tokens: the high unique count is used for both values.
        assert_eq!(compute_thresholds_unique(None, 15, 10, 5), (5, 5));
    }

    #[test]
    fn test_compute_thresholds_unique_large() {
        // Remaining tier: fixed minimum and capped half of the high count.
        assert_eq!(compute_thresholds_unique(None, 100, 40, 20), (4, 3));
    }

    #[test]
    fn test_constants() {
        // Guards against accidental edits of the tuning constants.
        assert_eq!(
            (MIN_MATCH_LENGTH, MIN_MATCH_HIGH_LENGTH, SMALL_RULE, TINY_RULE),
            (4, 3, 15, 6)
        );
    }
}
208
#[cfg(test)]
mod integration_tests {
    use super::super::super::index::dictionary::{TokenDictionary, TokenId};
    use super::super::super::models::Rule;
    use super::*;
    use crate::license_detection::{TokenMultiset, TokenSet};
    use std::collections::HashMap;

    /// Builds a test `Rule` whose length statistics and thresholds are derived
    /// from `tokens`.
    ///
    /// A dictionary is created with `len_legalese` legalese entries numbered
    /// `0..len_legalese`. The unique and occurrence counts of "high" tokens are
    /// taken from the `high_subset` of the rule's token set and multiset
    /// against that dictionary. The assertions in this module expect token ids
    /// below `len_legalese` to be the ones counted as high.
    ///
    /// The thresholds are computed in two steps. The coverage returned by
    /// `compute_thresholds_occurrences` is stored on the rule and is also the
    /// coverage passed on to `compute_thresholds_unique`.
    fn create_rule_with_thresholds(
        text: String,
        tokens: Vec<u16>,
        minimum_coverage: Option<u8>,
        len_legalese: usize,
    ) -> Rule {
        let token_ids: Vec<TokenId> = tokens.into_iter().map(TokenId::new).collect();
        let token_count = token_ids.len();

        // Legalese words "legalese-0", "legalese-1", ... with matching ids.
        // The cast is lossless for the small counts used by these tests.
        let legalese_words: Vec<(String, u16)> = (0..len_legalese)
            .map(|i| (format!("legalese-{i}"), i as u16))
            .collect();
        let legalese_refs = legalese_words
            .iter()
            .map(|(word, id)| (word.as_str(), *id))
            .collect::<Vec<_>>();
        let dictionary = TokenDictionary::new_with_legalese(&legalese_refs);

        // Distinct ids and per-id counts, plus their "high" subsets.
        let unique_ids = TokenSet::from_token_ids(token_ids.iter().copied());
        let id_counts = TokenMultiset::from_token_ids(&token_ids);
        let length_unique = unique_ids.len();
        let high_length_unique = unique_ids.high_subset(&dictionary).len();
        let high_length = id_counts.high_subset(&dictionary).total_count();

        // The occurrence pass may replace the coverage; the unique pass must
        // see the replaced value.
        let (minimum_coverage, min_matched_length, min_high_matched_length) =
            compute_thresholds_occurrences(minimum_coverage, token_count, high_length);
        let (min_matched_length_unique, min_high_matched_length_unique) =
            compute_thresholds_unique(
                minimum_coverage,
                token_count,
                length_unique,
                high_length_unique,
            );

        Rule {
            identifier: "test.RULE".to_string(),
            license_expression: "mit".to_string(),
            text,
            tokens: token_ids,
            rule_kind: crate::license_detection::models::RuleKind::None,
            is_false_positive: false,
            is_required_phrase: false,
            is_from_license: false,
            relevance: 100,
            minimum_coverage,
            has_stored_minimum_coverage: false,
            is_continuous: false,
            required_phrase_spans: vec![],
            stopwords_by_pos: HashMap::new(),
            referenced_filenames: None,
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: None,
            length_unique,
            high_length_unique,
            high_length,
            min_matched_length,
            min_high_matched_length,
            min_matched_length_unique,
            min_high_matched_length_unique,
            is_small: token_count < SMALL_RULE,
            is_tiny: token_count < TINY_RULE,
            starts_with_license: false,
            ends_with_license: false,
            is_deprecated: false,
            spdx_license_key: None,
            other_spdx_license_keys: vec![],
        }
    }

    #[test]
    fn test_threshold_computation_with_explicit_coverage() {
        // Ten tokens; nine of them have ids below `len_legalese`.
        let rule = create_rule_with_thresholds(
            "MIT License text".to_string(),
            vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 11],
            Some(100),
            10,
        );

        // Full coverage keeps every length as its own threshold.
        assert_eq!(rule.minimum_coverage, Some(100));
        assert_eq!(
            (rule.min_matched_length, rule.min_high_matched_length),
            (10, 9)
        );
        assert_eq!(
            (
                rule.min_matched_length_unique,
                rule.min_high_matched_length_unique
            ),
            (10, 9)
        );
    }

    #[test]
    fn test_threshold_computation_full_pipeline_small_rule() {
        // Eight tokens; six of them have ids below `len_legalese`.
        let rule = create_rule_with_thresholds(
            "MIT License text here".to_string(),
            vec![1, 2, 3, 4, 6, 7, 12, 15],
            None,
            10,
        );

        assert_eq!((rule.is_tiny, rule.is_small), (false, true));
        assert_eq!(
            (rule.length_unique, rule.high_length_unique, rule.high_length),
            (8, 6, 6)
        );
        assert_eq!(rule.minimum_coverage, Some(80));
        assert_eq!(
            (rule.min_matched_length, rule.min_high_matched_length),
            (8, 6)
        );
    }

    #[test]
    fn test_threshold_computation_full_pipeline_medium_rule() {
        // Twenty-five distinct tokens; ids 0..10 are below `len_legalese`.
        let token_ids: Vec<u16> = (0..25).collect();
        let rule = create_rule_with_thresholds(
            "MIT License text here with more words".to_string(),
            token_ids,
            None,
            10,
        );

        assert_eq!((rule.is_tiny, rule.is_small), (false, false));
        assert_eq!(
            (rule.length_unique, rule.high_length_unique, rule.high_length),
            (25, 10, 10)
        );
        assert_eq!(rule.minimum_coverage, Some(50));
        assert_eq!(
            (rule.min_matched_length, rule.min_high_matched_length),
            (12, 3)
        );
    }

    #[test]
    fn test_threshold_computation_full_pipeline_tiny_rule() {
        let rule =
            create_rule_with_thresholds("MIT License".to_string(), vec![1, 2, 3], None, 10);

        assert_eq!((rule.is_tiny, rule.is_small), (true, true));
        assert_eq!(
            (rule.length_unique, rule.high_length_unique, rule.high_length),
            (3, 3, 3)
        );
        // Three tokens is not below 3, so the 80% tier applies rather than 100%.
        assert_eq!(rule.minimum_coverage, Some(80));
        assert_eq!(
            (rule.min_matched_length, rule.min_high_matched_length),
            (3, 3)
        );
    }

    #[test]
    fn test_threshold_computation_unique_token_counts() {
        // Three distinct ids, each occurring four times.
        let rule = create_rule_with_thresholds(
            "MIT License MIT License MIT".to_string(),
            vec![1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3],
            None,
            10,
        );

        // Unique counts collapse repeats; the occurrence count does not.
        assert_eq!(rule.length_unique, 3);
        assert_eq!(rule.high_length_unique, 3);
        assert_eq!(rule.high_length, 12);
    }

    #[test]
    fn test_threshold_computation_no_high_tokens() {
        // Every id is at or above `len_legalese`.
        let token_ids: Vec<u16> = (10..20).collect();
        let rule = create_rule_with_thresholds(
            "Some text without legal words".to_string(),
            token_ids,
            None,
            10,
        );

        assert_eq!(rule.high_length_unique, 0);
        assert_eq!(rule.high_length, 0);
        assert_eq!(rule.min_high_matched_length, 0);
    }
}