Skip to main content

provenant/license_detection/rules/
thresholds.rs

1//! Compute match thresholds for license detection rules.
2
/// Minimum match length for token-based matching.
///
/// Returned as the minimum matched length for mid-sized rules: rules of
/// 30 to 199 tokens in `compute_thresholds_occurrences`, and rules of
/// 20 to 200 tokens in `compute_thresholds_unique`.
pub const MIN_MATCH_LENGTH: usize = 4;

/// Minimum match length for high-value (legalese) token matching.
///
/// Acts as an upper cap on the required number of matched legalese tokens:
/// both threshold functions take `min(high count, MIN_MATCH_HIGH_LENGTH)` for
/// their mid-sized rule bands, so rules with fewer legalese tokens than this
/// value keep their own (smaller) count as the requirement.
pub const MIN_MATCH_HIGH_LENGTH: usize = 3;

/// Rules shorter than this are considered "small" (exact match only).
///
/// The comparison used by the test helper in this file is strict: a rule is
/// small when its token count is `< SMALL_RULE`.
pub const SMALL_RULE: usize = 15;

/// Rules shorter than this are considered "tiny" (very short, special handling).
///
/// The comparison used by the test helper in this file is strict: a rule is
/// tiny when its token count is `< TINY_RULE`. Since `TINY_RULE < SMALL_RULE`,
/// every tiny rule is also small.
pub const TINY_RULE: usize = 6;
14
15/// Compute thresholds considering the occurrence of all tokens.
16///
17/// This function computes the minimum match thresholds based on the total
18/// length of the rule and the count of high-value (legalese) tokens.
19///
20/// # Arguments
21///
22/// * `minimum_coverage` - Required coverage percentage (0-100), None if not specified
23/// * `length` - Total number of tokens in the rule
24/// * `high_length` - Total count of legalese token occurrences
25///
26/// # Returns
27///
28/// A tuple of (updated_minimum_coverage, min_matched_length, min_high_matched_length)
29pub fn compute_thresholds_occurrences(
30    minimum_coverage: Option<u8>,
31    length: usize,
32    high_length: usize,
33) -> (Option<u8>, usize, usize) {
34    if minimum_coverage == Some(100) {
35        return (minimum_coverage, length, high_length);
36    }
37
38    let (min_matched_length, min_high_matched_length, updated_coverage) = if length < 3 {
39        (length, high_length, Some(100))
40    } else if length < 10 {
41        (length, high_length, Some(80))
42    } else if length < 30 {
43        (length / 2, high_length.min(MIN_MATCH_HIGH_LENGTH), Some(50))
44    } else if length < 200 {
45        (
46            MIN_MATCH_LENGTH,
47            high_length.min(MIN_MATCH_HIGH_LENGTH),
48            minimum_coverage,
49        )
50    } else {
51        (length / 10, high_length / 10, minimum_coverage)
52    };
53
54    (
55        updated_coverage,
56        min_matched_length,
57        min_high_matched_length,
58    )
59}
60
61/// Compute thresholds considering the occurrence of only unique tokens.
62///
63/// This function computes the minimum match thresholds based on the number of
64/// unique tokens in the rule and the count of unique high-value (legalese) tokens.
65///
66/// # Arguments
67///
68/// * `minimum_coverage` - Required coverage percentage (0-100), None if not specified
69/// * `length` - Total number of tokens in the rule
70/// * `length_unique` - Count of unique token IDs in the rule
71/// * `high_length_unique` - Count of unique legalese token IDs
72///
73/// # Returns
74///
75/// A tuple of (min_matched_length_unique, min_high_matched_length_unique)
76pub fn compute_thresholds_unique(
77    minimum_coverage: Option<u8>,
78    length: usize,
79    length_unique: usize,
80    high_length_unique: usize,
81) -> (usize, usize) {
82    if minimum_coverage == Some(100) {
83        return (length_unique, high_length_unique);
84    }
85
86    if length > 200 {
87        (length / 10, high_length_unique / 10)
88    } else if length < 5 {
89        (length_unique, high_length_unique)
90    } else if length < 10 {
91        let min_matched = if length_unique < 2 {
92            length_unique
93        } else {
94            length_unique - 1
95        };
96        (min_matched, high_length_unique)
97    } else if length < 20 {
98        (high_length_unique, high_length_unique)
99    } else {
100        let half = high_length_unique / 2;
101        let high_u = if half > 0 { half } else { high_length_unique };
102        (MIN_MATCH_LENGTH, high_u.min(MIN_MATCH_HIGH_LENGTH))
103    }
104}
105
#[cfg(test)]
mod tests {
    //! Unit tests pinning one representative input per size band of the two
    //! threshold functions. Each test compares the whole returned tuple.

    use super::*;

    /// Explicit 100% coverage returns the full lengths unchanged.
    #[test]
    fn test_compute_thresholds_occurrences_100_coverage() {
        let thresholds = compute_thresholds_occurrences(Some(100), 50, 20);
        assert_eq!(thresholds, (Some(100), 50, 20));
    }

    /// Fewer than 3 tokens: coverage is forced to 100 and everything must match.
    #[test]
    fn test_compute_thresholds_occurrences_tiny_rule() {
        let thresholds = compute_thresholds_occurrences(None, 2, 1);
        assert_eq!(thresholds, (Some(100), 2, 1));
    }

    /// 3 to 9 tokens: coverage is forced to 80 with full-length thresholds.
    #[test]
    fn test_compute_thresholds_occurrences_small_rule() {
        let thresholds = compute_thresholds_occurrences(None, 8, 3);
        assert_eq!(thresholds, (Some(80), 8, 3));
    }

    /// 10 to 29 tokens: coverage 50, half the length, capped legalese count.
    #[test]
    fn test_compute_thresholds_occurrences_medium_rule() {
        let thresholds = compute_thresholds_occurrences(None, 25, 10);
        assert_eq!(thresholds, (Some(50), 12, 3));
    }

    /// 30 to 199 tokens: coverage untouched, constant minimums.
    #[test]
    fn test_compute_thresholds_occurrences_large_rule() {
        let thresholds = compute_thresholds_occurrences(None, 100, 40);
        assert_eq!(thresholds, (None, 4, 3));
    }

    /// 200 tokens and up: coverage untouched, a tenth of each count.
    #[test]
    fn test_compute_thresholds_occurrences_very_large_rule() {
        let thresholds = compute_thresholds_occurrences(None, 500, 200);
        assert_eq!(thresholds, (None, 50, 20));
    }

    /// Explicit 100% coverage requires every unique token.
    #[test]
    fn test_compute_thresholds_unique_100_coverage() {
        let thresholds = compute_thresholds_unique(Some(100), 50, 30, 15);
        assert_eq!(thresholds, (30, 15));
    }

    /// More than 200 tokens: a tenth of the total length and of the unique legalese count.
    #[test]
    fn test_compute_thresholds_unique_very_large() {
        let thresholds = compute_thresholds_unique(None, 500, 300, 150);
        assert_eq!(thresholds, (50, 15));
    }

    /// Fewer than 5 tokens: unique counts are returned as-is.
    #[test]
    fn test_compute_thresholds_unique_tiny() {
        let thresholds = compute_thresholds_unique(None, 3, 2, 1);
        assert_eq!(thresholds, (2, 1));
    }

    /// 5 to 9 tokens: one unique token of slack, all unique legalese required.
    #[test]
    fn test_compute_thresholds_unique_small() {
        let thresholds = compute_thresholds_unique(None, 8, 5, 3);
        assert_eq!(thresholds, (4, 3));
    }

    /// 10 to 19 tokens: both thresholds equal the unique legalese count.
    #[test]
    fn test_compute_thresholds_unique_medium() {
        let thresholds = compute_thresholds_unique(None, 15, 10, 5);
        assert_eq!(thresholds, (5, 5));
    }

    /// 20 to 200 tokens: constant minimum length, capped legalese requirement.
    #[test]
    fn test_compute_thresholds_unique_large() {
        let thresholds = compute_thresholds_unique(None, 100, 40, 20);
        assert_eq!(thresholds, (4, 3));
    }

    /// Guards against accidental edits to the tuning constants.
    #[test]
    fn test_constants() {
        assert_eq!((MIN_MATCH_LENGTH, MIN_MATCH_HIGH_LENGTH), (4, 3));
        assert_eq!((SMALL_RULE, TINY_RULE), (15, 6));
    }
}
208
#[cfg(test)]
mod integration_tests {
    use super::super::super::index::dictionary::{TokenDictionary, TokenId};
    use super::super::super::models::Rule;
    use super::*;
    use crate::license_detection::{TokenMultiset, TokenSet};
    use std::collections::HashMap;

    /// Build a `Rule` from raw token IDs with every derived count, threshold
    /// and size flag filled in.
    ///
    /// A throwaway dictionary is created in which the IDs `0..len_legalese`
    /// are registered as legalese entries; the unique and occurrence counts
    /// of "high" tokens are then taken from the token set / multiset
    /// restricted to that dictionary. The occurrence thresholds are computed
    /// first, and the coverage they return is what feeds the unique
    /// thresholds.
    fn create_rule_with_thresholds(
        text: String,
        tokens: Vec<u16>,
        minimum_coverage: Option<u8>,
        len_legalese: usize,
    ) -> Rule {
        let token_ids: Vec<TokenId> = tokens.into_iter().map(TokenId::new).collect();
        let length = token_ids.len();

        // Mock dictionary: one named legalese entry per ID below `len_legalese`.
        let legalese_names: Vec<(String, u16)> = (0..len_legalese)
            .map(|id| (format!("legalese-{id}"), id as u16))
            .collect();
        let legalese_pairs: Vec<(&str, u16)> = legalese_names
            .iter()
            .map(|(name, id)| (name.as_str(), *id))
            .collect();
        let dictionary = TokenDictionary::new_with_legalese(&legalese_pairs);

        // Token counts: unique IDs, unique legalese IDs, legalese occurrences.
        let unique_ids = TokenSet::from_token_ids(token_ids.iter().copied());
        let id_counts = TokenMultiset::from_token_ids(&token_ids);
        let length_unique = unique_ids.len();
        let high_length_unique = unique_ids.high_subset(&dictionary).len();
        let high_length = id_counts.high_subset(&dictionary).total_count();

        // Occurrence thresholds may rewrite the coverage; the rewritten value
        // is the one handed to the unique-token computation.
        let (minimum_coverage, min_matched_length, min_high_matched_length) =
            compute_thresholds_occurrences(minimum_coverage, length, high_length);
        let (min_matched_length_unique, min_high_matched_length_unique) =
            compute_thresholds_unique(minimum_coverage, length, length_unique, high_length_unique);

        Rule {
            identifier: "test.RULE".to_string(),
            license_expression: "mit".to_string(),
            text,
            tokens: token_ids,
            rule_kind: crate::license_detection::models::RuleKind::None,
            is_false_positive: false,
            is_required_phrase: false,
            is_from_license: false,
            relevance: 100,
            minimum_coverage,
            has_stored_minimum_coverage: false,
            is_continuous: false,
            required_phrase_spans: vec![],
            stopwords_by_pos: HashMap::new(),
            referenced_filenames: None,
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: None,
            length_unique,
            high_length_unique,
            high_length,
            min_matched_length,
            min_high_matched_length,
            min_matched_length_unique,
            min_high_matched_length_unique,
            // Size classification uses strict comparisons on the token count.
            is_small: length < SMALL_RULE,
            is_tiny: length < TINY_RULE,
            starts_with_license: false,
            ends_with_license: false,
            is_deprecated: false,
            spdx_license_key: None,
            other_spdx_license_keys: vec![],
        }
    }

    #[test]
    fn test_threshold_computation_with_explicit_coverage() {
        // With 10 legalese IDs (0-9), the trailing 11 is the only ordinary
        // token; explicit 100% coverage must require every token.
        let rule = create_rule_with_thresholds(
            String::from("MIT License text"),
            vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 11],
            Some(100),
            10,
        );

        assert_eq!(rule.minimum_coverage, Some(100));
        assert_eq!(rule.min_matched_length, 10);
        // IDs 1-9 are the legalese ones.
        assert_eq!(rule.min_high_matched_length, 9);
        assert_eq!(rule.min_matched_length_unique, 10);
        assert_eq!(rule.min_high_matched_length_unique, 9);
    }

    #[test]
    fn test_threshold_computation_full_pipeline_small_rule() {
        // Eight tokens: six legalese IDs (1,2,3,4,6,7) plus 12 and 15.
        let rule = create_rule_with_thresholds(
            String::from("MIT License text here"),
            vec![1, 2, 3, 4, 6, 7, 12, 15],
            None,
            10,
        );

        assert!(!rule.is_tiny);
        assert!(rule.is_small);
        assert_eq!(rule.length_unique, 8);
        assert_eq!(rule.high_length_unique, 6);
        assert_eq!(rule.high_length, 6);
        assert_eq!(rule.minimum_coverage, Some(80));
        assert_eq!(rule.min_matched_length, 8);
        assert_eq!(rule.min_high_matched_length, 6);
    }

    #[test]
    fn test_threshold_computation_full_pipeline_medium_rule() {
        // Twenty-five distinct tokens, of which IDs 0-9 are legalese.
        let rule = create_rule_with_thresholds(
            String::from("MIT License text here with more words"),
            (0..25).collect(),
            None,
            10,
        );

        assert!(!rule.is_tiny);
        assert!(!rule.is_small);
        assert_eq!(rule.length_unique, 25);
        assert_eq!(rule.high_length_unique, 10);
        assert_eq!(rule.high_length, 10);
        assert_eq!(rule.minimum_coverage, Some(50));
        assert_eq!(rule.min_matched_length, 12);
        assert_eq!(rule.min_high_matched_length, 3);
    }

    #[test]
    fn test_threshold_computation_full_pipeline_tiny_rule() {
        // Three legalese tokens. Matching the Python behavior, only rules
        // shorter than 3 tokens get 100% coverage; 3 to 9 tokens get 80%.
        let rule =
            create_rule_with_thresholds(String::from("MIT License"), vec![1, 2, 3], None, 10);

        // Three tokens is below both TINY_RULE (6) and SMALL_RULE (15).
        assert!(rule.is_tiny);
        assert!(rule.is_small);
        assert_eq!(rule.length_unique, 3);
        assert_eq!(rule.high_length_unique, 3);
        assert_eq!(rule.high_length, 3);
        assert_eq!(rule.minimum_coverage, Some(80));
        assert_eq!(rule.min_matched_length, 3);
        assert_eq!(rule.min_high_matched_length, 3);
    }

    #[test]
    fn test_threshold_computation_unique_token_counts() {
        // The same three legalese tokens repeated four times.
        let rule = create_rule_with_thresholds(
            String::from("MIT License MIT License MIT"),
            vec![1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3],
            None,
            10,
        );

        // Unique counts collapse the repeats...
        assert_eq!(rule.length_unique, 3);
        assert_eq!(rule.high_length_unique, 3);
        // ...while the occurrence count keeps all twelve.
        assert_eq!(rule.high_length, 12);
    }

    #[test]
    fn test_threshold_computation_no_high_tokens() {
        // Every ID is at or above the legalese cutoff, so nothing is "high".
        let rule = create_rule_with_thresholds(
            String::from("Some text without legal words"),
            (10..20).collect(),
            None,
            10,
        );

        assert_eq!(rule.high_length_unique, 0);
        assert_eq!(rule.high_length, 0);
        assert_eq!(rule.min_high_matched_length, 0);
    }
}
426}