Skip to main content

provenant/license_detection/rules/
thresholds.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Compute match thresholds for license detection rules.
5
/// Floor on the number of matched tokens required for a match.
///
/// Used as the `min_matched_length` threshold for mid-sized rules by
/// [`compute_thresholds_occurrences`] and [`compute_thresholds_unique`].
pub const MIN_MATCH_LENGTH: usize = 4;

/// Floor on the number of matched high-value (legalese) tokens.
///
/// The threshold functions cap the required legalese count at this value for
/// mid-sized rules, so a rule with many legalese tokens never needs more than
/// this many of them to match.
pub const MIN_MATCH_HIGH_LENGTH: usize = 3;

/// Token-count cutoff below which a rule is classified as "small"
/// (small rules are only matched exactly).
pub const SMALL_RULE: usize = 15;

/// Token-count cutoff below which a rule is classified as "tiny"
/// (very short rules that receive special handling).
pub const TINY_RULE: usize = 6;
17
18/// Compute thresholds considering the occurrence of all tokens.
19///
20/// This function computes the minimum match thresholds based on the total
21/// length of the rule and the count of high-value (legalese) tokens.
22///
23/// # Arguments
24///
25/// * `minimum_coverage` - Required coverage percentage (0-100), None if not specified
26/// * `length` - Total number of tokens in the rule
27/// * `high_length` - Total count of legalese token occurrences
28///
29/// # Returns
30///
31/// A tuple of (updated_minimum_coverage, min_matched_length, min_high_matched_length)
32pub fn compute_thresholds_occurrences(
33    minimum_coverage: Option<u8>,
34    length: usize,
35    high_length: usize,
36) -> (Option<u8>, usize, usize) {
37    if minimum_coverage == Some(100) {
38        return (minimum_coverage, length, high_length);
39    }
40
41    let (min_matched_length, min_high_matched_length, updated_coverage) = if length < 3 {
42        (length, high_length, Some(100))
43    } else if length < 10 {
44        (length, high_length, Some(80))
45    } else if length < 30 {
46        (length / 2, high_length.min(MIN_MATCH_HIGH_LENGTH), Some(50))
47    } else if length < 200 {
48        (
49            MIN_MATCH_LENGTH,
50            high_length.min(MIN_MATCH_HIGH_LENGTH),
51            minimum_coverage,
52        )
53    } else {
54        (length / 10, high_length / 10, minimum_coverage)
55    };
56
57    (
58        updated_coverage,
59        min_matched_length,
60        min_high_matched_length,
61    )
62}
63
64/// Compute thresholds considering the occurrence of only unique tokens.
65///
66/// This function computes the minimum match thresholds based on the number of
67/// unique tokens in the rule and the count of unique high-value (legalese) tokens.
68///
69/// # Arguments
70///
71/// * `minimum_coverage` - Required coverage percentage (0-100), None if not specified
72/// * `length` - Total number of tokens in the rule
73/// * `length_unique` - Count of unique token IDs in the rule
74/// * `high_length_unique` - Count of unique legalese token IDs
75///
76/// # Returns
77///
78/// A tuple of (min_matched_length_unique, min_high_matched_length_unique)
79pub fn compute_thresholds_unique(
80    minimum_coverage: Option<u8>,
81    length: usize,
82    length_unique: usize,
83    high_length_unique: usize,
84) -> (usize, usize) {
85    if minimum_coverage == Some(100) {
86        return (length_unique, high_length_unique);
87    }
88
89    if length > 200 {
90        (length / 10, high_length_unique / 10)
91    } else if length < 5 {
92        (length_unique, high_length_unique)
93    } else if length < 10 {
94        let min_matched = if length_unique < 2 {
95            length_unique
96        } else {
97            length_unique - 1
98        };
99        (min_matched, high_length_unique)
100    } else if length < 20 {
101        (high_length_unique, high_length_unique)
102    } else {
103        let half = high_length_unique / 2;
104        let high_u = if half > 0 { half } else { high_length_unique };
105        (MIN_MATCH_LENGTH, high_u.min(MIN_MATCH_HIGH_LENGTH))
106    }
107}
108
#[cfg(test)]
mod tests {
    //! Unit tests pinning one representative input per threshold tier.
    //! Each test compares the full returned tuple in a single assertion.

    use super::*;

    /// Explicit 100% coverage returns the inputs unchanged.
    #[test]
    fn test_compute_thresholds_occurrences_100_coverage() {
        assert_eq!(
            compute_thresholds_occurrences(Some(100), 50, 20),
            (Some(100), 50, 20)
        );
    }

    /// Fewer than 3 tokens: everything must match and coverage becomes 100.
    #[test]
    fn test_compute_thresholds_occurrences_tiny_rule() {
        assert_eq!(
            compute_thresholds_occurrences(None, 2, 1),
            (Some(100), 2, 1)
        );
    }

    /// Between 3 and 9 tokens: everything must match and coverage becomes 80.
    #[test]
    fn test_compute_thresholds_occurrences_small_rule() {
        assert_eq!(
            compute_thresholds_occurrences(None, 8, 3),
            (Some(80), 8, 3)
        );
    }

    /// Between 10 and 29 tokens: half the tokens, capped legalese, coverage 50.
    #[test]
    fn test_compute_thresholds_occurrences_medium_rule() {
        assert_eq!(
            compute_thresholds_occurrences(None, 25, 10),
            (Some(50), 12, 3)
        );
    }

    /// Between 30 and 199 tokens: fixed floors, coverage left as given.
    #[test]
    fn test_compute_thresholds_occurrences_large_rule() {
        assert_eq!(compute_thresholds_occurrences(None, 100, 40), (None, 4, 3));
    }

    /// 200 tokens and up: a tenth of each count, coverage left as given.
    #[test]
    fn test_compute_thresholds_occurrences_very_large_rule() {
        assert_eq!(
            compute_thresholds_occurrences(None, 500, 200),
            (None, 50, 20)
        );
    }

    /// Explicit 100% coverage returns the unique counts unchanged.
    #[test]
    fn test_compute_thresholds_unique_100_coverage() {
        assert_eq!(compute_thresholds_unique(Some(100), 50, 30, 15), (30, 15));
    }

    /// More than 200 tokens: a tenth of the length and of the unique legalese.
    #[test]
    fn test_compute_thresholds_unique_very_large() {
        assert_eq!(compute_thresholds_unique(None, 500, 300, 150), (50, 15));
    }

    /// Fewer than 5 tokens: all unique tokens are required.
    #[test]
    fn test_compute_thresholds_unique_tiny() {
        assert_eq!(compute_thresholds_unique(None, 3, 2, 1), (2, 1));
    }

    /// Between 5 and 9 tokens: one unique token may be missing.
    #[test]
    fn test_compute_thresholds_unique_small() {
        assert_eq!(compute_thresholds_unique(None, 8, 5, 3), (4, 3));
    }

    /// Between 10 and 19 tokens: both thresholds equal the unique legalese count.
    #[test]
    fn test_compute_thresholds_unique_medium() {
        assert_eq!(compute_thresholds_unique(None, 15, 10, 5), (5, 5));
    }

    /// Between 20 and 200 tokens: fixed floor plus capped half of the legalese.
    #[test]
    fn test_compute_thresholds_unique_large() {
        assert_eq!(compute_thresholds_unique(None, 100, 40, 20), (4, 3));
    }

    /// Guards against accidental edits to the tuning constants.
    #[test]
    fn test_constants() {
        assert_eq!(
            (
                MIN_MATCH_LENGTH,
                MIN_MATCH_HIGH_LENGTH,
                SMALL_RULE,
                TINY_RULE
            ),
            (4, 3, 15, 6)
        );
    }
}
211
#[cfg(test)]
mod integration_tests {
    //! Tests that run both threshold functions against a populated `Rule`,
    //! deriving the token counts from real token sets rather than passing
    //! hand-computed numbers.

    use super::super::super::index::dictionary::{TokenDictionary, TokenId};
    use super::super::super::models::Rule;
    use super::*;
    use crate::license_detection::{TokenMultiset, TokenSet};
    use std::collections::HashMap;

    /// Build a `Rule` from raw token IDs and fill in its derived fields.
    ///
    /// The helper performs these steps:
    ///
    /// 1. registers IDs `0..len_legalese` with the dictionary through
    ///    `TokenDictionary::new_with_legalese_pairs`;
    /// 2. derives `length_unique`, `high_length_unique` and `high_length` from
    ///    the token set / multiset and their `high_subset`s;
    /// 3. runs `compute_thresholds_occurrences`, stores the coverage it
    ///    returns, then feeds that updated coverage into
    ///    `compute_thresholds_unique`;
    /// 4. sets `is_tiny` / `is_small` by comparing the token count against
    ///    `TINY_RULE` / `SMALL_RULE`.
    ///
    /// # Arguments
    ///
    /// * `text` - Rule text stored on the rule; not used by the computation here
    /// * `tokens` - Raw token IDs making up the rule
    /// * `minimum_coverage` - Coverage requirement the rule starts with
    /// * `len_legalese` - Number of leading IDs registered as legalese
    fn create_rule_with_thresholds(
        text: String,
        tokens: Vec<u16>,
        minimum_coverage: Option<u8>,
        len_legalese: usize,
    ) -> Rule {
        let tokens: Vec<TokenId> = tokens.into_iter().map(TokenId::new).collect();
        let mut rule = Rule {
            identifier: "test.RULE".to_string(),
            license_expression: "mit".to_string(),
            text,
            // Cloned because `tokens` is still needed below to build the sets.
            tokens: tokens.clone(),
            rule_kind: crate::license_detection::models::RuleKind::None,
            is_false_positive: false,
            is_required_phrase: false,
            is_from_license: false,
            relevance: 100,
            minimum_coverage,
            has_stored_minimum_coverage: false,
            is_continuous: false,
            required_phrase_spans: vec![],
            stopwords_by_pos: HashMap::new(),
            referenced_filenames: None,
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: None,
            length_unique: 0,
            high_length_unique: 0,
            high_length: 0,
            min_matched_length: 0,
            min_high_matched_length: 0,
            min_matched_length_unique: 0,
            min_high_matched_length_unique: 0,
            is_small: false,
            is_tiny: false,
            starts_with_license: false,
            ends_with_license: false,
            is_deprecated: false,
            spdx_license_key: None,
            other_spdx_license_keys: vec![],
        };

        // Build token sets and multisets.
        // The `as u16` narrowing is fine here: every caller in this module
        // passes `len_legalese = 10`.
        let legalese_entries: Vec<(String, u16)> = (0..len_legalese)
            .map(|i| (format!("legalese-{i}"), i as u16))
            .collect();
        let dictionary = TokenDictionary::new_with_legalese_pairs(
            &legalese_entries
                .iter()
                .map(|(token, id)| (token.as_str(), *id))
                .collect::<Vec<_>>(),
        );
        let tids_set = TokenSet::from_token_ids(tokens.iter().copied());
        let tids_mset = TokenMultiset::from_token_ids(&tokens);
        let tids_set_high = tids_set.high_subset(&dictionary);
        let tids_mset_high = tids_mset.high_subset(&dictionary);

        // Compute token counts
        rule.length_unique = tids_set.len();
        rule.high_length_unique = tids_set_high.len();
        rule.high_length = tids_mset_high.total_count();

        // Compute occurrence-based thresholds; the coverage may be rewritten.
        let (updated_coverage, min_len, min_high_len) =
            compute_thresholds_occurrences(rule.minimum_coverage, tokens.len(), rule.high_length);
        rule.minimum_coverage = updated_coverage;
        rule.min_matched_length = min_len;
        rule.min_high_matched_length = min_high_len;

        // Unique thresholds deliberately see the coverage updated just above.
        let (min_len_unique, min_high_len_unique) = compute_thresholds_unique(
            rule.minimum_coverage,
            tokens.len(),
            rule.length_unique,
            rule.high_length_unique,
        );
        rule.min_matched_length_unique = min_len_unique;
        rule.min_high_matched_length_unique = min_high_len_unique;

        // Rule classification
        rule.is_tiny = tokens.len() < TINY_RULE;
        rule.is_small = tokens.len() < SMALL_RULE;

        rule
    }

    #[test]
    fn test_threshold_computation_with_explicit_coverage() {
        // Rule with explicit 100% coverage
        // Note: len_legalese=10 means IDs 0-9 are legalese, so token 11 is NOT legalese
        let tokens = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 11];
        let len_legalese = 10;
        let rule = create_rule_with_thresholds(
            "MIT License text".to_string(),
            tokens,
            Some(100),
            len_legalese,
        );

        assert_eq!(rule.minimum_coverage, Some(100));
        assert_eq!(rule.min_matched_length, 10);
        assert_eq!(rule.min_high_matched_length, 9); // Only 1-9 are legalese
        assert_eq!(rule.min_matched_length_unique, 10);
        assert_eq!(rule.min_high_matched_length_unique, 9);
    }

    #[test]
    fn test_threshold_computation_full_pipeline_small_rule() {
        // Small rule with 8 tokens
        let tokens = vec![1, 2, 3, 4, 6, 7, 12, 15]; // Some legalese (0-9), some not
        let len_legalese = 10;
        let rule = create_rule_with_thresholds(
            "MIT License text here".to_string(),
            tokens,
            None,
            len_legalese,
        );

        assert!(!rule.is_tiny);
        assert!(rule.is_small);
        assert_eq!(rule.length_unique, 8);
        assert_eq!(rule.high_length_unique, 6); // Tokens 1,2,3,4,6,7 are legalese
        assert_eq!(rule.high_length, 6);
        assert_eq!(rule.minimum_coverage, Some(80));
        assert_eq!(rule.min_matched_length, 8);
        assert_eq!(rule.min_high_matched_length, 6);
    }

    #[test]
    fn test_threshold_computation_full_pipeline_medium_rule() {
        // Medium rule with 25 tokens
        let tokens: Vec<u16> = (0..25).collect(); // IDs 0-9 are legalese, 10-24 are not
        let len_legalese = 10;
        let rule = create_rule_with_thresholds(
            "MIT License text here with more words".to_string(),
            tokens,
            None,
            len_legalese,
        );

        assert!(!rule.is_tiny);
        assert!(!rule.is_small);
        assert_eq!(rule.length_unique, 25);
        assert_eq!(rule.high_length_unique, 10); // Only 0-9 are legalese
        assert_eq!(rule.high_length, 10);
        assert_eq!(rule.minimum_coverage, Some(50));
        assert_eq!(rule.min_matched_length, 12);
        assert_eq!(rule.min_high_matched_length, 3);
    }

    #[test]
    fn test_threshold_computation_full_pipeline_tiny_rule() {
        // Tiny rule with 3 tokens
        // Note: In Python, length >= 3 AND length < 10 gives 80% coverage
        // Only length < 3 gives 100% coverage
        let tokens = vec![1, 2, 3]; // All legalese
        let len_legalese = 10;
        let rule =
            create_rule_with_thresholds("MIT License".to_string(), tokens, None, len_legalese);

        // TINY_RULE is 6, so a 3-token rule IS tiny and is_small
        assert!(rule.is_tiny);
        assert!(rule.is_small);
        assert_eq!(rule.length_unique, 3);
        assert_eq!(rule.high_length_unique, 3);
        assert_eq!(rule.high_length, 3);
        // For length >= 3 and < 10, coverage is 80%
        assert_eq!(rule.minimum_coverage, Some(80));
        assert_eq!(rule.min_matched_length, 3);
        assert_eq!(rule.min_high_matched_length, 3);
    }

    #[test]
    fn test_threshold_computation_unique_token_counts() {
        // Rule with repeated tokens to test unique counting
        let tokens = vec![1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3];
        let len_legalese = 10;
        let rule = create_rule_with_thresholds(
            "MIT License MIT License MIT".to_string(),
            tokens,
            None,
            len_legalese,
        );

        assert_eq!(rule.length_unique, 3); // Only 3 unique tokens
        assert_eq!(rule.high_length_unique, 3); // All are legalese
        assert_eq!(rule.high_length, 12); // But 12 total occurrences
    }

    #[test]
    fn test_threshold_computation_no_high_tokens() {
        // Rule with no legalese tokens (weak rule)
        let tokens: Vec<u16> = (10..20).collect(); // All IDs >= len_legalese
        let len_legalese = 10;
        let rule = create_rule_with_thresholds(
            "Some text without legal words".to_string(),
            tokens,
            None,
            len_legalese,
        );

        assert_eq!(rule.high_length_unique, 0);
        assert_eq!(rule.high_length, 0);
        assert_eq!(rule.min_high_matched_length, 0);
    }
}