Skip to main content

provenant/license_detection/match_refine/
mod.rs

1//! Match refinement - merge, filter, and finalize license matches.
2//!
3//! This module implements the final phase of license matching where raw matches
4//! from all strategies are combined, refined, and finalized.
5//!
6//! Based on the Python ScanCode Toolkit implementation at:
7//! reference/scancode-toolkit/src/licensedcode/match.py
8
9mod false_positive;
10pub(crate) mod filter_low_quality;
11mod handle_overlaps;
12mod merge;
13
14use crate::license_detection::index::LicenseIndex;
15use crate::license_detection::models::{LicenseMatch, MatcherKind};
16use crate::license_detection::query::Query;
17
18// Internal use only
19use filter_low_quality::{
20    filter_below_rule_minimum_coverage, filter_false_positive_matches,
21    filter_invalid_matches_to_single_word_gibberish, filter_matches_missing_required_phrases,
22    filter_matches_to_spurious_single_token, filter_short_matches_scattered_on_too_many_lines,
23    filter_spurious_matches, filter_too_short_matches,
24};
25use merge::{filter_license_references_with_text_match, update_match_scores};
26
27// Re-export for crate-internal use (debug_pipeline feature)
28pub use handle_overlaps::{
29    filter_contained_matches, filter_overlapping_matches, restore_non_overlapping,
30};
31pub use merge::merge_overlapping_matches;
32
33// Public API re-exports for investigation tests
34pub use false_positive::filter_false_positive_license_lists_matches;
35
/// Length threshold used by `split_weak_matches`: a `MatcherKind::Seq` match whose
/// `len()` is at most this value and whose `coverage()` is at most 25% is
/// classified as weak.
const SMALL_RULE: usize = 15;
37
38/// Filter unknown matches contained within good matches' qregion.
39///
40/// Unknown license matches that are fully contained within the qregion
41/// (token span from start_token to end_token) of a known good match
42/// should be discarded as they are redundant.
43///
44/// # Arguments
45/// * `unknown_matches` - Slice of unknown license matches to filter
46/// * `good_matches` - Slice of known good matches to check containment against
47///
48/// # Returns
49/// Vector of unknown LicenseMatch with contained matches removed
50///
51/// Based on Python: `filter_invalid_contained_unknown_matches()` (match.py:1904-1926)
52pub fn filter_invalid_contained_unknown_matches(
53    unknown_matches: &[LicenseMatch],
54    good_matches: &[LicenseMatch],
55) -> Vec<LicenseMatch> {
56    unknown_matches
57        .iter()
58        .filter(|unknown| {
59            let unknown_start = unknown.start_token;
60            let unknown_end = unknown.end_token;
61
62            let is_contained = good_matches
63                .iter()
64                .any(|good| good.start_token <= unknown_start && good.end_token >= unknown_end);
65
66            !is_contained
67        })
68        .cloned()
69        .collect()
70}
71
72/// Split matches into good and weak matches.
73///
74/// Weak matches are:
75/// - Matches to rules with "unknown" in their license expression
76/// - Sequence matches with len() <= SMALL_RULE (15) AND coverage <= 25%
77///
78/// Weak matches are set aside before unknown license matching and reinjected later.
79///
80/// # Arguments
81/// * `matches` - Slice of LicenseMatch to split
82///
83/// # Returns
84/// Tuple of (good_matches, weak_matches)
85///
86/// Based on Python: `split_weak_matches()` (match.py:1740-1765)
87pub fn split_weak_matches(
88    index: &LicenseIndex,
89    matches: &[LicenseMatch],
90) -> (Vec<LicenseMatch>, Vec<LicenseMatch>) {
91    let mut good = Vec::new();
92    let mut weak = Vec::new();
93
94    for m in matches {
95        let is_false_positive = index.false_positive_rids.contains(&m.rid);
96        let is_weak = (!is_false_positive && m.has_unknown())
97            || (m.matcher == MatcherKind::Seq && m.len() <= SMALL_RULE && m.coverage() <= 25.0);
98
99        if is_weak {
100            weak.push(m.clone());
101        } else {
102            good.push(m.clone());
103        }
104    }
105
106    (good, weak)
107}
108
109/// Main refinement function - applies all refinement operations to match results.
110///
111/// This is the main entry point for Phase 4.6 match refinement. It applies
112/// filters in the same order as Python's refine_matches():
113///
114/// 1. Filter matches missing required phrases
115/// 2. Filter spurious matches (low density)
116/// 3. Filter below rule minimum coverage
117/// 4. Filter spurious single-token matches
118/// 5. Filter too short matches
119/// 6. Filter scattered short matches
120/// 7. Filter invalid single-word gibberish (binary files)
121/// 8. Merge overlapping/adjacent matches
122/// 9. Filter contained matches
123/// 10. Filter overlapping matches
124/// 11. Restore non-overlapping discarded matches
125/// 12. Filter false positive matches
126/// 13. Filter false positive license list matches
127/// 14. Update match scores
128///
129/// The operations are applied in sequence to produce final refined matches.
130///
131/// # Arguments
132/// * `index` - LicenseIndex containing false_positive_rids and rules_by_rid
133/// * `matches` - Vector of raw LicenseMatch from all strategies
134/// * `query` - Query object for spurious/gibberish filtering
135///
136/// # Returns
137/// Vector of refined LicenseMatch ready for detection assembly
138///
139/// Based on Python: `refine_matches()` (lines 2691-2833)
140pub fn refine_matches(
141    index: &LicenseIndex,
142    matches: Vec<LicenseMatch>,
143    query: &Query,
144) -> Vec<LicenseMatch> {
145    refine_matches_internal(index, matches, query, true)
146}
147
148/// Initial refinement without false positive filtering.
149///
150/// Used before split_weak_matches and unknown detection.
151/// This matches Python's refine_matches with filter_false_positive=False.
152///
153/// Based on Python: `refine_matches()` at index.py:1073-1080
154pub fn refine_matches_without_false_positive_filter(
155    index: &LicenseIndex,
156    matches: Vec<LicenseMatch>,
157    query: &Query,
158) -> Vec<LicenseMatch> {
159    refine_matches_internal(index, matches, query, false)
160}
161
162/// Refine Aho-Corasick matches.
163///
164/// This matches Python's `get_exact_matches()` which calls `refine_matches()` with `merge=False`.
165/// Unlike full refinement, this:
166/// - Skips initial merge (merge=False)
167/// - Applies required phrase filtering
168/// - Applies all quality filters
169/// - Applies containment and overlap filtering with restore
170/// - Skips final merge (merge=False)
171///
172/// Based on Python: `get_exact_matches()` at index.py:691-696
173pub fn refine_aho_matches(
174    index: &LicenseIndex,
175    matches: Vec<LicenseMatch>,
176    query: &Query,
177) -> Vec<LicenseMatch> {
178    if matches.is_empty() {
179        return Vec::new();
180    }
181
182    let (with_required_phrases, _missing_phrases) =
183        filter_matches_missing_required_phrases(index, &matches, query);
184
185    let non_spurious = filter_spurious_matches(&with_required_phrases, query);
186
187    let above_min_cov = filter_below_rule_minimum_coverage(index, &non_spurious);
188
189    let non_single_spurious = filter_matches_to_spurious_single_token(&above_min_cov, query, 5);
190
191    let non_short = filter_too_short_matches(index, &non_single_spurious);
192
193    let non_scattered = filter_short_matches_scattered_on_too_many_lines(index, &non_short);
194
195    let non_gibberish =
196        filter_invalid_matches_to_single_word_gibberish(index, &non_scattered, query);
197
198    let merged_again = merge_overlapping_matches(&non_gibberish);
199
200    let merged_again = filter_binary_low_coverage_same_expression_seq_bridges(merged_again, query);
201
202    let (non_contained, discarded_contained) = filter_contained_matches(&merged_again);
203
204    let (kept, discarded_overlapping) = filter_overlapping_matches(non_contained, index);
205
206    let mut matches_after_first_restore = kept.clone();
207
208    if !discarded_contained.is_empty() {
209        let (restored_contained, _) = restore_non_overlapping(&kept, discarded_contained);
210        matches_after_first_restore.extend(restored_contained);
211    }
212
213    let mut final_matches = matches_after_first_restore.clone();
214
215    if !discarded_overlapping.is_empty() {
216        let (restored_overlapping, _) =
217            restore_non_overlapping(&matches_after_first_restore, discarded_overlapping);
218        final_matches.extend(restored_overlapping);
219    }
220
221    let (non_contained_final, _) = filter_contained_matches(&final_matches);
222
223    let filtered_refs = filter_license_references_with_text_match(&non_contained_final);
224
225    let mut final_scored = filtered_refs;
226    update_match_scores(&mut final_scored, query);
227
228    final_scored
229}
230
231fn refine_matches_internal(
232    index: &LicenseIndex,
233    matches: Vec<LicenseMatch>,
234    query: &Query,
235    filter_false_positive: bool,
236) -> Vec<LicenseMatch> {
237    if matches.is_empty() {
238        return Vec::new();
239    }
240
241    let merged = merge_overlapping_matches(&matches);
242
243    let (with_required_phrases, _missing_phrases) =
244        filter_matches_missing_required_phrases(index, &merged, query);
245
246    let non_spurious = filter_spurious_matches(&with_required_phrases, query);
247
248    let above_min_cov = filter_below_rule_minimum_coverage(index, &non_spurious);
249
250    let non_single_spurious = filter_matches_to_spurious_single_token(&above_min_cov, query, 5);
251
252    let non_short = filter_too_short_matches(index, &non_single_spurious);
253
254    let non_scattered = filter_short_matches_scattered_on_too_many_lines(index, &non_short);
255
256    let non_gibberish =
257        filter_invalid_matches_to_single_word_gibberish(index, &non_scattered, query);
258
259    let merged_again = merge_overlapping_matches(&non_gibberish);
260
261    let merged_again = filter_binary_low_coverage_same_expression_seq_bridges(merged_again, query);
262
263    let (non_contained, discarded_contained) = filter_contained_matches(&merged_again);
264
265    let (kept, discarded_overlapping) = filter_overlapping_matches(non_contained, index);
266
267    let mut matches_after_first_restore = kept.clone();
268
269    if !discarded_contained.is_empty() {
270        let (restored_contained, _) = restore_non_overlapping(&kept, discarded_contained);
271        matches_after_first_restore.extend(restored_contained);
272    }
273
274    let mut final_matches = matches_after_first_restore.clone();
275
276    if !discarded_overlapping.is_empty() {
277        let (restored_overlapping, _) =
278            restore_non_overlapping(&matches_after_first_restore, discarded_overlapping);
279        final_matches.extend(restored_overlapping);
280    }
281
282    let (non_contained_final, _) = filter_contained_matches(&final_matches);
283
284    let result = if filter_false_positive {
285        let non_fp = filter_false_positive_matches(index, &non_contained_final);
286        let (kept, _discarded) = filter_false_positive_license_lists_matches(non_fp);
287        kept
288    } else {
289        non_contained_final
290    };
291
292    let merged_final = merge_overlapping_matches(&result);
293
294    let filtered_refs = filter_license_references_with_text_match(&merged_final);
295
296    let mut final_scored = filtered_refs;
297    update_match_scores(&mut final_scored, query);
298
299    final_scored
300}
301
302fn filter_binary_low_coverage_same_expression_seq_bridges(
303    matches: Vec<LicenseMatch>,
304    query: &Query,
305) -> Vec<LicenseMatch> {
306    if !query.is_binary {
307        return matches;
308    }
309
310    matches
311        .iter()
312        .filter(|m| {
313            if m.matcher != MatcherKind::Seq || m.coverage() >= 90.0 {
314                return true;
315            }
316
317            !matches.iter().any(|other| {
318                other.matcher == MatcherKind::Aho
319                    && other.coverage() == 100.0
320                    && other.license_expression == m.license_expression
321                    && other.qoverlap(m) > 0
322                    && !m.qcontains(other)
323            })
324        })
325        .cloned()
326        .collect()
327}
328
329#[cfg(test)]
330mod tests {
331    use super::*;
332    use crate::license_detection::models::MatchCoordinates;
333    use crate::license_detection::models::position_span::PositionSpan;
334    use crate::models::LineNumber;
335    use crate::models::MatchScore;
336
337    fn parse_rule_id(rule_identifier: &str) -> Option<usize> {
338        let trimmed = rule_identifier.trim();
339        if let Some(stripped) = trimmed.strip_prefix('#') {
340            stripped.parse().ok()
341        } else {
342            trimmed.parse().ok()
343        }
344    }
345
346    fn create_test_match(
347        rule_identifier: &str,
348        start_line: usize,
349        end_line: usize,
350        score: MatchScore,
351        coverage: f32,
352        relevance: u8,
353    ) -> LicenseMatch {
354        let matched_len = end_line - start_line + 1;
355        let rule_len = matched_len;
356        let rid = parse_rule_id(rule_identifier).unwrap_or(0);
357        LicenseMatch {
358            rid,
359            license_expression: "mit".to_string(),
360            license_expression_spdx: Some("MIT".to_string()),
361            from_file: None,
362            start_line: LineNumber::new(start_line).unwrap(),
363            end_line: LineNumber::new(end_line).unwrap(),
364            start_token: start_line,
365            end_token: end_line + 1,
366            matcher: crate::license_detection::models::MatcherKind::Aho,
367            score,
368            matched_length: matched_len,
369            rule_length: rule_len,
370            match_coverage: coverage,
371            rule_relevance: relevance,
372            rule_identifier: rule_identifier.to_string(),
373            rule_url: "https://example.com".to_string(),
374            matched_text: None,
375            referenced_filenames: None,
376            rule_kind: crate::license_detection::models::RuleKind::None,
377            is_from_license: false,
378            rule_start_token: 0,
379            coordinates: MatchCoordinates::query_region(PositionSpan::range(
380                start_line,
381                end_line + 1,
382            )),
383            candidate_resemblance: 0.0,
384            candidate_containment: 0.0,
385        }
386    }
387
388    #[test]
389    fn test_refine_matches_full_pipeline() {
390        let mut index = LicenseIndex::with_legalese_count(10);
391        let _ = index.false_positive_rids.insert(99);
392
393        let mut m1 = create_test_match("#1", 1, 10, MatchScore::from_percentage(0.5), 100.0, 100);
394        m1.rule_length = 100;
395        m1.rule_start_token = 0;
396        m1.coordinates = MatchCoordinates::rule_aligned(
397            PositionSpan::range(1, 11),
398            PositionSpan::range(0, 10),
399            PositionSpan::empty(),
400        );
401        let mut m2 = create_test_match("#1", 5, 15, MatchScore::from_percentage(0.5), 100.0, 100);
402        m2.rule_length = 100;
403        m2.rule_start_token = 4;
404        m2.coordinates = MatchCoordinates::rule_aligned(
405            PositionSpan::range(5, 16),
406            PositionSpan::range(4, 15),
407            PositionSpan::empty(),
408        );
409        let mut m3 = create_test_match("#2", 20, 25, MatchScore::from_percentage(0.5), 100.0, 80);
410        m3.coordinates = MatchCoordinates::rule_aligned(
411            PositionSpan::range(20, 26),
412            PositionSpan::range(0, 6),
413            PositionSpan::empty(),
414        );
415        let mut m4 = create_test_match("#99", 30, 35, MatchScore::from_percentage(0.5), 100.0, 100);
416        m4.coordinates = MatchCoordinates::rule_aligned(
417            PositionSpan::range(30, 36),
418            PositionSpan::range(0, 6),
419            PositionSpan::empty(),
420        );
421
422        let matches = vec![m1, m2, m3, m4];
423
424        let query = Query::from_extracted_text("test text", &index, false).unwrap();
425        let refined = refine_matches(&index, matches, &query);
426
427        assert_eq!(refined.len(), 2);
428
429        let rule1_match = refined.iter().find(|m| m.rule_identifier == "#1").unwrap();
430        assert_eq!(rule1_match.start_line, LineNumber::ONE);
431        assert_eq!(rule1_match.end_line, LineNumber::new(15).unwrap());
432
433        let rule2_match = refined.iter().find(|m| m.rule_identifier == "#2").unwrap();
434        assert_eq!(rule2_match.score, MatchScore::from_percentage(80.0));
435    }
436
437    #[test]
438    fn test_refine_matches_empty() {
439        let index = LicenseIndex::with_legalese_count(10);
440        let matches: Vec<LicenseMatch> = vec![];
441        let query = Query::from_extracted_text("", &index, false).unwrap();
442
443        let refined = refine_matches(&index, matches, &query);
444
445        assert_eq!(refined.len(), 0);
446    }
447
448    #[test]
449    fn test_refine_matches_single() {
450        let index = LicenseIndex::with_legalese_count(10);
451        let matches = vec![create_test_match(
452            "#1",
453            1,
454            10,
455            MatchScore::from_percentage(0.5),
456            100.0,
457            100,
458        )];
459        let query = Query::from_extracted_text("test text", &index, false).unwrap();
460
461        let refined = refine_matches(&index, matches, &query);
462
463        assert_eq!(refined.len(), 1);
464        assert_eq!(refined[0].score, MatchScore::MAX);
465    }
466
467    #[test]
468    fn test_refine_matches_no_merging_needed() {
469        let index = LicenseIndex::with_legalese_count(10);
470
471        let mut m1 = create_test_match("#1", 1, 10, MatchScore::from_percentage(0.9), 90.0, 100);
472        m1.coordinates = MatchCoordinates::rule_aligned(
473            PositionSpan::range(1, 11),
474            PositionSpan::range(0, 10),
475            PositionSpan::empty(),
476        );
477        let mut m2 = create_test_match("#2", 20, 30, MatchScore::from_percentage(0.85), 85.0, 100);
478        m2.coordinates = MatchCoordinates::rule_aligned(
479            PositionSpan::range(20, 31),
480            PositionSpan::range(0, 11),
481            PositionSpan::empty(),
482        );
483
484        let matches = vec![m1, m2];
485
486        let query = Query::from_extracted_text("test text", &index, false).unwrap();
487
488        let refined = refine_matches(&index, matches, &query);
489
490        assert_eq!(refined.len(), 2);
491    }
492
493    #[test]
494    fn test_filter_binary_low_coverage_same_expression_seq_bridges_drops_seq_bridge() {
495        let index = LicenseIndex::with_legalese_count(10);
496        let query = Query::from_extracted_text("binary strings", &index, true).unwrap();
497
498        let mut exact = create_test_match("#1", 140, 140, MatchScore::MAX, 100.0, 100);
499        exact.license_expression = "bsd-new".to_string();
500        exact.matcher = MatcherKind::Aho;
501        exact.start_token = 10;
502        exact.end_token = 16;
503        exact.matched_length = 6;
504        exact.coordinates = MatchCoordinates::rule_aligned(
505            PositionSpan::range(10, 16),
506            PositionSpan::empty(),
507            PositionSpan::empty(),
508        );
509
510        let mut seq =
511            create_test_match("#2", 140, 141, MatchScore::from_percentage(10.0), 52.9, 100);
512        seq.license_expression = "bsd-new".to_string();
513        seq.matcher = MatcherKind::Seq;
514        seq.start_token = 10;
515        seq.end_token = 18;
516        seq.matched_length = 7;
517        seq.coordinates = MatchCoordinates::rule_aligned(
518            PositionSpan::from_positions(vec![10, 11, 12, 13, 14, 16, 17]),
519            PositionSpan::empty(),
520            PositionSpan::empty(),
521        );
522
523        let filtered = filter_binary_low_coverage_same_expression_seq_bridges(
524            vec![seq.clone(), exact.clone()],
525            &query,
526        );
527
528        assert_eq!(filtered, vec![exact]);
529    }
530
531    #[test]
532    fn test_refine_aho_matches_restores_inner_merge_before_containment() {
533        let index = LicenseIndex::with_legalese_count(10);
534
535        let mut first = create_test_match("#1", 1, 10, MatchScore::from_percentage(0.9), 50.0, 100);
536        first.rule_length = 20;
537        first.rule_start_token = 0;
538        first.coordinates = MatchCoordinates::rule_aligned(
539            PositionSpan::range(1, 11),
540            PositionSpan::range(0, 10),
541            PositionSpan::empty(),
542        );
543
544        let mut second =
545            create_test_match("#1", 11, 20, MatchScore::from_percentage(0.85), 50.0, 100);
546        second.rule_length = 20;
547        second.rule_start_token = 10;
548        second.coordinates = MatchCoordinates::rule_aligned(
549            PositionSpan::range(11, 21),
550            PositionSpan::range(10, 20),
551            PositionSpan::empty(),
552        );
553
554        let query = Query::from_extracted_text("test text", &index, false).unwrap();
555        let refined = refine_aho_matches(&index, vec![first, second], &query);
556
557        assert_eq!(refined.len(), 1);
558        assert_eq!(refined[0].rule_identifier, "#1");
559        assert_eq!(refined[0].start_line, LineNumber::ONE);
560        assert_eq!(refined[0].end_line, LineNumber::new(20).unwrap());
561    }
562
563    #[test]
564    fn test_refine_matches_pipeline_preserves_non_overlapping_different_rules() {
565        let index = LicenseIndex::with_legalese_count(10);
566
567        let mut m1 = create_test_match("#1", 1, 10, MatchScore::from_percentage(0.9), 90.0, 100);
568        m1.coordinates = MatchCoordinates::rule_aligned(
569            PositionSpan::range(1, 11),
570            PositionSpan::range(0, 10),
571            PositionSpan::empty(),
572        );
573        let mut m2 = create_test_match("#2", 20, 30, MatchScore::from_percentage(0.85), 85.0, 100);
574        m2.coordinates = MatchCoordinates::rule_aligned(
575            PositionSpan::range(20, 31),
576            PositionSpan::range(0, 11),
577            PositionSpan::empty(),
578        );
579        let mut m3 = create_test_match("#3", 40, 50, MatchScore::from_percentage(0.8), 80.0, 100);
580        m3.coordinates = MatchCoordinates::rule_aligned(
581            PositionSpan::range(40, 51),
582            PositionSpan::range(0, 11),
583            PositionSpan::empty(),
584        );
585
586        let matches = vec![m1, m2, m3];
587
588        let query = Query::from_extracted_text("test text", &index, false).unwrap();
589        let refined = refine_matches(&index, matches, &query);
590
591        assert_eq!(refined.len(), 3);
592    }
593
594    #[test]
595    fn test_refine_matches_complex_scenario() {
596        let mut index = LicenseIndex::with_legalese_count(10);
597        let _ = index.false_positive_rids.insert(999);
598
599        let mut m1 = create_test_match("#1", 1, 10, MatchScore::from_percentage(0.7), 100.0, 100);
600        m1.matched_length = 100;
601        m1.rule_length = 100;
602        m1.rule_start_token = 0;
603        m1.coordinates = MatchCoordinates::rule_aligned(
604            PositionSpan::range(1, 11),
605            PositionSpan::range(0, 10),
606            PositionSpan::empty(),
607        );
608        let mut m2 = create_test_match("#1", 8, 15, MatchScore::from_percentage(0.8), 100.0, 100);
609        m2.matched_length = 100;
610        m2.rule_length = 100;
611        m2.rule_start_token = 7;
612        m2.coordinates = MatchCoordinates::rule_aligned(
613            PositionSpan::range(8, 16),
614            PositionSpan::range(7, 15),
615            PositionSpan::empty(),
616        );
617        let mut m3 = create_test_match("#2", 20, 50, MatchScore::from_percentage(0.9), 100.0, 100);
618        m3.matched_length = 300;
619        m3.rule_length = 300;
620        m3.rule_start_token = 0;
621        m3.coordinates = MatchCoordinates::rule_aligned(
622            PositionSpan::range(20, 51),
623            PositionSpan::range(0, 31),
624            PositionSpan::empty(),
625        );
626        let mut m4 = create_test_match("#2", 25, 45, MatchScore::from_percentage(0.85), 100.0, 100);
627        m4.matched_length = 150;
628        m4.rule_length = 300;
629        m4.rule_start_token = 5;
630        m4.coordinates = MatchCoordinates::rule_aligned(
631            PositionSpan::range(25, 46),
632            PositionSpan::range(5, 26),
633            PositionSpan::empty(),
634        );
635
636        let matches = vec![m1, m2, m3, m4];
637
638        let query = Query::from_extracted_text("test text", &index, false).unwrap();
639        let refined = refine_matches(&index, matches, &query);
640
641        assert!(
642            refined.len() >= 2,
643            "Should have at least 2 matches after refinement"
644        );
645    }
646
647    #[test]
648    fn test_split_weak_matches_has_unknown() {
649        let mut m = LicenseMatch {
650            license_expression: "unknown".to_string(),
651            matcher: crate::license_detection::models::MatcherKind::Hash,
652            matched_length: 100,
653            match_coverage: 100.0,
654            ..LicenseMatch::default()
655        };
656        m.end_token = 100;
657        m.rule_length = 100;
658
659        let index = LicenseIndex::with_legalese_count(10);
660        let (good, weak) = split_weak_matches(&index, &[m.clone()]);
661        assert!(weak.contains(&m));
662        assert!(!good.contains(&m));
663    }
664
665    #[test]
666    fn test_split_weak_matches_short_seq_low_coverage() {
667        let mut m = LicenseMatch {
668            license_expression: "mit".to_string(),
669            matcher: crate::license_detection::models::MatcherKind::Seq,
670            matched_length: 10,
671            match_coverage: 20.0,
672            ..LicenseMatch::default()
673        };
674        m.end_token = 10;
675        m.rule_length = 50;
676
677        let index = LicenseIndex::with_legalese_count(10);
678        let (good, weak) = split_weak_matches(&index, &[m.clone()]);
679        assert!(weak.contains(&m));
680        assert!(!good.contains(&m));
681    }
682
683    #[test]
684    fn test_split_weak_matches_keeps_false_positive_unknown_out_of_weak_bucket() {
685        let m = LicenseMatch {
686            rid: 42,
687            license_expression: "unknown".to_string(),
688            matcher: crate::license_detection::models::MatcherKind::Aho,
689            matched_length: 3,
690            rule_length: 3,
691            match_coverage: 100.0,
692            ..LicenseMatch::default()
693        };
694
695        let mut index = LicenseIndex::with_legalese_count(10);
696        index.false_positive_rids.insert(42);
697
698        let (good, weak) = split_weak_matches(&index, std::slice::from_ref(&m));
699        assert!(good.contains(&m));
700        assert!(!weak.contains(&m));
701    }
702
703    #[test]
704    fn test_split_weak_matches_short_seq_high_coverage() {
705        let mut m = LicenseMatch {
706            license_expression: "mit".to_string(),
707            matcher: crate::license_detection::models::MatcherKind::Seq,
708            matched_length: 10,
709            match_coverage: 80.0,
710            ..LicenseMatch::default()
711        };
712        m.end_token = 10;
713        m.rule_length = 15;
714
715        let index = LicenseIndex::with_legalese_count(10);
716        let (good, weak) = split_weak_matches(&index, &[m.clone()]);
717        assert!(good.contains(&m));
718        assert!(!weak.contains(&m));
719    }
720
721    #[test]
722    fn test_split_weak_matches_non_seq_short() {
723        let mut m = LicenseMatch {
724            license_expression: "mit".to_string(),
725            matcher: crate::license_detection::models::MatcherKind::Hash,
726            matched_length: 10,
727            match_coverage: 20.0,
728            ..LicenseMatch::default()
729        };
730        m.end_token = 10;
731        m.rule_length = 15;
732
733        let index = LicenseIndex::with_legalese_count(10);
734        let (good, weak) = split_weak_matches(&index, &[m.clone()]);
735        assert!(good.contains(&m));
736        assert!(!weak.contains(&m));
737    }
738
739    #[test]
740    fn test_split_weak_matches_mixed() {
741        let mut good_match = LicenseMatch {
742            license_expression: "mit".to_string(),
743            matcher: crate::license_detection::models::MatcherKind::Hash,
744            matched_length: 50,
745            match_coverage: 95.0,
746            ..LicenseMatch::default()
747        };
748        good_match.end_token = 50;
749        good_match.rule_length = 50;
750
751        let mut weak_unknown = LicenseMatch {
752            license_expression: "unknown".to_string(),
753            matcher: crate::license_detection::models::MatcherKind::Unknown,
754            matched_length: 30,
755            match_coverage: 50.0,
756            ..LicenseMatch::default()
757        };
758        weak_unknown.end_token = 30;
759        weak_unknown.rule_length = 30;
760
761        let mut weak_seq = LicenseMatch {
762            license_expression: "apache-2.0".to_string(),
763            matcher: crate::license_detection::models::MatcherKind::Seq,
764            matched_length: 10,
765            match_coverage: 20.0,
766            ..LicenseMatch::default()
767        };
768        weak_seq.end_token = 10;
769        weak_seq.rule_length = 50;
770
771        let matches = vec![good_match.clone(), weak_unknown.clone(), weak_seq.clone()];
772        let index = LicenseIndex::with_legalese_count(10);
773        let (good, weak) = split_weak_matches(&index, &matches);
774
775        assert_eq!(good.len(), 1);
776        assert_eq!(weak.len(), 2);
777        assert!(good.contains(&good_match));
778        assert!(weak.contains(&weak_unknown));
779        assert!(weak.contains(&weak_seq));
780    }
781}