Skip to main content

provenant/license_detection/match_refine/
mod.rs

1//! Match refinement - merge, filter, and finalize license matches.
2//!
3//! This module implements the final phase of license matching where raw matches
4//! from all strategies are combined, refined, and finalized.
5//!
6//! Based on the Python ScanCode Toolkit implementation at:
7//! reference/scancode-toolkit/src/licensedcode/match.py
8
9mod false_positive;
10pub(crate) mod filter_low_quality;
11mod handle_overlaps;
12mod merge;
13
14use crate::license_detection::index::LicenseIndex;
15use crate::license_detection::models::{LicenseMatch, MatcherKind};
16use crate::license_detection::query::Query;
17
18// Internal use only
19use filter_low_quality::{
20    filter_below_rule_minimum_coverage, filter_false_positive_matches,
21    filter_invalid_matches_to_single_word_gibberish, filter_matches_missing_required_phrases,
22    filter_matches_to_spurious_single_token, filter_short_matches_scattered_on_too_many_lines,
23    filter_spurious_matches, filter_too_short_matches,
24};
25use merge::{filter_license_references_with_text_match, update_match_scores};
26
27// Re-export for crate-internal use (debug_pipeline feature)
28pub use handle_overlaps::{
29    filter_contained_matches, filter_overlapping_matches, restore_non_overlapping,
30};
31pub use merge::merge_overlapping_matches;
32
33// Public API re-exports for investigation tests
34pub use false_positive::filter_false_positive_license_lists_matches;
35
36const SMALL_RULE: usize = 15;
37
38/// Filter unknown matches contained within good matches' qregion.
39///
40/// Unknown license matches that are fully contained within the qregion
41/// (token span from start_token to end_token) of a known good match
42/// should be discarded as they are redundant.
43///
44/// # Arguments
45/// * `unknown_matches` - Slice of unknown license matches to filter
46/// * `good_matches` - Slice of known good matches to check containment against
47///
48/// # Returns
49/// Vector of unknown LicenseMatch with contained matches removed
50///
51/// Based on Python: `filter_invalid_contained_unknown_matches()` (match.py:1904-1926)
52pub fn filter_invalid_contained_unknown_matches(
53    unknown_matches: &[LicenseMatch],
54    good_matches: &[LicenseMatch],
55) -> Vec<LicenseMatch> {
56    unknown_matches
57        .iter()
58        .filter(|unknown| {
59            let unknown_start = unknown.start_token;
60            let unknown_end = unknown.end_token;
61
62            let is_contained = good_matches
63                .iter()
64                .any(|good| good.start_token <= unknown_start && good.end_token >= unknown_end);
65
66            !is_contained
67        })
68        .cloned()
69        .collect()
70}
71
72/// Split matches into good and weak matches.
73///
74/// Weak matches are:
75/// - Matches to rules with "unknown" in their license expression
76/// - Sequence matches with len() <= SMALL_RULE (15) AND coverage <= 25%
77///
78/// Weak matches are set aside before unknown license matching and reinjected later.
79///
80/// # Arguments
81/// * `matches` - Slice of LicenseMatch to split
82///
83/// # Returns
84/// Tuple of (good_matches, weak_matches)
85///
86/// Based on Python: `split_weak_matches()` (match.py:1740-1765)
87pub fn split_weak_matches(
88    index: &LicenseIndex,
89    matches: &[LicenseMatch],
90) -> (Vec<LicenseMatch>, Vec<LicenseMatch>) {
91    let mut good = Vec::new();
92    let mut weak = Vec::new();
93
94    for m in matches {
95        let is_false_positive = index.false_positive_rids.contains(&m.rid);
96        let is_weak = (!is_false_positive && m.has_unknown())
97            || (m.matcher == MatcherKind::Seq && m.len() <= SMALL_RULE && m.coverage() <= 25.0);
98
99        if is_weak {
100            weak.push(m.clone());
101        } else {
102            good.push(m.clone());
103        }
104    }
105
106    (good, weak)
107}
108
109/// Main refinement function - applies all refinement operations to match results.
110///
111/// This is the main entry point for Phase 4.6 match refinement. It applies
112/// filters in the same order as Python's refine_matches():
113///
114/// 1. Filter matches missing required phrases
115/// 2. Filter spurious matches (low density)
116/// 3. Filter below rule minimum coverage
117/// 4. Filter spurious single-token matches
118/// 5. Filter too short matches
119/// 6. Filter scattered short matches
120/// 7. Filter invalid single-word gibberish (binary files)
121/// 8. Merge overlapping/adjacent matches
122/// 9. Filter contained matches
123/// 10. Filter overlapping matches
124/// 11. Restore non-overlapping discarded matches
125/// 12. Filter false positive matches
126/// 13. Filter false positive license list matches
127/// 14. Update match scores
128///
129/// The operations are applied in sequence to produce final refined matches.
130///
131/// # Arguments
132/// * `index` - LicenseIndex containing false_positive_rids and rules_by_rid
133/// * `matches` - Vector of raw LicenseMatch from all strategies
134/// * `query` - Query object for spurious/gibberish filtering
135///
136/// # Returns
137/// Vector of refined LicenseMatch ready for detection assembly
138///
139/// Based on Python: `refine_matches()` (lines 2691-2833)
140pub fn refine_matches(
141    index: &LicenseIndex,
142    matches: Vec<LicenseMatch>,
143    query: &Query,
144) -> Vec<LicenseMatch> {
145    refine_matches_internal(index, matches, query, true)
146}
147
148/// Initial refinement without false positive filtering.
149///
150/// Used before split_weak_matches and unknown detection.
151/// This matches Python's refine_matches with filter_false_positive=False.
152///
153/// Based on Python: `refine_matches()` at index.py:1073-1080
154pub fn refine_matches_without_false_positive_filter(
155    index: &LicenseIndex,
156    matches: Vec<LicenseMatch>,
157    query: &Query,
158) -> Vec<LicenseMatch> {
159    refine_matches_internal(index, matches, query, false)
160}
161
162/// Refine Aho-Corasick matches.
163///
164/// This matches Python's `get_exact_matches()` which calls `refine_matches()` with `merge=False`.
165/// Unlike full refinement, this:
166/// - Skips initial merge (merge=False)
167/// - Applies required phrase filtering
168/// - Applies all quality filters
169/// - Applies containment and overlap filtering with restore
170/// - Skips final merge (merge=False)
171///
172/// Based on Python: `get_exact_matches()` at index.py:691-696
173pub fn refine_aho_matches(
174    index: &LicenseIndex,
175    matches: Vec<LicenseMatch>,
176    query: &Query,
177) -> Vec<LicenseMatch> {
178    if matches.is_empty() {
179        return Vec::new();
180    }
181
182    let (with_required_phrases, _missing_phrases) =
183        filter_matches_missing_required_phrases(index, &matches, query);
184
185    let non_spurious = filter_spurious_matches(&with_required_phrases, query);
186
187    let above_min_cov = filter_below_rule_minimum_coverage(index, &non_spurious);
188
189    let non_single_spurious = filter_matches_to_spurious_single_token(&above_min_cov, query, 5);
190
191    let non_short = filter_too_short_matches(index, &non_single_spurious);
192
193    let non_scattered = filter_short_matches_scattered_on_too_many_lines(index, &non_short);
194
195    let non_gibberish =
196        filter_invalid_matches_to_single_word_gibberish(index, &non_scattered, query);
197
198    let merged_again = merge_overlapping_matches(&non_gibberish);
199
200    let merged_again = filter_binary_low_coverage_same_expression_seq_bridges(merged_again, query);
201
202    let (non_contained, discarded_contained) = filter_contained_matches(&merged_again);
203
204    let (kept, discarded_overlapping) = filter_overlapping_matches(non_contained, index);
205
206    let mut matches_after_first_restore = kept.clone();
207
208    if !discarded_contained.is_empty() {
209        let (restored_contained, _) = restore_non_overlapping(&kept, discarded_contained);
210        matches_after_first_restore.extend(restored_contained);
211    }
212
213    let mut final_matches = matches_after_first_restore.clone();
214
215    if !discarded_overlapping.is_empty() {
216        let (restored_overlapping, _) =
217            restore_non_overlapping(&matches_after_first_restore, discarded_overlapping);
218        final_matches.extend(restored_overlapping);
219    }
220
221    let (non_contained_final, _) = filter_contained_matches(&final_matches);
222
223    let filtered_refs = filter_license_references_with_text_match(&non_contained_final);
224
225    let mut final_scored = filtered_refs;
226    update_match_scores(&mut final_scored, query);
227
228    final_scored
229}
230
231fn refine_matches_internal(
232    index: &LicenseIndex,
233    matches: Vec<LicenseMatch>,
234    query: &Query,
235    filter_false_positive: bool,
236) -> Vec<LicenseMatch> {
237    if matches.is_empty() {
238        return Vec::new();
239    }
240
241    let merged = merge_overlapping_matches(&matches);
242
243    let (with_required_phrases, _missing_phrases) =
244        filter_matches_missing_required_phrases(index, &merged, query);
245
246    let non_spurious = filter_spurious_matches(&with_required_phrases, query);
247
248    let above_min_cov = filter_below_rule_minimum_coverage(index, &non_spurious);
249
250    let non_single_spurious = filter_matches_to_spurious_single_token(&above_min_cov, query, 5);
251
252    let non_short = filter_too_short_matches(index, &non_single_spurious);
253
254    let non_scattered = filter_short_matches_scattered_on_too_many_lines(index, &non_short);
255
256    let non_gibberish =
257        filter_invalid_matches_to_single_word_gibberish(index, &non_scattered, query);
258
259    let merged_again = merge_overlapping_matches(&non_gibberish);
260
261    let merged_again = filter_binary_low_coverage_same_expression_seq_bridges(merged_again, query);
262
263    let (non_contained, discarded_contained) = filter_contained_matches(&merged_again);
264
265    let (kept, discarded_overlapping) = filter_overlapping_matches(non_contained, index);
266
267    let mut matches_after_first_restore = kept.clone();
268
269    if !discarded_contained.is_empty() {
270        let (restored_contained, _) = restore_non_overlapping(&kept, discarded_contained);
271        matches_after_first_restore.extend(restored_contained);
272    }
273
274    let mut final_matches = matches_after_first_restore.clone();
275
276    if !discarded_overlapping.is_empty() {
277        let (restored_overlapping, _) =
278            restore_non_overlapping(&matches_after_first_restore, discarded_overlapping);
279        final_matches.extend(restored_overlapping);
280    }
281
282    let (non_contained_final, _) = filter_contained_matches(&final_matches);
283
284    let result = if filter_false_positive {
285        let non_fp = filter_false_positive_matches(index, &non_contained_final);
286        let (kept, _discarded) = filter_false_positive_license_lists_matches(non_fp);
287        kept
288    } else {
289        non_contained_final
290    };
291
292    let merged_final = merge_overlapping_matches(&result);
293
294    let filtered_refs = filter_license_references_with_text_match(&merged_final);
295
296    let mut final_scored = filtered_refs;
297    update_match_scores(&mut final_scored, query);
298
299    final_scored
300}
301
302fn filter_binary_low_coverage_same_expression_seq_bridges(
303    matches: Vec<LicenseMatch>,
304    query: &Query,
305) -> Vec<LicenseMatch> {
306    if !query.is_binary {
307        return matches;
308    }
309
310    matches
311        .iter()
312        .filter(|m| {
313            if m.matcher != MatcherKind::Seq || m.coverage() >= 90.0 {
314                return true;
315            }
316
317            !matches.iter().any(|other| {
318                other.matcher == MatcherKind::Aho
319                    && other.coverage() == 100.0
320                    && other.license_expression == m.license_expression
321                    && other.qoverlap(m) > 0
322                    && !m.qcontains(other)
323            })
324        })
325        .cloned()
326        .collect()
327}
328
329#[cfg(test)]
330mod tests {
331    use super::*;
332    use crate::license_detection::models::MatchCoordinates;
333    use crate::license_detection::models::position_span::PositionSpan;
334
335    fn parse_rule_id(rule_identifier: &str) -> Option<usize> {
336        let trimmed = rule_identifier.trim();
337        if let Some(stripped) = trimmed.strip_prefix('#') {
338            stripped.parse().ok()
339        } else {
340            trimmed.parse().ok()
341        }
342    }
343
344    fn create_test_match(
345        rule_identifier: &str,
346        start_line: usize,
347        end_line: usize,
348        score: f32,
349        coverage: f32,
350        relevance: u8,
351    ) -> LicenseMatch {
352        let matched_len = end_line - start_line + 1;
353        let rule_len = matched_len;
354        let rid = parse_rule_id(rule_identifier).unwrap_or(0);
355        LicenseMatch {
356            rid,
357            license_expression: "mit".to_string(),
358            license_expression_spdx: Some("MIT".to_string()),
359            from_file: None,
360            start_line,
361            end_line,
362            start_token: start_line,
363            end_token: end_line + 1,
364            matcher: crate::license_detection::models::MatcherKind::Aho,
365            score,
366            matched_length: matched_len,
367            rule_length: rule_len,
368            match_coverage: coverage,
369            rule_relevance: relevance,
370            rule_identifier: rule_identifier.to_string(),
371            rule_url: "https://example.com".to_string(),
372            matched_text: None,
373            referenced_filenames: None,
374            rule_kind: crate::license_detection::models::RuleKind::None,
375            is_from_license: false,
376            rule_start_token: 0,
377            coordinates: MatchCoordinates::query_region(PositionSpan::range(
378                start_line,
379                end_line + 1,
380            )),
381            candidate_resemblance: 0.0,
382            candidate_containment: 0.0,
383        }
384    }
385
386    #[test]
387    fn test_refine_matches_full_pipeline() {
388        let mut index = LicenseIndex::with_legalese_count(10);
389        let _ = index.false_positive_rids.insert(99);
390
391        let mut m1 = create_test_match("#1", 1, 10, 0.5, 100.0, 100);
392        m1.rule_length = 100;
393        m1.rule_start_token = 0;
394        m1.coordinates = MatchCoordinates::rule_aligned(
395            PositionSpan::range(1, 11),
396            PositionSpan::range(0, 10),
397            PositionSpan::empty(),
398        );
399        let mut m2 = create_test_match("#1", 5, 15, 0.5, 100.0, 100);
400        m2.rule_length = 100;
401        m2.rule_start_token = 4;
402        m2.coordinates = MatchCoordinates::rule_aligned(
403            PositionSpan::range(5, 16),
404            PositionSpan::range(4, 15),
405            PositionSpan::empty(),
406        );
407        let mut m3 = create_test_match("#2", 20, 25, 0.5, 100.0, 80);
408        m3.coordinates = MatchCoordinates::rule_aligned(
409            PositionSpan::range(20, 26),
410            PositionSpan::range(0, 6),
411            PositionSpan::empty(),
412        );
413        let mut m4 = create_test_match("#99", 30, 35, 0.5, 100.0, 100);
414        m4.coordinates = MatchCoordinates::rule_aligned(
415            PositionSpan::range(30, 36),
416            PositionSpan::range(0, 6),
417            PositionSpan::empty(),
418        );
419
420        let matches = vec![m1, m2, m3, m4];
421
422        let query = Query::from_extracted_text("test text", &index, false).unwrap();
423        let refined = refine_matches(&index, matches, &query);
424
425        assert_eq!(refined.len(), 2);
426
427        let rule1_match = refined.iter().find(|m| m.rule_identifier == "#1").unwrap();
428        assert_eq!(rule1_match.start_line, 1);
429        assert_eq!(rule1_match.end_line, 15);
430
431        let rule2_match = refined.iter().find(|m| m.rule_identifier == "#2").unwrap();
432        assert_eq!(rule2_match.score, 80.0);
433    }
434
435    #[test]
436    fn test_refine_matches_empty() {
437        let index = LicenseIndex::with_legalese_count(10);
438        let matches: Vec<LicenseMatch> = vec![];
439        let query = Query::from_extracted_text("", &index, false).unwrap();
440
441        let refined = refine_matches(&index, matches, &query);
442
443        assert_eq!(refined.len(), 0);
444    }
445
446    #[test]
447    fn test_refine_matches_single() {
448        let index = LicenseIndex::with_legalese_count(10);
449        let matches = vec![create_test_match("#1", 1, 10, 0.5, 100.0, 100)];
450        let query = Query::from_extracted_text("test text", &index, false).unwrap();
451
452        let refined = refine_matches(&index, matches, &query);
453
454        assert_eq!(refined.len(), 1);
455        assert_eq!(refined[0].score, 100.0);
456    }
457
458    #[test]
459    fn test_refine_matches_no_merging_needed() {
460        let index = LicenseIndex::with_legalese_count(10);
461
462        let mut m1 = create_test_match("#1", 1, 10, 0.9, 90.0, 100);
463        m1.coordinates = MatchCoordinates::rule_aligned(
464            PositionSpan::range(1, 11),
465            PositionSpan::range(0, 10),
466            PositionSpan::empty(),
467        );
468        let mut m2 = create_test_match("#2", 20, 30, 0.85, 85.0, 100);
469        m2.coordinates = MatchCoordinates::rule_aligned(
470            PositionSpan::range(20, 31),
471            PositionSpan::range(0, 11),
472            PositionSpan::empty(),
473        );
474
475        let matches = vec![m1, m2];
476
477        let query = Query::from_extracted_text("test text", &index, false).unwrap();
478
479        let refined = refine_matches(&index, matches, &query);
480
481        assert_eq!(refined.len(), 2);
482    }
483
484    #[test]
485    fn test_filter_binary_low_coverage_same_expression_seq_bridges_drops_seq_bridge() {
486        let index = LicenseIndex::with_legalese_count(10);
487        let query = Query::from_extracted_text("binary strings", &index, true).unwrap();
488
489        let mut exact = create_test_match("#1", 140, 140, 100.0, 100.0, 100);
490        exact.license_expression = "bsd-new".to_string();
491        exact.matcher = MatcherKind::Aho;
492        exact.start_token = 10;
493        exact.end_token = 16;
494        exact.matched_length = 6;
495        exact.coordinates = MatchCoordinates::rule_aligned(
496            PositionSpan::range(10, 16),
497            PositionSpan::empty(),
498            PositionSpan::empty(),
499        );
500
501        let mut seq = create_test_match("#2", 140, 141, 10.0, 52.9, 100);
502        seq.license_expression = "bsd-new".to_string();
503        seq.matcher = MatcherKind::Seq;
504        seq.start_token = 10;
505        seq.end_token = 18;
506        seq.matched_length = 7;
507        seq.coordinates = MatchCoordinates::rule_aligned(
508            PositionSpan::from_positions(vec![10, 11, 12, 13, 14, 16, 17]),
509            PositionSpan::empty(),
510            PositionSpan::empty(),
511        );
512
513        let filtered = filter_binary_low_coverage_same_expression_seq_bridges(
514            vec![seq.clone(), exact.clone()],
515            &query,
516        );
517
518        assert_eq!(filtered, vec![exact]);
519    }
520
521    #[test]
522    fn test_refine_aho_matches_restores_inner_merge_before_containment() {
523        let index = LicenseIndex::with_legalese_count(10);
524
525        let mut first = create_test_match("#1", 1, 10, 0.9, 50.0, 100);
526        first.rule_length = 20;
527        first.rule_start_token = 0;
528        first.coordinates = MatchCoordinates::rule_aligned(
529            PositionSpan::range(1, 11),
530            PositionSpan::range(0, 10),
531            PositionSpan::empty(),
532        );
533
534        let mut second = create_test_match("#1", 11, 20, 0.85, 50.0, 100);
535        second.rule_length = 20;
536        second.rule_start_token = 10;
537        second.coordinates = MatchCoordinates::rule_aligned(
538            PositionSpan::range(11, 21),
539            PositionSpan::range(10, 20),
540            PositionSpan::empty(),
541        );
542
543        let query = Query::from_extracted_text("test text", &index, false).unwrap();
544        let refined = refine_aho_matches(&index, vec![first, second], &query);
545
546        assert_eq!(refined.len(), 1);
547        assert_eq!(refined[0].rule_identifier, "#1");
548        assert_eq!(refined[0].start_line, 1);
549        assert_eq!(refined[0].end_line, 20);
550    }
551
552    #[test]
553    fn test_refine_matches_pipeline_preserves_non_overlapping_different_rules() {
554        let index = LicenseIndex::with_legalese_count(10);
555
556        let mut m1 = create_test_match("#1", 1, 10, 0.9, 90.0, 100);
557        m1.coordinates = MatchCoordinates::rule_aligned(
558            PositionSpan::range(1, 11),
559            PositionSpan::range(0, 10),
560            PositionSpan::empty(),
561        );
562        let mut m2 = create_test_match("#2", 20, 30, 0.85, 85.0, 100);
563        m2.coordinates = MatchCoordinates::rule_aligned(
564            PositionSpan::range(20, 31),
565            PositionSpan::range(0, 11),
566            PositionSpan::empty(),
567        );
568        let mut m3 = create_test_match("#3", 40, 50, 0.8, 80.0, 100);
569        m3.coordinates = MatchCoordinates::rule_aligned(
570            PositionSpan::range(40, 51),
571            PositionSpan::range(0, 11),
572            PositionSpan::empty(),
573        );
574
575        let matches = vec![m1, m2, m3];
576
577        let query = Query::from_extracted_text("test text", &index, false).unwrap();
578        let refined = refine_matches(&index, matches, &query);
579
580        assert_eq!(refined.len(), 3);
581    }
582
583    #[test]
584    fn test_refine_matches_complex_scenario() {
585        let mut index = LicenseIndex::with_legalese_count(10);
586        let _ = index.false_positive_rids.insert(999);
587
588        let mut m1 = create_test_match("#1", 1, 10, 0.7, 100.0, 100);
589        m1.matched_length = 100;
590        m1.rule_length = 100;
591        m1.rule_start_token = 0;
592        m1.coordinates = MatchCoordinates::rule_aligned(
593            PositionSpan::range(1, 11),
594            PositionSpan::range(0, 10),
595            PositionSpan::empty(),
596        );
597        let mut m2 = create_test_match("#1", 8, 15, 0.8, 100.0, 100);
598        m2.matched_length = 100;
599        m2.rule_length = 100;
600        m2.rule_start_token = 7;
601        m2.coordinates = MatchCoordinates::rule_aligned(
602            PositionSpan::range(8, 16),
603            PositionSpan::range(7, 15),
604            PositionSpan::empty(),
605        );
606        let mut m3 = create_test_match("#2", 20, 50, 0.9, 100.0, 100);
607        m3.matched_length = 300;
608        m3.rule_length = 300;
609        m3.rule_start_token = 0;
610        m3.coordinates = MatchCoordinates::rule_aligned(
611            PositionSpan::range(20, 51),
612            PositionSpan::range(0, 31),
613            PositionSpan::empty(),
614        );
615        let mut m4 = create_test_match("#2", 25, 45, 0.85, 100.0, 100);
616        m4.matched_length = 150;
617        m4.rule_length = 300;
618        m4.rule_start_token = 5;
619        m4.coordinates = MatchCoordinates::rule_aligned(
620            PositionSpan::range(25, 46),
621            PositionSpan::range(5, 26),
622            PositionSpan::empty(),
623        );
624
625        let matches = vec![m1, m2, m3, m4];
626
627        let query = Query::from_extracted_text("test text", &index, false).unwrap();
628        let refined = refine_matches(&index, matches, &query);
629
630        assert!(
631            refined.len() >= 2,
632            "Should have at least 2 matches after refinement"
633        );
634    }
635
636    #[test]
637    fn test_split_weak_matches_has_unknown() {
638        let mut m = LicenseMatch {
639            license_expression: "unknown".to_string(),
640            matcher: crate::license_detection::models::MatcherKind::Hash,
641            matched_length: 100,
642            match_coverage: 100.0,
643            ..LicenseMatch::default()
644        };
645        m.end_token = 100;
646        m.rule_length = 100;
647
648        let index = LicenseIndex::with_legalese_count(10);
649        let (good, weak) = split_weak_matches(&index, &[m.clone()]);
650        assert!(weak.contains(&m));
651        assert!(!good.contains(&m));
652    }
653
654    #[test]
655    fn test_split_weak_matches_short_seq_low_coverage() {
656        let mut m = LicenseMatch {
657            license_expression: "mit".to_string(),
658            matcher: crate::license_detection::models::MatcherKind::Seq,
659            matched_length: 10,
660            match_coverage: 20.0,
661            ..LicenseMatch::default()
662        };
663        m.end_token = 10;
664        m.rule_length = 50;
665
666        let index = LicenseIndex::with_legalese_count(10);
667        let (good, weak) = split_weak_matches(&index, &[m.clone()]);
668        assert!(weak.contains(&m));
669        assert!(!good.contains(&m));
670    }
671
672    #[test]
673    fn test_split_weak_matches_keeps_false_positive_unknown_out_of_weak_bucket() {
674        let m = LicenseMatch {
675            rid: 42,
676            license_expression: "unknown".to_string(),
677            matcher: crate::license_detection::models::MatcherKind::Aho,
678            matched_length: 3,
679            rule_length: 3,
680            match_coverage: 100.0,
681            ..LicenseMatch::default()
682        };
683
684        let mut index = LicenseIndex::with_legalese_count(10);
685        index.false_positive_rids.insert(42);
686
687        let (good, weak) = split_weak_matches(&index, std::slice::from_ref(&m));
688        assert!(good.contains(&m));
689        assert!(!weak.contains(&m));
690    }
691
692    #[test]
693    fn test_split_weak_matches_short_seq_high_coverage() {
694        let mut m = LicenseMatch {
695            license_expression: "mit".to_string(),
696            matcher: crate::license_detection::models::MatcherKind::Seq,
697            matched_length: 10,
698            match_coverage: 80.0,
699            ..LicenseMatch::default()
700        };
701        m.end_token = 10;
702        m.rule_length = 15;
703
704        let index = LicenseIndex::with_legalese_count(10);
705        let (good, weak) = split_weak_matches(&index, &[m.clone()]);
706        assert!(good.contains(&m));
707        assert!(!weak.contains(&m));
708    }
709
710    #[test]
711    fn test_split_weak_matches_non_seq_short() {
712        let mut m = LicenseMatch {
713            license_expression: "mit".to_string(),
714            matcher: crate::license_detection::models::MatcherKind::Hash,
715            matched_length: 10,
716            match_coverage: 20.0,
717            ..LicenseMatch::default()
718        };
719        m.end_token = 10;
720        m.rule_length = 15;
721
722        let index = LicenseIndex::with_legalese_count(10);
723        let (good, weak) = split_weak_matches(&index, &[m.clone()]);
724        assert!(good.contains(&m));
725        assert!(!weak.contains(&m));
726    }
727
728    #[test]
729    fn test_split_weak_matches_mixed() {
730        let mut good_match = LicenseMatch {
731            license_expression: "mit".to_string(),
732            matcher: crate::license_detection::models::MatcherKind::Hash,
733            matched_length: 50,
734            match_coverage: 95.0,
735            ..LicenseMatch::default()
736        };
737        good_match.end_token = 50;
738        good_match.rule_length = 50;
739
740        let mut weak_unknown = LicenseMatch {
741            license_expression: "unknown".to_string(),
742            matcher: crate::license_detection::models::MatcherKind::Unknown,
743            matched_length: 30,
744            match_coverage: 50.0,
745            ..LicenseMatch::default()
746        };
747        weak_unknown.end_token = 30;
748        weak_unknown.rule_length = 30;
749
750        let mut weak_seq = LicenseMatch {
751            license_expression: "apache-2.0".to_string(),
752            matcher: crate::license_detection::models::MatcherKind::Seq,
753            matched_length: 10,
754            match_coverage: 20.0,
755            ..LicenseMatch::default()
756        };
757        weak_seq.end_token = 10;
758        weak_seq.rule_length = 50;
759
760        let matches = vec![good_match.clone(), weak_unknown.clone(), weak_seq.clone()];
761        let index = LicenseIndex::with_legalese_count(10);
762        let (good, weak) = split_weak_matches(&index, &matches);
763
764        assert_eq!(good.len(), 1);
765        assert_eq!(weak.len(), 2);
766        assert!(good.contains(&good_match));
767        assert!(weak.contains(&weak_unknown));
768        assert!(weak.contains(&weak_seq));
769    }
770}