1mod false_positive;
10pub(crate) mod filter_low_quality;
11mod handle_overlaps;
12mod merge;
13
14use crate::license_detection::index::LicenseIndex;
15use crate::license_detection::models::{LicenseMatch, MatcherKind};
16use crate::license_detection::query::Query;
17
18use filter_low_quality::{
20 filter_below_rule_minimum_coverage, filter_false_positive_matches,
21 filter_invalid_matches_to_single_word_gibberish, filter_matches_missing_required_phrases,
22 filter_matches_to_spurious_single_token, filter_short_matches_scattered_on_too_many_lines,
23 filter_spurious_matches, filter_too_short_matches,
24};
25use merge::{filter_license_references_with_text_match, update_match_scores};
26
27pub use handle_overlaps::{
29 filter_contained_matches, filter_overlapping_matches, restore_non_overlapping,
30};
31pub use merge::merge_overlapping_matches;
32
33pub use false_positive::filter_false_positive_license_lists_matches;
35
/// Inclusive upper bound on `LicenseMatch::len()` below which a
/// `MatcherKind::Seq` match with coverage of at most 25.0 is classified as
/// weak by `split_weak_matches`.
const SMALL_RULE: usize = 15;
37
38pub fn filter_invalid_contained_unknown_matches(
53 unknown_matches: &[LicenseMatch],
54 good_matches: &[LicenseMatch],
55) -> Vec<LicenseMatch> {
56 unknown_matches
57 .iter()
58 .filter(|unknown| {
59 let unknown_start = unknown.start_token;
60 let unknown_end = unknown.end_token;
61
62 let is_contained = good_matches
63 .iter()
64 .any(|good| good.start_token <= unknown_start && good.end_token >= unknown_end);
65
66 !is_contained
67 })
68 .cloned()
69 .collect()
70}
71
72pub fn split_weak_matches(
88 index: &LicenseIndex,
89 matches: &[LicenseMatch],
90) -> (Vec<LicenseMatch>, Vec<LicenseMatch>) {
91 let mut good = Vec::new();
92 let mut weak = Vec::new();
93
94 for m in matches {
95 let is_false_positive = index.false_positive_rids.contains(&m.rid);
96 let is_weak = (!is_false_positive && m.has_unknown())
97 || (m.matcher == MatcherKind::Seq && m.len() <= SMALL_RULE && m.coverage() <= 25.0);
98
99 if is_weak {
100 weak.push(m.clone());
101 } else {
102 good.push(m.clone());
103 }
104 }
105
106 (good, weak)
107}
108
109pub fn refine_matches(
141 index: &LicenseIndex,
142 matches: Vec<LicenseMatch>,
143 query: &Query,
144) -> Vec<LicenseMatch> {
145 refine_matches_internal(index, matches, query, true)
146}
147
148pub fn refine_matches_without_false_positive_filter(
155 index: &LicenseIndex,
156 matches: Vec<LicenseMatch>,
157 query: &Query,
158) -> Vec<LicenseMatch> {
159 refine_matches_internal(index, matches, query, false)
160}
161
162pub fn refine_aho_matches(
174 index: &LicenseIndex,
175 matches: Vec<LicenseMatch>,
176 query: &Query,
177) -> Vec<LicenseMatch> {
178 if matches.is_empty() {
179 return Vec::new();
180 }
181
182 let (with_required_phrases, _missing_phrases) =
183 filter_matches_missing_required_phrases(index, &matches, query);
184
185 let non_spurious = filter_spurious_matches(&with_required_phrases, query);
186
187 let above_min_cov = filter_below_rule_minimum_coverage(index, &non_spurious);
188
189 let non_single_spurious = filter_matches_to_spurious_single_token(&above_min_cov, query, 5);
190
191 let non_short = filter_too_short_matches(index, &non_single_spurious);
192
193 let non_scattered = filter_short_matches_scattered_on_too_many_lines(index, &non_short);
194
195 let non_gibberish =
196 filter_invalid_matches_to_single_word_gibberish(index, &non_scattered, query);
197
198 let merged_again = merge_overlapping_matches(&non_gibberish);
199
200 let merged_again = filter_binary_low_coverage_same_expression_seq_bridges(merged_again, query);
201
202 let (non_contained, discarded_contained) = filter_contained_matches(&merged_again);
203
204 let (kept, discarded_overlapping) = filter_overlapping_matches(non_contained, index);
205
206 let mut matches_after_first_restore = kept.clone();
207
208 if !discarded_contained.is_empty() {
209 let (restored_contained, _) = restore_non_overlapping(&kept, discarded_contained);
210 matches_after_first_restore.extend(restored_contained);
211 }
212
213 let mut final_matches = matches_after_first_restore.clone();
214
215 if !discarded_overlapping.is_empty() {
216 let (restored_overlapping, _) =
217 restore_non_overlapping(&matches_after_first_restore, discarded_overlapping);
218 final_matches.extend(restored_overlapping);
219 }
220
221 let (non_contained_final, _) = filter_contained_matches(&final_matches);
222
223 let filtered_refs = filter_license_references_with_text_match(&non_contained_final);
224
225 let mut final_scored = filtered_refs;
226 update_match_scores(&mut final_scored, query);
227
228 final_scored
229}
230
231fn refine_matches_internal(
232 index: &LicenseIndex,
233 matches: Vec<LicenseMatch>,
234 query: &Query,
235 filter_false_positive: bool,
236) -> Vec<LicenseMatch> {
237 if matches.is_empty() {
238 return Vec::new();
239 }
240
241 let merged = merge_overlapping_matches(&matches);
242
243 let (with_required_phrases, _missing_phrases) =
244 filter_matches_missing_required_phrases(index, &merged, query);
245
246 let non_spurious = filter_spurious_matches(&with_required_phrases, query);
247
248 let above_min_cov = filter_below_rule_minimum_coverage(index, &non_spurious);
249
250 let non_single_spurious = filter_matches_to_spurious_single_token(&above_min_cov, query, 5);
251
252 let non_short = filter_too_short_matches(index, &non_single_spurious);
253
254 let non_scattered = filter_short_matches_scattered_on_too_many_lines(index, &non_short);
255
256 let non_gibberish =
257 filter_invalid_matches_to_single_word_gibberish(index, &non_scattered, query);
258
259 let merged_again = merge_overlapping_matches(&non_gibberish);
260
261 let merged_again = filter_binary_low_coverage_same_expression_seq_bridges(merged_again, query);
262
263 let (non_contained, discarded_contained) = filter_contained_matches(&merged_again);
264
265 let (kept, discarded_overlapping) = filter_overlapping_matches(non_contained, index);
266
267 let mut matches_after_first_restore = kept.clone();
268
269 if !discarded_contained.is_empty() {
270 let (restored_contained, _) = restore_non_overlapping(&kept, discarded_contained);
271 matches_after_first_restore.extend(restored_contained);
272 }
273
274 let mut final_matches = matches_after_first_restore.clone();
275
276 if !discarded_overlapping.is_empty() {
277 let (restored_overlapping, _) =
278 restore_non_overlapping(&matches_after_first_restore, discarded_overlapping);
279 final_matches.extend(restored_overlapping);
280 }
281
282 let (non_contained_final, _) = filter_contained_matches(&final_matches);
283
284 let result = if filter_false_positive {
285 let non_fp = filter_false_positive_matches(index, &non_contained_final);
286 let (kept, _discarded) = filter_false_positive_license_lists_matches(non_fp);
287 kept
288 } else {
289 non_contained_final
290 };
291
292 let merged_final = merge_overlapping_matches(&result);
293
294 let filtered_refs = filter_license_references_with_text_match(&merged_final);
295
296 let mut final_scored = filtered_refs;
297 update_match_scores(&mut final_scored, query);
298
299 final_scored
300}
301
302fn filter_binary_low_coverage_same_expression_seq_bridges(
303 matches: Vec<LicenseMatch>,
304 query: &Query,
305) -> Vec<LicenseMatch> {
306 if !query.is_binary {
307 return matches;
308 }
309
310 matches
311 .iter()
312 .filter(|m| {
313 if m.matcher != MatcherKind::Seq || m.coverage() >= 90.0 {
314 return true;
315 }
316
317 !matches.iter().any(|other| {
318 other.matcher == MatcherKind::Aho
319 && other.coverage() == 100.0
320 && other.license_expression == m.license_expression
321 && other.qoverlap(m) > 0
322 && !m.qcontains(other)
323 })
324 })
325 .cloned()
326 .collect()
327}
328
329#[cfg(test)]
330mod tests {
331 use super::*;
332 use crate::license_detection::models::MatchCoordinates;
333 use crate::license_detection::models::position_span::PositionSpan;
334
335 fn parse_rule_id(rule_identifier: &str) -> Option<usize> {
336 let trimmed = rule_identifier.trim();
337 if let Some(stripped) = trimmed.strip_prefix('#') {
338 stripped.parse().ok()
339 } else {
340 trimmed.parse().ok()
341 }
342 }
343
344 fn create_test_match(
345 rule_identifier: &str,
346 start_line: usize,
347 end_line: usize,
348 score: f32,
349 coverage: f32,
350 relevance: u8,
351 ) -> LicenseMatch {
352 let matched_len = end_line - start_line + 1;
353 let rule_len = matched_len;
354 let rid = parse_rule_id(rule_identifier).unwrap_or(0);
355 LicenseMatch {
356 rid,
357 license_expression: "mit".to_string(),
358 license_expression_spdx: Some("MIT".to_string()),
359 from_file: None,
360 start_line,
361 end_line,
362 start_token: start_line,
363 end_token: end_line + 1,
364 matcher: crate::license_detection::models::MatcherKind::Aho,
365 score,
366 matched_length: matched_len,
367 rule_length: rule_len,
368 match_coverage: coverage,
369 rule_relevance: relevance,
370 rule_identifier: rule_identifier.to_string(),
371 rule_url: "https://example.com".to_string(),
372 matched_text: None,
373 referenced_filenames: None,
374 rule_kind: crate::license_detection::models::RuleKind::None,
375 is_from_license: false,
376 rule_start_token: 0,
377 coordinates: MatchCoordinates::query_region(PositionSpan::range(
378 start_line,
379 end_line + 1,
380 )),
381 candidate_resemblance: 0.0,
382 candidate_containment: 0.0,
383 }
384 }
385
386 #[test]
387 fn test_refine_matches_full_pipeline() {
388 let mut index = LicenseIndex::with_legalese_count(10);
389 let _ = index.false_positive_rids.insert(99);
390
391 let mut m1 = create_test_match("#1", 1, 10, 0.5, 100.0, 100);
392 m1.rule_length = 100;
393 m1.rule_start_token = 0;
394 m1.coordinates = MatchCoordinates::rule_aligned(
395 PositionSpan::range(1, 11),
396 PositionSpan::range(0, 10),
397 PositionSpan::empty(),
398 );
399 let mut m2 = create_test_match("#1", 5, 15, 0.5, 100.0, 100);
400 m2.rule_length = 100;
401 m2.rule_start_token = 4;
402 m2.coordinates = MatchCoordinates::rule_aligned(
403 PositionSpan::range(5, 16),
404 PositionSpan::range(4, 15),
405 PositionSpan::empty(),
406 );
407 let mut m3 = create_test_match("#2", 20, 25, 0.5, 100.0, 80);
408 m3.coordinates = MatchCoordinates::rule_aligned(
409 PositionSpan::range(20, 26),
410 PositionSpan::range(0, 6),
411 PositionSpan::empty(),
412 );
413 let mut m4 = create_test_match("#99", 30, 35, 0.5, 100.0, 100);
414 m4.coordinates = MatchCoordinates::rule_aligned(
415 PositionSpan::range(30, 36),
416 PositionSpan::range(0, 6),
417 PositionSpan::empty(),
418 );
419
420 let matches = vec![m1, m2, m3, m4];
421
422 let query = Query::from_extracted_text("test text", &index, false).unwrap();
423 let refined = refine_matches(&index, matches, &query);
424
425 assert_eq!(refined.len(), 2);
426
427 let rule1_match = refined.iter().find(|m| m.rule_identifier == "#1").unwrap();
428 assert_eq!(rule1_match.start_line, 1);
429 assert_eq!(rule1_match.end_line, 15);
430
431 let rule2_match = refined.iter().find(|m| m.rule_identifier == "#2").unwrap();
432 assert_eq!(rule2_match.score, 80.0);
433 }
434
435 #[test]
436 fn test_refine_matches_empty() {
437 let index = LicenseIndex::with_legalese_count(10);
438 let matches: Vec<LicenseMatch> = vec![];
439 let query = Query::from_extracted_text("", &index, false).unwrap();
440
441 let refined = refine_matches(&index, matches, &query);
442
443 assert_eq!(refined.len(), 0);
444 }
445
446 #[test]
447 fn test_refine_matches_single() {
448 let index = LicenseIndex::with_legalese_count(10);
449 let matches = vec![create_test_match("#1", 1, 10, 0.5, 100.0, 100)];
450 let query = Query::from_extracted_text("test text", &index, false).unwrap();
451
452 let refined = refine_matches(&index, matches, &query);
453
454 assert_eq!(refined.len(), 1);
455 assert_eq!(refined[0].score, 100.0);
456 }
457
458 #[test]
459 fn test_refine_matches_no_merging_needed() {
460 let index = LicenseIndex::with_legalese_count(10);
461
462 let mut m1 = create_test_match("#1", 1, 10, 0.9, 90.0, 100);
463 m1.coordinates = MatchCoordinates::rule_aligned(
464 PositionSpan::range(1, 11),
465 PositionSpan::range(0, 10),
466 PositionSpan::empty(),
467 );
468 let mut m2 = create_test_match("#2", 20, 30, 0.85, 85.0, 100);
469 m2.coordinates = MatchCoordinates::rule_aligned(
470 PositionSpan::range(20, 31),
471 PositionSpan::range(0, 11),
472 PositionSpan::empty(),
473 );
474
475 let matches = vec![m1, m2];
476
477 let query = Query::from_extracted_text("test text", &index, false).unwrap();
478
479 let refined = refine_matches(&index, matches, &query);
480
481 assert_eq!(refined.len(), 2);
482 }
483
484 #[test]
485 fn test_filter_binary_low_coverage_same_expression_seq_bridges_drops_seq_bridge() {
486 let index = LicenseIndex::with_legalese_count(10);
487 let query = Query::from_extracted_text("binary strings", &index, true).unwrap();
488
489 let mut exact = create_test_match("#1", 140, 140, 100.0, 100.0, 100);
490 exact.license_expression = "bsd-new".to_string();
491 exact.matcher = MatcherKind::Aho;
492 exact.start_token = 10;
493 exact.end_token = 16;
494 exact.matched_length = 6;
495 exact.coordinates = MatchCoordinates::rule_aligned(
496 PositionSpan::range(10, 16),
497 PositionSpan::empty(),
498 PositionSpan::empty(),
499 );
500
501 let mut seq = create_test_match("#2", 140, 141, 10.0, 52.9, 100);
502 seq.license_expression = "bsd-new".to_string();
503 seq.matcher = MatcherKind::Seq;
504 seq.start_token = 10;
505 seq.end_token = 18;
506 seq.matched_length = 7;
507 seq.coordinates = MatchCoordinates::rule_aligned(
508 PositionSpan::from_positions(vec![10, 11, 12, 13, 14, 16, 17]),
509 PositionSpan::empty(),
510 PositionSpan::empty(),
511 );
512
513 let filtered = filter_binary_low_coverage_same_expression_seq_bridges(
514 vec![seq.clone(), exact.clone()],
515 &query,
516 );
517
518 assert_eq!(filtered, vec![exact]);
519 }
520
521 #[test]
522 fn test_refine_aho_matches_restores_inner_merge_before_containment() {
523 let index = LicenseIndex::with_legalese_count(10);
524
525 let mut first = create_test_match("#1", 1, 10, 0.9, 50.0, 100);
526 first.rule_length = 20;
527 first.rule_start_token = 0;
528 first.coordinates = MatchCoordinates::rule_aligned(
529 PositionSpan::range(1, 11),
530 PositionSpan::range(0, 10),
531 PositionSpan::empty(),
532 );
533
534 let mut second = create_test_match("#1", 11, 20, 0.85, 50.0, 100);
535 second.rule_length = 20;
536 second.rule_start_token = 10;
537 second.coordinates = MatchCoordinates::rule_aligned(
538 PositionSpan::range(11, 21),
539 PositionSpan::range(10, 20),
540 PositionSpan::empty(),
541 );
542
543 let query = Query::from_extracted_text("test text", &index, false).unwrap();
544 let refined = refine_aho_matches(&index, vec![first, second], &query);
545
546 assert_eq!(refined.len(), 1);
547 assert_eq!(refined[0].rule_identifier, "#1");
548 assert_eq!(refined[0].start_line, 1);
549 assert_eq!(refined[0].end_line, 20);
550 }
551
552 #[test]
553 fn test_refine_matches_pipeline_preserves_non_overlapping_different_rules() {
554 let index = LicenseIndex::with_legalese_count(10);
555
556 let mut m1 = create_test_match("#1", 1, 10, 0.9, 90.0, 100);
557 m1.coordinates = MatchCoordinates::rule_aligned(
558 PositionSpan::range(1, 11),
559 PositionSpan::range(0, 10),
560 PositionSpan::empty(),
561 );
562 let mut m2 = create_test_match("#2", 20, 30, 0.85, 85.0, 100);
563 m2.coordinates = MatchCoordinates::rule_aligned(
564 PositionSpan::range(20, 31),
565 PositionSpan::range(0, 11),
566 PositionSpan::empty(),
567 );
568 let mut m3 = create_test_match("#3", 40, 50, 0.8, 80.0, 100);
569 m3.coordinates = MatchCoordinates::rule_aligned(
570 PositionSpan::range(40, 51),
571 PositionSpan::range(0, 11),
572 PositionSpan::empty(),
573 );
574
575 let matches = vec![m1, m2, m3];
576
577 let query = Query::from_extracted_text("test text", &index, false).unwrap();
578 let refined = refine_matches(&index, matches, &query);
579
580 assert_eq!(refined.len(), 3);
581 }
582
583 #[test]
584 fn test_refine_matches_complex_scenario() {
585 let mut index = LicenseIndex::with_legalese_count(10);
586 let _ = index.false_positive_rids.insert(999);
587
588 let mut m1 = create_test_match("#1", 1, 10, 0.7, 100.0, 100);
589 m1.matched_length = 100;
590 m1.rule_length = 100;
591 m1.rule_start_token = 0;
592 m1.coordinates = MatchCoordinates::rule_aligned(
593 PositionSpan::range(1, 11),
594 PositionSpan::range(0, 10),
595 PositionSpan::empty(),
596 );
597 let mut m2 = create_test_match("#1", 8, 15, 0.8, 100.0, 100);
598 m2.matched_length = 100;
599 m2.rule_length = 100;
600 m2.rule_start_token = 7;
601 m2.coordinates = MatchCoordinates::rule_aligned(
602 PositionSpan::range(8, 16),
603 PositionSpan::range(7, 15),
604 PositionSpan::empty(),
605 );
606 let mut m3 = create_test_match("#2", 20, 50, 0.9, 100.0, 100);
607 m3.matched_length = 300;
608 m3.rule_length = 300;
609 m3.rule_start_token = 0;
610 m3.coordinates = MatchCoordinates::rule_aligned(
611 PositionSpan::range(20, 51),
612 PositionSpan::range(0, 31),
613 PositionSpan::empty(),
614 );
615 let mut m4 = create_test_match("#2", 25, 45, 0.85, 100.0, 100);
616 m4.matched_length = 150;
617 m4.rule_length = 300;
618 m4.rule_start_token = 5;
619 m4.coordinates = MatchCoordinates::rule_aligned(
620 PositionSpan::range(25, 46),
621 PositionSpan::range(5, 26),
622 PositionSpan::empty(),
623 );
624
625 let matches = vec![m1, m2, m3, m4];
626
627 let query = Query::from_extracted_text("test text", &index, false).unwrap();
628 let refined = refine_matches(&index, matches, &query);
629
630 assert!(
631 refined.len() >= 2,
632 "Should have at least 2 matches after refinement"
633 );
634 }
635
636 #[test]
637 fn test_split_weak_matches_has_unknown() {
638 let mut m = LicenseMatch {
639 license_expression: "unknown".to_string(),
640 matcher: crate::license_detection::models::MatcherKind::Hash,
641 matched_length: 100,
642 match_coverage: 100.0,
643 ..LicenseMatch::default()
644 };
645 m.end_token = 100;
646 m.rule_length = 100;
647
648 let index = LicenseIndex::with_legalese_count(10);
649 let (good, weak) = split_weak_matches(&index, &[m.clone()]);
650 assert!(weak.contains(&m));
651 assert!(!good.contains(&m));
652 }
653
654 #[test]
655 fn test_split_weak_matches_short_seq_low_coverage() {
656 let mut m = LicenseMatch {
657 license_expression: "mit".to_string(),
658 matcher: crate::license_detection::models::MatcherKind::Seq,
659 matched_length: 10,
660 match_coverage: 20.0,
661 ..LicenseMatch::default()
662 };
663 m.end_token = 10;
664 m.rule_length = 50;
665
666 let index = LicenseIndex::with_legalese_count(10);
667 let (good, weak) = split_weak_matches(&index, &[m.clone()]);
668 assert!(weak.contains(&m));
669 assert!(!good.contains(&m));
670 }
671
672 #[test]
673 fn test_split_weak_matches_keeps_false_positive_unknown_out_of_weak_bucket() {
674 let m = LicenseMatch {
675 rid: 42,
676 license_expression: "unknown".to_string(),
677 matcher: crate::license_detection::models::MatcherKind::Aho,
678 matched_length: 3,
679 rule_length: 3,
680 match_coverage: 100.0,
681 ..LicenseMatch::default()
682 };
683
684 let mut index = LicenseIndex::with_legalese_count(10);
685 index.false_positive_rids.insert(42);
686
687 let (good, weak) = split_weak_matches(&index, std::slice::from_ref(&m));
688 assert!(good.contains(&m));
689 assert!(!weak.contains(&m));
690 }
691
692 #[test]
693 fn test_split_weak_matches_short_seq_high_coverage() {
694 let mut m = LicenseMatch {
695 license_expression: "mit".to_string(),
696 matcher: crate::license_detection::models::MatcherKind::Seq,
697 matched_length: 10,
698 match_coverage: 80.0,
699 ..LicenseMatch::default()
700 };
701 m.end_token = 10;
702 m.rule_length = 15;
703
704 let index = LicenseIndex::with_legalese_count(10);
705 let (good, weak) = split_weak_matches(&index, &[m.clone()]);
706 assert!(good.contains(&m));
707 assert!(!weak.contains(&m));
708 }
709
710 #[test]
711 fn test_split_weak_matches_non_seq_short() {
712 let mut m = LicenseMatch {
713 license_expression: "mit".to_string(),
714 matcher: crate::license_detection::models::MatcherKind::Hash,
715 matched_length: 10,
716 match_coverage: 20.0,
717 ..LicenseMatch::default()
718 };
719 m.end_token = 10;
720 m.rule_length = 15;
721
722 let index = LicenseIndex::with_legalese_count(10);
723 let (good, weak) = split_weak_matches(&index, &[m.clone()]);
724 assert!(good.contains(&m));
725 assert!(!weak.contains(&m));
726 }
727
728 #[test]
729 fn test_split_weak_matches_mixed() {
730 let mut good_match = LicenseMatch {
731 license_expression: "mit".to_string(),
732 matcher: crate::license_detection::models::MatcherKind::Hash,
733 matched_length: 50,
734 match_coverage: 95.0,
735 ..LicenseMatch::default()
736 };
737 good_match.end_token = 50;
738 good_match.rule_length = 50;
739
740 let mut weak_unknown = LicenseMatch {
741 license_expression: "unknown".to_string(),
742 matcher: crate::license_detection::models::MatcherKind::Unknown,
743 matched_length: 30,
744 match_coverage: 50.0,
745 ..LicenseMatch::default()
746 };
747 weak_unknown.end_token = 30;
748 weak_unknown.rule_length = 30;
749
750 let mut weak_seq = LicenseMatch {
751 license_expression: "apache-2.0".to_string(),
752 matcher: crate::license_detection::models::MatcherKind::Seq,
753 matched_length: 10,
754 match_coverage: 20.0,
755 ..LicenseMatch::default()
756 };
757 weak_seq.end_token = 10;
758 weak_seq.rule_length = 50;
759
760 let matches = vec![good_match.clone(), weak_unknown.clone(), weak_seq.clone()];
761 let index = LicenseIndex::with_legalese_count(10);
762 let (good, weak) = split_weak_matches(&index, &matches);
763
764 assert_eq!(good.len(), 1);
765 assert_eq!(weak.len(), 2);
766 assert!(good.contains(&good_match));
767 assert!(weak.contains(&weak_unknown));
768 assert!(weak.contains(&weak_seq));
769 }
770}