1use crate::license_detection::index::LicenseIndex;
4use crate::license_detection::index::dictionary::TokenId;
5use crate::license_detection::index::token_sets::{
6 build_set_and_mset, high_multiset_subset, tids_set_counter,
7};
8use crate::license_detection::models::Rule;
9use crate::license_detection::query::QueryRun;
10use std::collections::{HashMap, HashSet};
11
12use super::HIGH_RESEMBLANCE_THRESHOLD;
13
14#[derive(Debug, Clone, PartialEq)]
20pub struct ScoresVector {
21 pub is_highly_resemblant: bool,
23 pub containment: f32,
25 pub resemblance: f32,
27 pub matched_length: f32,
29 pub rid: usize,
31}
32
33impl PartialOrd for ScoresVector {
34 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
35 Some(self.cmp(other))
36 }
37}
38
39impl Eq for ScoresVector {}
40
41impl Ord for ScoresVector {
42 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
43 self.is_highly_resemblant
50 .cmp(&other.is_highly_resemblant)
51 .then_with(|| {
52 self.containment
53 .partial_cmp(&other.containment)
54 .unwrap_or(std::cmp::Ordering::Equal)
55 })
56 .then_with(|| {
57 self.resemblance
58 .partial_cmp(&other.resemblance)
59 .unwrap_or(std::cmp::Ordering::Equal)
60 })
61 .then_with(|| {
62 self.matched_length
63 .partial_cmp(&other.matched_length)
64 .unwrap_or(std::cmp::Ordering::Equal)
65 })
66 }
67}
68
69#[derive(Debug, Clone, PartialEq)]
73pub struct Candidate {
74 pub score_vec_rounded: ScoresVector,
76 pub score_vec_full: ScoresVector,
78 pub rid: usize,
80 pub rule: Rule,
82 pub high_set_intersection: HashSet<TokenId>,
84}
85
86impl PartialOrd for Candidate {
87 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
88 Some(self.cmp(other))
89 }
90}
91
92impl Eq for Candidate {}
93
94impl Ord for Candidate {
95 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
96 compare_candidate_rank(
102 &self.score_vec_rounded,
103 &self.score_vec_full,
104 self.rid,
105 &other.score_vec_rounded,
106 &other.score_vec_full,
107 other.rid,
108 )
109 }
110}
111
112fn compare_candidate_rank(
113 rounded: &ScoresVector,
114 full: &ScoresVector,
115 rid: usize,
116 other_rounded: &ScoresVector,
117 other_full: &ScoresVector,
118 other_rid: usize,
119) -> std::cmp::Ordering {
120 rounded
121 .cmp(other_rounded)
122 .then_with(|| full.cmp(other_full))
123 .then_with(|| rid.cmp(&other_rid))
124}
125
126fn python_round_tenths(value: f64) -> f32 {
127 let rendered = format!("{value:.20}");
128 let (whole, frac) = rendered.split_once('.').unwrap_or((rendered.as_str(), "0"));
129
130 let whole_part: i64 = whole.parse().unwrap_or(0);
131 let mut frac_chars = frac.chars();
132 let tenths = frac_chars.next().and_then(|c| c.to_digit(10)).unwrap_or(0) as i64;
133 let rest: String = frac_chars.collect();
134
135 let threshold = format!("5{}", "0".repeat(rest.len().saturating_sub(1)));
136 let should_round_up = if rest > threshold {
137 true
138 } else if rest == threshold {
139 tenths % 2 == 1
140 } else {
141 false
142 };
143
144 let mut scaled = whole_part * 10 + tenths;
145 if should_round_up {
146 scaled += 1;
147 }
148
149 scaled as f32 / 10.0
150}
151
152fn quantize_tenths(value: f32) -> i32 {
153 format!("{value:.1}")
154 .chars()
155 .filter(|c| *c != '.')
156 .collect::<String>()
157 .parse()
158 .unwrap_or(0)
159}
160
161fn build_score_vectors(
162 resemblance: f64,
163 containment: f64,
164 matched_length: usize,
165 rid: usize,
166) -> (ScoresVector, ScoresVector) {
167 let amplified_resemblance = resemblance * resemblance;
168
169 let score_vec_rounded = ScoresVector {
170 is_highly_resemblant: python_round_tenths(resemblance) >= HIGH_RESEMBLANCE_THRESHOLD,
171 containment: python_round_tenths(containment),
172 resemblance: python_round_tenths(amplified_resemblance),
173 matched_length: python_round_tenths(matched_length as f64 / 20.0),
174 rid,
175 };
176
177 let score_vec_full = ScoresVector {
178 is_highly_resemblant: resemblance >= f64::from(HIGH_RESEMBLANCE_THRESHOLD),
179 containment: containment as f32,
180 resemblance: amplified_resemblance as f32,
181 matched_length: matched_length as f32,
182 rid,
183 };
184
185 (score_vec_rounded, score_vec_full)
186}
187
188#[derive(Debug, Clone, Hash, PartialEq, Eq)]
195struct DupeGroupKey {
196 license_expression: String,
197 is_highly_resemblant: bool,
198 containment: i32,
199 resemblance: i32,
200 matched_length: i32,
201 rule_length: usize,
202}
203
204pub(super) fn filter_dupes(candidates: Vec<Candidate>) -> Vec<Candidate> {
215 let mut groups: HashMap<DupeGroupKey, Vec<Candidate>> = HashMap::new();
216
217 for candidate in candidates {
218 let key = DupeGroupKey {
219 license_expression: candidate.rule.license_expression.clone(),
220 is_highly_resemblant: candidate.score_vec_rounded.is_highly_resemblant,
221 containment: quantize_tenths(candidate.score_vec_rounded.containment),
222 resemblance: quantize_tenths(candidate.score_vec_rounded.resemblance),
223 matched_length: quantize_tenths(candidate.score_vec_rounded.matched_length),
224 rule_length: candidate.rule.tokens.len(),
225 };
226 groups.entry(key).or_default().push(candidate);
227 }
228
229 let mut result: Vec<Candidate> = Vec::new();
230 for mut group in groups.into_values() {
231 group.sort_by(|a, b| {
234 b.score_vec_full
235 .cmp(&a.score_vec_full)
236 .then_with(|| b.rule.identifier.cmp(&a.rule.identifier))
237 });
238 if let Some(best) = group.into_iter().next() {
239 result.push(best);
240 }
241 }
242
243 result
244}
245
246pub fn multisets_intersector(
253 qmset: &HashMap<TokenId, usize>,
254 imset: &HashMap<TokenId, usize>,
255) -> HashMap<TokenId, usize> {
256 let (set1, set2) = if qmset.len() < imset.len() {
257 (qmset, imset)
258 } else {
259 (imset, qmset)
260 };
261
262 set1.iter()
263 .filter_map(|(&tid, &count1)| set2.get(&tid).map(|&count2| (tid, count1.min(count2))))
264 .collect()
265}
266
267pub fn compute_candidates_with_msets(
273 index: &LicenseIndex,
274 query_run: &QueryRun,
275 high_resemblance: bool,
276 top_n: usize,
277) -> Vec<Candidate> {
278 let query_tokens = query_run.matchable_tokens();
279 if query_tokens.is_empty() {
280 return Vec::new();
281 }
282
283 let query_token_ids: Vec<TokenId> = query_tokens
284 .iter()
285 .filter(|&&tid| tid >= 0)
286 .map(|&tid| TokenId::new(tid as u16))
287 .collect();
288
289 if query_token_ids.is_empty() {
290 return Vec::new();
291 }
292
293 let (query_set, query_mset) = build_set_and_mset(&query_token_ids);
294
295 let query_high_set: HashSet<TokenId> = query_set
297 .iter()
298 .filter(|tid| tid.as_usize() < index.len_legalese)
299 .copied()
300 .collect();
301
302 if query_high_set.is_empty() {
303 return Vec::new();
304 }
305
306 let candidate_rids: HashSet<usize> = query_high_set
308 .iter()
309 .filter_map(|tid| index.rids_by_high_tid.get(tid))
310 .flat_map(|rids| rids.iter().copied())
311 .collect();
312
313 if candidate_rids.is_empty() {
314 return Vec::new();
315 }
316
317 let mut step1_candidates: Vec<(ScoresVector, ScoresVector, usize, Rule, HashSet<TokenId>)> =
318 Vec::new();
319
320 for rid in candidate_rids {
321 let Some(rule) = index.rules_by_rid.get(rid) else {
322 continue;
323 };
324 let Some(rule_set) = index.sets_by_rid.get(&rid) else {
325 continue;
326 };
327 let Some(rule_high_set) = index.high_sets_by_rid.get(&rid) else {
328 continue;
329 };
330
331 let high_intersection_size = query_high_set.intersection(rule_high_set).count();
334 if high_intersection_size < rule.min_high_matched_length_unique {
335 continue;
336 }
337
338 let high_set_intersection: HashSet<TokenId> = query_high_set
340 .intersection(rule_high_set)
341 .copied()
342 .collect();
343 if high_set_intersection.is_empty() {
344 continue;
345 }
346
347 let intersection: HashSet<TokenId> = query_set.intersection(rule_set).copied().collect();
349 if intersection.is_empty() {
350 continue;
351 }
352
353 let matched_length = tids_set_counter(&intersection);
355 if matched_length < rule.min_matched_length_unique {
356 continue;
357 }
358
359 let qset_len = query_set.len();
361 let iset_len = rule.length_unique;
362 if qset_len == 0 || iset_len == 0 {
363 continue;
364 }
365
366 let union_len = qset_len + iset_len - matched_length;
367 let resemblance = matched_length as f64 / union_len as f64;
368 let containment = matched_length as f64 / iset_len as f64;
369
370 let minimum_containment = rule.minimum_coverage.map(|mc| mc as f64 / 100.0);
373 if let Some(min_cont) = minimum_containment
374 && containment < min_cont
375 {
376 continue;
377 }
378
379 let (svr, svf) = build_score_vectors(resemblance, containment, matched_length, rid);
380
381 if high_resemblance && (!svr.is_highly_resemblant || !svf.is_highly_resemblant) {
382 continue;
383 }
384
385 step1_candidates.push((svr, svf, rid, rule.clone(), high_set_intersection));
386 }
387
388 if step1_candidates.is_empty() {
389 return Vec::new();
390 }
391
392 step1_candidates.sort_by(|a, b| compare_candidate_rank(&b.0, &b.1, b.2, &a.0, &a.1, a.2));
393
394 step1_candidates.truncate(top_n * 10);
395
396 let mut sortable_candidates: Vec<Candidate> = Vec::new();
397
398 for (_svr, _svf, rid, rule, high_set_intersection) in step1_candidates {
399 let Some(rule_mset) = index.msets_by_rid.get(&rid) else {
400 continue;
401 };
402
403 let query_high_mset = high_multiset_subset(&query_mset, &index.dictionary);
405 let rule_high_mset = high_multiset_subset(rule_mset, &index.dictionary);
406 let high_intersection_mset = multisets_intersector(&query_high_mset, &rule_high_mset);
407 if high_intersection_mset.is_empty() {
408 continue;
409 }
410
411 let high_matched_length: usize = high_intersection_mset.values().sum();
412 if high_matched_length < rule.min_high_matched_length {
413 continue;
414 }
415
416 let full_intersection_mset = multisets_intersector(&query_mset, rule_mset);
418 let matched_length: usize = full_intersection_mset.values().sum();
419 if matched_length < rule.min_matched_length {
420 continue;
421 }
422 let qset_len: usize = query_mset.values().sum();
423 let iset_len: usize = rule_mset.values().sum();
424
425 if qset_len == 0 || iset_len == 0 {
426 continue;
427 }
428
429 let union_len = qset_len + iset_len - matched_length;
430 let resemblance = matched_length as f64 / union_len as f64;
431 let containment = matched_length as f64 / iset_len as f64;
432
433 let minimum_containment = rule.minimum_coverage.map(|mc| mc as f64 / 100.0);
436 if let Some(min_cont) = minimum_containment
437 && containment < min_cont
438 {
439 continue;
440 }
441
442 let (score_vec_rounded, score_vec_full) =
443 build_score_vectors(resemblance, containment, matched_length, rid);
444
445 if high_resemblance
446 && (!score_vec_rounded.is_highly_resemblant || !score_vec_full.is_highly_resemblant)
447 {
448 continue;
449 }
450
451 sortable_candidates.push(Candidate {
452 score_vec_rounded,
453 score_vec_full,
454 rid,
455 rule,
456 high_set_intersection,
457 });
458 }
459
460 sortable_candidates = filter_dupes(sortable_candidates);
461
462 sortable_candidates.sort_by(|a, b| b.cmp(a));
463 sortable_candidates.truncate(top_n);
464
465 sortable_candidates
466}
467
468#[cfg(test)]
469mod tests {
470 use super::*;
471 use crate::license_detection::index::dictionary::tid;
472
473 #[test]
474 fn test_scores_vector_comparison() {
475 let sv1 = ScoresVector {
476 is_highly_resemblant: true,
477 containment: 0.9,
478 resemblance: 0.8,
479 matched_length: 10.0,
480 rid: 0,
481 };
482
483 let sv2 = ScoresVector {
484 is_highly_resemblant: false,
485 containment: 0.8,
486 resemblance: 0.6,
487 matched_length: 5.0,
488 rid: 1,
489 };
490
491 assert!(sv1 > sv2);
492 }
493
494 #[test]
495 fn test_python_round_tenths_matches_python_half_even_behavior() {
496 assert_eq!(python_round_tenths(0.05), 0.1);
497 assert_eq!(python_round_tenths(0.15), 0.1);
498 assert_eq!(python_round_tenths(0.25), 0.2);
499 assert_eq!(python_round_tenths(2.25), 2.2);
500 assert_eq!(python_round_tenths(4.35), 4.3);
501 assert_eq!(python_round_tenths(6.65), 6.7);
502 }
503
504 #[test]
505 fn test_candidate_ordering() {
506 let candidate1 = Candidate {
507 score_vec_rounded: ScoresVector {
508 is_highly_resemblant: true,
509 containment: 0.9,
510 resemblance: 0.8,
511 matched_length: 10.0,
512 rid: 0,
513 },
514 score_vec_full: ScoresVector {
515 is_highly_resemblant: true,
516 containment: 0.9,
517 resemblance: 0.8,
518 matched_length: 10.0,
519 rid: 0,
520 },
521 rid: 0,
522 rule: Rule {
523 identifier: "test1".to_string(),
524 license_expression: "mit".to_string(),
525 text: String::new(),
526 tokens: vec![],
527 rule_kind: crate::license_detection::models::RuleKind::Text,
528 is_false_positive: false,
529 is_required_phrase: false,
530 is_from_license: false,
531 relevance: 100,
532 minimum_coverage: None,
533 has_stored_minimum_coverage: false,
534 is_continuous: true,
535 referenced_filenames: None,
536 ignorable_urls: None,
537 ignorable_emails: None,
538 ignorable_copyrights: None,
539 ignorable_holders: None,
540 ignorable_authors: None,
541 language: None,
542 notes: None,
543 length_unique: 0,
544 high_length_unique: 0,
545 high_length: 0,
546 min_matched_length: 0,
547 min_high_matched_length: 0,
548 min_matched_length_unique: 0,
549 min_high_matched_length_unique: 0,
550 is_small: false,
551 is_tiny: false,
552 starts_with_license: false,
553 ends_with_license: false,
554 is_deprecated: false,
555 spdx_license_key: None,
556 other_spdx_license_keys: vec![],
557 required_phrase_spans: vec![],
558 stopwords_by_pos: std::collections::HashMap::new(),
559 },
560 high_set_intersection: HashSet::new(),
561 };
562
563 let candidate2 = Candidate {
564 score_vec_rounded: ScoresVector {
565 is_highly_resemblant: false,
566 containment: 0.5,
567 resemblance: 0.3,
568 matched_length: 5.0,
569 rid: 1,
570 },
571 score_vec_full: ScoresVector {
572 is_highly_resemblant: false,
573 containment: 0.5,
574 resemblance: 0.3,
575 matched_length: 5.0,
576 rid: 1,
577 },
578 rid: 1,
579 rule: Rule {
580 identifier: "test2".to_string(),
581 license_expression: "apache".to_string(),
582 text: String::new(),
583 tokens: vec![],
584 rule_kind: crate::license_detection::models::RuleKind::Text,
585 is_false_positive: false,
586 is_required_phrase: false,
587 is_from_license: false,
588 relevance: 100,
589 minimum_coverage: None,
590 has_stored_minimum_coverage: false,
591 is_continuous: true,
592 referenced_filenames: None,
593 ignorable_urls: None,
594 ignorable_emails: None,
595 ignorable_copyrights: None,
596 ignorable_holders: None,
597 ignorable_authors: None,
598 language: None,
599 notes: None,
600 length_unique: 0,
601 high_length_unique: 0,
602 high_length: 0,
603 min_matched_length: 0,
604 min_high_matched_length: 0,
605 min_matched_length_unique: 0,
606 min_high_matched_length_unique: 0,
607 is_small: false,
608 is_tiny: false,
609 starts_with_license: false,
610 ends_with_license: false,
611 is_deprecated: false,
612 spdx_license_key: None,
613 other_spdx_license_keys: vec![],
614 required_phrase_spans: vec![],
615 stopwords_by_pos: std::collections::HashMap::new(),
616 },
617 high_set_intersection: HashSet::new(),
618 };
619
620 assert!(
621 candidate1 > candidate2,
622 "Higher containment candidate should rank higher"
623 );
624 }
625
626 #[test]
627 fn test_filter_dupes_matched_length_precision() {
628 let rule1 = Rule {
629 identifier: "x11-dec1.RULE".to_string(),
630 license_expression: "x11-dec1".to_string(),
631 text: String::new(),
632 tokens: vec![tid(0); 138],
633 rule_kind: crate::license_detection::models::RuleKind::Text,
634 is_false_positive: false,
635 is_required_phrase: false,
636 is_from_license: false,
637 relevance: 100,
638 minimum_coverage: None,
639 has_stored_minimum_coverage: false,
640 is_continuous: true,
641 referenced_filenames: None,
642 ignorable_urls: None,
643 ignorable_emails: None,
644 ignorable_copyrights: None,
645 ignorable_holders: None,
646 ignorable_authors: None,
647 language: None,
648 notes: None,
649 length_unique: 0,
650 high_length_unique: 0,
651 high_length: 0,
652 min_matched_length: 0,
653 min_high_matched_length: 0,
654 min_matched_length_unique: 0,
655 min_high_matched_length_unique: 0,
656 is_small: false,
657 is_tiny: false,
658 starts_with_license: false,
659 ends_with_license: false,
660 is_deprecated: false,
661 spdx_license_key: None,
662 other_spdx_license_keys: vec![],
663 required_phrase_spans: vec![],
664 stopwords_by_pos: std::collections::HashMap::new(),
665 };
666
667 let rule2 = Rule {
668 identifier: "cmu-uc.RULE".to_string(),
669 license_expression: "cmu-uc".to_string(),
670 text: String::new(),
671 tokens: vec![tid(0); 133],
672 ..rule1.clone()
673 };
674
675 let candidate1 = Candidate {
676 score_vec_rounded: ScoresVector {
677 is_highly_resemblant: false,
678 containment: 0.5,
679 resemblance: 0.25,
680 matched_length: 7.0,
681 rid: 1,
682 },
683 score_vec_full: ScoresVector {
684 is_highly_resemblant: false,
685 containment: 0.5,
686 resemblance: 0.25,
687 matched_length: 138.0,
688 rid: 1,
689 },
690 rid: 1,
691 rule: rule1,
692 high_set_intersection: HashSet::new(),
693 };
694
695 let candidate2 = Candidate {
696 score_vec_rounded: ScoresVector {
697 is_highly_resemblant: false,
698 containment: 0.5,
699 resemblance: 0.25,
700 matched_length: 7.0,
701 rid: 2,
702 },
703 score_vec_full: ScoresVector {
704 is_highly_resemblant: false,
705 containment: 0.5,
706 resemblance: 0.25,
707 matched_length: 133.0,
708 rid: 2,
709 },
710 rid: 2,
711 rule: rule2,
712 high_set_intersection: HashSet::new(),
713 };
714
715 let candidates = vec![candidate1, candidate2];
716 let filtered = filter_dupes(candidates);
717
718 assert_eq!(
719 filtered.len(),
720 2,
721 "Should keep both candidates when matched_length differs at 1-decimal precision: 138/20=6.9 vs 133/20=6.7"
722 );
723 }
724
725 #[test]
726 fn test_filter_dupes_same_group() {
727 let rule1 = Rule {
728 identifier: "mit.RULE".to_string(),
729 license_expression: "mit".to_string(),
730 text: String::new(),
731 tokens: vec![tid(0); 100],
732 rule_kind: crate::license_detection::models::RuleKind::Text,
733 is_false_positive: false,
734 is_required_phrase: false,
735 is_from_license: false,
736 relevance: 100,
737 minimum_coverage: None,
738 has_stored_minimum_coverage: false,
739 is_continuous: true,
740 referenced_filenames: None,
741 ignorable_urls: None,
742 ignorable_emails: None,
743 ignorable_copyrights: None,
744 ignorable_holders: None,
745 ignorable_authors: None,
746 language: None,
747 notes: None,
748 length_unique: 0,
749 high_length_unique: 0,
750 high_length: 0,
751 min_matched_length: 0,
752 min_high_matched_length: 0,
753 min_matched_length_unique: 0,
754 min_high_matched_length_unique: 0,
755 is_small: false,
756 is_tiny: false,
757 starts_with_license: false,
758 ends_with_license: false,
759 is_deprecated: false,
760 spdx_license_key: None,
761 other_spdx_license_keys: vec![],
762 required_phrase_spans: vec![],
763 stopwords_by_pos: std::collections::HashMap::new(),
764 };
765
766 let rule2 = Rule {
767 identifier: "mit_2.RULE".to_string(),
768 license_expression: "mit".to_string(),
769 text: String::new(),
770 tokens: vec![tid(0); 100],
771 ..rule1.clone()
772 };
773
774 let candidate1 = Candidate {
775 score_vec_rounded: ScoresVector {
776 is_highly_resemblant: false,
777 containment: 0.5,
778 resemblance: 0.25,
779 matched_length: 5.0,
780 rid: 1,
781 },
782 score_vec_full: ScoresVector {
783 is_highly_resemblant: false,
784 containment: 0.5,
785 resemblance: 0.25,
786 matched_length: 100.0,
787 rid: 1,
788 },
789 rid: 1,
790 rule: rule1,
791 high_set_intersection: HashSet::new(),
792 };
793
794 let candidate2 = Candidate {
795 score_vec_rounded: ScoresVector {
796 is_highly_resemblant: false,
797 containment: 0.5,
798 resemblance: 0.25,
799 matched_length: 5.0,
800 rid: 2,
801 },
802 score_vec_full: ScoresVector {
803 is_highly_resemblant: false,
804 containment: 0.5,
805 resemblance: 0.25,
806 matched_length: 100.0,
807 rid: 2,
808 },
809 rid: 2,
810 rule: rule2,
811 high_set_intersection: HashSet::new(),
812 };
813
814 let candidates = vec![candidate1, candidate2];
815 let filtered = filter_dupes(candidates);
816
817 assert_eq!(
818 filtered.len(),
819 1,
820 "Should keep only one candidate when all group keys match"
821 );
822 }
823
824 #[test]
825 fn test_filter_dupes_prefers_higher_identifier_when_full_scores_tie() {
826 let rule_sa = Rule {
827 identifier: "cc-by-sa-1.0.RULE".to_string(),
828 license_expression: "cc-by-sa-1.0".to_string(),
829 text: String::new(),
830 tokens: vec![tid(0); 1960],
831 rule_kind: crate::license_detection::models::RuleKind::Text,
832 is_false_positive: false,
833 is_required_phrase: false,
834 is_from_license: false,
835 relevance: 100,
836 minimum_coverage: None,
837 has_stored_minimum_coverage: false,
838 is_continuous: true,
839 referenced_filenames: None,
840 ignorable_urls: None,
841 ignorable_emails: None,
842 ignorable_copyrights: None,
843 ignorable_holders: None,
844 ignorable_authors: None,
845 language: None,
846 notes: None,
847 length_unique: 0,
848 high_length_unique: 0,
849 high_length: 0,
850 min_matched_length: 0,
851 min_high_matched_length: 0,
852 min_matched_length_unique: 0,
853 min_high_matched_length_unique: 0,
854 is_small: false,
855 is_tiny: false,
856 starts_with_license: false,
857 ends_with_license: false,
858 is_deprecated: false,
859 spdx_license_key: None,
860 other_spdx_license_keys: vec![],
861 required_phrase_spans: vec![],
862 stopwords_by_pos: std::collections::HashMap::new(),
863 };
864
865 let rule_nc_sa = Rule {
866 identifier: "cc-by-nc-sa-1.0.RULE".to_string(),
867 license_expression: "cc-by-nc-sa-1.0".to_string(),
868 text: String::new(),
869 tokens: vec![tid(0); 1829],
870 rule_kind: crate::license_detection::models::RuleKind::Text,
871 is_false_positive: false,
872 is_required_phrase: false,
873 is_from_license: false,
874 relevance: 100,
875 minimum_coverage: None,
876 has_stored_minimum_coverage: false,
877 is_continuous: true,
878 referenced_filenames: None,
879 ignorable_urls: None,
880 ignorable_emails: None,
881 ignorable_copyrights: None,
882 ignorable_holders: None,
883 ignorable_authors: None,
884 language: None,
885 notes: None,
886 length_unique: 0,
887 high_length_unique: 0,
888 high_length: 0,
889 min_matched_length: 0,
890 min_high_matched_length: 0,
891 min_matched_length_unique: 0,
892 min_high_matched_length_unique: 0,
893 is_small: false,
894 is_tiny: false,
895 starts_with_license: false,
896 ends_with_license: false,
897 is_deprecated: false,
898 spdx_license_key: None,
899 other_spdx_license_keys: vec![],
900 required_phrase_spans: vec![],
901 stopwords_by_pos: std::collections::HashMap::new(),
902 };
903
904 let candidate_sa = Candidate {
905 score_vec_rounded: ScoresVector {
906 is_highly_resemblant: true,
907 containment: 0.9,
908 resemblance: 0.8,
909 matched_length: 100.0,
910 rid: 1,
911 },
912 score_vec_full: ScoresVector {
913 is_highly_resemblant: true,
914 containment: 0.9,
915 resemblance: 0.8,
916 matched_length: 100.0,
917 rid: 1,
918 },
919 rid: 1,
920 rule: rule_sa,
921 high_set_intersection: HashSet::new(),
922 };
923
924 let candidate_nc_sa = Candidate {
925 score_vec_rounded: ScoresVector {
926 is_highly_resemblant: true,
927 containment: 0.9,
928 resemblance: 0.8,
929 matched_length: 100.0,
930 rid: 2,
931 },
932 score_vec_full: ScoresVector {
933 is_highly_resemblant: true,
934 containment: 0.9,
935 resemblance: 0.8,
936 matched_length: 100.0,
937 rid: 2,
938 },
939 rid: 2,
940 rule: rule_nc_sa,
941 high_set_intersection: HashSet::new(),
942 };
943
944 let candidates = vec![candidate_nc_sa, candidate_sa];
945 let filtered = filter_dupes(candidates);
946
947 assert_eq!(
948 filtered.len(),
949 2,
950 "Different license expressions should create different groups"
951 );
952
953 let mut same_group_candidates = vec![filtered[0].clone(), filtered[1].clone()];
954 for candidate in &mut same_group_candidates {
955 candidate.rule.license_expression = "same".to_string();
956 candidate.rule.tokens = vec![tid(0); 100];
957 }
958
959 let deduped = filter_dupes(same_group_candidates);
960 assert_eq!(deduped.len(), 1);
961 assert_eq!(deduped[0].rule.identifier, "cc-by-sa-1.0.RULE");
962 }
963
964 #[test]
965 fn test_candidate_ordering_uses_rid_after_equal_scores() {
966 let rule_a = Rule {
967 identifier: "a.RULE".to_string(),
968 license_expression: "a".to_string(),
969 text: String::new(),
970 tokens: vec![tid(0); 10],
971 rule_kind: crate::license_detection::models::RuleKind::Text,
972 is_false_positive: false,
973 is_required_phrase: false,
974 is_from_license: false,
975 relevance: 100,
976 minimum_coverage: None,
977 has_stored_minimum_coverage: false,
978 is_continuous: true,
979 referenced_filenames: None,
980 ignorable_urls: None,
981 ignorable_emails: None,
982 ignorable_copyrights: None,
983 ignorable_holders: None,
984 ignorable_authors: None,
985 language: None,
986 notes: None,
987 length_unique: 0,
988 high_length_unique: 0,
989 high_length: 0,
990 min_matched_length: 0,
991 min_high_matched_length: 0,
992 min_matched_length_unique: 0,
993 min_high_matched_length_unique: 0,
994 is_small: false,
995 is_tiny: false,
996 starts_with_license: false,
997 ends_with_license: false,
998 is_deprecated: false,
999 spdx_license_key: None,
1000 other_spdx_license_keys: vec![],
1001 required_phrase_spans: vec![],
1002 stopwords_by_pos: std::collections::HashMap::new(),
1003 };
1004
1005 let candidate_low_rid = Candidate {
1006 score_vec_rounded: ScoresVector {
1007 is_highly_resemblant: true,
1008 containment: 0.9,
1009 resemblance: 0.8,
1010 matched_length: 10.0,
1011 rid: 1,
1012 },
1013 score_vec_full: ScoresVector {
1014 is_highly_resemblant: true,
1015 containment: 0.9,
1016 resemblance: 0.8,
1017 matched_length: 10.0,
1018 rid: 1,
1019 },
1020 rid: 1,
1021 rule: Rule {
1022 identifier: "z.RULE".to_string(),
1023 ..rule_a.clone()
1024 },
1025 high_set_intersection: HashSet::new(),
1026 };
1027
1028 let candidate_high_rid = Candidate {
1029 score_vec_rounded: ScoresVector {
1030 rid: 2,
1031 ..candidate_low_rid.score_vec_rounded.clone()
1032 },
1033 score_vec_full: ScoresVector {
1034 rid: 2,
1035 ..candidate_low_rid.score_vec_full.clone()
1036 },
1037 rid: 2,
1038 rule: Rule {
1039 identifier: "a.RULE".to_string(),
1040 ..rule_a
1041 },
1042 high_set_intersection: HashSet::new(),
1043 };
1044
1045 let mut sorted = [candidate_low_rid, candidate_high_rid];
1046 sorted.sort_by(|a, b| b.cmp(a));
1047 assert_eq!(
1048 sorted[0].rid, 2,
1049 "Python final candidate tuple ordering falls back to higher rid after equal scores"
1050 );
1051 }
1052}