1use crate::license_detection::index::LicenseIndex;
4use crate::license_detection::index::dictionary::TokenId;
5use crate::license_detection::index::token_sets::{
6 build_set_and_mset, high_multiset_subset, tids_set_counter,
7};
8use crate::license_detection::models::Rule;
9use crate::license_detection::query::QueryRun;
10use std::collections::{HashMap, HashSet};
11
12use super::HIGH_RESEMBLANCE_THRESHOLD;
13
/// Score tuple used to rank a rule against a query.
///
/// `build_score_vectors` produces two of these per candidate: a "rounded"
/// one whose float components are rounded to tenths, and a "full" one that
/// keeps the unrounded values.
#[derive(Debug, Clone, PartialEq)]
pub struct ScoresVector {
    /// Whether the resemblance reached `HIGH_RESEMBLANCE_THRESHOLD`.
    pub is_highly_resemblant: bool,
    /// Matched length divided by the rule length.
    pub containment: f32,
    /// Resemblance score; `build_score_vectors` stores the squared value here.
    pub resemblance: f32,
    /// Matched length: divided by 20 and rounded in the rounded vector,
    /// the raw count in the full vector.
    pub matched_length: f32,
    /// Id of the scored rule. Takes part in the derived `PartialEq` but is
    /// not consulted by the `Ord` implementation.
    pub rid: usize,
}
32
/// Partial ordering that defers to [`Ord::cmp`], so it never returns `None`.
impl PartialOrd for ScoresVector {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        // Canonical form: keeps `PartialOrd` and `Ord` in agreement.
        Some(self.cmp(other))
    }
}
38
// Marker impl required by `Ord`. Equality itself is the derived field-wise
// `PartialEq` (which includes `rid` and uses float `==`), so it is only
// reflexive while no float component is NaN.
impl Eq for ScoresVector {}
40
41impl Ord for ScoresVector {
42 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
43 self.is_highly_resemblant
50 .cmp(&other.is_highly_resemblant)
51 .then_with(|| {
52 self.containment
53 .partial_cmp(&other.containment)
54 .unwrap_or(std::cmp::Ordering::Equal)
55 })
56 .then_with(|| {
57 self.resemblance
58 .partial_cmp(&other.resemblance)
59 .unwrap_or(std::cmp::Ordering::Equal)
60 })
61 .then_with(|| {
62 self.matched_length
63 .partial_cmp(&other.matched_length)
64 .unwrap_or(std::cmp::Ordering::Equal)
65 })
66 }
67}
68
/// A rule retained as a possible match for a query, with its ranking scores.
#[derive(Debug, Clone, PartialEq)]
pub struct Candidate<'a> {
    /// Scores rounded to tenths; compared first when ranking.
    pub score_vec_rounded: ScoresVector,
    /// Unrounded scores; break ties between equal rounded scores.
    pub score_vec_full: ScoresVector,
    /// Id of the rule; the last tie-breaker when ranking.
    pub rid: usize,
    /// The rule this candidate refers to.
    pub rule: &'a Rule,
    /// High token ids present in both the query and the rule.
    pub high_set_intersection: HashSet<TokenId>,
}
85
/// Partial ordering that defers to [`Ord::cmp`], so it never returns `None`.
impl PartialOrd for Candidate<'_> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        // Canonical form: keeps `PartialOrd` and `Ord` in agreement.
        Some(self.cmp(other))
    }
}
91
// Marker impl required by `Ord`; equality is the derived field-wise
// `PartialEq`, which also compares the rule and the token intersection.
impl Eq for Candidate<'_> {}
93
94impl Ord for Candidate<'_> {
95 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
96 compare_candidate_rank(
102 &self.score_vec_rounded,
103 &self.score_vec_full,
104 self.rid,
105 &other.score_vec_rounded,
106 &other.score_vec_full,
107 other.rid,
108 )
109 }
110}
111
112fn compare_candidate_rank(
113 rounded: &ScoresVector,
114 full: &ScoresVector,
115 rid: usize,
116 other_rounded: &ScoresVector,
117 other_full: &ScoresVector,
118 other_rid: usize,
119) -> std::cmp::Ordering {
120 rounded
121 .cmp(other_rounded)
122 .then_with(|| full.cmp(other_full))
123 .then_with(|| rid.cmp(&other_rid))
124}
125
/// Rounds `value` to one decimal place the way Python's `round(value, 1)` does.
///
/// The decision is taken on a 20-digit decimal rendering of the float, so a
/// literal such as `0.15` (stored slightly below 0.15) rounds down, and only
/// values whose rendering is exactly half-way (`0.25`, `2.25`, ...) use the
/// round-half-to-even tie rule.
///
/// The rounding is done on the magnitude and the sign is re-applied at the
/// end, so negative inputs mirror their positive counterparts. Non-finite
/// inputs render without digits and yield zero.
fn python_round_tenths(value: f64) -> f32 {
    let is_negative = value.is_sign_negative();
    let magnitude = value.abs();

    let rendered = format!("{magnitude:.20}");
    let (whole, frac) = rendered.split_once('.').unwrap_or((rendered.as_str(), "0"));

    let whole_part: i64 = whole.parse().unwrap_or(0);
    let mut frac_chars = frac.chars();
    let tenths = frac_chars
        .next()
        .and_then(|c| c.to_digit(10))
        .map_or(0, i64::from);
    // Everything after the tenths digit decides the rounding direction.
    let rest = frac_chars.as_str();

    let should_round_up = match rest.strip_prefix('5') {
        // Exactly half-way: round to the even tenths digit.
        Some(tail) if tail.bytes().all(|b| b == b'0') => tenths % 2 == 1,
        // Above half-way.
        Some(_) => true,
        // Leading digit other than 5: up only when it is greater than 5.
        None => matches!(rest.bytes().next(), Some(b'6'..=b'9')),
    };

    let scaled = whole_part * 10 + tenths + i64::from(should_round_up);
    let rounded = scaled as f32 / 10.0;

    if is_negative { -rounded } else { rounded }
}
151
/// Converts a score to an integer count of tenths (`6.9` becomes `69`).
///
/// The value is rendered with one decimal and the decimal point is removed;
/// anything that does not parse as an integer afterwards (for example NaN)
/// becomes `0`.
fn quantize_tenths(value: f32) -> i32 {
    let rendered = format!("{value:.1}");
    let digits = rendered.replace('.', "");
    digits.parse().unwrap_or(0)
}
160
161fn build_score_vectors(
162 resemblance: f64,
163 containment: f64,
164 matched_length: usize,
165 rid: usize,
166) -> (ScoresVector, ScoresVector) {
167 let amplified_resemblance = resemblance * resemblance;
168
169 let score_vec_rounded = ScoresVector {
170 is_highly_resemblant: python_round_tenths(resemblance) >= HIGH_RESEMBLANCE_THRESHOLD,
171 containment: python_round_tenths(containment),
172 resemblance: python_round_tenths(amplified_resemblance),
173 matched_length: python_round_tenths(matched_length as f64 / 20.0),
174 rid,
175 };
176
177 let score_vec_full = ScoresVector {
178 is_highly_resemblant: resemblance >= f64::from(HIGH_RESEMBLANCE_THRESHOLD),
179 containment: containment as f32,
180 resemblance: amplified_resemblance as f32,
181 matched_length: matched_length as f32,
182 rid,
183 };
184
185 (score_vec_rounded, score_vec_full)
186}
187
/// Grouping key used by `filter_dupes`: candidates with equal keys are
/// treated as duplicates of one another.
///
/// The float scores are stored as integer tenths (see `quantize_tenths`) so
/// the key can derive `Eq` and `Hash`.
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
struct DupeGroupKey {
    /// License expression of the candidate's rule.
    license_expression: String,
    /// Flag taken from the candidate's rounded score vector.
    is_highly_resemblant: bool,
    /// Rounded containment, in tenths.
    containment: i32,
    /// Rounded resemblance, in tenths.
    resemblance: i32,
    /// Rounded matched length, in tenths.
    matched_length: i32,
    /// Number of tokens in the candidate's rule.
    rule_length: usize,
}
203
204pub(super) fn filter_dupes(candidates: Vec<Candidate<'_>>) -> Vec<Candidate<'_>> {
215 let mut groups: HashMap<DupeGroupKey, Vec<Candidate>> = HashMap::new();
216
217 for candidate in candidates {
218 let key = DupeGroupKey {
219 license_expression: candidate.rule.license_expression.clone(),
220 is_highly_resemblant: candidate.score_vec_rounded.is_highly_resemblant,
221 containment: quantize_tenths(candidate.score_vec_rounded.containment),
222 resemblance: quantize_tenths(candidate.score_vec_rounded.resemblance),
223 matched_length: quantize_tenths(candidate.score_vec_rounded.matched_length),
224 rule_length: candidate.rule.tokens.len(),
225 };
226 groups.entry(key).or_default().push(candidate);
227 }
228
229 let mut result: Vec<Candidate> = Vec::new();
230 for mut group in groups.into_values() {
231 group.sort_by(|a, b| {
234 b.score_vec_full
235 .cmp(&a.score_vec_full)
236 .then_with(|| b.rule.identifier.cmp(&a.rule.identifier))
237 });
238 if let Some(best) = group.into_iter().next() {
239 result.push(best);
240 }
241 }
242
243 result
244}
245
246pub fn multisets_intersector(
253 qmset: &HashMap<TokenId, usize>,
254 imset: &HashMap<TokenId, usize>,
255) -> HashMap<TokenId, usize> {
256 let (set1, set2) = if qmset.len() < imset.len() {
257 (qmset, imset)
258 } else {
259 (imset, qmset)
260 };
261
262 set1.iter()
263 .filter_map(|(&tid, &count1)| set2.get(&tid).map(|&count2| (tid, count1.min(count2))))
264 .collect()
265}
266
267pub fn compute_candidates_with_msets<'a>(
273 index: &'a LicenseIndex,
274 query_run: &QueryRun,
275 high_resemblance: bool,
276 top_n: usize,
277) -> Vec<Candidate<'a>> {
278 let query_tokens = query_run.matchable_tokens();
279 if query_tokens.is_empty() {
280 return Vec::new();
281 }
282
283 let query_token_ids: Vec<TokenId> = query_tokens
284 .iter()
285 .filter(|&&tid| tid >= 0)
286 .map(|&tid| TokenId::new(tid as u16))
287 .collect();
288
289 if query_token_ids.is_empty() {
290 return Vec::new();
291 }
292
293 let (query_set, query_mset) = build_set_and_mset(&query_token_ids);
294
295 let query_high_set: HashSet<TokenId> = query_set
297 .iter()
298 .filter(|tid| tid.as_usize() < index.len_legalese)
299 .copied()
300 .collect();
301
302 if query_high_set.is_empty() {
303 return Vec::new();
304 }
305
306 let candidate_rids: HashSet<usize> = query_high_set
308 .iter()
309 .filter_map(|tid| index.rids_by_high_tid.get(tid))
310 .flat_map(|rids| rids.iter().copied())
311 .collect();
312
313 if candidate_rids.is_empty() {
314 return Vec::new();
315 }
316
317 let mut step1_candidates: Vec<(
318 ScoresVector,
319 ScoresVector,
320 usize,
321 &'a Rule,
322 HashSet<TokenId>,
323 )> = Vec::new();
324
325 for rid in candidate_rids {
326 let Some(rule) = index.rules_by_rid.get(rid) else {
327 continue;
328 };
329 let Some(rule_set) = index.sets_by_rid.get(&rid) else {
330 continue;
331 };
332 let Some(rule_high_set) = index.high_sets_by_rid.get(&rid) else {
333 continue;
334 };
335
336 let high_intersection_size = query_high_set.intersection(rule_high_set).count();
339 if high_intersection_size < rule.min_high_matched_length_unique {
340 continue;
341 }
342
343 let high_set_intersection: HashSet<TokenId> = query_high_set
345 .intersection(rule_high_set)
346 .copied()
347 .collect();
348 if high_set_intersection.is_empty() {
349 continue;
350 }
351
352 let intersection: HashSet<TokenId> = query_set.intersection(rule_set).copied().collect();
354 if intersection.is_empty() {
355 continue;
356 }
357
358 let matched_length = tids_set_counter(&intersection);
360 if matched_length < rule.min_matched_length_unique {
361 continue;
362 }
363
364 let qset_len = query_set.len();
366 let iset_len = rule.length_unique;
367 if qset_len == 0 || iset_len == 0 {
368 continue;
369 }
370
371 let union_len = qset_len + iset_len - matched_length;
372 let resemblance = matched_length as f64 / union_len as f64;
373 let containment = matched_length as f64 / iset_len as f64;
374
375 let minimum_containment = rule.minimum_coverage.map(|mc| mc as f64 / 100.0);
378 if let Some(min_cont) = minimum_containment
379 && containment < min_cont
380 {
381 continue;
382 }
383
384 let (svr, svf) = build_score_vectors(resemblance, containment, matched_length, rid);
385
386 if high_resemblance && (!svr.is_highly_resemblant || !svf.is_highly_resemblant) {
387 continue;
388 }
389
390 step1_candidates.push((svr, svf, rid, rule, high_set_intersection));
391 }
392
393 if step1_candidates.is_empty() {
394 return Vec::new();
395 }
396
397 step1_candidates.sort_by(|a, b| compare_candidate_rank(&b.0, &b.1, b.2, &a.0, &a.1, a.2));
398
399 step1_candidates.truncate(top_n * 10);
400
401 let mut sortable_candidates: Vec<Candidate<'a>> = Vec::new();
402
403 for (_svr, _svf, rid, rule, high_set_intersection) in step1_candidates {
404 let Some(rule_mset) = index.msets_by_rid.get(&rid) else {
405 continue;
406 };
407
408 let query_high_mset = high_multiset_subset(&query_mset, &index.dictionary);
410 let rule_high_mset = high_multiset_subset(rule_mset, &index.dictionary);
411 let high_intersection_mset = multisets_intersector(&query_high_mset, &rule_high_mset);
412 if high_intersection_mset.is_empty() {
413 continue;
414 }
415
416 let high_matched_length: usize = high_intersection_mset.values().sum();
417 if high_matched_length < rule.min_high_matched_length {
418 continue;
419 }
420
421 let full_intersection_mset = multisets_intersector(&query_mset, rule_mset);
423 let matched_length: usize = full_intersection_mset.values().sum();
424 if matched_length < rule.min_matched_length {
425 continue;
426 }
427 let qset_len: usize = query_mset.values().sum();
428 let iset_len: usize = rule_mset.values().sum();
429
430 if qset_len == 0 || iset_len == 0 {
431 continue;
432 }
433
434 let union_len = qset_len + iset_len - matched_length;
435 let resemblance = matched_length as f64 / union_len as f64;
436 let containment = matched_length as f64 / iset_len as f64;
437
438 let minimum_containment = rule.minimum_coverage.map(|mc| mc as f64 / 100.0);
441 if let Some(min_cont) = minimum_containment
442 && containment < min_cont
443 {
444 continue;
445 }
446
447 let (score_vec_rounded, score_vec_full) =
448 build_score_vectors(resemblance, containment, matched_length, rid);
449
450 if high_resemblance
451 && (!score_vec_rounded.is_highly_resemblant || !score_vec_full.is_highly_resemblant)
452 {
453 continue;
454 }
455
456 sortable_candidates.push(Candidate {
457 score_vec_rounded,
458 score_vec_full,
459 rid,
460 rule,
461 high_set_intersection,
462 });
463 }
464
465 sortable_candidates = filter_dupes(sortable_candidates);
466
467 sortable_candidates.sort_by(|a, b| b.cmp(a));
468 sortable_candidates.truncate(top_n);
469
470 sortable_candidates
471}
472
#[cfg(test)]
mod tests {
    use super::*;
    use crate::license_detection::index::dictionary::tid;

    /// Builds a `Rule` fixture with `token_count` placeholder tokens.
    ///
    /// Only the identifier, license expression and token count vary between
    /// tests; every other field uses the same neutral value.
    fn rule_fixture(identifier: &str, license_expression: &str, token_count: usize) -> Rule {
        Rule {
            identifier: identifier.to_string(),
            license_expression: license_expression.to_string(),
            text: String::new(),
            tokens: vec![tid(0); token_count],
            rule_kind: crate::license_detection::models::RuleKind::Text,
            is_false_positive: false,
            is_required_phrase: false,
            is_from_license: false,
            relevance: 100,
            minimum_coverage: None,
            has_stored_minimum_coverage: false,
            is_continuous: true,
            referenced_filenames: None,
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: None,
            length_unique: 0,
            high_length_unique: 0,
            high_length: 0,
            min_matched_length: 0,
            min_high_matched_length: 0,
            min_matched_length_unique: 0,
            min_high_matched_length_unique: 0,
            is_small: false,
            is_tiny: false,
            starts_with_license: false,
            ends_with_license: false,
            is_deprecated: false,
            spdx_license_key: None,
            other_spdx_license_keys: vec![],
            required_phrase_spans: vec![],
            stopwords_by_pos: std::collections::HashMap::new(),
        }
    }

    /// Shorthand constructor for a `ScoresVector`.
    fn scores(
        is_highly_resemblant: bool,
        containment: f32,
        resemblance: f32,
        matched_length: f32,
        rid: usize,
    ) -> ScoresVector {
        ScoresVector {
            is_highly_resemblant,
            containment,
            resemblance,
            matched_length,
            rid,
        }
    }

    /// Shorthand constructor for a `Candidate` with an empty high-token set.
    fn candidate(
        rule: &Rule,
        rid: usize,
        score_vec_rounded: ScoresVector,
        score_vec_full: ScoresVector,
    ) -> Candidate<'_> {
        Candidate {
            score_vec_rounded,
            score_vec_full,
            rid,
            rule,
            high_set_intersection: HashSet::new(),
        }
    }

    #[test]
    fn test_scores_vector_comparison() {
        let sv1 = scores(true, 0.9, 0.8, 10.0, 0);
        let sv2 = scores(false, 0.8, 0.6, 5.0, 1);

        assert!(sv1 > sv2);
    }

    #[test]
    fn test_python_round_tenths_matches_python_half_even_behavior() {
        assert_eq!(python_round_tenths(0.05), 0.1);
        assert_eq!(python_round_tenths(0.15), 0.1);
        assert_eq!(python_round_tenths(0.25), 0.2);
        assert_eq!(python_round_tenths(2.25), 2.2);
        assert_eq!(python_round_tenths(4.35), 4.3);
        assert_eq!(python_round_tenths(6.65), 6.7);
    }

    #[test]
    fn test_candidate_ordering() {
        let rule1 = rule_fixture("test1", "mit", 0);
        let rule2 = rule_fixture("test2", "apache", 0);

        let candidate1 = candidate(
            &rule1,
            0,
            scores(true, 0.9, 0.8, 10.0, 0),
            scores(true, 0.9, 0.8, 10.0, 0),
        );
        let candidate2 = candidate(
            &rule2,
            1,
            scores(false, 0.5, 0.3, 5.0, 1),
            scores(false, 0.5, 0.3, 5.0, 1),
        );

        assert!(
            candidate1 > candidate2,
            "Higher containment candidate should rank higher"
        );
    }

    #[test]
    fn test_filter_dupes_matched_length_precision() {
        let rule1 = rule_fixture("x11-dec1.RULE", "x11-dec1", 138);
        let rule2 = rule_fixture("cmu-uc.RULE", "cmu-uc", 133);

        // Rounded matched lengths are the tenths-rounded values of
        // 138 / 20 and 133 / 20, as quoted in the assertion message.
        let candidate1 = candidate(
            &rule1,
            1,
            scores(false, 0.5, 0.25, 6.9, 1),
            scores(false, 0.5, 0.25, 138.0, 1),
        );
        let candidate2 = candidate(
            &rule2,
            2,
            scores(false, 0.5, 0.25, 6.7, 2),
            scores(false, 0.5, 0.25, 133.0, 2),
        );

        let filtered = filter_dupes(vec![candidate1, candidate2]);

        assert_eq!(
            filtered.len(),
            2,
            "Should keep both candidates when matched_length differs at 1-decimal precision: 138/20=6.9 vs 133/20=6.7"
        );
    }

    #[test]
    fn test_filter_dupes_same_group() {
        let rule1 = rule_fixture("mit.RULE", "mit", 100);
        let rule2 = rule_fixture("mit_2.RULE", "mit", 100);

        let candidate1 = candidate(
            &rule1,
            1,
            scores(false, 0.5, 0.25, 5.0, 1),
            scores(false, 0.5, 0.25, 100.0, 1),
        );
        let candidate2 = candidate(
            &rule2,
            2,
            scores(false, 0.5, 0.25, 5.0, 2),
            scores(false, 0.5, 0.25, 100.0, 2),
        );

        let filtered = filter_dupes(vec![candidate1, candidate2]);

        assert_eq!(
            filtered.len(),
            1,
            "Should keep only one candidate when all group keys match"
        );
    }

    #[test]
    fn test_filter_dupes_prefers_higher_identifier_when_full_scores_tie() {
        let rule_sa = rule_fixture("cc-by-sa-1.0.RULE", "cc-by-sa-1.0", 1960);
        let rule_nc_sa = rule_fixture("cc-by-nc-sa-1.0.RULE", "cc-by-nc-sa-1.0", 1829);

        let candidate_sa = candidate(
            &rule_sa,
            1,
            scores(true, 0.9, 0.8, 100.0, 1),
            scores(true, 0.9, 0.8, 100.0, 1),
        );
        let candidate_nc_sa = candidate(
            &rule_nc_sa,
            2,
            scores(true, 0.9, 0.8, 100.0, 2),
            scores(true, 0.9, 0.8, 100.0, 2),
        );

        let filtered = filter_dupes(vec![candidate_nc_sa, candidate_sa]);

        assert_eq!(
            filtered.len(),
            2,
            "Different license expressions should create different groups"
        );

        // Same identifiers as above, but a shared expression and token count
        // so both candidates land in one group.
        let rule_same1 = rule_fixture("cc-by-sa-1.0.RULE", "same", 100);
        let rule_same2 = rule_fixture("cc-by-nc-sa-1.0.RULE", "same", 100);

        let same_group_candidates = vec![
            candidate(
                &rule_same1,
                filtered[0].rid,
                filtered[0].score_vec_rounded.clone(),
                filtered[0].score_vec_full.clone(),
            ),
            candidate(
                &rule_same2,
                filtered[1].rid,
                filtered[1].score_vec_rounded.clone(),
                filtered[1].score_vec_full.clone(),
            ),
        ];

        let deduped = filter_dupes(same_group_candidates);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].rule.identifier, "cc-by-sa-1.0.RULE");
    }

    #[test]
    fn test_candidate_ordering_uses_rid_after_equal_scores() {
        let rule_a = rule_fixture("a.RULE", "a", 10);
        let rule_z = rule_fixture("z.RULE", "a", 10);

        let candidate_low_rid = candidate(
            &rule_z,
            1,
            scores(true, 0.9, 0.8, 10.0, 1),
            scores(true, 0.9, 0.8, 10.0, 1),
        );
        let candidate_high_rid = candidate(
            &rule_a,
            2,
            scores(true, 0.9, 0.8, 10.0, 2),
            scores(true, 0.9, 0.8, 10.0, 2),
        );

        let mut sorted = [candidate_low_rid, candidate_high_rid];
        sorted.sort_by(|a, b| b.cmp(a));
        assert_eq!(
            sorted[0].rid, 2,
            "Python final candidate tuple ordering falls back to higher rid after equal scores"
        );
    }
}
1082}