1use crate::license_detection::index::LicenseIndex;
4use crate::license_detection::index::dictionary::TokenId;
5use crate::license_detection::index::token_sets::{build_set_and_mset, high_multiset_subset};
6use crate::license_detection::models::Rule;
7use crate::license_detection::query::QueryRun;
8use crate::license_detection::token_set::TokenSet;
9use std::collections::{HashMap, HashSet};
10
11use super::HIGH_RESEMBLANCE_THRESHOLD;
12
13#[derive(Debug, Clone, PartialEq)]
19pub struct ScoresVector {
20 pub is_highly_resemblant: bool,
22 pub containment: f32,
24 pub resemblance: f32,
26 pub matched_length: f32,
28 pub rid: usize,
30}
31
32impl PartialOrd for ScoresVector {
33 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
34 Some(self.cmp(other))
35 }
36}
37
38impl Eq for ScoresVector {}
39
40impl Ord for ScoresVector {
41 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
42 self.is_highly_resemblant
49 .cmp(&other.is_highly_resemblant)
50 .then_with(|| {
51 self.containment
52 .partial_cmp(&other.containment)
53 .unwrap_or(std::cmp::Ordering::Equal)
54 })
55 .then_with(|| {
56 self.resemblance
57 .partial_cmp(&other.resemblance)
58 .unwrap_or(std::cmp::Ordering::Equal)
59 })
60 .then_with(|| {
61 self.matched_length
62 .partial_cmp(&other.matched_length)
63 .unwrap_or(std::cmp::Ordering::Equal)
64 })
65 }
66}
67
68#[derive(Debug, Clone, PartialEq)]
72pub struct Candidate<'a> {
73 pub score_vec_rounded: ScoresVector,
75 pub score_vec_full: ScoresVector,
77 pub rid: usize,
79 pub rule: &'a Rule,
81 pub high_set_intersection: TokenSet,
83}
84
85impl PartialOrd for Candidate<'_> {
86 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
87 Some(self.cmp(other))
88 }
89}
90
91impl Eq for Candidate<'_> {}
92
93impl Ord for Candidate<'_> {
94 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
95 compare_candidate_rank(
101 &self.score_vec_rounded,
102 &self.score_vec_full,
103 self.rid,
104 &other.score_vec_rounded,
105 &other.score_vec_full,
106 other.rid,
107 )
108 }
109}
110
111fn compare_candidate_rank(
112 rounded: &ScoresVector,
113 full: &ScoresVector,
114 rid: usize,
115 other_rounded: &ScoresVector,
116 other_full: &ScoresVector,
117 other_rid: usize,
118) -> std::cmp::Ordering {
119 rounded
120 .cmp(other_rounded)
121 .then_with(|| full.cmp(other_full))
122 .then_with(|| rid.cmp(&other_rid))
123}
124
125fn python_round_tenths(value: f64) -> f32 {
126 let rendered = format!("{value:.20}");
127 let (whole, frac) = rendered.split_once('.').unwrap_or((rendered.as_str(), "0"));
128
129 let whole_part: i64 = whole.parse().unwrap_or(0);
130 let mut frac_chars = frac.chars();
131 let tenths = frac_chars.next().and_then(|c| c.to_digit(10)).unwrap_or(0) as i64;
132 let rest: String = frac_chars.collect();
133
134 let threshold = format!("5{}", "0".repeat(rest.len().saturating_sub(1)));
135 let should_round_up = if rest > threshold {
136 true
137 } else if rest == threshold {
138 tenths % 2 == 1
139 } else {
140 false
141 };
142
143 let mut scaled = whole_part * 10 + tenths;
144 if should_round_up {
145 scaled += 1;
146 }
147
148 scaled as f32 / 10.0
149}
150
151fn quantize_tenths(value: f32) -> i32 {
152 ((value * 10.0).round()) as i32
153}
154
155fn build_score_vectors(
156 resemblance: f64,
157 containment: f64,
158 matched_length: usize,
159 rid: usize,
160) -> (ScoresVector, ScoresVector) {
161 let amplified_resemblance = resemblance * resemblance;
162
163 let score_vec_rounded = ScoresVector {
164 is_highly_resemblant: python_round_tenths(resemblance) >= HIGH_RESEMBLANCE_THRESHOLD,
165 containment: python_round_tenths(containment),
166 resemblance: python_round_tenths(amplified_resemblance),
167 matched_length: python_round_tenths(matched_length as f64 / 20.0),
168 rid,
169 };
170
171 let score_vec_full = ScoresVector {
172 is_highly_resemblant: resemblance >= f64::from(HIGH_RESEMBLANCE_THRESHOLD),
173 containment: containment as f32,
174 resemblance: amplified_resemblance as f32,
175 matched_length: matched_length as f32,
176 rid,
177 };
178
179 (score_vec_rounded, score_vec_full)
180}
181
182#[derive(Debug, Clone, Hash, PartialEq, Eq)]
189struct DupeGroupKey {
190 license_expression: String,
191 is_highly_resemblant: bool,
192 containment: i32,
193 resemblance: i32,
194 matched_length: i32,
195 rule_length: usize,
196}
197
198pub(super) fn filter_dupes(candidates: Vec<Candidate<'_>>) -> Vec<Candidate<'_>> {
209 let mut groups: HashMap<DupeGroupKey, Vec<Candidate>> = HashMap::new();
210
211 for candidate in candidates {
212 let key = DupeGroupKey {
213 license_expression: candidate.rule.license_expression.clone(),
214 is_highly_resemblant: candidate.score_vec_rounded.is_highly_resemblant,
215 containment: quantize_tenths(candidate.score_vec_rounded.containment),
216 resemblance: quantize_tenths(candidate.score_vec_rounded.resemblance),
217 matched_length: quantize_tenths(candidate.score_vec_rounded.matched_length),
218 rule_length: candidate.rule.tokens.len(),
219 };
220 groups.entry(key).or_default().push(candidate);
221 }
222
223 let mut result: Vec<Candidate> = Vec::new();
224 for mut group in groups.into_values() {
225 group.sort_by(|a, b| {
228 b.score_vec_full
229 .cmp(&a.score_vec_full)
230 .then_with(|| b.rule.identifier.cmp(&a.rule.identifier))
231 });
232 if let Some(best) = group.into_iter().next() {
233 result.push(best);
234 }
235 }
236
237 result
238}
239
240pub fn multisets_intersector(
247 qmset: &HashMap<TokenId, usize>,
248 imset: &HashMap<TokenId, usize>,
249) -> HashMap<TokenId, usize> {
250 let (set1, set2) = if qmset.len() < imset.len() {
251 (qmset, imset)
252 } else {
253 (imset, qmset)
254 };
255
256 set1.iter()
257 .filter_map(|(&tid, &count1)| set2.get(&tid).map(|&count2| (tid, count1.min(count2))))
258 .collect()
259}
260
261pub fn compute_candidates_with_msets<'a>(
267 index: &'a LicenseIndex,
268 query_run: &QueryRun,
269 high_resemblance: bool,
270 top_n: usize,
271) -> Vec<Candidate<'a>> {
272 let query_tokens = query_run.matchable_tokens();
273 if query_tokens.is_empty() {
274 return Vec::new();
275 }
276
277 let query_token_ids: Vec<TokenId> = query_tokens
278 .iter()
279 .filter(|&&tid| tid >= 0)
280 .map(|&tid| TokenId::new(tid as u16))
281 .collect();
282
283 if query_token_ids.is_empty() {
284 return Vec::new();
285 }
286
287 let (query_set_hash, query_mset) = build_set_and_mset(&query_token_ids);
288 let query_set: TokenSet = TokenSet::from_u16_iter(query_set_hash.iter().map(|tid| tid.raw()));
289
290 let query_high_set: TokenSet = TokenSet::from_u16_iter(
292 query_set
293 .iter()
294 .filter(|tid| (*tid as usize) < index.len_legalese),
295 );
296
297 if query_high_set.is_empty() {
298 return Vec::new();
299 }
300
301 let candidate_rids: HashSet<usize> = query_high_set
303 .iter()
304 .filter_map(|tid| index.rids_by_high_tid.get(&TokenId::new(tid)))
305 .flat_map(|rids| rids.iter().copied())
306 .collect();
307
308 if candidate_rids.is_empty() {
309 return Vec::new();
310 }
311
312 let mut step1_candidates: Vec<(ScoresVector, ScoresVector, usize, &'a Rule, TokenSet)> =
313 Vec::new();
314
315 for rid in candidate_rids {
316 let Some(rule) = index.rules_by_rid.get(rid) else {
317 continue;
318 };
319 let Some(rule_set) = index.sets_by_rid.get(&rid) else {
320 continue;
321 };
322 let Some(rule_high_set) = index.high_sets_by_rid.get(&rid) else {
323 continue;
324 };
325
326 let high_intersection_size = query_high_set.intersection_count(rule_high_set);
329 if high_intersection_size < rule.min_high_matched_length_unique {
330 continue;
331 }
332
333 let high_set_intersection: TokenSet = query_high_set.intersection(rule_high_set);
335 if high_set_intersection.is_empty() {
336 continue;
337 }
338
339 let intersection: TokenSet = query_set.intersection(rule_set);
341 if intersection.is_empty() {
342 continue;
343 }
344
345 let matched_length = intersection.len();
347 if matched_length < rule.min_matched_length_unique {
348 continue;
349 }
350
351 let qset_len = query_set.len();
353 let iset_len = rule.length_unique;
354 if qset_len == 0 || iset_len == 0 {
355 continue;
356 }
357
358 let union_len = qset_len + iset_len - matched_length;
359 let resemblance = matched_length as f64 / union_len as f64;
360 let containment = matched_length as f64 / iset_len as f64;
361
362 let minimum_containment = rule.minimum_coverage.map(|mc| mc as f64 / 100.0);
365 if let Some(min_cont) = minimum_containment
366 && containment < min_cont
367 {
368 continue;
369 }
370
371 let (svr, svf) = build_score_vectors(resemblance, containment, matched_length, rid);
372
373 if high_resemblance && (!svr.is_highly_resemblant || !svf.is_highly_resemblant) {
374 continue;
375 }
376
377 step1_candidates.push((svr, svf, rid, rule, high_set_intersection));
378 }
379
380 if step1_candidates.is_empty() {
381 return Vec::new();
382 }
383
384 step1_candidates.sort_by(|a, b| compare_candidate_rank(&b.0, &b.1, b.2, &a.0, &a.1, a.2));
385
386 step1_candidates.truncate(top_n * 10);
387
388 let mut sortable_candidates: Vec<Candidate<'a>> = Vec::new();
389
390 for (_svr, _svf, rid, rule, high_set_intersection) in step1_candidates {
391 let Some(rule_mset) = index.msets_by_rid.get(&rid) else {
392 continue;
393 };
394
395 let query_high_mset = high_multiset_subset(&query_mset, &index.dictionary);
397 let rule_high_mset = high_multiset_subset(rule_mset, &index.dictionary);
398 let high_intersection_mset = multisets_intersector(&query_high_mset, &rule_high_mset);
399 if high_intersection_mset.is_empty() {
400 continue;
401 }
402
403 let high_matched_length: usize = high_intersection_mset.values().sum();
404 if high_matched_length < rule.min_high_matched_length {
405 continue;
406 }
407
408 let full_intersection_mset = multisets_intersector(&query_mset, rule_mset);
410 let matched_length: usize = full_intersection_mset.values().sum();
411 if matched_length < rule.min_matched_length {
412 continue;
413 }
414 let qset_len: usize = query_mset.values().sum();
415 let iset_len: usize = rule_mset.values().sum();
416
417 if qset_len == 0 || iset_len == 0 {
418 continue;
419 }
420
421 let union_len = qset_len + iset_len - matched_length;
422 let resemblance = matched_length as f64 / union_len as f64;
423 let containment = matched_length as f64 / iset_len as f64;
424
425 let minimum_containment = rule.minimum_coverage.map(|mc| mc as f64 / 100.0);
428 if let Some(min_cont) = minimum_containment
429 && containment < min_cont
430 {
431 continue;
432 }
433
434 let (score_vec_rounded, score_vec_full) =
435 build_score_vectors(resemblance, containment, matched_length, rid);
436
437 if high_resemblance
438 && (!score_vec_rounded.is_highly_resemblant || !score_vec_full.is_highly_resemblant)
439 {
440 continue;
441 }
442
443 sortable_candidates.push(Candidate {
444 score_vec_rounded,
445 score_vec_full,
446 rid,
447 rule,
448 high_set_intersection,
449 });
450 }
451
452 sortable_candidates = filter_dupes(sortable_candidates);
453
454 sortable_candidates.sort_by(|a, b| b.cmp(a));
455 sortable_candidates.truncate(top_n);
456
457 sortable_candidates
458}
459
460#[cfg(test)]
461mod tests {
462 use super::*;
463 use crate::license_detection::index::dictionary::tid;
464
465 #[test]
466 fn test_scores_vector_comparison() {
467 let sv1 = ScoresVector {
468 is_highly_resemblant: true,
469 containment: 0.9,
470 resemblance: 0.8,
471 matched_length: 10.0,
472 rid: 0,
473 };
474
475 let sv2 = ScoresVector {
476 is_highly_resemblant: false,
477 containment: 0.8,
478 resemblance: 0.6,
479 matched_length: 5.0,
480 rid: 1,
481 };
482
483 assert!(sv1 > sv2);
484 }
485
486 #[test]
487 fn test_python_round_tenths_matches_python_half_even_behavior() {
488 assert_eq!(python_round_tenths(0.05), 0.1);
489 assert_eq!(python_round_tenths(0.15), 0.1);
490 assert_eq!(python_round_tenths(0.25), 0.2);
491 assert_eq!(python_round_tenths(2.25), 2.2);
492 assert_eq!(python_round_tenths(4.35), 4.3);
493 assert_eq!(python_round_tenths(6.65), 6.7);
494 }
495
496 #[test]
497 fn test_candidate_ordering() {
498 let rule1 = Rule {
499 identifier: "test1".to_string(),
500 license_expression: "mit".to_string(),
501 text: String::new(),
502 tokens: vec![],
503 rule_kind: crate::license_detection::models::RuleKind::Text,
504 is_false_positive: false,
505 is_required_phrase: false,
506 is_from_license: false,
507 relevance: 100,
508 minimum_coverage: None,
509 has_stored_minimum_coverage: false,
510 is_continuous: true,
511 referenced_filenames: None,
512 ignorable_urls: None,
513 ignorable_emails: None,
514 ignorable_copyrights: None,
515 ignorable_holders: None,
516 ignorable_authors: None,
517 language: None,
518 notes: None,
519 length_unique: 0,
520 high_length_unique: 0,
521 high_length: 0,
522 min_matched_length: 0,
523 min_high_matched_length: 0,
524 min_matched_length_unique: 0,
525 min_high_matched_length_unique: 0,
526 is_small: false,
527 is_tiny: false,
528 starts_with_license: false,
529 ends_with_license: false,
530 is_deprecated: false,
531 spdx_license_key: None,
532 other_spdx_license_keys: vec![],
533 required_phrase_spans: vec![],
534 stopwords_by_pos: std::collections::HashMap::new(),
535 };
536
537 let rule2 = Rule {
538 identifier: "test2".to_string(),
539 license_expression: "apache".to_string(),
540 text: String::new(),
541 tokens: vec![],
542 rule_kind: crate::license_detection::models::RuleKind::Text,
543 is_false_positive: false,
544 is_required_phrase: false,
545 is_from_license: false,
546 relevance: 100,
547 minimum_coverage: None,
548 has_stored_minimum_coverage: false,
549 is_continuous: true,
550 referenced_filenames: None,
551 ignorable_urls: None,
552 ignorable_emails: None,
553 ignorable_copyrights: None,
554 ignorable_holders: None,
555 ignorable_authors: None,
556 language: None,
557 notes: None,
558 length_unique: 0,
559 high_length_unique: 0,
560 high_length: 0,
561 min_matched_length: 0,
562 min_high_matched_length: 0,
563 min_matched_length_unique: 0,
564 min_high_matched_length_unique: 0,
565 is_small: false,
566 is_tiny: false,
567 starts_with_license: false,
568 ends_with_license: false,
569 is_deprecated: false,
570 spdx_license_key: None,
571 other_spdx_license_keys: vec![],
572 required_phrase_spans: vec![],
573 stopwords_by_pos: std::collections::HashMap::new(),
574 };
575
576 let candidate1 = Candidate {
577 score_vec_rounded: ScoresVector {
578 is_highly_resemblant: true,
579 containment: 0.9,
580 resemblance: 0.8,
581 matched_length: 10.0,
582 rid: 0,
583 },
584 score_vec_full: ScoresVector {
585 is_highly_resemblant: true,
586 containment: 0.9,
587 resemblance: 0.8,
588 matched_length: 10.0,
589 rid: 0,
590 },
591 rid: 0,
592 rule: &rule1,
593 high_set_intersection: TokenSet::new(),
594 };
595
596 let candidate2 = Candidate {
597 score_vec_rounded: ScoresVector {
598 is_highly_resemblant: false,
599 containment: 0.5,
600 resemblance: 0.3,
601 matched_length: 5.0,
602 rid: 1,
603 },
604 score_vec_full: ScoresVector {
605 is_highly_resemblant: false,
606 containment: 0.5,
607 resemblance: 0.3,
608 matched_length: 5.0,
609 rid: 1,
610 },
611 rid: 1,
612 rule: &rule2,
613 high_set_intersection: TokenSet::new(),
614 };
615
616 assert!(
617 candidate1 > candidate2,
618 "Higher containment candidate should rank higher"
619 );
620 }
621
622 #[test]
623 fn test_filter_dupes_matched_length_precision() {
624 let rule1 = Rule {
625 identifier: "x11-dec1.RULE".to_string(),
626 license_expression: "x11-dec1".to_string(),
627 text: String::new(),
628 tokens: vec![tid(0); 138],
629 rule_kind: crate::license_detection::models::RuleKind::Text,
630 is_false_positive: false,
631 is_required_phrase: false,
632 is_from_license: false,
633 relevance: 100,
634 minimum_coverage: None,
635 has_stored_minimum_coverage: false,
636 is_continuous: true,
637 referenced_filenames: None,
638 ignorable_urls: None,
639 ignorable_emails: None,
640 ignorable_copyrights: None,
641 ignorable_holders: None,
642 ignorable_authors: None,
643 language: None,
644 notes: None,
645 length_unique: 0,
646 high_length_unique: 0,
647 high_length: 0,
648 min_matched_length: 0,
649 min_high_matched_length: 0,
650 min_matched_length_unique: 0,
651 min_high_matched_length_unique: 0,
652 is_small: false,
653 is_tiny: false,
654 starts_with_license: false,
655 ends_with_license: false,
656 is_deprecated: false,
657 spdx_license_key: None,
658 other_spdx_license_keys: vec![],
659 required_phrase_spans: vec![],
660 stopwords_by_pos: std::collections::HashMap::new(),
661 };
662
663 let rule2 = Rule {
664 identifier: "cmu-uc.RULE".to_string(),
665 license_expression: "cmu-uc".to_string(),
666 text: String::new(),
667 tokens: vec![tid(0); 133],
668 ..rule1.clone()
669 };
670
671 let candidate1 = Candidate {
672 score_vec_rounded: ScoresVector {
673 is_highly_resemblant: false,
674 containment: 0.5,
675 resemblance: 0.25,
676 matched_length: 7.0,
677 rid: 1,
678 },
679 score_vec_full: ScoresVector {
680 is_highly_resemblant: false,
681 containment: 0.5,
682 resemblance: 0.25,
683 matched_length: 138.0,
684 rid: 1,
685 },
686 rid: 1,
687 rule: &rule1,
688 high_set_intersection: TokenSet::new(),
689 };
690
691 let candidate2 = Candidate {
692 score_vec_rounded: ScoresVector {
693 is_highly_resemblant: false,
694 containment: 0.5,
695 resemblance: 0.25,
696 matched_length: 7.0,
697 rid: 2,
698 },
699 score_vec_full: ScoresVector {
700 is_highly_resemblant: false,
701 containment: 0.5,
702 resemblance: 0.25,
703 matched_length: 133.0,
704 rid: 2,
705 },
706 rid: 2,
707 rule: &rule2,
708 high_set_intersection: TokenSet::new(),
709 };
710
711 let candidates = vec![candidate1, candidate2];
712 let filtered = filter_dupes(candidates);
713
714 assert_eq!(
715 filtered.len(),
716 2,
717 "Should keep both candidates when matched_length differs at 1-decimal precision: 138/20=6.9 vs 133/20=6.7"
718 );
719 }
720
721 #[test]
722 fn test_filter_dupes_same_group() {
723 let rule1 = Rule {
724 identifier: "mit.RULE".to_string(),
725 license_expression: "mit".to_string(),
726 text: String::new(),
727 tokens: vec![tid(0); 100],
728 rule_kind: crate::license_detection::models::RuleKind::Text,
729 is_false_positive: false,
730 is_required_phrase: false,
731 is_from_license: false,
732 relevance: 100,
733 minimum_coverage: None,
734 has_stored_minimum_coverage: false,
735 is_continuous: true,
736 referenced_filenames: None,
737 ignorable_urls: None,
738 ignorable_emails: None,
739 ignorable_copyrights: None,
740 ignorable_holders: None,
741 ignorable_authors: None,
742 language: None,
743 notes: None,
744 length_unique: 0,
745 high_length_unique: 0,
746 high_length: 0,
747 min_matched_length: 0,
748 min_high_matched_length: 0,
749 min_matched_length_unique: 0,
750 min_high_matched_length_unique: 0,
751 is_small: false,
752 is_tiny: false,
753 starts_with_license: false,
754 ends_with_license: false,
755 is_deprecated: false,
756 spdx_license_key: None,
757 other_spdx_license_keys: vec![],
758 required_phrase_spans: vec![],
759 stopwords_by_pos: std::collections::HashMap::new(),
760 };
761
762 let rule2 = Rule {
763 identifier: "mit_2.RULE".to_string(),
764 license_expression: "mit".to_string(),
765 text: String::new(),
766 tokens: vec![tid(0); 100],
767 ..rule1.clone()
768 };
769
770 let candidate1 = Candidate {
771 score_vec_rounded: ScoresVector {
772 is_highly_resemblant: false,
773 containment: 0.5,
774 resemblance: 0.25,
775 matched_length: 5.0,
776 rid: 1,
777 },
778 score_vec_full: ScoresVector {
779 is_highly_resemblant: false,
780 containment: 0.5,
781 resemblance: 0.25,
782 matched_length: 100.0,
783 rid: 1,
784 },
785 rid: 1,
786 rule: &rule1,
787 high_set_intersection: TokenSet::new(),
788 };
789
790 let candidate2 = Candidate {
791 score_vec_rounded: ScoresVector {
792 is_highly_resemblant: false,
793 containment: 0.5,
794 resemblance: 0.25,
795 matched_length: 5.0,
796 rid: 2,
797 },
798 score_vec_full: ScoresVector {
799 is_highly_resemblant: false,
800 containment: 0.5,
801 resemblance: 0.25,
802 matched_length: 100.0,
803 rid: 2,
804 },
805 rid: 2,
806 rule: &rule2,
807 high_set_intersection: TokenSet::new(),
808 };
809
810 let candidates = vec![candidate1, candidate2];
811 let filtered = filter_dupes(candidates);
812
813 assert_eq!(
814 filtered.len(),
815 1,
816 "Should keep only one candidate when all group keys match"
817 );
818 }
819
820 #[test]
821 fn test_filter_dupes_prefers_higher_identifier_when_full_scores_tie() {
822 let rule_sa = Rule {
823 identifier: "cc-by-sa-1.0.RULE".to_string(),
824 license_expression: "cc-by-sa-1.0".to_string(),
825 text: String::new(),
826 tokens: vec![tid(0); 1960],
827 rule_kind: crate::license_detection::models::RuleKind::Text,
828 is_false_positive: false,
829 is_required_phrase: false,
830 is_from_license: false,
831 relevance: 100,
832 minimum_coverage: None,
833 has_stored_minimum_coverage: false,
834 is_continuous: true,
835 referenced_filenames: None,
836 ignorable_urls: None,
837 ignorable_emails: None,
838 ignorable_copyrights: None,
839 ignorable_holders: None,
840 ignorable_authors: None,
841 language: None,
842 notes: None,
843 length_unique: 0,
844 high_length_unique: 0,
845 high_length: 0,
846 min_matched_length: 0,
847 min_high_matched_length: 0,
848 min_matched_length_unique: 0,
849 min_high_matched_length_unique: 0,
850 is_small: false,
851 is_tiny: false,
852 starts_with_license: false,
853 ends_with_license: false,
854 is_deprecated: false,
855 spdx_license_key: None,
856 other_spdx_license_keys: vec![],
857 required_phrase_spans: vec![],
858 stopwords_by_pos: std::collections::HashMap::new(),
859 };
860
861 let rule_nc_sa = Rule {
862 identifier: "cc-by-nc-sa-1.0.RULE".to_string(),
863 license_expression: "cc-by-nc-sa-1.0".to_string(),
864 text: String::new(),
865 tokens: vec![tid(0); 1829],
866 rule_kind: crate::license_detection::models::RuleKind::Text,
867 is_false_positive: false,
868 is_required_phrase: false,
869 is_from_license: false,
870 relevance: 100,
871 minimum_coverage: None,
872 has_stored_minimum_coverage: false,
873 is_continuous: true,
874 referenced_filenames: None,
875 ignorable_urls: None,
876 ignorable_emails: None,
877 ignorable_copyrights: None,
878 ignorable_holders: None,
879 ignorable_authors: None,
880 language: None,
881 notes: None,
882 length_unique: 0,
883 high_length_unique: 0,
884 high_length: 0,
885 min_matched_length: 0,
886 min_high_matched_length: 0,
887 min_matched_length_unique: 0,
888 min_high_matched_length_unique: 0,
889 is_small: false,
890 is_tiny: false,
891 starts_with_license: false,
892 ends_with_license: false,
893 is_deprecated: false,
894 spdx_license_key: None,
895 other_spdx_license_keys: vec![],
896 required_phrase_spans: vec![],
897 stopwords_by_pos: std::collections::HashMap::new(),
898 };
899
900 let candidate_sa = Candidate {
901 score_vec_rounded: ScoresVector {
902 is_highly_resemblant: true,
903 containment: 0.9,
904 resemblance: 0.8,
905 matched_length: 100.0,
906 rid: 1,
907 },
908 score_vec_full: ScoresVector {
909 is_highly_resemblant: true,
910 containment: 0.9,
911 resemblance: 0.8,
912 matched_length: 100.0,
913 rid: 1,
914 },
915 rid: 1,
916 rule: &rule_sa,
917 high_set_intersection: TokenSet::new(),
918 };
919
920 let candidate_nc_sa = Candidate {
921 score_vec_rounded: ScoresVector {
922 is_highly_resemblant: true,
923 containment: 0.9,
924 resemblance: 0.8,
925 matched_length: 100.0,
926 rid: 2,
927 },
928 score_vec_full: ScoresVector {
929 is_highly_resemblant: true,
930 containment: 0.9,
931 resemblance: 0.8,
932 matched_length: 100.0,
933 rid: 2,
934 },
935 rid: 2,
936 rule: &rule_nc_sa,
937 high_set_intersection: TokenSet::new(),
938 };
939
940 let candidates = vec![candidate_nc_sa, candidate_sa];
941 let filtered = filter_dupes(candidates);
942
943 assert_eq!(
944 filtered.len(),
945 2,
946 "Different license expressions should create different groups"
947 );
948
949 let mut rule_same1 = Rule {
950 license_expression: "same".to_string(),
951 tokens: vec![tid(0); 100],
952 ..rule_sa.clone()
953 };
954 let mut rule_same2 = Rule {
955 license_expression: "same".to_string(),
956 tokens: vec![tid(0); 100],
957 ..rule_nc_sa.clone()
958 };
959
960 let same_group_candidates = vec![
961 Candidate {
962 score_vec_rounded: filtered[0].score_vec_rounded.clone(),
963 score_vec_full: filtered[0].score_vec_full.clone(),
964 rid: filtered[0].rid,
965 rule: &mut rule_same1,
966 high_set_intersection: TokenSet::new(),
967 },
968 Candidate {
969 score_vec_rounded: filtered[1].score_vec_rounded.clone(),
970 score_vec_full: filtered[1].score_vec_full.clone(),
971 rid: filtered[1].rid,
972 rule: &mut rule_same2,
973 high_set_intersection: TokenSet::new(),
974 },
975 ];
976
977 let deduped = filter_dupes(same_group_candidates);
978 assert_eq!(deduped.len(), 1);
979 assert_eq!(deduped[0].rule.identifier, "cc-by-sa-1.0.RULE");
980 }
981
982 #[test]
983 fn test_candidate_ordering_uses_rid_after_equal_scores() {
984 let rule_a = Rule {
985 identifier: "a.RULE".to_string(),
986 license_expression: "a".to_string(),
987 text: String::new(),
988 tokens: vec![tid(0); 10],
989 rule_kind: crate::license_detection::models::RuleKind::Text,
990 is_false_positive: false,
991 is_required_phrase: false,
992 is_from_license: false,
993 relevance: 100,
994 minimum_coverage: None,
995 has_stored_minimum_coverage: false,
996 is_continuous: true,
997 referenced_filenames: None,
998 ignorable_urls: None,
999 ignorable_emails: None,
1000 ignorable_copyrights: None,
1001 ignorable_holders: None,
1002 ignorable_authors: None,
1003 language: None,
1004 notes: None,
1005 length_unique: 0,
1006 high_length_unique: 0,
1007 high_length: 0,
1008 min_matched_length: 0,
1009 min_high_matched_length: 0,
1010 min_matched_length_unique: 0,
1011 min_high_matched_length_unique: 0,
1012 is_small: false,
1013 is_tiny: false,
1014 starts_with_license: false,
1015 ends_with_license: false,
1016 is_deprecated: false,
1017 spdx_license_key: None,
1018 other_spdx_license_keys: vec![],
1019 required_phrase_spans: vec![],
1020 stopwords_by_pos: std::collections::HashMap::new(),
1021 };
1022
1023 let rule_z = Rule {
1024 identifier: "z.RULE".to_string(),
1025 ..rule_a.clone()
1026 };
1027
1028 let candidate_low_rid = Candidate {
1029 score_vec_rounded: ScoresVector {
1030 is_highly_resemblant: true,
1031 containment: 0.9,
1032 resemblance: 0.8,
1033 matched_length: 10.0,
1034 rid: 1,
1035 },
1036 score_vec_full: ScoresVector {
1037 is_highly_resemblant: true,
1038 containment: 0.9,
1039 resemblance: 0.8,
1040 matched_length: 10.0,
1041 rid: 1,
1042 },
1043 rid: 1,
1044 rule: &rule_z,
1045 high_set_intersection: TokenSet::new(),
1046 };
1047
1048 let candidate_high_rid = Candidate {
1049 score_vec_rounded: ScoresVector {
1050 rid: 2,
1051 ..candidate_low_rid.score_vec_rounded.clone()
1052 },
1053 score_vec_full: ScoresVector {
1054 rid: 2,
1055 ..candidate_low_rid.score_vec_full.clone()
1056 },
1057 rid: 2,
1058 rule: &rule_a,
1059 high_set_intersection: TokenSet::new(),
1060 };
1061
1062 let mut sorted = [candidate_low_rid, candidate_high_rid];
1063 sorted.sort_by(|a, b| b.cmp(a));
1064 assert_eq!(
1065 sorted[0].rid, 2,
1066 "Python final candidate tuple ordering falls back to higher rid after equal scores"
1067 );
1068 }
1069}