Skip to main content

provenant/license_detection/seq_match/
candidates.rs

1//! Candidate selection using set and multiset similarity.
2
3use crate::license_detection::index::LicenseIndex;
4use crate::license_detection::index::dictionary::TokenId;
5use crate::license_detection::index::token_sets::{build_set_and_mset, high_multiset_subset};
6use crate::license_detection::models::Rule;
7use crate::license_detection::query::QueryRun;
8use crate::license_detection::token_set::TokenSet;
9use std::collections::{HashMap, HashSet};
10
11use super::HIGH_RESEMBLANCE_THRESHOLD;
12
13/// Score vector for ranking candidates using set similarity.
14///
15/// Contains metrics computed from set/multiset intersections.
16///
17/// Corresponds to Python: `ScoresVector` namedtuple in match_set.py (line 458)
18#[derive(Debug, Clone, PartialEq)]
19pub struct ScoresVector {
20    /// True if the sets are highly similar (resemblance >= threshold)
21    pub is_highly_resemblant: bool,
22    /// Containment ratio (how much of rule is in query)
23    pub containment: f32,
24    /// Amplified resemblance (squared to boost high values)
25    pub resemblance: f32,
26    /// Number of matched tokens (normalized for ranking)
27    pub matched_length: f32,
28    /// Rule ID for tie-breaking
29    pub rid: usize,
30}
31
32impl PartialOrd for ScoresVector {
33    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
34        Some(self.cmp(other))
35    }
36}
37
38impl Eq for ScoresVector {}
39
40impl Ord for ScoresVector {
41    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
42        // Python sorts ScoresVector namedtuple with reverse=True:
43        // 1. is_highly_resemblant (True > False)
44        // 2. containment (higher is better)
45        // 3. resemblance (higher is better)
46        // 4. matched_length (higher is better)
47        // Note: Python does NOT use rid for tie-breaking in ScoresVector
48        self.is_highly_resemblant
49            .cmp(&other.is_highly_resemblant)
50            .then_with(|| {
51                self.containment
52                    .partial_cmp(&other.containment)
53                    .unwrap_or(std::cmp::Ordering::Equal)
54            })
55            .then_with(|| {
56                self.resemblance
57                    .partial_cmp(&other.resemblance)
58                    .unwrap_or(std::cmp::Ordering::Equal)
59            })
60            .then_with(|| {
61                self.matched_length
62                    .partial_cmp(&other.matched_length)
63                    .unwrap_or(std::cmp::Ordering::Equal)
64            })
65    }
66}
67
68/// Candidate with its score vector and metadata.
69///
70/// Corresponds to the tuple structure used in Python: (scores_vectors, rid, rule, high_set_intersection)
71#[derive(Debug, Clone, PartialEq)]
72pub struct Candidate<'a> {
73    /// Rounded score vector for display/grouping
74    pub score_vec_rounded: ScoresVector,
75    /// Full score vector for sorting
76    pub score_vec_full: ScoresVector,
77    /// Rule ID
78    pub rid: usize,
79    /// Reference to the rule (borrowed from LicenseIndex)
80    pub rule: &'a Rule,
81    /// Set of high-value (legalese) tokens in the intersection
82    pub high_set_intersection: TokenSet,
83}
84
85impl PartialOrd for Candidate<'_> {
86    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
87        Some(self.cmp(other))
88    }
89}
90
91impl Eq for Candidate<'_> {}
92
93impl Ord for Candidate<'_> {
94    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
95        // Python sorts the tuple ((svr, svf), rid, rule, ...) with reverse=True
96        // So it compares (svr, svf) tuple first, which means:
97        // 1. Compare rounded (svr) first
98        // 2. Then compare full (svf) if rounded is equal
99        // 3. Then compare rid if scores are still equal.
100        compare_candidate_rank(
101            &self.score_vec_rounded,
102            &self.score_vec_full,
103            self.rid,
104            &other.score_vec_rounded,
105            &other.score_vec_full,
106            other.rid,
107        )
108    }
109}
110
111fn compare_candidate_rank(
112    rounded: &ScoresVector,
113    full: &ScoresVector,
114    rid: usize,
115    other_rounded: &ScoresVector,
116    other_full: &ScoresVector,
117    other_rid: usize,
118) -> std::cmp::Ordering {
119    rounded
120        .cmp(other_rounded)
121        .then_with(|| full.cmp(other_full))
122        .then_with(|| rid.cmp(&other_rid))
123}
124
125fn python_round_tenths(value: f64) -> f32 {
126    let rendered = format!("{value:.20}");
127    let (whole, frac) = rendered.split_once('.').unwrap_or((rendered.as_str(), "0"));
128
129    let whole_part: i64 = whole.parse().unwrap_or(0);
130    let mut frac_chars = frac.chars();
131    let tenths = frac_chars.next().and_then(|c| c.to_digit(10)).unwrap_or(0) as i64;
132    let rest: String = frac_chars.collect();
133
134    let threshold = format!("5{}", "0".repeat(rest.len().saturating_sub(1)));
135    let should_round_up = if rest > threshold {
136        true
137    } else if rest == threshold {
138        tenths % 2 == 1
139    } else {
140        false
141    };
142
143    let mut scaled = whole_part * 10 + tenths;
144    if should_round_up {
145        scaled += 1;
146    }
147
148    scaled as f32 / 10.0
149}
150
151fn quantize_tenths(value: f32) -> i32 {
152    ((value * 10.0).round()) as i32
153}
154
155fn build_score_vectors(
156    resemblance: f64,
157    containment: f64,
158    matched_length: usize,
159    rid: usize,
160) -> (ScoresVector, ScoresVector) {
161    let amplified_resemblance = resemblance * resemblance;
162
163    let score_vec_rounded = ScoresVector {
164        is_highly_resemblant: python_round_tenths(resemblance) >= HIGH_RESEMBLANCE_THRESHOLD,
165        containment: python_round_tenths(containment),
166        resemblance: python_round_tenths(amplified_resemblance),
167        matched_length: python_round_tenths(matched_length as f64 / 20.0),
168        rid,
169    };
170
171    let score_vec_full = ScoresVector {
172        is_highly_resemblant: resemblance >= f64::from(HIGH_RESEMBLANCE_THRESHOLD),
173        containment: containment as f32,
174        resemblance: amplified_resemblance as f32,
175        matched_length: matched_length as f32,
176        rid,
177    };
178
179    (score_vec_rounded, score_vec_full)
180}
181
182/// Key for grouping duplicate candidates.
183///
184/// Candidates with the same DupeGroupKey are considered duplicates,
185/// and only the best one is kept.
186///
187/// Corresponds to Python: `filter_dupes.group_key()` in match_set.py (line 467-476)
188#[derive(Debug, Clone, Hash, PartialEq, Eq)]
189struct DupeGroupKey {
190    license_expression: String,
191    is_highly_resemblant: bool,
192    containment: i32,
193    resemblance: i32,
194    matched_length: i32,
195    rule_length: usize,
196}
197
198/// Filter duplicate candidates, keeping only the best from each group.
199///
200/// Candidates are grouped by (license_expression, is_highly_resemblant, containment,
201/// resemblance, matched_length, rule_length). Within each group, candidates are
202/// ranked by (score_vec_full, rule.identifier) and only the best is kept.
203///
204/// This matches Python's filter_dupes behavior where matched_length uses 1-decimal
205/// precision (e.g., 6.9 and 6.7 are different, but 7 and 7 would be same).
206///
207/// Corresponds to Python: `filter_dupes()` in match_set.py (line 461-498)
208pub(super) fn filter_dupes(candidates: Vec<Candidate<'_>>) -> Vec<Candidate<'_>> {
209    let mut groups: HashMap<DupeGroupKey, Vec<Candidate>> = HashMap::new();
210
211    for candidate in candidates {
212        let key = DupeGroupKey {
213            license_expression: candidate.rule.license_expression.clone(),
214            is_highly_resemblant: candidate.score_vec_rounded.is_highly_resemblant,
215            containment: quantize_tenths(candidate.score_vec_rounded.containment),
216            resemblance: quantize_tenths(candidate.score_vec_rounded.resemblance),
217            matched_length: quantize_tenths(candidate.score_vec_rounded.matched_length),
218            rule_length: candidate.rule.tokens.len(),
219        };
220        groups.entry(key).or_default().push(candidate);
221    }
222
223    let mut result: Vec<Candidate> = Vec::new();
224    for mut group in groups.into_values() {
225        // Python: duplicates = sorted(duplicates, reverse=True, key=lambda x: (sv_full, rule.identifier))
226        // Higher sv_full wins, then HIGHER identifier alphabetically (reverse=True)
227        group.sort_by(|a, b| {
228            b.score_vec_full
229                .cmp(&a.score_vec_full)
230                .then_with(|| b.rule.identifier.cmp(&a.rule.identifier))
231        });
232        if let Some(best) = group.into_iter().next() {
233            result.push(best);
234        }
235    }
236
237    result
238}
239
240/// Compute intersection of two multisets.
241///
242/// For each token ID present in both multisets, the intersection value is the
243/// smaller of the occurrence counts.
244///
245/// Corresponds to Python: `multisets_intersector()` in match_set.py (line 119)
246pub fn multisets_intersector(
247    qmset: &HashMap<TokenId, usize>,
248    imset: &HashMap<TokenId, usize>,
249) -> HashMap<TokenId, usize> {
250    let (set1, set2) = if qmset.len() < imset.len() {
251        (qmset, imset)
252    } else {
253        (imset, qmset)
254    };
255
256    set1.iter()
257        .filter_map(|(&tid, &count1)| set2.get(&tid).map(|&count2| (tid, count1.min(count2))))
258        .collect()
259}
260
261/// Compute multiset-based candidates (Phase 2 refinement).
262///
263/// After selecting candidates using sets, this refines the ranking using multisets.
264///
265/// Corresponds to Python: `compute_candidates()` step 2 in match_set.py (line 311-350)
266pub fn compute_candidates_with_msets<'a>(
267    index: &'a LicenseIndex,
268    query_run: &QueryRun,
269    high_resemblance: bool,
270    top_n: usize,
271) -> Vec<Candidate<'a>> {
272    let query_tokens = query_run.matchable_tokens();
273    if query_tokens.is_empty() {
274        return Vec::new();
275    }
276
277    let query_token_ids: Vec<TokenId> = query_tokens
278        .iter()
279        .filter(|&&tid| tid >= 0)
280        .map(|&tid| TokenId::new(tid as u16))
281        .collect();
282
283    if query_token_ids.is_empty() {
284        return Vec::new();
285    }
286
287    let (query_set_hash, query_mset) = build_set_and_mset(&query_token_ids);
288    let query_set: TokenSet = TokenSet::from_u16_iter(query_set_hash.iter().map(|tid| tid.raw()));
289
290    // Build the set of high-value tokens in the query
291    let query_high_set: TokenSet = TokenSet::from_u16_iter(
292        query_set
293            .iter()
294            .filter(|tid| (*tid as usize) < index.len_legalese),
295    );
296
297    if query_high_set.is_empty() {
298        return Vec::new();
299    }
300
301    // Use inverted index to find candidate rules that share high-value tokens
302    let candidate_rids: HashSet<usize> = query_high_set
303        .iter()
304        .filter_map(|tid| index.rids_by_high_tid.get(&TokenId::new(tid)))
305        .flat_map(|rids| rids.iter().copied())
306        .collect();
307
308    if candidate_rids.is_empty() {
309        return Vec::new();
310    }
311
312    let mut step1_candidates: Vec<(ScoresVector, ScoresVector, usize, &'a Rule, TokenSet)> =
313        Vec::new();
314
315    for rid in candidate_rids {
316        let Some(rule) = index.rules_by_rid.get(rid) else {
317            continue;
318        };
319        let Some(rule_set) = index.sets_by_rid.get(&rid) else {
320            continue;
321        };
322        let Some(rule_high_set) = index.high_sets_by_rid.get(&rid) else {
323            continue;
324        };
325
326        // STEP 1: Compute HIGH intersection first (smaller sets, faster)
327        // Check size without allocation for early rejection
328        let high_intersection_size = query_high_set.intersection_count(rule_high_set);
329        if high_intersection_size < rule.min_high_matched_length_unique {
330            continue;
331        }
332
333        // Allocate the high intersection (passed threshold check)
334        let high_set_intersection: TokenSet = query_high_set.intersection(rule_high_set);
335        if high_set_intersection.is_empty() {
336            continue;
337        }
338
339        // STEP 2: Only now compute FULL intersection (fewer candidates reach here)
340        let intersection: TokenSet = query_set.intersection(rule_set);
341        if intersection.is_empty() {
342            continue;
343        }
344
345        // Check total intersection threshold
346        let matched_length = intersection.len();
347        if matched_length < rule.min_matched_length_unique {
348            continue;
349        }
350
351        // Compute resemblance using TOTAL intersection, not just high
352        let qset_len = query_set.len();
353        let iset_len = rule.length_unique;
354        if qset_len == 0 || iset_len == 0 {
355            continue;
356        }
357
358        let union_len = qset_len + iset_len - matched_length;
359        let resemblance = matched_length as f64 / union_len as f64;
360        let containment = matched_length as f64 / iset_len as f64;
361
362        // Check minimum_containment (Python: match_set.py:429-433)
363        // Rules with minimum_coverage require a minimum containment ratio
364        let minimum_containment = rule.minimum_coverage.map(|mc| mc as f64 / 100.0);
365        if let Some(min_cont) = minimum_containment
366            && containment < min_cont
367        {
368            continue;
369        }
370
371        let (svr, svf) = build_score_vectors(resemblance, containment, matched_length, rid);
372
373        if high_resemblance && (!svr.is_highly_resemblant || !svf.is_highly_resemblant) {
374            continue;
375        }
376
377        step1_candidates.push((svr, svf, rid, rule, high_set_intersection));
378    }
379
380    if step1_candidates.is_empty() {
381        return Vec::new();
382    }
383
384    step1_candidates.sort_by(|a, b| compare_candidate_rank(&b.0, &b.1, b.2, &a.0, &a.1, a.2));
385
386    step1_candidates.truncate(top_n * 10);
387
388    let mut sortable_candidates: Vec<Candidate<'a>> = Vec::new();
389
390    for (_svr, _svf, rid, rule, high_set_intersection) in step1_candidates {
391        let Some(rule_mset) = index.msets_by_rid.get(&rid) else {
392            continue;
393        };
394
395        // Filter using HIGH multisets (Python: high_intersection check)
396        let query_high_mset = high_multiset_subset(&query_mset, &index.dictionary);
397        let rule_high_mset = high_multiset_subset(rule_mset, &index.dictionary);
398        let high_intersection_mset = multisets_intersector(&query_high_mset, &rule_high_mset);
399        if high_intersection_mset.is_empty() {
400            continue;
401        }
402
403        let high_matched_length: usize = high_intersection_mset.values().sum();
404        if high_matched_length < rule.min_high_matched_length {
405            continue;
406        }
407
408        // Compute scores using FULL multisets (Python: matched_length = counter(intersection))
409        let full_intersection_mset = multisets_intersector(&query_mset, rule_mset);
410        let matched_length: usize = full_intersection_mset.values().sum();
411        if matched_length < rule.min_matched_length {
412            continue;
413        }
414        let qset_len: usize = query_mset.values().sum();
415        let iset_len: usize = rule_mset.values().sum();
416
417        if qset_len == 0 || iset_len == 0 {
418            continue;
419        }
420
421        let union_len = qset_len + iset_len - matched_length;
422        let resemblance = matched_length as f64 / union_len as f64;
423        let containment = matched_length as f64 / iset_len as f64;
424
425        // Check minimum_containment (Python: match_set.py:429-433)
426        // Rules with minimum_coverage require a minimum containment ratio
427        let minimum_containment = rule.minimum_coverage.map(|mc| mc as f64 / 100.0);
428        if let Some(min_cont) = minimum_containment
429            && containment < min_cont
430        {
431            continue;
432        }
433
434        let (score_vec_rounded, score_vec_full) =
435            build_score_vectors(resemblance, containment, matched_length, rid);
436
437        if high_resemblance
438            && (!score_vec_rounded.is_highly_resemblant || !score_vec_full.is_highly_resemblant)
439        {
440            continue;
441        }
442
443        sortable_candidates.push(Candidate {
444            score_vec_rounded,
445            score_vec_full,
446            rid,
447            rule,
448            high_set_intersection,
449        });
450    }
451
452    sortable_candidates = filter_dupes(sortable_candidates);
453
454    sortable_candidates.sort_by(|a, b| b.cmp(a));
455    sortable_candidates.truncate(top_n);
456
457    sortable_candidates
458}
459
460#[cfg(test)]
461mod tests {
462    use super::*;
463    use crate::license_detection::index::dictionary::tid;
464
465    #[test]
466    fn test_scores_vector_comparison() {
467        let sv1 = ScoresVector {
468            is_highly_resemblant: true,
469            containment: 0.9,
470            resemblance: 0.8,
471            matched_length: 10.0,
472            rid: 0,
473        };
474
475        let sv2 = ScoresVector {
476            is_highly_resemblant: false,
477            containment: 0.8,
478            resemblance: 0.6,
479            matched_length: 5.0,
480            rid: 1,
481        };
482
483        assert!(sv1 > sv2);
484    }
485
486    #[test]
487    fn test_python_round_tenths_matches_python_half_even_behavior() {
488        assert_eq!(python_round_tenths(0.05), 0.1);
489        assert_eq!(python_round_tenths(0.15), 0.1);
490        assert_eq!(python_round_tenths(0.25), 0.2);
491        assert_eq!(python_round_tenths(2.25), 2.2);
492        assert_eq!(python_round_tenths(4.35), 4.3);
493        assert_eq!(python_round_tenths(6.65), 6.7);
494    }
495
496    #[test]
497    fn test_candidate_ordering() {
498        let rule1 = Rule {
499            identifier: "test1".to_string(),
500            license_expression: "mit".to_string(),
501            text: String::new(),
502            tokens: vec![],
503            rule_kind: crate::license_detection::models::RuleKind::Text,
504            is_false_positive: false,
505            is_required_phrase: false,
506            is_from_license: false,
507            relevance: 100,
508            minimum_coverage: None,
509            has_stored_minimum_coverage: false,
510            is_continuous: true,
511            referenced_filenames: None,
512            ignorable_urls: None,
513            ignorable_emails: None,
514            ignorable_copyrights: None,
515            ignorable_holders: None,
516            ignorable_authors: None,
517            language: None,
518            notes: None,
519            length_unique: 0,
520            high_length_unique: 0,
521            high_length: 0,
522            min_matched_length: 0,
523            min_high_matched_length: 0,
524            min_matched_length_unique: 0,
525            min_high_matched_length_unique: 0,
526            is_small: false,
527            is_tiny: false,
528            starts_with_license: false,
529            ends_with_license: false,
530            is_deprecated: false,
531            spdx_license_key: None,
532            other_spdx_license_keys: vec![],
533            required_phrase_spans: vec![],
534            stopwords_by_pos: std::collections::HashMap::new(),
535        };
536
537        let rule2 = Rule {
538            identifier: "test2".to_string(),
539            license_expression: "apache".to_string(),
540            text: String::new(),
541            tokens: vec![],
542            rule_kind: crate::license_detection::models::RuleKind::Text,
543            is_false_positive: false,
544            is_required_phrase: false,
545            is_from_license: false,
546            relevance: 100,
547            minimum_coverage: None,
548            has_stored_minimum_coverage: false,
549            is_continuous: true,
550            referenced_filenames: None,
551            ignorable_urls: None,
552            ignorable_emails: None,
553            ignorable_copyrights: None,
554            ignorable_holders: None,
555            ignorable_authors: None,
556            language: None,
557            notes: None,
558            length_unique: 0,
559            high_length_unique: 0,
560            high_length: 0,
561            min_matched_length: 0,
562            min_high_matched_length: 0,
563            min_matched_length_unique: 0,
564            min_high_matched_length_unique: 0,
565            is_small: false,
566            is_tiny: false,
567            starts_with_license: false,
568            ends_with_license: false,
569            is_deprecated: false,
570            spdx_license_key: None,
571            other_spdx_license_keys: vec![],
572            required_phrase_spans: vec![],
573            stopwords_by_pos: std::collections::HashMap::new(),
574        };
575
576        let candidate1 = Candidate {
577            score_vec_rounded: ScoresVector {
578                is_highly_resemblant: true,
579                containment: 0.9,
580                resemblance: 0.8,
581                matched_length: 10.0,
582                rid: 0,
583            },
584            score_vec_full: ScoresVector {
585                is_highly_resemblant: true,
586                containment: 0.9,
587                resemblance: 0.8,
588                matched_length: 10.0,
589                rid: 0,
590            },
591            rid: 0,
592            rule: &rule1,
593            high_set_intersection: TokenSet::new(),
594        };
595
596        let candidate2 = Candidate {
597            score_vec_rounded: ScoresVector {
598                is_highly_resemblant: false,
599                containment: 0.5,
600                resemblance: 0.3,
601                matched_length: 5.0,
602                rid: 1,
603            },
604            score_vec_full: ScoresVector {
605                is_highly_resemblant: false,
606                containment: 0.5,
607                resemblance: 0.3,
608                matched_length: 5.0,
609                rid: 1,
610            },
611            rid: 1,
612            rule: &rule2,
613            high_set_intersection: TokenSet::new(),
614        };
615
616        assert!(
617            candidate1 > candidate2,
618            "Higher containment candidate should rank higher"
619        );
620    }
621
622    #[test]
623    fn test_filter_dupes_matched_length_precision() {
624        let rule1 = Rule {
625            identifier: "x11-dec1.RULE".to_string(),
626            license_expression: "x11-dec1".to_string(),
627            text: String::new(),
628            tokens: vec![tid(0); 138],
629            rule_kind: crate::license_detection::models::RuleKind::Text,
630            is_false_positive: false,
631            is_required_phrase: false,
632            is_from_license: false,
633            relevance: 100,
634            minimum_coverage: None,
635            has_stored_minimum_coverage: false,
636            is_continuous: true,
637            referenced_filenames: None,
638            ignorable_urls: None,
639            ignorable_emails: None,
640            ignorable_copyrights: None,
641            ignorable_holders: None,
642            ignorable_authors: None,
643            language: None,
644            notes: None,
645            length_unique: 0,
646            high_length_unique: 0,
647            high_length: 0,
648            min_matched_length: 0,
649            min_high_matched_length: 0,
650            min_matched_length_unique: 0,
651            min_high_matched_length_unique: 0,
652            is_small: false,
653            is_tiny: false,
654            starts_with_license: false,
655            ends_with_license: false,
656            is_deprecated: false,
657            spdx_license_key: None,
658            other_spdx_license_keys: vec![],
659            required_phrase_spans: vec![],
660            stopwords_by_pos: std::collections::HashMap::new(),
661        };
662
663        let rule2 = Rule {
664            identifier: "cmu-uc.RULE".to_string(),
665            license_expression: "cmu-uc".to_string(),
666            text: String::new(),
667            tokens: vec![tid(0); 133],
668            ..rule1.clone()
669        };
670
671        let candidate1 = Candidate {
672            score_vec_rounded: ScoresVector {
673                is_highly_resemblant: false,
674                containment: 0.5,
675                resemblance: 0.25,
676                matched_length: 7.0,
677                rid: 1,
678            },
679            score_vec_full: ScoresVector {
680                is_highly_resemblant: false,
681                containment: 0.5,
682                resemblance: 0.25,
683                matched_length: 138.0,
684                rid: 1,
685            },
686            rid: 1,
687            rule: &rule1,
688            high_set_intersection: TokenSet::new(),
689        };
690
691        let candidate2 = Candidate {
692            score_vec_rounded: ScoresVector {
693                is_highly_resemblant: false,
694                containment: 0.5,
695                resemblance: 0.25,
696                matched_length: 7.0,
697                rid: 2,
698            },
699            score_vec_full: ScoresVector {
700                is_highly_resemblant: false,
701                containment: 0.5,
702                resemblance: 0.25,
703                matched_length: 133.0,
704                rid: 2,
705            },
706            rid: 2,
707            rule: &rule2,
708            high_set_intersection: TokenSet::new(),
709        };
710
711        let candidates = vec![candidate1, candidate2];
712        let filtered = filter_dupes(candidates);
713
714        assert_eq!(
715            filtered.len(),
716            2,
717            "Should keep both candidates when matched_length differs at 1-decimal precision: 138/20=6.9 vs 133/20=6.7"
718        );
719    }
720
721    #[test]
722    fn test_filter_dupes_same_group() {
723        let rule1 = Rule {
724            identifier: "mit.RULE".to_string(),
725            license_expression: "mit".to_string(),
726            text: String::new(),
727            tokens: vec![tid(0); 100],
728            rule_kind: crate::license_detection::models::RuleKind::Text,
729            is_false_positive: false,
730            is_required_phrase: false,
731            is_from_license: false,
732            relevance: 100,
733            minimum_coverage: None,
734            has_stored_minimum_coverage: false,
735            is_continuous: true,
736            referenced_filenames: None,
737            ignorable_urls: None,
738            ignorable_emails: None,
739            ignorable_copyrights: None,
740            ignorable_holders: None,
741            ignorable_authors: None,
742            language: None,
743            notes: None,
744            length_unique: 0,
745            high_length_unique: 0,
746            high_length: 0,
747            min_matched_length: 0,
748            min_high_matched_length: 0,
749            min_matched_length_unique: 0,
750            min_high_matched_length_unique: 0,
751            is_small: false,
752            is_tiny: false,
753            starts_with_license: false,
754            ends_with_license: false,
755            is_deprecated: false,
756            spdx_license_key: None,
757            other_spdx_license_keys: vec![],
758            required_phrase_spans: vec![],
759            stopwords_by_pos: std::collections::HashMap::new(),
760        };
761
762        let rule2 = Rule {
763            identifier: "mit_2.RULE".to_string(),
764            license_expression: "mit".to_string(),
765            text: String::new(),
766            tokens: vec![tid(0); 100],
767            ..rule1.clone()
768        };
769
770        let candidate1 = Candidate {
771            score_vec_rounded: ScoresVector {
772                is_highly_resemblant: false,
773                containment: 0.5,
774                resemblance: 0.25,
775                matched_length: 5.0,
776                rid: 1,
777            },
778            score_vec_full: ScoresVector {
779                is_highly_resemblant: false,
780                containment: 0.5,
781                resemblance: 0.25,
782                matched_length: 100.0,
783                rid: 1,
784            },
785            rid: 1,
786            rule: &rule1,
787            high_set_intersection: TokenSet::new(),
788        };
789
790        let candidate2 = Candidate {
791            score_vec_rounded: ScoresVector {
792                is_highly_resemblant: false,
793                containment: 0.5,
794                resemblance: 0.25,
795                matched_length: 5.0,
796                rid: 2,
797            },
798            score_vec_full: ScoresVector {
799                is_highly_resemblant: false,
800                containment: 0.5,
801                resemblance: 0.25,
802                matched_length: 100.0,
803                rid: 2,
804            },
805            rid: 2,
806            rule: &rule2,
807            high_set_intersection: TokenSet::new(),
808        };
809
810        let candidates = vec![candidate1, candidate2];
811        let filtered = filter_dupes(candidates);
812
813        assert_eq!(
814            filtered.len(),
815            1,
816            "Should keep only one candidate when all group keys match"
817        );
818    }
819
820    #[test]
821    fn test_filter_dupes_prefers_higher_identifier_when_full_scores_tie() {
822        let rule_sa = Rule {
823            identifier: "cc-by-sa-1.0.RULE".to_string(),
824            license_expression: "cc-by-sa-1.0".to_string(),
825            text: String::new(),
826            tokens: vec![tid(0); 1960],
827            rule_kind: crate::license_detection::models::RuleKind::Text,
828            is_false_positive: false,
829            is_required_phrase: false,
830            is_from_license: false,
831            relevance: 100,
832            minimum_coverage: None,
833            has_stored_minimum_coverage: false,
834            is_continuous: true,
835            referenced_filenames: None,
836            ignorable_urls: None,
837            ignorable_emails: None,
838            ignorable_copyrights: None,
839            ignorable_holders: None,
840            ignorable_authors: None,
841            language: None,
842            notes: None,
843            length_unique: 0,
844            high_length_unique: 0,
845            high_length: 0,
846            min_matched_length: 0,
847            min_high_matched_length: 0,
848            min_matched_length_unique: 0,
849            min_high_matched_length_unique: 0,
850            is_small: false,
851            is_tiny: false,
852            starts_with_license: false,
853            ends_with_license: false,
854            is_deprecated: false,
855            spdx_license_key: None,
856            other_spdx_license_keys: vec![],
857            required_phrase_spans: vec![],
858            stopwords_by_pos: std::collections::HashMap::new(),
859        };
860
861        let rule_nc_sa = Rule {
862            identifier: "cc-by-nc-sa-1.0.RULE".to_string(),
863            license_expression: "cc-by-nc-sa-1.0".to_string(),
864            text: String::new(),
865            tokens: vec![tid(0); 1829],
866            rule_kind: crate::license_detection::models::RuleKind::Text,
867            is_false_positive: false,
868            is_required_phrase: false,
869            is_from_license: false,
870            relevance: 100,
871            minimum_coverage: None,
872            has_stored_minimum_coverage: false,
873            is_continuous: true,
874            referenced_filenames: None,
875            ignorable_urls: None,
876            ignorable_emails: None,
877            ignorable_copyrights: None,
878            ignorable_holders: None,
879            ignorable_authors: None,
880            language: None,
881            notes: None,
882            length_unique: 0,
883            high_length_unique: 0,
884            high_length: 0,
885            min_matched_length: 0,
886            min_high_matched_length: 0,
887            min_matched_length_unique: 0,
888            min_high_matched_length_unique: 0,
889            is_small: false,
890            is_tiny: false,
891            starts_with_license: false,
892            ends_with_license: false,
893            is_deprecated: false,
894            spdx_license_key: None,
895            other_spdx_license_keys: vec![],
896            required_phrase_spans: vec![],
897            stopwords_by_pos: std::collections::HashMap::new(),
898        };
899
900        let candidate_sa = Candidate {
901            score_vec_rounded: ScoresVector {
902                is_highly_resemblant: true,
903                containment: 0.9,
904                resemblance: 0.8,
905                matched_length: 100.0,
906                rid: 1,
907            },
908            score_vec_full: ScoresVector {
909                is_highly_resemblant: true,
910                containment: 0.9,
911                resemblance: 0.8,
912                matched_length: 100.0,
913                rid: 1,
914            },
915            rid: 1,
916            rule: &rule_sa,
917            high_set_intersection: TokenSet::new(),
918        };
919
920        let candidate_nc_sa = Candidate {
921            score_vec_rounded: ScoresVector {
922                is_highly_resemblant: true,
923                containment: 0.9,
924                resemblance: 0.8,
925                matched_length: 100.0,
926                rid: 2,
927            },
928            score_vec_full: ScoresVector {
929                is_highly_resemblant: true,
930                containment: 0.9,
931                resemblance: 0.8,
932                matched_length: 100.0,
933                rid: 2,
934            },
935            rid: 2,
936            rule: &rule_nc_sa,
937            high_set_intersection: TokenSet::new(),
938        };
939
940        let candidates = vec![candidate_nc_sa, candidate_sa];
941        let filtered = filter_dupes(candidates);
942
943        assert_eq!(
944            filtered.len(),
945            2,
946            "Different license expressions should create different groups"
947        );
948
949        let mut rule_same1 = Rule {
950            license_expression: "same".to_string(),
951            tokens: vec![tid(0); 100],
952            ..rule_sa.clone()
953        };
954        let mut rule_same2 = Rule {
955            license_expression: "same".to_string(),
956            tokens: vec![tid(0); 100],
957            ..rule_nc_sa.clone()
958        };
959
960        let same_group_candidates = vec![
961            Candidate {
962                score_vec_rounded: filtered[0].score_vec_rounded.clone(),
963                score_vec_full: filtered[0].score_vec_full.clone(),
964                rid: filtered[0].rid,
965                rule: &mut rule_same1,
966                high_set_intersection: TokenSet::new(),
967            },
968            Candidate {
969                score_vec_rounded: filtered[1].score_vec_rounded.clone(),
970                score_vec_full: filtered[1].score_vec_full.clone(),
971                rid: filtered[1].rid,
972                rule: &mut rule_same2,
973                high_set_intersection: TokenSet::new(),
974            },
975        ];
976
977        let deduped = filter_dupes(same_group_candidates);
978        assert_eq!(deduped.len(), 1);
979        assert_eq!(deduped[0].rule.identifier, "cc-by-sa-1.0.RULE");
980    }
981
982    #[test]
983    fn test_candidate_ordering_uses_rid_after_equal_scores() {
984        let rule_a = Rule {
985            identifier: "a.RULE".to_string(),
986            license_expression: "a".to_string(),
987            text: String::new(),
988            tokens: vec![tid(0); 10],
989            rule_kind: crate::license_detection::models::RuleKind::Text,
990            is_false_positive: false,
991            is_required_phrase: false,
992            is_from_license: false,
993            relevance: 100,
994            minimum_coverage: None,
995            has_stored_minimum_coverage: false,
996            is_continuous: true,
997            referenced_filenames: None,
998            ignorable_urls: None,
999            ignorable_emails: None,
1000            ignorable_copyrights: None,
1001            ignorable_holders: None,
1002            ignorable_authors: None,
1003            language: None,
1004            notes: None,
1005            length_unique: 0,
1006            high_length_unique: 0,
1007            high_length: 0,
1008            min_matched_length: 0,
1009            min_high_matched_length: 0,
1010            min_matched_length_unique: 0,
1011            min_high_matched_length_unique: 0,
1012            is_small: false,
1013            is_tiny: false,
1014            starts_with_license: false,
1015            ends_with_license: false,
1016            is_deprecated: false,
1017            spdx_license_key: None,
1018            other_spdx_license_keys: vec![],
1019            required_phrase_spans: vec![],
1020            stopwords_by_pos: std::collections::HashMap::new(),
1021        };
1022
1023        let rule_z = Rule {
1024            identifier: "z.RULE".to_string(),
1025            ..rule_a.clone()
1026        };
1027
1028        let candidate_low_rid = Candidate {
1029            score_vec_rounded: ScoresVector {
1030                is_highly_resemblant: true,
1031                containment: 0.9,
1032                resemblance: 0.8,
1033                matched_length: 10.0,
1034                rid: 1,
1035            },
1036            score_vec_full: ScoresVector {
1037                is_highly_resemblant: true,
1038                containment: 0.9,
1039                resemblance: 0.8,
1040                matched_length: 10.0,
1041                rid: 1,
1042            },
1043            rid: 1,
1044            rule: &rule_z,
1045            high_set_intersection: TokenSet::new(),
1046        };
1047
1048        let candidate_high_rid = Candidate {
1049            score_vec_rounded: ScoresVector {
1050                rid: 2,
1051                ..candidate_low_rid.score_vec_rounded.clone()
1052            },
1053            score_vec_full: ScoresVector {
1054                rid: 2,
1055                ..candidate_low_rid.score_vec_full.clone()
1056            },
1057            rid: 2,
1058            rule: &rule_a,
1059            high_set_intersection: TokenSet::new(),
1060        };
1061
1062        let mut sorted = [candidate_low_rid, candidate_high_rid];
1063        sorted.sort_by(|a, b| b.cmp(a));
1064        assert_eq!(
1065            sorted[0].rid, 2,
1066            "Python final candidate tuple ordering falls back to higher rid after equal scores"
1067        );
1068    }
1069}