Skip to main content

provenant/license_detection/seq_match/
candidates.rs

1//! Candidate selection using set and multiset similarity.
2
3use crate::license_detection::index::LicenseIndex;
4use crate::license_detection::index::dictionary::TokenId;
5use crate::license_detection::index::token_sets::{
6    build_set_and_mset, high_multiset_subset, tids_set_counter,
7};
8use crate::license_detection::models::Rule;
9use crate::license_detection::query::QueryRun;
10use std::collections::{HashMap, HashSet};
11
12use super::HIGH_RESEMBLANCE_THRESHOLD;
13
14/// Score vector for ranking candidates using set similarity.
15///
16/// Contains metrics computed from set/multiset intersections.
17///
18/// Corresponds to Python: `ScoresVector` namedtuple in match_set.py (line 458)
19#[derive(Debug, Clone, PartialEq)]
20pub struct ScoresVector {
21    /// True if the sets are highly similar (resemblance >= threshold)
22    pub is_highly_resemblant: bool,
23    /// Containment ratio (how much of rule is in query)
24    pub containment: f32,
25    /// Amplified resemblance (squared to boost high values)
26    pub resemblance: f32,
27    /// Number of matched tokens (normalized for ranking)
28    pub matched_length: f32,
29    /// Rule ID for tie-breaking
30    pub rid: usize,
31}
32
33impl PartialOrd for ScoresVector {
34    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
35        Some(self.cmp(other))
36    }
37}
38
39impl Eq for ScoresVector {}
40
41impl Ord for ScoresVector {
42    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
43        // Python sorts ScoresVector namedtuple with reverse=True:
44        // 1. is_highly_resemblant (True > False)
45        // 2. containment (higher is better)
46        // 3. resemblance (higher is better)
47        // 4. matched_length (higher is better)
48        // Note: Python does NOT use rid for tie-breaking in ScoresVector
49        self.is_highly_resemblant
50            .cmp(&other.is_highly_resemblant)
51            .then_with(|| {
52                self.containment
53                    .partial_cmp(&other.containment)
54                    .unwrap_or(std::cmp::Ordering::Equal)
55            })
56            .then_with(|| {
57                self.resemblance
58                    .partial_cmp(&other.resemblance)
59                    .unwrap_or(std::cmp::Ordering::Equal)
60            })
61            .then_with(|| {
62                self.matched_length
63                    .partial_cmp(&other.matched_length)
64                    .unwrap_or(std::cmp::Ordering::Equal)
65            })
66    }
67}
68
69/// Candidate with its score vector and metadata.
70///
71/// Corresponds to the tuple structure used in Python: (scores_vectors, rid, rule, high_set_intersection)
72#[derive(Debug, Clone, PartialEq)]
73pub struct Candidate {
74    /// Rounded score vector for display/grouping
75    pub score_vec_rounded: ScoresVector,
76    /// Full score vector for sorting
77    pub score_vec_full: ScoresVector,
78    /// Rule ID
79    pub rid: usize,
80    /// Reference to the rule
81    pub rule: Rule,
82    /// Set of high-value (legalese) tokens in the intersection
83    pub high_set_intersection: HashSet<TokenId>,
84}
85
86impl PartialOrd for Candidate {
87    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
88        Some(self.cmp(other))
89    }
90}
91
92impl Eq for Candidate {}
93
94impl Ord for Candidate {
95    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
96        // Python sorts the tuple ((svr, svf), rid, rule, ...) with reverse=True
97        // So it compares (svr, svf) tuple first, which means:
98        // 1. Compare rounded (svr) first
99        // 2. Then compare full (svf) if rounded is equal
100        // 3. Then compare rid if scores are still equal.
101        compare_candidate_rank(
102            &self.score_vec_rounded,
103            &self.score_vec_full,
104            self.rid,
105            &other.score_vec_rounded,
106            &other.score_vec_full,
107            other.rid,
108        )
109    }
110}
111
112fn compare_candidate_rank(
113    rounded: &ScoresVector,
114    full: &ScoresVector,
115    rid: usize,
116    other_rounded: &ScoresVector,
117    other_full: &ScoresVector,
118    other_rid: usize,
119) -> std::cmp::Ordering {
120    rounded
121        .cmp(other_rounded)
122        .then_with(|| full.cmp(other_full))
123        .then_with(|| rid.cmp(&other_rid))
124}
125
126fn python_round_tenths(value: f64) -> f32 {
127    let rendered = format!("{value:.20}");
128    let (whole, frac) = rendered.split_once('.').unwrap_or((rendered.as_str(), "0"));
129
130    let whole_part: i64 = whole.parse().unwrap_or(0);
131    let mut frac_chars = frac.chars();
132    let tenths = frac_chars.next().and_then(|c| c.to_digit(10)).unwrap_or(0) as i64;
133    let rest: String = frac_chars.collect();
134
135    let threshold = format!("5{}", "0".repeat(rest.len().saturating_sub(1)));
136    let should_round_up = if rest > threshold {
137        true
138    } else if rest == threshold {
139        tenths % 2 == 1
140    } else {
141        false
142    };
143
144    let mut scaled = whole_part * 10 + tenths;
145    if should_round_up {
146        scaled += 1;
147    }
148
149    scaled as f32 / 10.0
150}
151
152fn quantize_tenths(value: f32) -> i32 {
153    format!("{value:.1}")
154        .chars()
155        .filter(|c| *c != '.')
156        .collect::<String>()
157        .parse()
158        .unwrap_or(0)
159}
160
161fn build_score_vectors(
162    resemblance: f64,
163    containment: f64,
164    matched_length: usize,
165    rid: usize,
166) -> (ScoresVector, ScoresVector) {
167    let amplified_resemblance = resemblance * resemblance;
168
169    let score_vec_rounded = ScoresVector {
170        is_highly_resemblant: python_round_tenths(resemblance) >= HIGH_RESEMBLANCE_THRESHOLD,
171        containment: python_round_tenths(containment),
172        resemblance: python_round_tenths(amplified_resemblance),
173        matched_length: python_round_tenths(matched_length as f64 / 20.0),
174        rid,
175    };
176
177    let score_vec_full = ScoresVector {
178        is_highly_resemblant: resemblance >= f64::from(HIGH_RESEMBLANCE_THRESHOLD),
179        containment: containment as f32,
180        resemblance: amplified_resemblance as f32,
181        matched_length: matched_length as f32,
182        rid,
183    };
184
185    (score_vec_rounded, score_vec_full)
186}
187
188/// Key for grouping duplicate candidates.
189///
190/// Candidates with the same DupeGroupKey are considered duplicates,
191/// and only the best one is kept.
192///
193/// Corresponds to Python: `filter_dupes.group_key()` in match_set.py (line 467-476)
194#[derive(Debug, Clone, Hash, PartialEq, Eq)]
195struct DupeGroupKey {
196    license_expression: String,
197    is_highly_resemblant: bool,
198    containment: i32,
199    resemblance: i32,
200    matched_length: i32,
201    rule_length: usize,
202}
203
204/// Filter duplicate candidates, keeping only the best from each group.
205///
206/// Candidates are grouped by (license_expression, is_highly_resemblant, containment,
207/// resemblance, matched_length, rule_length). Within each group, candidates are
208/// ranked by (score_vec_full, rule.identifier) and only the best is kept.
209///
210/// This matches Python's filter_dupes behavior where matched_length uses 1-decimal
211/// precision (e.g., 6.9 and 6.7 are different, but 7 and 7 would be same).
212///
213/// Corresponds to Python: `filter_dupes()` in match_set.py (line 461-498)
214pub(super) fn filter_dupes(candidates: Vec<Candidate>) -> Vec<Candidate> {
215    let mut groups: HashMap<DupeGroupKey, Vec<Candidate>> = HashMap::new();
216
217    for candidate in candidates {
218        let key = DupeGroupKey {
219            license_expression: candidate.rule.license_expression.clone(),
220            is_highly_resemblant: candidate.score_vec_rounded.is_highly_resemblant,
221            containment: quantize_tenths(candidate.score_vec_rounded.containment),
222            resemblance: quantize_tenths(candidate.score_vec_rounded.resemblance),
223            matched_length: quantize_tenths(candidate.score_vec_rounded.matched_length),
224            rule_length: candidate.rule.tokens.len(),
225        };
226        groups.entry(key).or_default().push(candidate);
227    }
228
229    let mut result: Vec<Candidate> = Vec::new();
230    for mut group in groups.into_values() {
231        // Python: duplicates = sorted(duplicates, reverse=True, key=lambda x: (sv_full, rule.identifier))
232        // Higher sv_full wins, then HIGHER identifier alphabetically (reverse=True)
233        group.sort_by(|a, b| {
234            b.score_vec_full
235                .cmp(&a.score_vec_full)
236                .then_with(|| b.rule.identifier.cmp(&a.rule.identifier))
237        });
238        if let Some(best) = group.into_iter().next() {
239            result.push(best);
240        }
241    }
242
243    result
244}
245
246/// Compute intersection of two multisets.
247///
248/// For each token ID present in both multisets, the intersection value is the
249/// smaller of the occurrence counts.
250///
251/// Corresponds to Python: `multisets_intersector()` in match_set.py (line 119)
252pub fn multisets_intersector(
253    qmset: &HashMap<TokenId, usize>,
254    imset: &HashMap<TokenId, usize>,
255) -> HashMap<TokenId, usize> {
256    let (set1, set2) = if qmset.len() < imset.len() {
257        (qmset, imset)
258    } else {
259        (imset, qmset)
260    };
261
262    set1.iter()
263        .filter_map(|(&tid, &count1)| set2.get(&tid).map(|&count2| (tid, count1.min(count2))))
264        .collect()
265}
266
267/// Compute multiset-based candidates (Phase 2 refinement).
268///
269/// After selecting candidates using sets, this refines the ranking using multisets.
270///
271/// Corresponds to Python: `compute_candidates()` step 2 in match_set.py (line 311-350)
272pub fn compute_candidates_with_msets(
273    index: &LicenseIndex,
274    query_run: &QueryRun,
275    high_resemblance: bool,
276    top_n: usize,
277) -> Vec<Candidate> {
278    let query_tokens = query_run.matchable_tokens();
279    if query_tokens.is_empty() {
280        return Vec::new();
281    }
282
283    let query_token_ids: Vec<TokenId> = query_tokens
284        .iter()
285        .filter(|&&tid| tid >= 0)
286        .map(|&tid| TokenId::new(tid as u16))
287        .collect();
288
289    if query_token_ids.is_empty() {
290        return Vec::new();
291    }
292
293    let (query_set, query_mset) = build_set_and_mset(&query_token_ids);
294
295    // Build the set of high-value tokens in the query
296    let query_high_set: HashSet<TokenId> = query_set
297        .iter()
298        .filter(|tid| tid.as_usize() < index.len_legalese)
299        .copied()
300        .collect();
301
302    if query_high_set.is_empty() {
303        return Vec::new();
304    }
305
306    // Use inverted index to find candidate rules that share high-value tokens
307    let candidate_rids: HashSet<usize> = query_high_set
308        .iter()
309        .filter_map(|tid| index.rids_by_high_tid.get(tid))
310        .flat_map(|rids| rids.iter().copied())
311        .collect();
312
313    if candidate_rids.is_empty() {
314        return Vec::new();
315    }
316
317    let mut step1_candidates: Vec<(ScoresVector, ScoresVector, usize, Rule, HashSet<TokenId>)> =
318        Vec::new();
319
320    for rid in candidate_rids {
321        let Some(rule) = index.rules_by_rid.get(rid) else {
322            continue;
323        };
324        let Some(rule_set) = index.sets_by_rid.get(&rid) else {
325            continue;
326        };
327        let Some(rule_high_set) = index.high_sets_by_rid.get(&rid) else {
328            continue;
329        };
330
331        // STEP 1: Compute HIGH intersection first (smaller sets, faster)
332        // Check size without allocation for early rejection
333        let high_intersection_size = query_high_set.intersection(rule_high_set).count();
334        if high_intersection_size < rule.min_high_matched_length_unique {
335            continue;
336        }
337
338        // Allocate the high intersection (passed threshold check)
339        let high_set_intersection: HashSet<TokenId> = query_high_set
340            .intersection(rule_high_set)
341            .copied()
342            .collect();
343        if high_set_intersection.is_empty() {
344            continue;
345        }
346
347        // STEP 2: Only now compute FULL intersection (fewer candidates reach here)
348        let intersection: HashSet<TokenId> = query_set.intersection(rule_set).copied().collect();
349        if intersection.is_empty() {
350            continue;
351        }
352
353        // Check total intersection threshold
354        let matched_length = tids_set_counter(&intersection);
355        if matched_length < rule.min_matched_length_unique {
356            continue;
357        }
358
359        // Compute resemblance using TOTAL intersection, not just high
360        let qset_len = query_set.len();
361        let iset_len = rule.length_unique;
362        if qset_len == 0 || iset_len == 0 {
363            continue;
364        }
365
366        let union_len = qset_len + iset_len - matched_length;
367        let resemblance = matched_length as f64 / union_len as f64;
368        let containment = matched_length as f64 / iset_len as f64;
369
370        // Check minimum_containment (Python: match_set.py:429-433)
371        // Rules with minimum_coverage require a minimum containment ratio
372        let minimum_containment = rule.minimum_coverage.map(|mc| mc as f64 / 100.0);
373        if let Some(min_cont) = minimum_containment
374            && containment < min_cont
375        {
376            continue;
377        }
378
379        let (svr, svf) = build_score_vectors(resemblance, containment, matched_length, rid);
380
381        if high_resemblance && (!svr.is_highly_resemblant || !svf.is_highly_resemblant) {
382            continue;
383        }
384
385        step1_candidates.push((svr, svf, rid, rule.clone(), high_set_intersection));
386    }
387
388    if step1_candidates.is_empty() {
389        return Vec::new();
390    }
391
392    step1_candidates.sort_by(|a, b| compare_candidate_rank(&b.0, &b.1, b.2, &a.0, &a.1, a.2));
393
394    step1_candidates.truncate(top_n * 10);
395
396    let mut sortable_candidates: Vec<Candidate> = Vec::new();
397
398    for (_svr, _svf, rid, rule, high_set_intersection) in step1_candidates {
399        let Some(rule_mset) = index.msets_by_rid.get(&rid) else {
400            continue;
401        };
402
403        // Filter using HIGH multisets (Python: high_intersection check)
404        let query_high_mset = high_multiset_subset(&query_mset, &index.dictionary);
405        let rule_high_mset = high_multiset_subset(rule_mset, &index.dictionary);
406        let high_intersection_mset = multisets_intersector(&query_high_mset, &rule_high_mset);
407        if high_intersection_mset.is_empty() {
408            continue;
409        }
410
411        let high_matched_length: usize = high_intersection_mset.values().sum();
412        if high_matched_length < rule.min_high_matched_length {
413            continue;
414        }
415
416        // Compute scores using FULL multisets (Python: matched_length = counter(intersection))
417        let full_intersection_mset = multisets_intersector(&query_mset, rule_mset);
418        let matched_length: usize = full_intersection_mset.values().sum();
419        if matched_length < rule.min_matched_length {
420            continue;
421        }
422        let qset_len: usize = query_mset.values().sum();
423        let iset_len: usize = rule_mset.values().sum();
424
425        if qset_len == 0 || iset_len == 0 {
426            continue;
427        }
428
429        let union_len = qset_len + iset_len - matched_length;
430        let resemblance = matched_length as f64 / union_len as f64;
431        let containment = matched_length as f64 / iset_len as f64;
432
433        // Check minimum_containment (Python: match_set.py:429-433)
434        // Rules with minimum_coverage require a minimum containment ratio
435        let minimum_containment = rule.minimum_coverage.map(|mc| mc as f64 / 100.0);
436        if let Some(min_cont) = minimum_containment
437            && containment < min_cont
438        {
439            continue;
440        }
441
442        let (score_vec_rounded, score_vec_full) =
443            build_score_vectors(resemblance, containment, matched_length, rid);
444
445        if high_resemblance
446            && (!score_vec_rounded.is_highly_resemblant || !score_vec_full.is_highly_resemblant)
447        {
448            continue;
449        }
450
451        sortable_candidates.push(Candidate {
452            score_vec_rounded,
453            score_vec_full,
454            rid,
455            rule,
456            high_set_intersection,
457        });
458    }
459
460    sortable_candidates = filter_dupes(sortable_candidates);
461
462    sortable_candidates.sort_by(|a, b| b.cmp(a));
463    sortable_candidates.truncate(top_n);
464
465    sortable_candidates
466}
467
468#[cfg(test)]
469mod tests {
470    use super::*;
471    use crate::license_detection::index::dictionary::tid;
472
473    #[test]
474    fn test_scores_vector_comparison() {
475        let sv1 = ScoresVector {
476            is_highly_resemblant: true,
477            containment: 0.9,
478            resemblance: 0.8,
479            matched_length: 10.0,
480            rid: 0,
481        };
482
483        let sv2 = ScoresVector {
484            is_highly_resemblant: false,
485            containment: 0.8,
486            resemblance: 0.6,
487            matched_length: 5.0,
488            rid: 1,
489        };
490
491        assert!(sv1 > sv2);
492    }
493
494    #[test]
495    fn test_python_round_tenths_matches_python_half_even_behavior() {
496        assert_eq!(python_round_tenths(0.05), 0.1);
497        assert_eq!(python_round_tenths(0.15), 0.1);
498        assert_eq!(python_round_tenths(0.25), 0.2);
499        assert_eq!(python_round_tenths(2.25), 2.2);
500        assert_eq!(python_round_tenths(4.35), 4.3);
501        assert_eq!(python_round_tenths(6.65), 6.7);
502    }
503
504    #[test]
505    fn test_candidate_ordering() {
506        let candidate1 = Candidate {
507            score_vec_rounded: ScoresVector {
508                is_highly_resemblant: true,
509                containment: 0.9,
510                resemblance: 0.8,
511                matched_length: 10.0,
512                rid: 0,
513            },
514            score_vec_full: ScoresVector {
515                is_highly_resemblant: true,
516                containment: 0.9,
517                resemblance: 0.8,
518                matched_length: 10.0,
519                rid: 0,
520            },
521            rid: 0,
522            rule: Rule {
523                identifier: "test1".to_string(),
524                license_expression: "mit".to_string(),
525                text: String::new(),
526                tokens: vec![],
527                rule_kind: crate::license_detection::models::RuleKind::Text,
528                is_false_positive: false,
529                is_required_phrase: false,
530                is_from_license: false,
531                relevance: 100,
532                minimum_coverage: None,
533                has_stored_minimum_coverage: false,
534                is_continuous: true,
535                referenced_filenames: None,
536                ignorable_urls: None,
537                ignorable_emails: None,
538                ignorable_copyrights: None,
539                ignorable_holders: None,
540                ignorable_authors: None,
541                language: None,
542                notes: None,
543                length_unique: 0,
544                high_length_unique: 0,
545                high_length: 0,
546                min_matched_length: 0,
547                min_high_matched_length: 0,
548                min_matched_length_unique: 0,
549                min_high_matched_length_unique: 0,
550                is_small: false,
551                is_tiny: false,
552                starts_with_license: false,
553                ends_with_license: false,
554                is_deprecated: false,
555                spdx_license_key: None,
556                other_spdx_license_keys: vec![],
557                required_phrase_spans: vec![],
558                stopwords_by_pos: std::collections::HashMap::new(),
559            },
560            high_set_intersection: HashSet::new(),
561        };
562
563        let candidate2 = Candidate {
564            score_vec_rounded: ScoresVector {
565                is_highly_resemblant: false,
566                containment: 0.5,
567                resemblance: 0.3,
568                matched_length: 5.0,
569                rid: 1,
570            },
571            score_vec_full: ScoresVector {
572                is_highly_resemblant: false,
573                containment: 0.5,
574                resemblance: 0.3,
575                matched_length: 5.0,
576                rid: 1,
577            },
578            rid: 1,
579            rule: Rule {
580                identifier: "test2".to_string(),
581                license_expression: "apache".to_string(),
582                text: String::new(),
583                tokens: vec![],
584                rule_kind: crate::license_detection::models::RuleKind::Text,
585                is_false_positive: false,
586                is_required_phrase: false,
587                is_from_license: false,
588                relevance: 100,
589                minimum_coverage: None,
590                has_stored_minimum_coverage: false,
591                is_continuous: true,
592                referenced_filenames: None,
593                ignorable_urls: None,
594                ignorable_emails: None,
595                ignorable_copyrights: None,
596                ignorable_holders: None,
597                ignorable_authors: None,
598                language: None,
599                notes: None,
600                length_unique: 0,
601                high_length_unique: 0,
602                high_length: 0,
603                min_matched_length: 0,
604                min_high_matched_length: 0,
605                min_matched_length_unique: 0,
606                min_high_matched_length_unique: 0,
607                is_small: false,
608                is_tiny: false,
609                starts_with_license: false,
610                ends_with_license: false,
611                is_deprecated: false,
612                spdx_license_key: None,
613                other_spdx_license_keys: vec![],
614                required_phrase_spans: vec![],
615                stopwords_by_pos: std::collections::HashMap::new(),
616            },
617            high_set_intersection: HashSet::new(),
618        };
619
620        assert!(
621            candidate1 > candidate2,
622            "Higher containment candidate should rank higher"
623        );
624    }
625
626    #[test]
627    fn test_filter_dupes_matched_length_precision() {
628        let rule1 = Rule {
629            identifier: "x11-dec1.RULE".to_string(),
630            license_expression: "x11-dec1".to_string(),
631            text: String::new(),
632            tokens: vec![tid(0); 138],
633            rule_kind: crate::license_detection::models::RuleKind::Text,
634            is_false_positive: false,
635            is_required_phrase: false,
636            is_from_license: false,
637            relevance: 100,
638            minimum_coverage: None,
639            has_stored_minimum_coverage: false,
640            is_continuous: true,
641            referenced_filenames: None,
642            ignorable_urls: None,
643            ignorable_emails: None,
644            ignorable_copyrights: None,
645            ignorable_holders: None,
646            ignorable_authors: None,
647            language: None,
648            notes: None,
649            length_unique: 0,
650            high_length_unique: 0,
651            high_length: 0,
652            min_matched_length: 0,
653            min_high_matched_length: 0,
654            min_matched_length_unique: 0,
655            min_high_matched_length_unique: 0,
656            is_small: false,
657            is_tiny: false,
658            starts_with_license: false,
659            ends_with_license: false,
660            is_deprecated: false,
661            spdx_license_key: None,
662            other_spdx_license_keys: vec![],
663            required_phrase_spans: vec![],
664            stopwords_by_pos: std::collections::HashMap::new(),
665        };
666
667        let rule2 = Rule {
668            identifier: "cmu-uc.RULE".to_string(),
669            license_expression: "cmu-uc".to_string(),
670            text: String::new(),
671            tokens: vec![tid(0); 133],
672            ..rule1.clone()
673        };
674
675        let candidate1 = Candidate {
676            score_vec_rounded: ScoresVector {
677                is_highly_resemblant: false,
678                containment: 0.5,
679                resemblance: 0.25,
680                matched_length: 7.0,
681                rid: 1,
682            },
683            score_vec_full: ScoresVector {
684                is_highly_resemblant: false,
685                containment: 0.5,
686                resemblance: 0.25,
687                matched_length: 138.0,
688                rid: 1,
689            },
690            rid: 1,
691            rule: rule1,
692            high_set_intersection: HashSet::new(),
693        };
694
695        let candidate2 = Candidate {
696            score_vec_rounded: ScoresVector {
697                is_highly_resemblant: false,
698                containment: 0.5,
699                resemblance: 0.25,
700                matched_length: 7.0,
701                rid: 2,
702            },
703            score_vec_full: ScoresVector {
704                is_highly_resemblant: false,
705                containment: 0.5,
706                resemblance: 0.25,
707                matched_length: 133.0,
708                rid: 2,
709            },
710            rid: 2,
711            rule: rule2,
712            high_set_intersection: HashSet::new(),
713        };
714
715        let candidates = vec![candidate1, candidate2];
716        let filtered = filter_dupes(candidates);
717
718        assert_eq!(
719            filtered.len(),
720            2,
721            "Should keep both candidates when matched_length differs at 1-decimal precision: 138/20=6.9 vs 133/20=6.7"
722        );
723    }
724
725    #[test]
726    fn test_filter_dupes_same_group() {
727        let rule1 = Rule {
728            identifier: "mit.RULE".to_string(),
729            license_expression: "mit".to_string(),
730            text: String::new(),
731            tokens: vec![tid(0); 100],
732            rule_kind: crate::license_detection::models::RuleKind::Text,
733            is_false_positive: false,
734            is_required_phrase: false,
735            is_from_license: false,
736            relevance: 100,
737            minimum_coverage: None,
738            has_stored_minimum_coverage: false,
739            is_continuous: true,
740            referenced_filenames: None,
741            ignorable_urls: None,
742            ignorable_emails: None,
743            ignorable_copyrights: None,
744            ignorable_holders: None,
745            ignorable_authors: None,
746            language: None,
747            notes: None,
748            length_unique: 0,
749            high_length_unique: 0,
750            high_length: 0,
751            min_matched_length: 0,
752            min_high_matched_length: 0,
753            min_matched_length_unique: 0,
754            min_high_matched_length_unique: 0,
755            is_small: false,
756            is_tiny: false,
757            starts_with_license: false,
758            ends_with_license: false,
759            is_deprecated: false,
760            spdx_license_key: None,
761            other_spdx_license_keys: vec![],
762            required_phrase_spans: vec![],
763            stopwords_by_pos: std::collections::HashMap::new(),
764        };
765
766        let rule2 = Rule {
767            identifier: "mit_2.RULE".to_string(),
768            license_expression: "mit".to_string(),
769            text: String::new(),
770            tokens: vec![tid(0); 100],
771            ..rule1.clone()
772        };
773
774        let candidate1 = Candidate {
775            score_vec_rounded: ScoresVector {
776                is_highly_resemblant: false,
777                containment: 0.5,
778                resemblance: 0.25,
779                matched_length: 5.0,
780                rid: 1,
781            },
782            score_vec_full: ScoresVector {
783                is_highly_resemblant: false,
784                containment: 0.5,
785                resemblance: 0.25,
786                matched_length: 100.0,
787                rid: 1,
788            },
789            rid: 1,
790            rule: rule1,
791            high_set_intersection: HashSet::new(),
792        };
793
794        let candidate2 = Candidate {
795            score_vec_rounded: ScoresVector {
796                is_highly_resemblant: false,
797                containment: 0.5,
798                resemblance: 0.25,
799                matched_length: 5.0,
800                rid: 2,
801            },
802            score_vec_full: ScoresVector {
803                is_highly_resemblant: false,
804                containment: 0.5,
805                resemblance: 0.25,
806                matched_length: 100.0,
807                rid: 2,
808            },
809            rid: 2,
810            rule: rule2,
811            high_set_intersection: HashSet::new(),
812        };
813
814        let candidates = vec![candidate1, candidate2];
815        let filtered = filter_dupes(candidates);
816
817        assert_eq!(
818            filtered.len(),
819            1,
820            "Should keep only one candidate when all group keys match"
821        );
822    }
823
824    #[test]
825    fn test_filter_dupes_prefers_higher_identifier_when_full_scores_tie() {
826        let rule_sa = Rule {
827            identifier: "cc-by-sa-1.0.RULE".to_string(),
828            license_expression: "cc-by-sa-1.0".to_string(),
829            text: String::new(),
830            tokens: vec![tid(0); 1960],
831            rule_kind: crate::license_detection::models::RuleKind::Text,
832            is_false_positive: false,
833            is_required_phrase: false,
834            is_from_license: false,
835            relevance: 100,
836            minimum_coverage: None,
837            has_stored_minimum_coverage: false,
838            is_continuous: true,
839            referenced_filenames: None,
840            ignorable_urls: None,
841            ignorable_emails: None,
842            ignorable_copyrights: None,
843            ignorable_holders: None,
844            ignorable_authors: None,
845            language: None,
846            notes: None,
847            length_unique: 0,
848            high_length_unique: 0,
849            high_length: 0,
850            min_matched_length: 0,
851            min_high_matched_length: 0,
852            min_matched_length_unique: 0,
853            min_high_matched_length_unique: 0,
854            is_small: false,
855            is_tiny: false,
856            starts_with_license: false,
857            ends_with_license: false,
858            is_deprecated: false,
859            spdx_license_key: None,
860            other_spdx_license_keys: vec![],
861            required_phrase_spans: vec![],
862            stopwords_by_pos: std::collections::HashMap::new(),
863        };
864
865        let rule_nc_sa = Rule {
866            identifier: "cc-by-nc-sa-1.0.RULE".to_string(),
867            license_expression: "cc-by-nc-sa-1.0".to_string(),
868            text: String::new(),
869            tokens: vec![tid(0); 1829],
870            rule_kind: crate::license_detection::models::RuleKind::Text,
871            is_false_positive: false,
872            is_required_phrase: false,
873            is_from_license: false,
874            relevance: 100,
875            minimum_coverage: None,
876            has_stored_minimum_coverage: false,
877            is_continuous: true,
878            referenced_filenames: None,
879            ignorable_urls: None,
880            ignorable_emails: None,
881            ignorable_copyrights: None,
882            ignorable_holders: None,
883            ignorable_authors: None,
884            language: None,
885            notes: None,
886            length_unique: 0,
887            high_length_unique: 0,
888            high_length: 0,
889            min_matched_length: 0,
890            min_high_matched_length: 0,
891            min_matched_length_unique: 0,
892            min_high_matched_length_unique: 0,
893            is_small: false,
894            is_tiny: false,
895            starts_with_license: false,
896            ends_with_license: false,
897            is_deprecated: false,
898            spdx_license_key: None,
899            other_spdx_license_keys: vec![],
900            required_phrase_spans: vec![],
901            stopwords_by_pos: std::collections::HashMap::new(),
902        };
903
904        let candidate_sa = Candidate {
905            score_vec_rounded: ScoresVector {
906                is_highly_resemblant: true,
907                containment: 0.9,
908                resemblance: 0.8,
909                matched_length: 100.0,
910                rid: 1,
911            },
912            score_vec_full: ScoresVector {
913                is_highly_resemblant: true,
914                containment: 0.9,
915                resemblance: 0.8,
916                matched_length: 100.0,
917                rid: 1,
918            },
919            rid: 1,
920            rule: rule_sa,
921            high_set_intersection: HashSet::new(),
922        };
923
924        let candidate_nc_sa = Candidate {
925            score_vec_rounded: ScoresVector {
926                is_highly_resemblant: true,
927                containment: 0.9,
928                resemblance: 0.8,
929                matched_length: 100.0,
930                rid: 2,
931            },
932            score_vec_full: ScoresVector {
933                is_highly_resemblant: true,
934                containment: 0.9,
935                resemblance: 0.8,
936                matched_length: 100.0,
937                rid: 2,
938            },
939            rid: 2,
940            rule: rule_nc_sa,
941            high_set_intersection: HashSet::new(),
942        };
943
944        let candidates = vec![candidate_nc_sa, candidate_sa];
945        let filtered = filter_dupes(candidates);
946
947        assert_eq!(
948            filtered.len(),
949            2,
950            "Different license expressions should create different groups"
951        );
952
953        let mut same_group_candidates = vec![filtered[0].clone(), filtered[1].clone()];
954        for candidate in &mut same_group_candidates {
955            candidate.rule.license_expression = "same".to_string();
956            candidate.rule.tokens = vec![tid(0); 100];
957        }
958
959        let deduped = filter_dupes(same_group_candidates);
960        assert_eq!(deduped.len(), 1);
961        assert_eq!(deduped[0].rule.identifier, "cc-by-sa-1.0.RULE");
962    }
963
964    #[test]
965    fn test_candidate_ordering_uses_rid_after_equal_scores() {
966        let rule_a = Rule {
967            identifier: "a.RULE".to_string(),
968            license_expression: "a".to_string(),
969            text: String::new(),
970            tokens: vec![tid(0); 10],
971            rule_kind: crate::license_detection::models::RuleKind::Text,
972            is_false_positive: false,
973            is_required_phrase: false,
974            is_from_license: false,
975            relevance: 100,
976            minimum_coverage: None,
977            has_stored_minimum_coverage: false,
978            is_continuous: true,
979            referenced_filenames: None,
980            ignorable_urls: None,
981            ignorable_emails: None,
982            ignorable_copyrights: None,
983            ignorable_holders: None,
984            ignorable_authors: None,
985            language: None,
986            notes: None,
987            length_unique: 0,
988            high_length_unique: 0,
989            high_length: 0,
990            min_matched_length: 0,
991            min_high_matched_length: 0,
992            min_matched_length_unique: 0,
993            min_high_matched_length_unique: 0,
994            is_small: false,
995            is_tiny: false,
996            starts_with_license: false,
997            ends_with_license: false,
998            is_deprecated: false,
999            spdx_license_key: None,
1000            other_spdx_license_keys: vec![],
1001            required_phrase_spans: vec![],
1002            stopwords_by_pos: std::collections::HashMap::new(),
1003        };
1004
1005        let candidate_low_rid = Candidate {
1006            score_vec_rounded: ScoresVector {
1007                is_highly_resemblant: true,
1008                containment: 0.9,
1009                resemblance: 0.8,
1010                matched_length: 10.0,
1011                rid: 1,
1012            },
1013            score_vec_full: ScoresVector {
1014                is_highly_resemblant: true,
1015                containment: 0.9,
1016                resemblance: 0.8,
1017                matched_length: 10.0,
1018                rid: 1,
1019            },
1020            rid: 1,
1021            rule: Rule {
1022                identifier: "z.RULE".to_string(),
1023                ..rule_a.clone()
1024            },
1025            high_set_intersection: HashSet::new(),
1026        };
1027
1028        let candidate_high_rid = Candidate {
1029            score_vec_rounded: ScoresVector {
1030                rid: 2,
1031                ..candidate_low_rid.score_vec_rounded.clone()
1032            },
1033            score_vec_full: ScoresVector {
1034                rid: 2,
1035                ..candidate_low_rid.score_vec_full.clone()
1036            },
1037            rid: 2,
1038            rule: Rule {
1039                identifier: "a.RULE".to_string(),
1040                ..rule_a
1041            },
1042            high_set_intersection: HashSet::new(),
1043        };
1044
1045        let mut sorted = [candidate_low_rid, candidate_high_rid];
1046        sorted.sort_by(|a, b| b.cmp(a));
1047        assert_eq!(
1048            sorted[0].rid, 2,
1049            "Python final candidate tuple ordering falls back to higher rid after equal scores"
1050        );
1051    }
1052}