provenant/license_detection/seq_match/
candidates.rs

1//! Candidate selection using set and multiset similarity.
2
3use crate::license_detection::index::LicenseIndex;
4use crate::license_detection::index::dictionary::TokenId;
5use crate::license_detection::index::token_sets::{
6    build_set_and_mset, high_multiset_subset, tids_set_counter,
7};
8use crate::license_detection::models::Rule;
9use crate::license_detection::query::QueryRun;
10use std::collections::{HashMap, HashSet};
11
12use super::HIGH_RESEMBLANCE_THRESHOLD;
13
14/// Score vector for ranking candidates using set similarity.
15///
16/// Contains metrics computed from set/multiset intersections.
17///
18/// Corresponds to Python: `ScoresVector` namedtuple in match_set.py (line 458)
19#[derive(Debug, Clone, PartialEq)]
20pub struct ScoresVector {
21    /// True if the sets are highly similar (resemblance >= threshold)
22    pub is_highly_resemblant: bool,
23    /// Containment ratio (how much of rule is in query)
24    pub containment: f32,
25    /// Amplified resemblance (squared to boost high values)
26    pub resemblance: f32,
27    /// Number of matched tokens (normalized for ranking)
28    pub matched_length: f32,
29    /// Rule ID for tie-breaking
30    pub rid: usize,
31}
32
33impl PartialOrd for ScoresVector {
34    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
35        Some(self.cmp(other))
36    }
37}
38
39impl Eq for ScoresVector {}
40
41impl Ord for ScoresVector {
42    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
43        // Python sorts ScoresVector namedtuple with reverse=True:
44        // 1. is_highly_resemblant (True > False)
45        // 2. containment (higher is better)
46        // 3. resemblance (higher is better)
47        // 4. matched_length (higher is better)
48        // Note: Python does NOT use rid for tie-breaking in ScoresVector
49        self.is_highly_resemblant
50            .cmp(&other.is_highly_resemblant)
51            .then_with(|| {
52                self.containment
53                    .partial_cmp(&other.containment)
54                    .unwrap_or(std::cmp::Ordering::Equal)
55            })
56            .then_with(|| {
57                self.resemblance
58                    .partial_cmp(&other.resemblance)
59                    .unwrap_or(std::cmp::Ordering::Equal)
60            })
61            .then_with(|| {
62                self.matched_length
63                    .partial_cmp(&other.matched_length)
64                    .unwrap_or(std::cmp::Ordering::Equal)
65            })
66    }
67}
68
69/// Candidate with its score vector and metadata.
70///
71/// Corresponds to the tuple structure used in Python: (scores_vectors, rid, rule, high_set_intersection)
72#[derive(Debug, Clone, PartialEq)]
73pub struct Candidate<'a> {
74    /// Rounded score vector for display/grouping
75    pub score_vec_rounded: ScoresVector,
76    /// Full score vector for sorting
77    pub score_vec_full: ScoresVector,
78    /// Rule ID
79    pub rid: usize,
80    /// Reference to the rule (borrowed from LicenseIndex)
81    pub rule: &'a Rule,
82    /// Set of high-value (legalese) tokens in the intersection
83    pub high_set_intersection: HashSet<TokenId>,
84}
85
86impl PartialOrd for Candidate<'_> {
87    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
88        Some(self.cmp(other))
89    }
90}
91
92impl Eq for Candidate<'_> {}
93
94impl Ord for Candidate<'_> {
95    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
96        // Python sorts the tuple ((svr, svf), rid, rule, ...) with reverse=True
97        // So it compares (svr, svf) tuple first, which means:
98        // 1. Compare rounded (svr) first
99        // 2. Then compare full (svf) if rounded is equal
100        // 3. Then compare rid if scores are still equal.
101        compare_candidate_rank(
102            &self.score_vec_rounded,
103            &self.score_vec_full,
104            self.rid,
105            &other.score_vec_rounded,
106            &other.score_vec_full,
107            other.rid,
108        )
109    }
110}
111
112fn compare_candidate_rank(
113    rounded: &ScoresVector,
114    full: &ScoresVector,
115    rid: usize,
116    other_rounded: &ScoresVector,
117    other_full: &ScoresVector,
118    other_rid: usize,
119) -> std::cmp::Ordering {
120    rounded
121        .cmp(other_rounded)
122        .then_with(|| full.cmp(other_full))
123        .then_with(|| rid.cmp(&other_rid))
124}
125
126fn python_round_tenths(value: f64) -> f32 {
127    let rendered = format!("{value:.20}");
128    let (whole, frac) = rendered.split_once('.').unwrap_or((rendered.as_str(), "0"));
129
130    let whole_part: i64 = whole.parse().unwrap_or(0);
131    let mut frac_chars = frac.chars();
132    let tenths = frac_chars.next().and_then(|c| c.to_digit(10)).unwrap_or(0) as i64;
133    let rest: String = frac_chars.collect();
134
135    let threshold = format!("5{}", "0".repeat(rest.len().saturating_sub(1)));
136    let should_round_up = if rest > threshold {
137        true
138    } else if rest == threshold {
139        tenths % 2 == 1
140    } else {
141        false
142    };
143
144    let mut scaled = whole_part * 10 + tenths;
145    if should_round_up {
146        scaled += 1;
147    }
148
149    scaled as f32 / 10.0
150}
151
152fn quantize_tenths(value: f32) -> i32 {
153    format!("{value:.1}")
154        .chars()
155        .filter(|c| *c != '.')
156        .collect::<String>()
157        .parse()
158        .unwrap_or(0)
159}
160
161fn build_score_vectors(
162    resemblance: f64,
163    containment: f64,
164    matched_length: usize,
165    rid: usize,
166) -> (ScoresVector, ScoresVector) {
167    let amplified_resemblance = resemblance * resemblance;
168
169    let score_vec_rounded = ScoresVector {
170        is_highly_resemblant: python_round_tenths(resemblance) >= HIGH_RESEMBLANCE_THRESHOLD,
171        containment: python_round_tenths(containment),
172        resemblance: python_round_tenths(amplified_resemblance),
173        matched_length: python_round_tenths(matched_length as f64 / 20.0),
174        rid,
175    };
176
177    let score_vec_full = ScoresVector {
178        is_highly_resemblant: resemblance >= f64::from(HIGH_RESEMBLANCE_THRESHOLD),
179        containment: containment as f32,
180        resemblance: amplified_resemblance as f32,
181        matched_length: matched_length as f32,
182        rid,
183    };
184
185    (score_vec_rounded, score_vec_full)
186}
187
188/// Key for grouping duplicate candidates.
189///
190/// Candidates with the same DupeGroupKey are considered duplicates,
191/// and only the best one is kept.
192///
193/// Corresponds to Python: `filter_dupes.group_key()` in match_set.py (line 467-476)
194#[derive(Debug, Clone, Hash, PartialEq, Eq)]
195struct DupeGroupKey {
196    license_expression: String,
197    is_highly_resemblant: bool,
198    containment: i32,
199    resemblance: i32,
200    matched_length: i32,
201    rule_length: usize,
202}
203
204/// Filter duplicate candidates, keeping only the best from each group.
205///
206/// Candidates are grouped by (license_expression, is_highly_resemblant, containment,
207/// resemblance, matched_length, rule_length). Within each group, candidates are
208/// ranked by (score_vec_full, rule.identifier) and only the best is kept.
209///
210/// This matches Python's filter_dupes behavior where matched_length uses 1-decimal
211/// precision (e.g., 6.9 and 6.7 are different, but 7 and 7 would be same).
212///
213/// Corresponds to Python: `filter_dupes()` in match_set.py (line 461-498)
214pub(super) fn filter_dupes(candidates: Vec<Candidate<'_>>) -> Vec<Candidate<'_>> {
215    let mut groups: HashMap<DupeGroupKey, Vec<Candidate>> = HashMap::new();
216
217    for candidate in candidates {
218        let key = DupeGroupKey {
219            license_expression: candidate.rule.license_expression.clone(),
220            is_highly_resemblant: candidate.score_vec_rounded.is_highly_resemblant,
221            containment: quantize_tenths(candidate.score_vec_rounded.containment),
222            resemblance: quantize_tenths(candidate.score_vec_rounded.resemblance),
223            matched_length: quantize_tenths(candidate.score_vec_rounded.matched_length),
224            rule_length: candidate.rule.tokens.len(),
225        };
226        groups.entry(key).or_default().push(candidate);
227    }
228
229    let mut result: Vec<Candidate> = Vec::new();
230    for mut group in groups.into_values() {
231        // Python: duplicates = sorted(duplicates, reverse=True, key=lambda x: (sv_full, rule.identifier))
232        // Higher sv_full wins, then HIGHER identifier alphabetically (reverse=True)
233        group.sort_by(|a, b| {
234            b.score_vec_full
235                .cmp(&a.score_vec_full)
236                .then_with(|| b.rule.identifier.cmp(&a.rule.identifier))
237        });
238        if let Some(best) = group.into_iter().next() {
239            result.push(best);
240        }
241    }
242
243    result
244}
245
246/// Compute intersection of two multisets.
247///
248/// For each token ID present in both multisets, the intersection value is the
249/// smaller of the occurrence counts.
250///
251/// Corresponds to Python: `multisets_intersector()` in match_set.py (line 119)
252pub fn multisets_intersector(
253    qmset: &HashMap<TokenId, usize>,
254    imset: &HashMap<TokenId, usize>,
255) -> HashMap<TokenId, usize> {
256    let (set1, set2) = if qmset.len() < imset.len() {
257        (qmset, imset)
258    } else {
259        (imset, qmset)
260    };
261
262    set1.iter()
263        .filter_map(|(&tid, &count1)| set2.get(&tid).map(|&count2| (tid, count1.min(count2))))
264        .collect()
265}
266
267/// Compute multiset-based candidates (Phase 2 refinement).
268///
269/// After selecting candidates using sets, this refines the ranking using multisets.
270///
271/// Corresponds to Python: `compute_candidates()` step 2 in match_set.py (line 311-350)
272pub fn compute_candidates_with_msets<'a>(
273    index: &'a LicenseIndex,
274    query_run: &QueryRun,
275    high_resemblance: bool,
276    top_n: usize,
277) -> Vec<Candidate<'a>> {
278    let query_tokens = query_run.matchable_tokens();
279    if query_tokens.is_empty() {
280        return Vec::new();
281    }
282
283    let query_token_ids: Vec<TokenId> = query_tokens
284        .iter()
285        .filter(|&&tid| tid >= 0)
286        .map(|&tid| TokenId::new(tid as u16))
287        .collect();
288
289    if query_token_ids.is_empty() {
290        return Vec::new();
291    }
292
293    let (query_set, query_mset) = build_set_and_mset(&query_token_ids);
294
295    // Build the set of high-value tokens in the query
296    let query_high_set: HashSet<TokenId> = query_set
297        .iter()
298        .filter(|tid| tid.as_usize() < index.len_legalese)
299        .copied()
300        .collect();
301
302    if query_high_set.is_empty() {
303        return Vec::new();
304    }
305
306    // Use inverted index to find candidate rules that share high-value tokens
307    let candidate_rids: HashSet<usize> = query_high_set
308        .iter()
309        .filter_map(|tid| index.rids_by_high_tid.get(tid))
310        .flat_map(|rids| rids.iter().copied())
311        .collect();
312
313    if candidate_rids.is_empty() {
314        return Vec::new();
315    }
316
317    let mut step1_candidates: Vec<(
318        ScoresVector,
319        ScoresVector,
320        usize,
321        &'a Rule,
322        HashSet<TokenId>,
323    )> = Vec::new();
324
325    for rid in candidate_rids {
326        let Some(rule) = index.rules_by_rid.get(rid) else {
327            continue;
328        };
329        let Some(rule_set) = index.sets_by_rid.get(&rid) else {
330            continue;
331        };
332        let Some(rule_high_set) = index.high_sets_by_rid.get(&rid) else {
333            continue;
334        };
335
336        // STEP 1: Compute HIGH intersection first (smaller sets, faster)
337        // Check size without allocation for early rejection
338        let high_intersection_size = query_high_set.intersection(rule_high_set).count();
339        if high_intersection_size < rule.min_high_matched_length_unique {
340            continue;
341        }
342
343        // Allocate the high intersection (passed threshold check)
344        let high_set_intersection: HashSet<TokenId> = query_high_set
345            .intersection(rule_high_set)
346            .copied()
347            .collect();
348        if high_set_intersection.is_empty() {
349            continue;
350        }
351
352        // STEP 2: Only now compute FULL intersection (fewer candidates reach here)
353        let intersection: HashSet<TokenId> = query_set.intersection(rule_set).copied().collect();
354        if intersection.is_empty() {
355            continue;
356        }
357
358        // Check total intersection threshold
359        let matched_length = tids_set_counter(&intersection);
360        if matched_length < rule.min_matched_length_unique {
361            continue;
362        }
363
364        // Compute resemblance using TOTAL intersection, not just high
365        let qset_len = query_set.len();
366        let iset_len = rule.length_unique;
367        if qset_len == 0 || iset_len == 0 {
368            continue;
369        }
370
371        let union_len = qset_len + iset_len - matched_length;
372        let resemblance = matched_length as f64 / union_len as f64;
373        let containment = matched_length as f64 / iset_len as f64;
374
375        // Check minimum_containment (Python: match_set.py:429-433)
376        // Rules with minimum_coverage require a minimum containment ratio
377        let minimum_containment = rule.minimum_coverage.map(|mc| mc as f64 / 100.0);
378        if let Some(min_cont) = minimum_containment
379            && containment < min_cont
380        {
381            continue;
382        }
383
384        let (svr, svf) = build_score_vectors(resemblance, containment, matched_length, rid);
385
386        if high_resemblance && (!svr.is_highly_resemblant || !svf.is_highly_resemblant) {
387            continue;
388        }
389
390        step1_candidates.push((svr, svf, rid, rule, high_set_intersection));
391    }
392
393    if step1_candidates.is_empty() {
394        return Vec::new();
395    }
396
397    step1_candidates.sort_by(|a, b| compare_candidate_rank(&b.0, &b.1, b.2, &a.0, &a.1, a.2));
398
399    step1_candidates.truncate(top_n * 10);
400
401    let mut sortable_candidates: Vec<Candidate<'a>> = Vec::new();
402
403    for (_svr, _svf, rid, rule, high_set_intersection) in step1_candidates {
404        let Some(rule_mset) = index.msets_by_rid.get(&rid) else {
405            continue;
406        };
407
408        // Filter using HIGH multisets (Python: high_intersection check)
409        let query_high_mset = high_multiset_subset(&query_mset, &index.dictionary);
410        let rule_high_mset = high_multiset_subset(rule_mset, &index.dictionary);
411        let high_intersection_mset = multisets_intersector(&query_high_mset, &rule_high_mset);
412        if high_intersection_mset.is_empty() {
413            continue;
414        }
415
416        let high_matched_length: usize = high_intersection_mset.values().sum();
417        if high_matched_length < rule.min_high_matched_length {
418            continue;
419        }
420
421        // Compute scores using FULL multisets (Python: matched_length = counter(intersection))
422        let full_intersection_mset = multisets_intersector(&query_mset, rule_mset);
423        let matched_length: usize = full_intersection_mset.values().sum();
424        if matched_length < rule.min_matched_length {
425            continue;
426        }
427        let qset_len: usize = query_mset.values().sum();
428        let iset_len: usize = rule_mset.values().sum();
429
430        if qset_len == 0 || iset_len == 0 {
431            continue;
432        }
433
434        let union_len = qset_len + iset_len - matched_length;
435        let resemblance = matched_length as f64 / union_len as f64;
436        let containment = matched_length as f64 / iset_len as f64;
437
438        // Check minimum_containment (Python: match_set.py:429-433)
439        // Rules with minimum_coverage require a minimum containment ratio
440        let minimum_containment = rule.minimum_coverage.map(|mc| mc as f64 / 100.0);
441        if let Some(min_cont) = minimum_containment
442            && containment < min_cont
443        {
444            continue;
445        }
446
447        let (score_vec_rounded, score_vec_full) =
448            build_score_vectors(resemblance, containment, matched_length, rid);
449
450        if high_resemblance
451            && (!score_vec_rounded.is_highly_resemblant || !score_vec_full.is_highly_resemblant)
452        {
453            continue;
454        }
455
456        sortable_candidates.push(Candidate {
457            score_vec_rounded,
458            score_vec_full,
459            rid,
460            rule,
461            high_set_intersection,
462        });
463    }
464
465    sortable_candidates = filter_dupes(sortable_candidates);
466
467    sortable_candidates.sort_by(|a, b| b.cmp(a));
468    sortable_candidates.truncate(top_n);
469
470    sortable_candidates
471}
472
473#[cfg(test)]
474mod tests {
475    use super::*;
476    use crate::license_detection::index::dictionary::tid;
477
478    #[test]
479    fn test_scores_vector_comparison() {
480        let sv1 = ScoresVector {
481            is_highly_resemblant: true,
482            containment: 0.9,
483            resemblance: 0.8,
484            matched_length: 10.0,
485            rid: 0,
486        };
487
488        let sv2 = ScoresVector {
489            is_highly_resemblant: false,
490            containment: 0.8,
491            resemblance: 0.6,
492            matched_length: 5.0,
493            rid: 1,
494        };
495
496        assert!(sv1 > sv2);
497    }
498
499    #[test]
500    fn test_python_round_tenths_matches_python_half_even_behavior() {
501        assert_eq!(python_round_tenths(0.05), 0.1);
502        assert_eq!(python_round_tenths(0.15), 0.1);
503        assert_eq!(python_round_tenths(0.25), 0.2);
504        assert_eq!(python_round_tenths(2.25), 2.2);
505        assert_eq!(python_round_tenths(4.35), 4.3);
506        assert_eq!(python_round_tenths(6.65), 6.7);
507    }
508
509    #[test]
510    fn test_candidate_ordering() {
511        let rule1 = Rule {
512            identifier: "test1".to_string(),
513            license_expression: "mit".to_string(),
514            text: String::new(),
515            tokens: vec![],
516            rule_kind: crate::license_detection::models::RuleKind::Text,
517            is_false_positive: false,
518            is_required_phrase: false,
519            is_from_license: false,
520            relevance: 100,
521            minimum_coverage: None,
522            has_stored_minimum_coverage: false,
523            is_continuous: true,
524            referenced_filenames: None,
525            ignorable_urls: None,
526            ignorable_emails: None,
527            ignorable_copyrights: None,
528            ignorable_holders: None,
529            ignorable_authors: None,
530            language: None,
531            notes: None,
532            length_unique: 0,
533            high_length_unique: 0,
534            high_length: 0,
535            min_matched_length: 0,
536            min_high_matched_length: 0,
537            min_matched_length_unique: 0,
538            min_high_matched_length_unique: 0,
539            is_small: false,
540            is_tiny: false,
541            starts_with_license: false,
542            ends_with_license: false,
543            is_deprecated: false,
544            spdx_license_key: None,
545            other_spdx_license_keys: vec![],
546            required_phrase_spans: vec![],
547            stopwords_by_pos: std::collections::HashMap::new(),
548        };
549
550        let rule2 = Rule {
551            identifier: "test2".to_string(),
552            license_expression: "apache".to_string(),
553            text: String::new(),
554            tokens: vec![],
555            rule_kind: crate::license_detection::models::RuleKind::Text,
556            is_false_positive: false,
557            is_required_phrase: false,
558            is_from_license: false,
559            relevance: 100,
560            minimum_coverage: None,
561            has_stored_minimum_coverage: false,
562            is_continuous: true,
563            referenced_filenames: None,
564            ignorable_urls: None,
565            ignorable_emails: None,
566            ignorable_copyrights: None,
567            ignorable_holders: None,
568            ignorable_authors: None,
569            language: None,
570            notes: None,
571            length_unique: 0,
572            high_length_unique: 0,
573            high_length: 0,
574            min_matched_length: 0,
575            min_high_matched_length: 0,
576            min_matched_length_unique: 0,
577            min_high_matched_length_unique: 0,
578            is_small: false,
579            is_tiny: false,
580            starts_with_license: false,
581            ends_with_license: false,
582            is_deprecated: false,
583            spdx_license_key: None,
584            other_spdx_license_keys: vec![],
585            required_phrase_spans: vec![],
586            stopwords_by_pos: std::collections::HashMap::new(),
587        };
588
589        let candidate1 = Candidate {
590            score_vec_rounded: ScoresVector {
591                is_highly_resemblant: true,
592                containment: 0.9,
593                resemblance: 0.8,
594                matched_length: 10.0,
595                rid: 0,
596            },
597            score_vec_full: ScoresVector {
598                is_highly_resemblant: true,
599                containment: 0.9,
600                resemblance: 0.8,
601                matched_length: 10.0,
602                rid: 0,
603            },
604            rid: 0,
605            rule: &rule1,
606            high_set_intersection: HashSet::new(),
607        };
608
609        let candidate2 = Candidate {
610            score_vec_rounded: ScoresVector {
611                is_highly_resemblant: false,
612                containment: 0.5,
613                resemblance: 0.3,
614                matched_length: 5.0,
615                rid: 1,
616            },
617            score_vec_full: ScoresVector {
618                is_highly_resemblant: false,
619                containment: 0.5,
620                resemblance: 0.3,
621                matched_length: 5.0,
622                rid: 1,
623            },
624            rid: 1,
625            rule: &rule2,
626            high_set_intersection: HashSet::new(),
627        };
628
629        assert!(
630            candidate1 > candidate2,
631            "Higher containment candidate should rank higher"
632        );
633    }
634
635    #[test]
636    fn test_filter_dupes_matched_length_precision() {
637        let rule1 = Rule {
638            identifier: "x11-dec1.RULE".to_string(),
639            license_expression: "x11-dec1".to_string(),
640            text: String::new(),
641            tokens: vec![tid(0); 138],
642            rule_kind: crate::license_detection::models::RuleKind::Text,
643            is_false_positive: false,
644            is_required_phrase: false,
645            is_from_license: false,
646            relevance: 100,
647            minimum_coverage: None,
648            has_stored_minimum_coverage: false,
649            is_continuous: true,
650            referenced_filenames: None,
651            ignorable_urls: None,
652            ignorable_emails: None,
653            ignorable_copyrights: None,
654            ignorable_holders: None,
655            ignorable_authors: None,
656            language: None,
657            notes: None,
658            length_unique: 0,
659            high_length_unique: 0,
660            high_length: 0,
661            min_matched_length: 0,
662            min_high_matched_length: 0,
663            min_matched_length_unique: 0,
664            min_high_matched_length_unique: 0,
665            is_small: false,
666            is_tiny: false,
667            starts_with_license: false,
668            ends_with_license: false,
669            is_deprecated: false,
670            spdx_license_key: None,
671            other_spdx_license_keys: vec![],
672            required_phrase_spans: vec![],
673            stopwords_by_pos: std::collections::HashMap::new(),
674        };
675
676        let rule2 = Rule {
677            identifier: "cmu-uc.RULE".to_string(),
678            license_expression: "cmu-uc".to_string(),
679            text: String::new(),
680            tokens: vec![tid(0); 133],
681            ..rule1.clone()
682        };
683
684        let candidate1 = Candidate {
685            score_vec_rounded: ScoresVector {
686                is_highly_resemblant: false,
687                containment: 0.5,
688                resemblance: 0.25,
689                matched_length: 7.0,
690                rid: 1,
691            },
692            score_vec_full: ScoresVector {
693                is_highly_resemblant: false,
694                containment: 0.5,
695                resemblance: 0.25,
696                matched_length: 138.0,
697                rid: 1,
698            },
699            rid: 1,
700            rule: &rule1,
701            high_set_intersection: HashSet::new(),
702        };
703
704        let candidate2 = Candidate {
705            score_vec_rounded: ScoresVector {
706                is_highly_resemblant: false,
707                containment: 0.5,
708                resemblance: 0.25,
709                matched_length: 7.0,
710                rid: 2,
711            },
712            score_vec_full: ScoresVector {
713                is_highly_resemblant: false,
714                containment: 0.5,
715                resemblance: 0.25,
716                matched_length: 133.0,
717                rid: 2,
718            },
719            rid: 2,
720            rule: &rule2,
721            high_set_intersection: HashSet::new(),
722        };
723
724        let candidates = vec![candidate1, candidate2];
725        let filtered = filter_dupes(candidates);
726
727        assert_eq!(
728            filtered.len(),
729            2,
730            "Should keep both candidates when matched_length differs at 1-decimal precision: 138/20=6.9 vs 133/20=6.7"
731        );
732    }
733
734    #[test]
735    fn test_filter_dupes_same_group() {
736        let rule1 = Rule {
737            identifier: "mit.RULE".to_string(),
738            license_expression: "mit".to_string(),
739            text: String::new(),
740            tokens: vec![tid(0); 100],
741            rule_kind: crate::license_detection::models::RuleKind::Text,
742            is_false_positive: false,
743            is_required_phrase: false,
744            is_from_license: false,
745            relevance: 100,
746            minimum_coverage: None,
747            has_stored_minimum_coverage: false,
748            is_continuous: true,
749            referenced_filenames: None,
750            ignorable_urls: None,
751            ignorable_emails: None,
752            ignorable_copyrights: None,
753            ignorable_holders: None,
754            ignorable_authors: None,
755            language: None,
756            notes: None,
757            length_unique: 0,
758            high_length_unique: 0,
759            high_length: 0,
760            min_matched_length: 0,
761            min_high_matched_length: 0,
762            min_matched_length_unique: 0,
763            min_high_matched_length_unique: 0,
764            is_small: false,
765            is_tiny: false,
766            starts_with_license: false,
767            ends_with_license: false,
768            is_deprecated: false,
769            spdx_license_key: None,
770            other_spdx_license_keys: vec![],
771            required_phrase_spans: vec![],
772            stopwords_by_pos: std::collections::HashMap::new(),
773        };
774
775        let rule2 = Rule {
776            identifier: "mit_2.RULE".to_string(),
777            license_expression: "mit".to_string(),
778            text: String::new(),
779            tokens: vec![tid(0); 100],
780            ..rule1.clone()
781        };
782
783        let candidate1 = Candidate {
784            score_vec_rounded: ScoresVector {
785                is_highly_resemblant: false,
786                containment: 0.5,
787                resemblance: 0.25,
788                matched_length: 5.0,
789                rid: 1,
790            },
791            score_vec_full: ScoresVector {
792                is_highly_resemblant: false,
793                containment: 0.5,
794                resemblance: 0.25,
795                matched_length: 100.0,
796                rid: 1,
797            },
798            rid: 1,
799            rule: &rule1,
800            high_set_intersection: HashSet::new(),
801        };
802
803        let candidate2 = Candidate {
804            score_vec_rounded: ScoresVector {
805                is_highly_resemblant: false,
806                containment: 0.5,
807                resemblance: 0.25,
808                matched_length: 5.0,
809                rid: 2,
810            },
811            score_vec_full: ScoresVector {
812                is_highly_resemblant: false,
813                containment: 0.5,
814                resemblance: 0.25,
815                matched_length: 100.0,
816                rid: 2,
817            },
818            rid: 2,
819            rule: &rule2,
820            high_set_intersection: HashSet::new(),
821        };
822
823        let candidates = vec![candidate1, candidate2];
824        let filtered = filter_dupes(candidates);
825
826        assert_eq!(
827            filtered.len(),
828            1,
829            "Should keep only one candidate when all group keys match"
830        );
831    }
832
833    #[test]
834    fn test_filter_dupes_prefers_higher_identifier_when_full_scores_tie() {
835        let rule_sa = Rule {
836            identifier: "cc-by-sa-1.0.RULE".to_string(),
837            license_expression: "cc-by-sa-1.0".to_string(),
838            text: String::new(),
839            tokens: vec![tid(0); 1960],
840            rule_kind: crate::license_detection::models::RuleKind::Text,
841            is_false_positive: false,
842            is_required_phrase: false,
843            is_from_license: false,
844            relevance: 100,
845            minimum_coverage: None,
846            has_stored_minimum_coverage: false,
847            is_continuous: true,
848            referenced_filenames: None,
849            ignorable_urls: None,
850            ignorable_emails: None,
851            ignorable_copyrights: None,
852            ignorable_holders: None,
853            ignorable_authors: None,
854            language: None,
855            notes: None,
856            length_unique: 0,
857            high_length_unique: 0,
858            high_length: 0,
859            min_matched_length: 0,
860            min_high_matched_length: 0,
861            min_matched_length_unique: 0,
862            min_high_matched_length_unique: 0,
863            is_small: false,
864            is_tiny: false,
865            starts_with_license: false,
866            ends_with_license: false,
867            is_deprecated: false,
868            spdx_license_key: None,
869            other_spdx_license_keys: vec![],
870            required_phrase_spans: vec![],
871            stopwords_by_pos: std::collections::HashMap::new(),
872        };
873
874        let rule_nc_sa = Rule {
875            identifier: "cc-by-nc-sa-1.0.RULE".to_string(),
876            license_expression: "cc-by-nc-sa-1.0".to_string(),
877            text: String::new(),
878            tokens: vec![tid(0); 1829],
879            rule_kind: crate::license_detection::models::RuleKind::Text,
880            is_false_positive: false,
881            is_required_phrase: false,
882            is_from_license: false,
883            relevance: 100,
884            minimum_coverage: None,
885            has_stored_minimum_coverage: false,
886            is_continuous: true,
887            referenced_filenames: None,
888            ignorable_urls: None,
889            ignorable_emails: None,
890            ignorable_copyrights: None,
891            ignorable_holders: None,
892            ignorable_authors: None,
893            language: None,
894            notes: None,
895            length_unique: 0,
896            high_length_unique: 0,
897            high_length: 0,
898            min_matched_length: 0,
899            min_high_matched_length: 0,
900            min_matched_length_unique: 0,
901            min_high_matched_length_unique: 0,
902            is_small: false,
903            is_tiny: false,
904            starts_with_license: false,
905            ends_with_license: false,
906            is_deprecated: false,
907            spdx_license_key: None,
908            other_spdx_license_keys: vec![],
909            required_phrase_spans: vec![],
910            stopwords_by_pos: std::collections::HashMap::new(),
911        };
912
913        let candidate_sa = Candidate {
914            score_vec_rounded: ScoresVector {
915                is_highly_resemblant: true,
916                containment: 0.9,
917                resemblance: 0.8,
918                matched_length: 100.0,
919                rid: 1,
920            },
921            score_vec_full: ScoresVector {
922                is_highly_resemblant: true,
923                containment: 0.9,
924                resemblance: 0.8,
925                matched_length: 100.0,
926                rid: 1,
927            },
928            rid: 1,
929            rule: &rule_sa,
930            high_set_intersection: HashSet::new(),
931        };
932
933        let candidate_nc_sa = Candidate {
934            score_vec_rounded: ScoresVector {
935                is_highly_resemblant: true,
936                containment: 0.9,
937                resemblance: 0.8,
938                matched_length: 100.0,
939                rid: 2,
940            },
941            score_vec_full: ScoresVector {
942                is_highly_resemblant: true,
943                containment: 0.9,
944                resemblance: 0.8,
945                matched_length: 100.0,
946                rid: 2,
947            },
948            rid: 2,
949            rule: &rule_nc_sa,
950            high_set_intersection: HashSet::new(),
951        };
952
953        let candidates = vec![candidate_nc_sa, candidate_sa];
954        let filtered = filter_dupes(candidates);
955
956        assert_eq!(
957            filtered.len(),
958            2,
959            "Different license expressions should create different groups"
960        );
961
962        let mut rule_same1 = Rule {
963            license_expression: "same".to_string(),
964            tokens: vec![tid(0); 100],
965            ..rule_sa.clone()
966        };
967        let mut rule_same2 = Rule {
968            license_expression: "same".to_string(),
969            tokens: vec![tid(0); 100],
970            ..rule_nc_sa.clone()
971        };
972
973        let same_group_candidates = vec![
974            Candidate {
975                score_vec_rounded: filtered[0].score_vec_rounded.clone(),
976                score_vec_full: filtered[0].score_vec_full.clone(),
977                rid: filtered[0].rid,
978                rule: &mut rule_same1,
979                high_set_intersection: HashSet::new(),
980            },
981            Candidate {
982                score_vec_rounded: filtered[1].score_vec_rounded.clone(),
983                score_vec_full: filtered[1].score_vec_full.clone(),
984                rid: filtered[1].rid,
985                rule: &mut rule_same2,
986                high_set_intersection: HashSet::new(),
987            },
988        ];
989
990        let deduped = filter_dupes(same_group_candidates);
991        assert_eq!(deduped.len(), 1);
992        assert_eq!(deduped[0].rule.identifier, "cc-by-sa-1.0.RULE");
993    }
994
995    #[test]
996    fn test_candidate_ordering_uses_rid_after_equal_scores() {
997        let rule_a = Rule {
998            identifier: "a.RULE".to_string(),
999            license_expression: "a".to_string(),
1000            text: String::new(),
1001            tokens: vec![tid(0); 10],
1002            rule_kind: crate::license_detection::models::RuleKind::Text,
1003            is_false_positive: false,
1004            is_required_phrase: false,
1005            is_from_license: false,
1006            relevance: 100,
1007            minimum_coverage: None,
1008            has_stored_minimum_coverage: false,
1009            is_continuous: true,
1010            referenced_filenames: None,
1011            ignorable_urls: None,
1012            ignorable_emails: None,
1013            ignorable_copyrights: None,
1014            ignorable_holders: None,
1015            ignorable_authors: None,
1016            language: None,
1017            notes: None,
1018            length_unique: 0,
1019            high_length_unique: 0,
1020            high_length: 0,
1021            min_matched_length: 0,
1022            min_high_matched_length: 0,
1023            min_matched_length_unique: 0,
1024            min_high_matched_length_unique: 0,
1025            is_small: false,
1026            is_tiny: false,
1027            starts_with_license: false,
1028            ends_with_license: false,
1029            is_deprecated: false,
1030            spdx_license_key: None,
1031            other_spdx_license_keys: vec![],
1032            required_phrase_spans: vec![],
1033            stopwords_by_pos: std::collections::HashMap::new(),
1034        };
1035
1036        let rule_z = Rule {
1037            identifier: "z.RULE".to_string(),
1038            ..rule_a.clone()
1039        };
1040
1041        let candidate_low_rid = Candidate {
1042            score_vec_rounded: ScoresVector {
1043                is_highly_resemblant: true,
1044                containment: 0.9,
1045                resemblance: 0.8,
1046                matched_length: 10.0,
1047                rid: 1,
1048            },
1049            score_vec_full: ScoresVector {
1050                is_highly_resemblant: true,
1051                containment: 0.9,
1052                resemblance: 0.8,
1053                matched_length: 10.0,
1054                rid: 1,
1055            },
1056            rid: 1,
1057            rule: &rule_z,
1058            high_set_intersection: HashSet::new(),
1059        };
1060
1061        let candidate_high_rid = Candidate {
1062            score_vec_rounded: ScoresVector {
1063                rid: 2,
1064                ..candidate_low_rid.score_vec_rounded.clone()
1065            },
1066            score_vec_full: ScoresVector {
1067                rid: 2,
1068                ..candidate_low_rid.score_vec_full.clone()
1069            },
1070            rid: 2,
1071            rule: &rule_a,
1072            high_set_intersection: HashSet::new(),
1073        };
1074
1075        let mut sorted = [candidate_low_rid, candidate_high_rid];
1076        sorted.sort_by(|a, b| b.cmp(a));
1077        assert_eq!(
1078            sorted[0].rid, 2,
1079            "Python final candidate tuple ordering falls back to higher rid after equal scores"
1080        );
1081    }
1082}
provenant/license_detection/seq_match/candidates.rs

provenant/license_detection/seq_match/
candidates.rs