use crate::license_detection::index::LicenseIndex;
use crate::license_detection::index::dictionary::TokenId;
use crate::license_detection::index::token_sets::{
build_set_and_mset, high_multiset_subset, tids_set_counter,
};
use crate::license_detection::models::Rule;
use crate::license_detection::query::QueryRun;
use std::collections::{HashMap, HashSet};
use super::HIGH_RESEMBLANCE_THRESHOLD;
/// Scores describing how well one rule matches a query.
///
/// Ordering is lexicographic over `is_highly_resemblant`, `containment`,
/// `resemblance` and `matched_length`, in that order. `rid` is not part of the
/// ordering, although it is part of the derived `PartialEq`.
#[derive(Debug, Clone, PartialEq)]
pub struct ScoresVector {
    pub is_highly_resemblant: bool,
    pub containment: f32,
    pub resemblance: f32,
    pub matched_length: f32,
    pub rid: usize,
}

impl PartialOrd for ScoresVector {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Eq for ScoresVector {}

impl Ord for ScoresVector {
    /// Compares the flag first, then each float score in turn; `rid` is ignored.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.is_highly_resemblant
            .cmp(&other.is_highly_resemblant)
            .then_with(|| cmp_score_component(self.containment, other.containment))
            .then_with(|| cmp_score_component(self.resemblance, other.resemblance))
            .then_with(|| cmp_score_component(self.matched_length, other.matched_length))
    }
}

/// Compares two float scores so that the result is always a total order.
///
/// Ordinary values use `partial_cmp` (so `-0.0` and `0.0` still tie). Only
/// when that has no answer, i.e. a NaN is involved, the IEEE 754 total order
/// decides instead of pretending the values are equal, which would make the
/// ordering non-transitive and unsafe to sort with.
fn cmp_score_component(lhs: f32, rhs: f32) -> std::cmp::Ordering {
    lhs.partial_cmp(&rhs)
        .unwrap_or_else(|| lhs.total_cmp(&rhs))
}
/// A rule retained as a match candidate, with the scores used to rank it.
///
/// Candidates order by their rounded scores, then their full-precision
/// scores, then `rid`; `rule` and `high_set_intersection` play no part in the
/// ordering.
#[derive(Debug, Clone, PartialEq)]
pub struct Candidate<'a> {
    pub score_vec_rounded: ScoresVector,
    pub score_vec_full: ScoresVector,
    pub rid: usize,
    pub rule: &'a Rule,
    pub high_set_intersection: HashSet<TokenId>,
}

impl Eq for Candidate<'_> {}

impl Ord for Candidate<'_> {
    /// Delegates to `compare_candidate_rank` so candidates and the raw score
    /// tuples used during selection are ranked by the same rule.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let Self {
            score_vec_rounded: rounded,
            score_vec_full: full,
            rid,
            ..
        } = self;
        let Self {
            score_vec_rounded: other_rounded,
            score_vec_full: other_full,
            rid: other_rid,
            ..
        } = other;
        compare_candidate_rank(rounded, full, *rid, other_rounded, other_full, *other_rid)
    }
}

impl PartialOrd for Candidate<'_> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(Ord::cmp(self, other))
    }
}
/// Ranks one `(rounded, full, rid)` triple against another.
///
/// The comparison is lexicographic: rounded scores decide first, the
/// full-precision scores break ties, and `rid` breaks any remaining tie.
fn compare_candidate_rank(
    rounded: &ScoresVector,
    full: &ScoresVector,
    rid: usize,
    other_rounded: &ScoresVector,
    other_full: &ScoresVector,
    other_rid: usize,
) -> std::cmp::Ordering {
    // Tuples already compare element by element, left to right.
    let ours = (rounded, full, rid);
    let theirs = (other_rounded, other_full, other_rid);
    Ord::cmp(&ours, &theirs)
}
/// Rounds `value` to one decimal place, intended to mirror Python's
/// `round(value, 1)` (ties go to the even tenth).
///
/// The decision is made on the decimal text of `value` rendered with 20
/// fractional digits: the first fractional digit is the tenths digit and the
/// remaining digits decide whether to round up. Rounding works on the
/// magnitude and the sign is re-applied afterwards, so negative inputs round
/// symmetrically to positive ones. Inputs that do not render as a decimal
/// number (NaN, infinities) yield `0.0`.
fn python_round_tenths(value: f64) -> f32 {
    let rendered = format!("{value:.20}");
    let (is_negative, magnitude) = match rendered.strip_prefix('-') {
        Some(unsigned) => (true, unsigned),
        None => (false, rendered.as_str()),
    };
    let (whole, frac) = magnitude.split_once('.').unwrap_or((magnitude, "0"));
    let whole_part: i64 = whole.parse().unwrap_or(0);

    let mut frac_digits = frac.chars();
    let tenths = frac_digits
        .next()
        .and_then(|c| c.to_digit(10))
        .map_or(0, i64::from);

    // Whatever follows the tenths digit is compared against "5000…":
    // above it rounds up, exactly on it is a tie resolved towards the even
    // tenth, below it rounds down.
    let mut tail = frac_digits;
    let should_round_up = match tail.next() {
        Some(first) if first > '5' => true,
        Some('5') => tail.any(|digit| digit != '0') || tenths % 2 == 1,
        _ => false,
    };

    let scaled = whole_part * 10 + tenths + i64::from(should_round_up);
    let signed = if is_negative { -scaled } else { scaled };
    signed as f32 / 10.0
}
/// Converts a score into an integer count of tenths (for example `6.9` → `69`).
///
/// The value is formatted with one fractional digit and the decimal point is
/// removed; text that does not parse as an `i32` afterwards yields `0`.
fn quantize_tenths(value: f32) -> i32 {
    let rendered = format!("{value:.1}");
    let digits = rendered.replace('.', "");
    digits.parse::<i32>().unwrap_or(0)
}
/// Builds the `(rounded, full)` score vectors for one query/rule comparison.
///
/// Both vectors store the squared resemblance. The rounded vector rounds every
/// score to one decimal (and scales `matched_length` down by 20 first); the
/// full vector keeps the values as given. The high-resemblance flag is derived
/// from the unsquared `resemblance` in both cases.
fn build_score_vectors(
    resemblance: f64,
    containment: f64,
    matched_length: usize,
    rid: usize,
) -> (ScoresVector, ScoresVector) {
    let squared_resemblance = resemblance * resemblance;

    let rounded_is_high = python_round_tenths(resemblance) >= HIGH_RESEMBLANCE_THRESHOLD;
    // NOTE(review): the threshold is widened from f32 to f64 here. If its
    // value is not exactly representable in f32, the widened value differs
    // slightly from the decimal literal — confirm the boundary case behaves
    // as intended.
    let full_is_high = resemblance >= f64::from(HIGH_RESEMBLANCE_THRESHOLD);

    let rounded = ScoresVector {
        is_highly_resemblant: rounded_is_high,
        containment: python_round_tenths(containment),
        resemblance: python_round_tenths(squared_resemblance),
        matched_length: python_round_tenths(matched_length as f64 / 20.0),
        rid,
    };
    let full = ScoresVector {
        is_highly_resemblant: full_is_high,
        containment: containment as f32,
        resemblance: squared_resemblance as f32,
        matched_length: matched_length as f32,
        rid,
    };

    (rounded, full)
}
/// Grouping key for `filter_dupes`: candidates that agree on every field are
/// treated as duplicates of one another.
///
/// The license expression is borrowed from the candidate's rule so building a
/// key does not allocate.
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
struct DupeGroupKey<'r> {
    license_expression: &'r str,
    is_highly_resemblant: bool,
    containment: i32,
    resemblance: i32,
    matched_length: i32,
    rule_length: usize,
}

/// Keeps a single candidate out of every group of duplicates.
///
/// Candidates are grouped by license expression, the rounded scores (in
/// tenths) and the rule's token count. Within a group the survivor is the one
/// with the highest full-precision scores, ties going to the greatest rule
/// identifier and then to the candidate that came first in `candidates`.
///
/// The order of the returned candidates is unspecified (it follows hash-map
/// iteration order); callers that need a ranking must sort the result.
pub(super) fn filter_dupes(candidates: Vec<Candidate<'_>>) -> Vec<Candidate<'_>> {
    let mut groups: HashMap<DupeGroupKey<'_>, Vec<Candidate<'_>>> = HashMap::new();
    for candidate in candidates {
        // `rule` is a copy of the shared reference, so the key borrows from the
        // rule itself and `candidate` can still be moved into its group.
        let rule = candidate.rule;
        let rounded = &candidate.score_vec_rounded;
        let key = DupeGroupKey {
            license_expression: rule.license_expression.as_str(),
            is_highly_resemblant: rounded.is_highly_resemblant,
            containment: quantize_tenths(rounded.containment),
            resemblance: quantize_tenths(rounded.resemblance),
            matched_length: quantize_tenths(rounded.matched_length),
            rule_length: rule.tokens.len(),
        };
        groups.entry(key).or_default().push(candidate);
    }

    groups
        .into_values()
        .filter_map(|group| {
            // The comparator is reversed, so the minimum is the best candidate.
            // `min_by` returns the first of several equal minima, which is the
            // element a stable descending sort would have placed first.
            group.into_iter().min_by(|a, b| {
                b.score_vec_full
                    .cmp(&a.score_vec_full)
                    .then_with(|| b.rule.identifier.cmp(&a.rule.identifier))
            })
        })
        .collect()
}
/// Intersects two token multisets.
///
/// The result holds every token id present in both maps, paired with the
/// smaller of its two counts.
pub fn multisets_intersector(
    qmset: &HashMap<TokenId, usize>,
    imset: &HashMap<TokenId, usize>,
) -> HashMap<TokenId, usize> {
    // Walk the map with fewer entries and probe the other one.
    let (smaller, larger) = if qmset.len() < imset.len() {
        (qmset, imset)
    } else {
        (imset, qmset)
    };

    let mut shared = HashMap::new();
    for (tid, &count) in smaller {
        if let Some(&other_count) = larger.get(tid) {
            shared.insert(*tid, count.min(other_count));
        }
    }
    shared
}
/// Selects and ranks the rules that best overlap the tokens of `query_run`.
///
/// Selection runs in two passes:
///
/// 1. Every rule sharing at least one high token with the query is scored on
///    unique token sets. The best `top_n * 10` rules are kept.
/// 2. The survivors are re-scored on token multisets (counts included),
///    de-duplicated with `filter_dupes`, sorted best first and cut to `top_n`.
///
/// In both passes a rule is dropped when it misses its minimum matched
/// lengths, its `minimum_coverage`, or — when `high_resemblance` is set —
/// when either of its score vectors is not flagged as highly resemblant.
///
/// Returns an empty vector when the query has no usable tokens or no rule
/// qualifies.
pub fn compute_candidates_with_msets<'a>(
    index: &'a LicenseIndex,
    query_run: &QueryRun,
    high_resemblance: bool,
    top_n: usize,
) -> Vec<Candidate<'a>> {
    let query_tokens = query_run.matchable_tokens();
    if query_tokens.is_empty() {
        return Vec::new();
    }

    // Negative ids are skipped.
    // NOTE(review): `tid as u16` truncates ids above `u16::MAX`; presumably
    // the dictionary never issues such ids — confirm.
    let query_token_ids: Vec<TokenId> = query_tokens
        .iter()
        .filter(|&&tid| tid >= 0)
        .map(|&tid| TokenId::new(tid as u16))
        .collect();
    if query_token_ids.is_empty() {
        return Vec::new();
    }

    let (query_set, query_mset) = build_set_and_mset(&query_token_ids);

    // High tokens are the ones whose id is below `len_legalese`.
    let query_high_set: HashSet<TokenId> = query_set
        .iter()
        .filter(|tid| tid.as_usize() < index.len_legalese)
        .copied()
        .collect();
    if query_high_set.is_empty() {
        return Vec::new();
    }

    // Only rules sharing at least one high token with the query are scored.
    let candidate_rids: HashSet<usize> = query_high_set
        .iter()
        .filter_map(|tid| index.rids_by_high_tid.get(tid))
        .flat_map(|rids| rids.iter().copied())
        .collect();
    if candidate_rids.is_empty() {
        return Vec::new();
    }

    // Pass 1: score on unique token sets.
    let query_set_len = query_set.len();
    let mut step1_candidates: Vec<(
        ScoresVector,
        ScoresVector,
        usize,
        &'a Rule,
        HashSet<TokenId>,
    )> = Vec::new();
    for rid in candidate_rids {
        let Some(rule) = index.rules_by_rid.get(rid) else {
            continue;
        };
        let Some(rule_set) = index.sets_by_rid.get(&rid) else {
            continue;
        };
        let Some(rule_high_set) = index.high_sets_by_rid.get(&rid) else {
            continue;
        };

        // Built once: its size drives the filter and the set itself is stored
        // on the candidate.
        let high_set_intersection: HashSet<TokenId> = query_high_set
            .intersection(rule_high_set)
            .copied()
            .collect();
        if high_set_intersection.is_empty()
            || high_set_intersection.len() < rule.min_high_matched_length_unique
        {
            continue;
        }

        let intersection: HashSet<TokenId> = query_set.intersection(rule_set).copied().collect();
        if intersection.is_empty() {
            continue;
        }
        let matched_length = tids_set_counter(&intersection);
        if matched_length < rule.min_matched_length_unique {
            continue;
        }

        let Some((svr, svf)) = score_token_overlap(
            rule,
            rid,
            matched_length,
            query_set_len,
            rule.length_unique,
            high_resemblance,
        ) else {
            continue;
        };
        step1_candidates.push((svr, svf, rid, rule, high_set_intersection));
    }
    if step1_candidates.is_empty() {
        return Vec::new();
    }

    // Best first; keep a generous shortlist for the costlier second pass.
    step1_candidates.sort_by(|a, b| compare_candidate_rank(&b.0, &b.1, b.2, &a.0, &a.1, a.2));
    step1_candidates.truncate(top_n.saturating_mul(10));

    // Pass 2: re-score on multisets. The query-side values do not depend on
    // the rule, so they are computed once up front.
    let query_high_mset = high_multiset_subset(&query_mset, &index.dictionary);
    let query_mset_len: usize = query_mset.values().sum();

    let mut sortable_candidates: Vec<Candidate<'a>> = Vec::new();
    for (_svr, _svf, rid, rule, high_set_intersection) in step1_candidates {
        let Some(rule_mset) = index.msets_by_rid.get(&rid) else {
            continue;
        };

        let rule_high_mset = high_multiset_subset(rule_mset, &index.dictionary);
        let high_intersection_mset = multisets_intersector(&query_high_mset, &rule_high_mset);
        if high_intersection_mset.is_empty() {
            continue;
        }
        let high_matched_length: usize = high_intersection_mset.values().sum();
        if high_matched_length < rule.min_high_matched_length {
            continue;
        }

        let full_intersection_mset = multisets_intersector(&query_mset, rule_mset);
        let matched_length: usize = full_intersection_mset.values().sum();
        if matched_length < rule.min_matched_length {
            continue;
        }

        let rule_mset_len: usize = rule_mset.values().sum();
        let Some((score_vec_rounded, score_vec_full)) = score_token_overlap(
            rule,
            rid,
            matched_length,
            query_mset_len,
            rule_mset_len,
            high_resemblance,
        ) else {
            continue;
        };
        sortable_candidates.push(Candidate {
            score_vec_rounded,
            score_vec_full,
            rid,
            rule,
            high_set_intersection,
        });
    }

    sortable_candidates = filter_dupes(sortable_candidates);
    sortable_candidates.sort_by(|a, b| b.cmp(a));
    sortable_candidates.truncate(top_n);
    sortable_candidates
}

/// Scores one query/rule overlap, or returns `None` when it must be rejected.
///
/// `query_len` and `rule_len` are the sizes of the two token collections and
/// `matched_length` the size of their intersection. The overlap is rejected
/// when either collection is empty, when the containment is below the rule's
/// `minimum_coverage` (a percentage), or when `high_resemblance` is requested
/// and either score vector is not flagged as highly resemblant.
fn score_token_overlap(
    rule: &Rule,
    rid: usize,
    matched_length: usize,
    query_len: usize,
    rule_len: usize,
    high_resemblance: bool,
) -> Option<(ScoresVector, ScoresVector)> {
    if query_len == 0 || rule_len == 0 {
        return None;
    }

    let union_len = query_len + rule_len - matched_length;
    let resemblance = matched_length as f64 / union_len as f64;
    let containment = matched_length as f64 / rule_len as f64;

    let minimum_containment = rule.minimum_coverage.map(|mc| mc as f64 / 100.0);
    if minimum_containment.is_some_and(|min_cont| containment < min_cont) {
        return None;
    }

    let (rounded, full) = build_score_vectors(resemblance, containment, matched_length, rid);
    if high_resemblance && !(rounded.is_highly_resemblant && full.is_highly_resemblant) {
        return None;
    }
    Some((rounded, full))
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::license_detection::index::dictionary::tid;

    /// Builds a `Rule` fixture in which only the identifier, the license
    /// expression and the number of tokens vary; every other field keeps a
    /// fixed neutral value.
    fn make_rule(identifier: &str, license_expression: &str, token_count: usize) -> Rule {
        Rule {
            identifier: identifier.to_string(),
            license_expression: license_expression.to_string(),
            text: String::new(),
            tokens: vec![tid(0); token_count],
            rule_kind: crate::license_detection::models::RuleKind::Text,
            is_false_positive: false,
            is_required_phrase: false,
            is_from_license: false,
            relevance: 100,
            minimum_coverage: None,
            has_stored_minimum_coverage: false,
            is_continuous: true,
            referenced_filenames: None,
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: None,
            length_unique: 0,
            high_length_unique: 0,
            high_length: 0,
            min_matched_length: 0,
            min_high_matched_length: 0,
            min_matched_length_unique: 0,
            min_high_matched_length_unique: 0,
            is_small: false,
            is_tiny: false,
            starts_with_license: false,
            ends_with_license: false,
            is_deprecated: false,
            spdx_license_key: None,
            other_spdx_license_keys: vec![],
            required_phrase_spans: vec![],
            stopwords_by_pos: std::collections::HashMap::new(),
        }
    }

    /// Shorthand for a `ScoresVector` literal.
    fn scores(
        is_highly_resemblant: bool,
        containment: f32,
        resemblance: f32,
        matched_length: f32,
        rid: usize,
    ) -> ScoresVector {
        ScoresVector {
            is_highly_resemblant,
            containment,
            resemblance,
            matched_length,
            rid,
        }
    }

    /// Shorthand for a `Candidate` with an empty high-token intersection.
    fn make_candidate<'a>(
        rule: &'a Rule,
        rid: usize,
        score_vec_rounded: ScoresVector,
        score_vec_full: ScoresVector,
    ) -> Candidate<'a> {
        Candidate {
            score_vec_rounded,
            score_vec_full,
            rid,
            rule,
            high_set_intersection: HashSet::new(),
        }
    }

    #[test]
    fn test_scores_vector_comparison() {
        let sv1 = scores(true, 0.9, 0.8, 10.0, 0);
        let sv2 = scores(false, 0.8, 0.6, 5.0, 1);
        assert!(sv1 > sv2);
    }

    #[test]
    fn test_python_round_tenths_matches_python_half_even_behavior() {
        assert_eq!(python_round_tenths(0.05), 0.1);
        assert_eq!(python_round_tenths(0.15), 0.1);
        assert_eq!(python_round_tenths(0.25), 0.2);
        assert_eq!(python_round_tenths(2.25), 2.2);
        assert_eq!(python_round_tenths(4.35), 4.3);
        assert_eq!(python_round_tenths(6.65), 6.7);
    }

    #[test]
    fn test_candidate_ordering() {
        let rule1 = make_rule("test1", "mit", 0);
        let rule2 = make_rule("test2", "apache", 0);
        let candidate1 = make_candidate(
            &rule1,
            0,
            scores(true, 0.9, 0.8, 10.0, 0),
            scores(true, 0.9, 0.8, 10.0, 0),
        );
        let candidate2 = make_candidate(
            &rule2,
            1,
            scores(false, 0.5, 0.3, 5.0, 1),
            scores(false, 0.5, 0.3, 5.0, 1),
        );
        assert!(
            candidate1 > candidate2,
            "Higher containment candidate should rank higher"
        );
    }

    #[test]
    fn test_filter_dupes_matched_length_precision() {
        let rule1 = make_rule("x11-dec1.RULE", "x11-dec1", 138);
        let rule2 = make_rule("cmu-uc.RULE", "cmu-uc", 133);
        // Rounded matched lengths are the one-decimal values of 138/20 and
        // 133/20, as produced by `build_score_vectors`.
        let candidate1 = make_candidate(
            &rule1,
            1,
            scores(false, 0.5, 0.25, 6.9, 1),
            scores(false, 0.5, 0.25, 138.0, 1),
        );
        let candidate2 = make_candidate(
            &rule2,
            2,
            scores(false, 0.5, 0.25, 6.7, 2),
            scores(false, 0.5, 0.25, 133.0, 2),
        );
        let filtered = filter_dupes(vec![candidate1, candidate2]);
        assert_eq!(
            filtered.len(),
            2,
            "Should keep both candidates when matched_length differs at 1-decimal precision: 138/20=6.9 vs 133/20=6.7"
        );
    }

    #[test]
    fn test_filter_dupes_splits_groups_on_rounded_matched_length_alone() {
        // Same license expression and rule length: the rounded matched length
        // is the only key component that differs.
        let rule1 = make_rule("same_1.RULE", "same", 138);
        let rule2 = make_rule("same_2.RULE", "same", 138);
        let candidate1 = make_candidate(
            &rule1,
            1,
            scores(false, 0.5, 0.25, 6.9, 1),
            scores(false, 0.5, 0.25, 138.0, 1),
        );
        let candidate2 = make_candidate(
            &rule2,
            2,
            scores(false, 0.5, 0.25, 6.7, 2),
            scores(false, 0.5, 0.25, 133.0, 2),
        );
        let filtered = filter_dupes(vec![candidate1, candidate2]);
        assert_eq!(
            filtered.len(),
            2,
            "Rounded matched lengths 6.9 and 6.7 must not fall into the same group"
        );
    }

    #[test]
    fn test_filter_dupes_same_group() {
        let rule1 = make_rule("mit.RULE", "mit", 100);
        let rule2 = make_rule("mit_2.RULE", "mit", 100);
        let candidate1 = make_candidate(
            &rule1,
            1,
            scores(false, 0.5, 0.25, 5.0, 1),
            scores(false, 0.5, 0.25, 100.0, 1),
        );
        let candidate2 = make_candidate(
            &rule2,
            2,
            scores(false, 0.5, 0.25, 5.0, 2),
            scores(false, 0.5, 0.25, 100.0, 2),
        );
        let filtered = filter_dupes(vec![candidate1, candidate2]);
        assert_eq!(
            filtered.len(),
            1,
            "Should keep only one candidate when all group keys match"
        );
    }

    #[test]
    fn test_filter_dupes_prefers_higher_identifier_when_full_scores_tie() {
        let rule_sa = make_rule("cc-by-sa-1.0.RULE", "cc-by-sa-1.0", 1960);
        let rule_nc_sa = make_rule("cc-by-nc-sa-1.0.RULE", "cc-by-nc-sa-1.0", 1829);
        let top = |rid: usize| scores(true, 0.9, 0.8, 100.0, rid);

        let candidate_sa = make_candidate(&rule_sa, 1, top(1), top(1));
        let candidate_nc_sa = make_candidate(&rule_nc_sa, 2, top(2), top(2));
        let filtered = filter_dupes(vec![candidate_nc_sa, candidate_sa]);
        assert_eq!(
            filtered.len(),
            2,
            "Different license expressions should create different groups"
        );

        // Same identifiers as above, but now sharing one license expression
        // and one rule length so both candidates land in a single group.
        let rule_same1 = make_rule("cc-by-sa-1.0.RULE", "same", 100);
        let rule_same2 = make_rule("cc-by-nc-sa-1.0.RULE", "same", 100);
        let same_group_candidates = vec![
            make_candidate(
                &rule_same1,
                filtered[0].rid,
                filtered[0].score_vec_rounded.clone(),
                filtered[0].score_vec_full.clone(),
            ),
            make_candidate(
                &rule_same2,
                filtered[1].rid,
                filtered[1].score_vec_rounded.clone(),
                filtered[1].score_vec_full.clone(),
            ),
        ];
        let deduped = filter_dupes(same_group_candidates);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].rule.identifier, "cc-by-sa-1.0.RULE");
    }

    #[test]
    fn test_candidate_ordering_uses_rid_after_equal_scores() {
        let rule_a = make_rule("a.RULE", "a", 10);
        let rule_z = make_rule("z.RULE", "a", 10);
        let tied = |rid: usize| scores(true, 0.9, 0.8, 10.0, rid);

        let candidate_low_rid = make_candidate(&rule_z, 1, tied(1), tied(1));
        let candidate_high_rid = make_candidate(&rule_a, 2, tied(2), tied(2));

        let mut sorted = [candidate_low_rid, candidate_high_rid];
        sorted.sort_by(|a, b| b.cmp(a));
        assert_eq!(
            sorted[0].rid, 2,
            "Python final candidate tuple ordering falls back to higher rid after equal scores"
        );
    }
}