use std::collections::{HashMap, HashSet};
use crate::license_detection::automaton::AutomatonBuilder;
use crate::license_detection::hash_match::compute_hash;
use crate::license_detection::index::LicenseIndex;
use crate::license_detection::index::dictionary::{
KnownToken, TokenDictionary, TokenId, TokenKind,
};
use crate::license_detection::models::{License, LoadedLicense, LoadedRule, Rule, RuleId};
use crate::license_detection::rules::legalese;
use crate::license_detection::rules::thresholds::{
SMALL_RULE, TINY_RULE, compute_thresholds_occurrences, compute_thresholds_unique,
};
use crate::license_detection::spdx_mapping::build_spdx_mapping;
use crate::license_detection::tokenize::{
parse_required_phrase_spans, tokenize, tokenize_with_stopwords,
};
use crate::license_detection::{TokenMultiset, TokenSet};
/// Number of consecutive tokens in each ngram that `build_index` extracts from a
/// rule when collecting patterns for the unknown-license automaton.
const UNKNOWN_NGRAM_LENGTH: usize = 6;
/// Token strings treated as a "license" word when `build_index` computes a rule's
/// `starts_with_license` / `ends_with_license` flags.
const LICENSE_TOKEN_STRINGS: &[&str] = &["license", "licence", "licensed"];
/// `(deprecated SPDX identifier, replacement SPDX expression)` pairs, all lowercase.
///
/// Consumed by `add_deprecated_spdx_aliases`, which registers the deprecated
/// identifier as an alias whenever the replacement string is already a key of
/// the SPDX-key-to-rule map.
const DEPRECATED_SPDX_SUBS: &[(&str, &str)] = &[
("ecos-2.0", "gpl-2.0-or-later with ecos-exception-2.0"),
(
"gpl-2.0-with-autoconf-exception",
"gpl-2.0-only with autoconf-exception-2.0",
),
(
"gpl-2.0-with-bison-exception",
"gpl-2.0-only with bison-exception-2.2",
),
(
"gpl-2.0-with-classpath-exception",
"gpl-2.0-only with classpath-exception-2.0",
),
(
"gpl-2.0-with-font-exception",
"gpl-2.0-only with font-exception-2.0",
),
(
"gpl-2.0-with-gcc-exception",
"gpl-2.0-only with gcc-exception-2.0",
),
(
"gpl-3.0-with-autoconf-exception",
"gpl-3.0-only with autoconf-exception-3.0",
),
(
"gpl-3.0-with-gcc-exception",
"gpl-3.0-only with gcc-exception-3.1",
),
(
"wxwindows",
"lgpl-2.0-or-later with wxwindows-exception-3.1",
),
];
/// Registers deprecated SPDX identifiers as aliases in `rid_by_spdx_key`.
///
/// For every `(deprecated, replacement)` pair in `DEPRECATED_SPDX_SUBS`, if the
/// replacement string is already a key of the map, the deprecated identifier is
/// inserted (or overwritten) pointing at the same rule id. Pairs whose
/// replacement is not present are ignored.
fn add_deprecated_spdx_aliases(rid_by_spdx_key: &mut HashMap<String, RuleId>) {
    for &(old_id, new_expression) in DEPRECATED_SPDX_SUBS {
        let Some(target_rid) = rid_by_spdx_key.get(new_expression).copied() else {
            continue;
        };
        rid_by_spdx_key.insert(old_id.to_owned(), target_rid);
    }
}
/// Stores the SPDX form of each rule's license expression in the rule metadata.
///
/// A mapping is built from all licenses currently in the index. Every rule whose
/// expression converts successfully gets its metadata entry (created on demand,
/// keyed by rule identifier) updated; rules whose conversion fails are left
/// untouched.
fn populate_precomputed_rule_spdx(index: &mut LicenseIndex) {
    let known_licenses: Vec<_> = index.licenses_by_key.values().cloned().collect();
    let spdx_mapping = build_spdx_mapping(&known_licenses);

    for rule in &index.rules_by_rid {
        if let Ok(spdx_expression) =
            spdx_mapping.expression_scancode_to_spdx(&rule.license_expression)
        {
            let metadata = index
                .rule_metadata_by_identifier
                .entry(rule.identifier.clone())
                .or_default();
            metadata.license_expression_spdx = Some(spdx_expression);
        }
    }
}
/// Copies loader-only rule attributes into the index's per-rule metadata.
///
/// For each loaded rule (deprecated ones are skipped unless `with_deprecated`
/// is set) the metadata entry keyed by the rule identifier is created if
/// missing, and its `skip_for_required_phrase_generation` and `replaced_by`
/// fields are overwritten with the loaded values.
fn apply_loaded_rule_metadata(
    index: &mut LicenseIndex,
    loaded_rules: &[LoadedRule],
    with_deprecated: bool,
) {
    let eligible = loaded_rules
        .iter()
        .filter(|candidate| with_deprecated || !candidate.is_deprecated);

    for source in eligible {
        let entry = index
            .rule_metadata_by_identifier
            .entry(source.identifier.clone())
            .or_default();
        entry.replaced_by = source.replaced_by.clone();
        entry.skip_for_required_phrase_generation = source.skip_for_required_phrase_generation;
    }
}
/// Returns `text` with every line trimmed of leading and trailing whitespace.
///
/// Lines are split with `str::lines` and re-joined with a single `\n`, so a
/// trailing line terminator is dropped and `\r\n` endings become `\n`.
fn prepare_rule_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for (line_no, raw_line) in text.lines().enumerate() {
        if line_no > 0 {
            normalized.push('\n');
        }
        normalized.push_str(raw_line.trim());
    }
    normalized
}
/// Produces copies of `text` in which an ignorable URL has its scheme flipped
/// between `http://` and `https://`.
///
/// One variant is emitted per URL that both starts with one of the two schemes
/// (compared case-insensitively) and occurs verbatim in `text`; every
/// occurrence of that URL is replaced. Returns an empty vector when
/// `ignorable_urls` is `None` or empty.
#[cfg(test)]
pub(crate) fn generate_url_variants(
    text: &str,
    ignorable_urls: &Option<Vec<String>>,
) -> Vec<String> {
    let candidates = match ignorable_urls {
        Some(list) if !list.is_empty() => list,
        _ => return Vec::new(),
    };

    candidates
        .iter()
        .filter_map(|url| {
            let lowered = url.to_lowercase();
            // The scheme prefix is ASCII, so the byte offsets 8 / 7 are valid
            // in the original (non-lowercased) URL as well.
            let swapped = if lowered.starts_with("https://") {
                format!("http://{}", &url[8..])
            } else if lowered.starts_with("http://") {
                format!("https://{}", &url[7..])
            } else {
                return None;
            };
            text.contains(url.as_str())
                .then(|| text.replace(url.as_str(), &swapped))
        })
        .collect()
}
/// Creates the synthetic full-text rule that represents `license` in the index.
///
/// The rule identifier is `"<key>.LICENSE"` and its expression is the license
/// key. The text is the license text with each line trimmed, or a fixed
/// placeholder when the license has no text. All index-derived fields (tokens,
/// lengths, thresholds, size flags) start at their zero/empty values.
///
/// Currently always returns `Some`.
fn build_rule_from_license(license: &License) -> Option<Rule> {
    let text = if license.text.is_empty() {
        String::from("unknown-spdx license identifier")
    } else {
        prepare_rule_text(&license.text)
    };

    let rule = Rule {
        // Identity and content.
        identifier: format!("{}.LICENSE", license.key),
        license_expression: license.key.clone(),
        text,
        tokens: Vec::new(),
        rule_kind: crate::license_detection::models::RuleKind::Text,
        language: None,
        notes: license.notes.clone(),
        referenced_filenames: None,

        // Classification flags.
        is_false_positive: false,
        is_required_phrase: false,
        is_from_license: true,
        is_continuous: false,
        is_deprecated: license.is_deprecated,

        // Relevance and coverage as stored on the license.
        relevance: 100,
        minimum_coverage: license.minimum_coverage,
        has_stored_minimum_coverage: license.minimum_coverage.is_some(),

        // Ignorable clues copied from the license.
        ignorable_urls: license.ignorable_urls.clone(),
        ignorable_emails: license.ignorable_emails.clone(),
        ignorable_copyrights: license.ignorable_copyrights.clone(),
        ignorable_holders: license.ignorable_holders.clone(),
        ignorable_authors: license.ignorable_authors.clone(),

        // SPDX identifiers copied from the license.
        spdx_license_key: license.spdx_license_key.clone(),
        other_spdx_license_keys: license.other_spdx_license_keys.clone(),

        // Filled in later while the index is built.
        required_phrase_spans: Vec::new(),
        stopwords_by_pos: HashMap::new(),
        length_unique: 0,
        high_length_unique: 0,
        high_length: 0,
        min_matched_length: 0,
        min_high_matched_length: 0,
        min_matched_length_unique: 0,
        min_high_matched_length_unique: 0,
        is_small: false,
        is_tiny: false,
        starts_with_license: false,
        ends_with_license: false,
    };
    Some(rule)
}
fn build_rules_from_licenses(licenses: &[License]) -> Vec<Rule> {
licenses
.iter()
.filter_map(build_rule_from_license)
.collect()
}
/// Returns the fixed list of tokens that `collect_spdx_tokens` always includes,
/// regardless of which licenses are loaded.
fn get_essential_spdx_tokens() -> &'static [&'static str] {
    static ESSENTIAL_SPDX_TOKENS: [&str; 5] =
        ["spdx", "license", "licence", "identifier", "licenseref"];
    &ESSENTIAL_SPDX_TOKENS
}
/// Gathers every token needed to recognise SPDX identifiers.
///
/// The result starts with the essential tokens from
/// `get_essential_spdx_tokens` and adds the tokens produced by `tokenize` for
/// each license's primary SPDX key (when present) and all of its other SPDX
/// keys.
fn collect_spdx_tokens(licenses: &[License]) -> HashSet<String> {
    let mut collected: HashSet<String> = get_essential_spdx_tokens()
        .iter()
        .map(|essential| essential.to_string())
        .collect();

    for license in licenses {
        let all_keys = license
            .spdx_license_key
            .iter()
            .chain(&license.other_spdx_license_keys);
        for key in all_keys {
            collected.extend(tokenize(key));
        }
    }
    collected
}
/// Token strings that disqualify an ngram in `is_good_tokens_ngram`: an ngram
/// containing any of these words is rejected. The list covers copyright and
/// organisation words, a handful of domain suffixes, and URL scheme/host
/// fragments.
const MARKERS: &[&str] = &[
"copyright",
"c",
"copyrights",
"rights",
"reserved",
"trademark",
"foundation",
"government",
"institute",
"university",
"inc",
"corp",
"co",
"author",
"com",
"org",
"net",
"uk",
"fr",
"be",
"de",
"http",
"https",
"www",
];
/// Decides whether an ngram is worth indexing for unknown-license detection.
///
/// `tokens_ngram` holds the token strings and `known_tokens_ngram` the
/// corresponding dictionary tokens. The ngram is rejected when any of the
/// following holds:
///
/// * three or more tokens consist only of ASCII digits,
/// * any token is exactly four ASCII digits (looks like a year),
/// * three or more tokens are a single byte long,
/// * it has two or fewer distinct token ids,
/// * it contains no legalese token,
/// * it contains one of the `MARKERS` words.
pub fn is_good_tokens_ngram(tokens_ngram: &[String], known_tokens_ngram: &[KnownToken]) -> bool {
    const MIN_GOOD: usize = 3;

    fn is_all_digits(token: &str) -> bool {
        token.chars().all(|ch| ch.is_ascii_digit())
    }

    let numeric_tokens = tokens_ngram
        .iter()
        .filter(|tok| is_all_digits(tok.as_str()))
        .count();
    if numeric_tokens >= MIN_GOOD {
        return false;
    }

    let contains_year = tokens_ngram
        .iter()
        .any(|tok| tok.len() == 4 && is_all_digits(tok.as_str()));
    if contains_year {
        return false;
    }

    let one_char_tokens = tokens_ngram.iter().filter(|tok| tok.len() == 1).count();
    if one_char_tokens >= MIN_GOOD {
        return false;
    }

    let distinct_ids = TokenSet::from_token_ids(known_tokens_ngram.iter().map(|known| known.id));
    if distinct_ids.len() <= 2 {
        return false;
    }

    let has_legalese = known_tokens_ngram
        .iter()
        .any(|known| known.kind == TokenKind::Legalese);
    if !has_legalese {
        return false;
    }

    !tokens_ngram
        .iter()
        .any(|tok| MARKERS.contains(&tok.as_str()))
}
/// Returns whether `rule` may take part in approximate matching.
///
/// A rule is excluded when it is a false positive, a required phrase, tiny, or
/// continuous, or when it is small and is a license reference or license tag.
pub fn compute_is_approx_matchable(rule: &Rule) -> bool {
    let always_excluded =
        rule.is_false_positive || rule.is_required_phrase || rule.is_tiny || rule.is_continuous;
    if always_excluded {
        return false;
    }
    if !rule.is_small {
        return true;
    }
    // Small rules stay matchable only when they are neither references nor tags.
    !(rule.is_license_reference() || rule.is_license_tag())
}
/// Serialises token ids into one contiguous byte vector by concatenating each
/// id's `to_le_bytes()` output in order.
pub fn tokens_to_bytes(tokens: &[TokenId]) -> Vec<u8> {
    let mut bytes = Vec::new();
    for token in tokens {
        bytes.extend(token.to_le_bytes());
    }
    bytes
}
/// Returns every contiguous run of `ngram_length` items from `items`, in order.
///
/// Each ngram is an owned copy of the corresponding window. The result is
/// empty when `items` is shorter than `ngram_length` or when `ngram_length`
/// is zero.
pub fn ngrams<T: Clone>(items: &[T], ngram_length: usize) -> Vec<Vec<T>> {
    // `slice::windows` panics on a zero size, so treat it as "no ngrams".
    if ngram_length == 0 {
        return Vec::new();
    }
    // `windows` already yields nothing when the slice is shorter than the
    // window, so no separate length guard is needed.
    items
        .windows(ngram_length)
        .map(|window| window.to_vec())
        .collect()
}
/// Builds a complete [`LicenseIndex`] from detection `rules` and `licenses`.
///
/// Steps, in order:
/// 1. Create the token dictionary from the archived legalese list and intern
///    every SPDX-related token from `collect_spdx_tokens`.
/// 2. Index licenses by key and synthesise one rule per license.
/// 3. Sort license rules and `rules` together; a rule's id is its position in
///    that sorted order.
/// 4. For every rule: tokenize, intern tokens, register it in the rules
///    automaton and hash map, collect unknown-detection ngrams, build token
///    sets/multisets and legalese postings, and compute thresholds and flags.
/// 5. Add deprecated SPDX aliases, build both automatons, assemble the index
///    and precompute each rule's SPDX expression.
///
/// False-positive rules are stored in `rules_by_rid` / `tids_by_rid` and in the
/// rules automaton only; they are skipped for every other structure.
pub fn build_index(rules: Vec<Rule>, licenses: Vec<License>) -> LicenseIndex {
let legalese = legalese::archived_legalese();
let mut dictionary = TokenDictionary::new_with_legalese(legalese);
let len_legalese = dictionary.legalese_count();
{
// Intern SPDX tokens in sorted order; the source set is a HashSet, so sorting
// presumably keeps token id assignment deterministic across runs.
let spdx_tokens = collect_spdx_tokens(&licenses);
let mut sorted_tokens: Vec<&String> = spdx_tokens.iter().collect();
sorted_tokens.sort();
for token in sorted_tokens {
if dictionary.lookup(token).is_none() {
let _ = dictionary.intern(token);
}
}
}
// Ids of the "license"-like words that are already present in the dictionary.
let license_token_ids = TokenSet::from_token_ids(
LICENSE_TOKEN_STRINGS
.iter()
.filter_map(|&token| dictionary.lookup(token).map(|token| token.id)),
);
let mut rid_by_hash: HashMap<[u8; 20], RuleId> = HashMap::new();
let mut rules_by_rid: Vec<Rule> = Vec::with_capacity(rules.len());
let mut tids_by_rid: Vec<Vec<TokenId>> = Vec::with_capacity(rules.len());
let mut sets_by_rid: HashMap<RuleId, TokenSet> = HashMap::new();
let mut msets_by_rid: HashMap<RuleId, TokenMultiset> = HashMap::new();
let mut high_sets_by_rid: HashMap<RuleId, TokenSet> = HashMap::new();
let mut high_postings_by_rid: HashMap<RuleId, HashMap<TokenId, Vec<usize>>> = HashMap::new();
let mut rids_by_high_tid: HashMap<TokenId, HashSet<RuleId>> = HashMap::new();
let mut rules_builder = AutomatonBuilder::new();
let mut unknown_automaton_patterns: Vec<Vec<u8>> = Vec::new();
let mut licenses_by_key: HashMap<String, License> = HashMap::new();
// Index licenses by key; when two licenses share a key the later one (in key
// sort order) replaces the earlier.
let mut sorted_licenses: Vec<License> = licenses;
sorted_licenses.sort_by(|a, b| a.key.cmp(&b.key));
for license in &sorted_licenses {
licenses_by_key.insert(license.key.clone(), license.clone());
}
// Build license-derived rules from the de-duplicated map, again sorted by key.
let mut license_rules_vec: Vec<License> = licenses_by_key.values().cloned().collect();
license_rules_vec.sort_by(|a, b| a.key.cmp(&b.key));
let license_rules = build_rules_from_licenses(&license_rules_vec);
// Rule ids are assigned from the position in this sorted list.
let mut all_rules: Vec<Rule> = license_rules.into_iter().chain(rules).collect();
all_rules.sort();
let mut rid_by_spdx_key: HashMap<String, RuleId> = HashMap::new();
let mut unknown_spdx_rid: Option<RuleId> = None;
for (rid, mut rule) in all_rules.into_iter().enumerate() {
let rule_id = RuleId::new(rid);
rule.required_phrase_spans = parse_required_phrase_spans(&rule.text);
let (rule_tokens, stopwords_by_pos) = tokenize_with_stopwords(&rule.text);
rule.stopwords_by_pos = stopwords_by_pos;
let mut known_rule_tokens: Vec<KnownToken> = Vec::with_capacity(rule_tokens.len());
let mut rule_token_ids: Vec<TokenId> = Vec::with_capacity(rule_tokens.len());
// A rule is "weak" until at least one of its tokens is a legalese token.
let mut is_weak = true;
for rts in &rule_tokens {
let known_token = dictionary.intern(rts);
if is_weak && known_token.kind == TokenKind::Legalese {
is_weak = false;
}
known_rule_tokens.push(known_token);
rule_token_ids.push(known_token.id);
}
let rule_length = rule_token_ids.len();
rule.tokens = rule_token_ids.clone();
rule.starts_with_license = rule_token_ids
.first()
.map(|&tid| license_token_ids.contains_token_id(tid))
.unwrap_or(false);
rule.ends_with_license = rule_token_ids
.last()
.map(|&tid| license_token_ids.contains_token_id(tid))
.unwrap_or(false);
let rule_hash = compute_hash(&rule_token_ids);
// Every non-empty rule, false positives included, goes into the rules automaton.
// NOTE(review): `as u32` narrows the rule id; assumes the rule count fits in u32.
if !rule_token_ids.is_empty() {
let pattern = tokens_to_bytes(&rule_token_ids);
rules_builder.add_pattern_with_value(&pattern, rule_id.raw() as u32);
}
// False-positive rules stop here: they are stored but excluded from the hash
// map, sets, postings, thresholds and SPDX key registration below.
if rule.is_false_positive {
rules_by_rid.push(rule);
tids_by_rid.push(rule_token_ids);
continue;
}
// A later rule with the same token hash replaces the earlier entry.
rid_by_hash.insert(rule_hash, rule_id);
// NOTE(review): `compute_is_approx_matchable` reads `rule.is_small` and
// `rule.is_tiny`, but those are only assigned further down in this iteration,
// so here they still hold whatever value the incoming rule carried — confirm
// this ordering is intended.
let is_approx_matchable = compute_is_approx_matchable(&rule);
// Collect "good" ngrams of this rule as patterns for the unknown automaton.
if rule_length >= UNKNOWN_NGRAM_LENGTH {
let known_ngrams = ngrams(&known_rule_tokens, UNKNOWN_NGRAM_LENGTH);
let toks_ngrams = ngrams(&rule_tokens, UNKNOWN_NGRAM_LENGTH);
for (known_ngram, toks_ngram) in known_ngrams.iter().zip(toks_ngrams.iter()) {
if is_good_tokens_ngram(toks_ngram, known_ngram) {
let token_ids: Vec<TokenId> =
known_ngram.iter().map(|token| token.id).collect();
unknown_automaton_patterns.push(tokens_to_bytes(&token_ids));
}
}
}
// Positions of each legalese token, kept only for approx-matchable, non-weak rules.
if is_approx_matchable && !is_weak {
let mut postings: HashMap<TokenId, Vec<usize>> = HashMap::new();
for (pos, token) in known_rule_tokens.iter().enumerate() {
if token.kind == TokenKind::Legalese {
postings.entry(token.id).or_default().push(pos);
}
}
if !postings.is_empty() {
high_postings_by_rid.insert(rule_id, postings);
}
}
// Token set / multiset of the rule and their "high" subsets per the dictionary.
let tids_set = TokenSet::from_token_ids(rule_token_ids.iter().copied());
let mset = TokenMultiset::from_token_ids(&rule_token_ids);
sets_by_rid.insert(rule_id, tids_set.clone());
msets_by_rid.insert(rule_id, mset.clone());
let tids_set_high = tids_set.high_subset(&dictionary);
let mset_high = mset.high_subset(&dictionary);
if !tids_set_high.is_empty() {
high_sets_by_rid.insert(rule_id, tids_set_high.clone());
}
// Inverted map from high token id to the approx-matchable, non-weak rules using it.
if is_approx_matchable && !is_weak {
for tid in tids_set_high.iter() {
rids_by_high_tid
.entry(TokenId::new(tid))
.or_default()
.insert(rule_id);
}
}
rule.length_unique = tids_set.len();
rule.high_length_unique = tids_set_high.len();
rule.high_length = mset_high.total_count();
let (updated_coverage, min_matched_length, min_high_matched_length) =
compute_thresholds_occurrences(rule.minimum_coverage, rule_length, rule.high_length);
// A minimum coverage stored with the rule is never overwritten by the computed one.
if !rule.has_stored_minimum_coverage {
rule.minimum_coverage = updated_coverage;
}
rule.min_matched_length = min_matched_length;
rule.min_high_matched_length = min_high_matched_length;
let (min_matched_length_unique, min_high_matched_length_unique) = compute_thresholds_unique(
rule.minimum_coverage,
rule_length,
rule.length_unique,
rule.high_length_unique,
);
rule.min_matched_length_unique = min_matched_length_unique;
rule.min_high_matched_length_unique = min_high_matched_length_unique;
rule.is_small = rule_length < SMALL_RULE;
rule.is_tiny = rule_length < TINY_RULE;
// Register the rule under its lowercased SPDX key and aliases.
if let Some(ref spdx_key) = rule.spdx_license_key {
rid_by_spdx_key.insert(spdx_key.to_lowercase(), rule_id);
}
for alias in &rule.other_spdx_license_keys {
rid_by_spdx_key.insert(alias.to_lowercase(), rule_id);
}
if rule.license_expression == "unknown-spdx" {
unknown_spdx_rid = Some(rule_id);
}
rules_by_rid.push(rule);
tids_by_rid.push(rule_token_ids);
}
add_deprecated_spdx_aliases(&mut rid_by_spdx_key);
let rules_automaton = rules_builder.build();
let unknown_automaton = if unknown_automaton_patterns.is_empty() {
AutomatonBuilder::new().build()
} else {
// NOTE(review): despite the name, `unique_patterns` is only sorted, not
// de-duplicated — confirm the automaton builder tolerates repeated patterns.
let mut unique_patterns: Vec<Vec<u8>> = unknown_automaton_patterns.into_iter().collect();
unique_patterns.sort();
let mut builder = AutomatonBuilder::new();
for pattern in &unique_patterns {
builder.add_pattern(pattern);
}
builder.build()
};
let mut index = LicenseIndex {
dictionary,
len_legalese,
rid_by_hash,
rules_by_rid,
tids_by_rid,
rules_automaton,
unknown_automaton,
sets_by_rid,
rule_metadata_by_identifier: HashMap::new(),
msets_by_rid,
high_sets_by_rid,
high_postings_by_rid,
licenses_by_key,
rid_by_spdx_key,
unknown_spdx_rid,
rids_by_high_tid,
spdx_license_list_version: None,
};
// Fill per-rule metadata with the SPDX form of each rule's license expression.
populate_precomputed_rule_spdx(&mut index);
index
}
pub fn loaded_rule_to_rule(loaded: LoadedRule) -> Rule {
Rule {
identifier: loaded.identifier,
license_expression: loaded.license_expression,
text: loaded.text,
tokens: vec![],
rule_kind: loaded.rule_kind,
is_false_positive: loaded.is_false_positive,
is_required_phrase: loaded.is_required_phrase,
is_from_license: false,
relevance: loaded.relevance.unwrap_or(100),
minimum_coverage: loaded.minimum_coverage,
has_stored_minimum_coverage: loaded.has_stored_minimum_coverage,
is_continuous: loaded.is_continuous,
required_phrase_spans: vec![],
stopwords_by_pos: HashMap::new(),
referenced_filenames: loaded.referenced_filenames,
ignorable_urls: loaded.ignorable_urls,
ignorable_emails: loaded.ignorable_emails,
ignorable_copyrights: loaded.ignorable_copyrights,
ignorable_holders: loaded.ignorable_holders,
ignorable_authors: loaded.ignorable_authors,
language: loaded.language,
notes: loaded.notes,
length_unique: 0,
high_length_unique: 0,
high_length: 0,
min_matched_length: 0,
min_high_matched_length: 0,
min_matched_length_unique: 0,
min_high_matched_length_unique: 0,
is_small: false,
is_tiny: false,
starts_with_license: false,
ends_with_license: false,
is_deprecated: loaded.is_deprecated,
spdx_license_key: None,
other_spdx_license_keys: vec![],
}
}
pub fn loaded_license_to_license(loaded: LoadedLicense) -> License {
License {
key: loaded.key,
short_name: loaded.short_name,
name: loaded.name,
language: loaded.language,
spdx_license_key: loaded.spdx_license_key,
other_spdx_license_keys: loaded.other_spdx_license_keys,
category: loaded.category,
owner: loaded.owner,
homepage_url: loaded.homepage_url,
text: loaded.text,
reference_urls: loaded.reference_urls,
osi_license_key: loaded.osi_license_key,
text_urls: loaded.text_urls,
osi_url: loaded.osi_url,
faq_url: loaded.faq_url,
other_urls: loaded.other_urls,
notes: loaded.notes,
is_deprecated: loaded.is_deprecated,
is_exception: loaded.is_exception,
is_unknown: loaded.is_unknown,
is_generic: loaded.is_generic,
replaced_by: loaded.replaced_by,
minimum_coverage: loaded.minimum_coverage,
standard_notice: loaded.standard_notice,
ignorable_copyrights: loaded.ignorable_copyrights,
ignorable_holders: loaded.ignorable_holders,
ignorable_authors: loaded.ignorable_authors,
ignorable_urls: loaded.ignorable_urls,
ignorable_emails: loaded.ignorable_emails,
}
}
pub fn build_index_from_loaded(
loaded_rules: Vec<LoadedRule>,
loaded_licenses: Vec<LoadedLicense>,
with_deprecated: bool,
) -> LicenseIndex {
let rule_metadata = loaded_rules.clone();
let rules: Vec<Rule> = loaded_rules
.into_iter()
.filter(|r| with_deprecated || !r.is_deprecated)
.map(loaded_rule_to_rule)
.collect();
let licenses: Vec<License> = loaded_licenses
.into_iter()
.filter(|l| with_deprecated || !l.is_deprecated)
.map(loaded_license_to_license)
.collect();
let mut index = build_index(rules, licenses);
apply_loaded_rule_metadata(&mut index, &rule_metadata, with_deprecated);
index
}
// Unit tests for this module are declared in the separate `tests` submodule,
// which is compiled only for test builds.
#[cfg(test)]
mod tests;