pub mod aho_match;
pub mod automaton;
pub mod build_policy;
pub mod dataset;
pub(crate) mod detection;
pub mod embedded;
pub mod license_cache;
mod position_set;
mod token_multiset;
mod token_set;
#[cfg(test)]
mod embedded_test;
pub mod expression;
#[cfg(feature = "golden-tests")]
pub mod golden_utils;
pub mod hash_match;
pub mod index;
mod match_refine;
pub mod models;
pub mod query;
pub mod rules;
pub mod seq_match;
pub mod spdx_lid;
pub mod spdx_mapping;
#[cfg(test)]
mod test_utils;
pub mod tokenize;
pub mod unknown_match;
use bit_set::BitSet;
use std::collections::HashSet;
use std::fs;
use std::path::Path;
use std::sync::Arc;
use std::time::Instant;
use anyhow::Result;
use crate::license_detection::build_policy::EMBEDDED_LICENSE_INDEX_SOURCE;
use crate::license_detection::dataset::{
CUSTOM_LICENSE_DATASET_SOURCE, LoadedLicenseDataset, compute_dataset_fingerprint_string,
load_license_dataset_from_root,
};
use crate::license_detection::embedded::index::{
load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
};
use crate::license_detection::index::build_index_from_loaded;
use crate::license_detection::license_cache::{
LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
};
use crate::license_detection::query::Query;
use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
use crate::models::LicenseIndexProvenance;
use crate::utils::text::strip_utf8_bom_str;
use crate::license_detection::detection::{
attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
split_groups_across_frontmatter_boundary,
};
use crate::license_detection::models::MatcherKind;
/// Relative path of the ScanCode `rules` data directory inside the reference checkout.
// NOTE(review): `dead_code` is allowed here, presumably because the constant is only used by
// some build configurations — confirm.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
"reference/scancode-toolkit/src/licensedcode/data/rules";
/// Relative path of the ScanCode `licenses` data directory inside the reference checkout.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
"reference/scancode-toolkit/src/licensedcode/data/licenses";
/// Relative path of the ScanCode license data root (parent of `rules` and `licenses`).
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";
/// URL template for the public LicenseDB site; `{}` is a placeholder.
// NOTE(review): the value substituted for `{}` is not visible in this file — verify against callers.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
/// Error message produced by `ensure_within_deadline` when a detection deadline has passed.
pub(crate) const LICENSE_DETECTION_TIMEOUT_MESSAGE: &str = "license detection timed out";
pub(crate) use detection::{
LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
};
pub use models::LicenseMatch;
pub use aho_match::aho_match;
pub use hash_match::hash_match;
pub use match_refine::{
filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
refine_matches_without_false_positive_filter, split_weak_matches,
};
pub use position_set::PositionSet;
pub use spdx_lid::spdx_lid_match;
pub use token_multiset::TokenMultiset;
pub use token_set::TokenSet;
pub use unknown_match::unknown_match;
use self::seq_match::{
MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates_with_deadline,
seq_match_with_candidates_and_deadline,
};
/// License detection engine: a shared license index plus the SPDX data derived from it.
///
/// Cloning is cheap for the index because it is held behind an `Arc`.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
/// Shared license index handed to every matcher.
index: Arc<index::LicenseIndex>,
/// SPDX mapping built from the index's licenses when the engine is constructed.
spdx_mapping: SpdxMapping,
/// SPDX license list version supplied by the index source, if any.
spdx_license_list_version: Option<String>,
/// Provenance information for the license index, if any.
license_index_provenance: Option<LicenseIndexProvenance>,
}
// `MAX_DETECTION_SIZE`: byte limit (10 MiB) applied by `truncate_detection_text` before matching.
// `MAX_REGULAR_SEQ_CANDIDATES`: candidate limit passed to sequence-candidate selection for
// regular (per query run) sequence matching.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024; const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
// Scaling factor for how many container-only positions a sequence container may have
// and still be treated as redundant (see the redundancy predicates below).
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
// Base allowance for positions covered by children but not by the container; the
// same-expression predicate permits this value plus one.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
/// Returns `true` when a deadline is set and the current instant has reached or passed it.
///
/// A `None` deadline never expires.
pub(crate) fn deadline_exceeded(deadline: Option<Instant>) -> bool {
    match deadline {
        Some(limit) => limit <= Instant::now(),
        None => false,
    }
}
/// Fails with the license-detection timeout error once `deadline` has passed.
///
/// # Errors
///
/// Returns an error carrying `LICENSE_DETECTION_TIMEOUT_MESSAGE` when the deadline is exceeded.
pub(crate) fn ensure_within_deadline(deadline: Option<Instant>) -> Result<()> {
    if !deadline_exceeded(deadline) {
        return Ok(());
    }
    Err(anyhow::anyhow!(LICENSE_DETECTION_TIMEOUT_MESSAGE))
}
/// Caps the text handed to detection at `MAX_DETECTION_SIZE` bytes.
///
/// Text within the limit is returned unchanged. Longer text is cut at the closest UTF-8
/// character boundary at or below the limit, and a debug log line is emitted.
fn truncate_detection_text(clean_text: &str) -> &str {
    let byte_len = clean_text.len();
    if byte_len > MAX_DETECTION_SIZE {
        log::debug!(
            "Content size {} exceeds limit {}, truncating for detection",
            byte_len,
            MAX_DETECTION_SIZE
        );
        // Never slice in the middle of a multi-byte character.
        let cut = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
        &clean_text[..cut]
    } else {
        clean_text
    }
}
/// Returns a copy of the match's query span, or `None` when that span is empty.
fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
    if m.query_span().is_empty() {
        None
    } else {
        Some(m.query_span().clone())
    }
}
/// Returns `true` only when the match reports a coverage of exactly 100.
fn has_full_match_coverage(m: &LicenseMatch) -> bool {
    let coverage = m.coverage();
    coverage == 100.0
}
/// Decides whether a sequence match is a redundant "container" around exact matches that
/// already report the same license expression.
///
/// `container` is the sequence match under test; `candidate_contained_matches` are the matches
/// it is compared against. Returns `true` when the container is considered redundant, in which
/// case the caller filters it out.
fn is_redundant_same_expression_seq_container(
container: &LicenseMatch,
candidate_contained_matches: &[LicenseMatch],
) -> bool {
// NOTE(review): `coverage() >= 99.0` already holds whenever coverage is exactly 100, so the
// first operand looks redundant.
let container_is_redundant_coverage =
has_full_match_coverage(container) || container.coverage() >= 99.0;
// Only sequence matches with coverage of at least 99 are candidates.
if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
return false;
}
let container_qspan_set = container.qspan_set();
// Children: full-coverage Aho matches with the same expression that overlap the container.
let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
.iter()
.filter(|m| {
m.matcher == MatcherKind::Aho
&& has_full_match_coverage(m)
&& m.license_expression == container.license_expression
&& m.overlaps_with(&container_qspan_set)
})
.collect();
// At least two children are required ...
if contained.len() < 2 {
return false;
}
// ... and at least two of them must have matched more than one position.
let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
if material_children < 2 {
return false;
}
// Order children by their query-span bounds so neighbouring pairs can be inspected.
contained.sort_by_key(|m| m.qspan_bounds());
let mut child_union = PositionSet::new();
for m in &contained {
child_union.extend_from_span(m.query_span());
}
// Positions covered by only one side of the comparison.
let container_only_positions = container_qspan_set.difference(&child_union);
let child_only_positions = child_union.difference(&container_qspan_set);
// "Bridge" positions are the gaps between one child's end bound and the next child's start.
let mut bridge_positions = BitSet::new();
for pair in contained.windows(2) {
let (_, previous_end) = pair[0].qspan_bounds();
let (next_start, _) = pair[1].qspan_bounds();
// Children whose bounds overlap each other disqualify the container from being redundant.
if next_start < previous_end {
return false;
}
for pos in previous_end..next_start {
bridge_positions.insert(pos);
}
}
// Container-only positions that are not inside any gap between children.
let container_only_boundary_positions = container_only_positions
.iter()
.filter(|&pos| !bridge_positions.contains(pos))
.count();
// Keep the container when its sole extra position sits in a gap between children and the
// children add nothing beyond the container.
if container_only_positions.len() == 1
&& container_only_boundary_positions == 0
&& child_only_positions.is_empty()
{
return false;
}
// Keep the container when it extends the children by at most three positions that all lie
// on one side (entirely before the first child or entirely after the last child).
if child_only_positions.is_empty()
&& container_only_positions.len() == container_only_boundary_positions
&& container_only_boundary_positions <= 3
{
let earliest_child = contained
.iter()
.map(|m| m.qspan_bounds().0)
.min()
.unwrap_or(usize::MAX);
// The upper bound is reduced by one here, i.e. it is treated as exclusive.
let latest_child = contained
.iter()
.map(|m| m.qspan_bounds().1.saturating_sub(1))
.max()
.unwrap_or(0);
let is_one_sided_boundary = container_only_positions
.iter()
.all(|pos| pos < earliest_child)
|| container_only_positions
.iter()
.all(|pos| pos > latest_child);
if is_one_sided_boundary {
return false;
}
}
// Final thresholds scale with the number of children (`contained.len() >= 2` here, so the
// subtraction below cannot underflow).
let max_container_only_positions =
MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
let max_container_boundary_positions =
MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
// Redundant only when every difference between container and children stays within bounds.
container_only_positions.len() <= max_container_only_positions
&& container_only_boundary_positions <= max_container_boundary_positions
&& child_only_positions.len() <= max_child_only_positions
}
/// Removes every sequence match that `is_redundant_same_expression_seq_container` flags,
/// preserving the order of the remaining matches.
fn filter_redundant_same_expression_seq_containers(
    seq_matches: Vec<LicenseMatch>,
    candidate_contained_matches: &[LicenseMatch],
) -> Vec<LicenseMatch> {
    let mut kept = seq_matches;
    kept.retain(|candidate| {
        !is_redundant_same_expression_seq_container(candidate, candidate_contained_matches)
    });
    kept
}
/// Reports whether a low-coverage sequence match merely wraps several exact matches that
/// carry other license expressions.
///
/// The wrapper must be a sequence match with coverage below 30. It is redundant when at least
/// two full-coverage Aho matches with at least two distinct expressions (all different from
/// the wrapper's) overlap it, they cover nothing outside the wrapper, and the wrapper adds
/// only a bounded number of positions of its own.
fn is_redundant_low_coverage_composite_seq_wrapper(
    container: &LicenseMatch,
    candidate_contained_matches: &[LicenseMatch],
) -> bool {
    if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
        return false;
    }

    let wrapper_positions = container.qspan_set();

    // Gather the exact matches with a different expression that overlap the wrapper.
    let mut exact_children: Vec<&LicenseMatch> = Vec::new();
    for candidate in candidate_contained_matches {
        let qualifies = candidate.matcher == aho_match::MATCH_AHO
            && has_full_match_coverage(candidate)
            && candidate.license_expression != container.license_expression
            && candidate.overlaps_with(&wrapper_positions);
        if qualifies {
            exact_children.push(candidate);
        }
    }
    if exact_children.len() < 2 {
        return false;
    }

    let distinct_expressions = exact_children
        .iter()
        .map(|child| child.license_expression.as_str())
        .collect::<HashSet<&str>>();
    if distinct_expressions.len() < 2 {
        return false;
    }

    let mut covered_by_children = PositionSet::new();
    for child in exact_children.iter() {
        covered_by_children.extend_from_span(child.query_span());
    }
    let wrapper_only = wrapper_positions.difference(&covered_by_children);
    let children_only = covered_by_children.difference(&wrapper_positions);

    // Positions between consecutive children (in span order) count as "bridges".
    exact_children.sort_by_key(|child| child.qspan_bounds());
    let mut bridge = BitSet::new();
    for neighbours in exact_children.windows(2) {
        let gap_start = neighbours[0].qspan_bounds().1;
        let gap_end = neighbours[1].qspan_bounds().0;
        for position in gap_start..gap_end {
            bridge.insert(position);
        }
    }

    let unbridged_wrapper_only = wrapper_only
        .iter()
        .filter(|position| !bridge.contains(*position))
        .count();

    children_only.is_empty()
        && wrapper_only.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
        && unbridged_wrapper_only <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
}
/// Removes every sequence match that `is_redundant_low_coverage_composite_seq_wrapper` flags,
/// preserving the order of the remaining matches.
fn filter_redundant_low_coverage_composite_seq_wrappers(
    seq_matches: Vec<LicenseMatch>,
    candidate_contained_matches: &[LicenseMatch],
) -> Vec<LicenseMatch> {
    let mut kept = seq_matches;
    kept.retain(|candidate| {
        !is_redundant_low_coverage_composite_seq_wrapper(candidate, candidate_contained_matches)
    });
    kept
}
/// Records the query spans of SPDX identifier matches and removes them from the query.
///
/// For each match with a non-empty query span, the span is added to `aho_extra_matchables`
/// and subtracted from `query`. Spans of full-coverage matches are also appended to
/// `matched_qspans`.
fn subtract_spdx_match_qspans(
    query: &mut Query<'_>,
    matched_qspans: &mut Vec<models::PositionSpan>,
    aho_extra_matchables: &mut PositionSet,
    spdx_matches: &[LicenseMatch],
) {
    for spdx_match in spdx_matches {
        if let Some(qspan) = query_span_for_match(spdx_match) {
            aho_extra_matchables.extend_from_span(&qspan);
            query.subtract(&qspan);
            if has_full_match_coverage(spdx_match) {
                matched_qspans.push(qspan);
            }
        }
    }
}
/// Merges refined Aho matches and updates the query bookkeeping for later matchers.
///
/// Spans of full-coverage matches are appended to `matched_qspans`. A match whose rule is a
/// license text, with `rule_length` above 120 and coverage above 98, additionally has its span
/// subtracted from `query`.
///
/// Returns the merged matches and a flag telling whether any such long license-text match
/// was seen.
fn merge_and_prepare_aho_matches(
    index: &index::LicenseIndex,
    query: &mut Query<'_>,
    matched_qspans: &mut Vec<models::PositionSpan>,
    refined_aho: &[LicenseMatch],
) -> (Vec<LicenseMatch>, bool) {
    let merged = merge_overlapping_matches(refined_aho);
    let mut subtracted_long_license_text = false;

    for aho in merged.iter() {
        let span = match query_span_for_match(aho) {
            Some(span) => span,
            None => continue,
        };
        if has_full_match_coverage(aho) {
            matched_qspans.push(span.clone());
        }
        let is_license_text = index
            .rule(aho.rid)
            .is_some_and(|rule| rule.is_license_text());
        if is_license_text && aho.rule_length > 120 && aho.coverage() > 98.0 {
            query.subtract(&span);
            subtracted_long_license_text = true;
        }
    }

    (merged, subtracted_long_license_text)
}
fn collect_whole_query_exact_followup_matches(
index: &index::LicenseIndex,
query: &mut Query<'_>,
matched_qspans: &mut Vec<models::PositionSpan>,
whole_run: &query::QueryRun<'_>,
deadline: Option<Instant>,
) -> Result<Vec<LicenseMatch>> {
let mut seq_all_matches = Vec::new();
if whole_run.is_matchable(false, matched_qspans) {
let near_dupe_candidates = if deadline.is_some() {
select_seq_candidates_with_deadline(
index,
whole_run,
true,
MAX_NEAR_DUPE_CANDIDATES,
deadline,
)?
} else {
self::seq_match::select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES)
};
if !near_dupe_candidates.is_empty() {
let near_dupe_matches = if deadline.is_some() {
seq_match_with_candidates_and_deadline(
index,
whole_run,
&near_dupe_candidates,
deadline,
)?
} else {
self::seq_match::seq_match_with_candidates(index, whole_run, &near_dupe_candidates)
};
for m in &near_dupe_matches {
if !m.query_span().is_empty() {
let span = m.query_span().clone();
query.subtract(&span);
matched_qspans.push(span);
}
}
seq_all_matches.extend(near_dupe_matches);
}
}
Ok(seq_all_matches)
}
/// Runs regular sequence matching over each still-matchable query run.
///
/// The deadline is re-checked on every eighth query run. The collected matches are merged,
/// then passed through both redundant-container filters before being returned.
///
/// # Errors
///
/// Returns the timeout error when the deadline passes, and propagates errors from the
/// deadline-aware candidate selection and matching calls.
fn collect_regular_seq_matches(
    index: &index::LicenseIndex,
    query: &Query<'_>,
    matched_qspans: &[models::PositionSpan],
    candidate_contained_matches: &[LicenseMatch],
    deadline: Option<Instant>,
) -> Result<Vec<LicenseMatch>> {
    let mut collected = Vec::new();

    for (run_number, run) in query.query_runs().into_iter().enumerate() {
        if run_number % 8 == 0 {
            ensure_within_deadline(deadline)?;
        }
        if !run.is_matchable(false, matched_qspans) {
            continue;
        }

        let candidates = match deadline {
            Some(_) => select_seq_candidates_with_deadline(
                index,
                &run,
                false,
                MAX_REGULAR_SEQ_CANDIDATES,
                deadline,
            )?,
            None => self::seq_match::select_seq_candidates(
                index,
                &run,
                false,
                MAX_REGULAR_SEQ_CANDIDATES,
            ),
        };
        if candidates.is_empty() {
            continue;
        }

        let run_matches = match deadline {
            Some(_) => seq_match_with_candidates_and_deadline(index, &run, &candidates, deadline)?,
            None => self::seq_match::seq_match_with_candidates(index, &run, &candidates),
        };
        collected.extend(run_matches);
    }

    let merged = merge_overlapping_matches(&collected);
    let without_same_expression_containers =
        filter_redundant_same_expression_seq_containers(merged, candidate_contained_matches);
    let without_composite_wrappers = filter_redundant_low_coverage_composite_seq_wrappers(
        without_same_expression_containers,
        candidate_contained_matches,
    );
    Ok(without_composite_wrappers)
}
impl LicenseDetectionEngine {
/// Wraps an already built license index into an engine.
///
/// The SPDX mapping is built from the index's licenses, sorted by license key.
fn from_index(
    index: index::LicenseIndex,
    spdx_license_list_version: Option<String>,
    license_index_provenance: Option<LicenseIndexProvenance>,
) -> Result<Self> {
    let mut licenses = index
        .licenses_by_key
        .values()
        .cloned()
        .collect::<Vec<_>>();
    licenses.sort_by(|left, right| left.key.cmp(&right.key));
    let spdx_mapping = build_spdx_mapping(&licenses);

    let engine = Self {
        index: Arc::new(index),
        spdx_mapping,
        spdx_license_list_version,
        license_index_provenance,
    };
    Ok(engine)
}
/// Test-only constructor: builds an engine from `index` with no SPDX version or provenance.
///
/// # Panics
///
/// Panics if the engine cannot be built from the index.
#[cfg(test)]
pub(crate) fn from_test_index(index: index::LicenseIndex) -> Self {
    let built = Self::from_index(index, None, None);
    built.expect("test index should build license engine")
}
/// Builds an engine from the embedded license index using the default cache location.
// NOTE(review): the meaning of the `false, true` flags is defined by
// `LicenseCacheConfig::new`, which is not visible here — confirm before changing them.
pub fn from_embedded() -> Result<Self> {
    let root_dir = LicenseCacheConfig::default_root_dir();
    let default_cache = LicenseCacheConfig::new(root_dir, false, true);
    Self::from_embedded_with_cache(&default_cache)
}
/// Builds an engine from the license index artifact embedded in the binary, backed by an
/// on-disk cache.
///
/// Unless `cache_config.reindex` is set, a cached index keyed by the artifact fingerprint is
/// tried first. With `reindex` set, that cache entry is deleted instead. On a cache miss the
/// index is rebuilt from the embedded artifact and written back to the cache; a failure to
/// save the cache is only reported as a warning on stderr.
///
/// # Errors
///
/// Returns an error when the embedded artifact or its metadata cannot be loaded, or when
/// loading or deleting the cache entry fails.
pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
    let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
    let fingerprint = compute_artifact_fingerprint(artifact_bytes);
    let artifact_metadata = load_embedded_artifact_metadata_from_bytes(artifact_bytes)
        .map_err(|e| {
            anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
        })?;
    debug_assert_eq!(
        artifact_metadata.license_index_provenance.source,
        EMBEDDED_LICENSE_INDEX_SOURCE
    );
    let spdx_version = Some(artifact_metadata.spdx_license_list_version.clone());
    let provenance = Some(artifact_metadata.license_index_provenance.clone());

    if cache_config.reindex {
        delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
    } else {
        // Start timing before the load so the reported duration covers the cache read itself.
        let start = Instant::now();
        if let Some(cached) =
            load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
        {
            eprintln!(
                "License index loaded from rkyv cache in {:.2}s",
                start.elapsed().as_secs_f64()
            );
            return Self::from_index(cached, spdx_version, provenance);
        }
    }

    let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
        .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
    // From here on, the metadata carried by the full snapshot is used.
    let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
    let provenance = Some(snapshot.metadata.license_index_provenance.clone());

    let start = Instant::now();
    let mut index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
    eprintln!(
        "License index built from embedded artifact in {:.2}s",
        start.elapsed().as_secs_f64()
    );
    index.spdx_license_list_version = spdx_version.clone();

    // Caching is best effort: a failed save must not prevent the engine from being returned.
    if let Err(e) = save_cached_index(
        cache_config,
        LicenseCacheNamespace::Embedded,
        &index,
        &fingerprint,
    ) {
        eprintln!("Warning: failed to save license index cache: {}", e);
    } else if let Some(size) =
        cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
    {
        eprintln!(
            "License index cache saved ({:.1} MB)",
            size as f64 / 1_048_576.0
        );
    }

    Self::from_index(index, spdx_version, provenance)
}
/// Builds an engine from a license dataset directory using the default cache location.
// NOTE(review): the meaning of the `false, true` flags is defined by
// `LicenseCacheConfig::new`, which is not visible here — confirm before changing them.
pub fn from_directory(rules_path: &Path) -> Result<Self> {
    let root_dir = LicenseCacheConfig::default_root_dir();
    let default_cache = LicenseCacheConfig::new(root_dir, false, true);
    Self::from_directory_with_cache(rules_path, &default_cache)
}
/// Builds an engine from a custom license dataset rooted at `rules_path`, backed by an
/// on-disk cache.
///
/// The dataset is always loaded so that its fingerprint can be computed. Unless
/// `cache_config.reindex` is set, a cached index for that fingerprint is tried first. With
/// `reindex` set, that cache entry is deleted instead. On a cache miss the index is rebuilt
/// and written back to the cache; a failure to save the cache is only reported as a warning
/// on stderr.
///
/// # Errors
///
/// Returns an error when the dataset cannot be loaded or fingerprinted, or when loading or
/// deleting the cache entry fails.
// NOTE(review): unlike `from_embedded_with_cache`, the built index's
// `spdx_license_list_version` field is not set before it is cached — confirm this is intended.
pub fn from_directory_with_cache(
    rules_path: &Path,
    cache_config: &LicenseCacheConfig,
) -> Result<Self> {
    let LoadedLicenseDataset {
        manifest,
        rules: loaded_rules,
        licenses: loaded_licenses,
    } = load_license_dataset_from_root(rules_path)?;
    let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses)?;
    let provenance = Some(LicenseIndexProvenance {
        source: CUSTOM_LICENSE_DATASET_SOURCE.to_string(),
        dataset_fingerprint: compute_dataset_fingerprint_string(
            &loaded_rules,
            &loaded_licenses,
        )?,
        ignored_rules: vec![],
        ignored_licenses: vec![],
        ignored_rules_due_to_licenses: vec![],
        added_rules: vec![],
        replaced_rules: vec![],
        added_licenses: vec![],
        replaced_licenses: vec![],
    });

    if cache_config.reindex {
        delete_cache(
            cache_config,
            LicenseCacheNamespace::CustomRules,
            &fingerprint,
        )?;
    } else {
        // Start timing before the load so the reported duration covers the cache read itself.
        let start = Instant::now();
        if let Some(cached) = load_cached_index(
            cache_config,
            LicenseCacheNamespace::CustomRules,
            &fingerprint,
        )? {
            eprintln!(
                "License index loaded from rkyv cache in {:.2}s",
                start.elapsed().as_secs_f64()
            );
            return Self::from_index(
                cached,
                Some(manifest.spdx_license_list_version),
                provenance,
            );
        }
    }

    let start = Instant::now();
    let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
    eprintln!(
        "License index built from custom dataset in {:.2}s",
        start.elapsed().as_secs_f64()
    );

    // Caching is best effort: a failed save must not prevent the engine from being returned.
    if let Err(e) = save_cached_index(
        cache_config,
        LicenseCacheNamespace::CustomRules,
        &index,
        &fingerprint,
    ) {
        eprintln!("Warning: failed to save license index cache: {}", e);
    } else if let Some(size) = cache_file_size(
        cache_config,
        LicenseCacheNamespace::CustomRules,
        &fingerprint,
    ) {
        eprintln!(
            "License index cache saved ({:.1} MB)",
            size as f64 / 1_048_576.0
        );
    }

    Self::from_index(index, Some(manifest.spdx_license_list_version), provenance)
}
/// Reads the SPDX license list version recorded in the embedded artifact's metadata.
///
/// # Errors
///
/// Returns an error when the embedded artifact metadata cannot be loaded.
pub fn embedded_spdx_license_list_version() -> Result<String> {
    let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
    match load_embedded_artifact_metadata_from_bytes(artifact_bytes) {
        Ok(metadata) => Ok(metadata.spdx_license_list_version),
        Err(e) => Err(anyhow::anyhow!(
            "Failed to load embedded license artifact metadata: {}",
            e
        )),
    }
}
pub fn detect_with_kind(
&self,
text: &str,
unknown_licenses: bool,
binary_derived: bool,
) -> Result<Vec<LicenseDetection>> {
self.detect_with_kind_with_score_and_deadline(
text,
unknown_licenses,
binary_derived,
0.0,
None,
)
}
pub fn detect_with_kind_with_score(
&self,
text: &str,
unknown_licenses: bool,
binary_derived: bool,
min_score: f32,
) -> Result<Vec<LicenseDetection>> {
self.detect_with_kind_with_score_and_deadline(
text,
unknown_licenses,
binary_derived,
min_score,
None,
)
}
pub(crate) fn detect_with_kind_with_score_and_deadline(
&self,
text: &str,
unknown_licenses: bool,
binary_derived: bool,
min_score: f32,
deadline: Option<Instant>,
) -> Result<Vec<LicenseDetection>> {
ensure_within_deadline(deadline)?;
let clean_text = strip_utf8_bom_str(text);
let content = truncate_detection_text(clean_text);
ensure_within_deadline(deadline)?;
let mut query = if deadline.is_some() {
Query::from_extracted_text_with_deadline(
content,
&self.index,
binary_derived,
deadline,
)?
} else {
Query::from_extracted_text(content, &self.index, binary_derived)?
};
let whole_query_run = query.whole_query_run();
let mut all_matches = Vec::new();
let mut candidate_contained_matches = Vec::new();
let mut aho_extra_matchables = PositionSet::new();
let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
{
ensure_within_deadline(deadline)?;
let hash_matches = hash_match(&self.index, &whole_query_run);
if !hash_matches.is_empty() {
let mut matches = hash_matches;
sort_matches_by_line(&mut matches);
let groups = split_groups_across_frontmatter_boundary(
group_matches_by_region(&matches),
Some(content),
);
let detections: Vec<LicenseDetection> = groups
.iter()
.map(|group| {
let mut detection = empty_detection();
populate_detection_from_group_with_spdx(
&mut detection,
group,
&self.spdx_mapping,
Some(content),
);
detection
})
.collect();
return Ok(post_process_detections(detections, min_score));
}
}
{
ensure_within_deadline(deadline)?;
let spdx_matches = spdx_lid_match(&self.index, &query);
subtract_spdx_match_qspans(
&mut query,
&mut matched_qspans,
&mut aho_extra_matchables,
&spdx_matches,
);
all_matches.extend(spdx_matches);
}
{
ensure_within_deadline(deadline)?;
let aho_matches = if aho_extra_matchables.is_empty() {
if deadline.is_some() {
aho_match::aho_match_with_deadline(&self.index, &whole_query_run, deadline)?
} else {
aho_match(&self.index, &whole_query_run)
}
} else {
if deadline.is_some() {
aho_match::aho_match_with_extra_matchables(
&self.index,
&whole_query_run,
Some(&aho_extra_matchables),
deadline,
)?
} else {
aho_match::aho_match_with_extra_matchables(
&self.index,
&whole_query_run,
Some(&aho_extra_matchables),
None,
)?
}
};
let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
candidate_contained_matches.extend(refined_aho.clone());
let (merged_aho, _) = merge_and_prepare_aho_matches(
&self.index,
&mut query,
&mut matched_qspans,
&refined_aho,
);
all_matches.extend(merged_aho);
let whole_query_followup = collect_whole_query_exact_followup_matches(
&self.index,
&mut query,
&mut matched_qspans,
&whole_query_run,
deadline,
)?;
all_matches.extend(whole_query_followup);
let merged_seq = collect_regular_seq_matches(
&self.index,
&query,
&matched_qspans,
&candidate_contained_matches,
deadline,
)?;
all_matches.extend(merged_seq);
}
ensure_within_deadline(deadline)?;
let merged_matches =
refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
let refined_matches = if unknown_licenses {
let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
let unknown_matches = unknown_match(&self.index, &query, &good_matches);
let filtered_unknown =
filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
let mut all_matches = good_matches;
all_matches.extend(filtered_unknown);
all_matches.extend(weak_matches);
all_matches
} else {
merged_matches
};
ensure_within_deadline(deadline)?;
let refined = refine_matches(&self.index, refined_matches, &query);
let mut sorted = refined;
sort_matches_by_line(&mut sorted);
let groups = split_groups_across_frontmatter_boundary(
group_matches_by_region(&sorted),
Some(content),
);
let detections: Vec<LicenseDetection> = groups
.iter()
.map(|group| {
let mut detection = empty_detection();
populate_detection_from_group_with_spdx(
&mut detection,
group,
&self.spdx_mapping,
Some(content),
);
detection
})
.collect();
let detections = post_process_detections(detections, min_score);
ensure_within_deadline(deadline)?;
Ok(detections)
}
pub fn detect_with_kind_and_source(
&self,
text: &str,
unknown_licenses: bool,
binary_derived: bool,
source_path: &str,
) -> Result<Vec<LicenseDetection>> {
self.detect_with_kind_and_source_with_deadline(
text,
unknown_licenses,
binary_derived,
source_path,
None,
)
}
/// Detects licenses in `text` under `deadline` with a minimum score of 0, then attaches
/// `source_path` to each detection.
///
/// # Errors
///
/// Propagates any error (including a timeout) from the underlying detection pipeline.
pub(crate) fn detect_with_kind_and_source_with_deadline(
    &self,
    text: &str,
    unknown_licenses: bool,
    binary_derived: bool,
    source_path: &str,
    deadline: Option<Instant>,
) -> Result<Vec<LicenseDetection>> {
    let no_minimum_score = 0.0;
    self.detect_with_kind_with_score_and_deadline(
        text,
        unknown_licenses,
        binary_derived,
        no_minimum_score,
        deadline,
    )
    .map(|mut detections| {
        attach_source_path_to_detections(&mut detections, source_path);
        detections
    })
}
/// Detects licenses in `text` using `min_score`, then attaches `source_path` to each
/// detection. No deadline is applied.
///
/// # Errors
///
/// Propagates any error from the underlying detection pipeline.
pub fn detect_with_kind_and_source_with_score(
    &self,
    text: &str,
    unknown_licenses: bool,
    binary_derived: bool,
    source_path: &str,
    min_score: f32,
) -> Result<Vec<LicenseDetection>> {
    self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)
        .map(|mut detections| {
            attach_source_path_to_detections(&mut detections, source_path);
            detections
        })
}
/// Detects licenses in `text` under `deadline` using `min_score`, then attaches
/// `source_path` to each detection.
///
/// # Errors
///
/// Propagates any error (including a timeout) from the underlying detection pipeline.
pub(crate) fn detect_with_kind_and_source_with_score_and_deadline(
    &self,
    text: &str,
    unknown_licenses: bool,
    binary_derived: bool,
    source_path: &str,
    min_score: f32,
    deadline: Option<Instant>,
) -> Result<Vec<LicenseDetection>> {
    self.detect_with_kind_with_score_and_deadline(
        text,
        unknown_licenses,
        binary_derived,
        min_score,
        deadline,
    )
    .map(|mut detections| {
        attach_source_path_to_detections(&mut detections, source_path);
        detections
    })
}
/// Runs the matching stages of the pipeline and returns the refined matches sorted by line,
/// without grouping them into detections. No deadline is applied.
///
/// When hash matching yields any match, those matches are returned immediately.
///
/// # Errors
///
/// Propagates errors from query construction and the matchers.
#[cfg(any(test, feature = "golden-tests"))]
pub fn detect_matches_with_kind(
    &self,
    text: &str,
    unknown_licenses: bool,
    binary_derived: bool,
) -> Result<Vec<LicenseMatch>> {
    let content = truncate_detection_text(strip_utf8_bom_str(text));
    let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
    let whole_query_run = query.whole_query_run();

    let mut all_matches = Vec::new();
    let mut candidate_contained_matches: Vec<LicenseMatch> = Vec::new();
    let mut aho_extra_matchables = PositionSet::new();
    let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();

    // Stage 1: hash match. Any hit ends the pipeline immediately.
    let mut hash_matches = hash_match(&self.index, &whole_query_run);
    if !hash_matches.is_empty() {
        sort_matches_by_line(&mut hash_matches);
        return Ok(hash_matches);
    }

    // Stage 2: SPDX identifier matches.
    let spdx_matches = spdx_lid_match(&self.index, &query);
    subtract_spdx_match_qspans(
        &mut query,
        &mut matched_qspans,
        &mut aho_extra_matchables,
        &spdx_matches,
    );
    all_matches.extend(spdx_matches);

    // Stage 3: Aho matching, followed by both sequence-matching passes.
    let aho_matches = if !aho_extra_matchables.is_empty() {
        aho_match::aho_match_with_extra_matchables(
            &self.index,
            &whole_query_run,
            Some(&aho_extra_matchables),
            None,
        )?
    } else {
        aho_match(&self.index, &whole_query_run)
    };
    let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
    candidate_contained_matches.extend(refined_aho.iter().cloned());
    let (merged_aho, _) = merge_and_prepare_aho_matches(
        &self.index,
        &mut query,
        &mut matched_qspans,
        &refined_aho,
    );
    all_matches.extend(merged_aho);

    let near_dupe_matches = collect_whole_query_exact_followup_matches(
        &self.index,
        &mut query,
        &mut matched_qspans,
        &whole_query_run,
        None,
    )?;
    all_matches.extend(near_dupe_matches);

    let regular_seq_matches = collect_regular_seq_matches(
        &self.index,
        &query,
        &matched_qspans,
        &candidate_contained_matches,
        None,
    )?;
    all_matches.extend(regular_seq_matches);

    // Stage 4: refinement, with optional unknown-license detection in between.
    let merged_matches =
        refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
    let to_refine = if !unknown_licenses {
        merged_matches
    } else {
        let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
        let unknown_matches = unknown_match(&self.index, &query, &good_matches);
        let filtered_unknown =
            filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
        let mut combined = good_matches;
        combined.extend(filtered_unknown);
        combined.extend(weak_matches);
        combined
    };

    let mut sorted = refine_matches(&self.index, to_refine, &query);
    sort_matches_by_line(&mut sorted);
    Ok(sorted)
}
/// Borrows the license index used by this engine.
pub fn index(&self) -> &index::LicenseIndex {
    &*self.index
}
/// Returns the SPDX license list version associated with the index, if one was provided.
pub fn spdx_license_list_version(&self) -> Option<&str> {
    match &self.spdx_license_list_version {
        Some(version) => Some(version.as_str()),
        None => None,
    }
}
/// Returns the provenance of the license index, if one was provided.
pub fn license_index_provenance(&self) -> Option<&LicenseIndexProvenance> {
    match &self.license_index_provenance {
        Some(provenance) => Some(provenance),
        None => None,
    }
}
/// Test-only access to the SPDX mapping built for this engine.
#[cfg(test)]
pub fn spdx_mapping(&self) -> &SpdxMapping {
    let mapping = &self.spdx_mapping;
    mapping
}
}
/// Looks for `scancode_config.py` in `search_path` and each of its ancestors, and extracts
/// the SPDX license list version from the first one found.
///
/// Returns `Ok(None)` when no such file exists, or when the file found does not contain a
/// parsable version assignment.
///
/// # Errors
///
/// Returns an error when the config file exists but cannot be read as text.
pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
    let config_path = search_path
        .ancestors()
        .map(|dir| dir.join("scancode_config.py"))
        .find(|path| path.is_file());

    match config_path {
        Some(path) => {
            let contents = fs::read_to_string(&path)?;
            Ok(parse_scancode_spdx_license_list_version(&contents))
        }
        None => Ok(None),
    }
}
/// Extracts the value assigned to `spdx_license_list_version` from the text of a ScanCode
/// config file.
///
/// The first line whose left-hand side starts with exactly that identifier wins. Longer
/// identifiers that merely share the prefix (for example `spdx_license_list_version_url`)
/// are skipped. Surrounding whitespace and single or double quotes are stripped from the
/// value. Returns `None` when no such assignment exists.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let (name, value) = line.trim().split_once('=')?;
        let after_key = name.strip_prefix(KEY)?;
        // Require an identifier boundary after the key so that a different, longer
        // variable name is not mistaken for the version assignment.
        let continues_identifier = after_key
            .chars()
            .next()
            .is_some_and(|c| c.is_alphanumeric() || c == '_');
        if continues_identifier {
            return None;
        }
        Some(
            value
                .trim()
                .trim_matches('"')
                .trim_matches('\'')
                .to_string(),
        )
    })
}
#[cfg(test)]
mod tests;