use crate::license_detection::LicenseDetectionEngine;
use crate::parsers::{try_parse_compiled_bytes, try_parse_file};
use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
use crate::utils::text::{
remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
};
use anyhow::Error;
use rayon::prelude::*;
use std::fs::{self, File};
use std::io::{Read, Write};
use std::path::Path;
use std::sync::Arc;
use std::time::{Duration, Instant};
use crate::copyright::{
self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
};
use crate::finder::{self, DetectionConfig};
use crate::license_detection::PositionSet;
use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
use crate::license_detection::query::Query;
use crate::models::{
Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
Match, OutputEmail, OutputURL,
};
use crate::progress::ScanProgress;
use crate::scanner::collect::CollectedPaths;
use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
use crate::utils::file::{
ExtractedTextKind, classify_file_info, extract_text_for_detection, get_creation_date,
};
use crate::utils::generated::generated_code_hints_from_bytes;
use tempfile::TempDir;
/// Begin/end marker pairs consulted by `is_pem_certificate_file`.
///
/// Each tuple is `(begin_marker, end_marker)`; a file is treated as a PEM
/// certificate when its first non-empty line equals the begin marker and the
/// matching end marker appears among the inspected lines.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
(
"-----BEGIN TRUSTED CERTIFICATE-----",
"-----END TRUSTED CERTIFICATE-----",
),
];
/// Scans every collected file in parallel, then appends one entry per
/// collected directory.
///
/// Each file is handed to `process_file`; once it returns, the shared
/// progress tracker is notified with the path, the file size and the scan
/// errors recorded for that file. Directory entries are built afterwards by
/// `process_directory` and appended after all file entries.
///
/// Returns the combined entries together with `collected.excluded_count`.
pub fn process_collected(
    collected: &CollectedPaths,
    progress: Arc<ScanProgress>,
    license_engine: Option<Arc<LicenseDetectionEngine>>,
    license_options: LicenseScanOptions,
    text_options: &TextDetectionOptions,
) -> ProcessResult {
    let license_enabled = license_engine.is_some();

    let mut files: Vec<FileInfo> = collected
        .files
        .par_iter()
        .map(|entry| {
            let (path, metadata) = entry;
            let scanned = process_file(
                path,
                metadata,
                progress.as_ref(),
                license_engine.clone(),
                license_options,
                text_options,
            );
            progress.file_completed(path, metadata.len(), &scanned.scan_errors);
            scanned
        })
        .collect();

    files.extend(collected.directories.iter().map(|(path, metadata)| {
        process_directory(path, metadata, text_options.collect_info, license_enabled)
    }));

    ProcessResult {
        files,
        excluded_count: collected.excluded_count,
    }
}
/// Variant of `process_collected` that bounds how many `FileInfo` entries are
/// kept in memory while scanning.
///
/// `max_in_memory` is interpreted as follows:
/// * `0`  – no limit; delegates to `process_collected`.
/// * `<0` – nothing is retained in memory: files are scanned in chunks of 256
///   and every result is spilled to disk.
/// * `>0` – at most that many entries are retained; files are scanned in
///   chunks of that size and the overflow is spilled to disk.
///
/// Spilled entries are loaded back at the end and appended after the retained
/// ones, so the returned order may differ from `process_collected`.
pub fn process_collected_with_memory_limit(
    collected: &CollectedPaths,
    progress: Arc<ScanProgress>,
    license_engine: Option<Arc<LicenseDetectionEngine>>,
    license_options: LicenseScanOptions,
    text_options: &TextDetectionOptions,
    max_in_memory: i64,
) -> ProcessResult {
    if max_in_memory == 0 {
        return process_collected(
            collected,
            progress,
            license_engine,
            license_options,
            text_options,
        );
    }

    // A negative limit means "retain nothing" while still scanning in
    // fixed-size batches.
    let (memory_limit, chunk_size) = if max_in_memory < 0 {
        (0_usize, 256_usize)
    } else {
        let limit = max_in_memory as usize;
        (limit, limit.max(1))
    };

    let mut retained_files = Vec::new();
    let mut spill_store = None;

    for batch in collected.files.chunks(chunk_size) {
        let scanned: Vec<FileInfo> = batch
            .par_iter()
            .map(|entry| {
                let (path, metadata) = entry;
                let info = process_file(
                    path,
                    metadata,
                    progress.as_ref(),
                    license_engine.clone(),
                    license_options,
                    text_options,
                );
                progress.file_completed(path, metadata.len(), &info.scan_errors);
                info
            })
            .collect();
        retain_or_spill_chunk(scanned, &mut retained_files, &mut spill_store, memory_limit);
    }

    let license_enabled = license_engine.is_some();
    for (path, metadata) in &collected.directories {
        let directory_entry =
            process_directory(path, metadata, text_options.collect_info, license_enabled);
        retain_or_spill_chunk(
            vec![directory_entry],
            &mut retained_files,
            &mut spill_store,
            memory_limit,
        );
    }

    if let Some(store) = spill_store {
        retained_files.extend(store.load_all());
    }

    ProcessResult {
        files: retained_files,
        excluded_count: collected.excluded_count,
    }
}
/// Distributes a freshly processed chunk between the in-memory list and the
/// on-disk spill store.
///
/// * `memory_limit == 0`: the whole chunk is spilled (the store is created on
///   first use).
/// * otherwise: as many entries as still fit under `memory_limit` are moved
///   into `retained_files`; any remainder is spilled. The spill store is only
///   created when there is something to spill.
fn retain_or_spill_chunk(
    chunk: Vec<FileInfo>,
    retained_files: &mut Vec<FileInfo>,
    spill_store: &mut Option<FileInfoSpillStore>,
    memory_limit: usize,
) {
    if memory_limit == 0 {
        spill_store
            .get_or_insert_with(FileInfoSpillStore::new)
            .spill(chunk);
        return;
    }

    let free_slots = memory_limit.saturating_sub(retained_files.len());

    // Split the chunk at the number of free slots: the head stays in memory,
    // the tail (possibly empty) goes to disk.
    let mut kept = chunk;
    let overflow = kept.split_off(free_slots.min(kept.len()));
    retained_files.extend(kept);

    if overflow.is_empty() {
        return;
    }
    spill_store
        .get_or_insert_with(FileInfoSpillStore::new)
        .spill(overflow);
}
/// Temporary on-disk store for batches of `FileInfo` that do not fit under
/// the in-memory limit.
///
/// Every batch is written as zstd-compressed JSON into its own file inside a
/// temporary directory owned by the store.
struct FileInfoSpillStore {
    /// Directory holding the batch files.
    temp_dir: TempDir,
    /// Index used to name the next batch file.
    batch_index: usize,
}

impl FileInfoSpillStore {
    /// Creates an empty store backed by a fresh temporary directory.
    ///
    /// # Panics
    /// Panics if the temporary directory cannot be created.
    fn new() -> Self {
        let temp_dir = TempDir::new().expect("create spill dir");
        Self {
            temp_dir,
            batch_index: 0,
        }
    }

    /// Serializes `files` to JSON, compresses it with zstd (level 3) and
    /// writes it as the next numbered batch file.
    ///
    /// # Panics
    /// Panics if encoding, file creation or writing fails.
    fn spill(&mut self, files: Vec<FileInfo>) {
        let file_name = format!("batch-{:06}.json.zst", self.batch_index);
        let batch_path = self.temp_dir.path().join(file_name);
        self.batch_index += 1;

        let encoded = serde_json::to_vec(&files).expect("encode spilled file batch");
        let sink = File::create(batch_path).expect("create spill batch file");
        let mut compressor = zstd::Encoder::new(sink, 3).expect("create spill encoder");
        compressor
            .write_all(&encoded)
            .expect("write spilled file batch");
        compressor.finish().expect("finish spill encoder");
    }

    /// Reads every batch file back, in sorted path order, and returns the
    /// concatenated entries. Directory entries that fail to be listed are
    /// skipped.
    ///
    /// # Panics
    /// Panics if the directory cannot be read or a batch cannot be opened,
    /// decompressed or decoded.
    fn load_all(self) -> Vec<FileInfo> {
        let listing = fs::read_dir(self.temp_dir.path()).expect("read spill dir");
        let mut batch_paths: Vec<_> = listing
            .filter_map(|entry| entry.ok().map(|entry| entry.path()))
            .collect();
        batch_paths.sort();

        let mut restored = Vec::new();
        for batch_path in batch_paths {
            let source = File::open(batch_path).expect("open spill batch");
            let mut decompressor = zstd::Decoder::new(source).expect("create spill decoder");
            let mut raw = Vec::new();
            decompressor
                .read_to_end(&mut raw)
                .expect("read spill batch");
            let batch: Vec<FileInfo> =
                serde_json::from_slice(&raw).expect("decode spilled file batch");
            restored.extend(batch);
        }
        restored
    }
}
/// Scans a single file and assembles its `FileInfo` record.
///
/// Content-derived data (hashes, classification, packages, copyrights,
/// e-mails/URLs, licenses) is gathered by `extract_information_from_content`;
/// any error it returns is recorded in `scan_errors` instead of aborting the
/// scan. A timeout error is appended when the overall processing time exceeds
/// `text_options.timeout_seconds`.
///
/// Path-derived fields (`name`, `base_name`, `extension`, `path`) fall back to
/// empty strings when the path has no corresponding component, so a path with
/// no final component (for example one ending in `..`) no longer panics.
///
/// # Panics
/// Panics if the `FileInfoBuilder` rejects the assembled fields.
fn process_file(
    path: &Path,
    metadata: &fs::Metadata,
    progress: &ScanProgress,
    license_engine: Option<Arc<LicenseDetectionEngine>>,
    license_options: LicenseScanOptions,
    text_options: &TextDetectionOptions,
) -> FileInfo {
    let mut scan_errors: Vec<String> = Vec::new();
    let mut file_info_builder = FileInfoBuilder::default();
    let license_enabled = license_engine.is_some();
    let started = Instant::now();
    let mut generated_flag = None;
    let mut is_source_file = false;

    match extract_information_from_content(
        &mut file_info_builder,
        &mut scan_errors,
        path,
        progress,
        license_engine,
        license_options,
        text_options,
    ) {
        // The SHA-256 is already stored on the builder when info collection
        // is enabled; it is not needed again here.
        Ok((is_generated, _sha256, is_source)) => {
            generated_flag = is_generated;
            is_source_file = is_source;
        }
        Err(e) => scan_errors.push(e.to_string()),
    }

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        scan_errors.push(format!(
            "Processing interrupted due to timeout after {:.2} seconds",
            text_options.timeout_seconds
        ));
    }

    let mut file_info = file_info_builder
        .name(
            // `file_name()` is `None` for paths such as `..`; fall back to an
            // empty name like `process_directory` does instead of panicking.
            path.file_name()
                .unwrap_or_default()
                .to_string_lossy()
                .to_string(),
        )
        .base_name(
            path.file_stem()
                .unwrap_or_default()
                .to_string_lossy()
                .to_string(),
        )
        .extension(
            path.extension()
                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
        )
        .path(path.to_string_lossy().to_string())
        .file_type(FileType::File)
        .size(metadata.len())
        .date(
            text_options
                .collect_info
                .then(|| get_creation_date(metadata))
                .flatten(),
        )
        .scan_errors(scan_errors)
        .build()
        .expect("FileInformationBuild not completely initialized");

    if text_options.collect_info {
        file_info.is_source = Some(is_source_file);
    }
    // Go test files and files guarded by a `test` build tag are not counted
    // as production source.
    if file_info.programming_language.as_deref() == Some("Go")
        && is_go_non_production_source(path).unwrap_or(false)
    {
        file_info.is_source = Some(false);
    }
    if text_options.detect_generated {
        file_info.is_generated = Some(generated_flag.unwrap_or(false));
    }
    // With license scanning enabled the percentage is always reported, even
    // when no license text was found.
    if file_info.percentage_of_license_text.is_none() && license_enabled {
        file_info.percentage_of_license_text = Some(0.0);
    }
    file_info
}
/// Reads the file at `path` and runs the enabled content-based detections,
/// recording their results on `file_info_builder`.
///
/// Stages, in order: hashing and classification (stored only when
/// `text_options.collect_info` is set), package parsing, text extraction,
/// copyright detection, e-mail/URL detection and license detection. Timeout
/// checks against `text_options.timeout_seconds` are made between stages.
///
/// Returns `(is_generated, sha256, is_source)` where `is_generated` is `None`
/// unless `text_options.detect_generated` is set.
///
/// # Errors
/// Returns an error if the file cannot be read, if a timeout check between
/// stages fails, or if `extract_license_information` returns an error.
fn extract_information_from_content(
file_info_builder: &mut FileInfoBuilder,
scan_errors: &mut Vec<String>,
path: &Path,
progress: &ScanProgress,
license_engine: Option<Arc<LicenseDetectionEngine>>,
license_options: LicenseScanOptions,
text_options: &TextDetectionOptions,
) -> Result<(Option<bool>, String, bool), Error> {
// Function-wide timer used by every timeout check below.
let started = Instant::now();
let buffer = fs::read(path)?;
let license_enabled = license_engine.is_some();
if is_timeout_exceeded(started, text_options.timeout_seconds) {
return Err(Error::msg(format!(
"Timeout while reading file content (> {:.2}s)",
text_options.timeout_seconds
)));
}
let sha256 = calculate_sha256(&buffer);
// `None` when generated-code detection is disabled.
let is_generated = text_options
.detect_generated
.then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
let classification = classify_file_info(path, &buffer);
if text_options.collect_info {
file_info_builder
.sha1(Some(calculate_sha1(&buffer)))
.md5(Some(calculate_md5(&buffer)))
.sha256(Some(sha256.clone()))
.programming_language(classification.programming_language.clone())
.mime_type(Some(classification.mime_type.clone()))
.file_type_label(Some(classification.file_type.clone()))
.sha1_git(Some(calculate_sha1_git(&buffer)))
.is_binary(Some(classification.is_binary))
.is_text(Some(classification.is_text))
.is_archive(Some(classification.is_archive))
.is_media(Some(classification.is_media))
.is_source(Some(classification.is_source))
.is_script(Some(classification.is_script))
.files_count(Some(0))
.dirs_count(Some(0))
.size_count(Some(0));
}
// Files excluded from text detection (currently PEM certificates) stop here;
// hashes and classification have already been recorded above.
if should_skip_text_detection(path, &buffer) {
return Ok((is_generated, sha256, classification.is_source));
}
if text_options.detect_packages {
// Shadows the outer timer inside this block only; used for the package timing metric.
let started = Instant::now();
// Try the path-based parser first; fall back to compiled-binary parsing when enabled.
let parse_result = try_parse_file(path).or_else(|| {
text_options
.detect_packages_in_compiled
.then(|| try_parse_compiled_bytes(&buffer))
.flatten()
});
if let Some(parse_result) = parse_result {
// Keep only packages whose category (compiled / system / application) is enabled.
let packages = parse_result
.packages
.into_iter()
.filter(|package| {
let is_compiled_package = package
.datasource_id
.as_ref()
.is_some_and(is_compiled_datasource);
let is_system_package = package
.datasource_id
.as_ref()
.is_some_and(is_system_datasource);
if is_compiled_package {
text_options.detect_packages_in_compiled
} else if is_system_package {
text_options.detect_system_packages
} else {
text_options.detect_application_packages
}
})
.collect();
file_info_builder.package_data(packages);
scan_errors.extend(parse_result.scan_errors);
}
progress.record_detail_timing("scan:packages", started.elapsed().as_secs_f64());
}
// Back to the function-wide timer from here on.
if is_timeout_exceeded(started, text_options.timeout_seconds) {
return Err(Error::msg(format!(
"Timeout while extracting package/text metadata (> {:.2}s)",
text_options.timeout_seconds
)));
}
let (text_content, text_kind) = extract_text_for_detection(path, &buffer);
// Text recovered from binary strings gets stricter filtering in the detectors below.
let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
if is_timeout_exceeded(started, text_options.timeout_seconds) {
return Err(Error::msg(format!(
"Timeout while extracting text content (> {:.2}s)",
text_options.timeout_seconds
)));
}
// Nothing to scan for copyrights, e-mails, URLs or licenses.
if text_content.is_empty() {
return Ok((is_generated, sha256, classification.is_source));
}
if text_options.detect_copyrights {
extract_copyright_information(
file_info_builder,
path,
&text_content,
text_options.timeout_seconds,
from_binary_strings,
);
}
extract_email_url_information(
file_info_builder,
&text_content,
text_options,
from_binary_strings,
);
if is_timeout_exceeded(started, text_options.timeout_seconds) {
return Err(Error::msg(format!(
"Timeout before license scan (> {:.2}s)",
text_options.timeout_seconds
)));
}
// Pick the text handed to license detection: embedded sourcemap content when
// available, escape-stripped text where applicable, otherwise the text as extracted.
let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
if let Some(sourcemap_content) =
crate::utils::sourcemap::extract_sourcemap_content(&text_content)
{
sourcemap_content
} else {
text_content
}
} else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
remove_verbatim_escape_sequences(&text_content)
} else {
text_content
};
// Both branches make the same call; only the enabled branch records timing.
// With no engine, `extract_license_information` returns immediately.
if license_enabled {
let started = Instant::now();
extract_license_information(
file_info_builder,
scan_errors,
path,
text_content_for_license_detection,
license_engine,
license_options,
from_binary_strings,
)?;
progress.record_detail_timing("scan:licenses", started.elapsed().as_secs_f64());
} else {
extract_license_information(
file_info_builder,
scan_errors,
path,
text_content_for_license_detection,
license_engine,
license_options,
from_binary_strings,
)?;
}
Ok((is_generated, sha256, classification.is_source))
}
/// Reports whether more than `timeout_seconds` have elapsed since `started`.
///
/// A timeout that is not finite (NaN or infinite) or not strictly positive
/// disables the check, in which case this always returns `false`.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !timeout_seconds.is_finite() || timeout_seconds <= 0.0 {
        return false;
    }
    let elapsed_seconds = started.elapsed().as_secs_f64();
    elapsed_seconds > timeout_seconds
}
/// Returns `true` for datasource ids listed here as installed system-package
/// databases (Alpine, Debian, FreeBSD and RPM variants).
fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
    match datasource_id {
        DatasourceId::AlpineInstalledDb
        | DatasourceId::DebianDistrolessInstalledDb
        | DatasourceId::DebianInstalledFilesList
        | DatasourceId::DebianInstalledMd5Sums
        | DatasourceId::DebianInstalledStatusDb
        | DatasourceId::FreebsdCompactManifest
        | DatasourceId::RpmInstalledDatabaseBdb
        | DatasourceId::RpmInstalledDatabaseNdb
        | DatasourceId::RpmInstalledDatabaseSqlite
        | DatasourceId::RpmYumdb => true,
        _ => false,
    }
}
/// Returns `true` for datasource ids that come from compiled binaries
/// (Go or Rust binaries).
fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
    match datasource_id {
        DatasourceId::GoBinary | DatasourceId::RustBinary => true,
        _ => false,
    }
}
/// Detects copyrights, holders and authors in `text_content` and stores them
/// on `file_info_builder`.
///
/// * Credits files (per `copyright::is_credits_file`) are first run through
///   the dedicated credits-author detector; if it finds any authors, only
///   those are recorded and regular detection is skipped.
/// * `timeout_seconds` bounds the detector's runtime when it is finite and
///   strictly positive; otherwise no runtime limit is passed.
/// * When the text came from binary strings, detections are pruned with
///   `prune_binary_string_detections` before being stored.
fn extract_copyright_information(
    file_info_builder: &mut FileInfoBuilder,
    path: &Path,
    text_content: &str,
    timeout_seconds: f64,
    from_binary_strings: bool,
) {
    if copyright::is_credits_file(path) {
        let author_detections = copyright::detect_credits_authors(text_content);
        if !author_detections.is_empty() {
            file_info_builder.authors(
                author_detections
                    .into_iter()
                    .map(|a| Author {
                        author: a.author,
                        start_line: a.start_line,
                        end_line: a.end_line,
                    })
                    .collect(),
            );
            return;
        }
    }

    let copyright_options = CopyrightDetectionOptions {
        max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
            Some(Duration::from_secs_f64(timeout_seconds))
        } else {
            None
        },
        ..CopyrightDetectionOptions::default()
    };

    // The options are passed by reference; this argument had been corrupted
    // into a stray `©` character, which is not valid Rust.
    let (copyrights, holders, authors) =
        copyright::detect_copyrights_with_options(text_content, &copyright_options);

    let (copyrights, holders, authors) = if from_binary_strings {
        prune_binary_string_detections(copyrights, holders, authors)
    } else {
        (copyrights, holders, authors)
    };

    file_info_builder.copyrights(
        copyrights
            .into_iter()
            .map(|c| Copyright {
                copyright: c.copyright,
                start_line: c.start_line,
                end_line: c.end_line,
            })
            .collect::<Vec<Copyright>>(),
    );
    file_info_builder.holders(
        holders
            .into_iter()
            .map(|h| Holder {
                holder: h.holder,
                start_line: h.start_line,
                end_line: h.end_line,
            })
            .collect::<Vec<Holder>>(),
    );
    file_info_builder.authors(
        authors
            .into_iter()
            .map(|a| Author {
                author: a.author,
                start_line: a.start_line,
                end_line: a.end_line,
            })
            .collect::<Vec<Author>>(),
    );
}
/// Filters detections that came from binary-string text.
///
/// Copyrights are kept only if they pass
/// `is_binary_string_copyright_candidate`; holders are kept only if their line
/// range overlaps a kept copyright; authors are always discarded.
fn prune_binary_string_detections(
    copyrights: Vec<CopyrightDetection>,
    holders: Vec<HolderDetection>,
    _authors: Vec<AuthorDetection>,
) -> (
    Vec<CopyrightDetection>,
    Vec<HolderDetection>,
    Vec<AuthorDetection>,
) {
    let mut kept_copyrights = copyrights;
    kept_copyrights.retain(|candidate| is_binary_string_copyright_candidate(&candidate.copyright));

    let mut kept_holders = holders;
    kept_holders.retain(|holder| {
        kept_copyrights.iter().any(|kept| {
            ranges_overlap(
                holder.start_line,
                holder.end_line,
                kept.start_line,
                kept.end_line,
            )
        })
    });

    (kept_copyrights, kept_holders, Vec::new())
}
/// Returns `true` when the inclusive ranges `[a_start, a_end]` and
/// `[b_start, b_end]` share at least one value.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    // Two ranges are disjoint exactly when one ends before the other starts.
    let a_before_b = a_end < b_start;
    let b_before_a = b_end < a_start;
    !(a_before_b || b_before_a)
}
/// Heuristic deciding whether a copyright statement recovered from binary
/// strings is plausible enough to keep.
///
/// Accepted when the text contains a year-like number. Otherwise the text
/// after an optional leading `copyright` (ASCII case-insensitive) is examined:
/// it is rejected when empty, mostly non-alphabetic or containing many `@`
/// characters. With at most one word-like token it needs both an explicit
/// copyright marker and a company-like suffix; with more tokens it is accepted
/// when it has a list separator (`,`, ` and `, `&`), a company-like suffix, or
/// at least two tokens containing three or more letters.
fn is_binary_string_copyright_candidate(text: &str) -> bool {
    if contains_year(text) {
        return true;
    }

    let lowered = text.to_ascii_lowercase();
    let remainder = lowered
        .strip_prefix("copyright")
        .unwrap_or(lowered.as_str())
        .trim();

    let looks_like_noise = remainder.is_empty()
        || !has_sufficient_alphabetic_content(remainder)
        || has_excessive_at_noise(remainder);
    if looks_like_noise {
        return false;
    }

    // Tokens that contain at least one letter.
    let wordy_tokens: Vec<&str> = remainder
        .split_whitespace()
        .filter(|token| token.chars().any(char::is_alphabetic))
        .collect();

    let is_company_token = |token: &str| {
        is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric()))
    };
    let has_company_suffix = wordy_tokens.iter().any(|token| is_company_token(*token));

    if wordy_tokens.len() <= 1 {
        return has_explicit_copyright_marker(text) && has_company_suffix;
    }

    let has_list_separator =
        remainder.contains(',') || remainder.contains(" and ") || remainder.contains('&');
    if has_list_separator || has_company_suffix {
        return true;
    }

    let substantial_tokens = wordy_tokens
        .iter()
        .filter(|token| token.chars().filter(|c| c.is_alphabetic()).count() >= 3)
        .count();
    substantial_tokens >= 2
}
/// Returns `true` when `text` has at least one ASCII alphanumeric character
/// and at least half of its ASCII alphanumerics are letters.
fn has_sufficient_alphabetic_content(text: &str) -> bool {
    // Count letters and alphanumerics in a single pass.
    let (letters, alphanumerics) =
        text.chars()
            .fold((0_usize, 0_usize), |(letters, alphanumerics), c| {
                if c.is_ascii_alphabetic() {
                    (letters + 1, alphanumerics + 1)
                } else if c.is_ascii_digit() {
                    (letters, alphanumerics + 1)
                } else {
                    (letters, alphanumerics)
                }
            });
    alphanumerics != 0 && letters * 2 >= alphanumerics
}
/// Returns `true` when `text` contains three or more `@` characters.
fn has_excessive_at_noise(text: &str) -> bool {
    let at_signs = text.matches('@').count();
    at_signs >= 3
}
/// Returns `true` when `text` contains `(c)`, the `©` sign or `copr`
/// (letters compared ASCII case-insensitively).
fn has_explicit_copyright_marker(text: &str) -> bool {
    const MARKERS: [&str; 3] = ["(c)", "©", "copr"];
    let lowered = text.to_ascii_lowercase();
    MARKERS.iter().any(|marker| lowered.contains(*marker))
}
/// Returns `true` when `text` contains four consecutive ASCII digits whose
/// first digit is `1` or `2` and whose second digit is `9` or `0`
/// (for example `1999` or `2024`; note that `10xx` and `29xx` also match).
fn contains_year(text: &str) -> bool {
    for window in text.as_bytes().windows(4) {
        let all_digits = window.iter().all(u8::is_ascii_digit);
        let plausible_millennium = window[0] == b'1' || window[0] == b'2';
        let plausible_century = window[1] == b'9' || window[1] == b'0';
        if all_digits && plausible_millennium && plausible_century {
            return true;
        }
    }
    false
}
/// Returns `true` when `token` (compared ASCII case-insensitively) is one of
/// the known organisation-style words such as `inc`, `ltd` or `foundation`.
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_SUFFIXES: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];
    let normalized = token.to_ascii_lowercase();
    COMPANY_SUFFIXES.contains(&normalized.as_str())
}
/// Runs e-mail and URL detection over `text_content`, each only when enabled
/// in `text_options`, and stores the results on `file_info_builder`.
///
/// E-mails are searched with `unique: false`, URLs with `unique: true`. When
/// the text came from binary strings, hits are additionally filtered through
/// the binary-string candidate heuristics.
fn extract_email_url_information(
    file_info_builder: &mut FileInfoBuilder,
    text_content: &str,
    text_options: &TextDetectionOptions,
    from_binary_strings: bool,
) {
    if text_options.detect_emails {
        let email_config = DetectionConfig {
            max_emails: text_options.max_emails,
            max_urls: text_options.max_urls,
            unique: false,
        };
        let mut emails = Vec::new();
        for found in finder::find_emails(text_content, &email_config) {
            if from_binary_strings && !is_binary_string_email_candidate(&found.email) {
                continue;
            }
            emails.push(OutputEmail {
                email: found.email,
                start_line: found.start_line,
                end_line: found.end_line,
            });
        }
        file_info_builder.emails(emails);
    }

    if text_options.detect_urls {
        let url_config = DetectionConfig {
            max_emails: text_options.max_emails,
            max_urls: text_options.max_urls,
            unique: true,
        };
        let mut urls = Vec::new();
        for found in finder::find_urls(text_content, &url_config) {
            if from_binary_strings && !is_binary_string_url_candidate(&found.url) {
                continue;
            }
            urls.push(OutputURL {
                url: found.url,
                start_line: found.start_line,
                end_line: found.end_line,
            });
        }
        file_info_builder.urls(urls);
    }
}
/// Decides whether an e-mail address found in binary strings is plausible:
/// it must split at its last `@` into a local part and a domain that both
/// pass the respective "strong shape" checks.
fn is_binary_string_email_candidate(email: &str) -> bool {
    match email.rsplit_once('@') {
        Some((local, domain)) => {
            has_strong_binary_local_part(local) && has_strong_binary_host_shape(domain)
        }
        None => false,
    }
}
/// Decides whether a URL found in binary strings is plausible: it must parse,
/// have a host with a strong shape, and carry meaningful context (see
/// `has_meaningful_binary_url_context`).
fn is_binary_string_url_candidate(url: &str) -> bool {
    let Ok(parsed) = url::Url::parse(url) else {
        return false;
    };
    match parsed.host_str() {
        Some(host) => {
            has_strong_binary_host_shape(host) && has_meaningful_binary_url_context(&parsed)
        }
        None => false,
    }
}
/// Returns `true` when a parsed URL carries enough context to be kept:
///
/// * a non-root path with a segment of at least three bytes that contains an
///   ASCII letter, or
/// * a query string or fragment, or
/// * a host starting with the `www` label, or
/// * a non-final host label containing `-` and at least four ASCII letters.
fn has_meaningful_binary_url_context(parsed: &url::Url) -> bool {
    let path = parsed.path();
    let has_wordy_segment = path != "/"
        && path.split('/').any(|segment| {
            segment.chars().any(|c| c.is_ascii_alphabetic()) && segment.len() >= 3
        });
    if has_wordy_segment || parsed.query().is_some() || parsed.fragment().is_some() {
        return true;
    }

    let host = match parsed.host_str() {
        Some(host) => host,
        None => return false,
    };
    let labels: Vec<&str> = host.split('.').collect();
    if labels.first() == Some(&"www") {
        return true;
    }

    // Every label except the last one (the top-level domain).
    let leading_labels = &labels[..labels.len().saturating_sub(1)];
    leading_labels.iter().any(|label| {
        let letters = label.chars().filter(|c| c.is_ascii_alphabetic()).count();
        label.contains('-') && letters >= 4
    })
}
/// Returns `true` when `local` contains a run of at least three consecutive
/// ASCII letters.
fn has_strong_binary_local_part(local: &str) -> bool {
    let mut run_length = 0_usize;
    for c in local.chars() {
        if c.is_ascii_alphabetic() {
            run_length += 1;
            if run_length >= 3 {
                return true;
            }
        } else {
            run_length = 0;
        }
    }
    false
}
/// Returns `true` when `host` looks like a real host name: after dropping a
/// leading `www` or `ftp` label, at least two labels remain and one of the
/// non-final labels is at least three bytes long with three or more ASCII
/// letters.
fn has_strong_binary_host_shape(host: &str) -> bool {
    let labels: Vec<&str> = host.split('.').collect();

    // Ignore a leading service label when judging the shape.
    let relevant: &[&str] = match labels.split_first() {
        Some((first, rest)) if *first == "www" || *first == "ftp" => rest,
        _ => &labels,
    };
    if relevant.len() < 2 {
        return false;
    }

    let Some((_last_label, leading_labels)) = relevant.split_last() else {
        return false;
    };
    leading_labels.iter().any(|label| {
        let letters = label.chars().filter(|c| c.is_ascii_alphabetic()).count();
        label.len() >= 3 && letters >= 3
    })
}
fn extract_license_information(
file_info_builder: &mut FileInfoBuilder,
scan_errors: &mut Vec<String>,
path: &Path,
text_content: String,
license_engine: Option<Arc<LicenseDetectionEngine>>,
license_options: LicenseScanOptions,
from_binary_strings: bool,
) -> Result<(), Error> {
let Some(engine) = license_engine else {
return Ok(());
};
let detection_result = if license_options.min_score == 0 {
engine.detect_with_kind_and_source(
&text_content,
license_options.unknown_licenses,
from_binary_strings,
&path.to_string_lossy(),
)
} else {
engine.detect_with_kind_and_source_with_score(
&text_content,
license_options.unknown_licenses,
from_binary_strings,
&path.to_string_lossy(),
license_options.min_score as f32,
)
};
match detection_result {
Ok(detections) => {
let query =
Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
let mut model_detections = Vec::new();
let mut model_clues = Vec::new();
for detection in &detections {
let (public_detection, clue_matches) = convert_detection_to_model(
detection,
license_options,
&text_content,
query.as_ref(),
);
if let Some(public_detection) = public_detection {
model_detections.push(public_detection);
}
model_clues.extend(clue_matches);
}
if !model_detections.is_empty() {
let expressions: Vec<String> = model_detections
.iter()
.filter(|d| !d.license_expression_spdx.is_empty())
.map(|d| d.license_expression_spdx.clone())
.collect();
if !expressions.is_empty() {
let combined = crate::utils::spdx::combine_license_expressions(expressions);
if let Some(expr) = combined {
file_info_builder.license_expression(Some(expr));
}
}
}
file_info_builder.license_detections(model_detections);
file_info_builder.license_clues(model_clues);
file_info_builder.percentage_of_license_text(
query
.as_ref()
.map(|query| compute_percentage_of_license_text(query, &detections)),
);
}
Err(e) => {
scan_errors.push(format!("License detection failed: {}", e));
}
}
Ok(())
}
/// Converts an internal license detection into its output form.
///
/// All matches are converted first. If the detection has a license
/// expression, a public `LicenseDetection` is returned with an empty clue
/// list; otherwise no detection is returned and the converted matches are
/// handed back as license clues. The detection log is only included when
/// `license_options.include_diagnostics` is set.
fn convert_detection_to_model(
    detection: &crate::license_detection::LicenseDetection,
    license_options: LicenseScanOptions,
    text_content: &str,
    query: Option<&Query<'_>>,
) -> (Option<LicenseDetection>, Vec<Match>) {
    let mut matches: Vec<Match> = Vec::with_capacity(detection.matches.len());
    for internal_match in detection.matches.iter() {
        matches.push(convert_match_to_model(
            internal_match,
            license_options,
            text_content,
            query,
        ));
    }

    match detection.license_expression.clone() {
        None => (None, matches),
        Some(license_expression) => {
            let detection_log = if license_options.include_diagnostics {
                detection.detection_log.clone()
            } else {
                Vec::new()
            };
            let public_detection = LicenseDetection {
                license_expression,
                license_expression_spdx: detection
                    .license_expression_spdx
                    .clone()
                    .unwrap_or_default(),
                matches,
                detection_log,
                identifier: detection.identifier.clone(),
            };
            (Some(public_detection), Vec::new())
        }
    }
}
/// Converts an internal license match into the output `Match` record.
///
/// * Score and coverage are rounded to two decimal places.
/// * An empty rule URL becomes `None`.
/// * Matched text is included only with `license_options.include_text`; when
///   the match carries none, it is taken from `text_content` by line range.
/// * Text diagnostics are included only with
///   `license_options.include_text_diagnostics` and when a query is available.
fn convert_match_to_model(
    m: &crate::license_detection::models::LicenseMatch,
    license_options: LicenseScanOptions,
    text_content: &str,
    query: Option<&Query<'_>>,
) -> Match {
    /// Rounds a metric to two decimal places.
    fn round_to_hundredths(value: f32) -> f64 {
        ((value as f64) * 100.0).round() / 100.0
    }

    let rule_url = (!m.rule_url.is_empty()).then(|| m.rule_url.clone());

    let matched_text = license_options.include_text.then(|| {
        m.matched_text.clone().unwrap_or_else(|| {
            crate::license_detection::query::matched_text_from_text(
                text_content,
                m.start_line,
                m.end_line,
            )
        })
    });

    let matched_text_diagnostics = match (license_options.include_text_diagnostics, query) {
        (true, Some(query)) => Some(matched_text_diagnostics_from_match(query, m)),
        _ => None,
    };

    Match {
        license_expression: m.license_expression.clone(),
        license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
        from_file: m.from_file.clone(),
        start_line: m.start_line,
        end_line: m.end_line,
        matcher: Some(m.matcher.to_string()),
        score: round_to_hundredths(m.score),
        matched_length: Some(m.matched_length),
        match_coverage: Some(round_to_hundredths(m.coverage())),
        rule_relevance: Some(m.rule_relevance as usize),
        rule_identifier: Some(m.rule_identifier.clone()),
        rule_url,
        matched_text,
        referenced_filenames: m.referenced_filenames.clone(),
        matched_text_diagnostics,
    }
}
/// Computes which percentage of the query is covered by license matches,
/// rounded to two decimal places.
///
/// The numerator is the number of distinct query positions covered by any
/// match; the denominator is the number of query tokens plus the summed
/// counts in `unknowns_by_pos`. Returns `0.0` when the denominator is zero.
fn compute_percentage_of_license_text(
    query: &Query<'_>,
    detections: &[crate::license_detection::LicenseDetection],
) -> f64 {
    let mut covered_positions: std::collections::HashSet<usize> =
        std::collections::HashSet::new();
    for detection in detections {
        for license_match in detection.matches.iter() {
            for position in license_match.query_span().iter() {
                covered_positions.insert(position);
            }
        }
    }

    let unknown_total = query.unknowns_by_pos.values().sum::<usize>();
    let total_length = query.tokens.len() + unknown_total;
    if total_length == 0 {
        return 0.0;
    }

    let ratio = covered_positions.len() as f64 / total_length as f64;
    let percentage = ratio * 100.0;
    (percentage * 100.0).round() / 100.0
}
/// Produces the diagnostic matched text for a license match.
///
/// When the match covers no query positions, the plain matched text for the
/// match's line range is returned instead.
fn matched_text_diagnostics_from_match(
    query: &Query<'_>,
    license_match: &InternalLicenseMatch,
) -> String {
    let matched_positions: PositionSet = license_match.query_span().iter().collect();

    let bounds = (
        matched_positions.iter().min(),
        matched_positions.iter().max(),
    );
    let (Some(start_pos), Some(end_pos)) = bounds else {
        // No positions at all: fall back to a line-based extraction.
        return crate::license_detection::query::matched_text_from_text(
            &query.text,
            license_match.start_line,
            license_match.end_line,
        );
    };

    crate::license_detection::query::matched_text_diagnostics_from_text(
        &query.text,
        query,
        &matched_positions,
        start_pos,
        end_pos,
        license_match.start_line,
        license_match.end_line,
    )
}
/// Returns `true` for files whose content should not go through text-based
/// detection. Currently this covers PEM certificate files only.
fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
    let is_certificate = is_pem_certificate_file(path, buffer);
    is_certificate
}
/// Reports whether a Go source file is test-only code.
///
/// Returns `Ok(false)` for paths without a `.go` extension and `Ok(true)` for
/// files named `*_test.go` (neither case touches the file system). Otherwise
/// the first ten lines of the file are inspected for a `//go:build` or
/// `// +build` constraint that lists `test` as a standalone token.
///
/// Only the inspected lines are read and decoded, so a large file is not
/// loaded into memory a second time and invalid UTF-8 after a matching line
/// (or beyond the first ten lines) cannot mask a positive detection.
///
/// # Errors
/// Returns an I/O error if the file cannot be opened, or if one of the
/// inspected lines cannot be read or is not valid UTF-8.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }
    if path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"))
    {
        return Ok(true);
    }

    let reader = std::io::BufReader::new(File::open(path)?);
    // Fully qualified call so the `BufRead` trait needs no extra import.
    for line in std::io::BufRead::lines(reader).take(10) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }
    Ok(false)
}
/// Detects PEM certificate files from their content.
///
/// Looks at up to the first 8192 bytes (decoded lossily as UTF-8) and, within
/// them, at the first 64 non-empty trimmed lines. The file is a certificate
/// when the first such line equals a known begin marker and the matching end
/// marker occurs among the inspected lines. The path is not consulted.
fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
    let head = &buffer[..buffer.len().min(8192)];
    let head_text = String::from_utf8_lossy(head);

    let candidate_lines: Vec<&str> = head_text
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .take(64)
        .collect();

    match candidate_lines.first() {
        None => false,
        Some(&first_line) => PEM_CERTIFICATE_HEADERS
            .iter()
            .any(|&(begin, end)| first_line == begin && candidate_lines.contains(&end)),
    }
}
/// Builds the `FileInfo` entry for a directory.
///
/// Only path-derived fields are filled in. The boolean classification flags
/// and the count fields are `Some(false)` / `Some(0)` when `collect_info` is
/// set and `None` otherwise; `percentage_of_license_text` is `Some(0.0)` only
/// when license scanning is enabled. The directory metadata is not used.
fn process_directory(
    path: &Path,
    _metadata: &fs::Metadata,
    collect_info: bool,
    license_enabled: bool,
) -> FileInfo {
    let name = match path.file_name() {
        Some(component) => component.to_string_lossy().into_owned(),
        None => String::new(),
    };
    // Classification flags share one value: unset unless info collection is on.
    let info_flag: Option<bool> = if collect_info { Some(false) } else { None };

    FileInfo {
        base_name: name.clone(),
        name,
        extension: String::new(),
        path: path.to_string_lossy().into_owned(),
        file_type: FileType::Directory,
        mime_type: None,
        file_type_label: None,
        size: 0,
        date: None,
        sha1: None,
        md5: None,
        sha256: None,
        sha1_git: None,
        programming_language: None,
        package_data: Vec::new(),
        license_expression: None,
        license_detections: Vec::new(),
        license_clues: Vec::new(),
        percentage_of_license_text: if license_enabled { Some(0.0) } else { None },
        copyrights: Vec::new(),
        holders: Vec::new(),
        authors: Vec::new(),
        emails: Vec::new(),
        urls: Vec::new(),
        for_packages: Vec::new(),
        scan_errors: Vec::new(),
        license_policy: None,
        is_binary: info_flag,
        is_text: info_flag,
        is_archive: info_flag,
        is_media: info_flag,
        is_source: info_flag,
        is_script: info_flag,
        files_count: collect_info.then_some(0),
        dirs_count: collect_info.then_some(0),
        size_count: collect_info.then_some(0),
        source_count: None,
        is_legal: false,
        is_manifest: false,
        is_readme: false,
        is_top_level: false,
        is_key_file: false,
        is_community: false,
        is_generated: None,
        facets: Vec::new(),
        tallies: None,
    }
}
#[cfg(test)]
mod tests {
use super::{
compute_percentage_of_license_text, convert_detection_to_model,
extract_email_url_information, is_binary_string_copyright_candidate,
is_binary_string_email_candidate, is_binary_string_url_candidate,
is_go_non_production_source,
};
use crate::license_detection::LicenseDetection as InternalLicenseDetection;
use crate::license_detection::index::LicenseIndex;
use crate::license_detection::index::dictionary::TokenDictionary;
use crate::license_detection::models::position_span::PositionSpan;
use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind, RuleKind};
use crate::license_detection::query::Query;
use crate::models::{FileInfoBuilder, FileType};
use crate::scanner::scan_options_fingerprint;
use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
use std::fs;
use tempfile::tempdir;
/// Test fixture: builds an internal `mit` hash match on line 1 with the given
/// `rule_url`, `"MIT"` as matched text and an empty query span.
fn make_internal_match(rule_url: &str) -> LicenseMatch {
LicenseMatch {
rid: 0,
license_expression: "mit".to_string(),
license_expression_spdx: Some("MIT".to_string()),
from_file: None,
start_line: 1,
end_line: 1,
start_token: 0,
end_token: 1,
matcher: MatcherKind::Hash,
score: 1.0,
matched_length: 3,
rule_length: 3,
match_coverage: 100.0,
rule_relevance: 100,
rule_identifier: "mit.LICENSE".to_string(),
rule_url: rule_url.to_string(),
matched_text: Some("MIT".to_string()),
referenced_filenames: None,
rule_kind: RuleKind::Text,
is_from_license: true,
rule_start_token: 0,
// Empty span: individual tests overwrite this when they need real positions.
coordinates: MatchCoordinates::query_region(PositionSpan::empty()),
candidate_resemblance: 0.0,
candidate_containment: 0.0,
}
}
/// Test fixture: wraps a single `make_internal_match(rule_url)` match in an
/// internal `mit` detection with an empty detection log.
fn make_detection(rule_url: &str) -> InternalLicenseDetection {
InternalLicenseDetection {
license_expression: Some("mit".to_string()),
license_expression_spdx: Some("MIT".to_string()),
matches: vec![make_internal_match(rule_url)],
detection_log: vec![],
identifier: Some("mit-test".to_string()),
file_regions: Vec::new(),
}
}
/// Test fixture: builds a `LicenseIndex` whose dictionary is created from
/// `entries` via `TokenDictionary::new_with_legalese`, then sets
/// `len_legalese` on the index.
fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
let dictionary = TokenDictionary::new_with_legalese(entries);
let mut index = LicenseIndex::new(dictionary);
index.len_legalese = len_legalese;
index
}
/// A non-empty rule URL on the internal match must be carried over unchanged
/// to the converted match, and no clues are produced for a detection that has
/// a license expression.
#[test]
fn test_convert_detection_to_model_preserves_rule_url() {
let detection = make_detection(
"https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE",
);
let (converted, clues) =
convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
let converted = converted.expect("detection should convert");
assert_eq!(
converted.matches[0].rule_url.as_deref(),
Some(
"https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE"
)
);
assert!(clues.is_empty());
}
/// An empty rule URL on the internal match must be converted to `None`
/// rather than to an empty string.
#[test]
fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
let detection = make_detection("");
let (converted, clues) =
convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
let converted = converted.expect("detection should convert");
assert_eq!(converted.matches[0].rule_url, None);
assert!(clues.is_empty());
}
/// Score and match coverage must come out rounded to two decimal places.
#[test]
fn test_convert_detection_to_model_rounds_match_coverage() {
let mut detection = make_detection("");
detection.matches[0].score = 81.82;
detection.matches[0].match_coverage = 33.334;
let (converted, clues) =
convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
let converted = converted.expect("detection should convert");
assert_eq!(converted.matches[0].score, 81.82);
assert_eq!(converted.matches[0].match_coverage, Some(33.33));
assert!(clues.is_empty());
}
/// A detection without a license expression must not yield a public
/// detection; its matches are returned as license clues instead, keeping the
/// match's own expression, rule identifier and matched text.
#[test]
fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
let mut detection = make_detection(
"https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
);
// Strip the detection-level expression so the conversion takes the clue path.
detection.license_expression = None;
detection.license_expression_spdx = None;
detection.identifier = None;
detection.matches[0].license_expression = "unknown-license-reference".to_string();
detection.matches[0].license_expression_spdx =
Some("LicenseRef-scancode-unknown-license-reference".to_string());
detection.matches[0].rule_identifier = "license-clue_1.RULE".to_string();
detection.matches[0].rule_kind = RuleKind::Clue;
let (converted, clues) = convert_detection_to_model(
&detection,
LicenseScanOptions {
include_text: true,
min_score: 0,
..LicenseScanOptions::default()
},
"clue text",
None,
);
assert!(converted.is_none());
assert_eq!(clues.len(), 1);
assert_eq!(clues[0].license_expression, "unknown-license-reference");
assert_eq!(
clues[0].license_expression_spdx,
"LicenseRef-scancode-unknown-license-reference"
);
assert_eq!(
clues[0].rule_identifier.as_deref(),
Some("license-clue_1.RULE")
);
// The match's own matched text wins over text extracted from the input.
assert_eq!(clues[0].matched_text.as_deref(), Some("MIT"));
assert_eq!(clues[0].matched_text_diagnostics, None);
}
/// With text, text diagnostics and detection diagnostics all enabled, the
/// converted match must carry the detection log, the matched text, and a
/// diagnostics string that contains bracket markers and differs from the
/// plain matched text.
#[test]
fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
    let notice = concat!(
        "Reproduction and distribution of this file, with or without modification, are\n",
        "permitted in any medium without royalties provided the copyright notice\n",
        "and this notice are preserved. This file is offered as-is, without any warranties.\n",
    );
    let index = create_test_index(
        &[
            ("reproduction", 0),
            ("distribution", 1),
            ("file", 2),
            ("without", 3),
            ("modification", 4),
            ("permitted", 5),
            ("medium", 6),
            ("royalties", 7),
            ("provided", 8),
            ("copyright", 9),
            ("notice", 10),
            ("preserved", 11),
            ("offered", 12),
            ("warranties", 13),
        ],
        14,
    );
    let query = Query::from_extracted_text(notice, &index, false).expect("query should build");
    let token_count = query.tokens.len();
    // Every query token position except position 9 is marked as matched.
    let matched_positions = (0..token_count)
        .filter(|&position| position != 9)
        .collect::<Vec<_>>();

    let mut source = make_detection(
        "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
    );
    source.detection_log = vec![String::from("imperfect-match-coverage")];
    {
        let first_match = &mut source.matches[0];
        first_match.license_expression = String::from("fsf-ap");
        first_match.license_expression_spdx = Some(String::from("FSFAP"));
        first_match.rule_identifier = String::from("fsf-ap.LICENSE");
        first_match.matched_text = None;
        first_match.start_line = 1;
        first_match.end_line = 3;
        first_match.start_token = 0;
        first_match.end_token = token_count;
        first_match.coordinates =
            MatchCoordinates::query_region(PositionSpan::from_positions(matched_positions));
    }
    source.identifier = Some(String::from("fsf_ap-test"));

    let options = LicenseScanOptions {
        include_text: true,
        include_text_diagnostics: true,
        include_diagnostics: true,
        unknown_licenses: false,
        min_score: 0,
    };
    let (model, license_clues) =
        convert_detection_to_model(&source, options, notice, Some(&query));

    let model = model.expect("detection should convert");
    assert!(license_clues.is_empty());
    assert_eq!(model.detection_log, vec!["imperfect-match-coverage"]);

    let expected_text = notice.trim_end();
    let first_match = &model.matches[0];
    assert_eq!(first_match.matched_text.as_deref(), Some(expected_text));

    let diagnostics = first_match
        .matched_text_diagnostics
        .as_deref()
        .expect("diagnostics should be present");
    assert!(diagnostics.contains('['));
    assert!(diagnostics.contains(']'));
    assert_ne!(diagnostics, expected_text);
}
/// With only email and URL detection enabled, the sample text below must
/// yield no emails and no URLs on the built file info.
#[test]
fn test_extract_email_url_information_skips_binary_string_text() {
    let detection_options = TextDetectionOptions {
        detect_emails: true,
        detect_urls: true,
        max_emails: 50,
        max_urls: 50,
        timeout_seconds: 120.0,
        collect_info: false,
        detect_packages: false,
        detect_application_packages: false,
        detect_system_packages: false,
        detect_packages_in_compiled: false,
        detect_copyrights: false,
        detect_generated: false,
    };
    let sample_text = "contact 6h@fo.lwft and visit http://gmail.com/";
    let mut builder = FileInfoBuilder::default();

    // NOTE(review): the trailing `true` presumably marks the text as coming
    // from binary strings; confirm against the function signature.
    extract_email_url_information(&mut builder, sample_text, &detection_options, true);

    let info = builder
        .name("binary.bin".to_string())
        .base_name("binary".to_string())
        .extension(".bin".to_string())
        .path("binary.bin".to_string())
        .file_type(FileType::File)
        .size(1)
        .build()
        .expect("builder should produce file info");
    assert!(info.emails.is_empty(), "emails: {:?}", info.emails);
    assert!(info.urls.is_empty(), "urls: {:?}", info.urls);
}
/// With only email and URL detection enabled, the sample text below must
/// yield exactly one email and one URL, both reported verbatim.
#[test]
fn test_extract_email_url_information_keeps_good_binary_contacts() {
    let detection_options = TextDetectionOptions {
        detect_emails: true,
        detect_urls: true,
        max_emails: 50,
        max_urls: 50,
        timeout_seconds: 120.0,
        collect_info: false,
        detect_packages: false,
        detect_application_packages: false,
        detect_system_packages: false,
        detect_packages_in_compiled: false,
        detect_copyrights: false,
        detect_generated: false,
    };
    let sample_text =
        "report bugs to bug-coreutils@gnu.org and see https://www.gnu.org/software/coreutils/";
    let mut builder = FileInfoBuilder::default();

    // NOTE(review): the trailing `true` presumably marks the text as coming
    // from binary strings; confirm against the function signature.
    extract_email_url_information(&mut builder, sample_text, &detection_options, true);

    let info = builder
        .name("binary.bin".to_string())
        .base_name("binary".to_string())
        .extension(".bin".to_string())
        .path("binary.bin".to_string())
        .file_type(FileType::File)
        .size(1)
        .build()
        .expect("builder should produce file info");

    assert_eq!(info.emails.len(), 1, "emails: {:?}", info.emails);
    assert_eq!(info.emails[0].email, "bug-coreutils@gnu.org");
    assert_eq!(info.urls.len(), 1, "urls: {:?}", info.urls);
    assert_eq!(info.urls[0].url, "https://www.gnu.org/software/coreutils/");
}
/// A `(c)` marker followed by symbol-heavy noise must not be accepted as a
/// copyright candidate.
#[test]
fn test_binary_string_copyright_candidate_rejects_gibberish_holder_text() {
    let accepted = is_binary_string_copyright_candidate(
        "(c) S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9) M0@9s J'@y DH@9Ih@y",
    );
    assert!(!accepted);
}
/// A readable copyright statement must be accepted as a copyright candidate.
#[test]
fn test_binary_string_copyright_candidate_keeps_real_notice() {
    let accepted = is_binary_string_copyright_candidate("Copyright nexB and others (c) 2012");
    assert!(accepted);
}
/// The address `6h@fo.lwft` must be rejected as an email candidate.
#[test]
fn test_binary_string_email_candidate_rejects_gibberish() {
    let accepted = is_binary_string_email_candidate("6h@fo.lwft");
    assert!(!accepted);
}
/// The address `bug-coreutils@gnu.org` must be accepted as an email candidate.
#[test]
fn test_binary_string_email_candidate_keeps_gnu_bug_address() {
    let accepted = is_binary_string_email_candidate("bug-coreutils@gnu.org");
    assert!(accepted);
}
/// The URL `http://ftp.so/` must be rejected as a URL candidate.
#[test]
fn test_binary_string_url_candidate_rejects_short_fake_host() {
    let accepted = is_binary_string_url_candidate("http://ftp.so/");
    assert!(!accepted);
}
/// A GNU project URL with a path component must be accepted as a URL candidate.
#[test]
fn test_binary_string_url_candidate_keeps_gnu_help_url() {
    let help_url = "https://www.gnu.org/software/coreutils/";
    let accepted = is_binary_string_url_candidate(help_url);
    assert!(accepted);
}
/// The URL `http://gmail.com/`, which has no path beyond the root, must be
/// rejected as a URL candidate.
#[test]
fn test_binary_string_url_candidate_rejects_bare_root_domain() {
    let accepted = is_binary_string_url_candidate("http://gmail.com/");
    assert!(!accepted);
}
/// For the three-word text `alpha MIT omega`, where the test index only knows
/// `alpha` and `mit`, a detection covering the single position 1 is expected
/// to report `33.33` percent of license text.
#[test]
fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
    let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
    let query = Query::from_extracted_text("alpha MIT omega", &index, false)
        .expect("query should build");

    let mut source = make_detection("");
    {
        let first_match = &mut source.matches[0];
        first_match.coordinates =
            MatchCoordinates::query_region(PositionSpan::from_positions(vec![1]));
        first_match.start_token = 1;
        first_match.end_token = 2;
    }

    let detections = [source];
    assert_eq!(
        compute_percentage_of_license_text(&query, &detections),
        33.33
    );
}
/// Two license option sets that differ only in `min_score` (0 versus 70) must
/// produce different scan-option fingerprints.
#[test]
fn test_scan_options_fingerprint_changes_with_license_score() {
    let text_options = crate::scanner::TextDetectionOptions::default();
    // Same text options and no third argument; only the minimum score varies.
    let fingerprint_for = |min_score| {
        scan_options_fingerprint(
            &text_options,
            LicenseScanOptions {
                min_score,
                ..LicenseScanOptions::default()
            },
            None,
        )
    };

    let unfiltered = fingerprint_for(0);
    let filtered = fingerprint_for(70);

    assert_ne!(unfiltered, filtered);
}
/// A file named `scanner_test.go` must be classified as non-production Go
/// source even though its content is a plain package declaration.
#[test]
fn test_is_go_non_production_source_for_test_filename() {
    let workspace = tempdir().unwrap();
    let go_file = workspace.path().join("scanner_test.go");
    fs::write(&go_file, "package scanner\n").unwrap();

    let flagged = is_go_non_production_source(&go_file).unwrap();

    assert!(flagged);
}
/// A file named `scanner.go` that starts with a `//go:build test` line must
/// be classified as non-production Go source.
#[test]
fn test_is_go_non_production_source_for_build_tag() {
    let workspace = tempdir().unwrap();
    let go_file = workspace.path().join("scanner.go");
    fs::write(&go_file, "//go:build test\n\npackage scanner\n").unwrap();

    let flagged = is_go_non_production_source(&go_file).unwrap();

    assert!(flagged);
}
/// A file named `scanner.go` containing only a package declaration must not
/// be classified as non-production Go source.
#[test]
fn test_is_go_non_production_source_for_regular_go_file() {
    let workspace = tempdir().unwrap();
    let go_file = workspace.path().join("scanner.go");
    fs::write(&go_file, "package scanner\n").unwrap();

    let flagged = is_go_non_production_source(&go_file).unwrap();

    assert!(!flagged);
}
}