use super::{LicenseFile, LicenseFileKind};
use krates::{Utf8Path as Path, Utf8PathBuf as PathBuf};
use rayon::prelude::*;
use spdx::detection::scan::Scanner;
pub(crate) fn scan_files(
root_dir: &Path,
scanner: &Scanner<'_>,
threshold: f32,
max_depth: Option<usize>,
) -> anyhow::Result<Vec<LicenseFile>> {
let types = {
let mut tb = ignore::types::TypesBuilder::new();
tb.add_defaults();
tb.select("all");
tb.build()?
};
let walker = ignore::WalkBuilder::new(root_dir)
.standard_filters(true)
.follow_links(true)
.max_depth(max_depth)
.types(types)
.build();
let files: Vec<_> = walker.filter_map(|e| e.ok()).collect();
let license_files: Vec<_> = files
.into_par_iter()
.filter_map(|file| {
log::trace!("scanning file {}", file.path().display());
if let Some(ft) = file.file_type() {
if ft.is_dir() {
return None;
}
}
#[cfg(unix)]
{
use std::os::unix::fs::FileTypeExt;
if let Ok(md) = file.metadata() {
if md.file_type().is_fifo() {
log::error!("skipping FIFO {}", file.path().display());
return None;
}
}
}
let path = match PathBuf::from_path_buf(file.into_path()) {
Ok(pb) => pb,
Err(e) => {
log::warn!("skipping path {}, not a valid utf-8 path", e.display());
return None;
}
};
let contents = read_file(&path)?;
check_is_license_file(path, contents, scanner, threshold)
})
.collect();
Ok(license_files)
}
fn read_file(path: &Path) -> Option<String> {
match std::fs::read_to_string(path) {
Err(ref e) if e.kind() == std::io::ErrorKind::InvalidData => {
log::debug!("binary file '{path}' detected");
None
}
Err(e) => {
log::error!("failed to read '{path}': {e}");
None
}
Ok(c) => Some(c),
}
}
pub(crate) fn check_is_license_file(
path: PathBuf,
contents: String,
scanner: &Scanner<'_>,
threshold: f32,
) -> Option<LicenseFile> {
match scan_text(&contents, scanner, threshold) {
ScanResult::Header(ided) => {
let license_expr = match spdx::Expression::parse(ided.id.name) {
Ok(expr) => expr,
Err(err) => {
log::error!(
"failed to parse license '{}' into a valid expression: {err}",
ided.id.name
);
return None;
}
};
Some(LicenseFile {
license_expr,
confidence: ided.confidence,
path,
kind: LicenseFileKind::Header,
})
}
ScanResult::Text(ided) => {
let license_expr = match spdx::Expression::parse(ided.id.name) {
Ok(expr) => expr,
Err(err) => {
log::error!(
"failed to parse license '{}' into a valid expression: {err}",
ided.id.name
);
return None;
}
};
Some(LicenseFile {
license_expr,
confidence: ided.confidence,
path,
kind: LicenseFileKind::Text(contents),
})
}
ScanResult::UnknownId(id_str) => {
log::error!("found unknown SPDX identifier '{id_str}' scanning '{path}'");
None
}
ScanResult::LowLicenseChance(ided) => {
log::debug!(
"found '{}' scanning '{path}' but it only has a confidence score of {}",
ided.id.name,
ided.confidence,
);
None
}
ScanResult::NoLicense => None,
}
}
struct Identified {
confidence: f32,
id: spdx::LicenseId,
}
enum ScanResult {
Header(Identified),
Text(Identified),
UnknownId(String),
LowLicenseChance(Identified),
NoLicense,
}
fn scan_text(contents: &str, strat: &Scanner<'_>, threshold: f32) -> ScanResult {
let text = spdx::detection::TextData::new(contents);
let lic_match = strat.scan(&text);
let Some(identified) = lic_match.license else {
return ScanResult::NoLicense;
};
let lic_id = match spdx::license_id(identified.name) {
Some(id) => Identified {
confidence: lic_match.score,
id,
},
None => return ScanResult::UnknownId(identified.name.to_owned()),
};
use spdx::detection::LicenseType;
if lic_match.score >= threshold {
match identified.kind {
LicenseType::Header => ScanResult::Header(lic_id),
LicenseType::Original => ScanResult::Text(lic_id),
LicenseType::Alternate => {
panic!("Alternate license detected")
}
}
} else {
ScanResult::LowLicenseChance(lic_id)
}
}