cargo-deny 0.13.7

Cargo plugin to help you manage large dependency graphs
Documentation
use super::cfg::{FileSource, ValidClarification, ValidConfig};
use crate::{
    diag::{FileId, Files, Label},
    Krate,
};
use anyhow::Error;
use krates::{Utf8Path, Utf8PathBuf};
use rayon::prelude::*;
use smallvec::SmallVec;
use std::{fmt, sync::Arc};

const LICENSE_CACHE: &[u8] = include_bytes!("../../resources/spdx_cache.bin.zstd");

#[inline]
fn iter_clarifications<'a>(
    all: &'a [ValidClarification],
    krate: &'a Krate,
) -> impl Iterator<Item = &'a ValidClarification> {
    all.iter().filter(move |vc| {
        if vc.name == krate.name {
            return crate::match_req(&krate.version, vc.version.as_ref());
        }

        false
    })
}

impl fmt::Debug for FileSource {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("FileSource")
            .field("path", &self.path)
            .field("hash", &format_args!("{:#x}", self.hash))
            .finish()
    }
}

fn find_license_files(dir: &Utf8Path) -> Result<Vec<Utf8PathBuf>, std::io::Error> {
    let entries = std::fs::read_dir(dir)?;
    Ok(entries
        .filter_map(|e| {
            e.ok().and_then(|e| {
                let p = match Utf8PathBuf::from_path_buf(e.path()) {
                    Ok(pb) => pb,
                    Err(e) => {
                        log::warn!("{} contains invalid utf-8, skipping", e.display());
                        return None;
                    }
                };

                if p.is_file() && p.file_name().map_or(false, |f| f.starts_with("LICENSE")) {
                    Some(p)
                } else {
                    None
                }
            })
        })
        .collect())
}

fn get_file_source(path: Utf8PathBuf) -> PackFile {
    use std::io::BufRead;

    // Normalize on plain newlines to handle terrible Windows conventions
    let content = {
        let file = match std::fs::File::open(&path) {
            Ok(f) => f,
            Err(e) => {
                return PackFile {
                    path,
                    data: PackFileData::Bad(e),
                }
            }
        };

        let mut s =
            String::with_capacity(file.metadata().map(|m| m.len() as usize + 1).unwrap_or(0));

        let mut br = std::io::BufReader::new(file);
        let mut min = 0;

        while let Ok(read) = br.read_line(&mut s) {
            if read == 0 {
                break;
            }

            let keep = std::cmp::max(s.trim_end_matches(|p| p == '\r' || p == '\n').len(), min);
            s.truncate(keep);
            s.push('\n');

            min = keep + 1;
        }

        s
    };

    let hash = crate::hash(content.as_bytes());
    PackFile {
        path,
        data: PackFileData::Good(LicenseFile { hash, content }),
    }
}

struct LicenseFile {
    hash: u32,
    content: String,
}

enum PackFileData {
    Good(LicenseFile),
    Bad(std::io::Error),
}

struct PackFile {
    path: Utf8PathBuf,
    data: PackFileData,
}

enum MismatchReason<'a> {
    /// The specified file was not found when gathering license files
    FileNotFound,
    /// Encountered an I/O error trying to read the file contents
    Error(&'a std::io::Error),
    /// The hash of the license file doesn't match the expected hash
    HashDiffers,
}

struct LicensePack {
    license_files: Vec<PackFile>,
    err: Option<std::io::Error>,
}

struct GatheredExpr {
    synthesized_toml: String,
    failures: Vec<Label>,
    expr: spdx::Expression,
    file_sources: Vec<String>,
}

impl LicensePack {
    fn read(krate: &Krate) -> Self {
        let root_path = krate.manifest_path.parent().unwrap();

        let mut lic_paths = match find_license_files(root_path) {
            Ok(paths) => paths,
            Err(e) => {
                return Self {
                    license_files: Vec::new(),
                    err: Some(e),
                }
            }
        };

        // Add the explicitly specified license if it wasn't
        // already found in the root directory
        if let Some(lf) = &krate.license_file {
            if !lic_paths.iter().any(|l| l.ends_with(lf)) {
                // The `krate.license_file` is relative to the crate, while files found with
                // `find_license_files()` are absolute. We prepend the directory of the current
                // crate, to make sure all license file paths will be absolute.
                let absolute_lf = krate.manifest_path.parent().unwrap().join(lf);
                lic_paths.push(absolute_lf);
            }
        }

        let mut license_files: Vec<_> = lic_paths.into_iter().map(get_file_source).collect();

        license_files.sort_by(|a, b| a.path.cmp(&b.path));

        Self {
            license_files,
            err: None,
        }
    }

    fn license_files_match(&self, expected: &FileSource) -> Result<(), MismatchReason<'_>> {
        let err = match self
            .license_files
            .iter()
            .find(|lf| lf.path.ends_with(&expected.path.value))
        {
            Some(lf) => match &lf.data {
                PackFileData::Bad(e) => MismatchReason::Error(e),
                PackFileData::Good(file_data) => {
                    if file_data.hash != expected.hash {
                        MismatchReason::HashDiffers
                    } else {
                        return Ok(());
                    }
                }
            },
            None => MismatchReason::FileNotFound,
        };

        Err(err)
    }

    fn get_expression(
        &self,
        krate: &Krate,
        file: FileId,
        strat: &askalono::ScanStrategy<'_>,
        confidence: f32,
    ) -> Result<GatheredExpr, (String, Vec<Label>)> {
        use std::fmt::Write;

        let mut expr = String::new();
        let mut sources = Vec::new();

        let mut synth_toml = String::new();
        if let Some(err) = &self.err {
            write!(synth_toml, "license-files = \"{err}\"").unwrap();
            let len = synth_toml.len();
            return Err((
                synth_toml,
                vec![Label::secondary(file, 17..len - 1)
                    .with_message("unable to gather license files")],
            ));
        }

        let mut failures = Vec::new();
        synth_toml.push_str("license-files = [\n");

        let root_path = krate.manifest_path.parent().unwrap();

        for lic_contents in &self.license_files {
            write!(
                synth_toml,
                "    {{ path = \"{}\", ",
                lic_contents.path.strip_prefix(root_path).unwrap(),
            )
            .unwrap();

            match &lic_contents.data {
                PackFileData::Good(data) => {
                    write!(synth_toml, "hash = 0x{:08x}, ", data.hash).unwrap();

                    let text = askalono::TextData::new(&data.content);
                    match strat.scan(&text) {
                        Ok(lic_match) => {
                            if let Some(identified) = lic_match.license {
                                // askalano doesn't report any matches below the confidence threshold
                                // but we want to see what it thinks the license is if the confidence
                                // is somewhat ok at least
                                if lic_match.score >= confidence {
                                    if let Some(id) = spdx::license_id(identified.name) {
                                        if !sources.is_empty() {
                                            expr.push_str(" AND ");
                                        }

                                        expr.push_str(id.name);
                                        sources.push(
                                            lic_contents.path.file_name().unwrap().to_owned(),
                                        );
                                    } else {
                                        write!(synth_toml, "score = {:.2}", lic_match.score)
                                            .unwrap();
                                        let start = synth_toml.len();
                                        write!(synth_toml, ", license = \"{}\"", identified.name)
                                            .unwrap();
                                        let end = synth_toml.len();

                                        failures.push(
                                            Label::secondary(file, start + 13..end - 1)
                                                .with_message("unknown SPDX identifier"),
                                        );
                                    }
                                } else {
                                    let start = synth_toml.len();
                                    write!(synth_toml, "score = {:.2}", lic_match.score).unwrap();
                                    let end = synth_toml.len();
                                    write!(synth_toml, ", license = \"{}\"", identified.name)
                                        .unwrap();

                                    failures.push(
                                        Label::secondary(file, start + 8..end)
                                            .with_message("low confidence in the license text"),
                                    );
                                }
                            } else {
                                // If the license can't be matched with high enough confidence
                                let start = synth_toml.len();
                                write!(synth_toml, "score = {:.2}", lic_match.score).unwrap();
                                let end = synth_toml.len();

                                failures.push(
                                    Label::secondary(file, start + 8..end)
                                        .with_message("low confidence in the license text"),
                                );
                            }
                        }
                        Err(err) => {
                            panic!("askalono's elimination strategy failed (this used to be impossible): {err}");
                        }
                    }
                }
                PackFileData::Bad(err) => {
                    let start = synth_toml.len();
                    write!(synth_toml, "err = \"{}\"", err).unwrap();
                    let end = synth_toml.len();

                    failures.push(
                        Label::secondary(file, start + 7..end - 1)
                            .with_message("unable to read license file"),
                    );
                }
            }

            writeln!(synth_toml, " }},").unwrap();
        }

        synth_toml.push(']');

        if failures.is_empty() || !sources.is_empty() {
            Ok(GatheredExpr {
                synthesized_toml: synth_toml,
                failures,
                expr: spdx::Expression::parse(&expr).unwrap(),
                file_sources: sources,
            })
        } else {
            Err((synth_toml, failures))
        }
    }
}

#[derive(Debug)]
pub struct LicenseExprInfo {
    pub file_id: FileId,
    pub offset: usize,
    pub source: LicenseExprSource,
}

#[derive(Debug, PartialEq, Eq)]
pub enum LicenseExprSource {
    /// An SPDX expression in the Cargo.toml `license` field
    Metadata,
    /// An override in the user's deny.toml
    UserOverride,
    /// An override from an overlay
    OverlayOverride,
    /// An expression synthesized from one or more LICENSE files
    LicenseFiles(Vec<String>),
}

#[derive(Debug)]
#[allow(clippy::large_enum_variant)]
pub enum LicenseInfo {
    /// An SPDX expression parsed or generated from the
    /// license information provided by a crate
    SpdxExpression {
        expr: spdx::Expression,
        nfo: LicenseExprInfo,
    },
    /// No licenses were detected in the crate
    Unlicensed,
}

#[derive(Debug)]
pub struct KrateLicense<'a> {
    pub krate: &'a Krate,
    pub lic_info: LicenseInfo,

    // Reasons for why the license was determined (or not!) when
    // gathering the license information
    pub(crate) labels: SmallVec<[Label; 1]>,
}

pub struct Summary<'a> {
    store: Arc<LicenseStore>,
    pub nfos: Vec<KrateLicense<'a>>,
}

impl<'a> Summary<'a> {
    fn new(store: Arc<LicenseStore>) -> Self {
        Self {
            store,
            nfos: Vec::new(),
        }
    }
}

/// Store used to identify licenses from text files
pub struct LicenseStore {
    store: askalono::Store,
}

impl LicenseStore {
    pub fn from_cache() -> Result<Self, Error> {
        let store = askalono::Store::from_cache(LICENSE_CACHE)
            .map_err(|e| anyhow::anyhow!("failed to load license store: {}", e))?;

        Ok(Self { store })
    }
}

impl Default for LicenseStore {
    fn default() -> Self {
        Self {
            store: askalono::Store::new(),
        }
    }
}

pub struct Gatherer {
    store: Arc<LicenseStore>,
    threshold: f32,
}

impl Default for Gatherer {
    fn default() -> Self {
        Self {
            store: Arc::new(LicenseStore::default()),
            threshold: 0.8,
        }
    }
}

#[inline]
fn get_toml_span(key: &'static str, content: &str) -> std::ops::Range<usize> {
    let mut offset = 0;
    let val_start = loop {
        let start = content[offset..].find('\n').unwrap() + 1;
        if content[start + offset..].starts_with(key) {
            break start + offset + key.len();
        }

        offset += start;
    };

    let val_end = content[val_start..].find("\"\n").unwrap();

    let start = val_start + 4;
    start..start + val_end - 4
}

impl Gatherer {
    pub fn with_store(mut self, store: Arc<LicenseStore>) -> Self {
        self.store = store;
        self
    }

    #[inline]
    pub fn with_confidence_threshold(mut self, threshold: f32) -> Self {
        self.threshold = threshold.clamp(0.0, 1.0);
        self
    }

    pub fn gather<'k>(
        self,
        krates: &'k crate::Krates,
        files: &mut Files,
        cfg: Option<&ValidConfig>,
    ) -> Summary<'k> {
        let mut summary = Summary::new(self.store);

        let threshold = self.threshold;

        let strategy = askalono::ScanStrategy::new(&summary.store.store)
            .mode(askalono::ScanMode::Elimination)
            .confidence_threshold(0.5)
            .optimize(false)
            .max_passes(1);

        let files_lock = std::sync::Arc::new(std::sync::RwLock::new(files));

        // Retrieve the license expression we'll use to evaluate the user's overall
        // constraints with.
        //
        // NOTE: The reason that user/overlay overrides are prioritized over the
        // expression that may be present in the crate's `license` field itself is
        // because that expression is currently limited in functionality to basic
        // tokenization and thus might not be able to express the actual licensing
        // used by a crate, which often means the expression is either not provided
        // at all, or is actually inaccurate with regards to the actual licensing
        // terms of the crate.
        //
        // When falling back to full-text license files to obtain the possible
        // licenses used by a crate, we create a license expression where all
        // licenses found are joined with the `AND` operator, which means that
        // the user must comply with **ALL** licenses. This is the conservative
        // approach to ensure a (possibly) required license is not introduced which
        // could potentially be ignored if 1 or more other license requirements
        // were met that the user actually intended.
        //
        // 1. User overrides - If the user specifies the license expression and
        // the constraints for the package still match the current one being checked
        // 2. Overlay overrides - If the user has specified an overlay, and it contains
        // information for the crate and the constraints for the package still
        // match the current one being checked
        // 3. `license`
        // 4. `license-file` + all LICENSE(-*)? files - Due to the prevalance
        // of dual-licensing in the rust ecosystem, many people forgo setting
        // license-file, so we use it and/or any LICENSE files
        summary.nfos = krates
            .krates()
            .par_bridge()
            .map(|krate| {
                // Attempt an SPDX expression that we can validate the user's acceptable
                // license terms with
                let mut synth_id = None;

                let mut labels = smallvec::SmallVec::<[Label; 1]>::new();

                let mut get_span = |key: &'static str| -> (FileId, std::ops::Range<usize>) {
                    if let Some(id) = synth_id {
                        let l = files_lock.read().unwrap();
                        (synth_id.unwrap(), get_toml_span(key, l.source(id)))
                    } else {
                        // Synthesize a minimal Cargo.toml for reporting diagnostics
                        // for where we retrieved license stuff
                        let synth_manifest = format!(
                            "[package]\nname = \"{}\"\nversion = \"{}\"\nlicense = \"{}\"\n",
                            krate.name,
                            krate.version,
                            krate.license.as_deref().unwrap_or_default(),
                        );

                        {
                            let mut fl = files_lock.write().unwrap();
                            synth_id = Some(fl.add(krate.id.repr.clone(), synth_manifest));
                            (
                                synth_id.unwrap(),
                                get_toml_span(key, fl.source(synth_id.unwrap())),
                            )
                        }
                    }
                };

                let mut license_pack = None;

                // 1
                if let Some(cfg) = cfg {
                    for clarification in iter_clarifications(&cfg.clarifications, krate) {
                        let lp = if let Some(lp) = &license_pack {
                            lp
                        } else {
                            license_pack = Some(LicensePack::read(krate));
                            license_pack.as_ref().unwrap()
                        };

                        // Check to see if the clarification provided exactly matches
                        // the set of detected licenses, if they do, we use the clarification's
                        // license expression as the license requirements for this crate
                        let clarifications_match = clarification.license_files.iter().all(|clf| {
                            match lp.license_files_match(clf) {
                                Ok(_) => true,
                                Err(reason) => {
                                    if let MismatchReason::FileNotFound = reason {
                                        labels.push(
                                            super::diags::MissingClarificationFile {
                                                expected: &clf.path,
                                                cfg_file_id: cfg.file_id,
                                            }
                                            .into(),
                                        );
                                    }

                                    false
                                }
                            }
                        });

                        if clarifications_match {
                            return KrateLicense {
                                krate,
                                lic_info: LicenseInfo::SpdxExpression {
                                    expr: clarification.expression.clone(),
                                    nfo: LicenseExprInfo {
                                        file_id: cfg.file_id,
                                        offset: clarification.expr_offset,
                                        source: LicenseExprSource::UserOverride,
                                    },
                                },
                                labels,
                            };
                        }
                    }
                }

                // 2 TODO

                // 3
                match &krate.license {
                    Some(license_field) => {
                        // Reasons this can fail:
                        // * Empty! The rust crate used to validate this field has a bug
                        // https://github.com/rust-lang-nursery/license-exprs/issues/23
                        // * It also just does basic lexing, so parens, duplicate operators,
                        // unpaired exceptions etc can all fail validation

                        match spdx::Expression::parse(license_field) {
                            Ok(validated) => {
                                let (id, span) = get_span("license");

                                return KrateLicense {
                                    krate,
                                    lic_info: LicenseInfo::SpdxExpression {
                                        expr: validated,
                                        nfo: LicenseExprInfo {
                                            file_id: id,
                                            offset: span.start,
                                            source: LicenseExprSource::Metadata,
                                        },
                                    },
                                    labels,
                                };
                            }
                            Err(err) => {
                                let (id, lic_span) = get_span("license");
                                let lic_span =
                                    lic_span.start + err.span.start..lic_span.start + err.span.end;

                                labels.push(
                                    Label::secondary(id, lic_span)
                                        .with_message(format!("{}", err.reason)),
                                );
                            }
                        }
                    }
                    None => {
                        let (id, lic_span) = get_span("license");
                        labels.push(
                            Label::secondary(id, lic_span)
                                .with_message("license expression was not specified"),
                        );
                    }
                }

                // 4
                // We might have already loaded the licenses to check them against a clarification
                let license_pack = license_pack.unwrap_or_else(|| LicensePack::read(krate));

                if !license_pack.license_files.is_empty() {
                    let (id, _) = get_span("license");

                    match license_pack.get_expression(krate, id, &strategy, threshold) {
                        Ok(GatheredExpr {
                            synthesized_toml,
                            failures,
                            expr,
                            file_sources,
                        }) => {
                            // Push our synthesized license files toml content to the end of
                            // the other synthesized toml then fixup all of our spans
                            let expr_offset = {
                                let mut fl = files_lock.write().unwrap();

                                let (new_source, offset) = {
                                    let source = fl.source(id);
                                    (
                                        format!(
                                            "{source}files-expr = \"{expr}\"\n{synthesized_toml}\n"
                                        ),
                                        (source.len() + 14),
                                    )
                                };

                                fl.update(id, new_source);
                                offset
                            };

                            let fail_offset = expr_offset + expr.to_string().len() + 2;

                            for fail in failures {
                                let span =
                                    fail.range.start + fail_offset..fail.range.end + fail_offset;
                                labels.push(
                                    Label::secondary(fail.file_id, span).with_message(fail.message),
                                );
                            }

                            return KrateLicense {
                                krate,
                                lic_info: LicenseInfo::SpdxExpression {
                                    expr,
                                    nfo: LicenseExprInfo {
                                        file_id: id,
                                        offset: expr_offset,
                                        source: LicenseExprSource::LicenseFiles(file_sources),
                                    },
                                },
                                labels,
                            };
                        }
                        Err((new_toml, lic_file_lables)) => {
                            // Push our synthesized license files toml content to the end of
                            // the other synthesized toml then fixup all of our spans
                            let old_end = {
                                let mut fl = files_lock.write().unwrap();

                                let (new_source, old_end) = {
                                    let source = fl.source(id);
                                    (format!("{source}{new_toml}\n"), source.len())
                                };

                                fl.update(id, new_source);
                                old_end
                            };

                            for label in lic_file_lables {
                                let span = label.range.start + old_end..label.range.end + old_end;
                                labels.push(
                                    Label::secondary(label.file_id, span)
                                        .with_message(label.message),
                                );
                            }
                        }
                    }
                }

                // Just get a label for the crate name
                let (id, nspan) = get_span("name");
                labels.push(Label::primary(id, nspan).with_message(
                    "a valid license expression could not be retrieved for the crate",
                ));

                // Well, we tried our very best. Actually that's not true, we could scan for license
                // files not prefixed by LICENSE, and recurse into subdirectories, but honestly
                // at that point it's probably better to open a PR or something because the license
                // information is not conventional and probably warrants closer inspection
                KrateLicense {
                    krate,
                    lic_info: LicenseInfo::Unlicensed,
                    labels,
                }
            })
            .collect();

        summary.nfos.par_sort_by_key(|nfo| nfo.krate);

        summary
    }
}

#[cfg(test)]
mod test {
    #[test]
    fn normalizes_line_endings() {
        let pf = super::get_file_source(krates::Utf8PathBuf::from("./tests/LICENSE-RING"));

        let expected = {
            let text = std::fs::read_to_string("./tests/LICENSE-RING").unwrap();
            text.replace("\r\n", "\n")
        };

        let expected_hash = 0xbd0e_ed23;

        if let super::PackFileData::Good(lf) = pf.data {
            if lf.hash != expected_hash {
                eprintln!("hash: {expected_hash:#x} != {:#x}", lf.hash);

                for (i, (a, b)) in lf.content.chars().zip(expected.chars()).enumerate() {
                    assert_eq!(a, b, "character @ {i}");
                }
            }
        }
    }
}