tokmd-analysis 1.10.0

Analysis logic and enrichers for tokmd receipts.
Documentation
use std::collections::BTreeSet;
use std::path::{Path, PathBuf};

use anyhow::Result;
use tokmd_analysis_types::{LicenseFinding, LicenseReport, LicenseSourceKind};

use tokmd_analysis_types::AnalysisLimits;

/// Default size cap, in bytes (256 KiB), handed to `read_text_capped`.
///
/// It is the fallback for license text files when
/// `AnalysisLimits::max_file_bytes` is `None`, and it is always the cap used
/// when this file reads manifests (`Cargo.toml`, `pyproject.toml`,
/// `package.json`).
const DEFAULT_MAX_LICENSE_BYTES: u64 = 256 * 1024;

pub(crate) fn build_license_report(
    root: &Path,
    files: &[PathBuf],
    limits: &AnalysisLimits,
) -> Result<LicenseReport> {
    let candidates = tokmd_scan::walk::license_candidates(files);
    let max_bytes = limits.max_file_bytes.unwrap_or(DEFAULT_MAX_LICENSE_BYTES) as usize;

    let mut findings: Vec<LicenseFinding> = Vec::new();
    let mut extra_license_files: BTreeSet<PathBuf> = BTreeSet::new();

    for rel in &candidates.metadata_files {
        let path = root.join(rel);
        let rel_str = rel.to_string_lossy().replace('\\', "/");
        if let Some(spdx) = parse_metadata_license(&path)? {
            findings.push(LicenseFinding {
                spdx,
                confidence: 0.95,
                source_path: rel_str.clone(),
                source_kind: LicenseSourceKind::Metadata,
            });
        }
        if let Some(license_file) = parse_metadata_license_file(&path)? {
            extra_license_files.insert(PathBuf::from(license_file));
        }
    }

    let mut text_files: BTreeSet<PathBuf> = candidates.license_files.into_iter().collect();
    text_files.extend(extra_license_files);

    for rel in text_files {
        let path = root.join(&rel);
        let rel_str = rel.to_string_lossy().replace('\\', "/");
        let text = crate::content::io::read_text_capped(&path, max_bytes)?;
        if text.is_empty() {
            continue;
        }
        if let Some((spdx, confidence)) = match_license_text(&text) {
            findings.push(LicenseFinding {
                spdx,
                confidence,
                source_path: rel_str,
                source_kind: LicenseSourceKind::Text,
            });
        }
    }

    findings.sort_by(|a, b| {
        b.confidence
            .partial_cmp(&a.confidence)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then_with(|| a.spdx.cmp(&b.spdx))
            .then_with(|| a.source_path.cmp(&b.source_path))
    });

    let effective = findings.first().map(|f| f.spdx.clone());

    Ok(LicenseReport {
        findings,
        effective,
    })
}

// Out-of-line test module: compiled only in test builds (`cfg(test)`), with its
// body loaded from `tests.rs` through the `#[path]` attribute.
// NOTE(review): the name suggests these tests were moved out of this file —
// confirm whether the inline `mod tests` below is still needed alongside it.
#[cfg(test)]
#[path = "tests.rs"]
mod moved_tests;

/// Reads the declared license from a package manifest, chosen by file name.
///
/// The file name is compared case-insensitively:
///
/// * `Cargo.toml` — `license` under `[package]`
/// * `pyproject.toml` — `license` under `[project]`, else under `[tool.poetry]`
/// * `package.json` — the top-level `license` entry
///
/// Any other file name, or a manifest without a readable license, gives
/// `Ok(None)`.
fn parse_metadata_license(path: &Path) -> Result<Option<String>> {
    let manifest = path
        .file_name()
        .and_then(|name| name.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();

    let license = match manifest.as_str() {
        "cargo.toml" => parse_toml_license(path, "package"),
        "pyproject.toml" => {
            parse_toml_license(path, "project").or_else(|| parse_toml_license(path, "tool.poetry"))
        }
        "package.json" => parse_package_json_license(path),
        _ => None,
    };
    Ok(license)
}

/// Returns the `license-file` value from the `[package]` table of a `Cargo.toml`.
///
/// Files whose name is not `Cargo.toml` (compared case-insensitively) give
/// `Ok(None)` without being read.
///
/// # Errors
///
/// Propagates the error from `read_text_capped` when the manifest cannot be read.
fn parse_metadata_license_file(path: &Path) -> Result<Option<String>> {
    let is_cargo_manifest = path
        .file_name()
        .and_then(|name| name.to_str())
        .map_or(false, |name| name.to_lowercase() == "cargo.toml");
    if !is_cargo_manifest {
        return Ok(None);
    }

    let manifest =
        crate::content::io::read_text_capped(path, DEFAULT_MAX_LICENSE_BYTES as usize)?;
    Ok(parse_toml_key(&manifest, "package", "license-file"))
}

/// Reads the TOML file at `path` and returns the quoted `license` value found
/// in the table named `section`.
///
/// Best-effort: a read failure is treated the same as a missing key and gives
/// `None`.
fn parse_toml_license(path: &Path, section: &str) -> Option<String> {
    match crate::content::io::read_text_capped(path, DEFAULT_MAX_LICENSE_BYTES as usize) {
        Ok(manifest) => parse_toml_key(&manifest, section, "license"),
        Err(_) => None,
    }
}

/// Finds the first quoted value assigned to `key` inside the TOML table `section`.
///
/// This is a line-oriented scan, not a TOML parser: it tracks the most recent
/// `[table]` header and, while that header equals `section`, hands each line to
/// `parse_key_value`. Returns `None` when the table or the key is absent, or
/// when the value is not a non-empty quoted string.
///
/// A table header may be followed by a comment (`[package] # note`); the
/// comment is ignored so the header is still recognised and the section state
/// never goes stale.
fn parse_toml_key(text: &str, section: &str, key: &str) -> Option<String> {
    let mut in_section = false;
    for raw in text.lines() {
        let line = raw.trim();
        if line.starts_with('[') {
            // TOML permits a trailing `# comment` after a table header; drop
            // it before testing for the closing bracket.
            let header = line.split('#').next().unwrap_or(line).trim_end();
            if header.ends_with(']') {
                let name = header.trim_matches(&['[', ']'][..]).trim();
                in_section = name == section;
                continue;
            }
        }
        if !in_section {
            continue;
        }
        if let Some(value) = parse_key_value(line, key) {
            return Some(value);
        }
    }
    None
}

/// Parses one `key = "value"` line.
///
/// The line is split at its first `=`. When the trimmed left-hand side equals
/// `key`, the first quoted string on the right-hand side is returned; in every
/// other case (no `=`, different key, no quoted value) the result is `None`.
fn parse_key_value(line: &str, key: &str) -> Option<String> {
    let (name, value) = line.split_once('=')?;
    if name.trim() == key {
        extract_quoted(value.trim())
    } else {
        None
    }
}

/// Returns the contents of the first quoted string in `text`.
///
/// The first `"` or `'` opens the string and the next occurrence of that same
/// character closes it; if it never recurs, everything up to the end of `text`
/// is taken. Escape sequences are not interpreted. Returns `None` when `text`
/// contains no quote character or the quoted contents are empty.
fn extract_quoted(text: &str) -> Option<String> {
    let open = text.find(|c: char| c == '"' || c == '\'')?;
    let delimiter = text[open..].chars().next()?;
    // Both quote characters are one byte wide, so `open + 1` is a char boundary.
    let rest = &text[open + 1..];
    let inner = match rest.find(delimiter) {
        Some(close) => &rest[..close],
        None => rest,
    };
    if inner.is_empty() {
        None
    } else {
        Some(inner.to_string())
    }
}

/// Reads the `license` entry of a `package.json`.
///
/// Two shapes are accepted: a plain string (`"license": "MIT"`) and an object
/// with a string `type` member (`"license": { "type": "MIT" }`). The value is
/// returned trimmed. A read failure, invalid JSON, or any other shape gives
/// `None`.
fn parse_package_json_license(path: &Path) -> Option<String> {
    let raw =
        crate::content::io::read_text_capped(path, DEFAULT_MAX_LICENSE_BYTES as usize).ok()?;
    let manifest: serde_json::Value = serde_json::from_str(&raw).ok()?;
    let license = manifest.get("license")?;
    // A string is used as-is; otherwise look for a string `type` member, which
    // only an object can provide.
    let spdx = license
        .as_str()
        .or_else(|| license.get("type").and_then(|kind| kind.as_str()))?;
    Some(spdx.trim().to_string())
}

/// Guesses the license of `text` by counting known phrases.
///
/// The text is lowercased and every pattern from `license_patterns` is scored
/// by how many of its phrases occur in it. Patterns with fewer than `min_hits`
/// matches are skipped; the rest get a confidence of
/// `0.6 + 0.4 * (hits / phrase count)`. The highest-scoring pattern wins, and
/// on a tie the pattern listed first is kept.
///
/// Returns the SPDX id and confidence, or `None` when no pattern qualifies.
fn match_license_text(text: &str) -> Option<(String, f32)> {
    let haystack = text.to_lowercase();
    let mut winner: Option<(String, f32)> = None;

    for candidate in license_patterns() {
        let matched = candidate
            .phrases
            .iter()
            .filter(|&&phrase| haystack.contains(phrase))
            .count();
        if matched < candidate.min_hits {
            continue;
        }

        let score = 0.6 + 0.4 * (matched as f32 / candidate.phrases.len() as f32);
        // Strictly greater: an equal score never displaces an earlier winner.
        let improves = match &winner {
            Some((_, top)) => score > *top,
            None => true,
        };
        if improves {
            winner = Some((candidate.spdx.to_string(), score));
        }
    }

    winner
}

/// A phrase-based fingerprint for one license.
struct LicensePattern {
    /// SPDX identifier reported when the pattern matches.
    spdx: &'static str,
    /// Phrases searched for in the lowercased license text. Every phrase must
    /// itself be lowercase, otherwise it can never match.
    phrases: &'static [&'static str],
    /// Minimum number of phrases that must be present for a match.
    min_hits: usize,
}

/// Returns the known license fingerprints.
///
/// Order matters: `match_license_text` only replaces its current best match on
/// a strictly higher confidence, so an earlier entry wins a tie.
fn license_patterns() -> Vec<LicensePattern> {
    vec![
        LicensePattern {
            spdx: "MIT",
            phrases: &[
                "permission is hereby granted, free of charge",
                "the software is provided \"as is\"",
            ],
            min_hits: 1,
        },
        LicensePattern {
            spdx: "Apache-2.0",
            phrases: &[
                "apache license",
                "version 2.0",
                "http://www.apache.org/licenses/",
                "limitations under the license",
            ],
            min_hits: 2,
        },
        LicensePattern {
            spdx: "GPL-3.0-or-later",
            phrases: &[
                "gnu general public license",
                "version 3",
                "any later version",
            ],
            min_hits: 2,
        },
        LicensePattern {
            spdx: "AGPL-3.0-or-later",
            phrases: &[
                "gnu affero general public license",
                "version 3",
                "any later version",
            ],
            min_hits: 2,
        },
        LicensePattern {
            spdx: "BSD-3-Clause",
            phrases: &[
                "redistribution and use in source and binary forms",
                "neither the name of",
                "contributors may be used",
            ],
            min_hits: 2,
        },
        LicensePattern {
            spdx: "BSD-2-Clause",
            phrases: &[
                "redistribution and use in source and binary forms",
                "this software is provided by the copyright holders and contributors \"as is\"",
            ],
            min_hits: 1,
        },
        LicensePattern {
            spdx: "MPL-2.0",
            phrases: &[
                "mozilla public license",
                "version 2.0",
                // Lowercase because the haystack is lowercased, and scheme-less
                // so both the `http://` and `https://` forms of the notice URL
                // are recognised.
                "mozilla.org/mpl/2.0/",
            ],
            min_hits: 2,
        },
    ]
}

// Inline end-to-end tests: each one writes a fixture into a temporary
// directory and runs `build_license_report` over it.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    // A `Cargo.toml` declaring `license = "MIT"` under `[package]` must yield
    // an MIT finding whose source kind is `Metadata`.
    #[test]
    fn detects_metadata_license() {
        let dir = tempdir().expect("Failed to create tempdir for metadata test");
        let cargo = dir.path().join("Cargo.toml");
        fs::write(
            &cargo,
            r#"[package]
name = "demo"
license = "MIT"
"#,
        )
        .expect("Failed to write mock Cargo.toml");

        // Paths handed to the report builder are relative to the scan root.
        let files = vec![PathBuf::from("Cargo.toml")];
        let report = build_license_report(dir.path(), &files, &AnalysisLimits::default())
            .expect("Failed to build license report for metadata");
        assert!(
            report
                .findings
                .iter()
                .any(|f| f.spdx == "MIT" && f.source_kind == LicenseSourceKind::Metadata)
        );
    }

    // A `LICENSE` file containing both MIT fingerprint phrases must yield an
    // MIT finding whose source kind is `Text`.
    #[test]
    fn detects_text_license() {
        let dir = tempdir().expect("Failed to create tempdir for text test");
        let license = dir.path().join("LICENSE");
        fs::write(
            &license,
            "Permission is hereby granted, free of charge, to any person obtaining a copy of this software. The software is provided \"as is\".",
        )
        .expect("Failed to write mock LICENSE file");

        let files = vec![PathBuf::from("LICENSE")];
        let report = build_license_report(dir.path(), &files, &AnalysisLimits::default())
            .expect("Failed to build license report for text");
        assert!(
            report
                .findings
                .iter()
                .any(|f| f.spdx == "MIT" && f.source_kind == LicenseSourceKind::Text)
        );
    }
}