Skip to main content

tokmd_analysis_license/
lib.rs

1use std::collections::BTreeSet;
2use std::path::{Path, PathBuf};
3
4use anyhow::Result;
5use tokmd_analysis_types::{LicenseFinding, LicenseReport, LicenseSourceKind};
6
7use tokmd_analysis_util::AnalysisLimits;
8
/// Fallback cap (256 KiB) on how many bytes are read from a manifest or
/// license file when the caller supplies no explicit limit.
const DEFAULT_MAX_LICENSE_BYTES: u64 = 256 * 1024;
10
11pub fn build_license_report(
12    root: &Path,
13    files: &[PathBuf],
14    limits: &AnalysisLimits,
15) -> Result<LicenseReport> {
16    let candidates = tokmd_walk::license_candidates(files);
17    let max_bytes = limits.max_file_bytes.unwrap_or(DEFAULT_MAX_LICENSE_BYTES) as usize;
18
19    let mut findings: Vec<LicenseFinding> = Vec::new();
20    let mut extra_license_files: BTreeSet<PathBuf> = BTreeSet::new();
21
22    for rel in &candidates.metadata_files {
23        let path = root.join(rel);
24        let rel_str = rel.to_string_lossy().replace('\\', "/");
25        if let Some(spdx) = parse_metadata_license(&path)? {
26            findings.push(LicenseFinding {
27                spdx,
28                confidence: 0.95,
29                source_path: rel_str.clone(),
30                source_kind: LicenseSourceKind::Metadata,
31            });
32        }
33        if let Some(license_file) = parse_metadata_license_file(&path)? {
34            extra_license_files.insert(PathBuf::from(license_file));
35        }
36    }
37
38    let mut text_files: BTreeSet<PathBuf> = candidates.license_files.into_iter().collect();
39    text_files.extend(extra_license_files);
40
41    for rel in text_files {
42        let path = root.join(&rel);
43        let rel_str = rel.to_string_lossy().replace('\\', "/");
44        let text = tokmd_content::read_text_capped(&path, max_bytes)?;
45        if text.is_empty() {
46            continue;
47        }
48        if let Some((spdx, confidence)) = match_license_text(&text) {
49            findings.push(LicenseFinding {
50                spdx,
51                confidence,
52                source_path: rel_str,
53                source_kind: LicenseSourceKind::Text,
54            });
55        }
56    }
57
58    findings.sort_by(|a, b| {
59        b.confidence
60            .partial_cmp(&a.confidence)
61            .unwrap_or(std::cmp::Ordering::Equal)
62            .then_with(|| a.spdx.cmp(&b.spdx))
63            .then_with(|| a.source_path.cmp(&b.source_path))
64    });
65
66    let effective = findings.first().map(|f| f.spdx.clone());
67
68    Ok(LicenseReport {
69        findings,
70        effective,
71    })
72}
73
74fn parse_metadata_license(path: &Path) -> Result<Option<String>> {
75    let file_name = path
76        .file_name()
77        .and_then(|n| n.to_str())
78        .unwrap_or("")
79        .to_lowercase();
80    if file_name == "cargo.toml" {
81        return Ok(parse_toml_license(path, "package"));
82    }
83    if file_name == "pyproject.toml" {
84        return Ok(
85            parse_toml_license(path, "project").or_else(|| parse_toml_license(path, "tool.poetry"))
86        );
87    }
88    if file_name == "package.json" {
89        return Ok(parse_package_json_license(path));
90    }
91    Ok(None)
92}
93
94fn parse_metadata_license_file(path: &Path) -> Result<Option<String>> {
95    let file_name = path
96        .file_name()
97        .and_then(|n| n.to_str())
98        .unwrap_or("")
99        .to_lowercase();
100    if file_name != "cargo.toml" {
101        return Ok(None);
102    }
103    let text = tokmd_content::read_text_capped(path, DEFAULT_MAX_LICENSE_BYTES as usize)?;
104    Ok(parse_toml_key(&text, "package", "license-file"))
105}
106
107fn parse_toml_license(path: &Path, section: &str) -> Option<String> {
108    let text = tokmd_content::read_text_capped(path, DEFAULT_MAX_LICENSE_BYTES as usize).ok()?;
109    parse_toml_key(&text, section, "license")
110}
111
112fn parse_toml_key(text: &str, section: &str, key: &str) -> Option<String> {
113    let mut in_section = false;
114    for raw in text.lines() {
115        let line = raw.trim();
116        if line.starts_with('[') && line.ends_with(']') {
117            let name = line.trim_matches(&['[', ']'][..]).trim();
118            in_section = name == section;
119            continue;
120        }
121        if !in_section {
122            continue;
123        }
124        if let Some(value) = parse_key_value(line, key) {
125            return Some(value);
126        }
127    }
128    None
129}
130
131fn parse_key_value(line: &str, key: &str) -> Option<String> {
132    let mut parts = line.splitn(2, '=');
133    let left = parts.next()?.trim();
134    let right = parts.next()?.trim();
135    if left != key {
136        return None;
137    }
138    extract_quoted(right)
139}
140
/// Returns the contents of the first quoted run in `text`.
///
/// The opening delimiter is the first `"` or `'` encountered; the run ends at
/// the next occurrence of that same character, or at the end of `text` when
/// no closing delimiter exists. Escape sequences are not interpreted. An
/// empty run, or text without any quote, yields `None`.
fn extract_quoted(text: &str) -> Option<String> {
    let open = text.find(&['"', '\''][..])?;
    let mut tail = text[open..].chars();
    let delimiter = tail.next()?;
    // `split` always yields at least one piece: everything up to the closing
    // delimiter, or the whole remainder when the quote is never closed.
    let inner = tail.as_str().split(delimiter).next()?;
    if inner.is_empty() {
        None
    } else {
        Some(inner.to_owned())
    }
}
160
161fn parse_package_json_license(path: &Path) -> Option<String> {
162    let text = tokmd_content::read_text_capped(path, DEFAULT_MAX_LICENSE_BYTES as usize).ok()?;
163    let value: serde_json::Value = serde_json::from_str(&text).ok()?;
164    match value.get("license") {
165        Some(serde_json::Value::String(s)) => Some(s.trim().to_string()),
166        Some(serde_json::Value::Object(obj)) => obj
167            .get("type")
168            .and_then(|v| v.as_str())
169            .map(|s| s.trim().to_string()),
170        _ => None,
171    }
172}
173
174fn match_license_text(text: &str) -> Option<(String, f32)> {
175    let lower = text.to_lowercase();
176    let patterns = license_patterns();
177    let mut best: Option<(String, f32)> = None;
178
179    for pattern in patterns {
180        let hits = pattern
181            .phrases
182            .iter()
183            .filter(|phrase| lower.contains(*phrase))
184            .count();
185        if hits < pattern.min_hits {
186            continue;
187        }
188        let confidence = 0.6 + 0.4 * (hits as f32 / pattern.phrases.len() as f32);
189        let candidate = (pattern.spdx.to_string(), confidence);
190        if best.as_ref().map(|(_, c)| confidence > *c).unwrap_or(true) {
191            best = Some(candidate);
192        }
193    }
194
195    best
196}
197
/// A phrase-based fingerprint for one license.
#[derive(Debug)]
struct LicensePattern {
    /// SPDX identifier reported when the pattern matches.
    spdx: &'static str,
    /// Characteristic phrases. They are compared against lower-cased text, so
    /// every phrase MUST itself be lower-case or it can never match.
    phrases: &'static [&'static str],
    /// Minimum number of phrases that must be present for a match.
    min_hits: usize,
}

/// Returns the table of known license fingerprints, in priority order
/// (earlier entries win confidence ties in `match_license_text`).
fn license_patterns() -> Vec<LicensePattern> {
    vec![
        LicensePattern {
            spdx: "MIT",
            phrases: &[
                "permission is hereby granted, free of charge",
                "the software is provided \"as is\"",
            ],
            min_hits: 1,
        },
        LicensePattern {
            spdx: "Apache-2.0",
            phrases: &[
                "apache license",
                "version 2.0",
                "http://www.apache.org/licenses/",
                "limitations under the license",
            ],
            min_hits: 2,
        },
        LicensePattern {
            spdx: "GPL-3.0-or-later",
            phrases: &[
                "gnu general public license",
                "version 3",
                "any later version",
            ],
            min_hits: 2,
        },
        LicensePattern {
            spdx: "AGPL-3.0-or-later",
            phrases: &[
                "gnu affero general public license",
                "version 3",
                "any later version",
            ],
            min_hits: 2,
        },
        LicensePattern {
            spdx: "BSD-3-Clause",
            phrases: &[
                "redistribution and use in source and binary forms",
                "neither the name of",
                "contributors may be used",
            ],
            min_hits: 2,
        },
        LicensePattern {
            spdx: "BSD-2-Clause",
            phrases: &[
                "redistribution and use in source and binary forms",
                "this software is provided by the copyright holders and contributors \"as is\"",
            ],
            min_hits: 1,
        },
        LicensePattern {
            spdx: "MPL-2.0",
            phrases: &[
                "mozilla public license",
                "version 2.0",
                // Lower-case and scheme-agnostic: the text is lower-cased
                // before matching, and MPL notices appear with both the
                // `http://` and `https://` form of this URL.
                "mozilla.org/mpl/2.0/",
            ],
            min_hits: 2,
        },
    ]
}
270
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// True when `report` holds a finding with the given SPDX id and source kind.
    fn has_finding(report: &LicenseReport, spdx: &str, kind: LicenseSourceKind) -> bool {
        report
            .findings
            .iter()
            .any(|finding| finding.spdx == spdx && finding.source_kind == kind)
    }

    /// A `license` key in `Cargo.toml` is reported as a metadata finding.
    #[test]
    fn detects_metadata_license() {
        let workspace = tempdir().unwrap();
        let manifest = "[package]\nname = \"demo\"\nlicense = \"MIT\"\n";
        fs::write(workspace.path().join("Cargo.toml"), manifest).unwrap();

        let inputs = vec![PathBuf::from("Cargo.toml")];
        let report =
            build_license_report(workspace.path(), &inputs, &AnalysisLimits::default()).unwrap();

        assert!(has_finding(&report, "MIT", LicenseSourceKind::Metadata));
    }

    /// MIT boilerplate in a `LICENSE` file is reported as a text finding.
    #[test]
    fn detects_text_license() {
        let workspace = tempdir().unwrap();
        let body = "Permission is hereby granted, free of charge, to any person obtaining a copy of this software. The software is provided \"as is\".";
        fs::write(workspace.path().join("LICENSE"), body).unwrap();

        let inputs = vec![PathBuf::from("LICENSE")];
        let report =
            build_license_report(workspace.path(), &inputs, &AnalysisLimits::default()).unwrap();

        assert!(has_finding(&report, "MIT", LicenseSourceKind::Text));
    }
}
319}