1use std::collections::BTreeSet;
2use std::path::{Path, PathBuf};
3
4use anyhow::Result;
5use tokmd_analysis_types::{LicenseFinding, LicenseReport, LicenseSourceKind};
6
7use tokmd_analysis_util::AnalysisLimits;
8
/// Fallback read cap in bytes (256 KiB).
///
/// Used for license texts when the caller's limits carry no `max_file_bytes`,
/// and as the fixed cap when reading manifest files.
const DEFAULT_MAX_LICENSE_BYTES: u64 = 1024 * 256;
10
11pub fn build_license_report(
12 root: &Path,
13 files: &[PathBuf],
14 limits: &AnalysisLimits,
15) -> Result<LicenseReport> {
16 let candidates = tokmd_walk::license_candidates(files);
17 let max_bytes = limits.max_file_bytes.unwrap_or(DEFAULT_MAX_LICENSE_BYTES) as usize;
18
19 let mut findings: Vec<LicenseFinding> = Vec::new();
20 let mut extra_license_files: BTreeSet<PathBuf> = BTreeSet::new();
21
22 for rel in &candidates.metadata_files {
23 let path = root.join(rel);
24 let rel_str = rel.to_string_lossy().replace('\\', "/");
25 if let Some(spdx) = parse_metadata_license(&path)? {
26 findings.push(LicenseFinding {
27 spdx,
28 confidence: 0.95,
29 source_path: rel_str.clone(),
30 source_kind: LicenseSourceKind::Metadata,
31 });
32 }
33 if let Some(license_file) = parse_metadata_license_file(&path)? {
34 extra_license_files.insert(PathBuf::from(license_file));
35 }
36 }
37
38 let mut text_files: BTreeSet<PathBuf> = candidates.license_files.into_iter().collect();
39 text_files.extend(extra_license_files);
40
41 for rel in text_files {
42 let path = root.join(&rel);
43 let rel_str = rel.to_string_lossy().replace('\\', "/");
44 let text = tokmd_content::read_text_capped(&path, max_bytes)?;
45 if text.is_empty() {
46 continue;
47 }
48 if let Some((spdx, confidence)) = match_license_text(&text) {
49 findings.push(LicenseFinding {
50 spdx,
51 confidence,
52 source_path: rel_str,
53 source_kind: LicenseSourceKind::Text,
54 });
55 }
56 }
57
58 findings.sort_by(|a, b| {
59 b.confidence
60 .partial_cmp(&a.confidence)
61 .unwrap_or(std::cmp::Ordering::Equal)
62 .then_with(|| a.spdx.cmp(&b.spdx))
63 .then_with(|| a.source_path.cmp(&b.source_path))
64 });
65
66 let effective = findings.first().map(|f| f.spdx.clone());
67
68 Ok(LicenseReport {
69 findings,
70 effective,
71 })
72}
73
74fn parse_metadata_license(path: &Path) -> Result<Option<String>> {
75 let file_name = path
76 .file_name()
77 .and_then(|n| n.to_str())
78 .unwrap_or("")
79 .to_lowercase();
80 if file_name == "cargo.toml" {
81 return Ok(parse_toml_license(path, "package"));
82 }
83 if file_name == "pyproject.toml" {
84 return Ok(
85 parse_toml_license(path, "project").or_else(|| parse_toml_license(path, "tool.poetry"))
86 );
87 }
88 if file_name == "package.json" {
89 return Ok(parse_package_json_license(path));
90 }
91 Ok(None)
92}
93
94fn parse_metadata_license_file(path: &Path) -> Result<Option<String>> {
95 let file_name = path
96 .file_name()
97 .and_then(|n| n.to_str())
98 .unwrap_or("")
99 .to_lowercase();
100 if file_name != "cargo.toml" {
101 return Ok(None);
102 }
103 let text = tokmd_content::read_text_capped(path, DEFAULT_MAX_LICENSE_BYTES as usize)?;
104 Ok(parse_toml_key(&text, "package", "license-file"))
105}
106
107fn parse_toml_license(path: &Path, section: &str) -> Option<String> {
108 let text = tokmd_content::read_text_capped(path, DEFAULT_MAX_LICENSE_BYTES as usize).ok()?;
109 parse_toml_key(&text, section, "license")
110}
111
/// Looks up the string value of `key` inside the `[section]` table of `text`.
///
/// This is a small line-based scanner, not a TOML parser. It tracks the
/// current table from `[name]` / `[[name]]` header lines and returns the first
/// non-empty string value assigned to `key` while inside `section`.
///
/// Header lines may carry a trailing `# comment`. Values that are not strings
/// (booleans, `{ workspace = true }`, ...) are skipped; see
/// [`parse_key_value`] for the one inline-table form that is accepted.
fn parse_toml_key(text: &str, section: &str, key: &str) -> Option<String> {
    let mut in_section = false;
    for raw in text.lines() {
        let line = raw.trim();
        if let Some(name) = table_header_name(line) {
            in_section = name == section;
            continue;
        }
        if !in_section {
            continue;
        }
        if let Some(value) = parse_key_value(line, key) {
            return Some(value);
        }
    }
    None
}

/// Returns the table name when `line` (already trimmed) is a table header.
fn table_header_name(line: &str) -> Option<&str> {
    if !line.starts_with('[') {
        return None;
    }
    // TOML permits a comment after the header; ignore it so that
    // `[package] # metadata` still switches sections.
    let header = line.split('#').next().unwrap_or(line).trim_end();
    if !header.ends_with(']') {
        return None;
    }
    Some(header.trim_matches(&['[', ']'][..]).trim())
}

/// Parses one `key = value` line and returns the value when it is a string.
///
/// Accepted value forms:
///
/// * a quoted string: `license = "MIT"` or `license = 'MIT'`;
/// * an inline table whose first entry is `text`:
///   `license = { text = "MIT" }`.
///
/// Any other value yields `None`. In particular `license = { file = "LICENSE" }`
/// is rejected: the quoted string there is a path, not a license identifier.
fn parse_key_value(line: &str, key: &str) -> Option<String> {
    let (left, right) = line.split_once('=')?;
    if left.trim() != key {
        return None;
    }
    let value = right.trim();
    if value.starts_with('"') || value.starts_with('\'') {
        return extract_quoted(value);
    }

    let table_body = value.strip_prefix('{')?;
    let (inner_key, inner_value) = table_body.split_once('=')?;
    if inner_key.trim() != "text" {
        return None;
    }
    extract_quoted(inner_value)
}

/// Returns the contents of the first quoted run in `text`.
///
/// The first `"` or `'` opens the string and the next occurrence of the same
/// character closes it; if there is no closing quote the rest of `text` is
/// taken. Escape sequences are not interpreted. An empty result is reported as
/// `None`.
fn extract_quoted(text: &str) -> Option<String> {
    let open = text.find(&['"', '\''][..])?;
    let quote = text[open..].chars().next()?;
    // Both quote characters are ASCII, so the body starts one byte later.
    let body = &text[open + 1..];
    let inner = match body.find(quote) {
        Some(close) => &body[..close],
        None => body,
    };
    if inner.is_empty() {
        None
    } else {
        Some(inner.to_string())
    }
}
160
161fn parse_package_json_license(path: &Path) -> Option<String> {
162 let text = tokmd_content::read_text_capped(path, DEFAULT_MAX_LICENSE_BYTES as usize).ok()?;
163 let value: serde_json::Value = serde_json::from_str(&text).ok()?;
164 match value.get("license") {
165 Some(serde_json::Value::String(s)) => Some(s.trim().to_string()),
166 Some(serde_json::Value::Object(obj)) => obj
167 .get("type")
168 .and_then(|v| v.as_str())
169 .map(|s| s.trim().to_string()),
170 _ => None,
171 }
172}
173
174fn match_license_text(text: &str) -> Option<(String, f32)> {
175 let lower = text.to_lowercase();
176 let patterns = license_patterns();
177 let mut best: Option<(String, f32)> = None;
178
179 for pattern in patterns {
180 let hits = pattern
181 .phrases
182 .iter()
183 .filter(|phrase| lower.contains(*phrase))
184 .count();
185 if hits < pattern.min_hits {
186 continue;
187 }
188 let confidence = 0.6 + 0.4 * (hits as f32 / pattern.phrases.len() as f32);
189 let candidate = (pattern.spdx.to_string(), confidence);
190 if best.as_ref().map(|(_, c)| confidence > *c).unwrap_or(true) {
191 best = Some(candidate);
192 }
193 }
194
195 best
196}
197
/// Phrase fingerprint for one license.
struct LicensePattern {
    /// SPDX identifier reported when the pattern matches.
    spdx: &'static str,
    /// Characteristic phrases. They are compared against lowercased text, so
    /// every phrase must itself be lowercase or it can never match.
    phrases: &'static [&'static str],
    /// Minimum number of phrases that must occur for the pattern to count.
    min_hits: usize,
}

/// Returns the known license fingerprints, in tie-break order.
///
/// When two patterns score the same, the matcher keeps the one listed first,
/// so more specific licenses are listed before the ones they overlap with.
fn license_patterns() -> Vec<LicensePattern> {
    vec![
        LicensePattern {
            spdx: "MIT",
            phrases: &[
                "permission is hereby granted, free of charge",
                "the software is provided \"as is\"",
            ],
            min_hits: 1,
        },
        LicensePattern {
            spdx: "Apache-2.0",
            phrases: &[
                "apache license",
                "version 2.0",
                "http://www.apache.org/licenses/",
                "limitations under the license",
            ],
            min_hits: 2,
        },
        // AGPL comes before GPL: the full AGPL text also mentions the plain
        // "GNU General Public License", so it matches every GPL phrase. The
        // AGPL-only section title below keeps the full GPL text (which in turn
        // mentions the Affero license) from scoring a perfect AGPL match.
        LicensePattern {
            spdx: "AGPL-3.0-or-later",
            phrases: &[
                "gnu affero general public license",
                "version 3",
                "any later version",
                "remote network interaction",
            ],
            min_hits: 2,
        },
        LicensePattern {
            spdx: "GPL-3.0-or-later",
            phrases: &[
                "gnu general public license",
                "version 3",
                "any later version",
            ],
            min_hits: 2,
        },
        LicensePattern {
            spdx: "BSD-3-Clause",
            phrases: &[
                "redistribution and use in source and binary forms",
                "neither the name of",
                "contributors may be used",
            ],
            min_hits: 2,
        },
        LicensePattern {
            spdx: "BSD-2-Clause",
            phrases: &[
                "redistribution and use in source and binary forms",
                "this software is provided by the copyright holders and contributors \"as is\"",
            ],
            min_hits: 1,
        },
        LicensePattern {
            spdx: "MPL-2.0",
            phrases: &[
                "mozilla public license",
                "version 2.0",
                // Lowercase and scheme-less so both the http:// and https://
                // forms of the URL match the lowercased text.
                "mozilla.org/mpl/2.0/",
            ],
            min_hits: 2,
        },
    ]
}
270
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// Reports whether `report` holds a finding with the given SPDX id and source kind.
    fn has_finding(report: &LicenseReport, spdx: &str, kind: LicenseSourceKind) -> bool {
        report
            .findings
            .iter()
            .any(|finding| finding.spdx == spdx && finding.source_kind == kind)
    }

    /// A `license` key in `Cargo.toml` surfaces as a metadata finding.
    #[test]
    fn detects_metadata_license() {
        let workspace = tempdir().unwrap();
        let manifest = "[package]\nname = \"demo\"\nlicense = \"MIT\"\n";
        fs::write(workspace.path().join("Cargo.toml"), manifest).unwrap();

        let inputs = vec![PathBuf::from("Cargo.toml")];
        let report =
            build_license_report(workspace.path(), &inputs, &AnalysisLimits::default()).unwrap();

        assert!(has_finding(&report, "MIT", LicenseSourceKind::Metadata));
    }

    /// MIT boilerplate in a `LICENSE` file surfaces as a text finding.
    #[test]
    fn detects_text_license() {
        let workspace = tempdir().unwrap();
        let body = "Permission is hereby granted, free of charge, to any person obtaining a copy of this software. The software is provided \"as is\".";
        fs::write(workspace.path().join("LICENSE"), body).unwrap();

        let inputs = vec![PathBuf::from("LICENSE")];
        let report =
            build_license_report(workspace.path(), &inputs, &AnalysisLimits::default()).unwrap();

        assert!(has_finding(&report, "MIT", LicenseSourceKind::Text));
    }
}