sancus_lib/
license_detector.rs1use anyhow::{Context, Result};
12use log::*;
13use regex::Regex;
14use spdx::LicenseId;
15use std::path::Path;
16use std::{collections::HashMap, io::Write, sync::OnceLock};
17
18use crate::license_text::LicenseText;
19
/// A license file shipped with a package.
///
/// `id` carries the license identifier when it is already known for this file; when it is
/// `None` the identifier has to be detected from the file's contents. `file` is the path
/// the license text is read from.
#[derive(Clone, Debug)]
pub struct LicenseFile {
    /// Known license identifier of the file, if any.
    pub id: Option<String>,
    /// Path of the license file on disk.
    pub file: String,
}
25
/// A text together with its word-frequency table.
///
/// Used both for the reference license templates (where `id` is set) and for the text
/// that is being analysed (where `id` starts out as `None`).
#[derive(Clone, Debug)]
struct LicenseHash {
    /// License identifier this text belongs to, if known.
    id: Option<String>,
    /// The full text the table was computed from.
    text: String,
    /// Number of occurrences of each lowercased word in `text`.
    word_hash: HashMap<String, u32>,
}
32
33#[derive(Debug, Clone)]
34pub struct LicenseDetector {
35 templates: Vec<LicenseHash>,
36}
37
38static INSTANCE: OnceLock<LicenseDetector> = OnceLock::new();
39
40impl LicenseHash {
41 pub fn new(id: Option<String>, text: &str) -> Self {
42 let word_hash = Self::generate_hash(text);
43 Self {
44 id,
45 text: text.to_owned(),
46 word_hash,
47 }
48 }
49
50 fn generate_hash(text: &str) -> HashMap<String, u32> {
51 let mut word_hash = HashMap::new();
52 for word in Regex::new(r"\w+").unwrap().find_iter(text) {
53 *word_hash.entry(word.as_str().to_lowercase().clone()).or_insert(0) += 1;
54 }
55 word_hash
56 }
57}
58
59impl LicenseDetector {
60 pub fn init() {
61 let license_detector = Self::build();
62 INSTANCE.set(license_detector).unwrap();
63 }
64
65 pub fn instance() -> &'static Self {
66 INSTANCE
67 .get()
68 .expect("LicenseDetector is not initialized, please execute LicenseDetector::init()")
69 }
70
71 fn build() -> Self {
72 let mut templates = vec![];
73 for (id, _full_name, _flags) in spdx::identifiers::LICENSES {
74 let license = spdx::license_id(id).unwrap();
75
76 templates.push(LicenseHash::new(Some(license.name.to_owned()), license.text()));
77 }
78 LicenseDetector { templates }
79 }
80
81 fn compare(text_hash: &HashMap<String, u32>, template_hash: &HashMap<String, u32>) -> u32 {
82 let mut errors = 0;
83 let mut text_hash = text_hash.clone();
84
85 for (word, &count) in template_hash {
86 let text_count = text_hash.remove(word).unwrap_or(0);
87 let diff = ((text_count as i32) - (count as i32)).unsigned_abs();
88 errors += diff;
89 }
90
91 for (_, count) in text_hash {
92 errors += count;
93 }
94
95 errors
96 }
97
98 pub fn detect_license(&self, package: &str, license_ids: &[LicenseId], text: &str) -> LicenseText {
99 let mut text_hash = LicenseHash::new(None, text);
100 let mut best_score = None;
101 let mut best_template_text = "".to_owned();
102
103 if license_ids.is_empty() {
104 for template in &self.templates {
105 let total: u32 = template.word_hash.values().sum();
106 let errors = Self::compare(&text_hash.word_hash, &template.word_hash);
107 let score = (errors as f32) / (total as f32);
108
109 trace!("Score for {}: {}", template.id.as_ref().unwrap(), score);
110 if best_score.is_none() || score < best_score.unwrap() {
111 best_score = Some(score);
112 text_hash.id = template.id.clone();
113 best_template_text = template.text.to_owned();
114 }
115 }
116 } else {
117 for license_id in license_ids {
118 let template = LicenseHash::new(Some(license_id.name.to_owned()), license_id.text());
119 let total: u32 = template.word_hash.values().sum();
120 let errors = Self::compare(&text_hash.word_hash, &template.word_hash);
121 let score = (errors as f32) / (total as f32);
122
123 trace!("Score for {}: {}", template.id.as_ref().unwrap(), score);
124 if best_score.is_none() || score < best_score.unwrap() {
125 best_score = Some(score);
126 text_hash.id = template.id.clone();
127 best_template_text = template.text.to_owned();
128 }
129 }
130 }
131 let score = best_score.unwrap();
132
133 debug!("Best score was for {}: {}", text_hash.id.as_ref().unwrap(), score);
134
135 if license_ids.len() == 1 {
136 let license_id = license_ids.first().unwrap();
137 if text_hash.id.as_ref().unwrap() != license_id.name {
138 let tmp_dir = Path::new("license_detect").join(package);
139 std::fs::create_dir_all(tmp_dir.clone()).unwrap();
140
141 let text_file = tmp_dir.join("best_template_text.txt");
142 let mut file = std::fs::File::create(text_file).unwrap();
143 file.write_all(best_template_text.as_bytes()).unwrap();
144 drop(file);
145
146 let text_file = tmp_dir.join("original_text.txt");
147 let mut file = std::fs::File::create(text_file).unwrap();
148 file.write_all(text_hash.text.as_bytes()).unwrap();
149 drop(file);
150
151 let text_file = tmp_dir.join("spdx_id_text.txt");
152 let mut file = std::fs::File::create(text_file).unwrap();
153 file.write_all(license_id.text().as_bytes()).unwrap();
154 drop(file);
155 }
156 }
157
158 LicenseText {
159 id: text_hash.id.unwrap(),
160 text: text.to_owned(),
161 }
162 }
163
164 pub fn detect_licenses(
165 &self,
166 package: &str,
167 license_ids: &[LicenseId],
168 license_files: &[LicenseFile],
169 ) -> Result<Vec<LicenseText>> {
170 debug!("Detect license ids for license texts of package {package}");
171 let mut license_texts = vec![];
172 for license_file in license_files {
173 let text = std::fs::read_to_string(license_file.file.clone())
174 .with_context(|| format!("Cannot read third party license file {}", license_file.file))?;
175
176 if let Some(id) = license_file.id.as_ref() {
177 license_texts.push(LicenseText { id: id.clone(), text });
178 } else {
179 license_texts.push(LicenseDetector::instance().detect_license(package, license_ids, text.as_str()));
180 }
181 }
182 Ok(license_texts)
183 }
184}