sancus_lib/
license_detector.rs

1// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
4// option. This file may not be copied, modified, or distributed
5// except according to those terms.
6//
7// SPDX-License-Identifier: MIT OR Apache-2.0
8//
9// SPDX-FileCopyrightText: 2024 X-Software GmbH <opensource@x-software.com>
10
11use anyhow::{Context, Result};
12use log::*;
13use regex::Regex;
14use spdx::LicenseId;
15use std::path::Path;
16use std::{collections::HashMap, io::Write, sync::OnceLock};
17
18use crate::license_text::LicenseText;
19
/// A license file found for a package, optionally tagged with a known license id.
#[derive(Clone, Debug)]
pub struct LicenseFile {
    /// License id of the file if it is already known; `None` means the id has
    /// to be detected from the file's contents.
    pub id: Option<String>,
    /// Path of the license file on disk.
    pub file: String,
}
25
/// A license text together with its word histogram, used for fuzzy comparison.
#[derive(Clone, Debug)]
struct LicenseHash {
    /// License id this text belongs to, if known.
    id: Option<String>,
    /// The unmodified license text.
    text: String,
    /// Number of occurrences of each lowercased word of `text`.
    word_hash: HashMap<String, u32>,
}
32
33#[derive(Debug, Clone)]
34pub struct LicenseDetector {
35    templates: Vec<LicenseHash>,
36}
37
38static INSTANCE: OnceLock<LicenseDetector> = OnceLock::new();
39
40impl LicenseHash {
41    pub fn new(id: Option<String>, text: &str) -> Self {
42        let word_hash = Self::generate_hash(text);
43        Self {
44            id,
45            text: text.to_owned(),
46            word_hash,
47        }
48    }
49
50    fn generate_hash(text: &str) -> HashMap<String, u32> {
51        let mut word_hash = HashMap::new();
52        for word in Regex::new(r"\w+").unwrap().find_iter(text) {
53            *word_hash.entry(word.as_str().to_lowercase().clone()).or_insert(0) += 1;
54        }
55        word_hash
56    }
57}
58
59impl LicenseDetector {
60    pub fn init() {
61        let license_detector = Self::build();
62        INSTANCE.set(license_detector).unwrap();
63    }
64
65    pub fn instance() -> &'static Self {
66        INSTANCE
67            .get()
68            .expect("LicenseDetector is not initialized, please execute LicenseDetector::init()")
69    }
70
71    fn build() -> Self {
72        let mut templates = vec![];
73        for (id, _full_name, _flags) in spdx::identifiers::LICENSES {
74            let license = spdx::license_id(id).unwrap();
75
76            templates.push(LicenseHash::new(Some(license.name.to_owned()), license.text()));
77        }
78        LicenseDetector { templates }
79    }
80
81    fn compare(text_hash: &HashMap<String, u32>, template_hash: &HashMap<String, u32>) -> u32 {
82        let mut errors = 0;
83        let mut text_hash = text_hash.clone();
84
85        for (word, &count) in template_hash {
86            let text_count = text_hash.remove(word).unwrap_or(0);
87            let diff = ((text_count as i32) - (count as i32)).unsigned_abs();
88            errors += diff;
89        }
90
91        for (_, count) in text_hash {
92            errors += count;
93        }
94
95        errors
96    }
97
98    pub fn detect_license(&self, package: &str, license_ids: &[LicenseId], text: &str) -> LicenseText {
99        let mut text_hash = LicenseHash::new(None, text);
100        let mut best_score = None;
101        let mut best_template_text = "".to_owned();
102
103        if license_ids.is_empty() {
104            for template in &self.templates {
105                let total: u32 = template.word_hash.values().sum();
106                let errors = Self::compare(&text_hash.word_hash, &template.word_hash);
107                let score = (errors as f32) / (total as f32);
108
109                trace!("Score for {}: {}", template.id.as_ref().unwrap(), score);
110                if best_score.is_none() || score < best_score.unwrap() {
111                    best_score = Some(score);
112                    text_hash.id = template.id.clone();
113                    best_template_text = template.text.to_owned();
114                }
115            }
116        } else {
117            for license_id in license_ids {
118                let template = LicenseHash::new(Some(license_id.name.to_owned()), license_id.text());
119                let total: u32 = template.word_hash.values().sum();
120                let errors = Self::compare(&text_hash.word_hash, &template.word_hash);
121                let score = (errors as f32) / (total as f32);
122
123                trace!("Score for {}: {}", template.id.as_ref().unwrap(), score);
124                if best_score.is_none() || score < best_score.unwrap() {
125                    best_score = Some(score);
126                    text_hash.id = template.id.clone();
127                    best_template_text = template.text.to_owned();
128                }
129            }
130        }
131        let score = best_score.unwrap();
132
133        debug!("Best score was for {}: {}", text_hash.id.as_ref().unwrap(), score);
134
135        if license_ids.len() == 1 {
136            let license_id = license_ids.first().unwrap();
137            if text_hash.id.as_ref().unwrap() != license_id.name {
138                let tmp_dir = Path::new("license_detect").join(package);
139                std::fs::create_dir_all(tmp_dir.clone()).unwrap();
140
141                let text_file = tmp_dir.join("best_template_text.txt");
142                let mut file = std::fs::File::create(text_file).unwrap();
143                file.write_all(best_template_text.as_bytes()).unwrap();
144                drop(file);
145
146                let text_file = tmp_dir.join("original_text.txt");
147                let mut file = std::fs::File::create(text_file).unwrap();
148                file.write_all(text_hash.text.as_bytes()).unwrap();
149                drop(file);
150
151                let text_file = tmp_dir.join("spdx_id_text.txt");
152                let mut file = std::fs::File::create(text_file).unwrap();
153                file.write_all(license_id.text().as_bytes()).unwrap();
154                drop(file);
155            }
156        }
157
158        LicenseText {
159            id: text_hash.id.unwrap(),
160            text: text.to_owned(),
161        }
162    }
163
164    pub fn detect_licenses(
165        &self,
166        package: &str,
167        license_ids: &[LicenseId],
168        license_files: &[LicenseFile],
169    ) -> Result<Vec<LicenseText>> {
170        debug!("Detect license ids for license texts of package {package}");
171        let mut license_texts = vec![];
172        for license_file in license_files {
173            let text = std::fs::read_to_string(license_file.file.clone())
174                .with_context(|| format!("Cannot read third party license file {}", license_file.file))?;
175
176            if let Some(id) = license_file.id.as_ref() {
177                license_texts.push(LicenseText { id: id.clone(), text });
178            } else {
179                license_texts.push(LicenseDetector::instance().detect_license(package, license_ids, text.as_str()));
180            }
181        }
182        Ok(license_texts)
183    }
184}