Skip to main content

provenant/license_detection/
license_cache.rs

// SPDX-FileCopyrightText: Provenant contributors
// SPDX-License-Identifier: Apache-2.0
3
4use std::fmt::Write as _;
5use std::fs;
6use std::path::{Path, PathBuf};
7
8use anyhow::{Context, Result};
9use rancor::{Panic, ResultExt};
10use sha2::{Digest, Sha256};
11
12use crate::cache::{CacheConfig, write_bytes_atomically};
13use crate::license_detection::index::LicenseIndex;
14use crate::license_detection::models::{LoadedLicense, LoadedRule};
15
/// Name of the directory, directly under the cache root, that holds every
/// license index cache namespace.
const CACHE_ROOT_DIR_NAME: &str = "license-index";
/// File extension (without the leading dot) given to persisted index files;
/// pruning only ever considers files carrying this extension.
const CACHE_FILE_EXTENSION: &str = "rkyv";
18
/// Selects which subdirectory of the license index cache an entry lives in.
///
/// Entries of different namespaces are stored in separate directories, so
/// pruning one namespace never touches files belonging to the other.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum LicenseCacheNamespace {
    /// Entries stored under the `embedded` subdirectory.
    Embedded,
    /// Entries stored under the `custom` subdirectory.
    CustomRules,
}
24
25impl LicenseCacheNamespace {
26    fn directory_name(self) -> &'static str {
27        match self {
28            Self::Embedded => "embedded",
29            Self::CustomRules => "custom",
30        }
31    }
32}
33
/// Settings that control where and whether the license index cache is used.
///
/// `Debug` and `Clone` are derived so the configuration can be logged and
/// handed to several consumers without rebuilding it field by field.
#[derive(Debug, Clone)]
pub struct LicenseCacheConfig {
    /// Directory under which the `license-index` cache tree is created; also
    /// the path passed to the cache lock helper when writing or deleting.
    pub root_dir: PathBuf,
    /// Rebuild request flag. No function in this module reads it —
    /// presumably callers check it before attempting a cache load.
    pub reindex: bool,
    /// When `false`, every cache operation in this module is a no-op
    /// (loads miss, saves and deletes succeed without touching the disk).
    pub enabled: bool,
}
39
40impl LicenseCacheConfig {
41    pub fn new(root_dir: PathBuf, reindex: bool, enabled: bool) -> Self {
42        Self {
43            root_dir,
44            reindex,
45            enabled,
46        }
47    }
48
49    pub fn default_root_dir() -> PathBuf {
50        CacheConfig::default_root_dir_without_scan_root()
51    }
52
53    fn namespace_dir(&self, namespace: LicenseCacheNamespace) -> PathBuf {
54        self.root_dir
55            .join(CACHE_ROOT_DIR_NAME)
56            .join(namespace.directory_name())
57    }
58
59    fn cache_file_path(&self, namespace: LicenseCacheNamespace, fingerprint: &[u8; 32]) -> PathBuf {
60        self.namespace_dir(namespace).join(format!(
61            "{}.{}",
62            fingerprint_hex(fingerprint),
63            CACHE_FILE_EXTENSION
64        ))
65    }
66}
67
/// Renders a 32-byte fingerprint as 64 lowercase hexadecimal characters,
/// two per byte, in byte order.
fn fingerprint_hex(fingerprint: &[u8; 32]) -> String {
    let capacity = fingerprint.len() * 2;
    fingerprint
        .iter()
        .fold(String::with_capacity(capacity), |mut encoded, byte| {
            // Formatting into a `String` cannot fail, so the result is ignored.
            let _ = write!(encoded, "{byte:02x}");
            encoded
        })
}
75
76fn prune_namespace_dir(namespace_dir: &Path, active_path: &Path) -> Result<()> {
77    if !namespace_dir.exists() {
78        return Ok(());
79    }
80
81    for entry in fs::read_dir(namespace_dir)
82        .with_context(|| format!("Failed to read license cache namespace {namespace_dir:?}"))?
83    {
84        let path = entry?.path();
85        if path == active_path
86            || path.extension().and_then(|ext| ext.to_str()) != Some(CACHE_FILE_EXTENSION)
87        {
88            continue;
89        }
90        fs::remove_file(&path)
91            .with_context(|| format!("Failed to prune stale license cache file {path:?}"))?;
92    }
93
94    Ok(())
95}
96
97pub fn compute_rules_fingerprint(
98    rules: &[LoadedRule],
99    licenses: &[LoadedLicense],
100) -> Result<[u8; 32]> {
101    let mut sorted_rules: Vec<_> = rules.iter().collect();
102    sorted_rules.sort_by_key(|r| &r.identifier);
103    let mut sorted_licenses: Vec<_> = licenses.iter().collect();
104    sorted_licenses.sort_by_key(|l| &l.key);
105
106    let serialized = postcard::to_allocvec(&(sorted_rules, sorted_licenses))
107        .context("Failed to serialize effective rules/licenses for cache fingerprinting")?;
108
109    Ok(Sha256::digest(serialized).into())
110}
111
112pub fn compute_artifact_fingerprint(artifact_bytes: &[u8]) -> [u8; 32] {
113    Sha256::digest(artifact_bytes).into()
114}
115
116pub fn load_cached_index(
117    config: &LicenseCacheConfig,
118    namespace: LicenseCacheNamespace,
119    fingerprint: &[u8; 32],
120) -> Result<Option<LicenseIndex>> {
121    if !config.enabled {
122        return Ok(None);
123    }
124
125    let cache_path = config.cache_file_path(namespace, fingerprint);
126
127    if !cache_path.exists() {
128        return Ok(None);
129    }
130
131    let bytes = match fs::read(&cache_path) {
132        Ok(bytes) => bytes,
133        Err(_) => return Ok(None),
134    };
135
136    if bytes.len() < 32 {
137        return Ok(None);
138    }
139
140    let stored_fingerprint: [u8; 32] = bytes[..32].try_into().unwrap();
141    if stored_fingerprint != *fingerprint {
142        return Ok(None);
143    }
144
145    let archived =
146        match rkyv::access::<rkyv::Archived<LicenseIndex>, rkyv::rancor::Error>(&bytes[32..]) {
147            Ok(archived) => archived,
148            Err(_) => return Ok(None),
149        };
150
151    let cached: LicenseIndex = rkyv::deserialize::<LicenseIndex, Panic>(archived).always_ok();
152
153    Ok(Some(cached))
154}
155
156pub fn save_cached_index(
157    config: &LicenseCacheConfig,
158    namespace: LicenseCacheNamespace,
159    cached: &LicenseIndex,
160    fingerprint: &[u8; 32],
161) -> Result<()> {
162    if !config.enabled {
163        return Ok(());
164    }
165
166    let rkyv_bytes = rkyv::to_bytes::<rkyv::rancor::Error>(cached)
167        .map_err(|e| anyhow::anyhow!("Failed to serialize license index cache: {}", e))?;
168
169    let mut payload = Vec::with_capacity(fingerprint.len() + rkyv_bytes.len());
170    payload.extend_from_slice(fingerprint);
171    payload.extend_from_slice(&rkyv_bytes);
172
173    let namespace_dir = config.namespace_dir(namespace);
174    let cache_path = config.cache_file_path(namespace, fingerprint);
175
176    crate::cache::locking::with_exclusive_cache_lock(&config.root_dir, || {
177        fs::create_dir_all(&namespace_dir)
178            .with_context(|| "Failed to create license index cache directory")?;
179        prune_namespace_dir(&namespace_dir, &cache_path)?;
180        write_bytes_atomically(&cache_path, &payload)
181            .with_context(|| "Failed to persist license index cache file")
182    })?;
183
184    Ok(())
185}
186
187pub fn delete_cache(
188    config: &LicenseCacheConfig,
189    namespace: LicenseCacheNamespace,
190    fingerprint: &[u8; 32],
191) -> Result<()> {
192    if !config.enabled {
193        return Ok(());
194    }
195
196    let cache_path = config.cache_file_path(namespace, fingerprint);
197    crate::cache::locking::with_exclusive_cache_lock(&config.root_dir, || -> Result<()> {
198        if cache_path.exists() {
199            fs::remove_file(&cache_path).context("Failed to delete license index cache file")?;
200        }
201        Ok(())
202    })?;
203
204    Ok(())
205}
206
207pub fn cache_file_size(
208    config: &LicenseCacheConfig,
209    namespace: LicenseCacheNamespace,
210    fingerprint: &[u8; 32],
211) -> Option<u64> {
212    if !config.enabled {
213        return None;
214    }
215
216    fs::metadata(config.cache_file_path(namespace, fingerprint))
217        .ok()
218        .map(|m| m.len())
219}
220
#[cfg(test)]
mod tests {
    use tempfile::TempDir;

    use super::*;
    use crate::license_detection::automaton::Automaton;
    use crate::license_detection::index::dictionary::TokenDictionary;

    /// Builds an index whose collections are all empty; only the SPDX list
    /// version carries a recognizable value.
    fn sample_cached_index() -> LicenseIndex {
        LicenseIndex {
            dictionary: TokenDictionary::default(),
            len_legalese: 0,
            rid_by_hash: Default::default(),
            rules_by_rid: Default::default(),
            tids_by_rid: Default::default(),
            rules_automaton: Automaton::empty(),
            unknown_automaton: Automaton::empty(),
            sets_by_rid: Default::default(),
            rule_metadata_by_identifier: Default::default(),
            msets_by_rid: Default::default(),
            high_sets_by_rid: Default::default(),
            high_postings_by_rid: Default::default(),
            licenses_by_key: Default::default(),
            rid_by_spdx_key: Default::default(),
            unknown_spdx_rid: None,
            rids_by_high_tid: Default::default(),
            spdx_license_list_version: Some("test".to_string()),
        }
    }

    /// The cache file path is `<root>/license-index/<namespace>/<hex>.rkyv`.
    #[test]
    fn test_cache_file_path_uses_namespace_and_fingerprint() {
        let config = LicenseCacheConfig::new(PathBuf::from("/tmp/cache-root"), false, true);
        let fingerprint = [0xAB; 32];
        let expected_hex = "ab".repeat(32);

        let embedded_path = config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint);
        assert_eq!(
            embedded_path,
            PathBuf::from(format!(
                "/tmp/cache-root/license-index/embedded/{}.rkyv",
                expected_hex
            ))
        );

        let custom_path = config.cache_file_path(LicenseCacheNamespace::CustomRules, &fingerprint);
        assert_eq!(
            custom_path,
            PathBuf::from(format!(
                "/tmp/cache-root/license-index/custom/{}.rkyv",
                expected_hex
            ))
        );
    }

    /// Saving a new entry removes older cache files from the same namespace.
    #[test]
    fn test_save_cached_index_prunes_stale_namespace_entries() {
        let temp_dir = TempDir::new().expect("create temp dir");
        let config = LicenseCacheConfig::new(temp_dir.path().to_path_buf(), false, true);
        let fingerprint = [0x11; 32];

        let namespace_dir = config.namespace_dir(LicenseCacheNamespace::Embedded);
        fs::create_dir_all(&namespace_dir).expect("create namespace dir");
        let stale_path = namespace_dir.join("stale.rkyv");
        fs::write(stale_path, b"old").expect("write stale cache file");

        let index = sample_cached_index();
        save_cached_index(
            &config,
            LicenseCacheNamespace::Embedded,
            &index,
            &fingerprint,
        )
        .expect("save cache");

        let mut remaining = Vec::new();
        for entry in fs::read_dir(&namespace_dir).expect("read namespace dir") {
            remaining.push(entry.expect("dir entry").path());
        }

        let expected_path = config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint);
        assert_eq!(remaining.len(), 1);
        assert_eq!(remaining[0], expected_path);
    }

    /// With caching disabled, saving writes nothing and loading always misses.
    #[test]
    fn test_disabled_cache_skips_persistence() {
        let temp_dir = TempDir::new().expect("create temp dir");
        let config = LicenseCacheConfig::new(temp_dir.path().to_path_buf(), false, false);
        let fingerprint = [0x22; 32];

        let index = sample_cached_index();
        save_cached_index(
            &config,
            LicenseCacheNamespace::Embedded,
            &index,
            &fingerprint,
        )
        .expect("disabled save should succeed");

        let cache_path = config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint);
        assert!(!cache_path.exists());

        let loaded = load_cached_index(&config, LicenseCacheNamespace::Embedded, &fingerprint)
            .expect("disabled load should succeed");
        assert!(loaded.is_none());
    }

    /// Changing a single rule metadata field must change the fingerprint.
    #[test]
    fn test_compute_rules_fingerprint_changes_when_rule_metadata_changes() {
        let rule_a = LoadedRule {
            identifier: "example.RULE".to_string(),
            license_expression: "mit".to_string(),
            text: "example text".to_string(),
            rule_kind: crate::license_detection::models::RuleKind::Text,
            is_false_positive: false,
            is_required_phrase: false,
            skip_for_required_phrase_generation: false,
            relevance: Some(100),
            minimum_coverage: None,
            has_stored_minimum_coverage: false,
            is_continuous: false,
            referenced_filenames: None,
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: None,
            is_deprecated: false,
            replaced_by: vec![],
        };
        // Identical to `rule_a` except for the referenced filenames.
        let rule_b = {
            let mut modified = rule_a.clone();
            modified.referenced_filenames = Some(vec!["LICENSE".to_string()]);
            modified
        };

        let license = LoadedLicense {
            key: "mit".to_string(),
            short_name: Some("MIT".to_string()),
            name: "MIT License".to_string(),
            language: Some("en".to_string()),
            spdx_license_key: Some("MIT".to_string()),
            other_spdx_license_keys: vec![],
            category: Some("Permissive".to_string()),
            owner: None,
            homepage_url: None,
            text: "MIT text".to_string(),
            reference_urls: vec![],
            osi_license_key: None,
            text_urls: vec![],
            osi_url: None,
            faq_url: None,
            other_urls: vec![],
            notes: None,
            is_deprecated: false,
            is_exception: false,
            is_unknown: false,
            is_generic: false,
            replaced_by: vec![],
            minimum_coverage: None,
            standard_notice: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            ignorable_urls: None,
            ignorable_emails: None,
        };
        let licenses = [license];

        let fingerprint_a =
            compute_rules_fingerprint(&[rule_a], &licenses).expect("fingerprint A");
        let fingerprint_b =
            compute_rules_fingerprint(&[rule_b], &licenses).expect("fingerprint B");

        assert_ne!(fingerprint_a, fingerprint_b);
    }
}