Skip to main content

provenant/license_detection/
license_cache.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use std::fmt::Write as _;
5use std::fs;
6use std::path::{Path, PathBuf};
7
8use anyhow::{Context, Result};
9use rancor::{Panic, ResultExt};
10use sha2::{Digest, Sha256};
11
12use crate::cache::{CacheConfig, write_bytes_atomically};
13use crate::license_detection::index::LicenseIndex;
14use crate::license_detection::models::{LoadedLicense, LoadedRule};
15
/// Name of the directory, directly under the cache root, that holds every license index cache.
const CACHE_ROOT_DIR_NAME: &str = "license-index";
/// Extension (without the leading dot) given to serialized license index cache files.
const CACHE_FILE_EXTENSION: &str = "rkyv";
18
/// Selects which group of license index cache entries is being addressed.
///
/// Every namespace owns a separate subdirectory, so entries belonging to
/// different namespaces are never stored side by side.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LicenseCacheNamespace {
    /// Entries kept under the `embedded` subdirectory.
    Embedded,
    /// Entries kept under the `custom` subdirectory.
    CustomRules,
}

impl LicenseCacheNamespace {
    /// Returns the on-disk subdirectory name for this namespace.
    fn directory_name(self) -> &'static str {
        const EMBEDDED_DIR: &str = "embedded";
        const CUSTOM_RULES_DIR: &str = "custom";

        match self {
            LicenseCacheNamespace::CustomRules => CUSTOM_RULES_DIR,
            LicenseCacheNamespace::Embedded => EMBEDDED_DIR,
        }
    }
}
33
34pub struct LicenseCacheConfig {
35    pub root_dir: PathBuf,
36    pub reindex: bool,
37    pub enabled: bool,
38}
39
40impl LicenseCacheConfig {
41    pub fn new(root_dir: PathBuf, reindex: bool, enabled: bool) -> Self {
42        Self {
43            root_dir,
44            reindex,
45            enabled,
46        }
47    }
48
49    pub fn default_root_dir() -> PathBuf {
50        CacheConfig::default_root_dir_without_scan_root()
51    }
52
53    fn namespace_dir(&self, namespace: LicenseCacheNamespace) -> PathBuf {
54        self.root_dir
55            .join(CACHE_ROOT_DIR_NAME)
56            .join(namespace.directory_name())
57    }
58
59    fn cache_file_path(&self, namespace: LicenseCacheNamespace, fingerprint: &[u8; 32]) -> PathBuf {
60        self.namespace_dir(namespace).join(format!(
61            "{}.{}",
62            fingerprint_hex(fingerprint),
63            CACHE_FILE_EXTENSION
64        ))
65    }
66}
67
/// Encodes a 32-byte fingerprint as 64 lowercase, zero-padded hexadecimal characters.
fn fingerprint_hex(fingerprint: &[u8; 32]) -> String {
    fingerprint
        .iter()
        .fold(String::with_capacity(2 * fingerprint.len()), |mut encoded, octet| {
            // Formatting into a `String` does not fail, so the `fmt::Result` is discarded.
            let _ = write!(encoded, "{octet:02x}");
            encoded
        })
}
75
76fn prune_namespace_dir(namespace_dir: &Path, active_path: &Path) -> Result<()> {
77    if !namespace_dir.exists() {
78        return Ok(());
79    }
80
81    for entry in fs::read_dir(namespace_dir)
82        .with_context(|| format!("Failed to read license cache namespace {namespace_dir:?}"))?
83    {
84        let path = entry?.path();
85        if path == active_path
86            || path.extension().and_then(|ext| ext.to_str()) != Some(CACHE_FILE_EXTENSION)
87        {
88            continue;
89        }
90        fs::remove_file(&path)
91            .with_context(|| format!("Failed to prune stale license cache file {path:?}"))?;
92    }
93
94    Ok(())
95}
96
97pub fn compute_rules_fingerprint(
98    rules: &[LoadedRule],
99    licenses: &[LoadedLicense],
100) -> Result<[u8; 32]> {
101    let mut sorted_rules: Vec<_> = rules.iter().collect();
102    sorted_rules.sort_by_key(|r| &r.identifier);
103    let mut sorted_licenses: Vec<_> = licenses.iter().collect();
104    sorted_licenses.sort_by_key(|l| &l.key);
105
106    let serialized = postcard::to_allocvec(&(sorted_rules, sorted_licenses))
107        .context("Failed to serialize effective rules/licenses for cache fingerprinting")?;
108
109    Ok(Sha256::digest(serialized).into())
110}
111
112pub fn compute_artifact_fingerprint(artifact_bytes: &[u8]) -> [u8; 32] {
113    Sha256::digest(artifact_bytes).into()
114}
115
116pub fn load_cached_index(
117    config: &LicenseCacheConfig,
118    namespace: LicenseCacheNamespace,
119    fingerprint: &[u8; 32],
120) -> Result<Option<LicenseIndex>> {
121    if !config.enabled {
122        return Ok(None);
123    }
124
125    let cache_path = config.cache_file_path(namespace, fingerprint);
126
127    if !cache_path.exists() {
128        return Ok(None);
129    }
130
131    let bytes = match fs::read(&cache_path) {
132        Ok(bytes) => bytes,
133        Err(_) => return Ok(None),
134    };
135
136    if bytes.len() < 32 {
137        return Ok(None);
138    }
139
140    let stored_fingerprint: [u8; 32] = bytes[..32].try_into().unwrap();
141    if stored_fingerprint != *fingerprint {
142        return Ok(None);
143    }
144
145    let archived =
146        match rkyv::access::<rkyv::Archived<LicenseIndex>, rkyv::rancor::Error>(&bytes[32..]) {
147            Ok(archived) => archived,
148            Err(_) => return Ok(None),
149        };
150
151    let cached: LicenseIndex = rkyv::deserialize::<LicenseIndex, Panic>(archived).always_ok();
152
153    Ok(Some(cached))
154}
155
156pub fn save_cached_index(
157    config: &LicenseCacheConfig,
158    namespace: LicenseCacheNamespace,
159    cached: &LicenseIndex,
160    fingerprint: &[u8; 32],
161) -> Result<()> {
162    if !config.enabled {
163        return Ok(());
164    }
165
166    let rkyv_bytes = rkyv::to_bytes::<rkyv::rancor::Error>(cached)
167        .map_err(|e| anyhow::anyhow!("Failed to serialize license index cache: {}", e))?;
168
169    let mut payload = Vec::with_capacity(fingerprint.len() + rkyv_bytes.len());
170    payload.extend_from_slice(fingerprint);
171    payload.extend_from_slice(&rkyv_bytes);
172
173    let namespace_dir = config.namespace_dir(namespace);
174    let cache_path = config.cache_file_path(namespace, fingerprint);
175
176    crate::cache::locking::with_exclusive_cache_lock(&config.root_dir, || {
177        fs::create_dir_all(&namespace_dir)
178            .with_context(|| "Failed to create license index cache directory")?;
179        prune_namespace_dir(&namespace_dir, &cache_path)?;
180        write_bytes_atomically(&cache_path, &payload)
181            .with_context(|| "Failed to persist license index cache file")
182    })?;
183
184    Ok(())
185}
186
187pub fn delete_cache(
188    config: &LicenseCacheConfig,
189    namespace: LicenseCacheNamespace,
190    fingerprint: &[u8; 32],
191) -> Result<()> {
192    if !config.enabled {
193        return Ok(());
194    }
195
196    let cache_path = config.cache_file_path(namespace, fingerprint);
197    crate::cache::locking::with_exclusive_cache_lock(&config.root_dir, || -> Result<()> {
198        if cache_path.exists() {
199            fs::remove_file(&cache_path).context("Failed to delete license index cache file")?;
200        }
201        Ok(())
202    })?;
203
204    Ok(())
205}
206
207pub fn cache_file_size(
208    config: &LicenseCacheConfig,
209    namespace: LicenseCacheNamespace,
210    fingerprint: &[u8; 32],
211) -> Option<u64> {
212    if !config.enabled {
213        return None;
214    }
215
216    fs::metadata(config.cache_file_path(namespace, fingerprint))
217        .ok()
218        .map(|m| m.len())
219}
220
#[cfg(test)]
mod tests {
    use tempfile::TempDir;

    use super::*;
    use crate::license_detection::automaton::Automaton;
    use crate::license_detection::index::dictionary::TokenDictionary;

    /// Builds an otherwise empty index that is sufficient for cache round trips.
    fn sample_cached_index() -> LicenseIndex {
        LicenseIndex {
            dictionary: TokenDictionary::default(),
            len_legalese: 0,
            rid_by_hash: Default::default(),
            rules_by_rid: Default::default(),
            tids_by_rid: Default::default(),
            rules_automaton: Automaton::empty(),
            unknown_automaton: Automaton::empty(),
            sets_by_rid: Default::default(),
            rule_metadata_by_identifier: Default::default(),
            msets_by_rid: Default::default(),
            high_sets_by_rid: Default::default(),
            high_postings_by_rid: Default::default(),
            false_positive_rids: Default::default(),
            approx_matchable_rids: Default::default(),
            licenses_by_key: Default::default(),
            pattern_id_to_rid: Default::default(),
            rid_by_spdx_key: Default::default(),
            unknown_spdx_rid: None,
            rids_by_high_tid: Default::default(),
            spdx_license_list_version: Some("test".to_string()),
        }
    }

    /// Builds the baseline rule used by the fingerprint test.
    fn sample_rule() -> LoadedRule {
        LoadedRule {
            identifier: "example.RULE".to_string(),
            license_expression: "mit".to_string(),
            text: "example text".to_string(),
            rule_kind: crate::license_detection::models::RuleKind::Text,
            is_false_positive: false,
            is_required_phrase: false,
            skip_for_required_phrase_generation: false,
            relevance: Some(100),
            minimum_coverage: None,
            has_stored_minimum_coverage: false,
            is_continuous: false,
            referenced_filenames: None,
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: None,
            is_deprecated: false,
            replaced_by: vec![],
        }
    }

    /// Builds the single license used by the fingerprint test.
    fn sample_license() -> LoadedLicense {
        LoadedLicense {
            key: "mit".to_string(),
            short_name: Some("MIT".to_string()),
            name: "MIT License".to_string(),
            language: Some("en".to_string()),
            spdx_license_key: Some("MIT".to_string()),
            other_spdx_license_keys: vec![],
            category: Some("Permissive".to_string()),
            owner: None,
            homepage_url: None,
            text: "MIT text".to_string(),
            reference_urls: vec![],
            osi_license_key: None,
            text_urls: vec![],
            osi_url: None,
            faq_url: None,
            other_urls: vec![],
            notes: None,
            is_deprecated: false,
            is_exception: false,
            is_unknown: false,
            is_generic: false,
            replaced_by: vec![],
            minimum_coverage: None,
            standard_notice: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            ignorable_urls: None,
            ignorable_emails: None,
        }
    }

    #[test]
    fn test_cache_file_path_uses_namespace_and_fingerprint() {
        let config = LicenseCacheConfig::new(PathBuf::from("/tmp/cache-root"), false, true);
        let fingerprint = [0xAB; 32];
        let expected_hex = "ab".repeat(32);

        let cases = [
            (LicenseCacheNamespace::Embedded, "embedded"),
            (LicenseCacheNamespace::CustomRules, "custom"),
        ];
        for (namespace, dir_name) in cases {
            let expected = PathBuf::from(format!(
                "/tmp/cache-root/license-index/{dir_name}/{expected_hex}.rkyv"
            ));
            assert_eq!(config.cache_file_path(namespace, &fingerprint), expected);
        }
    }

    #[test]
    fn test_save_cached_index_prunes_stale_namespace_entries() {
        let cache_root = TempDir::new().expect("create temp dir");
        let config = LicenseCacheConfig::new(cache_root.path().to_path_buf(), false, true);
        let fingerprint = [0x11; 32];

        // Seed the namespace with a leftover entry that the save must remove.
        let namespace_dir = config.namespace_dir(LicenseCacheNamespace::Embedded);
        fs::create_dir_all(&namespace_dir).expect("create namespace dir");
        fs::write(namespace_dir.join("stale.rkyv"), b"old").expect("write stale cache file");

        let index = sample_cached_index();
        save_cached_index(
            &config,
            LicenseCacheNamespace::Embedded,
            &index,
            &fingerprint,
        )
        .expect("save cache");

        let remaining: Vec<PathBuf> = fs::read_dir(&namespace_dir)
            .expect("read namespace dir")
            .map(|entry| entry.expect("dir entry").path())
            .collect();
        let expected = vec![config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint)];

        assert_eq!(remaining, expected);
    }

    #[test]
    fn test_disabled_cache_skips_persistence() {
        let cache_root = TempDir::new().expect("create temp dir");
        let config = LicenseCacheConfig::new(cache_root.path().to_path_buf(), false, false);
        let fingerprint = [0x22; 32];

        save_cached_index(
            &config,
            LicenseCacheNamespace::Embedded,
            &sample_cached_index(),
            &fingerprint,
        )
        .expect("disabled save should succeed");

        let cache_path = config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint);
        assert!(!cache_path.exists());

        let loaded = load_cached_index(&config, LicenseCacheNamespace::Embedded, &fingerprint)
            .expect("disabled load should succeed");
        assert!(loaded.is_none());
    }

    #[test]
    fn test_compute_rules_fingerprint_changes_when_rule_metadata_changes() {
        let baseline_rule = sample_rule();
        let changed_rule = LoadedRule {
            referenced_filenames: Some(vec!["LICENSE".to_string()]),
            ..baseline_rule.clone()
        };
        let licenses = [sample_license()];

        let baseline_fingerprint =
            compute_rules_fingerprint(&[baseline_rule], &licenses).expect("fingerprint A");
        let changed_fingerprint =
            compute_rules_fingerprint(&[changed_rule], &licenses).expect("fingerprint B");

        assert_ne!(baseline_fingerprint, changed_fingerprint);
    }
}