Skip to main content

provenant/license_detection/
license_cache.rs

1use std::fmt::Write as _;
2use std::fs;
3use std::path::{Path, PathBuf};
4
5use anyhow::{Context, Result};
6use rancor::{Panic, ResultExt};
7use sha2::{Digest, Sha256};
8
9use crate::cache::{CacheConfig, write_bytes_atomically};
10use crate::license_detection::index::LicenseIndex;
11use crate::license_detection::models::{LoadedLicense, LoadedRule};
12
/// Name of the directory, directly under the cache root, that holds every
/// license-index cache namespace.
const CACHE_ROOT_DIR_NAME: &str = "license-index";

/// File extension (without the leading dot) given to cache files; pruning only
/// ever considers files carrying this extension.
const CACHE_FILE_EXTENSION: &str = "rkyv";
15
/// Partition of the license-index cache; each namespace lives in its own
/// sub-directory so entries from one never collide with (or get pruned by)
/// the other.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LicenseCacheNamespace {
    /// Stored under the `embedded` directory.
    // NOTE(review): presumably the index built from the rules bundled with the
    // binary — confirm against the caller.
    Embedded,
    /// Stored under the `custom` directory.
    // NOTE(review): presumably the index built from user-supplied rules —
    // confirm against the caller.
    CustomRules,
}

impl LicenseCacheNamespace {
    /// Returns the on-disk directory name used for this namespace.
    fn directory_name(self) -> &'static str {
        match self {
            Self::CustomRules => "custom",
            Self::Embedded => "embedded",
        }
    }
}
30
/// Settings that control where the license-index cache lives and whether it
/// is used at all.
///
/// The type derives `Debug`, `Clone`, `PartialEq` and `Eq` so it can be
/// logged, duplicated and compared like any other plain configuration value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LicenseCacheConfig {
    /// Cache root; the license-index directory is created beneath it.
    pub root_dir: PathBuf,
    /// Re-index request flag.
    // NOTE(review): no function in this file reads this field — presumably the
    // caller uses it to force a rebuild; confirm.
    pub reindex: bool,
    /// When `false`, the load/save/delete/size functions in this file return
    /// early without touching the filesystem.
    pub enabled: bool,
}
36
37impl LicenseCacheConfig {
38    pub fn new(root_dir: PathBuf, reindex: bool, enabled: bool) -> Self {
39        Self {
40            root_dir,
41            reindex,
42            enabled,
43        }
44    }
45
46    pub fn default_root_dir() -> PathBuf {
47        CacheConfig::default_root_dir_without_scan_root()
48    }
49
50    fn namespace_dir(&self, namespace: LicenseCacheNamespace) -> PathBuf {
51        self.root_dir
52            .join(CACHE_ROOT_DIR_NAME)
53            .join(namespace.directory_name())
54    }
55
56    fn cache_file_path(&self, namespace: LicenseCacheNamespace, fingerprint: &[u8; 32]) -> PathBuf {
57        self.namespace_dir(namespace).join(format!(
58            "{}.{}",
59            fingerprint_hex(fingerprint),
60            CACHE_FILE_EXTENSION
61        ))
62    }
63}
64
/// Renders a 32-byte fingerprint as a 64-character lowercase hexadecimal
/// string (two digits per byte, zero-padded).
fn fingerprint_hex(fingerprint: &[u8; 32]) -> String {
    let buffer = String::with_capacity(fingerprint.len() * 2);
    fingerprint.iter().fold(buffer, |mut acc, octet| {
        // The write result is deliberately discarded, as before.
        let _ = write!(acc, "{octet:02x}");
        acc
    })
}
72
73fn prune_namespace_dir(namespace_dir: &Path, active_path: &Path) -> Result<()> {
74    if !namespace_dir.exists() {
75        return Ok(());
76    }
77
78    for entry in fs::read_dir(namespace_dir)
79        .with_context(|| format!("Failed to read license cache namespace {namespace_dir:?}"))?
80    {
81        let path = entry?.path();
82        if path == active_path
83            || path.extension().and_then(|ext| ext.to_str()) != Some(CACHE_FILE_EXTENSION)
84        {
85            continue;
86        }
87        fs::remove_file(&path)
88            .with_context(|| format!("Failed to prune stale license cache file {path:?}"))?;
89    }
90
91    Ok(())
92}
93
94pub fn compute_rules_fingerprint(
95    rules: &[LoadedRule],
96    licenses: &[LoadedLicense],
97) -> Result<[u8; 32]> {
98    let mut sorted_rules: Vec<_> = rules.iter().collect();
99    sorted_rules.sort_by_key(|r| &r.identifier);
100    let mut sorted_licenses: Vec<_> = licenses.iter().collect();
101    sorted_licenses.sort_by_key(|l| &l.key);
102
103    let serialized = postcard::to_allocvec(&(sorted_rules, sorted_licenses))
104        .context("Failed to serialize effective rules/licenses for cache fingerprinting")?;
105
106    Ok(Sha256::digest(serialized).into())
107}
108
109pub fn compute_artifact_fingerprint(artifact_bytes: &[u8]) -> [u8; 32] {
110    Sha256::digest(artifact_bytes).into()
111}
112
113pub fn load_cached_index(
114    config: &LicenseCacheConfig,
115    namespace: LicenseCacheNamespace,
116    fingerprint: &[u8; 32],
117) -> Result<Option<LicenseIndex>> {
118    if !config.enabled {
119        return Ok(None);
120    }
121
122    let cache_path = config.cache_file_path(namespace, fingerprint);
123
124    if !cache_path.exists() {
125        return Ok(None);
126    }
127
128    let bytes = match fs::read(&cache_path) {
129        Ok(bytes) => bytes,
130        Err(_) => return Ok(None),
131    };
132
133    if bytes.len() < 32 {
134        return Ok(None);
135    }
136
137    let stored_fingerprint: [u8; 32] = bytes[..32].try_into().unwrap();
138    if stored_fingerprint != *fingerprint {
139        return Ok(None);
140    }
141
142    let archived =
143        match rkyv::access::<rkyv::Archived<LicenseIndex>, rkyv::rancor::Error>(&bytes[32..]) {
144            Ok(archived) => archived,
145            Err(_) => return Ok(None),
146        };
147
148    let cached: LicenseIndex = rkyv::deserialize::<LicenseIndex, Panic>(archived).always_ok();
149
150    Ok(Some(cached))
151}
152
153pub fn save_cached_index(
154    config: &LicenseCacheConfig,
155    namespace: LicenseCacheNamespace,
156    cached: &LicenseIndex,
157    fingerprint: &[u8; 32],
158) -> Result<()> {
159    if !config.enabled {
160        return Ok(());
161    }
162
163    let rkyv_bytes = rkyv::to_bytes::<rkyv::rancor::Error>(cached)
164        .map_err(|e| anyhow::anyhow!("Failed to serialize license index cache: {}", e))?;
165
166    let mut payload = Vec::with_capacity(fingerprint.len() + rkyv_bytes.len());
167    payload.extend_from_slice(fingerprint);
168    payload.extend_from_slice(&rkyv_bytes);
169
170    let namespace_dir = config.namespace_dir(namespace);
171    let cache_path = config.cache_file_path(namespace, fingerprint);
172
173    crate::cache::locking::with_exclusive_cache_lock(&config.root_dir, || {
174        fs::create_dir_all(&namespace_dir)
175            .with_context(|| "Failed to create license index cache directory")?;
176        prune_namespace_dir(&namespace_dir, &cache_path)?;
177        write_bytes_atomically(&cache_path, &payload)
178            .with_context(|| "Failed to persist license index cache file")
179    })?;
180
181    Ok(())
182}
183
184pub fn delete_cache(
185    config: &LicenseCacheConfig,
186    namespace: LicenseCacheNamespace,
187    fingerprint: &[u8; 32],
188) -> Result<()> {
189    if !config.enabled {
190        return Ok(());
191    }
192
193    let cache_path = config.cache_file_path(namespace, fingerprint);
194    crate::cache::locking::with_exclusive_cache_lock(&config.root_dir, || -> Result<()> {
195        if cache_path.exists() {
196            fs::remove_file(&cache_path).context("Failed to delete license index cache file")?;
197        }
198        Ok(())
199    })?;
200
201    Ok(())
202}
203
204pub fn cache_file_size(
205    config: &LicenseCacheConfig,
206    namespace: LicenseCacheNamespace,
207    fingerprint: &[u8; 32],
208) -> Option<u64> {
209    if !config.enabled {
210        return None;
211    }
212
213    fs::metadata(config.cache_file_path(namespace, fingerprint))
214        .ok()
215        .map(|m| m.len())
216}
217
#[cfg(test)]
mod tests {
    use tempfile::TempDir;

    use super::*;
    use crate::license_detection::automaton::Automaton;
    use crate::license_detection::index::dictionary::TokenDictionary;

    /// Builds a `LicenseIndex` whose collections are all empty/default, with
    /// only the SPDX list version set, for use as a cache payload.
    fn sample_cached_index() -> LicenseIndex {
        LicenseIndex {
            dictionary: TokenDictionary::default(),
            len_legalese: 0,
            rid_by_hash: Default::default(),
            rules_by_rid: Default::default(),
            tids_by_rid: Default::default(),
            rules_automaton: Automaton::empty(),
            unknown_automaton: Automaton::empty(),
            sets_by_rid: Default::default(),
            rule_metadata_by_identifier: Default::default(),
            msets_by_rid: Default::default(),
            high_sets_by_rid: Default::default(),
            high_postings_by_rid: Default::default(),
            false_positive_rids: Default::default(),
            approx_matchable_rids: Default::default(),
            licenses_by_key: Default::default(),
            pattern_id_to_rid: Default::default(),
            rid_by_spdx_key: Default::default(),
            unknown_spdx_rid: None,
            rids_by_high_tid: Default::default(),
            spdx_license_list_version: Some("test".to_string()),
        }
    }

    /// A minimal text rule used as the baseline for fingerprint comparisons.
    fn sample_rule() -> LoadedRule {
        LoadedRule {
            identifier: "example.RULE".to_string(),
            license_expression: "mit".to_string(),
            text: "example text".to_string(),
            rule_kind: crate::license_detection::models::RuleKind::Text,
            is_false_positive: false,
            is_required_phrase: false,
            skip_for_required_phrase_generation: false,
            relevance: Some(100),
            minimum_coverage: None,
            has_stored_minimum_coverage: false,
            is_continuous: false,
            referenced_filenames: None,
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: None,
            is_deprecated: false,
            replaced_by: vec![],
        }
    }

    /// A minimal MIT license record paired with `sample_rule`.
    fn sample_license() -> LoadedLicense {
        LoadedLicense {
            key: "mit".to_string(),
            short_name: Some("MIT".to_string()),
            name: "MIT License".to_string(),
            language: Some("en".to_string()),
            spdx_license_key: Some("MIT".to_string()),
            other_spdx_license_keys: vec![],
            category: Some("Permissive".to_string()),
            owner: None,
            homepage_url: None,
            text: "MIT text".to_string(),
            reference_urls: vec![],
            osi_license_key: None,
            text_urls: vec![],
            osi_url: None,
            faq_url: None,
            other_urls: vec![],
            notes: None,
            is_deprecated: false,
            is_exception: false,
            is_unknown: false,
            is_generic: false,
            replaced_by: vec![],
            minimum_coverage: None,
            standard_notice: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            ignorable_urls: None,
            ignorable_emails: None,
        }
    }

    /// The cache path is `<root>/license-index/<namespace>/<hex>.rkyv`.
    #[test]
    fn test_cache_file_path_uses_namespace_and_fingerprint() {
        let config = LicenseCacheConfig::new(PathBuf::from("/tmp/cache-root"), false, true);
        let fingerprint = [0xAB; 32];
        let hex = "ab".repeat(32);

        let embedded_path = config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint);
        assert_eq!(
            embedded_path,
            PathBuf::from(format!(
                "/tmp/cache-root/license-index/embedded/{}.rkyv",
                hex
            ))
        );

        let custom_path = config.cache_file_path(LicenseCacheNamespace::CustomRules, &fingerprint);
        assert_eq!(
            custom_path,
            PathBuf::from(format!("/tmp/cache-root/license-index/custom/{}.rkyv", hex))
        );
    }

    /// Saving a new entry removes older `.rkyv` files from the same namespace.
    #[test]
    fn test_save_cached_index_prunes_stale_namespace_entries() {
        let temp_dir = TempDir::new().expect("create temp dir");
        let config = LicenseCacheConfig::new(temp_dir.path().to_path_buf(), false, true);
        let fingerprint = [0x11; 32];

        // Seed the namespace with a leftover cache file from an older run.
        let namespace_dir = config.namespace_dir(LicenseCacheNamespace::Embedded);
        fs::create_dir_all(&namespace_dir).expect("create namespace dir");
        let stale_file = namespace_dir.join("stale.rkyv");
        fs::write(stale_file, b"old").expect("write stale cache file");

        let index = sample_cached_index();
        save_cached_index(
            &config,
            LicenseCacheNamespace::Embedded,
            &index,
            &fingerprint,
        )
        .expect("save cache");

        let remaining: Vec<PathBuf> = fs::read_dir(&namespace_dir)
            .expect("read namespace dir")
            .map(|entry| entry.expect("dir entry").path())
            .collect();
        let expected_path = config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint);

        assert_eq!(remaining.len(), 1);
        assert_eq!(remaining[0], expected_path);
    }

    /// With the cache disabled, saving writes nothing and loading yields `None`.
    #[test]
    fn test_disabled_cache_skips_persistence() {
        let temp_dir = TempDir::new().expect("create temp dir");
        let config = LicenseCacheConfig::new(temp_dir.path().to_path_buf(), false, false);
        let fingerprint = [0x22; 32];

        save_cached_index(
            &config,
            LicenseCacheNamespace::Embedded,
            &sample_cached_index(),
            &fingerprint,
        )
        .expect("disabled save should succeed");

        let would_be_path = config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint);
        assert!(!would_be_path.exists());

        let loaded = load_cached_index(&config, LicenseCacheNamespace::Embedded, &fingerprint)
            .expect("disabled load should succeed");
        assert!(loaded.is_none());
    }

    /// Changing a rule's metadata (here `referenced_filenames`) must change
    /// the fingerprint.
    #[test]
    fn test_compute_rules_fingerprint_changes_when_rule_metadata_changes() {
        let baseline_rule = sample_rule();
        let mut changed_rule = baseline_rule.clone();
        changed_rule.referenced_filenames = Some(vec!["LICENSE".to_string()]);

        let licenses = [sample_license()];

        let baseline_fingerprint =
            compute_rules_fingerprint(&[baseline_rule], &licenses).expect("fingerprint A");
        let changed_fingerprint =
            compute_rules_fingerprint(&[changed_rule], &licenses).expect("fingerprint B");

        assert_ne!(baseline_fingerprint, changed_fingerprint);
    }
}
395}