provenant/license_detection/
license_cache.rs1use std::fmt::Write as _;
2use std::fs;
3use std::path::{Path, PathBuf};
4
5use anyhow::{Context, Result};
6use rancor::{Panic, ResultExt};
7use sha2::{Digest, Sha256};
8
9use crate::cache::{CacheConfig, write_bytes_atomically};
10use crate::license_detection::index::LicenseIndex;
11use crate::license_detection::models::{LoadedLicense, LoadedRule};
12
/// Name of the directory, directly under the configured cache root, that holds
/// one subdirectory per license cache namespace.
const CACHE_ROOT_DIR_NAME: &str = "license-index";
/// File extension given to persisted license index cache files; pruning only
/// ever considers files carrying this extension.
const CACHE_FILE_EXTENSION: &str = "rkyv";
15
/// Identifies which family of license index a cache entry belongs to.
///
/// Every namespace is stored in its own subdirectory of the license index
/// cache, so entries of different namespaces never share a directory.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LicenseCacheNamespace {
    /// Entries stored under the `embedded` directory.
    Embedded,
    /// Entries stored under the `custom` directory.
    CustomRules,
}

impl LicenseCacheNamespace {
    /// Returns the on-disk directory name used for this namespace.
    fn directory_name(self) -> &'static str {
        // Exhaustive on purpose: a new variant must pick its directory here.
        match self {
            LicenseCacheNamespace::CustomRules => "custom",
            LicenseCacheNamespace::Embedded => "embedded",
        }
    }
}
30
/// Settings that control where, and whether, the license index cache is
/// persisted on disk.
#[derive(Debug, Clone)]
pub struct LicenseCacheConfig {
    /// Root directory under which the license index cache tree is created.
    pub root_dir: PathBuf,
    /// Reindex request flag. It is not read by the cache helpers in this file;
    /// presumably callers consult it before attempting a load — TODO confirm.
    pub reindex: bool,
    /// When `false`, the load/save/delete/size helpers in this file return
    /// early without touching the filesystem.
    pub enabled: bool,
}
36
37impl LicenseCacheConfig {
38 pub fn new(root_dir: PathBuf, reindex: bool, enabled: bool) -> Self {
39 Self {
40 root_dir,
41 reindex,
42 enabled,
43 }
44 }
45
46 pub fn default_root_dir() -> PathBuf {
47 CacheConfig::default_root_dir_without_scan_root()
48 }
49
50 fn namespace_dir(&self, namespace: LicenseCacheNamespace) -> PathBuf {
51 self.root_dir
52 .join(CACHE_ROOT_DIR_NAME)
53 .join(namespace.directory_name())
54 }
55
56 fn cache_file_path(&self, namespace: LicenseCacheNamespace, fingerprint: &[u8; 32]) -> PathBuf {
57 self.namespace_dir(namespace).join(format!(
58 "{}.{}",
59 fingerprint_hex(fingerprint),
60 CACHE_FILE_EXTENSION
61 ))
62 }
63}
64
/// Renders a 32-byte fingerprint as a 64-character lowercase hexadecimal string.
fn fingerprint_hex(fingerprint: &[u8; 32]) -> String {
    // Two hex digits per byte, so the buffer never has to grow.
    let buffer = String::with_capacity(fingerprint.len() * 2);
    fingerprint.iter().fold(buffer, |mut acc, byte| {
        // Formatting into a `String` cannot fail, so the result is discarded.
        let _ = write!(acc, "{byte:02x}");
        acc
    })
}
72
73fn prune_namespace_dir(namespace_dir: &Path, active_path: &Path) -> Result<()> {
74 if !namespace_dir.exists() {
75 return Ok(());
76 }
77
78 for entry in fs::read_dir(namespace_dir)
79 .with_context(|| format!("Failed to read license cache namespace {namespace_dir:?}"))?
80 {
81 let path = entry?.path();
82 if path == active_path
83 || path.extension().and_then(|ext| ext.to_str()) != Some(CACHE_FILE_EXTENSION)
84 {
85 continue;
86 }
87 fs::remove_file(&path)
88 .with_context(|| format!("Failed to prune stale license cache file {path:?}"))?;
89 }
90
91 Ok(())
92}
93
94pub fn compute_rules_fingerprint(
95 rules: &[LoadedRule],
96 licenses: &[LoadedLicense],
97) -> Result<[u8; 32]> {
98 let mut sorted_rules: Vec<_> = rules.iter().collect();
99 sorted_rules.sort_by_key(|r| &r.identifier);
100 let mut sorted_licenses: Vec<_> = licenses.iter().collect();
101 sorted_licenses.sort_by_key(|l| &l.key);
102
103 let serialized = postcard::to_allocvec(&(sorted_rules, sorted_licenses))
104 .context("Failed to serialize effective rules/licenses for cache fingerprinting")?;
105
106 Ok(Sha256::digest(serialized).into())
107}
108
109pub fn compute_artifact_fingerprint(artifact_bytes: &[u8]) -> [u8; 32] {
110 Sha256::digest(artifact_bytes).into()
111}
112
113pub fn load_cached_index(
114 config: &LicenseCacheConfig,
115 namespace: LicenseCacheNamespace,
116 fingerprint: &[u8; 32],
117) -> Result<Option<LicenseIndex>> {
118 if !config.enabled {
119 return Ok(None);
120 }
121
122 let cache_path = config.cache_file_path(namespace, fingerprint);
123
124 if !cache_path.exists() {
125 return Ok(None);
126 }
127
128 let bytes = match fs::read(&cache_path) {
129 Ok(bytes) => bytes,
130 Err(_) => return Ok(None),
131 };
132
133 if bytes.len() < 32 {
134 return Ok(None);
135 }
136
137 let stored_fingerprint: [u8; 32] = bytes[..32].try_into().unwrap();
138 if stored_fingerprint != *fingerprint {
139 return Ok(None);
140 }
141
142 let archived =
143 match rkyv::access::<rkyv::Archived<LicenseIndex>, rkyv::rancor::Error>(&bytes[32..]) {
144 Ok(archived) => archived,
145 Err(_) => return Ok(None),
146 };
147
148 let cached: LicenseIndex = rkyv::deserialize::<LicenseIndex, Panic>(archived).always_ok();
149
150 Ok(Some(cached))
151}
152
153pub fn save_cached_index(
154 config: &LicenseCacheConfig,
155 namespace: LicenseCacheNamespace,
156 cached: &LicenseIndex,
157 fingerprint: &[u8; 32],
158) -> Result<()> {
159 if !config.enabled {
160 return Ok(());
161 }
162
163 let rkyv_bytes = rkyv::to_bytes::<rkyv::rancor::Error>(cached)
164 .map_err(|e| anyhow::anyhow!("Failed to serialize license index cache: {}", e))?;
165
166 let mut payload = Vec::with_capacity(fingerprint.len() + rkyv_bytes.len());
167 payload.extend_from_slice(fingerprint);
168 payload.extend_from_slice(&rkyv_bytes);
169
170 let namespace_dir = config.namespace_dir(namespace);
171 let cache_path = config.cache_file_path(namespace, fingerprint);
172
173 crate::cache::locking::with_exclusive_cache_lock(&config.root_dir, || {
174 fs::create_dir_all(&namespace_dir)
175 .with_context(|| "Failed to create license index cache directory")?;
176 prune_namespace_dir(&namespace_dir, &cache_path)?;
177 write_bytes_atomically(&cache_path, &payload)
178 .with_context(|| "Failed to persist license index cache file")
179 })?;
180
181 Ok(())
182}
183
184pub fn delete_cache(
185 config: &LicenseCacheConfig,
186 namespace: LicenseCacheNamespace,
187 fingerprint: &[u8; 32],
188) -> Result<()> {
189 if !config.enabled {
190 return Ok(());
191 }
192
193 let cache_path = config.cache_file_path(namespace, fingerprint);
194 crate::cache::locking::with_exclusive_cache_lock(&config.root_dir, || -> Result<()> {
195 if cache_path.exists() {
196 fs::remove_file(&cache_path).context("Failed to delete license index cache file")?;
197 }
198 Ok(())
199 })?;
200
201 Ok(())
202}
203
204pub fn cache_file_size(
205 config: &LicenseCacheConfig,
206 namespace: LicenseCacheNamespace,
207 fingerprint: &[u8; 32],
208) -> Option<u64> {
209 if !config.enabled {
210 return None;
211 }
212
213 fs::metadata(config.cache_file_path(namespace, fingerprint))
214 .ok()
215 .map(|m| m.len())
216}
217
#[cfg(test)]
mod tests {
    use tempfile::TempDir;

    use super::*;
    use crate::license_detection::automaton::Automaton;
    use crate::license_detection::index::dictionary::TokenDictionary;

    /// Builds a minimal index whose collections are all empty.
    fn minimal_index() -> LicenseIndex {
        LicenseIndex {
            dictionary: TokenDictionary::default(),
            len_legalese: 0,
            rid_by_hash: Default::default(),
            rules_by_rid: Default::default(),
            tids_by_rid: Default::default(),
            rules_automaton: Automaton::empty(),
            unknown_automaton: Automaton::empty(),
            sets_by_rid: Default::default(),
            rule_metadata_by_identifier: Default::default(),
            msets_by_rid: Default::default(),
            high_sets_by_rid: Default::default(),
            high_postings_by_rid: Default::default(),
            false_positive_rids: Default::default(),
            approx_matchable_rids: Default::default(),
            licenses_by_key: Default::default(),
            pattern_id_to_rid: Default::default(),
            rid_by_spdx_key: Default::default(),
            unknown_spdx_rid: None,
            rids_by_high_tid: Default::default(),
            spdx_license_list_version: Some("test".to_string()),
        }
    }

    /// Builds the baseline rule used by the fingerprint test.
    fn baseline_rule() -> LoadedRule {
        LoadedRule {
            identifier: "example.RULE".to_string(),
            license_expression: "mit".to_string(),
            text: "example text".to_string(),
            rule_kind: crate::license_detection::models::RuleKind::Text,
            is_false_positive: false,
            is_required_phrase: false,
            skip_for_required_phrase_generation: false,
            relevance: Some(100),
            minimum_coverage: None,
            has_stored_minimum_coverage: false,
            is_continuous: false,
            referenced_filenames: None,
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: None,
            is_deprecated: false,
            replaced_by: vec![],
        }
    }

    /// Builds the single license shared by both fingerprint computations.
    fn mit_license() -> LoadedLicense {
        LoadedLicense {
            key: "mit".to_string(),
            short_name: Some("MIT".to_string()),
            name: "MIT License".to_string(),
            language: Some("en".to_string()),
            spdx_license_key: Some("MIT".to_string()),
            other_spdx_license_keys: vec![],
            category: Some("Permissive".to_string()),
            owner: None,
            homepage_url: None,
            text: "MIT text".to_string(),
            reference_urls: vec![],
            osi_license_key: None,
            text_urls: vec![],
            osi_url: None,
            faq_url: None,
            other_urls: vec![],
            notes: None,
            is_deprecated: false,
            is_exception: false,
            is_unknown: false,
            is_generic: false,
            replaced_by: vec![],
            minimum_coverage: None,
            standard_notice: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            ignorable_urls: None,
            ignorable_emails: None,
        }
    }

    #[test]
    fn test_cache_file_path_uses_namespace_and_fingerprint() {
        let config = LicenseCacheConfig::new(PathBuf::from("/tmp/cache-root"), false, true);
        let fingerprint = [0xAB; 32];
        let hex = "ab".repeat(32);

        let embedded_path = config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint);
        assert_eq!(
            embedded_path,
            PathBuf::from(format!(
                "/tmp/cache-root/license-index/embedded/{}.rkyv",
                hex
            ))
        );

        let custom_path = config.cache_file_path(LicenseCacheNamespace::CustomRules, &fingerprint);
        assert_eq!(
            custom_path,
            PathBuf::from(format!("/tmp/cache-root/license-index/custom/{}.rkyv", hex))
        );
    }

    #[test]
    fn test_save_cached_index_prunes_stale_namespace_entries() {
        let temp_dir = TempDir::new().expect("create temp dir");
        let config = LicenseCacheConfig::new(temp_dir.path().to_path_buf(), false, true);
        let fingerprint = [0x11; 32];

        // Seed the namespace with a stale cache file that the save must remove.
        let namespace_dir = config.namespace_dir(LicenseCacheNamespace::Embedded);
        fs::create_dir_all(&namespace_dir).expect("create namespace dir");
        fs::write(namespace_dir.join("stale.rkyv"), b"old").expect("write stale cache file");

        let index = minimal_index();
        save_cached_index(
            &config,
            LicenseCacheNamespace::Embedded,
            &index,
            &fingerprint,
        )
        .expect("save cache");

        let remaining: Vec<PathBuf> = fs::read_dir(&namespace_dir)
            .expect("read namespace dir")
            .map(|entry| entry.expect("dir entry").path())
            .collect();
        let expected = config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint);

        // Only the freshly written entry may survive.
        assert_eq!(remaining, vec![expected]);
    }

    #[test]
    fn test_disabled_cache_skips_persistence() {
        let temp_dir = TempDir::new().expect("create temp dir");
        let config = LicenseCacheConfig::new(temp_dir.path().to_path_buf(), false, false);
        let fingerprint = [0x22; 32];

        save_cached_index(
            &config,
            LicenseCacheNamespace::Embedded,
            &minimal_index(),
            &fingerprint,
        )
        .expect("disabled save should succeed");

        let cache_path = config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint);
        assert!(!cache_path.exists());

        let loaded = load_cached_index(&config, LicenseCacheNamespace::Embedded, &fingerprint)
            .expect("disabled load should succeed");
        assert!(loaded.is_none());
    }

    #[test]
    fn test_compute_rules_fingerprint_changes_when_rule_metadata_changes() {
        let rule_a = baseline_rule();
        // Same rule, differing only in one metadata field.
        let mut rule_b = rule_a.clone();
        rule_b.referenced_filenames = Some(vec!["LICENSE".to_string()]);

        let licenses = [mit_license()];

        let fingerprint_a =
            compute_rules_fingerprint(&[rule_a], &licenses).expect("fingerprint A");
        let fingerprint_b =
            compute_rules_fingerprint(&[rule_b], &licenses).expect("fingerprint B");

        assert_ne!(fingerprint_a, fingerprint_b);
    }
}
395}