provenant/license_detection/
license_cache.rs1use std::fmt::Write as _;
5use std::fs;
6use std::path::{Path, PathBuf};
7
8use anyhow::{Context, Result};
9use rancor::{Panic, ResultExt};
10use sha2::{Digest, Sha256};
11
12use crate::cache::{CacheConfig, write_bytes_atomically};
13use crate::license_detection::index::LicenseIndex;
14use crate::license_detection::models::{LoadedLicense, LoadedRule};
15
/// Name of the directory, directly beneath the configured cache root, under which
/// every license index cache namespace is stored.
const CACHE_ROOT_DIR_NAME: &str = "license-index";
/// Extension (without the leading dot) given to cache files. Pruning only ever
/// removes files that carry this extension.
const CACHE_FILE_EXTENSION: &str = "rkyv";
18
/// Selects which sub-directory of the license index cache a cache file belongs to.
///
/// Each namespace has its own directory, so saving (and pruning) in one namespace
/// never touches files of the other.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LicenseCacheNamespace {
    /// Stored under the `embedded` directory.
    Embedded,
    /// Stored under the `custom` directory.
    CustomRules,
}

impl LicenseCacheNamespace {
    /// Returns the on-disk directory name used for this namespace.
    fn directory_name(self) -> &'static str {
        // Deliberately exhaustive: a new variant must pick its directory here.
        match self {
            LicenseCacheNamespace::Embedded => "embedded",
            LicenseCacheNamespace::CustomRules => "custom",
        }
    }
}
33
/// Settings that control where the license index cache lives and whether it is used.
pub struct LicenseCacheConfig {
    /// Base cache directory. Cache files are placed under
    /// `<root_dir>/license-index/<namespace>/`, and saving/deleting take the
    /// exclusive cache lock on this directory.
    pub root_dir: PathBuf,
    /// Stored verbatim by [`LicenseCacheConfig::new`]; no function in this file reads it.
    /// NOTE(review): presumably callers use it to force a rebuild of the index — confirm.
    pub reindex: bool,
    /// When `false`, loading and size queries return `None`, and saving and
    /// deleting return `Ok(())` without touching the filesystem.
    pub enabled: bool,
}
39
40impl LicenseCacheConfig {
41 pub fn new(root_dir: PathBuf, reindex: bool, enabled: bool) -> Self {
42 Self {
43 root_dir,
44 reindex,
45 enabled,
46 }
47 }
48
49 pub fn default_root_dir() -> PathBuf {
50 CacheConfig::default_root_dir_without_scan_root()
51 }
52
53 fn namespace_dir(&self, namespace: LicenseCacheNamespace) -> PathBuf {
54 self.root_dir
55 .join(CACHE_ROOT_DIR_NAME)
56 .join(namespace.directory_name())
57 }
58
59 fn cache_file_path(&self, namespace: LicenseCacheNamespace, fingerprint: &[u8; 32]) -> PathBuf {
60 self.namespace_dir(namespace).join(format!(
61 "{}.{}",
62 fingerprint_hex(fingerprint),
63 CACHE_FILE_EXTENSION
64 ))
65 }
66}
67
/// Renders a 32-byte fingerprint as 64 lowercase hexadecimal characters,
/// two zero-padded digits per byte, in byte order.
fn fingerprint_hex(fingerprint: &[u8; 32]) -> String {
    fingerprint.iter().fold(
        String::with_capacity(fingerprint.len() * 2),
        |mut encoded, byte| {
            // The formatting result is intentionally ignored, as before.
            let _ = write!(encoded, "{byte:02x}");
            encoded
        },
    )
}
75
76fn prune_namespace_dir(namespace_dir: &Path, active_path: &Path) -> Result<()> {
77 if !namespace_dir.exists() {
78 return Ok(());
79 }
80
81 for entry in fs::read_dir(namespace_dir)
82 .with_context(|| format!("Failed to read license cache namespace {namespace_dir:?}"))?
83 {
84 let path = entry?.path();
85 if path == active_path
86 || path.extension().and_then(|ext| ext.to_str()) != Some(CACHE_FILE_EXTENSION)
87 {
88 continue;
89 }
90 fs::remove_file(&path)
91 .with_context(|| format!("Failed to prune stale license cache file {path:?}"))?;
92 }
93
94 Ok(())
95}
96
97pub fn compute_rules_fingerprint(
98 rules: &[LoadedRule],
99 licenses: &[LoadedLicense],
100) -> Result<[u8; 32]> {
101 let mut sorted_rules: Vec<_> = rules.iter().collect();
102 sorted_rules.sort_by_key(|r| &r.identifier);
103 let mut sorted_licenses: Vec<_> = licenses.iter().collect();
104 sorted_licenses.sort_by_key(|l| &l.key);
105
106 let serialized = postcard::to_allocvec(&(sorted_rules, sorted_licenses))
107 .context("Failed to serialize effective rules/licenses for cache fingerprinting")?;
108
109 Ok(Sha256::digest(serialized).into())
110}
111
112pub fn compute_artifact_fingerprint(artifact_bytes: &[u8]) -> [u8; 32] {
113 Sha256::digest(artifact_bytes).into()
114}
115
116pub fn load_cached_index(
117 config: &LicenseCacheConfig,
118 namespace: LicenseCacheNamespace,
119 fingerprint: &[u8; 32],
120) -> Result<Option<LicenseIndex>> {
121 if !config.enabled {
122 return Ok(None);
123 }
124
125 let cache_path = config.cache_file_path(namespace, fingerprint);
126
127 if !cache_path.exists() {
128 return Ok(None);
129 }
130
131 let bytes = match fs::read(&cache_path) {
132 Ok(bytes) => bytes,
133 Err(_) => return Ok(None),
134 };
135
136 if bytes.len() < 32 {
137 return Ok(None);
138 }
139
140 let stored_fingerprint: [u8; 32] = bytes[..32].try_into().unwrap();
141 if stored_fingerprint != *fingerprint {
142 return Ok(None);
143 }
144
145 let archived =
146 match rkyv::access::<rkyv::Archived<LicenseIndex>, rkyv::rancor::Error>(&bytes[32..]) {
147 Ok(archived) => archived,
148 Err(_) => return Ok(None),
149 };
150
151 let cached: LicenseIndex = rkyv::deserialize::<LicenseIndex, Panic>(archived).always_ok();
152
153 Ok(Some(cached))
154}
155
156pub fn save_cached_index(
157 config: &LicenseCacheConfig,
158 namespace: LicenseCacheNamespace,
159 cached: &LicenseIndex,
160 fingerprint: &[u8; 32],
161) -> Result<()> {
162 if !config.enabled {
163 return Ok(());
164 }
165
166 let rkyv_bytes = rkyv::to_bytes::<rkyv::rancor::Error>(cached)
167 .map_err(|e| anyhow::anyhow!("Failed to serialize license index cache: {}", e))?;
168
169 let mut payload = Vec::with_capacity(fingerprint.len() + rkyv_bytes.len());
170 payload.extend_from_slice(fingerprint);
171 payload.extend_from_slice(&rkyv_bytes);
172
173 let namespace_dir = config.namespace_dir(namespace);
174 let cache_path = config.cache_file_path(namespace, fingerprint);
175
176 crate::cache::locking::with_exclusive_cache_lock(&config.root_dir, || {
177 fs::create_dir_all(&namespace_dir)
178 .with_context(|| "Failed to create license index cache directory")?;
179 prune_namespace_dir(&namespace_dir, &cache_path)?;
180 write_bytes_atomically(&cache_path, &payload)
181 .with_context(|| "Failed to persist license index cache file")
182 })?;
183
184 Ok(())
185}
186
187pub fn delete_cache(
188 config: &LicenseCacheConfig,
189 namespace: LicenseCacheNamespace,
190 fingerprint: &[u8; 32],
191) -> Result<()> {
192 if !config.enabled {
193 return Ok(());
194 }
195
196 let cache_path = config.cache_file_path(namespace, fingerprint);
197 crate::cache::locking::with_exclusive_cache_lock(&config.root_dir, || -> Result<()> {
198 if cache_path.exists() {
199 fs::remove_file(&cache_path).context("Failed to delete license index cache file")?;
200 }
201 Ok(())
202 })?;
203
204 Ok(())
205}
206
207pub fn cache_file_size(
208 config: &LicenseCacheConfig,
209 namespace: LicenseCacheNamespace,
210 fingerprint: &[u8; 32],
211) -> Option<u64> {
212 if !config.enabled {
213 return None;
214 }
215
216 fs::metadata(config.cache_file_path(namespace, fingerprint))
217 .ok()
218 .map(|m| m.len())
219}
220
#[cfg(test)]
mod tests {
    use tempfile::TempDir;

    use super::*;
    use crate::license_detection::automaton::Automaton;
    use crate::license_detection::index::dictionary::TokenDictionary;

    /// Builds an otherwise empty index whose only distinguishing value is the
    /// SPDX license list version `"test"`.
    fn sample_cached_index() -> LicenseIndex {
        LicenseIndex {
            dictionary: TokenDictionary::default(),
            len_legalese: 0,
            rid_by_hash: Default::default(),
            rules_by_rid: Default::default(),
            tids_by_rid: Default::default(),
            rules_automaton: Automaton::empty(),
            unknown_automaton: Automaton::empty(),
            sets_by_rid: Default::default(),
            rule_metadata_by_identifier: Default::default(),
            msets_by_rid: Default::default(),
            high_sets_by_rid: Default::default(),
            high_postings_by_rid: Default::default(),
            false_positive_rids: Default::default(),
            approx_matchable_rids: Default::default(),
            licenses_by_key: Default::default(),
            pattern_id_to_rid: Default::default(),
            rid_by_spdx_key: Default::default(),
            unknown_spdx_rid: None,
            rids_by_high_tid: Default::default(),
            spdx_license_list_version: Some("test".to_string()),
        }
    }

    /// A single MIT text rule used as the baseline for fingerprint comparisons.
    fn example_rule() -> LoadedRule {
        LoadedRule {
            identifier: "example.RULE".to_string(),
            license_expression: "mit".to_string(),
            text: "example text".to_string(),
            rule_kind: crate::license_detection::models::RuleKind::Text,
            is_false_positive: false,
            is_required_phrase: false,
            skip_for_required_phrase_generation: false,
            relevance: Some(100),
            minimum_coverage: None,
            has_stored_minimum_coverage: false,
            is_continuous: false,
            referenced_filenames: None,
            ignorable_urls: None,
            ignorable_emails: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            language: None,
            notes: None,
            is_deprecated: false,
            replaced_by: vec![],
        }
    }

    /// The MIT license entry accompanying [`example_rule`].
    fn example_license() -> LoadedLicense {
        LoadedLicense {
            key: "mit".to_string(),
            short_name: Some("MIT".to_string()),
            name: "MIT License".to_string(),
            language: Some("en".to_string()),
            spdx_license_key: Some("MIT".to_string()),
            other_spdx_license_keys: vec![],
            category: Some("Permissive".to_string()),
            owner: None,
            homepage_url: None,
            text: "MIT text".to_string(),
            reference_urls: vec![],
            osi_license_key: None,
            text_urls: vec![],
            osi_url: None,
            faq_url: None,
            other_urls: vec![],
            notes: None,
            is_deprecated: false,
            is_exception: false,
            is_unknown: false,
            is_generic: false,
            replaced_by: vec![],
            minimum_coverage: None,
            standard_notice: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            ignorable_urls: None,
            ignorable_emails: None,
        }
    }

    /// The cache path is `<root>/license-index/<namespace>/<hex fingerprint>.rkyv`.
    #[test]
    fn test_cache_file_path_uses_namespace_and_fingerprint() {
        let config = LicenseCacheConfig::new(PathBuf::from("/tmp/cache-root"), false, true);
        let fingerprint = [0xAB; 32];

        let embedded_path = config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint);
        let expected_embedded = PathBuf::from(format!(
            "/tmp/cache-root/license-index/embedded/{}.rkyv",
            "ab".repeat(32)
        ));
        assert_eq!(embedded_path, expected_embedded);

        let custom_path = config.cache_file_path(LicenseCacheNamespace::CustomRules, &fingerprint);
        let expected_custom = PathBuf::from(format!(
            "/tmp/cache-root/license-index/custom/{}.rkyv",
            "ab".repeat(32)
        ));
        assert_eq!(custom_path, expected_custom);
    }

    /// Saving a new index removes older `.rkyv` files from the same namespace.
    #[test]
    fn test_save_cached_index_prunes_stale_namespace_entries() {
        let temp_dir = TempDir::new().expect("create temp dir");
        let config = LicenseCacheConfig::new(temp_dir.path().to_path_buf(), false, true);
        let fingerprint = [0x11; 32];

        // Seed the namespace with a file that should be pruned by the save.
        let namespace_dir = config.namespace_dir(LicenseCacheNamespace::Embedded);
        fs::create_dir_all(&namespace_dir).expect("create namespace dir");
        let stale_path = namespace_dir.join("stale.rkyv");
        fs::write(stale_path, b"old").expect("write stale cache file");

        let index = sample_cached_index();
        save_cached_index(
            &config,
            LicenseCacheNamespace::Embedded,
            &index,
            &fingerprint,
        )
        .expect("save cache");

        let remaining: Vec<PathBuf> = fs::read_dir(&namespace_dir)
            .expect("read namespace dir")
            .map(|entry| entry.expect("dir entry").path())
            .collect();
        let expected_path = config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint);

        assert_eq!(remaining.len(), 1);
        assert_eq!(remaining[0], expected_path);
    }

    /// With the cache disabled, saving writes nothing and loading yields `None`.
    #[test]
    fn test_disabled_cache_skips_persistence() {
        let temp_dir = TempDir::new().expect("create temp dir");
        let config = LicenseCacheConfig::new(temp_dir.path().to_path_buf(), false, false);
        let fingerprint = [0x22; 32];

        let index = sample_cached_index();
        save_cached_index(
            &config,
            LicenseCacheNamespace::Embedded,
            &index,
            &fingerprint,
        )
        .expect("disabled save should succeed");

        let cache_path = config.cache_file_path(LicenseCacheNamespace::Embedded, &fingerprint);
        assert!(!cache_path.exists());

        let loaded = load_cached_index(&config, LicenseCacheNamespace::Embedded, &fingerprint)
            .expect("disabled load should succeed");
        assert!(loaded.is_none());
    }

    /// Changing only a rule's metadata must change the rules fingerprint.
    #[test]
    fn test_compute_rules_fingerprint_changes_when_rule_metadata_changes() {
        let rule_a = example_rule();
        let mut rule_b = rule_a.clone();
        rule_b.referenced_filenames = Some(vec!["LICENSE".to_string()]);

        let licenses = [example_license()];

        let fingerprint_a =
            compute_rules_fingerprint(&[rule_a], &licenses).expect("fingerprint A");
        let fingerprint_b =
            compute_rules_fingerprint(&[rule_b], &licenses).expect("fingerprint B");

        assert_ne!(fingerprint_a, fingerprint_b);
    }
}