Skip to main content

sqry_core/cache/
key.rs

1//! Cache key for identifying parsed files.
2//!
3//! The cache key uniquely identifies a file's AST summary using:
4//! - Canonical file path (normalized, symlinks resolved)
5//! - Language identifier (plugin name)
6//! - Content hash (BLAKE3 digest)
7//!
8//! # Path Canonicalization
9//!
10//! Cache keys attempt to canonicalize paths to handle symlinks and relative paths
11//! consistently. If canonicalization fails (file deleted, permission denied, or
12//! unsupported filesystem), the key falls back to the original path.
13//!
14//! # Examples
15//!
16//! ```rust
17//! use sqry_core::cache::CacheKey;
18//! use sqry_core::hash::Blake3Hash;
19//! use std::path::PathBuf;
20//!
21//! let hash_hex = "a".repeat(64);
22//! let hash = Blake3Hash::from_hex(&hash_hex).unwrap();
23//! let key = CacheKey::new(
24//!     PathBuf::from("src/main.rs"),
25//!     "rust",
26//!     hash,
27//! );
28//!
29//! // Keys are comparable and hashable
30//! assert_eq!(key.language(), "rust");
31//! ```
32
33use crate::hash::Blake3Hash;
34use std::fmt;
35use std::path::{Path, PathBuf};
36
37/// Unique identifier for cached AST summaries.
38///
39/// A cache key combines:
40/// - **Canonical path**: Normalized file path with symlinks resolved
41/// - **Language ID**: Plugin identifier (e.g., "rust", "python")
42/// - **Content hash**: BLAKE3 digest of file contents
43///
44/// # Equality and Hashing
45///
46/// Two cache keys are equal if all three components match. This ensures
47/// cache misses when:
48/// - File content changes (different hash)
49/// - File is moved (different canonical path)
50/// - Language plugin changes (different language ID)
51///
52/// # Canonicalization Fallback
53///
54/// If path canonicalization fails, the original path is used. This handles:
55/// - Deleted files during cache cleanup
56/// - Permission-denied scenarios
57/// - Filesystems without canonicalization support
58///
59/// Fallback events are logged at DEBUG level via the `log` crate.
60#[derive(Debug, Clone, PartialEq, Eq, Hash)]
61pub struct CacheKey {
62    /// Canonical file path (or original if canonicalization failed).
63    canonical_path: PathBuf,
64
65    /// Language identifier from the plugin.
66    language: String,
67
68    /// BLAKE3 hash of file contents.
69    content_hash: Blake3Hash,
70
71    /// Whether canonicalization succeeded.
72    ///
73    /// Used for diagnostics and telemetry. When `false`, indicates
74    /// the cache key is using the original path as a fallback.
75    canonicalization_succeeded: bool,
76}
77
78impl CacheKey {
79    /// Create a new cache key with path canonicalization.
80    ///
81    /// Attempts to canonicalize the path. If canonicalization fails,
82    /// falls back to the original path and logs a DEBUG message.
83    ///
84    /// # Arguments
85    ///
86    /// - `path`: File path (can be relative or contain symlinks)
87    /// - `language`: Language identifier from the plugin
88    /// - `content_hash`: BLAKE3 hash of file contents
89    ///
90    /// # Examples
91    ///
92    /// ```rust
93    /// use sqry_core::cache::CacheKey;
94    /// use sqry_core::hash::Blake3Hash;
95    /// use std::path::PathBuf;
96    ///
97    /// let hash_hex = "a".repeat(64);
98    /// let hash = Blake3Hash::from_hex(&hash_hex).unwrap();
99    /// let key = CacheKey::new(
100    ///     PathBuf::from("./src/main.rs"),
101    ///     "rust",
102    ///     hash,
103    /// );
104    /// ```
105    pub fn new<P: AsRef<Path>>(
106        path: P,
107        language: impl Into<String>,
108        content_hash: Blake3Hash,
109    ) -> Self {
110        let path = path.as_ref();
111        let language = language.into();
112
113        // Attempt canonicalization
114        let (mut canonical_path, canonicalization_succeeded) = match path.canonicalize() {
115            Ok(canonical) => {
116                log::trace!(
117                    "Canonicalized cache key path: {} -> {}",
118                    path.display(),
119                    canonical.display()
120                );
121                (canonical, true)
122            }
123            Err(e) => {
124                log::debug!(
125                    "Cache key canonicalization failed for {}: {}. Using original path.",
126                    path.display(),
127                    e
128                );
129                (path.to_path_buf(), false)
130            }
131        };
132
133        // Normalize case for case-insensitive filesystems (Windows, macOS)
134        // This prevents duplicate cache entries for paths that differ only in case
135        canonical_path = Self::normalize_case_if_needed(canonical_path);
136
137        Self {
138            canonical_path,
139            language,
140            content_hash,
141            canonicalization_succeeded,
142        }
143    }
144
145    /// Normalize path case for case-insensitive filesystems.
146    ///
147    /// On Windows and macOS (case-insensitive by default), converts the path
148    /// to lowercase to ensure consistent cache keys for paths that differ only
149    /// in case (e.g., "FILE.rs" vs "file.rs").
150    ///
151    /// On Linux and other case-sensitive systems, returns the path unchanged.
152    fn normalize_case_if_needed(path: PathBuf) -> PathBuf {
153        #[cfg(any(target_os = "windows", target_os = "macos"))]
154        {
155            // Convert to lowercase on case-insensitive platforms
156            if let Some(path_str) = path.to_str() {
157                PathBuf::from(path_str.to_lowercase())
158            } else {
159                // Non-UTF8 path, can't normalize safely
160                log::debug!("Cannot normalize non-UTF8 path: {:?}", path);
161                path
162            }
163        }
164
165        #[cfg(not(any(target_os = "windows", target_os = "macos")))]
166        {
167            // Case-sensitive filesystem, no normalization needed
168            path
169        }
170    }
171
172    /// Create a cache key without path canonicalization.
173    ///
174    /// Uses the provided path as-is, skipping canonicalization.
175    /// Useful for testing or when paths are already canonical.
176    ///
177    /// # Examples
178    ///
179    /// ```rust
180    /// use sqry_core::cache::CacheKey;
181    /// use sqry_core::hash::Blake3Hash;
182    /// use std::path::PathBuf;
183    ///
184    /// let hash_hex = "a".repeat(64);
185    /// let hash = Blake3Hash::from_hex(&hash_hex).unwrap();
186    /// let key = CacheKey::from_raw_path(
187    ///     PathBuf::from("/absolute/path/file.rs"),
188    ///     "rust",
189    ///     hash,
190    /// );
191    /// ```
192    pub fn from_raw_path<P: Into<PathBuf>>(
193        path: P,
194        language: impl Into<String>,
195        content_hash: Blake3Hash,
196    ) -> Self {
197        Self {
198            canonical_path: path.into(),
199            language: language.into(),
200            content_hash,
201            canonicalization_succeeded: true, // Assume caller knows what they're doing
202        }
203    }
204
205    /// Get the canonical file path (or original if canonicalization failed).
206    #[must_use]
207    pub fn path(&self) -> &Path {
208        &self.canonical_path
209    }
210
211    /// Get the language identifier.
212    #[must_use]
213    pub fn language(&self) -> &str {
214        &self.language
215    }
216
217    /// Get the content hash.
218    #[must_use]
219    pub fn content_hash(&self) -> &Blake3Hash {
220        &self.content_hash
221    }
222
223    /// Check if path canonicalization succeeded.
224    ///
225    /// Returns `false` if the key is using the original path as a fallback.
226    #[must_use]
227    pub fn is_canonical(&self) -> bool {
228        self.canonicalization_succeeded
229    }
230
231    /// Compute a storage key for file persistence.
232    ///
233    /// Returns a string combining language, content hash, path hash, and filename,
234    /// suitable for use as a directory/file path in the persistent cache.
235    ///
236    /// Format: `{language}/{content_hash}/{path_hash}/{filename}`
237    ///
238    /// The path hash prevents collisions when different files have identical
239    /// content and filename (e.g., `/proj1/main.rs` and `/proj2/main.rs`).
240    ///
241    /// # Examples
242    ///
243    /// ```rust
244    /// use sqry_core::cache::CacheKey;
245    /// use sqry_core::hash::Blake3Hash;
246    /// use std::path::PathBuf;
247    ///
248    /// let hash_hex = "a".repeat(64);
249    /// let hash = Blake3Hash::from_hex(&hash_hex).unwrap();
250    /// let key = CacheKey::from_raw_path(
251    ///     PathBuf::from("/path/to/file.rs"),
252    ///     "rust",
253    ///     hash,
254    /// );
255    ///
256    /// let storage_key = key.storage_key();
257    /// assert!(storage_key.starts_with("rust/"));
258    /// assert!(storage_key.ends_with("file.rs"));
259    /// ```
260    #[must_use]
261    pub fn storage_key(&self) -> String {
262        let filename = self
263            .canonical_path
264            .file_name()
265            .and_then(|n| n.to_str())
266            .unwrap_or("unknown");
267
268        // Hash the canonical path to prevent collisions between different files
269        // with identical content and filename (e.g., /proj1/main.rs vs /proj2/main.rs)
270        let path_hash = {
271            let path_str = self.canonical_path.to_string_lossy();
272            let hash = blake3::hash(path_str.as_bytes());
273            // Use first 8 bytes (16 hex chars) - sufficient for collision resistance
274            hex::encode(&hash.as_bytes()[..8])
275        };
276
277        format!(
278            "{}/{}/{}/{}",
279            self.language,
280            self.content_hash.to_hex(),
281            path_hash,
282            filename
283        )
284    }
285}
286
287impl fmt::Display for CacheKey {
288    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
289        write!(
290            f,
291            "{}:{}:{}",
292            self.canonical_path.display(),
293            self.language,
294            self.content_hash.to_hex()
295        )
296    }
297}
298
299#[cfg(test)]
300mod tests {
301    use super::*;
302    use crate::cache::GraphNodeSummary;
303    use std::fs;
304    use std::io::Write;
305    use std::sync::Arc;
306
307    fn make_test_hash(byte: u8) -> Blake3Hash {
308        Blake3Hash::from_bytes([byte; 32])
309    }
310
311    #[test]
312    fn test_cache_key_new() {
313        let hash = make_test_hash(0x42);
314        let key = CacheKey::new(PathBuf::from("test.rs"), "rust", hash);
315
316        assert_eq!(key.language(), "rust");
317        assert_eq!(key.content_hash(), &hash);
318        // Path might be canonical or not depending on filesystem
319    }
320
321    #[test]
322    fn test_cache_key_from_raw_path() {
323        let hash = make_test_hash(0x42);
324        let path = PathBuf::from("/absolute/path/test.rs");
325        let key = CacheKey::from_raw_path(path.clone(), "rust", hash);
326
327        assert_eq!(key.path(), path.as_path());
328        assert_eq!(key.language(), "rust");
329        assert_eq!(key.content_hash(), &hash);
330        assert!(key.is_canonical()); // from_raw_path assumes canonical
331    }
332
333    #[test]
334    fn test_cache_key_equality() {
335        let hash1 = make_test_hash(0x42);
336        let hash2 = make_test_hash(0x43);
337
338        let key1 = CacheKey::from_raw_path("/path/file.rs", "rust", hash1);
339        let key2 = CacheKey::from_raw_path("/path/file.rs", "rust", hash1);
340        let key3 = CacheKey::from_raw_path("/path/file.rs", "python", hash1);
341        let key4 = CacheKey::from_raw_path("/path/file.rs", "rust", hash2);
342        let key5 = CacheKey::from_raw_path("/other/file.rs", "rust", hash1);
343
344        // Same components = equal
345        assert_eq!(key1, key2);
346
347        // Different language = not equal
348        assert_ne!(key1, key3);
349
350        // Different hash = not equal
351        assert_ne!(key1, key4);
352
353        // Different path = not equal
354        assert_ne!(key1, key5);
355    }
356
357    #[test]
358    fn test_cache_key_hash_consistency() {
359        use std::collections::HashMap;
360
361        let hash = make_test_hash(0x42);
362        let key1 = CacheKey::from_raw_path("/path/file.rs", "rust", hash);
363        let key2 = CacheKey::from_raw_path("/path/file.rs", "rust", hash);
364
365        let mut map = HashMap::new();
366        map.insert(key1.clone(), "value1");
367        map.insert(key2.clone(), "value2");
368
369        // Should have only one entry (keys are equal)
370        assert_eq!(map.len(), 1);
371        assert_eq!(map.get(&key1), Some(&"value2"));
372    }
373
374    #[test]
375    fn test_cache_key_storage_key() {
376        let hash = make_test_hash(0x42);
377        let key = CacheKey::from_raw_path("/path/file.rs", "rust", hash);
378
379        let storage_key = key.storage_key();
380
381        // Format: {language}/{content_hash}/{path_hash}/{filename}
382        assert!(storage_key.starts_with("rust/"));
383        assert!(storage_key.ends_with("/file.rs"));
384
385        // Should contain path hash (16 hex chars from 8 bytes)
386        let parts: Vec<&str> = storage_key.split('/').collect();
387        assert_eq!(
388            parts.len(),
389            4,
390            "Should have 4 parts: language/content_hash/path_hash/filename"
391        );
392        assert_eq!(parts[0], "rust");
393        assert_eq!(parts[1].len(), 64, "Content hash should be 64 hex chars");
394        assert_eq!(parts[2].len(), 16, "Path hash should be 16 hex chars");
395        assert_eq!(parts[3], "file.rs");
396    }
397
398    #[test]
399    fn test_cache_key_storage_no_collision() {
400        // Two different files with same filename and content should have different storage keys
401        let hash = make_test_hash(0x42); // Same content hash
402
403        let key1 = CacheKey::from_raw_path("/project1/main.rs", "rust", hash);
404        let key2 = CacheKey::from_raw_path("/project2/main.rs", "rust", hash);
405
406        let storage1 = key1.storage_key();
407        let storage2 = key2.storage_key();
408
409        // Should have different storage keys due to different paths
410        assert_ne!(
411            storage1, storage2,
412            "Different paths should produce different storage keys"
413        );
414
415        // Both should have same language and content hash
416        assert!(storage1.starts_with("rust/"));
417        assert!(storage2.starts_with("rust/"));
418
419        // But different path hashes
420        let parts1: Vec<&str> = storage1.split('/').collect();
421        let parts2: Vec<&str> = storage2.split('/').collect();
422
423        assert_eq!(parts1[1], parts2[1], "Same content hash");
424        assert_ne!(parts1[2], parts2[2], "Different path hashes");
425        assert_eq!(parts1[3], parts2[3], "Same filename");
426    }
427
428    #[test]
429    fn test_cache_key_display() {
430        let hash = make_test_hash(0x42);
431        let key = CacheKey::from_raw_path("/path/file.rs", "rust", hash);
432
433        let display = format!("{key}");
434
435        // Format: path:language:hash
436        assert!(display.contains("/path/file.rs"));
437        assert!(display.contains("rust"));
438        assert!(display.contains(&hash.to_hex()));
439    }
440
441    #[test]
442    fn test_cache_key_canonicalization_success() {
443        // Create a real temporary file
444        let tmp_cache_dir = std::env::temp_dir();
445        let temp_file = tmp_cache_dir.join("sqry_test_cache_key.rs");
446        let mut file = fs::File::create(&temp_file).unwrap();
447        file.write_all(b"fn main() {}").unwrap();
448        drop(file);
449
450        let hash = make_test_hash(0x42);
451        let key = CacheKey::new(&temp_file, "rust", hash);
452
453        // Should have canonicalized successfully
454        assert!(key.is_canonical());
455        // Canonical path should be absolute
456        assert!(key.path().is_absolute());
457
458        // Cleanup
459        let _ = fs::remove_file(&temp_file);
460    }
461
462    #[test]
463    fn test_cache_key_canonicalization_fallback() {
464        // Use a path that doesn't exist
465        let nonexistent = PathBuf::from("/nonexistent/path/file.rs");
466        let hash = make_test_hash(0x42);
467
468        let key = CacheKey::new(&nonexistent, "rust", hash);
469
470        // Canonicalization should have failed
471        assert!(!key.is_canonical());
472        // Should fall back to original path
473        assert_eq!(key.path(), nonexistent.as_path());
474    }
475
476    #[test]
477    fn test_cache_key_different_languages() {
478        let hash = make_test_hash(0x42);
479        let key_rust = CacheKey::from_raw_path("/path/file.txt", "rust", hash);
480        let key_python = CacheKey::from_raw_path("/path/file.txt", "python", hash);
481
482        // Same path and hash but different language
483        assert_ne!(key_rust, key_python);
484        assert_ne!(key_rust.storage_key(), key_python.storage_key());
485    }
486
487    #[test]
488    fn test_cache_key_relative_vs_absolute() {
489        // Create a real file to enable canonicalization
490        let tmp_cache_dir = std::env::temp_dir();
491        let temp_file = tmp_cache_dir.join("sqry_test_relative.rs");
492        let mut file = fs::File::create(&temp_file).unwrap();
493        file.write_all(b"// test").unwrap();
494        drop(file);
495
496        let hash = make_test_hash(0x42);
497
498        // Both should canonicalize to the same absolute path
499        let key1 = CacheKey::new(&temp_file, "rust", hash);
500        let key2 = CacheKey::new(temp_file.canonicalize().unwrap(), "rust", hash);
501
502        // Both should have canonical paths and be equal
503        assert!(key1.is_canonical());
504        assert!(key2.is_canonical());
505        assert_eq!(key1, key2);
506
507        // Cleanup
508        let _ = fs::remove_file(&temp_file);
509    }
510
511    #[test]
512    #[cfg(any(target_os = "windows", target_os = "macos"))]
513    fn test_cache_key_case_normalization() {
514        // On case-insensitive filesystems, paths differing only in case
515        // should produce the same cache key
516        let _hash = make_test_hash(0x42);
517
518        // Use from_raw_path to test the normalization directly
519        // (new() would canonicalize which might change case anyway)
520        let lowercase_path = PathBuf::from("/path/to/file.rs");
521        let uppercase_path = PathBuf::from("/PATH/TO/FILE.RS");
522        let mixed_path = PathBuf::from("/Path/To/File.rs");
523
524        // Apply normalization manually (simulating what new() does)
525        let normalized_lower = CacheKey::normalize_case_if_needed(lowercase_path.clone());
526        let normalized_upper = CacheKey::normalize_case_if_needed(uppercase_path.clone());
527        let normalized_mixed = CacheKey::normalize_case_if_needed(mixed_path.clone());
528
529        // All should normalize to the same lowercase path
530        assert_eq!(normalized_lower, normalized_upper);
531        assert_eq!(normalized_lower, normalized_mixed);
532        assert_eq!(normalized_lower.to_str().unwrap(), "/path/to/file.rs");
533    }
534
535    #[test]
536    #[cfg(not(any(target_os = "windows", target_os = "macos")))]
537    fn test_cache_key_case_preservation() {
538        // On case-sensitive filesystems (Linux), paths should preserve case
539        let lowercase_path = PathBuf::from("/path/to/file.rs");
540        let uppercase_path = PathBuf::from("/PATH/TO/FILE.RS");
541
542        // Apply normalization (should be no-op on Linux)
543        let normalized_lower = CacheKey::normalize_case_if_needed(lowercase_path.clone());
544        let normalized_upper = CacheKey::normalize_case_if_needed(uppercase_path.clone());
545
546        // Should preserve original case
547        assert_eq!(normalized_lower, lowercase_path);
548        assert_eq!(normalized_upper, uppercase_path);
549        assert_ne!(normalized_lower, normalized_upper);
550    }
551
552    #[test]
553    fn test_cache_key_symlink_resolution() {
554        use std::fs;
555        use tempfile::TempDir;
556
557        // Create a temporary directory with a real file and a symlink
558        let tmp_cache_dir = TempDir::new().unwrap();
559        let real_file = tmp_cache_dir.path().join("real_file.rs");
560        let symlink = tmp_cache_dir.path().join("symlink.rs");
561
562        // Create the real file
563        fs::write(&real_file, "fn test() {}").unwrap();
564
565        // Create symlink (Unix only - skip on Windows)
566        #[cfg(unix)]
567        {
568            std::os::unix::fs::symlink(&real_file, &symlink).unwrap();
569
570            let hash = make_test_hash(0x42);
571
572            // Create cache keys for both paths
573            let key_real = CacheKey::new(&real_file, "rust", hash);
574            let key_symlink = CacheKey::new(&symlink, "rust", hash);
575
576            // Both should canonicalize to the same path
577            assert_eq!(
578                key_real.path(),
579                key_symlink.path(),
580                "Symlinks should resolve to the same canonical path"
581            );
582        }
583
584        #[cfg(not(unix))]
585        {
586            // On Windows, just verify the test compiles
587            let _ = (real_file, symlink);
588        }
589    }
590
591    #[test]
592    fn test_cache_key_mixed_case_paths_same_file() {
593        use std::fs;
594        use tempfile::TempDir;
595
596        // Create a temporary file
597        let tmp_cache_dir = TempDir::new().unwrap();
598        let file_path = tmp_cache_dir.path().join("TestFile.rs");
599        fs::write(&file_path, "fn test() {}").unwrap();
600
601        let hash = make_test_hash(0x42);
602
603        // Create keys with different case variations
604        let key1 = CacheKey::new(&file_path, "rust", hash);
605
606        // On case-insensitive systems, these should normalize to the same key
607        // On case-sensitive systems, only the exact path works
608        #[cfg(any(target_os = "windows", target_os = "macos"))]
609        {
610            // Lowercase version should normalize to same path
611            let lowercase_path = tmp_cache_dir.path().join("testfile.rs");
612            let key2 = CacheKey::new(&lowercase_path, "rust", hash);
613
614            // After normalization, paths should be equal (both lowercase)
615            assert_eq!(
616                key1.path().to_str().unwrap().to_lowercase(),
617                key2.path().to_str().unwrap().to_lowercase(),
618                "Case variations should normalize on case-insensitive filesystems"
619            );
620        }
621
622        #[cfg(not(any(target_os = "windows", target_os = "macos")))]
623        {
624            // On case-sensitive systems, only exact path match works
625            // Different case = different file
626            let _ = key1; // Just verify it compiles
627        }
628    }
629
630    #[test]
631    fn test_cache_key_non_utf8_path() {
632        // Test behavior with non-UTF8 paths (should handle gracefully)
633        #[cfg(unix)]
634        {
635            use std::ffi::OsStr;
636            use std::os::unix::ffi::OsStrExt;
637
638            // Create a path with invalid UTF-8
639            let invalid_bytes = b"/tmp/\xFF\xFE.rs";
640            let invalid_path = PathBuf::from(OsStr::from_bytes(invalid_bytes));
641
642            let hash = make_test_hash(0x42);
643
644            // Should not panic, even with non-UTF8 path
645            let key = CacheKey::from_raw_path(invalid_path.clone(), "rust", hash);
646            assert_eq!(key.path(), invalid_path.as_path());
647        }
648    }
649
650    #[test]
651    fn test_serialized_size_fallback() {
652        // Test that serialized_size handles errors gracefully
653        use crate::graph::unified::node::NodeKind;
654
655        let summary = GraphNodeSummary::new(
656            Arc::from("test_function"),
657            NodeKind::Function,
658            Arc::from(Path::new("test.rs")),
659            10,
660            0,
661            20,
662            1,
663        );
664
665        // Should return actual size
666        let size = summary.serialized_size();
667        assert!(size > 0, "Serialized size should be positive");
668        assert!(size <= 512, "Serialized size should be reasonable");
669
670        // The fallback path is hard to test without breaking postcard,
671        // but we can verify the method doesn't panic
672    }
673}