Skip to main content

sqry_core/indexing/
incremental.rs

1//! Incremental indexing with hash-based change detection.
2//!
3//! This module provides fast file change detection using `XXHash64` (~10GB/s hashing speed)
4//! to enable incremental re-indexing. Only files that have changed since the last index
5//! are re-parsed, achieving 10-100x speedup for re-indexing operations.
6//!
7//! # Architecture
8//!
9//! The incremental indexing system uses a 3-level change detection strategy:
10//!
11//! 1. **Existence check**: Is the file still there?
12//! 2. **Metadata check**: Has size or mtime changed?
13//! 3. **Hash check**: Has content actually changed? (guards against mtime-only changes)
14//!
15//! # Usage
16//!
17//! ```rust,ignore
18//! use sqry_core::indexing::{HashIndex, FileHash};
19//! use std::path::Path;
20//!
21//! // Load existing hash index
22//! let cache_dir = Path::new(".sqry-cache");
23//! let mut hash_index = HashIndex::load(cache_dir)?;
24//!
25//! // Check if a file has changed
26//! let file_path = Path::new("src/main.rs");
27//! if hash_index.has_changed(file_path)? {
28//!     println!("File changed, re-indexing...");
29//!     // Re-index the file
30//!     let new_hash = FileHash::compute(file_path)?;
31//!     hash_index.update(file_path.to_path_buf(), new_hash);
32//! }
33//!
34//! // Save updated hash index
35//! hash_index.save(cache_dir)?;
36//! ```
37//!
38//! # Performance
39//!
40//! - `XXHash64` hashing: ~10 GB/s (vs BLAKE3 ~1 GB/s)
41//! - Metadata checks: ~1M files/sec
42//! - Target: 10-100x faster re-indexing for typical codebases
43
44use anyhow::{Context, Result};
45use serde::{Deserialize, Serialize};
46use std::collections::HashMap;
47use std::fs;
48use std::path::{Path, PathBuf};
49use std::time::SystemTime;
50use xxhash_rust::xxh64::xxh64;
51
52use crate::config::buffers::parse_buffer_size;
53
/// Magic bytes written at the start of the on-disk envelope.
///
/// `HashIndex::load` rejects any file whose decoded magic differs from this value.
const HASH_INDEX_MAGIC: [u8; 7] = *b"SQRYHSH";
/// Envelope format version written by `HashIndex::save`.
///
/// `HashIndex::load` refuses files carrying any other version (no legacy fallback).
const HASH_INDEX_ENVELOPE_VERSION: u16 = 1;
56
57#[derive(Serialize, Deserialize)]
58struct HashIndexEnvelope {
59    magic: [u8; 7],
60    version: u16,
61    sqry_version: String,
62    payload: Vec<u8>,
63}
64
65/// Hash information for a single file.
66///
67/// This structure stores both the content hash (for change detection)
68/// and metadata (for quick pre-checks before expensive hashing).
69///
70/// # Phase 3: Content Caching
71///
72/// The optional `content` field caches file content in memory to enable
73/// content-based diffing without disk I/O. This is controlled by the
74/// `HashIndex` content-cache limit (configurable per builder) and is not
75/// persisted to disk (`#[serde(skip)]`).
76#[derive(Debug, Clone, Serialize, Deserialize)]
77pub struct FileHash {
78    /// Absolute path to the file
79    pub path: PathBuf,
80    /// `XXHash64` hash of file contents
81    pub hash: u64,
82    /// File size in bytes
83    pub size: u64,
84    /// Last modification time
85    pub mtime: SystemTime,
86    /// Number of symbols extracted from this file (for stats)
87    pub symbols_count: usize,
88    /// Cached file content for content-based diffing (Phase 3/4).
89    ///
90    /// Populated according to the content-cache size configuration managed by
91    /// `HashIndex`. Not persisted to disk (rebuilt on load if needed).
92    #[serde(skip)]
93    pub content: Option<String>,
94}
95
96impl FileHash {
97    /// Compute hash for a file.
98    ///
99    /// This reads the file in chunks and computes an `XXHash64` hash.
100    ///
101    /// # Errors
102    ///
103    /// Returns an error if the file cannot be read or metadata cannot be accessed.
104    pub fn compute(path: &Path) -> Result<Self> {
105        use std::io::Read;
106
107        // Get metadata first
108        let metadata = fs::metadata(path)
109            .with_context(|| format!("Failed to read metadata for {}", path.display()))?;
110
111        let size = metadata.len();
112        let mtime = metadata
113            .modified()
114            .with_context(|| format!("Failed to get modification time for {}", path.display()))?;
115
116        // Read and hash file contents
117        let mut file = fs::File::open(path)
118            .with_context(|| format!("Failed to open file {}", path.display()))?;
119
120        // Read in chunks for efficiency (respects SQRY_PARSE_BUFFER env var)
121        let mut buffer = vec![0u8; parse_buffer_size()];
122        let mut hasher = xxhash_rust::xxh64::Xxh64::new(0); // Use seed 0 for consistency
123
124        loop {
125            let bytes_read = file
126                .read(&mut buffer)
127                .with_context(|| format!("Failed to read file {}", path.display()))?;
128
129            if bytes_read == 0 {
130                break;
131            }
132
133            hasher.update(&buffer[..bytes_read]);
134        }
135
136        let hash = hasher.digest();
137
138        Ok(Self {
139            path: path.to_path_buf(),
140            hash,
141            size,
142            mtime,
143            symbols_count: 0, // Will be updated after indexing
144            content: None,    // Phase 3: Not cached during compute (populated later)
145        })
146    }
147
148    /// Quick compute using just the file bytes (for testing or small files).
149    ///
150    /// This is faster for small files but requires loading the entire file into memory.
151    ///
152    /// # Errors
153    ///
154    /// Returns [`anyhow::Error`] when the filesystem metadata for `path` cannot be read.
155    pub fn from_bytes(path: &Path, content: &[u8]) -> Result<Self> {
156        let metadata = fs::metadata(path)
157            .with_context(|| format!("Failed to read metadata for {}", path.display()))?;
158
159        let hash = xxh64(content, 0); // Use seed 0 for consistency
160
161        Ok(Self {
162            path: path.to_path_buf(),
163            hash,
164            size: content.len() as u64,
165            mtime: metadata.modified().with_context(|| {
166                format!("Failed to get modification time for {}", path.display())
167            })?,
168            symbols_count: 0,
169            content: None, // Phase 3: Not cached by default
170        })
171    }
172
173    /// Check if file metadata (size or mtime) has changed.
174    ///
175    /// This is a fast pre-check before expensive hashing.
176    ///
177    /// # Errors
178    ///
179    /// Returns [`anyhow::Error`] when the file's metadata cannot be read.
180    pub fn metadata_changed(&self, path: &Path) -> Result<bool> {
181        let metadata = fs::metadata(path)
182            .with_context(|| format!("Failed to read metadata for {}", path.display()))?;
183
184        let current_size = metadata.len();
185        let current_mtime = metadata
186            .modified()
187            .with_context(|| format!("Failed to get modification time for {}", path.display()))?;
188
189        Ok(current_size != self.size || current_mtime != self.mtime)
190    }
191}
192
193/// Index of file hashes for incremental indexing.
194///
195/// This structure maintains a mapping from file paths to their hash information,
196/// enabling fast change detection during re-indexing operations.
197#[derive(Debug, Clone, Serialize, Deserialize)]
198pub struct HashIndex {
199    /// Map from file path to hash information
200    hashes: HashMap<PathBuf, FileHash>,
201    /// Total number of files tracked
202    pub file_count: usize,
203    /// Total number of symbols across all files
204    pub total_symbols: usize,
205    /// Maximum number of bytes to store per cached file (None = unlimited)
206    #[serde(default)]
207    content_cache_max_bytes: Option<usize>,
208}
209
210impl HashIndex {
211    /// Create a new empty hash index.
212    #[must_use]
213    pub fn new() -> Self {
214        Self::with_content_cache_limit(None)
215    }
216
217    /// Create a new hash index with an explicit content cache limit.
218    #[must_use]
219    pub fn with_content_cache_limit(limit: Option<usize>) -> Self {
220        Self {
221            hashes: HashMap::new(),
222            file_count: 0,
223            total_symbols: 0,
224            content_cache_max_bytes: limit,
225        }
226    }
227
228    /// Override the content cache limit at runtime.
229    pub fn set_content_cache_limit(&mut self, limit: Option<usize>) {
230        self.content_cache_max_bytes = limit;
231    }
232
233    /// Check if a file has changed using 3-level detection.
234    ///
235    /// Returns `true` if the file should be re-indexed, `false` if it can be skipped.
236    ///
237    /// # Detection Levels
238    ///
239    /// 1. **Existence**: If file not in index or doesn't exist on disk → changed
240    /// 2. **Metadata**: If size or mtime differs → check hash
241    /// 3. **Hash**: If hash differs → changed
242    ///
243    /// # Errors
244    ///
245    /// Returns an error if file metadata cannot be read or file cannot be hashed.
246    pub fn has_changed(&self, path: &Path) -> Result<bool> {
247        // Level 1: Existence check
248        let Some(stored_hash) = self.hashes.get(path) else {
249            // File not in index → definitely changed (new file)
250            return Ok(true);
251        };
252
253        // Check if file still exists
254        if !path.exists() {
255            // File was deleted → mark as changed so it can be removed from index
256            return Ok(true);
257        }
258
259        // Level 2: Metadata check (fast pre-screen)
260        if !stored_hash.metadata_changed(path)? {
261            // Metadata unchanged → file definitely hasn't changed
262            return Ok(false);
263        }
264
265        // Level 3: Hash check (metadata changed, need to verify content actually changed)
266        // This guards against cases like:
267        // - Touch command (mtime changed, content same)
268        // - Reverted edits (size/mtime changed, content back to original)
269        let current_hash = FileHash::compute(path)?;
270
271        Ok(current_hash.hash != stored_hash.hash)
272    }
273
274    /// Update the hash index with new file information.
275    ///
276    /// This should be called after successfully indexing a file.
277    pub fn update(&mut self, path: PathBuf, mut file_hash: FileHash) {
278        // Remove old entry if it exists to update total_symbols
279        if let Some(old_hash) = self.hashes.remove(&path) {
280            self.total_symbols = self.total_symbols.saturating_sub(old_hash.symbols_count);
281            self.file_count = self.file_count.saturating_sub(1);
282        }
283
284        // Add new entry
285        self.total_symbols += file_hash.symbols_count;
286        self.file_count += 1;
287
288        // Ensure path is consistent
289        file_hash.path.clone_from(&path);
290
291        self.hashes.insert(path, file_hash);
292    }
293
294    /// Remove a file from the index.
295    ///
296    /// This should be called when a file is deleted from the codebase.
297    pub fn remove(&mut self, path: &Path) -> Option<FileHash> {
298        if let Some(removed) = self.hashes.remove(path) {
299            self.total_symbols = self.total_symbols.saturating_sub(removed.symbols_count);
300            self.file_count = self.file_count.saturating_sub(1);
301            Some(removed)
302        } else {
303            None
304        }
305    }
306
307    /// Get hash information for a file.
308    #[must_use]
309    pub fn get(&self, path: &Path) -> Option<&FileHash> {
310        self.hashes.get(path)
311    }
312
313    /// Iterate over all tracked files.
314    pub fn iter(&self) -> impl Iterator<Item = (&PathBuf, &FileHash)> {
315        self.hashes.iter()
316    }
317
318    /// Get the number of tracked files.
319    #[must_use]
320    pub fn len(&self) -> usize {
321        self.file_count
322    }
323
324    /// Check if the index is empty.
325    #[must_use]
326    pub fn is_empty(&self) -> bool {
327        self.file_count == 0
328    }
329
330    /// Clear all entries from the index.
331    pub fn clear(&mut self) {
332        self.hashes.clear();
333        self.file_count = 0;
334        self.total_symbols = 0;
335    }
336
337    /// Get cached content for a file (Phase 3).
338    ///
339    /// Returns the cached content if available in memory.
340    /// This method is used by content-based diffing to obtain old file content
341    /// for comparison with the current file state.
342    ///
343    /// # Errors
344    ///
345    /// Returns an error if the content is not cached (was never cached or file too large).
346    /// This is intentional - we ONLY want cached content, not current disk content.
347    pub fn get_cached_content(&self, path: &Path) -> Result<String> {
348        // Try to get from cache
349        if let Some(file_hash) = self.hashes.get(path)
350            && let Some(ref content) = file_hash.content
351        {
352            return Ok(content.clone());
353        }
354
355        // Content not cached - this is an error, not a fallback case
356        anyhow::bail!("Content not cached for {}", path.display())
357    }
358
359    /// Cache file content for a file (Phase 3).
360    ///
361    /// Stores file content in memory for fast content-based diffing. The
362    /// maximum cached size is controlled by `content_cache_max_bytes`; when set
363    /// to `None` the cache is unbounded.
364    ///
365    /// This is called after successfully parsing a file to enable fast
366    /// incremental updates on the next change.
367    ///
368    /// # Size Limit
369    ///
370    /// Files larger than 100KB are not cached. This threshold can be tuned
371    /// based on memory constraints and typical file sizes in the codebase.
372    pub fn cache_content(&mut self, path: &Path, content: String) {
373        if let Some(limit) = self.content_cache_max_bytes
374            && content.len() > limit
375        {
376            log::trace!(
377                "Skipping content cache for {} (size: {} bytes > {} limit)",
378                path.display(),
379                content.len(),
380                limit
381            );
382            return;
383        }
384
385        if let Some(file_hash) = self.hashes.get_mut(path) {
386            let size = content.len();
387            file_hash.content = Some(content);
388            log::trace!("Cached content for {} ({size} bytes)", path.display());
389        }
390    }
391
392    /// Save the hash index to disk.
393    ///
394    /// The index is saved to `{cache_dir}/file_hashes.bin` using a versioned
395    /// envelope with postcard serialization (atomic write).
396    ///
397    /// # Errors
398    ///
399    /// Returns an error if the cache directory cannot be created or the file cannot be written.
400    pub fn save(&self, cache_dir: &Path) -> Result<()> {
401        // Ensure cache directory exists
402        fs::create_dir_all(cache_dir)
403            .with_context(|| format!("Failed to create cache directory {}", cache_dir.display()))?;
404
405        let hash_file = cache_dir.join("file_hashes.bin");
406
407        // Serialize payload and envelope
408        let payload =
409            postcard::to_allocvec(self).context("Failed to serialize hash index payload")?;
410
411        let envelope = HashIndexEnvelope {
412            magic: HASH_INDEX_MAGIC,
413            version: HASH_INDEX_ENVELOPE_VERSION,
414            sqry_version: env!("CARGO_PKG_VERSION").to_string(),
415            payload,
416        };
417
418        let bytes =
419            postcard::to_allocvec(&envelope).context("Failed to serialize hash index envelope")?;
420
421        // Atomic write: write to temp and then rename
422        let tmp_hash_index_file_path = hash_file.with_extension("bin.tmp");
423        fs::write(&tmp_hash_index_file_path, bytes).with_context(|| {
424            format!(
425                "Failed to write temp hash index to {}",
426                tmp_hash_index_file_path.display()
427            )
428        })?;
429
430        // Best-effort replace existing target
431        if hash_file.exists() {
432            let _ = fs::remove_file(&hash_file);
433        }
434        fs::rename(&tmp_hash_index_file_path, &hash_file).with_context(|| {
435            format!(
436                "Failed to atomically replace hash index at {} with temp {}",
437                hash_file.display(),
438                tmp_hash_index_file_path.display()
439            )
440        })?;
441
442        log::debug!(
443            "Saved hash index: {} files, {} symbols to {}",
444            self.file_count,
445            self.total_symbols,
446            hash_file.display()
447        );
448
449        Ok(())
450    }
451
452    /// Load the hash index from disk.
453    ///
454    /// If the file doesn't exist or cannot be read, returns an empty index.
455    ///
456    /// # Errors
457    ///
458    /// Returns an error only if the file exists but cannot be deserialized
459    /// (indicating corruption).
460    pub fn load(cache_dir: &Path) -> Result<Self> {
461        let hash_file = cache_dir.join("file_hashes.bin");
462
463        // If file doesn't exist, return empty index
464        if !hash_file.exists() {
465            log::debug!(
466                "No hash index found at {}, starting fresh",
467                hash_file.display()
468            );
469            return Ok(Self::new());
470        }
471
472        // Read file with size cap to prevent memory exhaustion from crafted files
473        const MAX_HASH_INDEX_BYTES: u64 = 256 * 1024 * 1024; // 256 MiB
474        let metadata = fs::metadata(&hash_file)
475            .with_context(|| format!("Failed to stat hash index: {}", hash_file.display()))?;
476        if metadata.len() > MAX_HASH_INDEX_BYTES {
477            anyhow::bail!(
478                "Hash index file is too large ({} bytes, max {}): {}",
479                metadata.len(),
480                MAX_HASH_INDEX_BYTES,
481                hash_file.display()
482            );
483        }
484        let bytes = fs::read(&hash_file)
485            .with_context(|| format!("Failed to read hash index from {}", hash_file.display()))?;
486
487        // Deserialize versioned envelope only (no legacy fallback)
488        let env: HashIndexEnvelope =
489            postcard::from_bytes(&bytes).context("Failed to deserialize hash index envelope")?;
490
491        if env.magic != HASH_INDEX_MAGIC {
492            anyhow::bail!("Invalid hash index magic: expected {HASH_INDEX_MAGIC:?}");
493        }
494        if env.version != HASH_INDEX_ENVELOPE_VERSION {
495            anyhow::bail!(
496                "Unsupported hash index version: {} (expected {})",
497                env.version,
498                HASH_INDEX_ENVELOPE_VERSION
499            );
500        }
501
502        let index: Self = postcard::from_bytes(&env.payload)
503            .context("Failed to deserialize hash index payload")?;
504
505        log::debug!(
506            "Loaded hash index: {} files, {} symbols from {}",
507            index.file_count,
508            index.total_symbols,
509            hash_file.display()
510        );
511        Ok(index)
512    }
513}
514
515impl Default for HashIndex {
516    fn default() -> Self {
517        Self::new()
518    }
519}
520
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::{NamedTempFile, TempDir};

    /// Create a temp file holding `content`, flushed so it is visible through its path.
    fn temp_file_with(content: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(content).unwrap();
        file.flush().unwrap();
        file
    }

    /// Build a record by hand for tests that never touch the filesystem.
    fn synthetic_hash(path: &Path, hash: u64, size: u64, symbols_count: usize) -> FileHash {
        FileHash {
            path: path.to_path_buf(),
            hash,
            size,
            mtime: SystemTime::now(),
            symbols_count,
            content: None,
        }
    }

    /// Index containing a freshly computed record for `file`.
    fn index_tracking(file: &NamedTempFile, index: HashIndex) -> HashIndex {
        let mut index = index;
        let hash = FileHash::compute(file.path()).unwrap();
        index.update(file.path().to_path_buf(), hash);
        index
    }

    #[test]
    fn test_file_hash_compute() {
        let temp_file = temp_file_with(b"test content");

        let hash = FileHash::compute(temp_file.path()).unwrap();

        assert_eq!(hash.size, 12); // "test content" is 12 bytes
        assert!(hash.hash != 0); // Should have computed a hash
        assert_eq!(hash.symbols_count, 0); // Default is 0
    }

    #[test]
    fn test_file_hash_from_bytes() {
        let content = b"test";
        let temp_file = temp_file_with(content);

        let hash = FileHash::from_bytes(temp_file.path(), content).unwrap();

        assert_eq!(hash.size, 4);
        assert_eq!(hash.hash, xxh64(content, 0));
    }

    #[test]
    fn test_file_hash_deterministic() {
        let temp_file = temp_file_with(b"deterministic test content");

        let hash1 = FileHash::compute(temp_file.path()).unwrap();
        let hash2 = FileHash::compute(temp_file.path()).unwrap();

        assert_eq!(hash1.hash, hash2.hash);
        assert_eq!(hash1.size, hash2.size);
    }

    #[test]
    fn test_file_hash_different_content() {
        let temp1 = temp_file_with(b"content A");
        let temp2 = temp_file_with(b"content B");

        let hash1 = FileHash::compute(temp1.path()).unwrap();
        let hash2 = FileHash::compute(temp2.path()).unwrap();

        assert_ne!(hash1.hash, hash2.hash);
    }

    #[test]
    fn test_hash_index_new_file() {
        let index = HashIndex::new();
        let path = Path::new("nonexistent.rs");

        // New file should be marked as changed
        assert!(index.has_changed(path).unwrap());
    }

    #[test]
    fn test_hash_index_unchanged_file() {
        let temp_file = temp_file_with(b"unchanged content");
        let index = index_tracking(&temp_file, HashIndex::new());

        // File should not be marked as changed
        assert!(!index.has_changed(temp_file.path()).unwrap());
    }

    #[test]
    fn test_hash_index_changed_content() {
        let mut temp_file = temp_file_with(b"original content");
        let index = index_tracking(&temp_file, HashIndex::new());

        // Modify file
        temp_file.write_all(b" modified").unwrap();
        temp_file.flush().unwrap();

        // File should be marked as changed
        assert!(index.has_changed(temp_file.path()).unwrap());
    }

    #[test]
    fn test_hash_index_deleted_file() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("gone.rs");
        fs::write(&path, b"soon deleted").unwrap();

        let mut index = HashIndex::new();
        index.update(path.clone(), FileHash::compute(&path).unwrap());

        fs::remove_file(&path).unwrap();

        // A tracked file that no longer exists must be reported as changed
        assert!(index.has_changed(&path).unwrap());
    }

    #[test]
    fn test_hash_index_update_and_remove() {
        let mut index = HashIndex::new();
        let path = PathBuf::from("test.rs");

        let mut hash = synthetic_hash(&path, 12345, 100, 5);

        // Update with new file
        index.update(path.clone(), hash.clone());
        assert_eq!(index.len(), 1);
        assert_eq!(index.total_symbols, 5);

        // Update existing file with more symbols
        hash.symbols_count = 10;
        index.update(path.clone(), hash);
        assert_eq!(index.len(), 1); // Still 1 file
        assert_eq!(index.total_symbols, 10); // Updated symbols

        // Remove file
        let removed = index.remove(&path);
        assert!(removed.is_some());
        assert_eq!(index.len(), 0);
        assert_eq!(index.total_symbols, 0);
    }

    #[test]
    fn test_hash_index_save_and_load() {
        let tmp_index_dir = TempDir::new().unwrap();
        let cache_dir = tmp_index_dir.path();

        // Create index with some data
        let mut index = HashIndex::new();
        let path = PathBuf::from("test.rs");
        let hash = synthetic_hash(&path, 67890, 200, 15);
        index.update(path, hash);

        // Save
        index.save(cache_dir).unwrap();

        // Load
        let loaded = HashIndex::load(cache_dir).unwrap();

        assert_eq!(loaded.len(), 1);
        assert_eq!(loaded.total_symbols, 15);
        assert_eq!(loaded.get(Path::new("test.rs")).unwrap().hash, 67890);
    }

    #[test]
    fn test_hash_index_save_overwrites_existing() {
        let tmp_index_dir = TempDir::new().unwrap();
        let cache_dir = tmp_index_dir.path();

        let mut index = HashIndex::new();
        let first = PathBuf::from("first.rs");
        index.update(first.clone(), synthetic_hash(&first, 1, 10, 2));
        index.save(cache_dir).unwrap();

        let second = PathBuf::from("second.rs");
        index.update(second.clone(), synthetic_hash(&second, 2, 20, 3));
        index.save(cache_dir).unwrap();

        // The second save must replace the first and leave no temp file behind
        let loaded = HashIndex::load(cache_dir).unwrap();
        assert_eq!(loaded.len(), 2);
        assert_eq!(loaded.total_symbols, 5);
        assert!(!cache_dir.join("file_hashes.bin.tmp").exists());
    }

    #[test]
    fn test_hash_index_load_rejects_corrupt_file() {
        let tmp_index_dir = TempDir::new().unwrap();
        let cache_dir = tmp_index_dir.path();
        fs::write(cache_dir.join("file_hashes.bin"), b"not a valid index").unwrap();

        // A present-but-garbled index is an error, not an empty index
        assert!(HashIndex::load(cache_dir).is_err());
    }

    #[test]
    fn test_hash_index_mtime_change_no_content_change() {
        use filetime::{FileTime, set_file_mtime};
        use std::time::Duration;

        let temp_file = temp_file_with(b"same content");
        let index = index_tracking(&temp_file, HashIndex::new());

        // Change only the mtime (simulate touch) without modifying content
        let meta = fs::metadata(temp_file.path()).unwrap();
        let orig_mtime = meta.modified().unwrap();
        let new_mtime = FileTime::from_system_time(orig_mtime + Duration::from_secs(60));
        set_file_mtime(temp_file.path(), new_mtime).unwrap();

        // Should detect metadata change, compute hash, and conclude unchanged
        assert!(!index.has_changed(temp_file.path()).unwrap());
    }

    #[test]
    fn test_hash_index_load_nonexistent() {
        let tmp_index_dir = TempDir::new().unwrap();
        let cache_dir = tmp_index_dir.path().join("nonexistent");

        // Loading from nonexistent directory should return empty index
        let index = HashIndex::load(&cache_dir).unwrap();

        assert_eq!(index.len(), 0);
        assert!(index.is_empty());
    }

    #[test]
    fn test_hash_index_clear() {
        let mut index = HashIndex::new();

        // Add some entries
        for i in 0_u64..5 {
            let path = PathBuf::from(format!("file{i}.rs"));
            let hash = synthetic_hash(&path, i, 100, 3);
            index.update(path, hash);
        }

        assert_eq!(index.len(), 5);
        assert_eq!(index.total_symbols, 15);

        // Clear
        index.clear();

        assert_eq!(index.len(), 0);
        assert_eq!(index.total_symbols, 0);
        assert!(index.is_empty());
    }

    #[test]
    fn test_xxhash64_performance_characteristic() {
        // Test that XXHash64 is indeed very fast
        // Generate 1MB of test data
        let data = vec![0u8; 1_000_000];

        let start = std::time::Instant::now();
        // `black_box` keeps the optimizer from discarding the hash (and with it
        // the work being timed), since the result is otherwise unused.
        let _hash = std::hint::black_box(xxh64(std::hint::black_box(&data), 0));
        let elapsed = start.elapsed();

        // XXHash64 should hash 1MB in well under 100ms on any modern CPU
        // (typically <1ms locally, but CI shared runners can be much slower)
        assert!(
            elapsed.as_millis() < 100,
            "XXHash64 took {elapsed:?} to hash 1MB (expected <100ms)"
        );
    }

    #[test]
    fn test_cache_small_file() {
        // Ensure small files are cached when under the configured limit (default: unlimited)
        let content = "Small file content for caching test";
        let temp_file = temp_file_with(content.as_bytes());
        let mut index = index_tracking(&temp_file, HashIndex::new());

        // Cache the content
        index.cache_content(temp_file.path(), content.to_string());

        // Verify it was cached
        let cached = index.get_cached_content(temp_file.path()).unwrap();
        assert_eq!(cached, content);

        // Verify it's stored in the FileHash struct
        let file_hash = index.get(temp_file.path()).unwrap();
        assert_eq!(file_hash.content.as_deref(), Some(content));
    }

    #[test]
    fn test_skip_large_file_when_limit_configured() {
        // Verify that the optional limit is honoured when configured
        // Create content larger than the 100_000-byte limit configured below
        let large_content = "x".repeat(101_000);
        let temp_file = temp_file_with(large_content.as_bytes());
        let mut index =
            index_tracking(&temp_file, HashIndex::with_content_cache_limit(Some(100_000)));

        // Attempt to cache large content
        index.cache_content(temp_file.path(), large_content);

        // Verify it was NOT cached
        let file_hash = index.get(temp_file.path()).unwrap();
        assert!(file_hash.content.is_none());

        // get_cached_content should return error (content not cached)
        assert!(index.get_cached_content(temp_file.path()).is_err());
    }

    #[test]
    fn test_large_file_cached_without_limit() {
        // By default the cache is unbounded; large files should therefore be cached
        let large_content = "x".repeat(101_000);
        let expected_len = large_content.len();
        let temp_file = temp_file_with(large_content.as_bytes());
        let mut index = index_tracking(&temp_file, HashIndex::new());

        index.cache_content(temp_file.path(), large_content);

        let cached = index.get_cached_content(temp_file.path()).unwrap();
        assert_eq!(cached.len(), expected_len);
    }

    #[test]
    fn test_get_cached_content_error_when_not_cached() {
        // Phase 3: Test that get_cached_content returns error if not cached
        let temp_file = temp_file_with(b"Test content");
        let index = index_tracking(&temp_file, HashIndex::new());

        // Don't cache the content

        // get_cached_content should return error (content not cached)
        assert!(index.get_cached_content(temp_file.path()).is_err());
    }
}