Skip to main content

sqry_core/indexing/
incremental.rs

1//! Incremental indexing with hash-based change detection.
2//!
3//! This module provides fast file change detection using `XXHash64` (~10GB/s hashing speed)
4//! to enable incremental re-indexing. Only files that have changed since the last index
5//! are re-parsed, achieving 10-100x speedup for re-indexing operations.
6//!
7//! # Architecture
8//!
9//! The incremental indexing system uses a 3-level change detection strategy:
10//!
11//! 1. **Existence check**: Is the file still there?
12//! 2. **Metadata check**: Has size or mtime changed?
13//! 3. **Hash check**: Has content actually changed? (guards against mtime-only changes)
14//!
15//! # Usage
16//!
17//! ```rust,ignore
18//! use sqry_core::indexing::{HashIndex, FileHash};
19//! use std::path::Path;
20//!
21//! // Load existing hash index
22//! let cache_dir = Path::new(".sqry-cache");
23//! let mut hash_index = HashIndex::load(cache_dir)?;
24//!
25//! // Check if a file has changed
26//! let file_path = Path::new("src/main.rs");
27//! if hash_index.has_changed(file_path)? {
28//!     println!("File changed, re-indexing...");
29//!     // Re-index the file
30//!     let new_hash = FileHash::compute(file_path)?;
31//!     hash_index.update(file_path.to_path_buf(), new_hash);
32//! }
33//!
34//! // Save updated hash index
35//! hash_index.save(cache_dir)?;
36//! ```
37//!
38//! # Performance
39//!
40//! - `XXHash64` hashing: ~10 GB/s (vs BLAKE3 ~1 GB/s)
41//! - Metadata checks: ~1M files/sec
42//! - Target: 10-100x faster re-indexing for typical codebases
43
44use anyhow::{Context, Result};
45use serde::{Deserialize, Serialize};
46use std::collections::HashMap;
47use std::fs;
48use std::path::{Path, PathBuf};
49use std::time::SystemTime;
50use xxhash_rust::xxh64::xxh64;
51
52use crate::config::buffers::parse_buffer_size;
53
/// Magic bytes stored in the envelope by `HashIndex::save` and compared by
/// `HashIndex::load` to recognise a hash index file.
const HASH_INDEX_MAGIC: [u8; 7] = *b"SQRYHSH";
/// Envelope format version written by `HashIndex::save`; `HashIndex::load`
/// rejects any other value.
const HASH_INDEX_ENVELOPE_VERSION: u16 = 1;
56
/// Versioned on-disk wrapper around the serialized [`HashIndex`].
///
/// `HashIndex::save` fills every field; `HashIndex::load` validates `magic`
/// and `version` before decoding `payload`.
#[derive(Serialize, Deserialize)]
struct HashIndexEnvelope {
    /// File-format marker; expected to equal `HASH_INDEX_MAGIC`.
    magic: [u8; 7],
    /// Envelope format version; expected to equal `HASH_INDEX_ENVELOPE_VERSION`.
    version: u16,
    /// Crate version (`CARGO_PKG_VERSION`) that wrote the file. Recorded on
    /// save; `load` does not inspect it.
    sqry_version: String,
    /// Postcard-encoded `HashIndex`.
    payload: Vec<u8>,
}
64
/// Hash information for a single file.
///
/// This structure stores both the content hash (for change detection)
/// and metadata (for quick pre-checks before expensive hashing).
///
/// # Phase 3: Content Caching
///
/// The optional `content` field caches file content in memory to enable
/// content-based diffing without disk I/O. This is controlled by the
/// `HashIndex` content-cache limit (configurable per builder) and is not
/// persisted to disk (`#[serde(skip)]`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileHash {
    /// Path to the file, stored exactly as supplied by the caller (it is not
    /// canonicalized here). `HashIndex::update` overwrites it with the map key.
    pub path: PathBuf,
    /// `XXHash64` hash (seed 0) of file contents
    pub hash: u64,
    /// File size in bytes
    pub size: u64,
    /// Last modification time
    pub mtime: SystemTime,
    /// Number of symbols extracted from this file (for stats).
    ///
    /// The constructors in this file set it to 0; the caller fills it in.
    pub symbols_count: usize,
    /// Cached file content for content-based diffing (Phase 3/4).
    ///
    /// Populated according to the content-cache size configuration managed by
    /// `HashIndex`. Not persisted to disk, so it is `None` after
    /// deserialization.
    #[serde(skip)]
    pub content: Option<String>,
}
95
96impl FileHash {
97    /// Compute hash for a file.
98    ///
99    /// This reads the file in chunks and computes an `XXHash64` hash.
100    ///
101    /// # Errors
102    ///
103    /// Returns an error if the file cannot be read or metadata cannot be accessed.
104    pub fn compute(path: &Path) -> Result<Self> {
105        use std::io::Read;
106
107        // Get metadata first
108        let metadata = fs::metadata(path)
109            .with_context(|| format!("Failed to read metadata for {}", path.display()))?;
110
111        let size = metadata.len();
112        let mtime = metadata
113            .modified()
114            .with_context(|| format!("Failed to get modification time for {}", path.display()))?;
115
116        // Read and hash file contents
117        let mut file = fs::File::open(path)
118            .with_context(|| format!("Failed to open file {}", path.display()))?;
119
120        // Read in chunks for efficiency (respects SQRY_PARSE_BUFFER env var)
121        let mut buffer = vec![0u8; parse_buffer_size()];
122        let mut hasher = xxhash_rust::xxh64::Xxh64::new(0); // Use seed 0 for consistency
123
124        loop {
125            let bytes_read = file
126                .read(&mut buffer)
127                .with_context(|| format!("Failed to read file {}", path.display()))?;
128
129            if bytes_read == 0 {
130                break;
131            }
132
133            hasher.update(&buffer[..bytes_read]);
134        }
135
136        let hash = hasher.digest();
137
138        Ok(Self {
139            path: path.to_path_buf(),
140            hash,
141            size,
142            mtime,
143            symbols_count: 0, // Will be updated after indexing
144            content: None,    // Phase 3: Not cached during compute (populated later)
145        })
146    }
147
148    /// Quick compute using just the file bytes (for testing or small files).
149    ///
150    /// This is faster for small files but requires loading the entire file into memory.
151    ///
152    /// # Errors
153    ///
154    /// Returns [`anyhow::Error`] when the filesystem metadata for `path` cannot be read.
155    pub fn from_bytes(path: &Path, content: &[u8]) -> Result<Self> {
156        let metadata = fs::metadata(path)
157            .with_context(|| format!("Failed to read metadata for {}", path.display()))?;
158
159        let hash = xxh64(content, 0); // Use seed 0 for consistency
160
161        Ok(Self {
162            path: path.to_path_buf(),
163            hash,
164            size: content.len() as u64,
165            mtime: metadata.modified().with_context(|| {
166                format!("Failed to get modification time for {}", path.display())
167            })?,
168            symbols_count: 0,
169            content: None, // Phase 3: Not cached by default
170        })
171    }
172
173    /// Check if file metadata (size or mtime) has changed.
174    ///
175    /// This is a fast pre-check before expensive hashing.
176    ///
177    /// # Errors
178    ///
179    /// Returns [`anyhow::Error`] when the file's metadata cannot be read.
180    pub fn metadata_changed(&self, path: &Path) -> Result<bool> {
181        let metadata = fs::metadata(path)
182            .with_context(|| format!("Failed to read metadata for {}", path.display()))?;
183
184        let current_size = metadata.len();
185        let current_mtime = metadata
186            .modified()
187            .with_context(|| format!("Failed to get modification time for {}", path.display()))?;
188
189        Ok(current_size != self.size || current_mtime != self.mtime)
190    }
191}
192
/// Index of file hashes for incremental indexing.
///
/// This structure maintains a mapping from file paths to their hash information,
/// enabling fast change detection during re-indexing operations.
///
/// `file_count` and `total_symbols` are running totals kept in step with the
/// map by `update`, `remove` and `clear`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HashIndex {
    /// Map from file path to hash information
    hashes: HashMap<PathBuf, FileHash>,
    /// Total number of files tracked (this is what `len` and `is_empty` report)
    pub file_count: usize,
    /// Total number of symbols across all files (sum of `symbols_count`)
    pub total_symbols: usize,
    /// Maximum number of bytes to store per cached file (None = unlimited)
    #[serde(default)]
    content_cache_max_bytes: Option<usize>,
}
209
210impl HashIndex {
211    /// Create a new empty hash index.
212    #[must_use]
213    pub fn new() -> Self {
214        Self::with_content_cache_limit(None)
215    }
216
217    /// Create a new hash index with an explicit content cache limit.
218    #[must_use]
219    pub fn with_content_cache_limit(limit: Option<usize>) -> Self {
220        Self {
221            hashes: HashMap::new(),
222            file_count: 0,
223            total_symbols: 0,
224            content_cache_max_bytes: limit,
225        }
226    }
227
228    /// Override the content cache limit at runtime.
229    pub fn set_content_cache_limit(&mut self, limit: Option<usize>) {
230        self.content_cache_max_bytes = limit;
231    }
232
233    /// Check if a file has changed using 3-level detection.
234    ///
235    /// Returns `true` if the file should be re-indexed, `false` if it can be skipped.
236    ///
237    /// # Detection Levels
238    ///
239    /// 1. **Existence**: If file not in index or doesn't exist on disk → changed
240    /// 2. **Metadata**: If size or mtime differs → check hash
241    /// 3. **Hash**: If hash differs → changed
242    ///
243    /// # Errors
244    ///
245    /// Returns an error if file metadata cannot be read or file cannot be hashed.
246    pub fn has_changed(&self, path: &Path) -> Result<bool> {
247        // Level 1: Existence check
248        let Some(stored_hash) = self.hashes.get(path) else {
249            // File not in index → definitely changed (new file)
250            return Ok(true);
251        };
252
253        // Check if file still exists
254        if !path.exists() {
255            // File was deleted → mark as changed so it can be removed from index
256            return Ok(true);
257        }
258
259        // Level 2: Metadata check (fast pre-screen)
260        if !stored_hash.metadata_changed(path)? {
261            // Metadata unchanged → file definitely hasn't changed
262            return Ok(false);
263        }
264
265        // Level 3: Hash check (metadata changed, need to verify content actually changed)
266        // This guards against cases like:
267        // - Touch command (mtime changed, content same)
268        // - Reverted edits (size/mtime changed, content back to original)
269        let current_hash = FileHash::compute(path)?;
270
271        Ok(current_hash.hash != stored_hash.hash)
272    }
273
274    /// Update the hash index with new file information.
275    ///
276    /// This should be called after successfully indexing a file.
277    pub fn update(&mut self, path: PathBuf, mut file_hash: FileHash) {
278        // Remove old entry if it exists to update total_symbols
279        if let Some(old_hash) = self.hashes.remove(&path) {
280            self.total_symbols = self.total_symbols.saturating_sub(old_hash.symbols_count);
281            self.file_count = self.file_count.saturating_sub(1);
282        }
283
284        // Add new entry
285        self.total_symbols += file_hash.symbols_count;
286        self.file_count += 1;
287
288        // Ensure path is consistent
289        file_hash.path.clone_from(&path);
290
291        self.hashes.insert(path, file_hash);
292    }
293
294    /// Remove a file from the index.
295    ///
296    /// This should be called when a file is deleted from the codebase.
297    pub fn remove(&mut self, path: &Path) -> Option<FileHash> {
298        if let Some(removed) = self.hashes.remove(path) {
299            self.total_symbols = self.total_symbols.saturating_sub(removed.symbols_count);
300            self.file_count = self.file_count.saturating_sub(1);
301            Some(removed)
302        } else {
303            None
304        }
305    }
306
307    /// Get hash information for a file.
308    #[must_use]
309    pub fn get(&self, path: &Path) -> Option<&FileHash> {
310        self.hashes.get(path)
311    }
312
313    /// Iterate over all tracked files.
314    pub fn iter(&self) -> impl Iterator<Item = (&PathBuf, &FileHash)> {
315        self.hashes.iter()
316    }
317
318    /// Get the number of tracked files.
319    #[must_use]
320    pub fn len(&self) -> usize {
321        self.file_count
322    }
323
324    /// Check if the index is empty.
325    #[must_use]
326    pub fn is_empty(&self) -> bool {
327        self.file_count == 0
328    }
329
330    /// Clear all entries from the index.
331    pub fn clear(&mut self) {
332        self.hashes.clear();
333        self.file_count = 0;
334        self.total_symbols = 0;
335    }
336
337    /// Get cached content for a file (Phase 3).
338    ///
339    /// Returns the cached content if available in memory.
340    /// This method is used by content-based diffing to obtain old file content
341    /// for comparison with the current file state.
342    ///
343    /// # Errors
344    ///
345    /// Returns an error if the content is not cached (was never cached or file too large).
346    /// This is intentional - we ONLY want cached content, not current disk content.
347    pub fn get_cached_content(&self, path: &Path) -> Result<String> {
348        // Try to get from cache
349        if let Some(file_hash) = self.hashes.get(path)
350            && let Some(ref content) = file_hash.content
351        {
352            return Ok(content.clone());
353        }
354
355        // Content not cached - this is an error, not a fallback case
356        anyhow::bail!("Content not cached for {}", path.display())
357    }
358
359    /// Cache file content for a file (Phase 3).
360    ///
361    /// Stores file content in memory for fast content-based diffing. The
362    /// maximum cached size is controlled by `content_cache_max_bytes`; when set
363    /// to `None` the cache is unbounded.
364    ///
365    /// This is called after successfully parsing a file to enable fast
366    /// incremental updates on the next change.
367    ///
368    /// # Size Limit
369    ///
370    /// Files larger than 100KB are not cached. This threshold can be tuned
371    /// based on memory constraints and typical file sizes in the codebase.
372    pub fn cache_content(&mut self, path: &Path, content: String) {
373        if let Some(limit) = self.content_cache_max_bytes
374            && content.len() > limit
375        {
376            log::trace!(
377                "Skipping content cache for {} (size: {} bytes > {} limit)",
378                path.display(),
379                content.len(),
380                limit
381            );
382            return;
383        }
384
385        if let Some(file_hash) = self.hashes.get_mut(path) {
386            let size = content.len();
387            file_hash.content = Some(content);
388            log::trace!("Cached content for {} ({size} bytes)", path.display());
389        }
390    }
391
392    /// Save the hash index to disk.
393    ///
394    /// The index is saved to `{cache_dir}/file_hashes.bin` using a versioned
395    /// envelope with postcard serialization (atomic write).
396    ///
397    /// # Errors
398    ///
399    /// Returns an error if the cache directory cannot be created or the file cannot be written.
400    pub fn save(&self, cache_dir: &Path) -> Result<()> {
401        // Ensure cache directory exists
402        fs::create_dir_all(cache_dir)
403            .with_context(|| format!("Failed to create cache directory {}", cache_dir.display()))?;
404
405        let hash_file = cache_dir.join("file_hashes.bin");
406
407        // Serialize payload and envelope
408        let payload =
409            postcard::to_allocvec(self).context("Failed to serialize hash index payload")?;
410
411        let envelope = HashIndexEnvelope {
412            magic: HASH_INDEX_MAGIC,
413            version: HASH_INDEX_ENVELOPE_VERSION,
414            sqry_version: env!("CARGO_PKG_VERSION").to_string(),
415            payload,
416        };
417
418        let bytes =
419            postcard::to_allocvec(&envelope).context("Failed to serialize hash index envelope")?;
420
421        // Atomic write: write to temp and then rename
422        let tmp_hash_index_file_path = hash_file.with_extension("bin.tmp");
423        fs::write(&tmp_hash_index_file_path, bytes).with_context(|| {
424            format!(
425                "Failed to write temp hash index to {}",
426                tmp_hash_index_file_path.display()
427            )
428        })?;
429
430        // Best-effort replace existing target
431        if hash_file.exists() {
432            let _ = fs::remove_file(&hash_file);
433        }
434        fs::rename(&tmp_hash_index_file_path, &hash_file).with_context(|| {
435            format!(
436                "Failed to atomically replace hash index at {} with temp {}",
437                hash_file.display(),
438                tmp_hash_index_file_path.display()
439            )
440        })?;
441
442        log::debug!(
443            "Saved hash index: {} files, {} symbols to {}",
444            self.file_count,
445            self.total_symbols,
446            hash_file.display()
447        );
448
449        Ok(())
450    }
451
452    /// Load the hash index from disk.
453    ///
454    /// If the file doesn't exist or cannot be read, returns an empty index.
455    ///
456    /// # Errors
457    ///
458    /// Returns an error only if the file exists but cannot be deserialized
459    /// (indicating corruption).
460    pub fn load(cache_dir: &Path) -> Result<Self> {
461        let hash_file = cache_dir.join("file_hashes.bin");
462
463        // If file doesn't exist, return empty index
464        if !hash_file.exists() {
465            log::debug!(
466                "No hash index found at {}, starting fresh",
467                hash_file.display()
468            );
469            return Ok(Self::new());
470        }
471
472        // Read file
473        let bytes = fs::read(&hash_file)
474            .with_context(|| format!("Failed to read hash index from {}", hash_file.display()))?;
475
476        // Deserialize versioned envelope only (no legacy fallback)
477        let env: HashIndexEnvelope =
478            postcard::from_bytes(&bytes).context("Failed to deserialize hash index envelope")?;
479
480        if env.magic != HASH_INDEX_MAGIC {
481            anyhow::bail!("Invalid hash index magic: expected {HASH_INDEX_MAGIC:?}");
482        }
483        if env.version != HASH_INDEX_ENVELOPE_VERSION {
484            anyhow::bail!(
485                "Unsupported hash index version: {} (expected {})",
486                env.version,
487                HASH_INDEX_ENVELOPE_VERSION
488            );
489        }
490
491        let index: Self = postcard::from_bytes(&env.payload)
492            .context("Failed to deserialize hash index payload")?;
493
494        log::debug!(
495            "Loaded hash index: {} files, {} symbols from {}",
496            index.file_count,
497            index.total_symbols,
498            hash_file.display()
499        );
500        Ok(index)
501    }
502}
503
504impl Default for HashIndex {
505    fn default() -> Self {
506        Self::new()
507    }
508}
509
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::{NamedTempFile, TempDir};

    /// Creates a temp file containing `bytes`, flushed so it is visible on disk.
    fn temp_with(bytes: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(bytes).unwrap();
        file.flush().unwrap();
        file
    }

    /// Builds an index with the given cache limit that tracks the freshly
    /// hashed file at `path`.
    fn index_tracking(path: &Path, limit: Option<usize>) -> HashIndex {
        let mut index = HashIndex::with_content_cache_limit(limit);
        index.update(path.to_path_buf(), FileHash::compute(path).unwrap());
        index
    }

    /// Hand-built entry for tests that never touch the filesystem.
    fn fake_hash(path: &Path, hash: u64, size: u64, symbols_count: usize) -> FileHash {
        FileHash {
            path: path.to_path_buf(),
            hash,
            size,
            mtime: SystemTime::now(),
            symbols_count,
            content: None,
        }
    }

    #[test]
    fn test_file_hash_compute() {
        let file = temp_with(b"test content");

        let computed = FileHash::compute(file.path()).unwrap();

        // "test content" is 12 bytes long.
        assert_eq!(computed.size, 12);
        // A hash was actually produced.
        assert_ne!(computed.hash, 0);
        // Symbol count starts at zero.
        assert_eq!(computed.symbols_count, 0);
    }

    #[test]
    fn test_file_hash_from_bytes() {
        let payload = b"test";
        let file = temp_with(payload);

        let hashed = FileHash::from_bytes(file.path(), payload).unwrap();

        assert_eq!(hashed.size, 4);
        assert_eq!(hashed.hash, xxh64(payload, 0));
    }

    #[test]
    fn test_file_hash_deterministic() {
        let file = temp_with(b"deterministic test content");

        let first = FileHash::compute(file.path()).unwrap();
        let second = FileHash::compute(file.path()).unwrap();

        assert_eq!(first.hash, second.hash);
        assert_eq!(first.size, second.size);
    }

    #[test]
    fn test_file_hash_different_content() {
        let file_a = temp_with(b"content A");
        let file_b = temp_with(b"content B");

        let hash_a = FileHash::compute(file_a.path()).unwrap();
        let hash_b = FileHash::compute(file_b.path()).unwrap();

        assert_ne!(hash_a.hash, hash_b.hash);
    }

    #[test]
    fn test_hash_index_new_file() {
        let index = HashIndex::new();

        // An untracked path is always reported as changed.
        assert!(index.has_changed(Path::new("nonexistent.rs")).unwrap());
    }

    #[test]
    fn test_hash_index_unchanged_file() {
        let file = temp_with(b"unchanged content");
        let index = index_tracking(file.path(), None);

        // Nothing was touched, so no re-index is needed.
        assert!(!index.has_changed(file.path()).unwrap());
    }

    #[test]
    fn test_hash_index_changed_content() {
        let mut file = temp_with(b"original content");
        let index = index_tracking(file.path(), None);

        // Append to the file after it has been recorded.
        file.write_all(b" modified").unwrap();
        file.flush().unwrap();

        assert!(index.has_changed(file.path()).unwrap());
    }

    #[test]
    fn test_hash_index_update_and_remove() {
        let path = PathBuf::from("test.rs");
        let mut index = HashIndex::new();
        let mut entry = fake_hash(&path, 12345, 100, 5);

        // First insert.
        index.update(path.clone(), entry.clone());
        assert_eq!(index.len(), 1);
        assert_eq!(index.total_symbols, 5);

        // Replacing the entry must not double count the file or its symbols.
        entry.symbols_count = 10;
        index.update(path.clone(), entry);
        assert_eq!(index.len(), 1);
        assert_eq!(index.total_symbols, 10);

        // Removal brings both totals back to zero.
        assert!(index.remove(&path).is_some());
        assert_eq!(index.len(), 0);
        assert_eq!(index.total_symbols, 0);
    }

    #[test]
    fn test_hash_index_save_and_load() {
        let scratch = TempDir::new().unwrap();
        let cache_dir = scratch.path();

        let path = PathBuf::from("test.rs");
        let mut index = HashIndex::new();
        index.update(path.clone(), fake_hash(&path, 67890, 200, 15));

        // Round-trip through the on-disk format.
        index.save(cache_dir).unwrap();
        let loaded = HashIndex::load(cache_dir).unwrap();

        assert_eq!(loaded.len(), 1);
        assert_eq!(loaded.total_symbols, 15);
        assert_eq!(loaded.get(Path::new("test.rs")).unwrap().hash, 67890);
    }

    #[test]
    fn test_hash_index_mtime_change_no_content_change() {
        use filetime::{FileTime, set_file_mtime};
        use std::time::Duration;

        let file = temp_with(b"same content");
        let index = index_tracking(file.path(), None);

        // Simulate `touch`: push the mtime forward, leave the bytes alone.
        let recorded_mtime = fs::metadata(file.path()).unwrap().modified().unwrap();
        let bumped = FileTime::from_system_time(recorded_mtime + Duration::from_secs(60));
        set_file_mtime(file.path(), bumped).unwrap();

        // The metadata differs, but the hash check concludes "unchanged".
        assert!(!index.has_changed(file.path()).unwrap());
    }

    #[test]
    fn test_hash_index_load_nonexistent() {
        let scratch = TempDir::new().unwrap();
        let missing_dir = scratch.path().join("nonexistent");

        // A missing cache directory yields an empty index, not an error.
        let index = HashIndex::load(&missing_dir).unwrap();

        assert_eq!(index.len(), 0);
        assert!(index.is_empty());
    }

    #[test]
    fn test_hash_index_clear() {
        let mut index = HashIndex::new();

        // Five files with three symbols each.
        for i in 0_u64..5 {
            let path = PathBuf::from(format!("file{i}.rs"));
            index.update(path.clone(), fake_hash(&path, i, 100, 3));
        }

        assert_eq!(index.len(), 5);
        assert_eq!(index.total_symbols, 15);

        index.clear();

        assert_eq!(index.len(), 0);
        assert_eq!(index.total_symbols, 0);
        assert!(index.is_empty());
    }

    #[test]
    fn test_xxhash64_performance_characteristic() {
        // Sanity check on hashing speed: 1MB of zeroes.
        let data = vec![0u8; 1_000_000];

        let started = std::time::Instant::now();
        let _hash = xxh64(&data, 0);
        let elapsed = started.elapsed();

        // Typically well under 1ms; the 20ms ceiling leaves a generous
        // margin for slow CI machines.
        assert!(
            elapsed.as_millis() < 20,
            "XXHash64 took {elapsed:?} to hash 1MB (expected <20ms)"
        );
    }

    #[test]
    fn test_cache_small_file() {
        // With the default (unlimited) cache, small content is stored.
        let content = "Small file content for caching test";
        let file = temp_with(content.as_bytes());
        let mut index = index_tracking(file.path(), None);

        index.cache_content(file.path(), content.to_string());

        // Readable through the accessor...
        let cached = index.get_cached_content(file.path()).unwrap();
        assert_eq!(cached, content);

        // ...and stored on the entry itself.
        let entry = index.get(file.path()).unwrap();
        assert!(entry.content.is_some());
        assert_eq!(entry.content.as_ref().unwrap(), content);
    }

    #[test]
    fn test_skip_large_file_when_limit_configured() {
        // 101KB of content against a 100KB limit.
        let large_content = "x".repeat(101_000);
        let file = temp_with(large_content.as_bytes());
        let mut index = index_tracking(file.path(), Some(100_000));

        index.cache_content(file.path(), large_content);

        // Over the limit, so nothing was stored...
        assert!(index.get(file.path()).unwrap().content.is_none());

        // ...and the accessor reports the miss as an error.
        assert!(index.get_cached_content(file.path()).is_err());
    }

    #[test]
    fn test_large_file_cached_without_limit() {
        // No limit configured, so even 101KB is cached.
        let large_content = "x".repeat(101_000);
        let file = temp_with(large_content.as_bytes());
        let mut index = index_tracking(file.path(), None);

        index.cache_content(file.path(), large_content.clone());

        let cached = index.get_cached_content(file.path()).unwrap();
        assert_eq!(cached.len(), large_content.len());
    }

    #[test]
    fn test_get_cached_content_error_when_not_cached() {
        // Phase 3: a tracked file whose content was never cached is an error.
        let file = temp_with("Test content".as_bytes());
        let index = index_tracking(file.path(), None);

        assert!(index.get_cached_content(file.path()).is_err());
    }
}