Skip to main content

codemem_engine/index/
incremental.rs

1//! SHA-256 based change detection for incremental indexing.
2//!
3//! Tracks content hashes of previously indexed files so unchanged files
4//! can be skipped on subsequent indexing runs.
5
6use sha2::{Digest, Sha256};
7use std::collections::HashMap;
8
/// Tracks file content hashes for incremental change detection.
///
/// Each tracked file path maps to the hex-encoded SHA-256 digest of the
/// content it had when it was last recorded. A file is considered changed
/// when its path is unknown or its current digest differs from the stored one.
#[derive(Debug, Clone)]
pub struct ChangeDetector {
    /// Map of file_path -> SHA-256 hex hash from last index run.
    known_hashes: HashMap<String, String>,
}
14
15impl ChangeDetector {
16    /// Create a new empty ChangeDetector.
17    pub fn new() -> Self {
18        Self {
19            known_hashes: HashMap::new(),
20        }
21    }
22
23    /// Load previously known hashes from storage.
24    ///
25    /// This reads from the `file_hashes` table if it exists. If the table
26    /// doesn't exist or the query fails, starts fresh with no known hashes.
27    pub fn load_from_storage(&mut self, storage: &dyn codemem_core::StorageBackend) {
28        match storage.load_file_hashes() {
29            Ok(hashes) => {
30                tracing::debug!("Loaded {} known file hashes", hashes.len());
31                self.known_hashes = hashes;
32            }
33            Err(e) => {
34                tracing::warn!("Failed to load file hashes, starting fresh: {e}");
35            }
36        }
37    }
38
39    /// Save current hashes to storage.
40    pub fn save_to_storage(
41        &self,
42        storage: &dyn codemem_core::StorageBackend,
43    ) -> Result<(), codemem_core::CodememError> {
44        storage.save_file_hashes(&self.known_hashes)
45    }
46
47    /// Check if a file has changed since the last index.
48    /// Returns `true` if the file is new or its content hash differs.
49    pub fn is_changed(&self, path: &str, content: &[u8]) -> bool {
50        let hash = Self::hash_content(content);
51        self.known_hashes.get(path) != Some(&hash)
52    }
53
54    /// Check if a file has changed and return (changed, hash) to avoid double-hashing.
55    /// Callers can pass the returned hash to `record_hash` to skip recomputation.
56    pub fn check_changed(&self, path: &str, content: &[u8]) -> (bool, String) {
57        let hash = Self::hash_content(content);
58        let changed = self.known_hashes.get(path) != Some(&hash);
59        (changed, hash)
60    }
61
62    /// Update the stored hash for a file after successful indexing.
63    pub fn update_hash(&mut self, path: &str, content: &[u8]) {
64        let hash = Self::hash_content(content);
65        self.known_hashes.insert(path.to_string(), hash);
66    }
67
68    /// Record a pre-computed hash for a file (avoids re-hashing when used with `check_changed`).
69    pub fn record_hash(&mut self, path: &str, hash: String) {
70        self.known_hashes.insert(path.to_string(), hash);
71    }
72
73    /// Remove the hash for a file (e.g., when it's deleted).
74    pub fn remove_hash(&mut self, path: &str) {
75        self.known_hashes.remove(path);
76    }
77
78    /// Get the number of tracked files.
79    pub fn tracked_count(&self) -> usize {
80        self.known_hashes.len()
81    }
82
83    /// Compute SHA-256 hash of content bytes.
84    fn hash_content(content: &[u8]) -> String {
85        let mut hasher = Sha256::new();
86        hasher.update(content);
87        format!("{:x}", hasher.finalize())
88    }
89}
90
91impl Default for ChangeDetector {
92    fn default() -> Self {
93        Self::new()
94    }
95}
96
97#[cfg(test)]
98#[path = "tests/incremental_tests.rs"]
99mod tests;