Skip to main content

codemem_engine/index/
incremental.rs

//! SHA-256-based change detection for incremental indexing.
//!
//! Tracks content hashes of previously indexed files so unchanged files
//! can be skipped on subsequent indexing runs.
5
6use sha2::{Digest, Sha256};
7use std::collections::HashMap;
8
/// Tracks file content hashes for incremental change detection.
///
/// Each tracked file path maps to the hex-encoded SHA-256 digest of the
/// content that was last recorded for it. Comparing a fresh digest against
/// the recorded one tells the indexer whether a file needs re-indexing.
#[derive(Debug, Clone)]
pub struct ChangeDetector {
    /// Map of file_path -> SHA-256 hex hash from last index run.
    known_hashes: HashMap<String, String>,
}
14
15impl ChangeDetector {
16    /// Create a new empty ChangeDetector.
17    pub fn new() -> Self {
18        Self {
19            known_hashes: HashMap::new(),
20        }
21    }
22
23    /// Load previously known hashes from storage.
24    ///
25    /// This reads from the `file_hashes` table if it exists. If the table
26    /// doesn't exist or the query fails, starts fresh with no known hashes.
27    pub fn load_from_storage(
28        &mut self,
29        storage: &dyn codemem_core::StorageBackend,
30        namespace: &str,
31    ) {
32        match storage.load_file_hashes(namespace) {
33            Ok(hashes) => {
34                tracing::debug!("Loaded {} known file hashes", hashes.len());
35                self.known_hashes = hashes;
36            }
37            Err(e) => {
38                tracing::warn!("Failed to load file hashes, starting fresh: {e}");
39            }
40        }
41    }
42
43    /// Save current hashes to storage.
44    pub fn save_to_storage(
45        &self,
46        storage: &dyn codemem_core::StorageBackend,
47        namespace: &str,
48    ) -> Result<(), codemem_core::CodememError> {
49        storage.save_file_hashes(namespace, &self.known_hashes)
50    }
51
52    /// Check if a file has changed and return (changed, hash) to avoid double-hashing.
53    /// Callers can pass the returned hash to `record_hash` to skip recomputation.
54    pub fn check_changed(&self, path: &str, content: &[u8]) -> (bool, String) {
55        let hash = Self::hash_content(content);
56        let changed = self.known_hashes.get(path) != Some(&hash);
57        (changed, hash)
58    }
59
60    /// Record a pre-computed hash for a file (avoids re-hashing when used with `check_changed`).
61    pub fn record_hash(&mut self, path: &str, hash: String) {
62        self.known_hashes.insert(path.to_string(), hash);
63    }
64
65    /// Check if a file has changed since the last index.
66    /// Returns `true` if the file is new or its content hash differs.
67    #[cfg(test)]
68    pub fn is_changed(&self, path: &str, content: &[u8]) -> bool {
69        let hash = Self::hash_content(content);
70        self.known_hashes.get(path) != Some(&hash)
71    }
72
73    /// Update the stored hash for a file after successful indexing.
74    #[cfg(test)]
75    pub fn update_hash(&mut self, path: &str, content: &[u8]) {
76        let hash = Self::hash_content(content);
77        self.known_hashes.insert(path.to_string(), hash);
78    }
79
80    /// Remove the hash for a file (e.g., when it's deleted).
81    #[cfg(test)]
82    pub fn remove_hash(&mut self, path: &str) {
83        self.known_hashes.remove(path);
84    }
85
86    /// Get the number of tracked files.
87    #[cfg(test)]
88    pub fn tracked_count(&self) -> usize {
89        self.known_hashes.len()
90    }
91
92    /// Compute SHA-256 hash of content bytes.
93    fn hash_content(content: &[u8]) -> String {
94        let mut hasher = Sha256::new();
95        hasher.update(content);
96        format!("{:x}", hasher.finalize())
97    }
98}
99
100impl Default for ChangeDetector {
101    fn default() -> Self {
102        Self::new()
103    }
104}
105
106#[cfg(test)]
107#[path = "tests/incremental_tests.rs"]
108mod tests;