Skip to main content

codemem_engine/index/
incremental.rs

1//! SHA-256 based change detection for incremental indexing.
2//!
3//! Tracks content hashes of previously indexed files so unchanged files
4//! can be skipped on subsequent indexing runs.
5
6use sha2::{Digest, Sha256};
7use std::collections::HashMap;
8
/// Tracks file content hashes for incremental change detection.
///
/// Each entry associates a file path with the hex-encoded hash that was
/// recorded for that file's content. Comparing a freshly computed hash with
/// the recorded one tells the indexer whether the file can be skipped.
///
/// `Debug` and `Clone` are derived so the detector can be logged and
/// snapshotted; `Default` is provided by a separate manual impl.
#[derive(Debug, Clone)]
pub struct ChangeDetector {
    /// Map of file_path -> SHA-256 hex hash from last index run.
    known_hashes: HashMap<String, String>,
}
14
15impl ChangeDetector {
16    /// Create a new empty ChangeDetector.
17    pub fn new() -> Self {
18        Self {
19            known_hashes: HashMap::new(),
20        }
21    }
22
23    /// Load previously known hashes from storage.
24    ///
25    /// This reads from the `file_hashes` table if it exists. If the table
26    /// doesn't exist or the query fails, starts fresh with no known hashes.
27    pub fn load_from_storage(&mut self, storage: &dyn codemem_core::StorageBackend) {
28        match storage.load_file_hashes() {
29            Ok(hashes) => {
30                tracing::debug!("Loaded {} known file hashes", hashes.len());
31                self.known_hashes = hashes;
32            }
33            Err(e) => {
34                tracing::warn!("Failed to load file hashes, starting fresh: {e}");
35            }
36        }
37    }
38
39    /// Save current hashes to storage.
40    pub fn save_to_storage(
41        &self,
42        storage: &dyn codemem_core::StorageBackend,
43    ) -> Result<(), codemem_core::CodememError> {
44        storage.save_file_hashes(&self.known_hashes)
45    }
46
47    /// Check if a file has changed and return (changed, hash) to avoid double-hashing.
48    /// Callers can pass the returned hash to `record_hash` to skip recomputation.
49    pub fn check_changed(&self, path: &str, content: &[u8]) -> (bool, String) {
50        let hash = Self::hash_content(content);
51        let changed = self.known_hashes.get(path) != Some(&hash);
52        (changed, hash)
53    }
54
55    /// Record a pre-computed hash for a file (avoids re-hashing when used with `check_changed`).
56    pub fn record_hash(&mut self, path: &str, hash: String) {
57        self.known_hashes.insert(path.to_string(), hash);
58    }
59
60    /// Check if a file has changed since the last index.
61    /// Returns `true` if the file is new or its content hash differs.
62    #[cfg(test)]
63    pub fn is_changed(&self, path: &str, content: &[u8]) -> bool {
64        let hash = Self::hash_content(content);
65        self.known_hashes.get(path) != Some(&hash)
66    }
67
68    /// Update the stored hash for a file after successful indexing.
69    #[cfg(test)]
70    pub fn update_hash(&mut self, path: &str, content: &[u8]) {
71        let hash = Self::hash_content(content);
72        self.known_hashes.insert(path.to_string(), hash);
73    }
74
75    /// Remove the hash for a file (e.g., when it's deleted).
76    #[cfg(test)]
77    pub fn remove_hash(&mut self, path: &str) {
78        self.known_hashes.remove(path);
79    }
80
81    /// Get the number of tracked files.
82    #[cfg(test)]
83    pub fn tracked_count(&self) -> usize {
84        self.known_hashes.len()
85    }
86
87    /// Compute SHA-256 hash of content bytes.
88    fn hash_content(content: &[u8]) -> String {
89        let mut hasher = Sha256::new();
90        hasher.update(content);
91        format!("{:x}", hasher.finalize())
92    }
93}
94
95impl Default for ChangeDetector {
96    fn default() -> Self {
97        Self::new()
98    }
99}
100
101#[cfg(test)]
102#[path = "tests/incremental_tests.rs"]
103mod tests;