Skip to main content

ralph_workflow/json_parser/deduplication/
rolling_hash.rs

1// Rolling hash window implementation for fast substring detection.
2//
3// Uses the Rabin-Karp algorithm for O(1) hash computation on sliding windows.
4
5use std::collections::HashMap;
6
7/// Rolling hash window for fast substring detection.
8///
9/// Maintains rolling hash values over accumulated content using the Rabin-Karp
10/// algorithm. Provides O(1) hash computation for sliding windows.
11///
12/// # Algorithm
13///
14/// Uses polynomial rolling hash with base 256 (byte values) and modulus
15/// from a large prime to minimize collisions.
16///
17/// Hash computation:
18/// ```text
19/// hash(s[0..n]) = (s[0] * b^(n-1) + s[1] * b^(n-2) + ... + s[n-1]) mod m
20/// ```
21///
22/// Rolling update:
23/// ```text
24/// hash(s[i+1..i+n+1]) = ((hash(s[i..i+n]) - s[i] * b^(n-1)) * b + s[i+n]) mod m
25/// ```
26///
27/// # Example
28///
29/// ```ignore
30/// let mut window = RollingHashWindow::new();
31/// window.add_content("Hello World");
32///
33/// // Check if "World" exists in the accumulated content
34/// let hashes = window.get_window_hashes(5); // 5 = length of "World"
35/// let world_hash = RollingHashWindow::compute_hash("World");
36///
37/// if hashes.contains(&world_hash) {
38///     // Potential match - verify with KMP
39/// }
40/// ```
41#[derive(Debug, Default, Clone)]
42pub struct RollingHashWindow {
43    /// Accumulated content for hash computation
44    content: String,
45    /// Cached hash values for different window sizes
46    /// Maps `window_size` -> Vec<(position, hash)>
47    cached_hashes: HashMap<usize, Vec<(usize, u64)>>,
48}
49
50impl RollingHashWindow {
51    /// Base for polynomial rolling hash (256 for byte values)
52    const BASE: u64 = 256;
53    /// Modulus for hash computation (large prime to minimize collisions)
54    const MODULUS: u64 = 2_147_483_647; // 2^31 - 1 (Mersenne prime)
55
56    /// Create a new rolling hash window.
57    #[cfg(test)]
58    pub fn new() -> Self {
59        Self::default()
60    }
61
62    /// Compute rolling hash of a string slice.
63    ///
64    /// Uses polynomial rolling hash with base 256 and a large prime modulus.
65    ///
66    /// # Arguments
67    /// * `text` - The text to hash
68    ///
69    /// # Returns
70    /// The hash value as a u64
71    ///
72    /// # Example
73    /// ```ignore
74    /// let hash = RollingHashWindow::compute_hash("Hello");
75    /// ```
76    pub fn compute_hash(text: &str) -> u64 {
77        let mut hash: u64 = 0;
78        for byte in text.bytes() {
79            hash = (hash * Self::BASE + u64::from(byte)) % Self::MODULUS;
80        }
81        hash
82    }
83
84    /// Compute base^(n-1) mod MODULUS for rolling hash updates.
85    ///
86    /// This is used to efficiently remove the leftmost character when
87    /// sliding the window.
88    #[cfg(test)]
89    fn compute_power(power: usize) -> u64 {
90        let mut result = 1u64;
91        for _ in 0..power {
92            result = (result * Self::BASE) % Self::MODULUS;
93        }
94        result
95    }
96
97    /// Add content to the window and update cached hashes.
98    ///
99    /// This appends new content and recomputes hash values for all
100    /// cached window sizes.
101    ///
102    /// # Arguments
103    /// * `text` - The new content to add
104    #[cfg(test)]
105    pub fn add_content(&mut self, text: &str) {
106        if text.is_empty() {
107            return;
108        }
109
110        let _start_pos = self.content.len();
111        self.content.push_str(text);
112
113        // Invalidate and recompute all cached hashes
114        self.cached_hashes.clear();
115    }
116
117    /// Get all window hashes of a specific size.
118    ///
119    /// Computes rolling hash values for all windows of the given size
120    /// in the accumulated content.
121    ///
122    /// # Arguments
123    /// * `window_size` - The size of each window
124    ///
125    /// # Returns
126    /// A vector of (position, hash) tuples for each window
127    ///
128    /// # Example
129    /// ```ignore
130    /// // Get hashes for all 5-character windows
131    /// let hashes = window.get_window_hashes(5);
132    /// ```
133    #[cfg(test)]
134    pub fn get_window_hashes(&mut self, window_size: usize) -> Vec<(usize, u64)> {
135        // Return cached copy if available
136        if let Some(hashes) = self.cached_hashes.get(&window_size) {
137            return hashes.clone();
138        }
139
140        // Compute hashes for this window size
141        let content_bytes = self.content.as_bytes();
142        let content_len = content_bytes.len();
143
144        if content_len < window_size {
145            // Not enough content for even one window
146            let empty: Vec<(usize, u64)> = Vec::new();
147            self.cached_hashes.insert(window_size, empty.clone());
148            return empty;
149        }
150
151        let mut hashes = Vec::new();
152        let mut hash: u64 = 0;
153
154        // Compute initial window hash
155        for byte in content_bytes.iter().take(window_size) {
156            hash = (hash * Self::BASE + u64::from(*byte)) % Self::MODULUS;
157        }
158        hashes.push((0, hash));
159
160        // Precompute base^(window_size-1) mod MODULUS for rolling updates
161        let power = Self::compute_power(window_size - 1);
162
163        // Slide window and compute rolling hashes
164        for i in 1..=(content_len - window_size) {
165            // Remove leftmost character contribution
166            let leftmost = u64::from(content_bytes[i - 1]);
167            let removed = (leftmost * power) % Self::MODULUS;
168            hash = (hash + Self::MODULUS - removed) % Self::MODULUS;
169
170            // Shift and add new character
171            hash = (hash * Self::BASE) % Self::MODULUS;
172            let new_char = u64::from(content_bytes[i + window_size - 1]);
173            hash = (hash + new_char) % Self::MODULUS;
174
175            hashes.push((i, hash));
176        }
177
178        // Cache for future use
179        self.cached_hashes.insert(window_size, hashes.clone());
180        hashes
181    }
182
183    /// Check if a hash exists in any window of the given size.
184    ///
185    /// This is a fast O(1) check after hashes have been computed.
186    ///
187    /// # Arguments
188    /// * `hash` - The hash value to search for
189    /// * `window_size` - The window size to check
190    ///
191    /// # Returns
192    /// * `Some(position)` - The position where the hash was found
193    /// * `None` - Hash not found
194    #[cfg(test)]
195    pub fn contains_hash(&mut self, hash: u64, window_size: usize) -> Option<usize> {
196        let hashes = self.get_window_hashes(window_size);
197        hashes
198            .into_iter()
199            .find(|(_, h)| *h == hash)
200            .map(|(pos, _)| pos)
201    }
202
203    /// Clear all content and cached hashes.
204    pub fn clear(&mut self) {
205        self.content.clear();
206        self.cached_hashes.clear();
207    }
208
209    /// Get the current content length.
210    #[cfg(test)]
211    pub const fn len(&self) -> usize {
212        self.content.len()
213    }
214
215    /// Check if the window is empty.
216    #[cfg(test)]
217    pub const fn is_empty(&self) -> bool {
218        self.content.is_empty()
219    }
220}