Skip to main content

ralph_workflow/json_parser/deduplication/
rolling_hash.rs

1// Rolling hash window implementation for fast substring detection.
2//
3// Uses the Rabin-Karp algorithm for O(1) hash computation on sliding windows.
4
5use std::collections::HashMap;
6
7/// Rolling hash window for fast substring detection.
8///
9/// Maintains rolling hash values over accumulated content using the Rabin-Karp
10/// algorithm. Provides O(1) hash computation for sliding windows.
11///
12/// # Algorithm
13///
14/// Uses polynomial rolling hash with base 256 (byte values) and modulus
15/// from a large prime to minimize collisions.
16///
17/// Hash computation:
18/// ```text
19/// hash(s[0..n]) = (s[0] * b^(n-1) + s[1] * b^(n-2) + ... + s[n-1]) mod m
20/// ```
21///
22/// Rolling update:
23/// ```text
24/// hash(s[i+1..i+n+1]) = ((hash(s[i..i+n]) - s[i] * b^(n-1)) * b + s[i+n]) mod m
25/// ```
26///
27/// # Example
28///
29/// ```ignore
30/// let mut window = RollingHashWindow::new();
31/// window.add_content("Hello World");
32///
33/// // Check if "World" exists in the accumulated content
34/// let hashes = window.get_window_hashes(5); // 5 = length of "World"
35/// let world_hash = RollingHashWindow::compute_hash("World");
36///
37/// if hashes.contains(&world_hash) {
38///     // Potential match - verify with KMP
39/// }
40/// ```
41#[derive(Debug, Default, Clone)]
42pub struct RollingHashWindow {
43    /// Accumulated content for hash computation
44    content: String,
45    /// Cached hash values for different window sizes
46    /// Maps `window_size` -> Vec<(position, hash)>
47    cached_hashes: HashMap<usize, Vec<(usize, u64)>>,
48}
49
50impl RollingHashWindow {
51    /// Base for polynomial rolling hash (256 for byte values)
52    const BASE: u64 = 256;
53    /// Modulus for hash computation (large prime to minimize collisions)
54    const MODULUS: u64 = 2_147_483_647; // 2^31 - 1 (Mersenne prime)
55
56    /// Create a new rolling hash window.
57    #[cfg(test)]
58    #[must_use] 
59    pub fn new() -> Self {
60        Self::default()
61    }
62
63    /// Compute rolling hash of a string slice.
64    ///
65    /// Uses polynomial rolling hash with base 256 and a large prime modulus.
66    ///
67    /// # Arguments
68    /// * `text` - The text to hash
69    ///
70    /// # Returns
71    /// The hash value as a u64
72    ///
73    /// # Example
74    /// ```ignore
75    /// let hash = RollingHashWindow::compute_hash("Hello");
76    /// ```
77    #[must_use] 
78    pub fn compute_hash(text: &str) -> u64 {
79        let mut hash: u64 = 0;
80        for byte in text.bytes() {
81            hash = (hash * Self::BASE + u64::from(byte)) % Self::MODULUS;
82        }
83        hash
84    }
85
86    /// Compute base^(n-1) mod MODULUS for rolling hash updates.
87    ///
88    /// This is used to efficiently remove the leftmost character when
89    /// sliding the window.
90    #[cfg(test)]
91    fn compute_power(power: usize) -> u64 {
92        let mut result = 1u64;
93        for _ in 0..power {
94            result = (result * Self::BASE) % Self::MODULUS;
95        }
96        result
97    }
98
99    /// Add content to the window and update cached hashes.
100    ///
101    /// This appends new content and recomputes hash values for all
102    /// cached window sizes.
103    ///
104    /// # Arguments
105    /// * `text` - The new content to add
106    #[cfg(test)]
107    pub fn add_content(&mut self, text: &str) {
108        if text.is_empty() {
109            return;
110        }
111
112        let _start_pos = self.content.len();
113        self.content.push_str(text);
114
115        // Invalidate and recompute all cached hashes
116        self.cached_hashes.clear();
117    }
118
119    /// Get all window hashes of a specific size.
120    ///
121    /// Computes rolling hash values for all windows of the given size
122    /// in the accumulated content.
123    ///
124    /// # Arguments
125    /// * `window_size` - The size of each window
126    ///
127    /// # Returns
128    /// A vector of (position, hash) tuples for each window
129    ///
130    /// # Example
131    /// ```ignore
132    /// // Get hashes for all 5-character windows
133    /// let hashes = window.get_window_hashes(5);
134    /// ```
135    #[cfg(test)]
136    pub fn get_window_hashes(&mut self, window_size: usize) -> Vec<(usize, u64)> {
137        // Return cached copy if available
138        if let Some(hashes) = self.cached_hashes.get(&window_size) {
139            return hashes.clone();
140        }
141
142        // Compute hashes for this window size
143        let content_bytes = self.content.as_bytes();
144        let content_len = content_bytes.len();
145
146        if content_len < window_size {
147            // Not enough content for even one window
148            let empty: Vec<(usize, u64)> = Vec::new();
149            self.cached_hashes.insert(window_size, empty.clone());
150            return empty;
151        }
152
153        let mut hashes = Vec::new();
154        let mut hash: u64 = 0;
155
156        // Compute initial window hash
157        for byte in content_bytes.iter().take(window_size) {
158            hash = (hash * Self::BASE + u64::from(*byte)) % Self::MODULUS;
159        }
160        hashes.push((0, hash));
161
162        // Precompute base^(window_size-1) mod MODULUS for rolling updates
163        let power = Self::compute_power(window_size - 1);
164
165        // Slide window and compute rolling hashes
166        for i in 1..=(content_len - window_size) {
167            // Remove leftmost character contribution
168            let leftmost = u64::from(content_bytes[i - 1]);
169            let removed = (leftmost * power) % Self::MODULUS;
170            hash = (hash + Self::MODULUS - removed) % Self::MODULUS;
171
172            // Shift and add new character
173            hash = (hash * Self::BASE) % Self::MODULUS;
174            let new_char = u64::from(content_bytes[i + window_size - 1]);
175            hash = (hash + new_char) % Self::MODULUS;
176
177            hashes.push((i, hash));
178        }
179
180        // Cache for future use
181        self.cached_hashes.insert(window_size, hashes.clone());
182        hashes
183    }
184
185    /// Check if a hash exists in any window of the given size.
186    ///
187    /// This is a fast O(1) check after hashes have been computed.
188    ///
189    /// # Arguments
190    /// * `hash` - The hash value to search for
191    /// * `window_size` - The window size to check
192    ///
193    /// # Returns
194    /// * `Some(position)` - The position where the hash was found
195    /// * `None` - Hash not found
196    #[cfg(test)]
197    pub fn contains_hash(&mut self, hash: u64, window_size: usize) -> Option<usize> {
198        let hashes = self.get_window_hashes(window_size);
199        hashes
200            .into_iter()
201            .find(|(_, h)| *h == hash)
202            .map(|(pos, _)| pos)
203    }
204
205    /// Clear all content and cached hashes.
206    pub fn clear(&mut self) {
207        self.content.clear();
208        self.cached_hashes.clear();
209    }
210
211    /// Get the current content length.
212    #[cfg(test)]
213    #[must_use] 
214    pub const fn len(&self) -> usize {
215        self.content.len()
216    }
217
218    /// Check if the window is empty.
219    #[cfg(test)]
220    #[must_use] 
221    pub const fn is_empty(&self) -> bool {
222        self.content.is_empty()
223    }
224}