ralph_workflow/json_parser/deduplication/rolling_hash.rs
1// Rolling hash window implementation for fast substring detection.
2//
3// Uses the Rabin-Karp algorithm for O(1) hash computation on sliding windows.
4
5use std::collections::HashMap;
6
7/// Rolling hash window for fast substring detection.
8///
9/// Maintains rolling hash values over accumulated content using the Rabin-Karp
10/// algorithm. Provides O(1) hash computation for sliding windows.
11///
12/// # Algorithm
13///
14/// Uses polynomial rolling hash with base 256 (byte values) and modulus
15/// from a large prime to minimize collisions.
16///
17/// Hash computation:
18/// ```text
19/// hash(s[0..n]) = (s[0] * b^(n-1) + s[1] * b^(n-2) + ... + s[n-1]) mod m
20/// ```
21///
22/// Rolling update:
23/// ```text
24/// hash(s[i+1..i+n+1]) = ((hash(s[i..i+n]) - s[i] * b^(n-1)) * b + s[i+n]) mod m
25/// ```
26///
27/// # Example
28///
29/// ```ignore
30/// let mut window = RollingHashWindow::new();
31/// window.add_content("Hello World");
32///
33/// // Check if "World" exists in the accumulated content
34/// let hashes = window.get_window_hashes(5); // 5 = length of "World"
35/// let world_hash = RollingHashWindow::compute_hash("World");
36///
37/// if hashes.contains(&world_hash) {
38/// // Potential match - verify with KMP
39/// }
40/// ```
41#[derive(Debug, Default, Clone)]
42pub struct RollingHashWindow {
43 /// Accumulated content for hash computation
44 content: String,
45 /// Cached hash values for different window sizes
46 /// Maps `window_size` -> Vec<(position, hash)>
47 cached_hashes: HashMap<usize, Vec<(usize, u64)>>,
48}
49
50impl RollingHashWindow {
51 /// Base for polynomial rolling hash (256 for byte values)
52 const BASE: u64 = 256;
53 /// Modulus for hash computation (large prime to minimize collisions)
54 const MODULUS: u64 = 2_147_483_647; // 2^31 - 1 (Mersenne prime)
55
56 /// Create a new rolling hash window.
57 #[cfg(test)]
58 #[must_use]
59 pub fn new() -> Self {
60 Self::default()
61 }
62
63 /// Compute rolling hash of a string slice.
64 ///
65 /// Uses polynomial rolling hash with base 256 and a large prime modulus.
66 ///
67 /// # Arguments
68 /// * `text` - The text to hash
69 ///
70 /// # Returns
71 /// The hash value as a u64
72 ///
73 /// # Example
74 /// ```ignore
75 /// let hash = RollingHashWindow::compute_hash("Hello");
76 /// ```
77 #[must_use]
78 pub fn compute_hash(text: &str) -> u64 {
79 let mut hash: u64 = 0;
80 for byte in text.bytes() {
81 hash = (hash * Self::BASE + u64::from(byte)) % Self::MODULUS;
82 }
83 hash
84 }
85
86 /// Compute base^(n-1) mod MODULUS for rolling hash updates.
87 ///
88 /// This is used to efficiently remove the leftmost character when
89 /// sliding the window.
90 #[cfg(test)]
91 fn compute_power(power: usize) -> u64 {
92 let mut result = 1u64;
93 for _ in 0..power {
94 result = (result * Self::BASE) % Self::MODULUS;
95 }
96 result
97 }
98
99 /// Add content to the window and update cached hashes.
100 ///
101 /// This appends new content and recomputes hash values for all
102 /// cached window sizes.
103 ///
104 /// # Arguments
105 /// * `text` - The new content to add
106 #[cfg(test)]
107 pub fn add_content(&mut self, text: &str) {
108 if text.is_empty() {
109 return;
110 }
111
112 let _start_pos = self.content.len();
113 self.content.push_str(text);
114
115 // Invalidate and recompute all cached hashes
116 self.cached_hashes.clear();
117 }
118
119 /// Get all window hashes of a specific size.
120 ///
121 /// Computes rolling hash values for all windows of the given size
122 /// in the accumulated content.
123 ///
124 /// # Arguments
125 /// * `window_size` - The size of each window
126 ///
127 /// # Returns
128 /// A vector of (position, hash) tuples for each window
129 ///
130 /// # Example
131 /// ```ignore
132 /// // Get hashes for all 5-character windows
133 /// let hashes = window.get_window_hashes(5);
134 /// ```
135 #[cfg(test)]
136 pub fn get_window_hashes(&mut self, window_size: usize) -> Vec<(usize, u64)> {
137 // Return cached copy if available
138 if let Some(hashes) = self.cached_hashes.get(&window_size) {
139 return hashes.clone();
140 }
141
142 // Compute hashes for this window size
143 let content_bytes = self.content.as_bytes();
144 let content_len = content_bytes.len();
145
146 if content_len < window_size {
147 // Not enough content for even one window
148 let empty: Vec<(usize, u64)> = Vec::new();
149 self.cached_hashes.insert(window_size, empty.clone());
150 return empty;
151 }
152
153 let mut hashes = Vec::new();
154 let mut hash: u64 = 0;
155
156 // Compute initial window hash
157 for byte in content_bytes.iter().take(window_size) {
158 hash = (hash * Self::BASE + u64::from(*byte)) % Self::MODULUS;
159 }
160 hashes.push((0, hash));
161
162 // Precompute base^(window_size-1) mod MODULUS for rolling updates
163 let power = Self::compute_power(window_size - 1);
164
165 // Slide window and compute rolling hashes
166 for i in 1..=(content_len - window_size) {
167 // Remove leftmost character contribution
168 let leftmost = u64::from(content_bytes[i - 1]);
169 let removed = (leftmost * power) % Self::MODULUS;
170 hash = (hash + Self::MODULUS - removed) % Self::MODULUS;
171
172 // Shift and add new character
173 hash = (hash * Self::BASE) % Self::MODULUS;
174 let new_char = u64::from(content_bytes[i + window_size - 1]);
175 hash = (hash + new_char) % Self::MODULUS;
176
177 hashes.push((i, hash));
178 }
179
180 // Cache for future use
181 self.cached_hashes.insert(window_size, hashes.clone());
182 hashes
183 }
184
185 /// Check if a hash exists in any window of the given size.
186 ///
187 /// This is a fast O(1) check after hashes have been computed.
188 ///
189 /// # Arguments
190 /// * `hash` - The hash value to search for
191 /// * `window_size` - The window size to check
192 ///
193 /// # Returns
194 /// * `Some(position)` - The position where the hash was found
195 /// * `None` - Hash not found
196 #[cfg(test)]
197 pub fn contains_hash(&mut self, hash: u64, window_size: usize) -> Option<usize> {
198 let hashes = self.get_window_hashes(window_size);
199 hashes
200 .into_iter()
201 .find(|(_, h)| *h == hash)
202 .map(|(pos, _)| pos)
203 }
204
205 /// Clear all content and cached hashes.
206 pub fn clear(&mut self) {
207 self.content.clear();
208 self.cached_hashes.clear();
209 }
210
211 /// Get the current content length.
212 #[cfg(test)]
213 #[must_use]
214 pub const fn len(&self) -> usize {
215 self.content.len()
216 }
217
218 /// Check if the window is empty.
219 #[cfg(test)]
220 #[must_use]
221 pub const fn is_empty(&self) -> bool {
222 self.content.is_empty()
223 }
224}