ralph_workflow/json_parser/deduplication/rolling_hash.rs
1// Rolling hash window implementation for fast substring detection.
2//
3// Uses the Rabin-Karp algorithm for O(1) hash computation on sliding windows.
4
5use std::collections::HashMap;
6
7/// Rolling hash window for fast substring detection.
8///
9/// Maintains rolling hash values over accumulated content using the Rabin-Karp
10/// algorithm. Provides O(1) hash computation for sliding windows.
11///
12/// # Algorithm
13///
14/// Uses polynomial rolling hash with base 256 (byte values) and modulus
15/// from a large prime to minimize collisions.
16///
17/// Hash computation:
18/// ```text
19/// hash(s[0..n]) = (s[0] * b^(n-1) + s[1] * b^(n-2) + ... + s[n-1]) mod m
20/// ```
21///
22/// Rolling update:
23/// ```text
24/// hash(s[i+1..i+n+1]) = ((hash(s[i..i+n]) - s[i] * b^(n-1)) * b + s[i+n]) mod m
25/// ```
26///
27/// # Example
28///
29/// ```ignore
30/// let mut window = RollingHashWindow::new();
31/// window.add_content("Hello World");
32///
33/// // Check if "World" exists in the accumulated content
34/// let hashes = window.get_window_hashes(5); // 5 = length of "World"
35/// let world_hash = RollingHashWindow::compute_hash("World");
36///
37/// if hashes.contains(&world_hash) {
38/// // Potential match - verify with KMP
39/// }
40/// ```
41#[derive(Debug, Default, Clone)]
42pub struct RollingHashWindow {
43 /// Accumulated content for hash computation
44 content: String,
45 /// Cached hash values for different window sizes
46 /// Maps `window_size` -> Vec<(position, hash)>
47 cached_hashes: HashMap<usize, Vec<(usize, u64)>>,
48}
49
50impl RollingHashWindow {
51 /// Base for polynomial rolling hash (256 for byte values)
52 const BASE: u64 = 256;
53 /// Modulus for hash computation (large prime to minimize collisions)
54 const MODULUS: u64 = 2_147_483_647; // 2^31 - 1 (Mersenne prime)
55
56 /// Create a new rolling hash window.
57 #[cfg(test)]
58 pub fn new() -> Self {
59 Self::default()
60 }
61
62 /// Compute rolling hash of a string slice.
63 ///
64 /// Uses polynomial rolling hash with base 256 and a large prime modulus.
65 ///
66 /// # Arguments
67 /// * `text` - The text to hash
68 ///
69 /// # Returns
70 /// The hash value as a u64
71 ///
72 /// # Example
73 /// ```ignore
74 /// let hash = RollingHashWindow::compute_hash("Hello");
75 /// ```
76 pub fn compute_hash(text: &str) -> u64 {
77 let mut hash: u64 = 0;
78 for byte in text.bytes() {
79 hash = (hash * Self::BASE + u64::from(byte)) % Self::MODULUS;
80 }
81 hash
82 }
83
84 /// Compute base^(n-1) mod MODULUS for rolling hash updates.
85 ///
86 /// This is used to efficiently remove the leftmost character when
87 /// sliding the window.
88 #[cfg(test)]
89 fn compute_power(power: usize) -> u64 {
90 let mut result = 1u64;
91 for _ in 0..power {
92 result = (result * Self::BASE) % Self::MODULUS;
93 }
94 result
95 }
96
97 /// Add content to the window and update cached hashes.
98 ///
99 /// This appends new content and recomputes hash values for all
100 /// cached window sizes.
101 ///
102 /// # Arguments
103 /// * `text` - The new content to add
104 #[cfg(test)]
105 pub fn add_content(&mut self, text: &str) {
106 if text.is_empty() {
107 return;
108 }
109
110 let _start_pos = self.content.len();
111 self.content.push_str(text);
112
113 // Invalidate and recompute all cached hashes
114 self.cached_hashes.clear();
115 }
116
117 /// Get all window hashes of a specific size.
118 ///
119 /// Computes rolling hash values for all windows of the given size
120 /// in the accumulated content.
121 ///
122 /// # Arguments
123 /// * `window_size` - The size of each window
124 ///
125 /// # Returns
126 /// A vector of (position, hash) tuples for each window
127 ///
128 /// # Example
129 /// ```ignore
130 /// // Get hashes for all 5-character windows
131 /// let hashes = window.get_window_hashes(5);
132 /// ```
133 #[cfg(test)]
134 pub fn get_window_hashes(&mut self, window_size: usize) -> Vec<(usize, u64)> {
135 // Return cached copy if available
136 if let Some(hashes) = self.cached_hashes.get(&window_size) {
137 return hashes.clone();
138 }
139
140 // Compute hashes for this window size
141 let content_bytes = self.content.as_bytes();
142 let content_len = content_bytes.len();
143
144 if content_len < window_size {
145 // Not enough content for even one window
146 let empty: Vec<(usize, u64)> = Vec::new();
147 self.cached_hashes.insert(window_size, empty.clone());
148 return empty;
149 }
150
151 let mut hashes = Vec::new();
152 let mut hash: u64 = 0;
153
154 // Compute initial window hash
155 for byte in content_bytes.iter().take(window_size) {
156 hash = (hash * Self::BASE + u64::from(*byte)) % Self::MODULUS;
157 }
158 hashes.push((0, hash));
159
160 // Precompute base^(window_size-1) mod MODULUS for rolling updates
161 let power = Self::compute_power(window_size - 1);
162
163 // Slide window and compute rolling hashes
164 for i in 1..=(content_len - window_size) {
165 // Remove leftmost character contribution
166 let leftmost = u64::from(content_bytes[i - 1]);
167 let removed = (leftmost * power) % Self::MODULUS;
168 hash = (hash + Self::MODULUS - removed) % Self::MODULUS;
169
170 // Shift and add new character
171 hash = (hash * Self::BASE) % Self::MODULUS;
172 let new_char = u64::from(content_bytes[i + window_size - 1]);
173 hash = (hash + new_char) % Self::MODULUS;
174
175 hashes.push((i, hash));
176 }
177
178 // Cache for future use
179 self.cached_hashes.insert(window_size, hashes.clone());
180 hashes
181 }
182
183 /// Check if a hash exists in any window of the given size.
184 ///
185 /// This is a fast O(1) check after hashes have been computed.
186 ///
187 /// # Arguments
188 /// * `hash` - The hash value to search for
189 /// * `window_size` - The window size to check
190 ///
191 /// # Returns
192 /// * `Some(position)` - The position where the hash was found
193 /// * `None` - Hash not found
194 #[cfg(test)]
195 pub fn contains_hash(&mut self, hash: u64, window_size: usize) -> Option<usize> {
196 let hashes = self.get_window_hashes(window_size);
197 hashes
198 .into_iter()
199 .find(|(_, h)| *h == hash)
200 .map(|(pos, _)| pos)
201 }
202
203 /// Clear all content and cached hashes.
204 pub fn clear(&mut self) {
205 self.content.clear();
206 self.cached_hashes.clear();
207 }
208
209 /// Get the current content length.
210 #[cfg(test)]
211 pub const fn len(&self) -> usize {
212 self.content.len()
213 }
214
215 /// Check if the window is empty.
216 #[cfg(test)]
217 pub const fn is_empty(&self) -> bool {
218 self.content.is_empty()
219 }
220}