ralph_workflow/json_parser/deduplication/
boundary.rs1use std::collections::HashMap;
9
10fn kmp_fail_dispatch(lps: &mut [usize], len: &mut usize, i: &mut usize, byte: u8, prev: u8) {
11 if byte == prev {
12 *len = len.saturating_add(1);
13 lps[*i] = *len;
14 *i = i.saturating_add(1);
15 } else if *len != 0 {
16 *len = lps[*len - 1];
17 } else {
18 lps[*i] = 0;
19 *i = i.saturating_add(1);
20 }
21}
22
23fn kmp_build_failure_table(pattern_bytes: &[u8], m: usize) -> Vec<usize> {
24 let mut lps = vec![0; m];
25 let mut len = 0usize;
26 let mut i = 1usize;
27 while i < m {
28 let (bi, bl) = (pattern_bytes[i], pattern_bytes[len]);
29 kmp_fail_dispatch(&mut lps, &mut len, &mut i, bi, bl);
30 }
31 lps
32}
33
34fn kmp_search_on_match(i: &mut usize, j: &mut usize, m: usize) -> Option<usize> {
35 *i = i.saturating_add(1);
36 *j = j.saturating_add(1);
37 if *j == m {
38 Some(*i - *j)
39 } else {
40 None
41 }
42}
43
44fn kmp_search_miss(failure: &[usize], i: &mut usize, j: &mut usize) {
45 if *j != 0 {
46 *j = failure[*j - 1];
47 } else {
48 *i = i.saturating_add(1);
49 }
50}
51
52fn kmp_search_step(
53 text_bytes: &[u8],
54 pattern_bytes: &[u8],
55 failure: &[usize],
56 i: &mut usize,
57 j: &mut usize,
58 m: usize,
59) -> Option<usize> {
60 if pattern_bytes[*j] == text_bytes[*i] {
61 kmp_search_on_match(i, j, m)
62 } else {
63 kmp_search_miss(failure, i, j);
64 None
65 }
66}
67
68pub struct KMPMatcher {
69 pattern: String,
70 failure: Vec<usize>,
71}
72
73impl KMPMatcher {
74 #[must_use]
75 pub fn new(pattern: &str) -> Self {
76 let pattern = pattern.to_string();
77 let failure = Self::compute_failure(&pattern);
78 Self { pattern, failure }
79 }
80
81 fn compute_failure(pattern: &str) -> Vec<usize> {
82 let m = pattern.len();
83 if m == 0 {
84 return Vec::new();
85 }
86 let pattern_bytes = pattern.as_bytes();
87 kmp_build_failure_table(pattern_bytes, m)
88 }
89
90 fn kmp_srch(&self, text: &[u8], n: usize, m: usize) -> Option<usize> {
91 let (mut i, mut j, mut found) = (0, 0, None);
92 while i < n && found.is_none() {
93 found = kmp_search_step(
94 text,
95 self.pattern.as_bytes(),
96 &self.failure,
97 &mut i,
98 &mut j,
99 m,
100 );
101 }
102 found
103 }
104
105 #[must_use]
106 pub fn find(&self, text: &str) -> Option<usize> {
107 let n = text.len();
108 let m = self.pattern.len();
109 if m == 0 || n < m {
110 return None;
111 }
112 self.kmp_srch(text.as_bytes(), n, m)
113 }
114
115 #[cfg(test)]
116 #[must_use]
117 pub fn find_all(&self, text: &str) -> Vec<usize> {
118 let mut positions = Vec::new();
119 let n = text.len();
120 let m = self.pattern.len();
121
122 if m == 0 || n < m {
123 return positions;
124 }
125
126 let text_bytes = text.as_bytes();
127 let pattern_bytes = self.pattern.as_bytes();
128
129 let mut i = 0;
130 let mut j = 0;
131
132 while i < n {
133 if pattern_bytes[j] == text_bytes[i] {
134 i = i.saturating_add(1);
135 j = j.saturating_add(1);
136
137 if j == m {
138 positions.push(i - j);
139 j = self.failure[j - 1];
140 }
141 } else if j != 0 {
142 j = self.failure[j - 1];
143 } else {
144 i = i.saturating_add(1);
145 }
146 }
147
148 positions
149 }
150
151 #[cfg(test)]
152 #[must_use]
153 pub const fn pattern_len(&self) -> usize {
154 self.pattern.len()
155 }
156
157 #[cfg(test)]
158 #[must_use]
159 pub const fn is_empty(&self) -> bool {
160 self.pattern.is_empty()
161 }
162
163 #[cfg(test)]
164 #[must_use]
165 pub(crate) fn failure_table(&self) -> &[usize] {
166 &self.failure
167 }
168}
169
170#[derive(Debug, Default, Clone)]
171pub struct RollingHashWindow {
172 content: String,
173 cached_hashes: HashMap<usize, Vec<(usize, u64)>>,
174}
175
176impl RollingHashWindow {
177 const BASE: u64 = 256;
178 const MODULUS: u64 = 2_147_483_647;
179
180 #[cfg(test)]
181 #[must_use]
182 pub fn new() -> Self {
183 Self::default()
184 }
185
186 #[must_use]
187 pub fn compute_hash(text: &str) -> u64 {
188 text.bytes().fold(0u64, |hash, byte| {
189 (hash * Self::BASE + u64::from(byte)) % Self::MODULUS
190 })
191 }
192
193 #[cfg(test)]
194 fn compute_power(power: usize) -> u64 {
195 (0..power).fold(1u64, |result, _| (result * Self::BASE) % Self::MODULUS)
196 }
197
198 #[cfg(test)]
199 pub fn add_content(&mut self, text: &str) {
200 if text.is_empty() {
201 return;
202 }
203
204 self.content.push_str(text);
205 self.cached_hashes.clear();
206 }
207
208 #[cfg(test)]
209 pub fn get_window_hashes(&mut self, window_size: usize) -> Vec<(usize, u64)> {
210 if let Some(hashes) = self.cached_hashes.get(&window_size) {
211 return hashes.clone();
212 }
213
214 let content_bytes = self.content.as_bytes();
215 let content_len = content_bytes.len();
216
217 if content_len < window_size {
218 let empty: Vec<(usize, u64)> = Vec::new();
219 self.cached_hashes.insert(window_size, empty.clone());
220 return empty;
221 }
222
223 let mut hashes = Vec::new();
224 let mut hash: u64 = 0;
225
226 for byte in content_bytes.iter().take(window_size) {
227 hash = (hash * Self::BASE + u64::from(*byte)) % Self::MODULUS;
228 }
229 hashes.push((0, hash));
230
231 let power = Self::compute_power(window_size - 1);
232
233 for i in 1..=(content_len - window_size) {
234 let leftmost = u64::from(content_bytes[i - 1]);
235 let removed = (leftmost * power) % Self::MODULUS;
236 hash = (hash + Self::MODULUS - removed) % Self::MODULUS;
237
238 hash = (hash * Self::BASE) % Self::MODULUS;
239 let new_char = u64::from(content_bytes[i + window_size - 1]);
240 hash = (hash + new_char) % Self::MODULUS;
241
242 hashes.push((i, hash));
243 }
244
245 self.cached_hashes.insert(window_size, hashes.clone());
246 hashes
247 }
248
249 #[cfg(test)]
250 pub fn contains_hash(&mut self, hash: u64, window_size: usize) -> Option<usize> {
251 let hashes = self.get_window_hashes(window_size);
252 hashes
253 .into_iter()
254 .find(|(_, h)| *h == hash)
255 .map(|(pos, _)| pos)
256 }
257
258 pub fn clear(&mut self) {
259 self.content.clear();
260 self.cached_hashes.clear();
261 }
262
263 #[cfg(test)]
264 #[must_use]
265 pub const fn len(&self) -> usize {
266 self.content.len()
267 }
268
269 #[cfg(test)]
270 #[must_use]
271 pub const fn is_empty(&self) -> bool {
272 self.content.is_empty()
273 }
274}