Skip to main content

ralph_workflow/json_parser/deduplication/
boundary.rs

1//! Boundary module for inherently iterative deduplication algorithms.
2//!
3//! This module contains KMP and rolling hash implementations that are
4//! fundamentally iterative and cannot be cleanly expressed in functional style.
5//!
6//! These functions are exempt from functional programming lints.
7
8use std::collections::HashMap;
9
10fn kmp_fail_dispatch(lps: &mut [usize], len: &mut usize, i: &mut usize, byte: u8, prev: u8) {
11    if byte == prev {
12        *len = len.saturating_add(1);
13        lps[*i] = *len;
14        *i = i.saturating_add(1);
15    } else if *len != 0 {
16        *len = lps[*len - 1];
17    } else {
18        lps[*i] = 0;
19        *i = i.saturating_add(1);
20    }
21}
22
23fn kmp_build_failure_table(pattern_bytes: &[u8], m: usize) -> Vec<usize> {
24    let mut lps = vec![0; m];
25    let mut len = 0usize;
26    let mut i = 1usize;
27    while i < m {
28        let (bi, bl) = (pattern_bytes[i], pattern_bytes[len]);
29        kmp_fail_dispatch(&mut lps, &mut len, &mut i, bi, bl);
30    }
31    lps
32}
33
34fn kmp_search_on_match(i: &mut usize, j: &mut usize, m: usize) -> Option<usize> {
35    *i = i.saturating_add(1);
36    *j = j.saturating_add(1);
37    if *j == m {
38        Some(*i - *j)
39    } else {
40        None
41    }
42}
43
44fn kmp_search_miss(failure: &[usize], i: &mut usize, j: &mut usize) {
45    if *j != 0 {
46        *j = failure[*j - 1];
47    } else {
48        *i = i.saturating_add(1);
49    }
50}
51
52fn kmp_search_step(
53    text_bytes: &[u8],
54    pattern_bytes: &[u8],
55    failure: &[usize],
56    i: &mut usize,
57    j: &mut usize,
58    m: usize,
59) -> Option<usize> {
60    if pattern_bytes[*j] == text_bytes[*i] {
61        kmp_search_on_match(i, j, m)
62    } else {
63        kmp_search_miss(failure, i, j);
64        None
65    }
66}
67
68pub struct KMPMatcher {
69    pattern: String,
70    failure: Vec<usize>,
71}
72
73impl KMPMatcher {
74    #[must_use]
75    pub fn new(pattern: &str) -> Self {
76        let pattern = pattern.to_string();
77        let failure = Self::compute_failure(&pattern);
78        Self { pattern, failure }
79    }
80
81    fn compute_failure(pattern: &str) -> Vec<usize> {
82        let m = pattern.len();
83        if m == 0 {
84            return Vec::new();
85        }
86        let pattern_bytes = pattern.as_bytes();
87        kmp_build_failure_table(pattern_bytes, m)
88    }
89
90    fn kmp_srch(&self, text: &[u8], n: usize, m: usize) -> Option<usize> {
91        let (mut i, mut j, mut found) = (0, 0, None);
92        while i < n && found.is_none() {
93            found = kmp_search_step(
94                text,
95                self.pattern.as_bytes(),
96                &self.failure,
97                &mut i,
98                &mut j,
99                m,
100            );
101        }
102        found
103    }
104
105    #[must_use]
106    pub fn find(&self, text: &str) -> Option<usize> {
107        let n = text.len();
108        let m = self.pattern.len();
109        if m == 0 || n < m {
110            return None;
111        }
112        self.kmp_srch(text.as_bytes(), n, m)
113    }
114
115    #[cfg(test)]
116    #[must_use]
117    pub fn find_all(&self, text: &str) -> Vec<usize> {
118        let mut positions = Vec::new();
119        let n = text.len();
120        let m = self.pattern.len();
121
122        if m == 0 || n < m {
123            return positions;
124        }
125
126        let text_bytes = text.as_bytes();
127        let pattern_bytes = self.pattern.as_bytes();
128
129        let mut i = 0;
130        let mut j = 0;
131
132        while i < n {
133            if pattern_bytes[j] == text_bytes[i] {
134                i = i.saturating_add(1);
135                j = j.saturating_add(1);
136
137                if j == m {
138                    positions.push(i - j);
139                    j = self.failure[j - 1];
140                }
141            } else if j != 0 {
142                j = self.failure[j - 1];
143            } else {
144                i = i.saturating_add(1);
145            }
146        }
147
148        positions
149    }
150
151    #[cfg(test)]
152    #[must_use]
153    pub const fn pattern_len(&self) -> usize {
154        self.pattern.len()
155    }
156
157    #[cfg(test)]
158    #[must_use]
159    pub const fn is_empty(&self) -> bool {
160        self.pattern.is_empty()
161    }
162
163    #[cfg(test)]
164    #[must_use]
165    pub(crate) fn failure_table(&self) -> &[usize] {
166        &self.failure
167    }
168}
169
170#[derive(Debug, Default, Clone)]
171pub struct RollingHashWindow {
172    content: String,
173    cached_hashes: HashMap<usize, Vec<(usize, u64)>>,
174}
175
176impl RollingHashWindow {
177    const BASE: u64 = 256;
178    const MODULUS: u64 = 2_147_483_647;
179
180    #[cfg(test)]
181    #[must_use]
182    pub fn new() -> Self {
183        Self::default()
184    }
185
186    #[must_use]
187    pub fn compute_hash(text: &str) -> u64 {
188        text.bytes().fold(0u64, |hash, byte| {
189            (hash * Self::BASE + u64::from(byte)) % Self::MODULUS
190        })
191    }
192
193    #[cfg(test)]
194    fn compute_power(power: usize) -> u64 {
195        (0..power).fold(1u64, |result, _| (result * Self::BASE) % Self::MODULUS)
196    }
197
198    #[cfg(test)]
199    pub fn add_content(&mut self, text: &str) {
200        if text.is_empty() {
201            return;
202        }
203
204        self.content.push_str(text);
205        self.cached_hashes.clear();
206    }
207
208    #[cfg(test)]
209    pub fn get_window_hashes(&mut self, window_size: usize) -> Vec<(usize, u64)> {
210        if let Some(hashes) = self.cached_hashes.get(&window_size) {
211            return hashes.clone();
212        }
213
214        let content_bytes = self.content.as_bytes();
215        let content_len = content_bytes.len();
216
217        if content_len < window_size {
218            let empty: Vec<(usize, u64)> = Vec::new();
219            self.cached_hashes.insert(window_size, empty.clone());
220            return empty;
221        }
222
223        let mut hashes = Vec::new();
224        let mut hash: u64 = 0;
225
226        for byte in content_bytes.iter().take(window_size) {
227            hash = (hash * Self::BASE + u64::from(*byte)) % Self::MODULUS;
228        }
229        hashes.push((0, hash));
230
231        let power = Self::compute_power(window_size - 1);
232
233        for i in 1..=(content_len - window_size) {
234            let leftmost = u64::from(content_bytes[i - 1]);
235            let removed = (leftmost * power) % Self::MODULUS;
236            hash = (hash + Self::MODULUS - removed) % Self::MODULUS;
237
238            hash = (hash * Self::BASE) % Self::MODULUS;
239            let new_char = u64::from(content_bytes[i + window_size - 1]);
240            hash = (hash + new_char) % Self::MODULUS;
241
242            hashes.push((i, hash));
243        }
244
245        self.cached_hashes.insert(window_size, hashes.clone());
246        hashes
247    }
248
249    #[cfg(test)]
250    pub fn contains_hash(&mut self, hash: u64, window_size: usize) -> Option<usize> {
251        let hashes = self.get_window_hashes(window_size);
252        hashes
253            .into_iter()
254            .find(|(_, h)| *h == hash)
255            .map(|(pos, _)| pos)
256    }
257
258    pub fn clear(&mut self) {
259        self.content.clear();
260        self.cached_hashes.clear();
261    }
262
263    #[cfg(test)]
264    #[must_use]
265    pub const fn len(&self) -> usize {
266        self.content.len()
267    }
268
269    #[cfg(test)]
270    #[must_use]
271    pub const fn is_empty(&self) -> bool {
272        self.content.is_empty()
273    }
274}