ralph_workflow/json_parser/deduplication/deduplicator.rs
1// Delta deduplicator implementation.
2//
3// Orchestrates rolling hash and KMP for two-phase deduplication.
4
5/// Delta deduplicator using rolling hash and KMP.
6///
7/// Orchestrates the two-phase deduplication approach:
8/// 1. Rolling hash for fast filtering
9/// 2. KMP for exact verification
10///
11/// # Example
12///
13/// ```ignore
14/// let mut dedup = DeltaDeduplicator::new();
15///
16/// // Add accumulated content
17/// dedup.add_accumulated("Hello World");
18///
19/// // Check if a delta is a duplicate
20/// if let Some(new_portion) = dedup.extract_new_content("Hello World!") {
21/// // "!" is the new portion
22/// assert_eq!(new_portion, "!");
23/// }
24/// ```
25#[derive(Debug, Default, Clone)]
26pub struct DeltaDeduplicator {
27 /// Rolling hash window for accumulated content
28 hash_window: RollingHashWindow,
29}
30
31impl DeltaDeduplicator {
32 /// Create a new delta deduplicator.
33 #[cfg(test)]
34 #[must_use]
35 pub fn new() -> Self {
36 Self::default()
37 }
38
39 /// Add accumulated content for deduplication tracking.
40 ///
41 /// # Arguments
42 /// * `content` - The accumulated content to track
43 #[cfg(test)]
44 pub fn add_accumulated(&mut self, content: &str) {
45 self.hash_window.add_content(content);
46 }
47
48 /// Extract new content from a potential snapshot.
49 ///
50 /// Uses rolling hash and KMP to detect if the incoming delta contains
51 /// previously accumulated content (a snapshot) and extracts only the
52 /// new portion.
53 ///
54 /// # Two-Phase Algorithm
55 ///
56 /// 1. **Rolling Hash Filter**: Compute hash of delta and check if it exists
57 /// in any window of the accumulated content. O(n) time.
58 ///
59 /// 2. **KMP Verification**: If hash matches, use KMP to verify actual
60 /// substring match and find the exact position. O(n+m) time.
61 ///
62 /// # Arguments
63 /// * `delta` - The incoming delta to check
64 /// * `accumulated` - The previously accumulated content
65 ///
66 /// # Returns
67 /// * `Some(new_portion)` - The delta starts with accumulated, returns new portion only
68 /// * `None` - The delta is genuinely new, doesn't start with accumulated
69 ///
70 /// # Example
71 /// ```ignore
72 /// let mut dedup = DeltaDeduplicator::new();
73 /// dedup.add_accumulated("Hello");
74 ///
75 /// // Snapshot: "Hello World"
76 /// assert_eq!(
77 /// DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
78 /// Some(" World")
79 /// );
80 ///
81 /// // Genuine delta: " World"
82 /// assert_eq!(
83 /// DeltaDeduplicator::extract_new_content(" World", "Hello"),
84 /// None
85 /// );
86 /// ```
87 #[cfg(test)]
88 #[must_use]
89 pub fn extract_new_content<'a>(delta: &'a str, accumulated: &str) -> Option<&'a str> {
90 // Handle identical content (delta == accumulated) - return empty string
91 if delta == accumulated {
92 return Some("");
93 }
94
95 // Fast rejection: delta must be longer than accumulated
96 if delta.len() <= accumulated.len() {
97 return None;
98 }
99
100 // Phase 1: Rolling hash check
101 let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
102 let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
103
104 // If hashes don't match, definitely not a snapshot
105 if accumulated_hash != delta_prefix_hash {
106 return None;
107 }
108
109 // Phase 2: KMP verification for exact match
110 let kmp = KMPMatcher::new(accumulated);
111 if let Some(pos) = kmp.find(delta) {
112 // Verify it's at position 0 (snapshot starts with accumulated)
113 if pos == 0 {
114 return Some(&delta[accumulated.len()..]);
115 }
116 }
117
118 // Hash collision or match not at start - not a snapshot
119 None
120 }
121
122 /// Check if delta is likely a snapshot using rolling hash only.
123 ///
124 /// This is a faster O(n) check that may have false positives due to
125 /// hash collisions. Use `extract_new_content` for verified results.
126 ///
127 /// # Arguments
128 /// * `delta` - The incoming delta to check
129 /// * `accumulated` - The previously accumulated content
130 ///
131 /// # Returns
132 /// * `true` - Delta may be a snapshot (hash matches)
133 /// * `false` - Delta is definitely not a snapshot (hash doesn't match)
134 #[must_use]
135 pub fn is_likely_snapshot(delta: &str, accumulated: &str) -> bool {
136 // Handle identical content (duplicate delta)
137 if delta == accumulated {
138 return true;
139 }
140
141 // Delta must be longer than accumulated to be a snapshot
142 if delta.len() <= accumulated.len() {
143 return false;
144 }
145
146 let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
147 let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
148
149 accumulated_hash == delta_prefix_hash
150 }
151
152 /// Check if delta is likely a snapshot with strong overlap detection.
153 ///
154 /// This is an enhanced version of `is_likely_snapshot` that applies
155 /// strong overlap thresholds to prevent false positives on intentional
156 /// repetitions.
157 ///
158 /// # Strong Overlap Detection
159 ///
160 /// This method only returns `true` when:
161 /// - The overlap meets minimum character count threshold (default: 30 chars)
162 /// - The overlap meets minimum ratio threshold (default: 50% of delta)
163 /// - The overlap ends at a safe boundary (whitespace/punctuation/newline)
164 /// - Short chunks (< 20 chars) are only deduped if exact match
165 ///
166 /// # Arguments
167 /// * `delta` - The incoming delta to check
168 /// * `accumulated` - The previously accumulated content
169 ///
170 /// # Returns
171 /// * `true` - Delta is a snapshot meeting strong overlap criteria
172 /// * `false` - Delta is either genuine or overlap is too weak
173 #[must_use]
174 pub fn is_likely_snapshot_with_thresholds(delta: &str, accumulated: &str) -> bool {
175 let thresholds = get_overlap_thresholds();
176
177 // Handle short chunks: only dedupe if exact match
178 if delta.len() < thresholds.short_chunk_threshold {
179 return delta == accumulated;
180 }
181
182 // Handle identical content (delta == accumulated)
183 // This is a snapshot where no new content is added
184 if delta == accumulated {
185 return true;
186 }
187
188 // Fast rejection: delta must be longer than accumulated
189 if delta.len() <= accumulated.len() {
190 return false;
191 }
192
193 // First check with basic rolling hash for quick rejection
194 if !Self::is_likely_snapshot(delta, accumulated) {
195 return false;
196 }
197
198 // Score the overlap to check if it meets strong overlap criteria
199 let score = score_overlap(delta, accumulated);
200
201 // Apply threshold checks
202 score.meets_thresholds(&thresholds)
203 }
204
205 /// Extract new content from a snapshot with strong overlap detection.
206 ///
207 /// This is an enhanced version of `extract_new_content` that only extracts
208 /// new content when the overlap meets strong overlap thresholds.
209 ///
210 /// # Arguments
211 /// * `delta` - The incoming delta to check
212 /// * `accumulated` - The previously accumulated content
213 ///
214 /// # Returns
215 /// * `Some(new_portion)` - The overlap meets thresholds, returns new portion
216 /// * `None` - The overlap is too weak or not a snapshot
217 #[must_use]
218 pub fn extract_new_content_with_thresholds<'a>(
219 delta: &'a str,
220 accumulated: &str,
221 ) -> Option<&'a str> {
222 let thresholds = get_overlap_thresholds();
223
224 // Handle short chunks: only dedupe if exact match
225 if delta.len() < thresholds.short_chunk_threshold {
226 if delta == accumulated {
227 return Some("");
228 }
229 return None;
230 }
231
232 // Handle identical content
233 if delta == accumulated {
234 return Some("");
235 }
236
237 // Fast rejection: delta must be longer than accumulated
238 if delta.len() <= accumulated.len() {
239 return None;
240 }
241
242 // Score the overlap
243 let score = score_overlap(delta, accumulated);
244
245 // Check if overlap meets thresholds
246 if !score.meets_thresholds(&thresholds) {
247 return None;
248 }
249
250 // Extract new content using the overlap length from the score
251 if score.char_count > 0 && delta.len() > score.char_count {
252 Some(&delta[score.char_count..])
253 } else {
254 None
255 }
256 }
257
258 /// Clear all tracked content.
259 pub fn clear(&mut self) {
260 self.hash_window.clear();
261 }
262}