ralph_workflow/json_parser/deduplication/deduplicator.rs
1// Delta deduplicator implementation.
2//
3// Orchestrates rolling hash and KMP for two-phase deduplication.
4
5/// Delta deduplicator using rolling hash and KMP.
6///
7/// Orchestrates the two-phase deduplication approach:
8/// 1. Rolling hash for fast filtering
9/// 2. KMP for exact verification
10///
11/// # Example
12///
13/// ```ignore
14/// let mut dedup = DeltaDeduplicator::new();
15///
16/// // Add accumulated content
17/// dedup.add_accumulated("Hello World");
18///
19/// // Check if a delta is a duplicate
20/// if let Some(new_portion) = dedup.extract_new_content("Hello World!") {
21/// // "!" is the new portion
22/// assert_eq!(new_portion, "!");
23/// }
24/// ```
25#[derive(Debug, Default, Clone)]
26pub struct DeltaDeduplicator {
27 /// Rolling hash window for accumulated content
28 hash_window: RollingHashWindow,
29}
30
31impl DeltaDeduplicator {
32 /// Create a new delta deduplicator.
33 #[cfg(test)]
34 pub fn new() -> Self {
35 Self::default()
36 }
37
38 /// Add accumulated content for deduplication tracking.
39 ///
40 /// # Arguments
41 /// * `content` - The accumulated content to track
42 #[cfg(test)]
43 pub fn add_accumulated(&mut self, content: &str) {
44 self.hash_window.add_content(content);
45 }
46
47 /// Extract new content from a potential snapshot.
48 ///
49 /// Uses rolling hash and KMP to detect if the incoming delta contains
50 /// previously accumulated content (a snapshot) and extracts only the
51 /// new portion.
52 ///
53 /// # Two-Phase Algorithm
54 ///
55 /// 1. **Rolling Hash Filter**: Compute hash of delta and check if it exists
56 /// in any window of the accumulated content. O(n) time.
57 ///
58 /// 2. **KMP Verification**: If hash matches, use KMP to verify actual
59 /// substring match and find the exact position. O(n+m) time.
60 ///
61 /// # Arguments
62 /// * `delta` - The incoming delta to check
63 /// * `accumulated` - The previously accumulated content
64 ///
65 /// # Returns
66 /// * `Some(new_portion)` - The delta starts with accumulated, returns new portion only
67 /// * `None` - The delta is genuinely new, doesn't start with accumulated
68 ///
69 /// # Example
70 /// ```ignore
71 /// let mut dedup = DeltaDeduplicator::new();
72 /// dedup.add_accumulated("Hello");
73 ///
74 /// // Snapshot: "Hello World"
75 /// assert_eq!(
76 /// DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
77 /// Some(" World")
78 /// );
79 ///
80 /// // Genuine delta: " World"
81 /// assert_eq!(
82 /// DeltaDeduplicator::extract_new_content(" World", "Hello"),
83 /// None
84 /// );
85 /// ```
86 #[cfg(test)]
87 pub fn extract_new_content<'a>(delta: &'a str, accumulated: &str) -> Option<&'a str> {
88 // Handle identical content (delta == accumulated) - return empty string
89 if delta == accumulated {
90 return Some("");
91 }
92
93 // Fast rejection: delta must be longer than accumulated
94 if delta.len() <= accumulated.len() {
95 return None;
96 }
97
98 // Phase 1: Rolling hash check
99 let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
100 let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
101
102 // If hashes don't match, definitely not a snapshot
103 if accumulated_hash != delta_prefix_hash {
104 return None;
105 }
106
107 // Phase 2: KMP verification for exact match
108 let kmp = KMPMatcher::new(accumulated);
109 if let Some(pos) = kmp.find(delta) {
110 // Verify it's at position 0 (snapshot starts with accumulated)
111 if pos == 0 {
112 return Some(&delta[accumulated.len()..]);
113 }
114 }
115
116 // Hash collision or match not at start - not a snapshot
117 None
118 }
119
120 /// Check if delta is likely a snapshot using rolling hash only.
121 ///
122 /// This is a faster O(n) check that may have false positives due to
123 /// hash collisions. Use `extract_new_content` for verified results.
124 ///
125 /// # Arguments
126 /// * `delta` - The incoming delta to check
127 /// * `accumulated` - The previously accumulated content
128 ///
129 /// # Returns
130 /// * `true` - Delta may be a snapshot (hash matches)
131 /// * `false` - Delta is definitely not a snapshot (hash doesn't match)
132 pub fn is_likely_snapshot(delta: &str, accumulated: &str) -> bool {
133 // Handle identical content (duplicate delta)
134 if delta == accumulated {
135 return true;
136 }
137
138 // Delta must be longer than accumulated to be a snapshot
139 if delta.len() <= accumulated.len() {
140 return false;
141 }
142
143 let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
144 let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
145
146 accumulated_hash == delta_prefix_hash
147 }
148
149 /// Check if delta is likely a snapshot with strong overlap detection.
150 ///
151 /// This is an enhanced version of `is_likely_snapshot` that applies
152 /// strong overlap thresholds to prevent false positives on intentional
153 /// repetitions.
154 ///
155 /// # Strong Overlap Detection
156 ///
157 /// This method only returns `true` when:
158 /// - The overlap meets minimum character count threshold (default: 30 chars)
159 /// - The overlap meets minimum ratio threshold (default: 50% of delta)
160 /// - The overlap ends at a safe boundary (whitespace/punctuation/newline)
161 /// - Short chunks (< 20 chars) are only deduped if exact match
162 ///
163 /// # Arguments
164 /// * `delta` - The incoming delta to check
165 /// * `accumulated` - The previously accumulated content
166 ///
167 /// # Returns
168 /// * `true` - Delta is a snapshot meeting strong overlap criteria
169 /// * `false` - Delta is either genuine or overlap is too weak
170 pub fn is_likely_snapshot_with_thresholds(delta: &str, accumulated: &str) -> bool {
171 let thresholds = get_overlap_thresholds();
172
173 // Handle short chunks: only dedupe if exact match
174 if delta.len() < thresholds.short_chunk_threshold {
175 return delta == accumulated;
176 }
177
178 // Handle identical content (delta == accumulated)
179 // This is a snapshot where no new content is added
180 if delta == accumulated {
181 return true;
182 }
183
184 // Fast rejection: delta must be longer than accumulated
185 if delta.len() <= accumulated.len() {
186 return false;
187 }
188
189 // First check with basic rolling hash for quick rejection
190 if !Self::is_likely_snapshot(delta, accumulated) {
191 return false;
192 }
193
194 // Score the overlap to check if it meets strong overlap criteria
195 let score = score_overlap(delta, accumulated);
196
197 // Apply threshold checks
198 score.meets_thresholds(&thresholds)
199 }
200
201 /// Extract new content from a snapshot with strong overlap detection.
202 ///
203 /// This is an enhanced version of `extract_new_content` that only extracts
204 /// new content when the overlap meets strong overlap thresholds.
205 ///
206 /// # Arguments
207 /// * `delta` - The incoming delta to check
208 /// * `accumulated` - The previously accumulated content
209 ///
210 /// # Returns
211 /// * `Some(new_portion)` - The overlap meets thresholds, returns new portion
212 /// * `None` - The overlap is too weak or not a snapshot
213 pub fn extract_new_content_with_thresholds<'a>(
214 delta: &'a str,
215 accumulated: &str,
216 ) -> Option<&'a str> {
217 let thresholds = get_overlap_thresholds();
218
219 // Handle short chunks: only dedupe if exact match
220 if delta.len() < thresholds.short_chunk_threshold {
221 if delta == accumulated {
222 return Some("");
223 }
224 return None;
225 }
226
227 // Handle identical content
228 if delta == accumulated {
229 return Some("");
230 }
231
232 // Fast rejection: delta must be longer than accumulated
233 if delta.len() <= accumulated.len() {
234 return None;
235 }
236
237 // Score the overlap
238 let score = score_overlap(delta, accumulated);
239
240 // Check if overlap meets thresholds
241 if !score.meets_thresholds(&thresholds) {
242 return None;
243 }
244
245 // Extract new content using the overlap length from the score
246 if score.char_count > 0 && delta.len() > score.char_count {
247 Some(&delta[score.char_count..])
248 } else {
249 None
250 }
251 }
252
253 /// Clear all tracked content.
254 pub fn clear(&mut self) {
255 self.hash_window.clear();
256 }
257}