Skip to main content

ralph_workflow/json_parser/deduplication/
deduplicator.rs

1// Delta deduplicator implementation.
2//
3// Orchestrates rolling hash and KMP for two-phase deduplication.
4
5/// Delta deduplicator using rolling hash and KMP.
6///
7/// Orchestrates the two-phase deduplication approach:
8/// 1. Rolling hash for fast filtering
9/// 2. KMP for exact verification
10///
11/// # Example
12///
13/// ```ignore
14/// let mut dedup = DeltaDeduplicator::new();
15///
16/// // Add accumulated content
17/// dedup.add_accumulated("Hello World");
18///
19/// // Check if a delta is a duplicate
20/// if let Some(new_portion) = dedup.extract_new_content("Hello World!") {
21///     // "!" is the new portion
22///     assert_eq!(new_portion, "!");
23/// }
24/// ```
25#[derive(Debug, Default, Clone)]
26pub struct DeltaDeduplicator {
27    /// Rolling hash window for accumulated content
28    hash_window: RollingHashWindow,
29}
30
31impl DeltaDeduplicator {
32    /// Create a new delta deduplicator.
33    #[cfg(test)]
34    #[must_use] 
35    pub fn new() -> Self {
36        Self::default()
37    }
38
39    /// Add accumulated content for deduplication tracking.
40    ///
41    /// # Arguments
42    /// * `content` - The accumulated content to track
43    #[cfg(test)]
44    pub fn add_accumulated(&mut self, content: &str) {
45        self.hash_window.add_content(content);
46    }
47
48    /// Extract new content from a potential snapshot.
49    ///
50    /// Uses rolling hash and KMP to detect if the incoming delta contains
51    /// previously accumulated content (a snapshot) and extracts only the
52    /// new portion.
53    ///
54    /// # Two-Phase Algorithm
55    ///
56    /// 1. **Rolling Hash Filter**: Compute hash of delta and check if it exists
57    ///    in any window of the accumulated content. O(n) time.
58    ///
59    /// 2. **KMP Verification**: If hash matches, use KMP to verify actual
60    ///    substring match and find the exact position. O(n+m) time.
61    ///
62    /// # Arguments
63    /// * `delta` - The incoming delta to check
64    /// * `accumulated` - The previously accumulated content
65    ///
66    /// # Returns
67    /// * `Some(new_portion)` - The delta starts with accumulated, returns new portion only
68    /// * `None` - The delta is genuinely new, doesn't start with accumulated
69    ///
70    /// # Example
71    /// ```ignore
72    /// let mut dedup = DeltaDeduplicator::new();
73    /// dedup.add_accumulated("Hello");
74    ///
75    /// // Snapshot: "Hello World"
76    /// assert_eq!(
77    ///     DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
78    ///     Some(" World")
79    /// );
80    ///
81    /// // Genuine delta: " World"
82    /// assert_eq!(
83    ///     DeltaDeduplicator::extract_new_content(" World", "Hello"),
84    ///     None
85    /// );
86    /// ```
87    #[cfg(test)]
88    #[must_use] 
89    pub fn extract_new_content<'a>(delta: &'a str, accumulated: &str) -> Option<&'a str> {
90        // Handle identical content (delta == accumulated) - return empty string
91        if delta == accumulated {
92            return Some("");
93        }
94
95        // Fast rejection: delta must be longer than accumulated
96        if delta.len() <= accumulated.len() {
97            return None;
98        }
99
100        // Phase 1: Rolling hash check
101        let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
102        let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
103
104        // If hashes don't match, definitely not a snapshot
105        if accumulated_hash != delta_prefix_hash {
106            return None;
107        }
108
109        // Phase 2: KMP verification for exact match
110        let kmp = KMPMatcher::new(accumulated);
111        if let Some(pos) = kmp.find(delta) {
112            // Verify it's at position 0 (snapshot starts with accumulated)
113            if pos == 0 {
114                return Some(&delta[accumulated.len()..]);
115            }
116        }
117
118        // Hash collision or match not at start - not a snapshot
119        None
120    }
121
122    /// Check if delta is likely a snapshot using rolling hash only.
123    ///
124    /// This is a faster O(n) check that may have false positives due to
125    /// hash collisions. Use `extract_new_content` for verified results.
126    ///
127    /// # Arguments
128    /// * `delta` - The incoming delta to check
129    /// * `accumulated` - The previously accumulated content
130    ///
131    /// # Returns
132    /// * `true` - Delta may be a snapshot (hash matches)
133    /// * `false` - Delta is definitely not a snapshot (hash doesn't match)
134    #[must_use] 
135    pub fn is_likely_snapshot(delta: &str, accumulated: &str) -> bool {
136        // Handle identical content (duplicate delta)
137        if delta == accumulated {
138            return true;
139        }
140
141        // Delta must be longer than accumulated to be a snapshot
142        if delta.len() <= accumulated.len() {
143            return false;
144        }
145
146        let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
147        let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
148
149        accumulated_hash == delta_prefix_hash
150    }
151
152    /// Check if delta is likely a snapshot with strong overlap detection.
153    ///
154    /// This is an enhanced version of `is_likely_snapshot` that applies
155    /// strong overlap thresholds to prevent false positives on intentional
156    /// repetitions.
157    ///
158    /// # Strong Overlap Detection
159    ///
160    /// This method only returns `true` when:
161    /// - The overlap meets minimum character count threshold (default: 30 chars)
162    /// - The overlap meets minimum ratio threshold (default: 50% of delta)
163    /// - The overlap ends at a safe boundary (whitespace/punctuation/newline)
164    /// - Short chunks (< 20 chars) are only deduped if exact match
165    ///
166    /// # Arguments
167    /// * `delta` - The incoming delta to check
168    /// * `accumulated` - The previously accumulated content
169    ///
170    /// # Returns
171    /// * `true` - Delta is a snapshot meeting strong overlap criteria
172    /// * `false` - Delta is either genuine or overlap is too weak
173    #[must_use] 
174    pub fn is_likely_snapshot_with_thresholds(delta: &str, accumulated: &str) -> bool {
175        let thresholds = get_overlap_thresholds();
176
177        // Handle short chunks: only dedupe if exact match
178        if delta.len() < thresholds.short_chunk_threshold {
179            return delta == accumulated;
180        }
181
182        // Handle identical content (delta == accumulated)
183        // This is a snapshot where no new content is added
184        if delta == accumulated {
185            return true;
186        }
187
188        // Fast rejection: delta must be longer than accumulated
189        if delta.len() <= accumulated.len() {
190            return false;
191        }
192
193        // First check with basic rolling hash for quick rejection
194        if !Self::is_likely_snapshot(delta, accumulated) {
195            return false;
196        }
197
198        // Score the overlap to check if it meets strong overlap criteria
199        let score = score_overlap(delta, accumulated);
200
201        // Apply threshold checks
202        score.meets_thresholds(&thresholds)
203    }
204
205    /// Extract new content from a snapshot with strong overlap detection.
206    ///
207    /// This is an enhanced version of `extract_new_content` that only extracts
208    /// new content when the overlap meets strong overlap thresholds.
209    ///
210    /// # Arguments
211    /// * `delta` - The incoming delta to check
212    /// * `accumulated` - The previously accumulated content
213    ///
214    /// # Returns
215    /// * `Some(new_portion)` - The overlap meets thresholds, returns new portion
216    /// * `None` - The overlap is too weak or not a snapshot
217    #[must_use] 
218    pub fn extract_new_content_with_thresholds<'a>(
219        delta: &'a str,
220        accumulated: &str,
221    ) -> Option<&'a str> {
222        let thresholds = get_overlap_thresholds();
223
224        // Handle short chunks: only dedupe if exact match
225        if delta.len() < thresholds.short_chunk_threshold {
226            if delta == accumulated {
227                return Some("");
228            }
229            return None;
230        }
231
232        // Handle identical content
233        if delta == accumulated {
234            return Some("");
235        }
236
237        // Fast rejection: delta must be longer than accumulated
238        if delta.len() <= accumulated.len() {
239            return None;
240        }
241
242        // Score the overlap
243        let score = score_overlap(delta, accumulated);
244
245        // Check if overlap meets thresholds
246        if !score.meets_thresholds(&thresholds) {
247            return None;
248        }
249
250        // Extract new content using the overlap length from the score
251        if score.char_count > 0 && delta.len() > score.char_count {
252            Some(&delta[score.char_count..])
253        } else {
254            None
255        }
256    }
257
258    /// Clear all tracked content.
259    pub fn clear(&mut self) {
260        self.hash_window.clear();
261    }
262}