Skip to main content

ralph_workflow/json_parser/deduplication/
deduplicator.rs

1// Delta deduplicator implementation.
2//
3// Orchestrates rolling hash and KMP for two-phase deduplication.
4
5/// Delta deduplicator using rolling hash and KMP.
6///
7/// Orchestrates the two-phase deduplication approach:
8/// 1. Rolling hash for fast filtering
9/// 2. KMP for exact verification
10///
11/// # Example
12///
13/// ```ignore
14/// let mut dedup = DeltaDeduplicator::new();
15///
16/// // Add accumulated content
17/// dedup.add_accumulated("Hello World");
18///
19/// // Check if a delta is a duplicate
20/// if let Some(new_portion) = dedup.extract_new_content("Hello World!") {
21///     // "!" is the new portion
22///     assert_eq!(new_portion, "!");
23/// }
24/// ```
25#[derive(Debug, Default, Clone)]
26pub struct DeltaDeduplicator {
27    /// Rolling hash window for accumulated content
28    hash_window: RollingHashWindow,
29}
30
31impl DeltaDeduplicator {
32    /// Create a new delta deduplicator.
33    #[cfg(test)]
34    pub fn new() -> Self {
35        Self::default()
36    }
37
38    /// Add accumulated content for deduplication tracking.
39    ///
40    /// # Arguments
41    /// * `content` - The accumulated content to track
42    #[cfg(test)]
43    pub fn add_accumulated(&mut self, content: &str) {
44        self.hash_window.add_content(content);
45    }
46
47    /// Extract new content from a potential snapshot.
48    ///
49    /// Uses rolling hash and KMP to detect if the incoming delta contains
50    /// previously accumulated content (a snapshot) and extracts only the
51    /// new portion.
52    ///
53    /// # Two-Phase Algorithm
54    ///
55    /// 1. **Rolling Hash Filter**: Compute hash of delta and check if it exists
56    ///    in any window of the accumulated content. O(n) time.
57    ///
58    /// 2. **KMP Verification**: If hash matches, use KMP to verify actual
59    ///    substring match and find the exact position. O(n+m) time.
60    ///
61    /// # Arguments
62    /// * `delta` - The incoming delta to check
63    /// * `accumulated` - The previously accumulated content
64    ///
65    /// # Returns
66    /// * `Some(new_portion)` - The delta starts with accumulated, returns new portion only
67    /// * `None` - The delta is genuinely new, doesn't start with accumulated
68    ///
69    /// # Example
70    /// ```ignore
71    /// let mut dedup = DeltaDeduplicator::new();
72    /// dedup.add_accumulated("Hello");
73    ///
74    /// // Snapshot: "Hello World"
75    /// assert_eq!(
76    ///     DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
77    ///     Some(" World")
78    /// );
79    ///
80    /// // Genuine delta: " World"
81    /// assert_eq!(
82    ///     DeltaDeduplicator::extract_new_content(" World", "Hello"),
83    ///     None
84    /// );
85    /// ```
86    #[cfg(test)]
87    pub fn extract_new_content<'a>(delta: &'a str, accumulated: &str) -> Option<&'a str> {
88        // Handle identical content (delta == accumulated) - return empty string
89        if delta == accumulated {
90            return Some("");
91        }
92
93        // Fast rejection: delta must be longer than accumulated
94        if delta.len() <= accumulated.len() {
95            return None;
96        }
97
98        // Phase 1: Rolling hash check
99        let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
100        let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
101
102        // If hashes don't match, definitely not a snapshot
103        if accumulated_hash != delta_prefix_hash {
104            return None;
105        }
106
107        // Phase 2: KMP verification for exact match
108        let kmp = KMPMatcher::new(accumulated);
109        if let Some(pos) = kmp.find(delta) {
110            // Verify it's at position 0 (snapshot starts with accumulated)
111            if pos == 0 {
112                return Some(&delta[accumulated.len()..]);
113            }
114        }
115
116        // Hash collision or match not at start - not a snapshot
117        None
118    }
119
120    /// Check if delta is likely a snapshot using rolling hash only.
121    ///
122    /// This is a faster O(n) check that may have false positives due to
123    /// hash collisions. Use `extract_new_content` for verified results.
124    ///
125    /// # Arguments
126    /// * `delta` - The incoming delta to check
127    /// * `accumulated` - The previously accumulated content
128    ///
129    /// # Returns
130    /// * `true` - Delta may be a snapshot (hash matches)
131    /// * `false` - Delta is definitely not a snapshot (hash doesn't match)
132    pub fn is_likely_snapshot(delta: &str, accumulated: &str) -> bool {
133        // Handle identical content (duplicate delta)
134        if delta == accumulated {
135            return true;
136        }
137
138        // Delta must be longer than accumulated to be a snapshot
139        if delta.len() <= accumulated.len() {
140            return false;
141        }
142
143        let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
144        let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
145
146        accumulated_hash == delta_prefix_hash
147    }
148
149    /// Check if delta is likely a snapshot with strong overlap detection.
150    ///
151    /// This is an enhanced version of `is_likely_snapshot` that applies
152    /// strong overlap thresholds to prevent false positives on intentional
153    /// repetitions.
154    ///
155    /// # Strong Overlap Detection
156    ///
157    /// This method only returns `true` when:
158    /// - The overlap meets minimum character count threshold (default: 30 chars)
159    /// - The overlap meets minimum ratio threshold (default: 50% of delta)
160    /// - The overlap ends at a safe boundary (whitespace/punctuation/newline)
161    /// - Short chunks (< 20 chars) are only deduped if exact match
162    ///
163    /// # Arguments
164    /// * `delta` - The incoming delta to check
165    /// * `accumulated` - The previously accumulated content
166    ///
167    /// # Returns
168    /// * `true` - Delta is a snapshot meeting strong overlap criteria
169    /// * `false` - Delta is either genuine or overlap is too weak
170    pub fn is_likely_snapshot_with_thresholds(delta: &str, accumulated: &str) -> bool {
171        let thresholds = get_overlap_thresholds();
172
173        // Handle short chunks: only dedupe if exact match
174        if delta.len() < thresholds.short_chunk_threshold {
175            return delta == accumulated;
176        }
177
178        // Handle identical content (delta == accumulated)
179        // This is a snapshot where no new content is added
180        if delta == accumulated {
181            return true;
182        }
183
184        // Fast rejection: delta must be longer than accumulated
185        if delta.len() <= accumulated.len() {
186            return false;
187        }
188
189        // First check with basic rolling hash for quick rejection
190        if !Self::is_likely_snapshot(delta, accumulated) {
191            return false;
192        }
193
194        // Score the overlap to check if it meets strong overlap criteria
195        let score = score_overlap(delta, accumulated);
196
197        // Apply threshold checks
198        score.meets_thresholds(&thresholds)
199    }
200
201    /// Extract new content from a snapshot with strong overlap detection.
202    ///
203    /// This is an enhanced version of `extract_new_content` that only extracts
204    /// new content when the overlap meets strong overlap thresholds.
205    ///
206    /// # Arguments
207    /// * `delta` - The incoming delta to check
208    /// * `accumulated` - The previously accumulated content
209    ///
210    /// # Returns
211    /// * `Some(new_portion)` - The overlap meets thresholds, returns new portion
212    /// * `None` - The overlap is too weak or not a snapshot
213    pub fn extract_new_content_with_thresholds<'a>(
214        delta: &'a str,
215        accumulated: &str,
216    ) -> Option<&'a str> {
217        let thresholds = get_overlap_thresholds();
218
219        // Handle short chunks: only dedupe if exact match
220        if delta.len() < thresholds.short_chunk_threshold {
221            if delta == accumulated {
222                return Some("");
223            }
224            return None;
225        }
226
227        // Handle identical content
228        if delta == accumulated {
229            return Some("");
230        }
231
232        // Fast rejection: delta must be longer than accumulated
233        if delta.len() <= accumulated.len() {
234            return None;
235        }
236
237        // Score the overlap
238        let score = score_overlap(delta, accumulated);
239
240        // Check if overlap meets thresholds
241        if !score.meets_thresholds(&thresholds) {
242            return None;
243        }
244
245        // Extract new content using the overlap length from the score
246        if score.char_count > 0 && delta.len() > score.char_count {
247            Some(&delta[score.char_count..])
248        } else {
249            None
250        }
251    }
252
253    /// Clear all tracked content.
254    pub fn clear(&mut self) {
255        self.hash_window.clear();
256    }
257}