ralph_workflow/json_parser/deduplication/deduplicator.rs
1// Delta deduplicator implementation.
2//
3// Orchestrates rolling hash and KMP for two-phase deduplication.
4
5#[cfg(test)]
6use crate::json_parser::deduplication::kmp_matcher::KMPMatcher;
7use crate::json_parser::deduplication::rolling_hash::RollingHashWindow;
8
9
10enum OverlapPrecondition {
11 /// Delta is exact match with accumulated — treat as duplicate with no new content.
12 ExactMatch,
13 /// Delta is long enough, not identical, and longer than accumulated — proceed to scoring.
14 NotShortNotIdentical,
15 /// Delta fails basic preconditions — not a snapshot.
16 Reject,
17}
18
19fn check_overlap_preconditions(delta: &str, accumulated: &str, thresholds: &OverlapThresholds) -> OverlapPrecondition {
20 if delta == accumulated { return OverlapPrecondition::ExactMatch; }
21 if delta.len() < thresholds.short_chunk_threshold || delta.len() <= accumulated.len() { return OverlapPrecondition::Reject; }
22 OverlapPrecondition::NotShortNotIdentical
23}
24
25fn extract_suffix_by_overlap(delta: &str, char_count: usize) -> Option<&str> {
26 if char_count > 0 && delta.len() > char_count {
27 Some(&delta[char_count..])
28 } else {
29 None
30 }
31}
32
33/// Delta deduplicator using rolling hash and KMP.
34///
35/// Orchestrates the two-phase deduplication approach:
36/// 1. Rolling hash for fast filtering
37/// 2. KMP for exact verification
38///
39/// # Example
40///
41/// ```ignore
42/// let mut dedup = DeltaDeduplicator::new();
43///
44/// // Add accumulated content
45/// dedup.add_accumulated("Hello World");
46///
47/// // Check if a delta is a duplicate
48/// if let Some(new_portion) = dedup.extract_new_content("Hello World!") {
49/// // "!" is the new portion
50/// assert_eq!(new_portion, "!");
51/// }
52/// ```
53#[derive(Debug, Default, Clone)]
54pub struct DeltaDeduplicator {
55 /// Rolling hash window for accumulated content
56 hash_window: RollingHashWindow,
57}
58
59impl DeltaDeduplicator {
60 /// Create a new delta deduplicator.
61 #[cfg(test)]
62 #[must_use]
63 pub fn new() -> Self {
64 Self::default()
65 }
66
67 /// Add accumulated content for deduplication tracking.
68 ///
69 /// # Arguments
70 /// * `content` - The accumulated content to track
71 #[cfg(test)]
72 pub fn add_accumulated(&mut self, content: &str) {
73 self.hash_window.add_content(content);
74 }
75
76 /// Extract new content from a potential snapshot.
77 ///
78 /// Uses rolling hash and KMP to detect if the incoming delta contains
79 /// previously accumulated content (a snapshot) and extracts only the
80 /// new portion.
81 ///
82 /// # Two-Phase Algorithm
83 ///
84 /// 1. **Rolling Hash Filter**: Compute hash of delta and check if it exists
85 /// in any window of the accumulated content. O(n) time.
86 ///
87 /// 2. **KMP Verification**: If hash matches, use KMP to verify actual
88 /// substring match and find the exact position. O(n+m) time.
89 ///
90 /// # Arguments
91 /// * `delta` - The incoming delta to check
92 /// * `accumulated` - The previously accumulated content
93 ///
94 /// # Returns
95 /// * `Some(new_portion)` - The delta starts with accumulated, returns new portion only
96 /// * `None` - The delta is genuinely new, doesn't start with accumulated
97 ///
98 /// # Example
99 /// ```ignore
100 /// let mut dedup = DeltaDeduplicator::new();
101 /// dedup.add_accumulated("Hello");
102 ///
103 /// // Snapshot: "Hello World"
104 /// assert_eq!(
105 /// DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
106 /// Some(" World")
107 /// );
108 ///
109 /// // Genuine delta: " World"
110 /// assert_eq!(
111 /// DeltaDeduplicator::extract_new_content(" World", "Hello"),
112 /// None
113 /// );
114 /// ```
115 #[cfg(test)]
116 #[must_use]
117 pub fn extract_new_content<'a>(delta: &'a str, accumulated: &str) -> Option<&'a str> {
118 // Handle identical content (delta == accumulated) - return empty string
119 if delta == accumulated {
120 return Some("");
121 }
122
123 // Fast rejection: delta must be longer than accumulated
124 if delta.len() <= accumulated.len() {
125 return None;
126 }
127
128 // Phase 1: Rolling hash check
129 let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
130 let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
131
132 // If hashes don't match, definitely not a snapshot
133 if accumulated_hash != delta_prefix_hash {
134 return None;
135 }
136
137 // Phase 2: KMP verification for exact match
138 let kmp = KMPMatcher::new(accumulated);
139 if let Some(pos) = kmp.find(delta) {
140 // Verify it's at position 0 (snapshot starts with accumulated)
141 if pos == 0 {
142 return Some(&delta[accumulated.len()..]);
143 }
144 }
145
146 // Hash collision or match not at start - not a snapshot
147 None
148 }
149
150 /// Check if delta is likely a snapshot using rolling hash only.
151 ///
152 /// This is a faster O(n) check that may have false positives due to
153 /// hash collisions. Use `extract_new_content` for verified results.
154 ///
155 /// # Arguments
156 /// * `delta` - The incoming delta to check
157 /// * `accumulated` - The previously accumulated content
158 ///
159 /// # Returns
160 /// * `true` - Delta may be a snapshot (hash matches)
161 /// * `false` - Delta is definitely not a snapshot (hash doesn't match)
162 #[must_use]
163 pub fn is_likely_snapshot(delta: &str, accumulated: &str) -> bool {
164 // Handle identical content (duplicate delta)
165 if delta == accumulated {
166 return true;
167 }
168
169 // Delta must be longer than accumulated to be a snapshot
170 if delta.len() <= accumulated.len() {
171 return false;
172 }
173
174 let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
175 let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
176
177 accumulated_hash == delta_prefix_hash
178 }
179
180 /// Check if delta is likely a snapshot with strong overlap detection.
181 ///
182 /// This is an enhanced version of `is_likely_snapshot` that applies
183 /// strong overlap thresholds to prevent false positives on intentional
184 /// repetitions.
185 ///
186 /// # Strong Overlap Detection
187 ///
188 /// This method only returns `true` when:
189 /// - The overlap meets minimum character count threshold (default: 30 chars)
190 /// - The overlap meets minimum ratio threshold (default: 50% of delta)
191 /// - The overlap ends at a safe boundary (whitespace/punctuation/newline)
192 /// - Short chunks (< 20 chars) are only deduped if exact match
193 ///
194 /// # Arguments
195 /// * `delta` - The incoming delta to check
196 /// * `accumulated` - The previously accumulated content
197 ///
198 /// # Returns
199 /// * `true` - Delta is a snapshot meeting strong overlap criteria
200 /// * `false` - Delta is either genuine or overlap is too weak
201 #[must_use]
202 pub fn is_likely_snapshot_with_thresholds(delta: &str, accumulated: &str) -> bool {
203 let thresholds = get_overlap_thresholds();
204 match check_overlap_preconditions(delta, accumulated, &thresholds) {
205 OverlapPrecondition::ExactMatch => true,
206 OverlapPrecondition::NotShortNotIdentical => {
207 Self::is_likely_snapshot(delta, accumulated)
208 && score_overlap(delta, accumulated).meets_thresholds(&thresholds)
209 }
210 OverlapPrecondition::Reject => false,
211 }
212 }
213
214 /// Extract new content from a snapshot with strong overlap detection.
215 ///
216 /// This is an enhanced version of `extract_new_content` that only extracts
217 /// new content when the overlap meets strong overlap thresholds.
218 ///
219 /// # Arguments
220 /// * `delta` - The incoming delta to check
221 /// * `accumulated` - The previously accumulated content
222 ///
223 /// # Returns
224 /// * `Some(new_portion)` - The overlap meets thresholds, returns new portion
225 /// * `None` - The overlap is too weak or not a snapshot
226 #[must_use]
227 pub fn extract_new_content_with_thresholds<'a>(
228 delta: &'a str,
229 accumulated: &str,
230 ) -> Option<&'a str> {
231 let thresholds = get_overlap_thresholds();
232 match check_overlap_preconditions(delta, accumulated, &thresholds) {
233 OverlapPrecondition::ExactMatch => Some(""),
234 OverlapPrecondition::NotShortNotIdentical => {
235 let score = score_overlap(delta, accumulated);
236 score
237 .meets_thresholds(&thresholds)
238 .then(|| extract_suffix_by_overlap(delta, score.char_count))
239 .flatten()
240 }
241 OverlapPrecondition::Reject => None,
242 }
243 }
244
245 /// Clear all tracked content.
246 pub fn clear(&mut self) {
247 self.hash_window.clear();
248 }
249}