Skip to main content

ralph_workflow/json_parser/deduplication/
deduplicator.rs

1// Delta deduplicator implementation.
2//
3// Orchestrates rolling hash and KMP for two-phase deduplication.
4
5#[cfg(test)]
6use crate::json_parser::deduplication::kmp_matcher::KMPMatcher;
7use crate::json_parser::deduplication::rolling_hash::RollingHashWindow;
8
9
10enum OverlapPrecondition {
11    /// Delta is exact match with accumulated — treat as duplicate with no new content.
12    ExactMatch,
13    /// Delta is long enough, not identical, and longer than accumulated — proceed to scoring.
14    NotShortNotIdentical,
15    /// Delta fails basic preconditions — not a snapshot.
16    Reject,
17}
18
19fn check_overlap_preconditions(delta: &str, accumulated: &str, thresholds: &OverlapThresholds) -> OverlapPrecondition {
20    if delta == accumulated { return OverlapPrecondition::ExactMatch; }
21    if delta.len() < thresholds.short_chunk_threshold || delta.len() <= accumulated.len() { return OverlapPrecondition::Reject; }
22    OverlapPrecondition::NotShortNotIdentical
23}
24
25fn extract_suffix_by_overlap(delta: &str, char_count: usize) -> Option<&str> {
26    if char_count > 0 && delta.len() > char_count {
27        Some(&delta[char_count..])
28    } else {
29        None
30    }
31}
32
33/// Delta deduplicator using rolling hash and KMP.
34///
35/// Orchestrates the two-phase deduplication approach:
36/// 1. Rolling hash for fast filtering
37/// 2. KMP for exact verification
38///
39/// # Example
40///
41/// ```ignore
42/// let mut dedup = DeltaDeduplicator::new();
43///
44/// // Add accumulated content
45/// dedup.add_accumulated("Hello World");
46///
47/// // Check if a delta is a duplicate
48/// if let Some(new_portion) = dedup.extract_new_content("Hello World!") {
49///     // "!" is the new portion
50///     assert_eq!(new_portion, "!");
51/// }
52/// ```
53#[derive(Debug, Default, Clone)]
54pub struct DeltaDeduplicator {
55    /// Rolling hash window for accumulated content
56    hash_window: RollingHashWindow,
57}
58
59impl DeltaDeduplicator {
60    /// Create a new delta deduplicator.
61    #[cfg(test)]
62    #[must_use]
63    pub fn new() -> Self {
64        Self::default()
65    }
66
67    /// Add accumulated content for deduplication tracking.
68    ///
69    /// # Arguments
70    /// * `content` - The accumulated content to track
71    #[cfg(test)]
72    pub fn add_accumulated(&mut self, content: &str) {
73        self.hash_window.add_content(content);
74    }
75
76    /// Extract new content from a potential snapshot.
77    ///
78    /// Uses rolling hash and KMP to detect if the incoming delta contains
79    /// previously accumulated content (a snapshot) and extracts only the
80    /// new portion.
81    ///
82    /// # Two-Phase Algorithm
83    ///
84    /// 1. **Rolling Hash Filter**: Compute hash of delta and check if it exists
85    ///    in any window of the accumulated content. O(n) time.
86    ///
87    /// 2. **KMP Verification**: If hash matches, use KMP to verify actual
88    ///    substring match and find the exact position. O(n+m) time.
89    ///
90    /// # Arguments
91    /// * `delta` - The incoming delta to check
92    /// * `accumulated` - The previously accumulated content
93    ///
94    /// # Returns
95    /// * `Some(new_portion)` - The delta starts with accumulated, returns new portion only
96    /// * `None` - The delta is genuinely new, doesn't start with accumulated
97    ///
98    /// # Example
99    /// ```ignore
100    /// let mut dedup = DeltaDeduplicator::new();
101    /// dedup.add_accumulated("Hello");
102    ///
103    /// // Snapshot: "Hello World"
104    /// assert_eq!(
105    ///     DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
106    ///     Some(" World")
107    /// );
108    ///
109    /// // Genuine delta: " World"
110    /// assert_eq!(
111    ///     DeltaDeduplicator::extract_new_content(" World", "Hello"),
112    ///     None
113    /// );
114    /// ```
115    #[cfg(test)]
116    #[must_use]
117    pub fn extract_new_content<'a>(delta: &'a str, accumulated: &str) -> Option<&'a str> {
118        // Handle identical content (delta == accumulated) - return empty string
119        if delta == accumulated {
120            return Some("");
121        }
122
123        // Fast rejection: delta must be longer than accumulated
124        if delta.len() <= accumulated.len() {
125            return None;
126        }
127
128        // Phase 1: Rolling hash check
129        let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
130        let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
131
132        // If hashes don't match, definitely not a snapshot
133        if accumulated_hash != delta_prefix_hash {
134            return None;
135        }
136
137        // Phase 2: KMP verification for exact match
138        let kmp = KMPMatcher::new(accumulated);
139        if let Some(pos) = kmp.find(delta) {
140            // Verify it's at position 0 (snapshot starts with accumulated)
141            if pos == 0 {
142                return Some(&delta[accumulated.len()..]);
143            }
144        }
145
146        // Hash collision or match not at start - not a snapshot
147        None
148    }
149
150    /// Check if delta is likely a snapshot using rolling hash only.
151    ///
152    /// This is a faster O(n) check that may have false positives due to
153    /// hash collisions. Use `extract_new_content` for verified results.
154    ///
155    /// # Arguments
156    /// * `delta` - The incoming delta to check
157    /// * `accumulated` - The previously accumulated content
158    ///
159    /// # Returns
160    /// * `true` - Delta may be a snapshot (hash matches)
161    /// * `false` - Delta is definitely not a snapshot (hash doesn't match)
162    #[must_use]
163    pub fn is_likely_snapshot(delta: &str, accumulated: &str) -> bool {
164        // Handle identical content (duplicate delta)
165        if delta == accumulated {
166            return true;
167        }
168
169        // Delta must be longer than accumulated to be a snapshot
170        if delta.len() <= accumulated.len() {
171            return false;
172        }
173
174        let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
175        let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
176
177        accumulated_hash == delta_prefix_hash
178    }
179
180    /// Check if delta is likely a snapshot with strong overlap detection.
181    ///
182    /// This is an enhanced version of `is_likely_snapshot` that applies
183    /// strong overlap thresholds to prevent false positives on intentional
184    /// repetitions.
185    ///
186    /// # Strong Overlap Detection
187    ///
188    /// This method only returns `true` when:
189    /// - The overlap meets minimum character count threshold (default: 30 chars)
190    /// - The overlap meets minimum ratio threshold (default: 50% of delta)
191    /// - The overlap ends at a safe boundary (whitespace/punctuation/newline)
192    /// - Short chunks (< 20 chars) are only deduped if exact match
193    ///
194    /// # Arguments
195    /// * `delta` - The incoming delta to check
196    /// * `accumulated` - The previously accumulated content
197    ///
198    /// # Returns
199    /// * `true` - Delta is a snapshot meeting strong overlap criteria
200    /// * `false` - Delta is either genuine or overlap is too weak
201    #[must_use]
202    pub fn is_likely_snapshot_with_thresholds(delta: &str, accumulated: &str) -> bool {
203        let thresholds = get_overlap_thresholds();
204        match check_overlap_preconditions(delta, accumulated, &thresholds) {
205            OverlapPrecondition::ExactMatch => true,
206            OverlapPrecondition::NotShortNotIdentical => {
207                Self::is_likely_snapshot(delta, accumulated)
208                    && score_overlap(delta, accumulated).meets_thresholds(&thresholds)
209            }
210            OverlapPrecondition::Reject => false,
211        }
212    }
213
214    /// Extract new content from a snapshot with strong overlap detection.
215    ///
216    /// This is an enhanced version of `extract_new_content` that only extracts
217    /// new content when the overlap meets strong overlap thresholds.
218    ///
219    /// # Arguments
220    /// * `delta` - The incoming delta to check
221    /// * `accumulated` - The previously accumulated content
222    ///
223    /// # Returns
224    /// * `Some(new_portion)` - The overlap meets thresholds, returns new portion
225    /// * `None` - The overlap is too weak or not a snapshot
226    #[must_use]
227    pub fn extract_new_content_with_thresholds<'a>(
228        delta: &'a str,
229        accumulated: &str,
230    ) -> Option<&'a str> {
231        let thresholds = get_overlap_thresholds();
232        match check_overlap_preconditions(delta, accumulated, &thresholds) {
233            OverlapPrecondition::ExactMatch => Some(""),
234            OverlapPrecondition::NotShortNotIdentical => {
235                let score = score_overlap(delta, accumulated);
236                score
237                    .meets_thresholds(&thresholds)
238                    .then(|| extract_suffix_by_overlap(delta, score.char_count))
239                    .flatten()
240            }
241            OverlapPrecondition::Reject => None,
242        }
243    }
244
245    /// Clear all tracked content.
246    pub fn clear(&mut self) {
247        self.hash_window.clear();
248    }
249}