ralph_workflow/json_parser/
deduplication.rs

1//! Delta deduplication using KMP and Rolling Hash algorithms.
2//!
3//! This module provides efficient deduplication for streaming deltas using:
4//! - **Rolling Hash (Rabin-Karp)**: Fast O(n) filtering to eliminate impossible matches
5//! - **KMP (Knuth-Morris-Pratt)**: O(n+m) verification for exact substring matching
6//! - **Strong Overlap Detection**: Thresholds and boundary checks to prevent false positives
7//!
8//! # Enhanced Deduplication
9//!
10//! The enhanced algorithm uses multiple layers of validation to prevent false positives:
11//!
12//! 1. **Rolling Hash Filter**: Fast O(n) check to eliminate impossible matches
13//! 2. **KMP Verification**: O(n+m) confirmation of actual substring match
14//! 3. **Overlap Threshold**: Only dedupe when overlap >= 30 chars AND >= 50% of delta
15//! 4. **Boundary Sanity**: Ensure overlap ends at whitespace/punctuation/newline
16//! 5. **Short Chunk Protection**: Chunks < 20 chars never deduped unless exact match
17//!
18//! # Architecture
19//!
20//! ```text
21//! Incoming Delta
22//!       │
23//!       ▼
24//! ┌─────────────────────┐
25//! │  Rolling Hash Check │  ◄── Compute hash of delta, compare against
26//! │  (Rabin-Karp)       │      sliding window hashes of accumulated content
27//! └──────────┬──────────┘
28//!            │
29//!     ┌──────┴──────┐
30//!     │ Hash Match? │
31//!     └──────┬──────┘
32//!       No   │   Yes
33//!       │    │
34//!       ▼    ▼
35//!    Accept  ┌─────────────────┐
36//!    Delta   │  KMP Verification│  ◄── Confirm actual substring match
37//!            └────────┬────────┘
38//!                     │
39//!              ┌──────┴──────┐
40//!              │True Match?  │
41//!              └──────┬──────┘
42//!                No   │   Yes
43//!                │    │
44//!                ▼    ▼
45//!             Accept  ┌─────────────────────┐
46//!             Delta   │ Strong Overlap Check│ ◄── >= 30 chars, >= 50%, safe boundary
47//!                     └──────────┬──────────┘
48//!                                │
49//!                         ┌──────┴──────┐
50//!                         │Measures?    │
51//!                         └──────┬──────┘
52//!                           No   │   Yes
53//!                           │    │
54//!                           ▼    ▼
55//!                        Accept  Extract New
56//!                        Delta   Portion Only
57//! ```
58
59use std::collections::HashMap;
60use std::sync::OnceLock;
61
62// ============================================================================
63// Configuration Constants for Strong Overlap Detection
64// ============================================================================
65
66/// Default minimum overlap character count for deduplication.
67///
68/// Overlaps must be at least this many characters to be considered for deduplication.
69/// This prevents false positives from short accidental matches (e.g., "the", "and").
70const DEFAULT_MIN_OVERLAP_CHARS: usize = 30;
71
72/// Minimum overlap ratio expressed as integer (50 = 50%).
73/// Used for integer-based ratio comparison to avoid floating point issues.
74const MIN_OVERLAP_RATIO_INT: usize = 50;
75
76/// Default threshold for considering a chunk "short".
77///
78/// Short chunks (< this many chars) are never deduped unless they're exact matches
79/// with the accumulated content. This prevents aggressive deduplication of tokens
80/// like ".", "\n", "Ok" that are legitimately repeated.
81const DEFAULT_SHORT_CHUNK_THRESHOLD: usize = 20;
82
83/// Default threshold for consecutive duplicate detection.
84///
85/// If the exact same chunk arrives this many times in a row, it's treated as a
86/// resend glitch and dropped entirely.
87const DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 3;
88
89/// Minimum allowed value for `MIN_OVERLAP_CHARS`.
90const MIN_MIN_OVERLAP_CHARS: usize = 10;
91
92/// Maximum allowed value for `MIN_OVERLAP_CHARS`.
93const MAX_MIN_OVERLAP_CHARS: usize = 100;
94
95/// Minimum allowed value for `SHORT_CHUNK_THRESHOLD`.
96const MIN_SHORT_CHUNK_THRESHOLD: usize = 5;
97
98/// Maximum allowed value for `SHORT_CHUNK_THRESHOLD`.
99const MAX_SHORT_CHUNK_THRESHOLD: usize = 50;
100
101/// Minimum allowed value for `CONSECUTIVE_DUPLICATE_THRESHOLD`.
102const MIN_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 2;
103
104/// Maximum allowed value for `CONSECUTIVE_DUPLICATE_THRESHOLD`.
105const MAX_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 10;
106
107/// Configuration for strong overlap detection.
108///
109/// This struct holds the tunable thresholds that determine when an overlap
110/// is "strong enough" to warrant deduplication.
111#[derive(Debug, Clone, Copy)]
112pub struct OverlapThresholds {
113    /// Minimum character count for overlap
114    pub min_overlap_chars: usize,
115    /// Threshold below which chunks are considered "short"
116    pub short_chunk_threshold: usize,
117    /// Number of consecutive duplicates before aggressive dedupe
118    pub consecutive_duplicate_threshold: usize,
119}
120
121impl Default for OverlapThresholds {
122    fn default() -> Self {
123        Self {
124            min_overlap_chars: DEFAULT_MIN_OVERLAP_CHARS,
125            short_chunk_threshold: DEFAULT_SHORT_CHUNK_THRESHOLD,
126            consecutive_duplicate_threshold: DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD,
127        }
128    }
129}
130
131/// Get the overlap thresholds from environment variables or use defaults.
132///
133/// Reads the following environment variables:
134/// - `RALPH_STREAMING_MIN_OVERLAP_CHARS`: Minimum overlap characters (default: 30, range: 10-100)
135/// - `RALPH_STREAMING_MIN_OVERLAP_RATIO`: Minimum overlap ratio (default: 0.5, range: 0.1-0.9)
136/// - `RALPH_STREAMING_SHORT_CHUNK_THRESHOLD`: Short chunk threshold (default: 20, range: 5-50)
137/// - `RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD`: Consecutive duplicate threshold (default: 3, range: 2-10)
138///
139/// # Returns
140/// The configured overlap thresholds.
141pub fn get_overlap_thresholds() -> OverlapThresholds {
142    static THRESHOLDS: OnceLock<OverlapThresholds> = OnceLock::new();
143    *THRESHOLDS.get_or_init(|| {
144        let min_overlap_chars = std::env::var("RALPH_STREAMING_MIN_OVERLAP_CHARS")
145            .ok()
146            .and_then(|s| s.parse::<usize>().ok())
147            .and_then(|v| {
148                if (MIN_MIN_OVERLAP_CHARS..=MAX_MIN_OVERLAP_CHARS).contains(&v) {
149                    Some(v)
150                } else {
151                    None
152                }
153            })
154            .unwrap_or(DEFAULT_MIN_OVERLAP_CHARS);
155
156        let short_chunk_threshold = std::env::var("RALPH_STREAMING_SHORT_CHUNK_THRESHOLD")
157            .ok()
158            .and_then(|s| s.parse::<usize>().ok())
159            .and_then(|v| {
160                if (MIN_SHORT_CHUNK_THRESHOLD..=MAX_SHORT_CHUNK_THRESHOLD).contains(&v) {
161                    Some(v)
162                } else {
163                    None
164                }
165            })
166            .unwrap_or(DEFAULT_SHORT_CHUNK_THRESHOLD);
167
168        let consecutive_duplicate_threshold =
169            std::env::var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD")
170                .ok()
171                .and_then(|s| s.parse::<usize>().ok())
172                .and_then(|v| {
173                    if (MIN_CONSECUTIVE_DUPLICATE_THRESHOLD..=MAX_CONSECUTIVE_DUPLICATE_THRESHOLD)
174                        .contains(&v)
175                    {
176                        Some(v)
177                    } else {
178                        None
179                    }
180                })
181                .unwrap_or(DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD);
182
183        OverlapThresholds {
184            min_overlap_chars,
185            short_chunk_threshold,
186            consecutive_duplicate_threshold,
187        }
188    })
189}
190
191// ============================================================================
192// Boundary Detection
193// ============================================================================
194
195/// Check if a character position is at a safe boundary for deduplication.
196///
197/// A "safe boundary" is where the overlap ends at a natural break point in text:
198/// - Whitespace (space, tab, newline, etc.)
199/// - ASCII punctuation (.,!?;:, etc.)
200/// - End of string
201///
202/// This prevents deduplication from splitting words or tokens mid-way through,
203/// which could cause incorrect rendering of intentional repetitions.
204///
205/// # Arguments
206/// * `text` - The text to check
207/// * `pos` - The position in the text (byte offset)
208///
209/// # Returns
210/// * `true` - The position is at a safe boundary for deduplication
211/// * `false` - The position is NOT at a safe boundary (mid-word, etc.)
212///
213/// # Examples
214///
215/// ```ignore
216/// // Safe: overlap ends at space
217/// assert!(is_safe_boundary("Hello World", 11)); // After "World"
218///
219/// // Safe: overlap ends at punctuation
220/// assert!(is_safe_boundary("Hello, World!", 12)); // After "!"
221///
222/// // Unsafe: overlap ends mid-word
223/// assert!(!is_safe_boundary("HelloWorld", 5)); // After "Hello"
224/// ```
225fn is_safe_boundary(text: &str, pos: usize) -> bool {
226    // End of string is always safe
227    if pos >= text.len() {
228        return true;
229    }
230
231    // Get the character at the boundary position
232    // We need to use character iteration for Unicode safety
233    let char_at_pos = text[pos..].chars().next();
234
235    char_at_pos
236        .is_none_or(|c| c.is_whitespace() || c.is_ascii_punctuation() || c.is_ascii_control())
237}
238
239// ============================================================================
240// Overlap Quality Scoring
241// ============================================================================
242
243/// Score representing the "strength" of an overlap.
244///
245/// This struct captures multiple metrics about an overlap to determine
246/// if it's strong enough to warrant deduplication.
247#[derive(Debug, Clone, Copy, PartialEq, Eq)]
248pub struct OverlapScore {
249    /// Character count of the overlap
250    pub char_count: usize,
251    /// Whether the overlap meets the minimum ratio threshold
252    pub ratio_met: bool,
253    /// Whether the overlap ends at a safe boundary
254    pub is_safe_boundary: bool,
255}
256
257impl OverlapScore {
258    /// Check if this overlap meets all thresholds for deduplication.
259    ///
260    /// # Arguments
261    /// * `thresholds` - The overlap thresholds to check against
262    ///
263    /// # Returns
264    /// * `true` - The overlap is strong enough for deduplication
265    /// * `false` - The overlap is too weak
266    #[must_use]
267    pub const fn meets_thresholds(&self, thresholds: &OverlapThresholds) -> bool {
268        self.char_count >= thresholds.min_overlap_chars && self.ratio_met && self.is_safe_boundary
269    }
270
271    /// Check if the delta is short (below short chunk threshold).
272    ///
273    /// # Arguments
274    /// * `delta_len` - The length of the delta
275    /// * `thresholds` - The overlap thresholds
276    ///
277    /// # Returns
278    /// * `true` - The delta is considered short
279    /// * `false` - The delta is normal length
280    #[must_use]
281    #[cfg(test)]
282    pub const fn is_short_delta(delta_len: usize, thresholds: &OverlapThresholds) -> bool {
283        delta_len < thresholds.short_chunk_threshold
284    }
285}
286
287/// Score the quality of an overlap between delta and accumulated content.
288///
289/// This function computes multiple metrics about an overlap to determine
290/// if it's strong enough to warrant deduplication.
291///
292/// # Arguments
293/// * `delta` - The incoming delta
294/// * `accumulated` - The previously accumulated content
295///
296/// # Returns
297/// An `OverlapScore` containing:
298/// - `char_count`: The length of the overlap in characters
299/// - `ratio`: The overlap as a fraction of delta length
300/// - `is_safe_boundary`: Whether the overlap ends at a safe boundary
301///
302/// # Examples
303///
304/// ```ignore
305/// // Strong overlap (30+ chars, 50%+ ratio, safe boundary)
306/// let score = score_overlap("Hello World! More text here", "Hello World!");
307/// assert!(score.char_count >= 30);
308/// assert!(score.ratio_met);
309/// assert!(score.is_safe_boundary);
310/// ```
311fn score_overlap(delta: &str, accumulated: &str) -> OverlapScore {
312    // Check if delta starts with accumulated (snapshot detection)
313    let overlap_len = if delta.starts_with(accumulated) {
314        accumulated.len()
315    } else {
316        0
317    };
318
319    // Calculate ratio as integer to avoid floating point precision issues
320    // We'll compare overlap * 100 >= delta * MIN_OVERLAP_RATIO_INT
321    // This avoids f64 casting entirely
322    let ratio_met = if delta.is_empty() {
323        false
324    } else {
325        // Check if overlap/delta >= MIN_OVERLAP_RATIO without floating point
326        // By cross-multiplying: overlap * 100 >= delta * MIN_OVERLAP_RATIO_INT
327        let overlap_scaled = overlap_len.saturating_mul(100);
328        let threshold = delta.len().saturating_mul(MIN_OVERLAP_RATIO_INT);
329        overlap_scaled >= threshold
330    };
331
332    // Check if the accumulated string ends at a safe boundary
333    // This is important because we don't want to dedupe if the accumulated
334    // string ends mid-word (e.g., accumulated="Hello" and delta="HelloWorld")
335    let is_safe_boundary = if overlap_len > 0 {
336        // Check if the last character of accumulated is a safe boundary
337        is_safe_boundary(accumulated, accumulated.len())
338    } else {
339        false
340    };
341
342    OverlapScore {
343        char_count: overlap_len,
344        ratio_met,
345        is_safe_boundary,
346    }
347}
348
349/// Rolling hash window for fast substring detection.
350///
351/// Maintains rolling hash values over accumulated content using the Rabin-Karp
352/// algorithm. Provides O(1) hash computation for sliding windows.
353///
354/// # Algorithm
355///
356/// Uses polynomial rolling hash with base 256 (byte values) and modulus
357/// from a large prime to minimize collisions.
358///
359/// Hash computation:
360/// ```text
361/// hash(s[0..n]) = (s[0] * b^(n-1) + s[1] * b^(n-2) + ... + s[n-1]) mod m
362/// ```
363///
364/// Rolling update:
365/// ```text
366/// hash(s[i+1..i+n+1]) = ((hash(s[i..i+n]) - s[i] * b^(n-1)) * b + s[i+n]) mod m
367/// ```
368///
369/// # Example
370///
371/// ```ignore
372/// let mut window = RollingHashWindow::new();
373/// window.add_content("Hello World");
374///
375/// // Check if "World" exists in the accumulated content
376/// let hashes = window.get_window_hashes(5); // 5 = length of "World"
377/// let world_hash = RollingHashWindow::compute_hash("World");
378///
379/// if hashes.contains(&world_hash) {
380///     // Potential match - verify with KMP
381/// }
382/// ```
383#[derive(Debug, Default, Clone)]
384pub struct RollingHashWindow {
385    /// Accumulated content for hash computation
386    content: String,
387    /// Cached hash values for different window sizes
388    /// Maps `window_size` -> Vec<(position, hash)>
389    cached_hashes: HashMap<usize, Vec<(usize, u64)>>,
390}
391
392impl RollingHashWindow {
393    /// Base for polynomial rolling hash (256 for byte values)
394    const BASE: u64 = 256;
395    /// Modulus for hash computation (large prime to minimize collisions)
396    const MODULUS: u64 = 2_147_483_647; // 2^31 - 1 (Mersenne prime)
397
398    /// Create a new rolling hash window.
399    #[cfg(test)]
400    pub fn new() -> Self {
401        Self::default()
402    }
403
404    /// Compute rolling hash of a string slice.
405    ///
406    /// Uses polynomial rolling hash with base 256 and a large prime modulus.
407    ///
408    /// # Arguments
409    /// * `text` - The text to hash
410    ///
411    /// # Returns
412    /// The hash value as a u64
413    ///
414    /// # Example
415    /// ```ignore
416    /// let hash = RollingHashWindow::compute_hash("Hello");
417    /// ```
418    pub fn compute_hash(text: &str) -> u64 {
419        let mut hash: u64 = 0;
420        for byte in text.bytes() {
421            hash = (hash * Self::BASE + u64::from(byte)) % Self::MODULUS;
422        }
423        hash
424    }
425
426    /// Compute base^(n-1) mod MODULUS for rolling hash updates.
427    ///
428    /// This is used to efficiently remove the leftmost character when
429    /// sliding the window.
430    #[cfg(test)]
431    fn compute_power(power: usize) -> u64 {
432        let mut result = 1u64;
433        for _ in 0..power {
434            result = (result * Self::BASE) % Self::MODULUS;
435        }
436        result
437    }
438
439    /// Add content to the window and update cached hashes.
440    ///
441    /// This appends new content and recomputes hash values for all
442    /// cached window sizes.
443    ///
444    /// # Arguments
445    /// * `text` - The new content to add
446    #[cfg(test)]
447    pub fn add_content(&mut self, text: &str) {
448        if text.is_empty() {
449            return;
450        }
451
452        let _start_pos = self.content.len();
453        self.content.push_str(text);
454
455        // Invalidate and recompute all cached hashes
456        self.cached_hashes.clear();
457    }
458
459    /// Get all window hashes of a specific size.
460    ///
461    /// Computes rolling hash values for all windows of the given size
462    /// in the accumulated content.
463    ///
464    /// # Arguments
465    /// * `window_size` - The size of each window
466    ///
467    /// # Returns
468    /// A vector of (position, hash) tuples for each window
469    ///
470    /// # Example
471    /// ```ignore
472    /// // Get hashes for all 5-character windows
473    /// let hashes = window.get_window_hashes(5);
474    /// ```
475    #[cfg(test)]
476    pub fn get_window_hashes(&mut self, window_size: usize) -> Vec<(usize, u64)> {
477        // Return cached copy if available
478        if let Some(hashes) = self.cached_hashes.get(&window_size) {
479            return hashes.clone();
480        }
481
482        // Compute hashes for this window size
483        let content_bytes = self.content.as_bytes();
484        let content_len = content_bytes.len();
485
486        if content_len < window_size {
487            // Not enough content for even one window
488            let empty: Vec<(usize, u64)> = Vec::new();
489            self.cached_hashes.insert(window_size, empty.clone());
490            return empty;
491        }
492
493        let mut hashes = Vec::new();
494        let mut hash: u64 = 0;
495
496        // Compute initial window hash
497        for byte in content_bytes.iter().take(window_size) {
498            hash = (hash * Self::BASE + u64::from(*byte)) % Self::MODULUS;
499        }
500        hashes.push((0, hash));
501
502        // Precompute base^(window_size-1) mod MODULUS for rolling updates
503        let power = Self::compute_power(window_size - 1);
504
505        // Slide window and compute rolling hashes
506        for i in 1..=(content_len - window_size) {
507            // Remove leftmost character contribution
508            let leftmost = u64::from(content_bytes[i - 1]);
509            let removed = (leftmost * power) % Self::MODULUS;
510            hash = (hash + Self::MODULUS - removed) % Self::MODULUS;
511
512            // Shift and add new character
513            hash = (hash * Self::BASE) % Self::MODULUS;
514            let new_char = u64::from(content_bytes[i + window_size - 1]);
515            hash = (hash + new_char) % Self::MODULUS;
516
517            hashes.push((i, hash));
518        }
519
520        // Cache for future use
521        self.cached_hashes.insert(window_size, hashes.clone());
522        hashes
523    }
524
525    /// Check if a hash exists in any window of the given size.
526    ///
527    /// This is a fast O(1) check after hashes have been computed.
528    ///
529    /// # Arguments
530    /// * `hash` - The hash value to search for
531    /// * `window_size` - The window size to check
532    ///
533    /// # Returns
534    /// * `Some(position)` - The position where the hash was found
535    /// * `None` - Hash not found
536    #[cfg(test)]
537    pub fn contains_hash(&mut self, hash: u64, window_size: usize) -> Option<usize> {
538        let hashes = self.get_window_hashes(window_size);
539        hashes
540            .into_iter()
541            .find(|(_, h)| *h == hash)
542            .map(|(pos, _)| pos)
543    }
544
545    /// Clear all content and cached hashes.
546    pub fn clear(&mut self) {
547        self.content.clear();
548        self.cached_hashes.clear();
549    }
550
551    /// Get the current content length.
552    #[cfg(test)]
553    pub const fn len(&self) -> usize {
554        self.content.len()
555    }
556
557    /// Check if the window is empty.
558    #[cfg(test)]
559    pub const fn is_empty(&self) -> bool {
560        self.content.is_empty()
561    }
562}
563
564/// KMP (Knuth-Morris-Pratt) matcher for exact substring matching.
565///
566/// Provides linear-time substring search by precomputing a failure function
567/// that allows skipping already-matched characters.
568///
569/// # Algorithm
570///
571/// The failure function (also called "prefix function" or "lps array") stores
572/// the length of the longest proper prefix of the pattern that is also a suffix
573/// for each position in the pattern.
574///
575/// # Example
576///
577/// ```ignore
578/// let mut kmp = KMPMatcher::new("abc");
579/// let text = "xyzabcuvw";
580///
581/// if let Some(pos) = kmp.find(text) {
582///     // Found "abc" at position 3
583/// }
584/// ```
585#[derive(Debug, Clone)]
586#[cfg(test)]
587pub struct KMPMatcher {
588    /// The pattern to search for
589    pattern: String,
590    /// Failure function (longest proper prefix which is also suffix)
591    failure: Vec<usize>,
592}
593
594#[cfg(test)]
595impl KMPMatcher {
596    /// Create a new KMP matcher for the given pattern.
597    ///
598    /// Precomputes the failure function for efficient searching.
599    ///
600    /// # Arguments
601    /// * `pattern` - The pattern to search for
602    ///
603    /// # Example
604    /// ```ignore
605    /// let matcher = KMPMatcher::new("hello");
606    /// ```
607    pub fn new(pattern: &str) -> Self {
608        let pattern = pattern.to_string();
609        let failure = Self::compute_failure(&pattern);
610        Self { pattern, failure }
611    }
612
613    /// Compute the failure function for KMP.
614    ///
615    /// The failure function `lps[i]` stores the length of the longest proper
616    /// prefix of `pattern[0..=i]` that is also a suffix.
617    ///
618    /// # Example
619    ///
620    /// For pattern "abab":
621    /// - lps[0] = 0 (no proper prefix for single char)
622    /// - lps[1] = 0 ("ab" has no matching prefix/suffix)
623    /// - lps[2] = 1 ("aba" has "a" as prefix and suffix)
624    /// - lps[3] = 2 ("abab" has "ab" as prefix and suffix)
625    fn compute_failure(pattern: &str) -> Vec<usize> {
626        let m = pattern.len();
627        if m == 0 {
628            return Vec::new();
629        }
630
631        let mut lps = vec![0; m];
632        let mut len = 0; // Length of the previous longest prefix suffix
633        let mut i = 1;
634
635        let pattern_bytes = pattern.as_bytes();
636
637        while i < m {
638            if pattern_bytes[i] == pattern_bytes[len] {
639                len += 1;
640                lps[i] = len;
641                i += 1;
642            } else if len != 0 {
643                // Fall back to previous longest prefix suffix
644                len = lps[len - 1];
645                // Note: we don't increment i here
646            } else {
647                // No proper prefix suffix found
648                lps[i] = 0;
649                i += 1;
650            }
651        }
652
653        lps
654    }
655
656    /// Find the pattern in text, returning the first position.
657    ///
658    /// Uses the precomputed failure function for O(n) search time.
659    ///
660    /// # Arguments
661    /// * `text` - The text to search in
662    ///
663    /// # Returns
664    /// * `Some(position)` - The starting position of the pattern
665    /// * `None` - Pattern not found
666    ///
667    /// # Example
668    /// ```ignore
669    /// let matcher = KMPMatcher::new("world");
670    /// assert_eq!(matcher.find("hello world"), Some(6));
671    /// assert_eq!(matcher.find("hello"), None);
672    /// ```
673    pub fn find(&self, text: &str) -> Option<usize> {
674        let n = text.len();
675        let m = self.pattern.len();
676
677        if m == 0 || n < m {
678            return None;
679        }
680
681        let text_bytes = text.as_bytes();
682        let pattern_bytes = self.pattern.as_bytes();
683
684        let mut i = 0; // Index for text
685        let mut j = 0; // Index for pattern
686
687        while i < n {
688            if pattern_bytes[j] == text_bytes[i] {
689                i += 1;
690                j += 1;
691
692                if j == m {
693                    // Found complete pattern match
694                    return Some(i - j);
695                }
696            } else if j != 0 {
697                // Use failure function to skip ahead
698                j = self.failure[j - 1];
699            } else {
700                // No match at all, move to next character
701                i += 1;
702            }
703        }
704
705        None
706    }
707
708    /// Find all occurrences of the pattern in text.
709    ///
710    /// Returns all positions where the pattern appears in the text.
711    ///
712    /// # Arguments
713    /// * `text` - The text to search in
714    ///
715    /// # Returns
716    /// A vector of positions where the pattern was found
717    ///
718    /// # Example
719    /// ```ignore
720    /// let matcher = KMPMatcher::new("ab");
721    /// let positions = matcher.find_all("ababab");
722    /// assert_eq!(positions, vec![0, 2, 4]);
723    /// ```
724    #[cfg(test)]
725    pub fn find_all(&self, text: &str) -> Vec<usize> {
726        let mut positions = Vec::new();
727        let n = text.len();
728        let m = self.pattern.len();
729
730        if m == 0 || n < m {
731            return positions;
732        }
733
734        let text_bytes = text.as_bytes();
735        let pattern_bytes = self.pattern.as_bytes();
736
737        let mut i = 0; // Index for text
738        let mut j = 0; // Index for pattern
739
740        while i < n {
741            if pattern_bytes[j] == text_bytes[i] {
742                i += 1;
743                j += 1;
744
745                if j == m {
746                    // Found complete pattern match
747                    positions.push(i - j);
748                    j = self.failure[j - 1];
749                }
750            } else if j != 0 {
751                j = self.failure[j - 1];
752            } else {
753                i += 1;
754            }
755        }
756
757        positions
758    }
759
760    /// Get the pattern length.
761    #[cfg(test)]
762    pub const fn pattern_len(&self) -> usize {
763        self.pattern.len()
764    }
765
766    /// Check if the pattern is empty.
767    #[cfg(test)]
768    pub const fn is_empty(&self) -> bool {
769        self.pattern.is_empty()
770    }
771}
772
773/// Delta deduplicator using rolling hash and KMP.
774///
775/// Orchestrates the two-phase deduplication approach:
776/// 1. Rolling hash for fast filtering
777/// 2. KMP for exact verification
778///
779/// # Example
780///
781/// ```ignore
782/// let mut dedup = DeltaDeduplicator::new();
783///
784/// // Add accumulated content
785/// dedup.add_accumulated("Hello World");
786///
787/// // Check if a delta is a duplicate
788/// if let Some(new_portion) = dedup.extract_new_content("Hello World!") {
789///     // "!" is the new portion
790///     assert_eq!(new_portion, "!");
791/// }
792/// ```
793#[derive(Debug, Default, Clone)]
794pub struct DeltaDeduplicator {
795    /// Rolling hash window for accumulated content
796    hash_window: RollingHashWindow,
797}
798
799impl DeltaDeduplicator {
800    /// Create a new delta deduplicator.
801    #[cfg(test)]
802    pub fn new() -> Self {
803        Self::default()
804    }
805
806    /// Add accumulated content for deduplication tracking.
807    ///
808    /// # Arguments
809    /// * `content` - The accumulated content to track
810    #[cfg(test)]
811    pub fn add_accumulated(&mut self, content: &str) {
812        self.hash_window.add_content(content);
813    }
814
815    /// Extract new content from a potential snapshot.
816    ///
817    /// Uses rolling hash and KMP to detect if the incoming delta contains
818    /// previously accumulated content (a snapshot) and extracts only the
819    /// new portion.
820    ///
821    /// # Two-Phase Algorithm
822    ///
823    /// 1. **Rolling Hash Filter**: Compute hash of delta and check if it exists
824    ///    in any window of the accumulated content. O(n) time.
825    ///
826    /// 2. **KMP Verification**: If hash matches, use KMP to verify actual
827    ///    substring match and find the exact position. O(n+m) time.
828    ///
829    /// # Arguments
830    /// * `delta` - The incoming delta to check
831    /// * `accumulated` - The previously accumulated content
832    ///
833    /// # Returns
834    /// * `Some(new_portion)` - The delta starts with accumulated, returns new portion only
835    /// * `None` - The delta is genuinely new, doesn't start with accumulated
836    ///
837    /// # Example
838    /// ```ignore
839    /// let mut dedup = DeltaDeduplicator::new();
840    /// dedup.add_accumulated("Hello");
841    ///
842    /// // Snapshot: "Hello World"
843    /// assert_eq!(
844    ///     DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
845    ///     Some(" World")
846    /// );
847    ///
848    /// // Genuine delta: " World"
849    /// assert_eq!(
850    ///     DeltaDeduplicator::extract_new_content(" World", "Hello"),
851    ///     None
852    /// );
853    /// ```
854    #[cfg(test)]
855    pub fn extract_new_content<'a>(delta: &'a str, accumulated: &str) -> Option<&'a str> {
856        // Handle identical content (delta == accumulated) - return empty string
857        if delta == accumulated {
858            return Some("");
859        }
860
861        // Fast rejection: delta must be longer than accumulated
862        if delta.len() <= accumulated.len() {
863            return None;
864        }
865
866        // Phase 1: Rolling hash check
867        let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
868        let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
869
870        // If hashes don't match, definitely not a snapshot
871        if accumulated_hash != delta_prefix_hash {
872            return None;
873        }
874
875        // Phase 2: KMP verification for exact match
876        let kmp = KMPMatcher::new(accumulated);
877        if let Some(pos) = kmp.find(delta) {
878            // Verify it's at position 0 (snapshot starts with accumulated)
879            if pos == 0 {
880                return Some(&delta[accumulated.len()..]);
881            }
882        }
883
884        // Hash collision or match not at start - not a snapshot
885        None
886    }
887
888    /// Check if delta is likely a snapshot using rolling hash only.
889    ///
890    /// This is a faster O(n) check that may have false positives due to
891    /// hash collisions. Use `extract_new_content` for verified results.
892    ///
893    /// # Arguments
894    /// * `delta` - The incoming delta to check
895    /// * `accumulated` - The previously accumulated content
896    ///
897    /// # Returns
898    /// * `true` - Delta may be a snapshot (hash matches)
899    /// * `false` - Delta is definitely not a snapshot (hash doesn't match)
900    pub fn is_likely_snapshot(delta: &str, accumulated: &str) -> bool {
901        // Handle identical content (duplicate delta)
902        if delta == accumulated {
903            return true;
904        }
905
906        // Delta must be longer than accumulated to be a snapshot
907        if delta.len() <= accumulated.len() {
908            return false;
909        }
910
911        let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
912        let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
913
914        accumulated_hash == delta_prefix_hash
915    }
916
917    /// Check if delta is likely a snapshot with strong overlap detection.
918    ///
919    /// This is an enhanced version of `is_likely_snapshot` that applies
920    /// strong overlap thresholds to prevent false positives on intentional
921    /// repetitions.
922    ///
923    /// # Strong Overlap Detection
924    ///
925    /// This method only returns `true` when:
926    /// - The overlap meets minimum character count threshold (default: 30 chars)
927    /// - The overlap meets minimum ratio threshold (default: 50% of delta)
928    /// - The overlap ends at a safe boundary (whitespace/punctuation/newline)
929    /// - Short chunks (< 20 chars) are only deduped if exact match
930    ///
931    /// # Arguments
932    /// * `delta` - The incoming delta to check
933    /// * `accumulated` - The previously accumulated content
934    ///
935    /// # Returns
936    /// * `true` - Delta is a snapshot meeting strong overlap criteria
937    /// * `false` - Delta is either genuine or overlap is too weak
938    pub fn is_likely_snapshot_with_thresholds(delta: &str, accumulated: &str) -> bool {
939        let thresholds = get_overlap_thresholds();
940
941        // Handle short chunks: only dedupe if exact match
942        if delta.len() < thresholds.short_chunk_threshold {
943            return delta == accumulated;
944        }
945
946        // Handle identical content (delta == accumulated)
947        // This is a snapshot where no new content is added
948        if delta == accumulated {
949            return true;
950        }
951
952        // Fast rejection: delta must be longer than accumulated
953        if delta.len() <= accumulated.len() {
954            return false;
955        }
956
957        // First check with basic rolling hash for quick rejection
958        if !Self::is_likely_snapshot(delta, accumulated) {
959            return false;
960        }
961
962        // Score the overlap to check if it meets strong overlap criteria
963        let score = score_overlap(delta, accumulated);
964
965        // Apply threshold checks
966        score.meets_thresholds(&thresholds)
967    }
968
969    /// Extract new content from a snapshot with strong overlap detection.
970    ///
971    /// This is an enhanced version of `extract_new_content` that only extracts
972    /// new content when the overlap meets strong overlap thresholds.
973    ///
974    /// # Arguments
975    /// * `delta` - The incoming delta to check
976    /// * `accumulated` - The previously accumulated content
977    ///
978    /// # Returns
979    /// * `Some(new_portion)` - The overlap meets thresholds, returns new portion
980    /// * `None` - The overlap is too weak or not a snapshot
981    pub fn extract_new_content_with_thresholds<'a>(
982        delta: &'a str,
983        accumulated: &str,
984    ) -> Option<&'a str> {
985        let thresholds = get_overlap_thresholds();
986
987        // Handle short chunks: only dedupe if exact match
988        if delta.len() < thresholds.short_chunk_threshold {
989            if delta == accumulated {
990                return Some("");
991            }
992            return None;
993        }
994
995        // Handle identical content
996        if delta == accumulated {
997            return Some("");
998        }
999
1000        // Fast rejection: delta must be longer than accumulated
1001        if delta.len() <= accumulated.len() {
1002            return None;
1003        }
1004
1005        // Score the overlap
1006        let score = score_overlap(delta, accumulated);
1007
1008        // Check if overlap meets thresholds
1009        if !score.meets_thresholds(&thresholds) {
1010            return None;
1011        }
1012
1013        // Extract new content using the overlap length from the score
1014        if score.char_count > 0 && delta.len() > score.char_count {
1015            Some(&delta[score.char_count..])
1016        } else {
1017            None
1018        }
1019    }
1020
1021    /// Clear all tracked content.
1022    pub fn clear(&mut self) {
1023        self.hash_window.clear();
1024    }
1025}
1026
1027#[cfg(test)]
1028mod tests {
1029    use super::*;
1030
1031    // Tests for RollingHashWindow
1032
1033    #[test]
1034    fn test_rolling_hash_compute_hash() {
1035        let hash1 = RollingHashWindow::compute_hash("Hello");
1036        let hash2 = RollingHashWindow::compute_hash("Hello");
1037        let hash3 = RollingHashWindow::compute_hash("World");
1038
1039        // Same input produces same hash
1040        assert_eq!(hash1, hash2);
1041        // Different input likely produces different hash
1042        assert_ne!(hash1, hash3);
1043    }
1044
1045    #[test]
1046    fn test_rolling_hash_add_content() {
1047        let mut window = RollingHashWindow::new();
1048        assert!(window.is_empty());
1049
1050        window.add_content("Hello");
1051        assert_eq!(window.len(), 5);
1052        assert!(!window.is_empty());
1053    }
1054
1055    #[test]
1056    fn test_rolling_hash_get_window_hashes() {
1057        let mut window = RollingHashWindow::new();
1058        window.add_content("HelloWorld");
1059
1060        // Get hashes for 5-character windows
1061        let hashes = window.get_window_hashes(5);
1062        assert_eq!(hashes.len(), 6); // "Hello", "elloW", "lloWo", "loWor", "oWorl", "World"
1063
1064        // Verify positions
1065        assert_eq!(hashes[0].0, 0); // First window starts at 0
1066        assert_eq!(hashes[5].0, 5); // Last window starts at 5
1067    }
1068
1069    #[test]
1070    fn test_rolling_hash_contains_hash() {
1071        let mut window = RollingHashWindow::new();
1072        window.add_content("HelloWorld");
1073
1074        let world_hash = RollingHashWindow::compute_hash("World");
1075        let xyz_hash = RollingHashWindow::compute_hash("XYZ");
1076
1077        // "World" exists in the content
1078        assert!(window.contains_hash(world_hash, 5).is_some());
1079        // "XYZ" doesn't exist
1080        assert!(window.contains_hash(xyz_hash, 3).is_none());
1081    }
1082
1083    #[test]
1084    fn test_rolling_hash_clear() {
1085        let mut window = RollingHashWindow::new();
1086        window.add_content("Hello");
1087        assert_eq!(window.len(), 5);
1088
1089        window.clear();
1090        assert!(window.is_empty());
1091    }
1092
1093    // Tests for KMPMatcher
1094
1095    #[test]
1096    fn test_kmp_find_pattern_exists() {
1097        let kmp = KMPMatcher::new("World");
1098        assert_eq!(kmp.find("Hello World"), Some(6));
1099        assert_eq!(kmp.find("WorldHello"), Some(0));
1100    }
1101
1102    #[test]
1103    fn test_kmp_find_pattern_not_exists() {
1104        let kmp = KMPMatcher::new("XYZ");
1105        assert_eq!(kmp.find("Hello World"), None);
1106    }
1107
1108    #[test]
1109    fn test_kmp_find_pattern_empty() {
1110        let kmp = KMPMatcher::new("");
1111        assert_eq!(kmp.find("Hello"), None);
1112    }
1113
1114    #[test]
1115    fn test_kmp_find_text_shorter_than_pattern() {
1116        let kmp = KMPMatcher::new("Hello World");
1117        assert_eq!(kmp.find("Hello"), None);
1118    }
1119
1120    #[test]
1121    fn test_kmp_find_all() {
1122        let kmp = KMPMatcher::new("ab");
1123        let positions = kmp.find_all("ababab");
1124        assert_eq!(positions, vec![0, 2, 4]);
1125    }
1126
1127    #[test]
1128    fn test_kmp_find_all_no_matches() {
1129        let kmp = KMPMatcher::new("xyz");
1130        let positions = kmp.find_all("abcabc");
1131        assert!(positions.is_empty());
1132    }
1133
1134    #[test]
1135    fn test_kmp_find_overlapping_patterns() {
1136        let kmp = KMPMatcher::new("aa");
1137        let positions = kmp.find_all("aaa");
1138        assert_eq!(positions, vec![0, 1]);
1139    }
1140
1141    #[test]
1142    fn test_kmp_failure_function() {
1143        let kmp = KMPMatcher::new("abab");
1144        // lps = [0, 0, 1, 2]
1145        assert_eq!(kmp.failure, vec![0, 0, 1, 2]);
1146    }
1147
1148    #[test]
1149    fn test_kmp_pattern_len() {
1150        let kmp = KMPMatcher::new("Hello");
1151        assert_eq!(kmp.pattern_len(), 5);
1152    }
1153
1154    #[test]
1155    fn test_kmp_is_empty() {
1156        let kmp = KMPMatcher::new("");
1157        assert!(kmp.is_empty());
1158
1159        let kmp = KMPMatcher::new("Hello");
1160        assert!(!kmp.is_empty());
1161    }
1162
1163    // Tests for DeltaDeduplicator
1164
1165    #[test]
1166    fn test_dedup_extract_new_content_snapshot() {
1167        let mut dedup = DeltaDeduplicator::new();
1168        dedup.add_accumulated("Hello");
1169
1170        // Snapshot: "Hello World"
1171        assert_eq!(
1172            DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
1173            Some(" World")
1174        );
1175    }
1176
1177    #[test]
1178    fn test_dedup_extract_new_content_genuine_delta() {
1179        let mut dedup = DeltaDeduplicator::new();
1180        dedup.add_accumulated("Hello");
1181
1182        // Genuine delta: " World"
1183        assert_eq!(
1184            DeltaDeduplicator::extract_new_content(" World", "Hello"),
1185            None
1186        );
1187    }
1188
1189    #[test]
1190    fn test_dedup_extract_new_content_shorter_delta() {
1191        let mut dedup = DeltaDeduplicator::new();
1192        dedup.add_accumulated("Hello World");
1193
1194        // Delta shorter than accumulated - can't be snapshot
1195        assert_eq!(
1196            DeltaDeduplicator::extract_new_content("Hello", "Hello World"),
1197            None
1198        );
1199    }
1200
1201    #[test]
1202    fn test_dedup_extract_new_content_equal_length() {
1203        let mut dedup = DeltaDeduplicator::new();
1204        dedup.add_accumulated("Hello");
1205
1206        // Equal length - if identical, return empty string (no new content)
1207        assert_eq!(
1208            DeltaDeduplicator::extract_new_content("Hello", "Hello"),
1209            Some("")
1210        );
1211
1212        // Equal length but different content - not a snapshot
1213        assert_eq!(
1214            DeltaDeduplicator::extract_new_content("World", "Hello"),
1215            None
1216        );
1217    }
1218
1219    #[test]
1220    fn test_dedup_extract_new_content_no_match() {
1221        let mut dedup = DeltaDeduplicator::new();
1222        dedup.add_accumulated("Hello");
1223
1224        // Different content entirely
1225        assert_eq!(
1226            DeltaDeduplicator::extract_new_content("World", "Hello"),
1227            None
1228        );
1229    }
1230
1231    #[test]
1232    fn test_dedup_is_likely_snapshot() {
1233        let mut dedup = DeltaDeduplicator::new();
1234        dedup.add_accumulated("Hello");
1235
1236        // Actual snapshot
1237        assert!(DeltaDeduplicator::is_likely_snapshot(
1238            "Hello World",
1239            "Hello"
1240        ));
1241
1242        // Not a snapshot
1243        assert!(!DeltaDeduplicator::is_likely_snapshot(" World", "Hello"));
1244    }
1245
1246    #[test]
1247    fn test_dedup_clear() {
1248        let mut dedup = DeltaDeduplicator::new();
1249        dedup.add_accumulated("Hello");
1250        assert!(!dedup.hash_window.is_empty());
1251
1252        dedup.clear();
1253        assert!(dedup.hash_window.is_empty());
1254    }
1255
1256    // Integration tests
1257
1258    #[test]
1259    fn test_dedup_two_phase_algorithm() {
1260        let mut dedup = DeltaDeduplicator::new();
1261        dedup.add_accumulated("The quick brown fox");
1262
1263        // Phase 1: Rolling hash will match
1264        assert!(DeltaDeduplicator::is_likely_snapshot(
1265            "The quick brown fox jumps",
1266            "The quick brown fox"
1267        ));
1268
1269        // Phase 2: KMP verification confirms and extracts new portion
1270        assert_eq!(
1271            DeltaDeduplicator::extract_new_content(
1272                "The quick brown fox jumps",
1273                "The quick brown fox"
1274            ),
1275            Some(" jumps")
1276        );
1277    }
1278
1279    #[test]
1280    fn test_dedup_handles_unicode() {
1281        let mut dedup = DeltaDeduplicator::new();
1282        dedup.add_accumulated("Hello 世界");
1283
1284        // Should handle UTF-8 correctly
1285        assert_eq!(
1286            DeltaDeduplicator::extract_new_content("Hello 世界!", "Hello 世界"),
1287            Some("!")
1288        );
1289    }
1290
1291    #[test]
1292    fn test_dedup_empty_accumulated() {
1293        // No accumulated content
1294
1295        // Any delta is genuine
1296        assert_eq!(DeltaDeduplicator::extract_new_content("Hello", ""), None);
1297    }
1298
1299    // Tests for Strong Overlap Detection with Thresholds
1300
1301    #[test]
1302    fn test_strong_overlap_meets_char_threshold() {
1303        // Overlap of 30+ chars with safe boundary should pass
1304        let accumulated = "The quick brown fox jumps over the lazy";
1305        let delta = "The quick brown fox jumps over the lazy dog!";
1306
1307        assert!(
1308            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1309            "Should detect snapshot with 30+ char overlap"
1310        );
1311    }
1312
1313    #[test]
1314    fn test_strong_overlap_meets_ratio_threshold() {
1315        // Overlap is 50%+ of delta length
1316        let accumulated = "The quick brown fox jumps over the lazy dog. ";
1317        let delta = "The quick brown fox jumps over the lazy dog. And more!";
1318
1319        assert!(
1320            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1321            "Should detect snapshot when overlap is 50%+ of delta"
1322        );
1323    }
1324
1325    #[test]
1326    fn test_strong_overlap_fails_char_threshold() {
1327        // Overlap < 30 chars, even if ratio is good
1328        let accumulated = "Hello";
1329        let delta = "Hello World!";
1330
1331        assert!(
1332            !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1333            "Should NOT detect snapshot when overlap < 30 chars"
1334        );
1335    }
1336
1337    #[test]
1338    fn test_strong_overlap_fails_ratio_threshold() {
1339        // Overlap < 50% of delta, even if char count is good
1340        let accumulated = "The quick brown fox jumps over the lazy dog. ";
1341        let delta = "The quick brown fox jumps over the lazy dog. And then a whole lot more text follows to make the ratio low!";
1342
1343        assert!(
1344            !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1345            "Should NOT detect snapshot when overlap < 50% of delta"
1346        );
1347    }
1348
1349    #[test]
1350    fn test_boundary_check_whitespace() {
1351        // Overlap ends at space (safe boundary)
1352        let accumulated = "The quick brown fox jumps over the lazy dog and ";
1353        let delta = "The quick brown fox jumps over the lazy dog and then more!";
1354
1355        assert!(
1356            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1357            "Should detect snapshot when overlap ends at whitespace"
1358        );
1359    }
1360
1361    #[test]
1362    fn test_boundary_check_punctuation() {
1363        // Overlap ends at punctuation (safe boundary)
1364        let accumulated = "The quick brown fox jumps over the lazy dog.";
1365        let delta = "The quick brown fox jumps over the lazy dog. How are you?";
1366
1367        assert!(
1368            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1369            "Should detect snapshot when overlap ends at punctuation"
1370        );
1371    }
1372
1373    #[test]
1374    fn test_boundary_check_mid_word_fails() {
1375        // Overlap ends mid-word (unsafe boundary)
1376        let accumulated = "Hello";
1377        let delta = "HelloWorld! This is a lot of text to ensure we have enough characters.";
1378
1379        // Even though we have 30+ chars, the boundary check should fail
1380        // because the overlap ends mid-word (at 'W' of "World")
1381        assert!(
1382            !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1383            "Should NOT detect snapshot when overlap ends mid-word"
1384        );
1385    }
1386
1387    #[test]
1388    fn test_short_chunk_never_deduped() {
1389        // Short chunks (< 20 chars) never deduped unless exact match
1390        let accumulated = "Hello";
1391        let delta = "Hello World!";
1392
1393        // Even though "Hello World!" starts with "Hello", it's < 20 chars total
1394        // and not an exact match, so it should NOT be deduped
1395        assert!(
1396            !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1397            "Short chunks should NOT be deduped unless exact match"
1398        );
1399    }
1400
1401    #[test]
1402    fn test_short_chunk_exact_match_deduped() {
1403        // Short chunks (< 20 chars) ARE deduped if exact match
1404        let accumulated = "Hello";
1405        let delta = "Hello";
1406
1407        assert!(
1408            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1409            "Short chunk exact match SHOULD be deduped"
1410        );
1411    }
1412
1413    #[test]
1414    fn test_extract_new_content_with_thresholds_strong_overlap() {
1415        let accumulated = "The quick brown fox jumps over the lazy dog. ";
1416        let delta = "The quick brown fox jumps over the lazy dog. More content here!";
1417
1418        let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1419        assert_eq!(result, Some("More content here!"));
1420    }
1421
1422    #[test]
1423    fn test_extract_new_content_with_thresholds_weak_overlap() {
1424        // Weak overlap (< 30 chars) should return None
1425        let accumulated = "Hello";
1426        let delta = "Hello World! This is more content to exceed thresholds.";
1427
1428        let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1429        assert_eq!(result, None, "Weak overlap should return None");
1430    }
1431
1432    #[test]
1433    fn test_extract_new_content_with_thresholds_short_chunk() {
1434        // Short chunk that's not an exact match should return None
1435        let accumulated = "Hi";
1436        let delta = "Hi there!";
1437
1438        let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1439        assert_eq!(
1440            result, None,
1441            "Short chunk should return None unless exact match"
1442        );
1443    }
1444
1445    #[test]
1446    fn test_extract_new_content_with_thresholds_short_chunk_exact() {
1447        // Short chunk exact match should return empty string
1448        let accumulated = "Hello";
1449        let delta = "Hello";
1450
1451        let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1452        assert_eq!(result, Some(""));
1453    }
1454
1455    #[test]
1456    fn test_strong_overlap_with_unicode() {
1457        // Test with Unicode characters
1458        let accumulated = "Hello 世界! This is a long enough string to meet thresholds. ";
1459        let delta = "Hello 世界! This is a long enough string to meet thresholds. More!";
1460
1461        assert!(
1462            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1463            "Should handle Unicode correctly with strong overlap"
1464        );
1465
1466        let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1467        assert_eq!(result, Some("More!"));
1468    }
1469
1470    #[test]
1471    fn test_intentional_repetition_not_deduped() {
1472        // Simulate intentional repetition (e.g., "Hello World! Hello World!")
1473        // where the overlap is small relative to the total delta
1474        let accumulated = "Hello World!";
1475        let delta = "Hello World! Hello World! This is a lot of additional content to make the overlap ratio low enough that it won't be deduplicated!";
1476
1477        assert!(
1478            !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1479            "Intentional repetition should NOT be deduped when overlap ratio is low"
1480        );
1481    }
1482
1483    #[test]
1484    fn test_snapshot_strong_overlap_deduped() {
1485        // Real snapshot scenario: agent sends full accumulated + new content
1486        let accumulated = "The quick brown fox jumps over the lazy dog. ";
1487        let delta = "The quick brown fox jumps over the lazy dog. And then some more!";
1488
1489        assert!(
1490            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1491            "Actual snapshot SHOULD be detected and deduped"
1492        );
1493
1494        let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1495        assert_eq!(result, Some("And then some more!"));
1496    }
1497
1498    #[test]
1499    fn test_overlap_score_meets_thresholds() {
1500        let thresholds = OverlapThresholds::default();
1501
1502        // Strong overlap: 30+ chars, 50%+ ratio, safe boundary
1503        let score = OverlapScore {
1504            char_count: 50,
1505            ratio_met: true,
1506            is_safe_boundary: true,
1507        };
1508
1509        assert!(score.meets_thresholds(&thresholds));
1510    }
1511
1512    #[test]
1513    fn test_overlap_score_fails_char_count() {
1514        let thresholds = OverlapThresholds::default();
1515
1516        // Char count too low
1517        let score = OverlapScore {
1518            char_count: 20,
1519            ratio_met: true,
1520            is_safe_boundary: true,
1521        };
1522
1523        assert!(!score.meets_thresholds(&thresholds));
1524    }
1525
1526    #[test]
1527    fn test_overlap_score_fails_ratio() {
1528        let thresholds = OverlapThresholds::default();
1529
1530        // Ratio too low (met = false)
1531        let score = OverlapScore {
1532            char_count: 50,
1533            ratio_met: false,
1534            is_safe_boundary: true,
1535        };
1536
1537        assert!(!score.meets_thresholds(&thresholds));
1538    }
1539
1540    #[test]
1541    fn test_overlap_score_fails_boundary() {
1542        let thresholds = OverlapThresholds::default();
1543
1544        // Boundary not safe
1545        let score = OverlapScore {
1546            char_count: 50,
1547            ratio_met: true,
1548            is_safe_boundary: false,
1549        };
1550
1551        assert!(!score.meets_thresholds(&thresholds));
1552    }
1553
1554    #[test]
1555    fn test_is_short_delta() {
1556        let thresholds = OverlapThresholds::default();
1557
1558        assert!(OverlapScore::is_short_delta(10, &thresholds));
1559        assert!(!OverlapScore::is_short_delta(30, &thresholds));
1560    }
1561
1562    #[test]
1563    fn test_is_safe_boundary_whitespace() {
1564        assert!(is_safe_boundary("Hello World", 5));
1565        assert!(is_safe_boundary("Hello\nWorld", 5));
1566        assert!(is_safe_boundary("Hello\tWorld", 5));
1567    }
1568
1569    #[test]
1570    fn test_is_safe_boundary_punctuation() {
1571        assert!(is_safe_boundary("Hello, World!", 12)); // After "!"
1572        assert!(is_safe_boundary("Hello. World", 5)); // After "."
1573        assert!(is_safe_boundary("Hello; World", 5)); // After ";"
1574    }
1575
1576    #[test]
1577    fn test_is_safe_boundary_end_of_string() {
1578        assert!(is_safe_boundary("Hello", 5));
1579        assert!(is_safe_boundary("Hello", 10)); // Beyond length
1580    }
1581
1582    #[test]
1583    fn test_is_safe_boundary_mid_word() {
1584        assert!(!is_safe_boundary("HelloWorld", 5));
1585        assert!(!is_safe_boundary("Testing", 3));
1586    }
1587
1588    #[test]
1589    fn test_score_overlap_with_snapshot() {
1590        let accumulated = "The quick brown fox jumps over the lazy dog. ";
1591        let delta = "The quick brown fox jumps over the lazy dog. And more!";
1592
1593        let score = score_overlap(delta, accumulated);
1594
1595        assert!(score.char_count > 30);
1596        assert!(score.ratio_met);
1597        assert!(score.is_safe_boundary);
1598    }
1599
1600    #[test]
1601    fn test_score_overlap_with_genuine_delta() {
1602        let accumulated = "Hello";
1603        let delta = " World";
1604
1605        let score = score_overlap(delta, accumulated);
1606
1607        assert_eq!(score.char_count, 0);
1608    }
1609
1610    #[test]
1611    fn test_get_overlap_thresholds_default() {
1612        let thresholds = get_overlap_thresholds();
1613
1614        assert_eq!(thresholds.min_overlap_chars, 30);
1615        assert_eq!(thresholds.short_chunk_threshold, 20);
1616        assert_eq!(thresholds.consecutive_duplicate_threshold, 3);
1617    }
1618
1619    #[test]
1620    fn test_consecutive_duplicate_threshold_default() {
1621        let thresholds = OverlapThresholds::default();
1622        assert_eq!(
1623            thresholds.consecutive_duplicate_threshold, DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD,
1624            "Default consecutive_duplicate_threshold should match constant"
1625        );
1626        assert_eq!(
1627            thresholds.consecutive_duplicate_threshold, 3,
1628            "Default consecutive_duplicate_threshold should be 3"
1629        );
1630    }
1631
1632    #[test]
1633    fn test_consecutive_duplicate_threshold_bounds() {
1634        // Test minimum allowed value
1635        let min_env = "2";
1636        std::env::set_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", min_env);
1637        // Note: OnceLock caches the value, so we can't test this in the same process
1638        // This test documents the expected behavior
1639
1640        // Test maximum allowed value
1641        let max_env = "10";
1642        std::env::set_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", max_env);
1643
1644        // Clean up
1645        std::env::remove_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD");
1646
1647        // Verify bounds constants are correct
1648        assert_eq!(
1649            MIN_CONSECUTIVE_DUPLICATE_THRESHOLD, 2,
1650            "Minimum threshold should be 2"
1651        );
1652        assert_eq!(
1653            MAX_CONSECUTIVE_DUPLICATE_THRESHOLD, 10,
1654            "Maximum threshold should be 10"
1655        );
1656    }
1657}
ralph_workflow/json_parser/deduplication.rs

ralph_workflow/json_parser/
deduplication.rs