ralph_workflow/json_parser/
deduplication.rs

1//! Delta deduplication using KMP and Rolling Hash algorithms.
2//!
3//! This module provides efficient deduplication for streaming deltas using:
4//! - **Rolling Hash (Rabin-Karp)**: Fast O(n) filtering to eliminate impossible matches
5//! - **KMP (Knuth-Morris-Pratt)**: O(n+m) verification for exact substring matching
6//! - **Strong Overlap Detection**: Thresholds and boundary checks to prevent false positives
7//!
8//! # Enhanced Deduplication
9//!
10//! The enhanced algorithm uses multiple layers of validation to prevent false positives:
11//!
12//! 1. **Rolling Hash Filter**: Fast O(n) check to eliminate impossible matches
13//! 2. **KMP Verification**: O(n+m) confirmation of actual substring match
14//! 3. **Overlap Threshold**: Only dedupe when overlap >= 30 chars AND >= 50% of delta
15//! 4. **Boundary Sanity**: Ensure overlap ends at whitespace/punctuation/newline
16//! 5. **Short Chunk Protection**: Chunks < 20 chars never deduped unless exact match
17//!
18//! # Architecture
19//!
20//! ```text
21//! Incoming Delta
22//!       │
23//!       ▼
24//! ┌─────────────────────┐
25//! │  Rolling Hash Check │  ◄── Compute hash of delta, compare against
26//! │  (Rabin-Karp)       │      sliding window hashes of accumulated content
27//! └──────────┬──────────┘
28//!            │
29//!     ┌──────┴──────┐
30//!     │ Hash Match? │
31//!     └──────┬──────┘
32//!       No   │   Yes
33//!       │    │
34//!       ▼    ▼
35//!    Accept  ┌─────────────────┐
36//!    Delta   │  KMP Verification│  ◄── Confirm actual substring match
37//!            └────────┬────────┘
38//!                     │
39//!              ┌──────┴──────┐
40//!              │True Match?  │
41//!              └──────┬──────┘
42//!                No   │   Yes
43//!                │    │
44//!                ▼    ▼
45//!             Accept  ┌─────────────────────┐
46//!             Delta   │ Strong Overlap Check│ ◄── >= 30 chars, >= 50%, safe boundary
47//!                     └──────────┬──────────┘
48//!                                │
49//!                         ┌──────┴──────┐
50//!                         │Measures?    │
51//!                         └──────┬──────┘
52//!                           No   │   Yes
53//!                           │    │
54//!                           ▼    ▼
55//!                        Accept  Extract New
56//!                        Delta   Portion Only
57//! ```
58
59use std::collections::HashMap;
60use std::sync::OnceLock;
61
62// ============================================================================
63// Environment Trait for Testability
64// ============================================================================
65
66/// Trait for accessing environment variables.
67///
68/// This trait enables dependency injection for testing without global state pollution.
69pub trait ThresholdEnvironment {
70    /// Get an environment variable by name.
71    fn get_var(&self, name: &str) -> Option<String>;
72}
73
74/// Production implementation that reads from actual environment.
75pub struct RealThresholdEnvironment;
76
77impl ThresholdEnvironment for RealThresholdEnvironment {
78    fn get_var(&self, name: &str) -> Option<String> {
79        std::env::var(name).ok()
80    }
81}
82
83// ============================================================================
84// Configuration Constants for Strong Overlap Detection
85// ============================================================================
86
87/// Default minimum overlap character count for deduplication.
88///
89/// Overlaps must be at least this many characters to be considered for deduplication.
90/// This prevents false positives from short accidental matches (e.g., "the", "and").
91const DEFAULT_MIN_OVERLAP_CHARS: usize = 30;
92
93/// Minimum overlap ratio expressed as integer (50 = 50%).
94/// Used for integer-based ratio comparison to avoid floating point issues.
95const MIN_OVERLAP_RATIO_INT: usize = 50;
96
97/// Default threshold for considering a chunk "short".
98///
99/// Short chunks (< this many chars) are never deduped unless they're exact matches
100/// with the accumulated content. This prevents aggressive deduplication of tokens
101/// like ".", "\n", "Ok" that are legitimately repeated.
102const DEFAULT_SHORT_CHUNK_THRESHOLD: usize = 20;
103
104/// Default threshold for consecutive duplicate detection.
105///
106/// If the exact same chunk arrives this many times in a row, it's treated as a
107/// resend glitch and dropped entirely.
108const DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 3;
109
110/// Minimum allowed value for `MIN_OVERLAP_CHARS`.
111const MIN_MIN_OVERLAP_CHARS: usize = 10;
112
113/// Maximum allowed value for `MIN_OVERLAP_CHARS`.
114const MAX_MIN_OVERLAP_CHARS: usize = 100;
115
116/// Minimum allowed value for `SHORT_CHUNK_THRESHOLD`.
117const MIN_SHORT_CHUNK_THRESHOLD: usize = 5;
118
119/// Maximum allowed value for `SHORT_CHUNK_THRESHOLD`.
120const MAX_SHORT_CHUNK_THRESHOLD: usize = 50;
121
122/// Minimum allowed value for `CONSECUTIVE_DUPLICATE_THRESHOLD`.
123const MIN_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 2;
124
125/// Maximum allowed value for `CONSECUTIVE_DUPLICATE_THRESHOLD`.
126const MAX_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 10;
127
128/// Configuration for strong overlap detection.
129///
130/// This struct holds the tunable thresholds that determine when an overlap
131/// is "strong enough" to warrant deduplication.
132#[derive(Debug, Clone, Copy)]
133pub struct OverlapThresholds {
134    /// Minimum character count for overlap
135    pub min_overlap_chars: usize,
136    /// Threshold below which chunks are considered "short"
137    pub short_chunk_threshold: usize,
138    /// Number of consecutive duplicates before aggressive dedupe
139    pub consecutive_duplicate_threshold: usize,
140}
141
142impl Default for OverlapThresholds {
143    fn default() -> Self {
144        Self {
145            min_overlap_chars: DEFAULT_MIN_OVERLAP_CHARS,
146            short_chunk_threshold: DEFAULT_SHORT_CHUNK_THRESHOLD,
147            consecutive_duplicate_threshold: DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD,
148        }
149    }
150}
151
152/// Testable variant that accepts an environment trait for dependency injection.
153///
154/// This allows tests to mock environment variables without global state pollution.
155///
156/// Reads the following environment variables:
157/// - `RALPH_STREAMING_MIN_OVERLAP_CHARS`: Minimum overlap characters (default: 30, range: 10-100)
158/// - `RALPH_STREAMING_SHORT_CHUNK_THRESHOLD`: Short chunk threshold (default: 20, range: 5-50)
159/// - `RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD`: Consecutive duplicate threshold (default: 3, range: 2-10)
160pub fn get_overlap_thresholds_with_env(env: &dyn ThresholdEnvironment) -> OverlapThresholds {
161    let min_overlap_chars = env
162        .get_var("RALPH_STREAMING_MIN_OVERLAP_CHARS")
163        .and_then(|s| s.parse::<usize>().ok())
164        .and_then(|v| {
165            if (MIN_MIN_OVERLAP_CHARS..=MAX_MIN_OVERLAP_CHARS).contains(&v) {
166                Some(v)
167            } else {
168                None
169            }
170        })
171        .unwrap_or(DEFAULT_MIN_OVERLAP_CHARS);
172
173    let short_chunk_threshold = env
174        .get_var("RALPH_STREAMING_SHORT_CHUNK_THRESHOLD")
175        .and_then(|s| s.parse::<usize>().ok())
176        .and_then(|v| {
177            if (MIN_SHORT_CHUNK_THRESHOLD..=MAX_SHORT_CHUNK_THRESHOLD).contains(&v) {
178                Some(v)
179            } else {
180                None
181            }
182        })
183        .unwrap_or(DEFAULT_SHORT_CHUNK_THRESHOLD);
184
185    let consecutive_duplicate_threshold = env
186        .get_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD")
187        .and_then(|s| s.parse::<usize>().ok())
188        .and_then(|v| {
189            if (MIN_CONSECUTIVE_DUPLICATE_THRESHOLD..=MAX_CONSECUTIVE_DUPLICATE_THRESHOLD)
190                .contains(&v)
191            {
192                Some(v)
193            } else {
194                None
195            }
196        })
197        .unwrap_or(DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD);
198
199    OverlapThresholds {
200        min_overlap_chars,
201        short_chunk_threshold,
202        consecutive_duplicate_threshold,
203    }
204}
205
206pub fn get_overlap_thresholds() -> OverlapThresholds {
207    static THRESHOLDS: OnceLock<OverlapThresholds> = OnceLock::new();
208    *THRESHOLDS.get_or_init(|| get_overlap_thresholds_with_env(&RealThresholdEnvironment))
209}
210
211// ============================================================================
212// Boundary Detection
213// ============================================================================
214
215/// Check if a character position is at a safe boundary for deduplication.
216///
217/// A "safe boundary" is where the overlap ends at a natural break point in text:
218/// - Whitespace (space, tab, newline, etc.)
219/// - ASCII punctuation (.,!?;:, etc.)
220/// - End of string
221///
222/// This prevents deduplication from splitting words or tokens mid-way through,
223/// which could cause incorrect rendering of intentional repetitions.
224///
225/// # Arguments
226/// * `text` - The text to check
227/// * `pos` - The position in the text (byte offset)
228///
229/// # Returns
230/// * `true` - The position is at a safe boundary for deduplication
231/// * `false` - The position is NOT at a safe boundary (mid-word, etc.)
232///
233/// # Examples
234///
235/// ```ignore
236/// // Safe: overlap ends at space
237/// assert!(is_safe_boundary("Hello World", 11)); // After "World"
238///
239/// // Safe: overlap ends at punctuation
240/// assert!(is_safe_boundary("Hello, World!", 12)); // After "!"
241///
242/// // Unsafe: overlap ends mid-word
243/// assert!(!is_safe_boundary("HelloWorld", 5)); // After "Hello"
244/// ```
245fn is_safe_boundary(text: &str, pos: usize) -> bool {
246    // End of string is always safe
247    if pos >= text.len() {
248        return true;
249    }
250
251    // Get the character at the boundary position
252    // We need to use character iteration for Unicode safety
253    let char_at_pos = text[pos..].chars().next();
254
255    char_at_pos
256        .is_none_or(|c| c.is_whitespace() || c.is_ascii_punctuation() || c.is_ascii_control())
257}
258
259// ============================================================================
260// Overlap Quality Scoring
261// ============================================================================
262
263/// Score representing the "strength" of an overlap.
264///
265/// This struct captures multiple metrics about an overlap to determine
266/// if it's strong enough to warrant deduplication.
267#[derive(Debug, Clone, Copy, PartialEq, Eq)]
268pub struct OverlapScore {
269    /// Character count of the overlap
270    pub char_count: usize,
271    /// Whether the overlap meets the minimum ratio threshold
272    pub ratio_met: bool,
273    /// Whether the overlap ends at a safe boundary
274    pub is_safe_boundary: bool,
275}
276
277impl OverlapScore {
278    /// Check if this overlap meets all thresholds for deduplication.
279    ///
280    /// # Arguments
281    /// * `thresholds` - The overlap thresholds to check against
282    ///
283    /// # Returns
284    /// * `true` - The overlap is strong enough for deduplication
285    /// * `false` - The overlap is too weak
286    #[must_use]
287    pub const fn meets_thresholds(&self, thresholds: &OverlapThresholds) -> bool {
288        self.char_count >= thresholds.min_overlap_chars && self.ratio_met && self.is_safe_boundary
289    }
290
291    /// Check if the delta is short (below short chunk threshold).
292    ///
293    /// # Arguments
294    /// * `delta_len` - The length of the delta
295    /// * `thresholds` - The overlap thresholds
296    ///
297    /// # Returns
298    /// * `true` - The delta is considered short
299    /// * `false` - The delta is normal length
300    #[must_use]
301    #[cfg(test)]
302    pub const fn is_short_delta(delta_len: usize, thresholds: &OverlapThresholds) -> bool {
303        delta_len < thresholds.short_chunk_threshold
304    }
305}
306
307/// Score the quality of an overlap between delta and accumulated content.
308///
309/// This function computes multiple metrics about an overlap to determine
310/// if it's strong enough to warrant deduplication.
311///
312/// # Arguments
313/// * `delta` - The incoming delta
314/// * `accumulated` - The previously accumulated content
315///
316/// # Returns
317/// An `OverlapScore` containing:
318/// - `char_count`: The length of the overlap in characters
319/// - `ratio`: The overlap as a fraction of delta length
320/// - `is_safe_boundary`: Whether the overlap ends at a safe boundary
321///
322/// # Examples
323///
324/// ```ignore
325/// // Strong overlap (30+ chars, 50%+ ratio, safe boundary)
326/// let score = score_overlap("Hello World! More text here", "Hello World!");
327/// assert!(score.char_count >= 30);
328/// assert!(score.ratio_met);
329/// assert!(score.is_safe_boundary);
330/// ```
331fn score_overlap(delta: &str, accumulated: &str) -> OverlapScore {
332    // Check if delta starts with accumulated (snapshot detection)
333    let overlap_len = if delta.starts_with(accumulated) {
334        accumulated.len()
335    } else {
336        0
337    };
338
339    // Calculate ratio as integer to avoid floating point precision issues
340    // We'll compare overlap * 100 >= delta * MIN_OVERLAP_RATIO_INT
341    // This avoids f64 casting entirely
342    let ratio_met = if delta.is_empty() {
343        false
344    } else {
345        // Check if overlap/delta >= MIN_OVERLAP_RATIO without floating point
346        // By cross-multiplying: overlap * 100 >= delta * MIN_OVERLAP_RATIO_INT
347        let overlap_scaled = overlap_len.saturating_mul(100);
348        let threshold = delta.len().saturating_mul(MIN_OVERLAP_RATIO_INT);
349        overlap_scaled >= threshold
350    };
351
352    // Check if the accumulated string ends at a safe boundary
353    // This is important because we don't want to dedupe if the accumulated
354    // string ends mid-word (e.g., accumulated="Hello" and delta="HelloWorld")
355    let is_safe_boundary = if overlap_len > 0 {
356        // Check if the last character of accumulated is a safe boundary
357        is_safe_boundary(accumulated, accumulated.len())
358    } else {
359        false
360    };
361
362    OverlapScore {
363        char_count: overlap_len,
364        ratio_met,
365        is_safe_boundary,
366    }
367}
368
369/// Rolling hash window for fast substring detection.
370///
371/// Maintains rolling hash values over accumulated content using the Rabin-Karp
372/// algorithm. Provides O(1) hash computation for sliding windows.
373///
374/// # Algorithm
375///
376/// Uses polynomial rolling hash with base 256 (byte values) and modulus
377/// from a large prime to minimize collisions.
378///
379/// Hash computation:
380/// ```text
381/// hash(s[0..n]) = (s[0] * b^(n-1) + s[1] * b^(n-2) + ... + s[n-1]) mod m
382/// ```
383///
384/// Rolling update:
385/// ```text
386/// hash(s[i+1..i+n+1]) = ((hash(s[i..i+n]) - s[i] * b^(n-1)) * b + s[i+n]) mod m
387/// ```
388///
389/// # Example
390///
391/// ```ignore
392/// let mut window = RollingHashWindow::new();
393/// window.add_content("Hello World");
394///
395/// // Check if "World" exists in the accumulated content
396/// let hashes = window.get_window_hashes(5); // 5 = length of "World"
397/// let world_hash = RollingHashWindow::compute_hash("World");
398///
399/// if hashes.contains(&world_hash) {
400///     // Potential match - verify with KMP
401/// }
402/// ```
403#[derive(Debug, Default, Clone)]
404pub struct RollingHashWindow {
405    /// Accumulated content for hash computation
406    content: String,
407    /// Cached hash values for different window sizes
408    /// Maps `window_size` -> Vec<(position, hash)>
409    cached_hashes: HashMap<usize, Vec<(usize, u64)>>,
410}
411
412impl RollingHashWindow {
413    /// Base for polynomial rolling hash (256 for byte values)
414    const BASE: u64 = 256;
415    /// Modulus for hash computation (large prime to minimize collisions)
416    const MODULUS: u64 = 2_147_483_647; // 2^31 - 1 (Mersenne prime)
417
418    /// Create a new rolling hash window.
419    #[cfg(test)]
420    pub fn new() -> Self {
421        Self::default()
422    }
423
424    /// Compute rolling hash of a string slice.
425    ///
426    /// Uses polynomial rolling hash with base 256 and a large prime modulus.
427    ///
428    /// # Arguments
429    /// * `text` - The text to hash
430    ///
431    /// # Returns
432    /// The hash value as a u64
433    ///
434    /// # Example
435    /// ```ignore
436    /// let hash = RollingHashWindow::compute_hash("Hello");
437    /// ```
438    pub fn compute_hash(text: &str) -> u64 {
439        let mut hash: u64 = 0;
440        for byte in text.bytes() {
441            hash = (hash * Self::BASE + u64::from(byte)) % Self::MODULUS;
442        }
443        hash
444    }
445
446    /// Compute base^(n-1) mod MODULUS for rolling hash updates.
447    ///
448    /// This is used to efficiently remove the leftmost character when
449    /// sliding the window.
450    #[cfg(test)]
451    fn compute_power(power: usize) -> u64 {
452        let mut result = 1u64;
453        for _ in 0..power {
454            result = (result * Self::BASE) % Self::MODULUS;
455        }
456        result
457    }
458
459    /// Add content to the window and update cached hashes.
460    ///
461    /// This appends new content and recomputes hash values for all
462    /// cached window sizes.
463    ///
464    /// # Arguments
465    /// * `text` - The new content to add
466    #[cfg(test)]
467    pub fn add_content(&mut self, text: &str) {
468        if text.is_empty() {
469            return;
470        }
471
472        let _start_pos = self.content.len();
473        self.content.push_str(text);
474
475        // Invalidate and recompute all cached hashes
476        self.cached_hashes.clear();
477    }
478
479    /// Get all window hashes of a specific size.
480    ///
481    /// Computes rolling hash values for all windows of the given size
482    /// in the accumulated content.
483    ///
484    /// # Arguments
485    /// * `window_size` - The size of each window
486    ///
487    /// # Returns
488    /// A vector of (position, hash) tuples for each window
489    ///
490    /// # Example
491    /// ```ignore
492    /// // Get hashes for all 5-character windows
493    /// let hashes = window.get_window_hashes(5);
494    /// ```
495    #[cfg(test)]
496    pub fn get_window_hashes(&mut self, window_size: usize) -> Vec<(usize, u64)> {
497        // Return cached copy if available
498        if let Some(hashes) = self.cached_hashes.get(&window_size) {
499            return hashes.clone();
500        }
501
502        // Compute hashes for this window size
503        let content_bytes = self.content.as_bytes();
504        let content_len = content_bytes.len();
505
506        if content_len < window_size {
507            // Not enough content for even one window
508            let empty: Vec<(usize, u64)> = Vec::new();
509            self.cached_hashes.insert(window_size, empty.clone());
510            return empty;
511        }
512
513        let mut hashes = Vec::new();
514        let mut hash: u64 = 0;
515
516        // Compute initial window hash
517        for byte in content_bytes.iter().take(window_size) {
518            hash = (hash * Self::BASE + u64::from(*byte)) % Self::MODULUS;
519        }
520        hashes.push((0, hash));
521
522        // Precompute base^(window_size-1) mod MODULUS for rolling updates
523        let power = Self::compute_power(window_size - 1);
524
525        // Slide window and compute rolling hashes
526        for i in 1..=(content_len - window_size) {
527            // Remove leftmost character contribution
528            let leftmost = u64::from(content_bytes[i - 1]);
529            let removed = (leftmost * power) % Self::MODULUS;
530            hash = (hash + Self::MODULUS - removed) % Self::MODULUS;
531
532            // Shift and add new character
533            hash = (hash * Self::BASE) % Self::MODULUS;
534            let new_char = u64::from(content_bytes[i + window_size - 1]);
535            hash = (hash + new_char) % Self::MODULUS;
536
537            hashes.push((i, hash));
538        }
539
540        // Cache for future use
541        self.cached_hashes.insert(window_size, hashes.clone());
542        hashes
543    }
544
545    /// Check if a hash exists in any window of the given size.
546    ///
547    /// This is a fast O(1) check after hashes have been computed.
548    ///
549    /// # Arguments
550    /// * `hash` - The hash value to search for
551    /// * `window_size` - The window size to check
552    ///
553    /// # Returns
554    /// * `Some(position)` - The position where the hash was found
555    /// * `None` - Hash not found
556    #[cfg(test)]
557    pub fn contains_hash(&mut self, hash: u64, window_size: usize) -> Option<usize> {
558        let hashes = self.get_window_hashes(window_size);
559        hashes
560            .into_iter()
561            .find(|(_, h)| *h == hash)
562            .map(|(pos, _)| pos)
563    }
564
565    /// Clear all content and cached hashes.
566    pub fn clear(&mut self) {
567        self.content.clear();
568        self.cached_hashes.clear();
569    }
570
571    /// Get the current content length.
572    #[cfg(test)]
573    pub const fn len(&self) -> usize {
574        self.content.len()
575    }
576
577    /// Check if the window is empty.
578    #[cfg(test)]
579    pub const fn is_empty(&self) -> bool {
580        self.content.is_empty()
581    }
582}
583
584/// KMP (Knuth-Morris-Pratt) matcher for exact substring matching.
585///
586/// Provides linear-time substring search by precomputing a failure function
587/// that allows skipping already-matched characters.
588///
589/// # Algorithm
590///
591/// The failure function (also called "prefix function" or "lps array") stores
592/// the length of the longest proper prefix of the pattern that is also a suffix
593/// for each position in the pattern.
594///
595/// # Example
596///
597/// ```ignore
598/// let mut kmp = KMPMatcher::new("abc");
599/// let text = "xyzabcuvw";
600///
601/// if let Some(pos) = kmp.find(text) {
602///     // Found "abc" at position 3
603/// }
604/// ```
605#[derive(Debug, Clone)]
606#[cfg(test)]
607pub struct KMPMatcher {
608    /// The pattern to search for
609    pattern: String,
610    /// Failure function (longest proper prefix which is also suffix)
611    failure: Vec<usize>,
612}
613
614#[cfg(test)]
615impl KMPMatcher {
616    /// Create a new KMP matcher for the given pattern.
617    ///
618    /// Precomputes the failure function for efficient searching.
619    ///
620    /// # Arguments
621    /// * `pattern` - The pattern to search for
622    ///
623    /// # Example
624    /// ```ignore
625    /// let matcher = KMPMatcher::new("hello");
626    /// ```
627    pub fn new(pattern: &str) -> Self {
628        let pattern = pattern.to_string();
629        let failure = Self::compute_failure(&pattern);
630        Self { pattern, failure }
631    }
632
633    /// Compute the failure function for KMP.
634    ///
635    /// The failure function `lps[i]` stores the length of the longest proper
636    /// prefix of `pattern[0..=i]` that is also a suffix.
637    ///
638    /// # Example
639    ///
640    /// For pattern "abab":
641    /// - lps[0] = 0 (no proper prefix for single char)
642    /// - lps[1] = 0 ("ab" has no matching prefix/suffix)
643    /// - lps[2] = 1 ("aba" has "a" as prefix and suffix)
644    /// - lps[3] = 2 ("abab" has "ab" as prefix and suffix)
645    fn compute_failure(pattern: &str) -> Vec<usize> {
646        let m = pattern.len();
647        if m == 0 {
648            return Vec::new();
649        }
650
651        let mut lps = vec![0; m];
652        let mut len = 0; // Length of the previous longest prefix suffix
653        let mut i = 1;
654
655        let pattern_bytes = pattern.as_bytes();
656
657        while i < m {
658            if pattern_bytes[i] == pattern_bytes[len] {
659                len += 1;
660                lps[i] = len;
661                i += 1;
662            } else if len != 0 {
663                // Fall back to previous longest prefix suffix
664                len = lps[len - 1];
665                // Note: we don't increment i here
666            } else {
667                // No proper prefix suffix found
668                lps[i] = 0;
669                i += 1;
670            }
671        }
672
673        lps
674    }
675
676    /// Find the pattern in text, returning the first position.
677    ///
678    /// Uses the precomputed failure function for O(n) search time.
679    ///
680    /// # Arguments
681    /// * `text` - The text to search in
682    ///
683    /// # Returns
684    /// * `Some(position)` - The starting position of the pattern
685    /// * `None` - Pattern not found
686    ///
687    /// # Example
688    /// ```ignore
689    /// let matcher = KMPMatcher::new("world");
690    /// assert_eq!(matcher.find("hello world"), Some(6));
691    /// assert_eq!(matcher.find("hello"), None);
692    /// ```
693    pub fn find(&self, text: &str) -> Option<usize> {
694        let n = text.len();
695        let m = self.pattern.len();
696
697        if m == 0 || n < m {
698            return None;
699        }
700
701        let text_bytes = text.as_bytes();
702        let pattern_bytes = self.pattern.as_bytes();
703
704        let mut i = 0; // Index for text
705        let mut j = 0; // Index for pattern
706
707        while i < n {
708            if pattern_bytes[j] == text_bytes[i] {
709                i += 1;
710                j += 1;
711
712                if j == m {
713                    // Found complete pattern match
714                    return Some(i - j);
715                }
716            } else if j != 0 {
717                // Use failure function to skip ahead
718                j = self.failure[j - 1];
719            } else {
720                // No match at all, move to next character
721                i += 1;
722            }
723        }
724
725        None
726    }
727
728    /// Find all occurrences of the pattern in text.
729    ///
730    /// Returns all positions where the pattern appears in the text.
731    ///
732    /// # Arguments
733    /// * `text` - The text to search in
734    ///
735    /// # Returns
736    /// A vector of positions where the pattern was found
737    ///
738    /// # Example
739    /// ```ignore
740    /// let matcher = KMPMatcher::new("ab");
741    /// let positions = matcher.find_all("ababab");
742    /// assert_eq!(positions, vec![0, 2, 4]);
743    /// ```
744    #[cfg(test)]
745    pub fn find_all(&self, text: &str) -> Vec<usize> {
746        let mut positions = Vec::new();
747        let n = text.len();
748        let m = self.pattern.len();
749
750        if m == 0 || n < m {
751            return positions;
752        }
753
754        let text_bytes = text.as_bytes();
755        let pattern_bytes = self.pattern.as_bytes();
756
757        let mut i = 0; // Index for text
758        let mut j = 0; // Index for pattern
759
760        while i < n {
761            if pattern_bytes[j] == text_bytes[i] {
762                i += 1;
763                j += 1;
764
765                if j == m {
766                    // Found complete pattern match
767                    positions.push(i - j);
768                    j = self.failure[j - 1];
769                }
770            } else if j != 0 {
771                j = self.failure[j - 1];
772            } else {
773                i += 1;
774            }
775        }
776
777        positions
778    }
779
780    /// Get the pattern length.
781    #[cfg(test)]
782    pub const fn pattern_len(&self) -> usize {
783        self.pattern.len()
784    }
785
786    /// Check if the pattern is empty.
787    #[cfg(test)]
788    pub const fn is_empty(&self) -> bool {
789        self.pattern.is_empty()
790    }
791}
792
793/// Delta deduplicator using rolling hash and KMP.
794///
795/// Orchestrates the two-phase deduplication approach:
796/// 1. Rolling hash for fast filtering
797/// 2. KMP for exact verification
798///
799/// # Example
800///
801/// ```ignore
802/// let mut dedup = DeltaDeduplicator::new();
803///
804/// // Add accumulated content
805/// dedup.add_accumulated("Hello World");
806///
807/// // Check if a delta is a duplicate
808/// if let Some(new_portion) = dedup.extract_new_content("Hello World!") {
809///     // "!" is the new portion
810///     assert_eq!(new_portion, "!");
811/// }
812/// ```
813#[derive(Debug, Default, Clone)]
814pub struct DeltaDeduplicator {
815    /// Rolling hash window for accumulated content
816    hash_window: RollingHashWindow,
817}
818
819impl DeltaDeduplicator {
820    /// Create a new delta deduplicator.
821    #[cfg(test)]
822    pub fn new() -> Self {
823        Self::default()
824    }
825
826    /// Add accumulated content for deduplication tracking.
827    ///
828    /// # Arguments
829    /// * `content` - The accumulated content to track
830    #[cfg(test)]
831    pub fn add_accumulated(&mut self, content: &str) {
832        self.hash_window.add_content(content);
833    }
834
835    /// Extract new content from a potential snapshot.
836    ///
837    /// Uses rolling hash and KMP to detect if the incoming delta contains
838    /// previously accumulated content (a snapshot) and extracts only the
839    /// new portion.
840    ///
841    /// # Two-Phase Algorithm
842    ///
843    /// 1. **Rolling Hash Filter**: Compute hash of delta and check if it exists
844    ///    in any window of the accumulated content. O(n) time.
845    ///
846    /// 2. **KMP Verification**: If hash matches, use KMP to verify actual
847    ///    substring match and find the exact position. O(n+m) time.
848    ///
849    /// # Arguments
850    /// * `delta` - The incoming delta to check
851    /// * `accumulated` - The previously accumulated content
852    ///
853    /// # Returns
854    /// * `Some(new_portion)` - The delta starts with accumulated, returns new portion only
855    /// * `None` - The delta is genuinely new, doesn't start with accumulated
856    ///
857    /// # Example
858    /// ```ignore
859    /// let mut dedup = DeltaDeduplicator::new();
860    /// dedup.add_accumulated("Hello");
861    ///
862    /// // Snapshot: "Hello World"
863    /// assert_eq!(
864    ///     DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
865    ///     Some(" World")
866    /// );
867    ///
868    /// // Genuine delta: " World"
869    /// assert_eq!(
870    ///     DeltaDeduplicator::extract_new_content(" World", "Hello"),
871    ///     None
872    /// );
873    /// ```
874    #[cfg(test)]
875    pub fn extract_new_content<'a>(delta: &'a str, accumulated: &str) -> Option<&'a str> {
876        // Handle identical content (delta == accumulated) - return empty string
877        if delta == accumulated {
878            return Some("");
879        }
880
881        // Fast rejection: delta must be longer than accumulated
882        if delta.len() <= accumulated.len() {
883            return None;
884        }
885
886        // Phase 1: Rolling hash check
887        let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
888        let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
889
890        // If hashes don't match, definitely not a snapshot
891        if accumulated_hash != delta_prefix_hash {
892            return None;
893        }
894
895        // Phase 2: KMP verification for exact match
896        let kmp = KMPMatcher::new(accumulated);
897        if let Some(pos) = kmp.find(delta) {
898            // Verify it's at position 0 (snapshot starts with accumulated)
899            if pos == 0 {
900                return Some(&delta[accumulated.len()..]);
901            }
902        }
903
904        // Hash collision or match not at start - not a snapshot
905        None
906    }
907
908    /// Check if delta is likely a snapshot using rolling hash only.
909    ///
910    /// This is a faster O(n) check that may have false positives due to
911    /// hash collisions. Use `extract_new_content` for verified results.
912    ///
913    /// # Arguments
914    /// * `delta` - The incoming delta to check
915    /// * `accumulated` - The previously accumulated content
916    ///
917    /// # Returns
918    /// * `true` - Delta may be a snapshot (hash matches)
919    /// * `false` - Delta is definitely not a snapshot (hash doesn't match)
920    pub fn is_likely_snapshot(delta: &str, accumulated: &str) -> bool {
921        // Handle identical content (duplicate delta)
922        if delta == accumulated {
923            return true;
924        }
925
926        // Delta must be longer than accumulated to be a snapshot
927        if delta.len() <= accumulated.len() {
928            return false;
929        }
930
931        let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
932        let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
933
934        accumulated_hash == delta_prefix_hash
935    }
936
937    /// Check if delta is likely a snapshot with strong overlap detection.
938    ///
939    /// This is an enhanced version of `is_likely_snapshot` that applies
940    /// strong overlap thresholds to prevent false positives on intentional
941    /// repetitions.
942    ///
943    /// # Strong Overlap Detection
944    ///
945    /// This method only returns `true` when:
946    /// - The overlap meets minimum character count threshold (default: 30 chars)
947    /// - The overlap meets minimum ratio threshold (default: 50% of delta)
948    /// - The overlap ends at a safe boundary (whitespace/punctuation/newline)
949    /// - Short chunks (< 20 chars) are only deduped if exact match
950    ///
951    /// # Arguments
952    /// * `delta` - The incoming delta to check
953    /// * `accumulated` - The previously accumulated content
954    ///
955    /// # Returns
956    /// * `true` - Delta is a snapshot meeting strong overlap criteria
957    /// * `false` - Delta is either genuine or overlap is too weak
958    pub fn is_likely_snapshot_with_thresholds(delta: &str, accumulated: &str) -> bool {
959        let thresholds = get_overlap_thresholds();
960
961        // Handle short chunks: only dedupe if exact match
962        if delta.len() < thresholds.short_chunk_threshold {
963            return delta == accumulated;
964        }
965
966        // Handle identical content (delta == accumulated)
967        // This is a snapshot where no new content is added
968        if delta == accumulated {
969            return true;
970        }
971
972        // Fast rejection: delta must be longer than accumulated
973        if delta.len() <= accumulated.len() {
974            return false;
975        }
976
977        // First check with basic rolling hash for quick rejection
978        if !Self::is_likely_snapshot(delta, accumulated) {
979            return false;
980        }
981
982        // Score the overlap to check if it meets strong overlap criteria
983        let score = score_overlap(delta, accumulated);
984
985        // Apply threshold checks
986        score.meets_thresholds(&thresholds)
987    }
988
989    /// Extract new content from a snapshot with strong overlap detection.
990    ///
991    /// This is an enhanced version of `extract_new_content` that only extracts
992    /// new content when the overlap meets strong overlap thresholds.
993    ///
994    /// # Arguments
995    /// * `delta` - The incoming delta to check
996    /// * `accumulated` - The previously accumulated content
997    ///
998    /// # Returns
999    /// * `Some(new_portion)` - The overlap meets thresholds, returns new portion
1000    /// * `None` - The overlap is too weak or not a snapshot
1001    pub fn extract_new_content_with_thresholds<'a>(
1002        delta: &'a str,
1003        accumulated: &str,
1004    ) -> Option<&'a str> {
1005        let thresholds = get_overlap_thresholds();
1006
1007        // Handle short chunks: only dedupe if exact match
1008        if delta.len() < thresholds.short_chunk_threshold {
1009            if delta == accumulated {
1010                return Some("");
1011            }
1012            return None;
1013        }
1014
1015        // Handle identical content
1016        if delta == accumulated {
1017            return Some("");
1018        }
1019
1020        // Fast rejection: delta must be longer than accumulated
1021        if delta.len() <= accumulated.len() {
1022            return None;
1023        }
1024
1025        // Score the overlap
1026        let score = score_overlap(delta, accumulated);
1027
1028        // Check if overlap meets thresholds
1029        if !score.meets_thresholds(&thresholds) {
1030            return None;
1031        }
1032
1033        // Extract new content using the overlap length from the score
1034        if score.char_count > 0 && delta.len() > score.char_count {
1035            Some(&delta[score.char_count..])
1036        } else {
1037            None
1038        }
1039    }
1040
1041    /// Clear all tracked content.
1042    pub fn clear(&mut self) {
1043        self.hash_window.clear();
1044    }
1045}
1046
1047#[cfg(test)]
1048mod tests {
1049    use super::*;
1050
1051    // Tests for RollingHashWindow
1052
1053    #[test]
1054    fn test_rolling_hash_compute_hash() {
1055        let hash1 = RollingHashWindow::compute_hash("Hello");
1056        let hash2 = RollingHashWindow::compute_hash("Hello");
1057        let hash3 = RollingHashWindow::compute_hash("World");
1058
1059        // Same input produces same hash
1060        assert_eq!(hash1, hash2);
1061        // Different input likely produces different hash
1062        assert_ne!(hash1, hash3);
1063    }
1064
1065    #[test]
1066    fn test_rolling_hash_add_content() {
1067        let mut window = RollingHashWindow::new();
1068        assert!(window.is_empty());
1069
1070        window.add_content("Hello");
1071        assert_eq!(window.len(), 5);
1072        assert!(!window.is_empty());
1073    }
1074
1075    #[test]
1076    fn test_rolling_hash_get_window_hashes() {
1077        let mut window = RollingHashWindow::new();
1078        window.add_content("HelloWorld");
1079
1080        // Get hashes for 5-character windows
1081        let hashes = window.get_window_hashes(5);
1082        assert_eq!(hashes.len(), 6); // "Hello", "elloW", "lloWo", "loWor", "oWorl", "World"
1083
1084        // Verify positions
1085        assert_eq!(hashes[0].0, 0); // First window starts at 0
1086        assert_eq!(hashes[5].0, 5); // Last window starts at 5
1087    }
1088
1089    #[test]
1090    fn test_rolling_hash_contains_hash() {
1091        let mut window = RollingHashWindow::new();
1092        window.add_content("HelloWorld");
1093
1094        let world_hash = RollingHashWindow::compute_hash("World");
1095        let xyz_hash = RollingHashWindow::compute_hash("XYZ");
1096
1097        // "World" exists in the content
1098        assert!(window.contains_hash(world_hash, 5).is_some());
1099        // "XYZ" doesn't exist
1100        assert!(window.contains_hash(xyz_hash, 3).is_none());
1101    }
1102
1103    #[test]
1104    fn test_rolling_hash_clear() {
1105        let mut window = RollingHashWindow::new();
1106        window.add_content("Hello");
1107        assert_eq!(window.len(), 5);
1108
1109        window.clear();
1110        assert!(window.is_empty());
1111    }
1112
1113    // Tests for KMPMatcher
1114
1115    #[test]
1116    fn test_kmp_find_pattern_exists() {
1117        let kmp = KMPMatcher::new("World");
1118        assert_eq!(kmp.find("Hello World"), Some(6));
1119        assert_eq!(kmp.find("WorldHello"), Some(0));
1120    }
1121
1122    #[test]
1123    fn test_kmp_find_pattern_not_exists() {
1124        let kmp = KMPMatcher::new("XYZ");
1125        assert_eq!(kmp.find("Hello World"), None);
1126    }
1127
1128    #[test]
1129    fn test_kmp_find_pattern_empty() {
1130        let kmp = KMPMatcher::new("");
1131        assert_eq!(kmp.find("Hello"), None);
1132    }
1133
1134    #[test]
1135    fn test_kmp_find_text_shorter_than_pattern() {
1136        let kmp = KMPMatcher::new("Hello World");
1137        assert_eq!(kmp.find("Hello"), None);
1138    }
1139
1140    #[test]
1141    fn test_kmp_find_all() {
1142        let kmp = KMPMatcher::new("ab");
1143        let positions = kmp.find_all("ababab");
1144        assert_eq!(positions, vec![0, 2, 4]);
1145    }
1146
1147    #[test]
1148    fn test_kmp_find_all_no_matches() {
1149        let kmp = KMPMatcher::new("xyz");
1150        let positions = kmp.find_all("abcabc");
1151        assert!(positions.is_empty());
1152    }
1153
1154    #[test]
1155    fn test_kmp_find_overlapping_patterns() {
1156        let kmp = KMPMatcher::new("aa");
1157        let positions = kmp.find_all("aaa");
1158        assert_eq!(positions, vec![0, 1]);
1159    }
1160
1161    #[test]
1162    fn test_kmp_failure_function() {
1163        let kmp = KMPMatcher::new("abab");
1164        // lps = [0, 0, 1, 2]
1165        assert_eq!(kmp.failure, vec![0, 0, 1, 2]);
1166    }
1167
1168    #[test]
1169    fn test_kmp_pattern_len() {
1170        let kmp = KMPMatcher::new("Hello");
1171        assert_eq!(kmp.pattern_len(), 5);
1172    }
1173
1174    #[test]
1175    fn test_kmp_is_empty() {
1176        let kmp = KMPMatcher::new("");
1177        assert!(kmp.is_empty());
1178
1179        let kmp = KMPMatcher::new("Hello");
1180        assert!(!kmp.is_empty());
1181    }
1182
1183    // Tests for DeltaDeduplicator
1184
1185    #[test]
1186    fn test_dedup_extract_new_content_snapshot() {
1187        let mut dedup = DeltaDeduplicator::new();
1188        dedup.add_accumulated("Hello");
1189
1190        // Snapshot: "Hello World"
1191        assert_eq!(
1192            DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
1193            Some(" World")
1194        );
1195    }
1196
1197    #[test]
1198    fn test_dedup_extract_new_content_genuine_delta() {
1199        let mut dedup = DeltaDeduplicator::new();
1200        dedup.add_accumulated("Hello");
1201
1202        // Genuine delta: " World"
1203        assert_eq!(
1204            DeltaDeduplicator::extract_new_content(" World", "Hello"),
1205            None
1206        );
1207    }
1208
1209    #[test]
1210    fn test_dedup_extract_new_content_shorter_delta() {
1211        let mut dedup = DeltaDeduplicator::new();
1212        dedup.add_accumulated("Hello World");
1213
1214        // Delta shorter than accumulated - can't be snapshot
1215        assert_eq!(
1216            DeltaDeduplicator::extract_new_content("Hello", "Hello World"),
1217            None
1218        );
1219    }
1220
1221    #[test]
1222    fn test_dedup_extract_new_content_equal_length() {
1223        let mut dedup = DeltaDeduplicator::new();
1224        dedup.add_accumulated("Hello");
1225
1226        // Equal length - if identical, return empty string (no new content)
1227        assert_eq!(
1228            DeltaDeduplicator::extract_new_content("Hello", "Hello"),
1229            Some("")
1230        );
1231
1232        // Equal length but different content - not a snapshot
1233        assert_eq!(
1234            DeltaDeduplicator::extract_new_content("World", "Hello"),
1235            None
1236        );
1237    }
1238
1239    #[test]
1240    fn test_dedup_extract_new_content_no_match() {
1241        let mut dedup = DeltaDeduplicator::new();
1242        dedup.add_accumulated("Hello");
1243
1244        // Different content entirely
1245        assert_eq!(
1246            DeltaDeduplicator::extract_new_content("World", "Hello"),
1247            None
1248        );
1249    }
1250
1251    #[test]
1252    fn test_dedup_is_likely_snapshot() {
1253        let mut dedup = DeltaDeduplicator::new();
1254        dedup.add_accumulated("Hello");
1255
1256        // Actual snapshot
1257        assert!(DeltaDeduplicator::is_likely_snapshot(
1258            "Hello World",
1259            "Hello"
1260        ));
1261
1262        // Not a snapshot
1263        assert!(!DeltaDeduplicator::is_likely_snapshot(" World", "Hello"));
1264    }
1265
1266    #[test]
1267    fn test_dedup_clear() {
1268        let mut dedup = DeltaDeduplicator::new();
1269        dedup.add_accumulated("Hello");
1270        assert!(!dedup.hash_window.is_empty());
1271
1272        dedup.clear();
1273        assert!(dedup.hash_window.is_empty());
1274    }
1275
1276    // Integration tests
1277
1278    #[test]
1279    fn test_dedup_two_phase_algorithm() {
1280        let mut dedup = DeltaDeduplicator::new();
1281        dedup.add_accumulated("The quick brown fox");
1282
1283        // Phase 1: Rolling hash will match
1284        assert!(DeltaDeduplicator::is_likely_snapshot(
1285            "The quick brown fox jumps",
1286            "The quick brown fox"
1287        ));
1288
1289        // Phase 2: KMP verification confirms and extracts new portion
1290        assert_eq!(
1291            DeltaDeduplicator::extract_new_content(
1292                "The quick brown fox jumps",
1293                "The quick brown fox"
1294            ),
1295            Some(" jumps")
1296        );
1297    }
1298
1299    #[test]
1300    fn test_dedup_handles_unicode() {
1301        let mut dedup = DeltaDeduplicator::new();
1302        dedup.add_accumulated("Hello 世界");
1303
1304        // Should handle UTF-8 correctly
1305        assert_eq!(
1306            DeltaDeduplicator::extract_new_content("Hello 世界!", "Hello 世界"),
1307            Some("!")
1308        );
1309    }
1310
1311    #[test]
1312    fn test_dedup_empty_accumulated() {
1313        // No accumulated content
1314
1315        // Any delta is genuine
1316        assert_eq!(DeltaDeduplicator::extract_new_content("Hello", ""), None);
1317    }
1318
1319    // Tests for Strong Overlap Detection with Thresholds
1320
1321    #[test]
1322    fn test_strong_overlap_meets_char_threshold() {
1323        // Overlap of 30+ chars with safe boundary should pass
1324        let accumulated = "The quick brown fox jumps over the lazy";
1325        let delta = "The quick brown fox jumps over the lazy dog!";
1326
1327        assert!(
1328            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1329            "Should detect snapshot with 30+ char overlap"
1330        );
1331    }
1332
1333    #[test]
1334    fn test_strong_overlap_meets_ratio_threshold() {
1335        // Overlap is 50%+ of delta length
1336        let accumulated = "The quick brown fox jumps over the lazy dog. ";
1337        let delta = "The quick brown fox jumps over the lazy dog. And more!";
1338
1339        assert!(
1340            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1341            "Should detect snapshot when overlap is 50%+ of delta"
1342        );
1343    }
1344
1345    #[test]
1346    fn test_strong_overlap_fails_char_threshold() {
1347        // Overlap < 30 chars, even if ratio is good
1348        let accumulated = "Hello";
1349        let delta = "Hello World!";
1350
1351        assert!(
1352            !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1353            "Should NOT detect snapshot when overlap < 30 chars"
1354        );
1355    }
1356
1357    #[test]
1358    fn test_strong_overlap_fails_ratio_threshold() {
1359        // Overlap < 50% of delta, even if char count is good
1360        let accumulated = "The quick brown fox jumps over the lazy dog. ";
1361        let delta = "The quick brown fox jumps over the lazy dog. And then a whole lot more text follows to make the ratio low!";
1362
1363        assert!(
1364            !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1365            "Should NOT detect snapshot when overlap < 50% of delta"
1366        );
1367    }
1368
1369    #[test]
1370    fn test_boundary_check_whitespace() {
1371        // Overlap ends at space (safe boundary)
1372        let accumulated = "The quick brown fox jumps over the lazy dog and ";
1373        let delta = "The quick brown fox jumps over the lazy dog and then more!";
1374
1375        assert!(
1376            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1377            "Should detect snapshot when overlap ends at whitespace"
1378        );
1379    }
1380
1381    #[test]
1382    fn test_boundary_check_punctuation() {
1383        // Overlap ends at punctuation (safe boundary)
1384        let accumulated = "The quick brown fox jumps over the lazy dog.";
1385        let delta = "The quick brown fox jumps over the lazy dog. How are you?";
1386
1387        assert!(
1388            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1389            "Should detect snapshot when overlap ends at punctuation"
1390        );
1391    }
1392
1393    #[test]
1394    fn test_boundary_check_mid_word_fails() {
1395        // Overlap ends mid-word (unsafe boundary)
1396        let accumulated = "Hello";
1397        let delta = "HelloWorld! This is a lot of text to ensure we have enough characters.";
1398
1399        // Even though we have 30+ chars, the boundary check should fail
1400        // because the overlap ends mid-word (at 'W' of "World")
1401        assert!(
1402            !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1403            "Should NOT detect snapshot when overlap ends mid-word"
1404        );
1405    }
1406
1407    #[test]
1408    fn test_short_chunk_never_deduped() {
1409        // Short chunks (< 20 chars) never deduped unless exact match
1410        let accumulated = "Hello";
1411        let delta = "Hello World!";
1412
1413        // Even though "Hello World!" starts with "Hello", it's < 20 chars total
1414        // and not an exact match, so it should NOT be deduped
1415        assert!(
1416            !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1417            "Short chunks should NOT be deduped unless exact match"
1418        );
1419    }
1420
1421    #[test]
1422    fn test_short_chunk_exact_match_deduped() {
1423        // Short chunks (< 20 chars) ARE deduped if exact match
1424        let accumulated = "Hello";
1425        let delta = "Hello";
1426
1427        assert!(
1428            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1429            "Short chunk exact match SHOULD be deduped"
1430        );
1431    }
1432
1433    #[test]
1434    fn test_extract_new_content_with_thresholds_strong_overlap() {
1435        let accumulated = "The quick brown fox jumps over the lazy dog. ";
1436        let delta = "The quick brown fox jumps over the lazy dog. More content here!";
1437
1438        let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1439        assert_eq!(result, Some("More content here!"));
1440    }
1441
1442    #[test]
1443    fn test_extract_new_content_with_thresholds_weak_overlap() {
1444        // Weak overlap (< 30 chars) should return None
1445        let accumulated = "Hello";
1446        let delta = "Hello World! This is more content to exceed thresholds.";
1447
1448        let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1449        assert_eq!(result, None, "Weak overlap should return None");
1450    }
1451
1452    #[test]
1453    fn test_extract_new_content_with_thresholds_short_chunk() {
1454        // Short chunk that's not an exact match should return None
1455        let accumulated = "Hi";
1456        let delta = "Hi there!";
1457
1458        let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1459        assert_eq!(
1460            result, None,
1461            "Short chunk should return None unless exact match"
1462        );
1463    }
1464
1465    #[test]
1466    fn test_extract_new_content_with_thresholds_short_chunk_exact() {
1467        // Short chunk exact match should return empty string
1468        let accumulated = "Hello";
1469        let delta = "Hello";
1470
1471        let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1472        assert_eq!(result, Some(""));
1473    }
1474
1475    #[test]
1476    fn test_strong_overlap_with_unicode() {
1477        // Test with Unicode characters
1478        let accumulated = "Hello 世界! This is a long enough string to meet thresholds. ";
1479        let delta = "Hello 世界! This is a long enough string to meet thresholds. More!";
1480
1481        assert!(
1482            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1483            "Should handle Unicode correctly with strong overlap"
1484        );
1485
1486        let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1487        assert_eq!(result, Some("More!"));
1488    }
1489
1490    #[test]
1491    fn test_intentional_repetition_not_deduped() {
1492        // Simulate intentional repetition (e.g., "Hello World! Hello World!")
1493        // where the overlap is small relative to the total delta
1494        let accumulated = "Hello World!";
1495        let delta = "Hello World! Hello World! This is a lot of additional content to make the overlap ratio low enough that it won't be deduplicated!";
1496
1497        assert!(
1498            !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1499            "Intentional repetition should NOT be deduped when overlap ratio is low"
1500        );
1501    }
1502
1503    #[test]
1504    fn test_snapshot_strong_overlap_deduped() {
1505        // Real snapshot scenario: agent sends full accumulated + new content
1506        let accumulated = "The quick brown fox jumps over the lazy dog. ";
1507        let delta = "The quick brown fox jumps over the lazy dog. And then some more!";
1508
1509        assert!(
1510            DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1511            "Actual snapshot SHOULD be detected and deduped"
1512        );
1513
1514        let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1515        assert_eq!(result, Some("And then some more!"));
1516    }
1517
1518    #[test]
1519    fn test_overlap_score_meets_thresholds() {
1520        let thresholds = OverlapThresholds::default();
1521
1522        // Strong overlap: 30+ chars, 50%+ ratio, safe boundary
1523        let score = OverlapScore {
1524            char_count: 50,
1525            ratio_met: true,
1526            is_safe_boundary: true,
1527        };
1528
1529        assert!(score.meets_thresholds(&thresholds));
1530    }
1531
1532    #[test]
1533    fn test_overlap_score_fails_char_count() {
1534        let thresholds = OverlapThresholds::default();
1535
1536        // Char count too low
1537        let score = OverlapScore {
1538            char_count: 20,
1539            ratio_met: true,
1540            is_safe_boundary: true,
1541        };
1542
1543        assert!(!score.meets_thresholds(&thresholds));
1544    }
1545
1546    #[test]
1547    fn test_overlap_score_fails_ratio() {
1548        let thresholds = OverlapThresholds::default();
1549
1550        // Ratio too low (met = false)
1551        let score = OverlapScore {
1552            char_count: 50,
1553            ratio_met: false,
1554            is_safe_boundary: true,
1555        };
1556
1557        assert!(!score.meets_thresholds(&thresholds));
1558    }
1559
1560    #[test]
1561    fn test_overlap_score_fails_boundary() {
1562        let thresholds = OverlapThresholds::default();
1563
1564        // Boundary not safe
1565        let score = OverlapScore {
1566            char_count: 50,
1567            ratio_met: true,
1568            is_safe_boundary: false,
1569        };
1570
1571        assert!(!score.meets_thresholds(&thresholds));
1572    }
1573
1574    #[test]
1575    fn test_is_short_delta() {
1576        let thresholds = OverlapThresholds::default();
1577
1578        assert!(OverlapScore::is_short_delta(10, &thresholds));
1579        assert!(!OverlapScore::is_short_delta(30, &thresholds));
1580    }
1581
1582    #[test]
1583    fn test_is_safe_boundary_whitespace() {
1584        assert!(is_safe_boundary("Hello World", 5));
1585        assert!(is_safe_boundary("Hello\nWorld", 5));
1586        assert!(is_safe_boundary("Hello\tWorld", 5));
1587    }
1588
1589    #[test]
1590    fn test_is_safe_boundary_punctuation() {
1591        assert!(is_safe_boundary("Hello, World!", 12)); // After "!"
1592        assert!(is_safe_boundary("Hello. World", 5)); // After "."
1593        assert!(is_safe_boundary("Hello; World", 5)); // After ";"
1594    }
1595
1596    #[test]
1597    fn test_is_safe_boundary_end_of_string() {
1598        assert!(is_safe_boundary("Hello", 5));
1599        assert!(is_safe_boundary("Hello", 10)); // Beyond length
1600    }
1601
1602    #[test]
1603    fn test_is_safe_boundary_mid_word() {
1604        assert!(!is_safe_boundary("HelloWorld", 5));
1605        assert!(!is_safe_boundary("Testing", 3));
1606    }
1607
1608    #[test]
1609    fn test_score_overlap_with_snapshot() {
1610        let accumulated = "The quick brown fox jumps over the lazy dog. ";
1611        let delta = "The quick brown fox jumps over the lazy dog. And more!";
1612
1613        let score = score_overlap(delta, accumulated);
1614
1615        assert!(score.char_count > 30);
1616        assert!(score.ratio_met);
1617        assert!(score.is_safe_boundary);
1618    }
1619
1620    #[test]
1621    fn test_score_overlap_with_genuine_delta() {
1622        let accumulated = "Hello";
1623        let delta = " World";
1624
1625        let score = score_overlap(delta, accumulated);
1626
1627        assert_eq!(score.char_count, 0);
1628    }
1629
1630    #[test]
1631    fn test_get_overlap_thresholds_default() {
1632        let thresholds = get_overlap_thresholds();
1633
1634        assert_eq!(thresholds.min_overlap_chars, 30);
1635        assert_eq!(thresholds.short_chunk_threshold, 20);
1636        assert_eq!(thresholds.consecutive_duplicate_threshold, 3);
1637    }
1638
1639    #[test]
1640    fn test_consecutive_duplicate_threshold_default() {
1641        let thresholds = OverlapThresholds::default();
1642        assert_eq!(
1643            thresholds.consecutive_duplicate_threshold, DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD,
1644            "Default consecutive_duplicate_threshold should match constant"
1645        );
1646        assert_eq!(
1647            thresholds.consecutive_duplicate_threshold, 3,
1648            "Default consecutive_duplicate_threshold should be 3"
1649        );
1650    }
1651
1652    /// Mock environment for testing threshold parsing.
1653    struct MockThresholdEnv {
1654        vars: std::collections::HashMap<String, String>,
1655    }
1656
1657    impl MockThresholdEnv {
1658        fn new() -> Self {
1659            Self {
1660                vars: std::collections::HashMap::new(),
1661            }
1662        }
1663
1664        fn with_var(mut self, key: &str, value: &str) -> Self {
1665            self.vars.insert(key.to_string(), value.to_string());
1666            self
1667        }
1668    }
1669
1670    impl ThresholdEnvironment for MockThresholdEnv {
1671        fn get_var(&self, name: &str) -> Option<String> {
1672            self.vars.get(name).cloned()
1673        }
1674    }
1675
1676    #[test]
1677    fn test_threshold_env_parsing_min_overlap_chars() {
1678        // Test valid custom value
1679        let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_MIN_OVERLAP_CHARS", "50");
1680        let thresholds = get_overlap_thresholds_with_env(&env);
1681        assert_eq!(thresholds.min_overlap_chars, 50);
1682
1683        // Test out of range (too low) - should use default
1684        let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_MIN_OVERLAP_CHARS", "5");
1685        let thresholds = get_overlap_thresholds_with_env(&env);
1686        assert_eq!(thresholds.min_overlap_chars, DEFAULT_MIN_OVERLAP_CHARS);
1687
1688        // Test out of range (too high) - should use default
1689        let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_MIN_OVERLAP_CHARS", "200");
1690        let thresholds = get_overlap_thresholds_with_env(&env);
1691        assert_eq!(thresholds.min_overlap_chars, DEFAULT_MIN_OVERLAP_CHARS);
1692
1693        // Test invalid value - should use default
1694        let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_MIN_OVERLAP_CHARS", "invalid");
1695        let thresholds = get_overlap_thresholds_with_env(&env);
1696        assert_eq!(thresholds.min_overlap_chars, DEFAULT_MIN_OVERLAP_CHARS);
1697    }
1698
1699    #[test]
1700    fn test_threshold_env_parsing_short_chunk_threshold() {
1701        // Test valid custom value
1702        let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_SHORT_CHUNK_THRESHOLD", "10");
1703        let thresholds = get_overlap_thresholds_with_env(&env);
1704        assert_eq!(thresholds.short_chunk_threshold, 10);
1705
1706        // Test boundary values
1707        let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_SHORT_CHUNK_THRESHOLD", "5");
1708        let thresholds = get_overlap_thresholds_with_env(&env);
1709        assert_eq!(thresholds.short_chunk_threshold, 5); // Min boundary
1710
1711        let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_SHORT_CHUNK_THRESHOLD", "50");
1712        let thresholds = get_overlap_thresholds_with_env(&env);
1713        assert_eq!(thresholds.short_chunk_threshold, 50); // Max boundary
1714    }
1715
1716    #[test]
1717    fn test_threshold_env_parsing_consecutive_duplicate() {
1718        // Test valid custom value
1719        let env = MockThresholdEnv::new()
1720            .with_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", "5");
1721        let thresholds = get_overlap_thresholds_with_env(&env);
1722        assert_eq!(thresholds.consecutive_duplicate_threshold, 5);
1723
1724        // Test min boundary
1725        let env = MockThresholdEnv::new()
1726            .with_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", "2");
1727        let thresholds = get_overlap_thresholds_with_env(&env);
1728        assert_eq!(thresholds.consecutive_duplicate_threshold, 2);
1729
1730        // Test max boundary
1731        let env = MockThresholdEnv::new()
1732            .with_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", "10");
1733        let thresholds = get_overlap_thresholds_with_env(&env);
1734        assert_eq!(thresholds.consecutive_duplicate_threshold, 10);
1735
1736        // Test out of range - should use default
1737        let env = MockThresholdEnv::new()
1738            .with_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", "1");
1739        let thresholds = get_overlap_thresholds_with_env(&env);
1740        assert_eq!(
1741            thresholds.consecutive_duplicate_threshold,
1742            DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD
1743        );
1744
1745        let env = MockThresholdEnv::new()
1746            .with_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", "15");
1747        let thresholds = get_overlap_thresholds_with_env(&env);
1748        assert_eq!(
1749            thresholds.consecutive_duplicate_threshold,
1750            DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD
1751        );
1752    }
1753
1754    #[test]
1755    fn test_threshold_env_empty_returns_defaults() {
1756        let env = MockThresholdEnv::new();
1757        let thresholds = get_overlap_thresholds_with_env(&env);
1758        assert_eq!(thresholds.min_overlap_chars, DEFAULT_MIN_OVERLAP_CHARS);
1759        assert_eq!(
1760            thresholds.short_chunk_threshold,
1761            DEFAULT_SHORT_CHUNK_THRESHOLD
1762        );
1763        assert_eq!(
1764            thresholds.consecutive_duplicate_threshold,
1765            DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD
1766        );
1767    }
1768
1769    #[test]
1770    fn test_threshold_bounds_constants() {
1771        // Verify bounds constants are correct (pure constant tests, no env var manipulation)
1772        assert_eq!(
1773            MIN_CONSECUTIVE_DUPLICATE_THRESHOLD, 2,
1774            "Minimum threshold should be 2"
1775        );
1776        assert_eq!(
1777            MAX_CONSECUTIVE_DUPLICATE_THRESHOLD, 10,
1778            "Maximum threshold should be 10"
1779        );
1780        assert_eq!(MIN_MIN_OVERLAP_CHARS, 10);
1781        assert_eq!(MAX_MIN_OVERLAP_CHARS, 100);
1782        assert_eq!(MIN_SHORT_CHUNK_THRESHOLD, 5);
1783        assert_eq!(MAX_SHORT_CHUNK_THRESHOLD, 50);
1784    }
1785}
ralph_workflow/json_parser/deduplication.rs

ralph_workflow/json_parser/
deduplication.rs