ralph_workflow/json_parser/deduplication.rs
1//! Delta deduplication using KMP and Rolling Hash algorithms.
2//!
3//! This module provides efficient deduplication for streaming deltas using:
4//! - **Rolling Hash (Rabin-Karp)**: Fast O(n) filtering to eliminate impossible matches
5//! - **KMP (Knuth-Morris-Pratt)**: O(n+m) verification for exact substring matching
6//! - **Strong Overlap Detection**: Thresholds and boundary checks to prevent false positives
7//!
8//! # Enhanced Deduplication
9//!
10//! The enhanced algorithm uses multiple layers of validation to prevent false positives:
11//!
12//! 1. **Rolling Hash Filter**: Fast O(n) check to eliminate impossible matches
13//! 2. **KMP Verification**: O(n+m) confirmation of actual substring match
14//! 3. **Overlap Threshold**: Only dedupe when overlap >= 30 chars AND >= 50% of delta
15//! 4. **Boundary Sanity**: Ensure overlap ends at whitespace/punctuation/newline
16//! 5. **Short Chunk Protection**: Chunks < 20 chars never deduped unless exact match
17//!
18//! # Architecture
19//!
20//! ```text
21//! Incoming Delta
22//! │
23//! ▼
24//! ┌─────────────────────┐
25//! │ Rolling Hash Check │ ◄── Compute hash of delta, compare against
26//! │ (Rabin-Karp) │ sliding window hashes of accumulated content
27//! └──────────┬──────────┘
28//! │
29//! ┌──────┴──────┐
30//! │ Hash Match? │
31//! └──────┬──────┘
32//! No │ Yes
33//! │ │
34//! ▼ ▼
35//! Accept ┌─────────────────┐
36//! Delta │ KMP Verification│ ◄── Confirm actual substring match
37//! └────────┬────────┘
38//! │
39//! ┌──────┴──────┐
40//! │True Match? │
41//! └──────┬──────┘
42//! No │ Yes
43//! │ │
44//! ▼ ▼
45//! Accept ┌─────────────────────┐
46//! Delta │ Strong Overlap Check│ ◄── >= 30 chars, >= 50%, safe boundary
47//! └──────────┬──────────┘
48//! │
49//! ┌──────┴──────┐
50//! │Measures? │
51//! └──────┬──────┘
52//! No │ Yes
53//! │ │
54//! ▼ ▼
55//! Accept Extract New
56//! Delta Portion Only
57//! ```
58
59use std::collections::HashMap;
60use std::sync::OnceLock;
61
62// ============================================================================
63// Configuration Constants for Strong Overlap Detection
64// ============================================================================
65
66/// Default minimum overlap character count for deduplication.
67///
68/// Overlaps must be at least this many characters to be considered for deduplication.
69/// This prevents false positives from short accidental matches (e.g., "the", "and").
70const DEFAULT_MIN_OVERLAP_CHARS: usize = 30;
71
72/// Minimum overlap ratio expressed as integer (50 = 50%).
73/// Used for integer-based ratio comparison to avoid floating point issues.
74const MIN_OVERLAP_RATIO_INT: usize = 50;
75
76/// Default threshold for considering a chunk "short".
77///
78/// Short chunks (< this many chars) are never deduped unless they're exact matches
79/// with the accumulated content. This prevents aggressive deduplication of tokens
80/// like ".", "\n", "Ok" that are legitimately repeated.
81const DEFAULT_SHORT_CHUNK_THRESHOLD: usize = 20;
82
83/// Default threshold for consecutive duplicate detection.
84///
85/// If the exact same chunk arrives this many times in a row, it's treated as a
86/// resend glitch and dropped entirely.
87const DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 3;
88
89/// Minimum allowed value for `MIN_OVERLAP_CHARS`.
90const MIN_MIN_OVERLAP_CHARS: usize = 10;
91
92/// Maximum allowed value for `MIN_OVERLAP_CHARS`.
93const MAX_MIN_OVERLAP_CHARS: usize = 100;
94
95/// Minimum allowed value for `SHORT_CHUNK_THRESHOLD`.
96const MIN_SHORT_CHUNK_THRESHOLD: usize = 5;
97
98/// Maximum allowed value for `SHORT_CHUNK_THRESHOLD`.
99const MAX_SHORT_CHUNK_THRESHOLD: usize = 50;
100
101/// Minimum allowed value for `CONSECUTIVE_DUPLICATE_THRESHOLD`.
102const MIN_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 2;
103
104/// Maximum allowed value for `CONSECUTIVE_DUPLICATE_THRESHOLD`.
105const MAX_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 10;
106
107/// Configuration for strong overlap detection.
108///
109/// This struct holds the tunable thresholds that determine when an overlap
110/// is "strong enough" to warrant deduplication.
111#[derive(Debug, Clone, Copy)]
112pub struct OverlapThresholds {
113 /// Minimum character count for overlap
114 pub min_overlap_chars: usize,
115 /// Threshold below which chunks are considered "short"
116 pub short_chunk_threshold: usize,
117 /// Number of consecutive duplicates before aggressive dedupe
118 pub consecutive_duplicate_threshold: usize,
119}
120
121impl Default for OverlapThresholds {
122 fn default() -> Self {
123 Self {
124 min_overlap_chars: DEFAULT_MIN_OVERLAP_CHARS,
125 short_chunk_threshold: DEFAULT_SHORT_CHUNK_THRESHOLD,
126 consecutive_duplicate_threshold: DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD,
127 }
128 }
129}
130
131/// Get the overlap thresholds from environment variables or use defaults.
132///
133/// Reads the following environment variables:
134/// - `RALPH_STREAMING_MIN_OVERLAP_CHARS`: Minimum overlap characters (default: 30, range: 10-100)
135/// - `RALPH_STREAMING_MIN_OVERLAP_RATIO`: Minimum overlap ratio (default: 0.5, range: 0.1-0.9)
136/// - `RALPH_STREAMING_SHORT_CHUNK_THRESHOLD`: Short chunk threshold (default: 20, range: 5-50)
137/// - `RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD`: Consecutive duplicate threshold (default: 3, range: 2-10)
138///
139/// # Returns
140/// The configured overlap thresholds.
141pub fn get_overlap_thresholds() -> OverlapThresholds {
142 static THRESHOLDS: OnceLock<OverlapThresholds> = OnceLock::new();
143 *THRESHOLDS.get_or_init(|| {
144 let min_overlap_chars = std::env::var("RALPH_STREAMING_MIN_OVERLAP_CHARS")
145 .ok()
146 .and_then(|s| s.parse::<usize>().ok())
147 .and_then(|v| {
148 if (MIN_MIN_OVERLAP_CHARS..=MAX_MIN_OVERLAP_CHARS).contains(&v) {
149 Some(v)
150 } else {
151 None
152 }
153 })
154 .unwrap_or(DEFAULT_MIN_OVERLAP_CHARS);
155
156 let short_chunk_threshold = std::env::var("RALPH_STREAMING_SHORT_CHUNK_THRESHOLD")
157 .ok()
158 .and_then(|s| s.parse::<usize>().ok())
159 .and_then(|v| {
160 if (MIN_SHORT_CHUNK_THRESHOLD..=MAX_SHORT_CHUNK_THRESHOLD).contains(&v) {
161 Some(v)
162 } else {
163 None
164 }
165 })
166 .unwrap_or(DEFAULT_SHORT_CHUNK_THRESHOLD);
167
168 let consecutive_duplicate_threshold =
169 std::env::var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD")
170 .ok()
171 .and_then(|s| s.parse::<usize>().ok())
172 .and_then(|v| {
173 if (MIN_CONSECUTIVE_DUPLICATE_THRESHOLD..=MAX_CONSECUTIVE_DUPLICATE_THRESHOLD)
174 .contains(&v)
175 {
176 Some(v)
177 } else {
178 None
179 }
180 })
181 .unwrap_or(DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD);
182
183 OverlapThresholds {
184 min_overlap_chars,
185 short_chunk_threshold,
186 consecutive_duplicate_threshold,
187 }
188 })
189}
190
191// ============================================================================
192// Boundary Detection
193// ============================================================================
194
195/// Check if a character position is at a safe boundary for deduplication.
196///
197/// A "safe boundary" is where the overlap ends at a natural break point in text:
198/// - Whitespace (space, tab, newline, etc.)
199/// - ASCII punctuation (.,!?;:, etc.)
200/// - End of string
201///
202/// This prevents deduplication from splitting words or tokens mid-way through,
203/// which could cause incorrect rendering of intentional repetitions.
204///
205/// # Arguments
206/// * `text` - The text to check
207/// * `pos` - The position in the text (byte offset)
208///
209/// # Returns
210/// * `true` - The position is at a safe boundary for deduplication
211/// * `false` - The position is NOT at a safe boundary (mid-word, etc.)
212///
213/// # Examples
214///
215/// ```ignore
216/// // Safe: overlap ends at space
217/// assert!(is_safe_boundary("Hello World", 11)); // After "World"
218///
219/// // Safe: overlap ends at punctuation
220/// assert!(is_safe_boundary("Hello, World!", 12)); // After "!"
221///
222/// // Unsafe: overlap ends mid-word
223/// assert!(!is_safe_boundary("HelloWorld", 5)); // After "Hello"
224/// ```
225fn is_safe_boundary(text: &str, pos: usize) -> bool {
226 // End of string is always safe
227 if pos >= text.len() {
228 return true;
229 }
230
231 // Get the character at the boundary position
232 // We need to use character iteration for Unicode safety
233 let char_at_pos = text[pos..].chars().next();
234
235 char_at_pos
236 .is_none_or(|c| c.is_whitespace() || c.is_ascii_punctuation() || c.is_ascii_control())
237}
238
239// ============================================================================
240// Overlap Quality Scoring
241// ============================================================================
242
243/// Score representing the "strength" of an overlap.
244///
245/// This struct captures multiple metrics about an overlap to determine
246/// if it's strong enough to warrant deduplication.
247#[derive(Debug, Clone, Copy, PartialEq, Eq)]
248pub struct OverlapScore {
249 /// Character count of the overlap
250 pub char_count: usize,
251 /// Whether the overlap meets the minimum ratio threshold
252 pub ratio_met: bool,
253 /// Whether the overlap ends at a safe boundary
254 pub is_safe_boundary: bool,
255}
256
257impl OverlapScore {
258 /// Check if this overlap meets all thresholds for deduplication.
259 ///
260 /// # Arguments
261 /// * `thresholds` - The overlap thresholds to check against
262 ///
263 /// # Returns
264 /// * `true` - The overlap is strong enough for deduplication
265 /// * `false` - The overlap is too weak
266 #[must_use]
267 pub const fn meets_thresholds(&self, thresholds: &OverlapThresholds) -> bool {
268 self.char_count >= thresholds.min_overlap_chars && self.ratio_met && self.is_safe_boundary
269 }
270
271 /// Check if the delta is short (below short chunk threshold).
272 ///
273 /// # Arguments
274 /// * `delta_len` - The length of the delta
275 /// * `thresholds` - The overlap thresholds
276 ///
277 /// # Returns
278 /// * `true` - The delta is considered short
279 /// * `false` - The delta is normal length
280 #[must_use]
281 #[cfg(test)]
282 pub const fn is_short_delta(delta_len: usize, thresholds: &OverlapThresholds) -> bool {
283 delta_len < thresholds.short_chunk_threshold
284 }
285}
286
287/// Score the quality of an overlap between delta and accumulated content.
288///
289/// This function computes multiple metrics about an overlap to determine
290/// if it's strong enough to warrant deduplication.
291///
292/// # Arguments
293/// * `delta` - The incoming delta
294/// * `accumulated` - The previously accumulated content
295///
296/// # Returns
297/// An `OverlapScore` containing:
298/// - `char_count`: The length of the overlap in characters
299/// - `ratio`: The overlap as a fraction of delta length
300/// - `is_safe_boundary`: Whether the overlap ends at a safe boundary
301///
302/// # Examples
303///
304/// ```ignore
305/// // Strong overlap (30+ chars, 50%+ ratio, safe boundary)
306/// let score = score_overlap("Hello World! More text here", "Hello World!");
307/// assert!(score.char_count >= 30);
308/// assert!(score.ratio_met);
309/// assert!(score.is_safe_boundary);
310/// ```
311fn score_overlap(delta: &str, accumulated: &str) -> OverlapScore {
312 // Check if delta starts with accumulated (snapshot detection)
313 let overlap_len = if delta.starts_with(accumulated) {
314 accumulated.len()
315 } else {
316 0
317 };
318
319 // Calculate ratio as integer to avoid floating point precision issues
320 // We'll compare overlap * 100 >= delta * MIN_OVERLAP_RATIO_INT
321 // This avoids f64 casting entirely
322 let ratio_met = if delta.is_empty() {
323 false
324 } else {
325 // Check if overlap/delta >= MIN_OVERLAP_RATIO without floating point
326 // By cross-multiplying: overlap * 100 >= delta * MIN_OVERLAP_RATIO_INT
327 let overlap_scaled = overlap_len.saturating_mul(100);
328 let threshold = delta.len().saturating_mul(MIN_OVERLAP_RATIO_INT);
329 overlap_scaled >= threshold
330 };
331
332 // Check if the accumulated string ends at a safe boundary
333 // This is important because we don't want to dedupe if the accumulated
334 // string ends mid-word (e.g., accumulated="Hello" and delta="HelloWorld")
335 let is_safe_boundary = if overlap_len > 0 {
336 // Check if the last character of accumulated is a safe boundary
337 is_safe_boundary(accumulated, accumulated.len())
338 } else {
339 false
340 };
341
342 OverlapScore {
343 char_count: overlap_len,
344 ratio_met,
345 is_safe_boundary,
346 }
347}
348
349/// Rolling hash window for fast substring detection.
350///
351/// Maintains rolling hash values over accumulated content using the Rabin-Karp
352/// algorithm. Provides O(1) hash computation for sliding windows.
353///
354/// # Algorithm
355///
356/// Uses polynomial rolling hash with base 256 (byte values) and modulus
357/// from a large prime to minimize collisions.
358///
359/// Hash computation:
360/// ```text
361/// hash(s[0..n]) = (s[0] * b^(n-1) + s[1] * b^(n-2) + ... + s[n-1]) mod m
362/// ```
363///
364/// Rolling update:
365/// ```text
366/// hash(s[i+1..i+n+1]) = ((hash(s[i..i+n]) - s[i] * b^(n-1)) * b + s[i+n]) mod m
367/// ```
368///
369/// # Example
370///
371/// ```ignore
372/// let mut window = RollingHashWindow::new();
373/// window.add_content("Hello World");
374///
375/// // Check if "World" exists in the accumulated content
376/// let hashes = window.get_window_hashes(5); // 5 = length of "World"
377/// let world_hash = RollingHashWindow::compute_hash("World");
378///
379/// if hashes.contains(&world_hash) {
380/// // Potential match - verify with KMP
381/// }
382/// ```
383#[derive(Debug, Default, Clone)]
384pub struct RollingHashWindow {
385 /// Accumulated content for hash computation
386 content: String,
387 /// Cached hash values for different window sizes
388 /// Maps `window_size` -> Vec<(position, hash)>
389 cached_hashes: HashMap<usize, Vec<(usize, u64)>>,
390}
391
392impl RollingHashWindow {
393 /// Base for polynomial rolling hash (256 for byte values)
394 const BASE: u64 = 256;
395 /// Modulus for hash computation (large prime to minimize collisions)
396 const MODULUS: u64 = 2_147_483_647; // 2^31 - 1 (Mersenne prime)
397
398 /// Create a new rolling hash window.
399 #[cfg(test)]
400 pub fn new() -> Self {
401 Self::default()
402 }
403
404 /// Compute rolling hash of a string slice.
405 ///
406 /// Uses polynomial rolling hash with base 256 and a large prime modulus.
407 ///
408 /// # Arguments
409 /// * `text` - The text to hash
410 ///
411 /// # Returns
412 /// The hash value as a u64
413 ///
414 /// # Example
415 /// ```ignore
416 /// let hash = RollingHashWindow::compute_hash("Hello");
417 /// ```
418 pub fn compute_hash(text: &str) -> u64 {
419 let mut hash: u64 = 0;
420 for byte in text.bytes() {
421 hash = (hash * Self::BASE + u64::from(byte)) % Self::MODULUS;
422 }
423 hash
424 }
425
426 /// Compute base^(n-1) mod MODULUS for rolling hash updates.
427 ///
428 /// This is used to efficiently remove the leftmost character when
429 /// sliding the window.
430 #[cfg(test)]
431 fn compute_power(power: usize) -> u64 {
432 let mut result = 1u64;
433 for _ in 0..power {
434 result = (result * Self::BASE) % Self::MODULUS;
435 }
436 result
437 }
438
439 /// Add content to the window and update cached hashes.
440 ///
441 /// This appends new content and recomputes hash values for all
442 /// cached window sizes.
443 ///
444 /// # Arguments
445 /// * `text` - The new content to add
446 #[cfg(test)]
447 pub fn add_content(&mut self, text: &str) {
448 if text.is_empty() {
449 return;
450 }
451
452 let _start_pos = self.content.len();
453 self.content.push_str(text);
454
455 // Invalidate and recompute all cached hashes
456 self.cached_hashes.clear();
457 }
458
459 /// Get all window hashes of a specific size.
460 ///
461 /// Computes rolling hash values for all windows of the given size
462 /// in the accumulated content.
463 ///
464 /// # Arguments
465 /// * `window_size` - The size of each window
466 ///
467 /// # Returns
468 /// A vector of (position, hash) tuples for each window
469 ///
470 /// # Example
471 /// ```ignore
472 /// // Get hashes for all 5-character windows
473 /// let hashes = window.get_window_hashes(5);
474 /// ```
475 #[cfg(test)]
476 pub fn get_window_hashes(&mut self, window_size: usize) -> Vec<(usize, u64)> {
477 // Return cached copy if available
478 if let Some(hashes) = self.cached_hashes.get(&window_size) {
479 return hashes.clone();
480 }
481
482 // Compute hashes for this window size
483 let content_bytes = self.content.as_bytes();
484 let content_len = content_bytes.len();
485
486 if content_len < window_size {
487 // Not enough content for even one window
488 let empty: Vec<(usize, u64)> = Vec::new();
489 self.cached_hashes.insert(window_size, empty.clone());
490 return empty;
491 }
492
493 let mut hashes = Vec::new();
494 let mut hash: u64 = 0;
495
496 // Compute initial window hash
497 for byte in content_bytes.iter().take(window_size) {
498 hash = (hash * Self::BASE + u64::from(*byte)) % Self::MODULUS;
499 }
500 hashes.push((0, hash));
501
502 // Precompute base^(window_size-1) mod MODULUS for rolling updates
503 let power = Self::compute_power(window_size - 1);
504
505 // Slide window and compute rolling hashes
506 for i in 1..=(content_len - window_size) {
507 // Remove leftmost character contribution
508 let leftmost = u64::from(content_bytes[i - 1]);
509 let removed = (leftmost * power) % Self::MODULUS;
510 hash = (hash + Self::MODULUS - removed) % Self::MODULUS;
511
512 // Shift and add new character
513 hash = (hash * Self::BASE) % Self::MODULUS;
514 let new_char = u64::from(content_bytes[i + window_size - 1]);
515 hash = (hash + new_char) % Self::MODULUS;
516
517 hashes.push((i, hash));
518 }
519
520 // Cache for future use
521 self.cached_hashes.insert(window_size, hashes.clone());
522 hashes
523 }
524
525 /// Check if a hash exists in any window of the given size.
526 ///
527 /// This is a fast O(1) check after hashes have been computed.
528 ///
529 /// # Arguments
530 /// * `hash` - The hash value to search for
531 /// * `window_size` - The window size to check
532 ///
533 /// # Returns
534 /// * `Some(position)` - The position where the hash was found
535 /// * `None` - Hash not found
536 #[cfg(test)]
537 pub fn contains_hash(&mut self, hash: u64, window_size: usize) -> Option<usize> {
538 let hashes = self.get_window_hashes(window_size);
539 hashes
540 .into_iter()
541 .find(|(_, h)| *h == hash)
542 .map(|(pos, _)| pos)
543 }
544
545 /// Clear all content and cached hashes.
546 pub fn clear(&mut self) {
547 self.content.clear();
548 self.cached_hashes.clear();
549 }
550
551 /// Get the current content length.
552 #[cfg(test)]
553 pub const fn len(&self) -> usize {
554 self.content.len()
555 }
556
557 /// Check if the window is empty.
558 #[cfg(test)]
559 pub const fn is_empty(&self) -> bool {
560 self.content.is_empty()
561 }
562}
563
564/// KMP (Knuth-Morris-Pratt) matcher for exact substring matching.
565///
566/// Provides linear-time substring search by precomputing a failure function
567/// that allows skipping already-matched characters.
568///
569/// # Algorithm
570///
571/// The failure function (also called "prefix function" or "lps array") stores
572/// the length of the longest proper prefix of the pattern that is also a suffix
573/// for each position in the pattern.
574///
575/// # Example
576///
577/// ```ignore
578/// let mut kmp = KMPMatcher::new("abc");
579/// let text = "xyzabcuvw";
580///
581/// if let Some(pos) = kmp.find(text) {
582/// // Found "abc" at position 3
583/// }
584/// ```
585#[derive(Debug, Clone)]
586#[cfg(test)]
587pub struct KMPMatcher {
588 /// The pattern to search for
589 pattern: String,
590 /// Failure function (longest proper prefix which is also suffix)
591 failure: Vec<usize>,
592}
593
594#[cfg(test)]
595impl KMPMatcher {
596 /// Create a new KMP matcher for the given pattern.
597 ///
598 /// Precomputes the failure function for efficient searching.
599 ///
600 /// # Arguments
601 /// * `pattern` - The pattern to search for
602 ///
603 /// # Example
604 /// ```ignore
605 /// let matcher = KMPMatcher::new("hello");
606 /// ```
607 pub fn new(pattern: &str) -> Self {
608 let pattern = pattern.to_string();
609 let failure = Self::compute_failure(&pattern);
610 Self { pattern, failure }
611 }
612
613 /// Compute the failure function for KMP.
614 ///
615 /// The failure function `lps[i]` stores the length of the longest proper
616 /// prefix of `pattern[0..=i]` that is also a suffix.
617 ///
618 /// # Example
619 ///
620 /// For pattern "abab":
621 /// - lps[0] = 0 (no proper prefix for single char)
622 /// - lps[1] = 0 ("ab" has no matching prefix/suffix)
623 /// - lps[2] = 1 ("aba" has "a" as prefix and suffix)
624 /// - lps[3] = 2 ("abab" has "ab" as prefix and suffix)
625 fn compute_failure(pattern: &str) -> Vec<usize> {
626 let m = pattern.len();
627 if m == 0 {
628 return Vec::new();
629 }
630
631 let mut lps = vec![0; m];
632 let mut len = 0; // Length of the previous longest prefix suffix
633 let mut i = 1;
634
635 let pattern_bytes = pattern.as_bytes();
636
637 while i < m {
638 if pattern_bytes[i] == pattern_bytes[len] {
639 len += 1;
640 lps[i] = len;
641 i += 1;
642 } else if len != 0 {
643 // Fall back to previous longest prefix suffix
644 len = lps[len - 1];
645 // Note: we don't increment i here
646 } else {
647 // No proper prefix suffix found
648 lps[i] = 0;
649 i += 1;
650 }
651 }
652
653 lps
654 }
655
656 /// Find the pattern in text, returning the first position.
657 ///
658 /// Uses the precomputed failure function for O(n) search time.
659 ///
660 /// # Arguments
661 /// * `text` - The text to search in
662 ///
663 /// # Returns
664 /// * `Some(position)` - The starting position of the pattern
665 /// * `None` - Pattern not found
666 ///
667 /// # Example
668 /// ```ignore
669 /// let matcher = KMPMatcher::new("world");
670 /// assert_eq!(matcher.find("hello world"), Some(6));
671 /// assert_eq!(matcher.find("hello"), None);
672 /// ```
673 pub fn find(&self, text: &str) -> Option<usize> {
674 let n = text.len();
675 let m = self.pattern.len();
676
677 if m == 0 || n < m {
678 return None;
679 }
680
681 let text_bytes = text.as_bytes();
682 let pattern_bytes = self.pattern.as_bytes();
683
684 let mut i = 0; // Index for text
685 let mut j = 0; // Index for pattern
686
687 while i < n {
688 if pattern_bytes[j] == text_bytes[i] {
689 i += 1;
690 j += 1;
691
692 if j == m {
693 // Found complete pattern match
694 return Some(i - j);
695 }
696 } else if j != 0 {
697 // Use failure function to skip ahead
698 j = self.failure[j - 1];
699 } else {
700 // No match at all, move to next character
701 i += 1;
702 }
703 }
704
705 None
706 }
707
708 /// Find all occurrences of the pattern in text.
709 ///
710 /// Returns all positions where the pattern appears in the text.
711 ///
712 /// # Arguments
713 /// * `text` - The text to search in
714 ///
715 /// # Returns
716 /// A vector of positions where the pattern was found
717 ///
718 /// # Example
719 /// ```ignore
720 /// let matcher = KMPMatcher::new("ab");
721 /// let positions = matcher.find_all("ababab");
722 /// assert_eq!(positions, vec![0, 2, 4]);
723 /// ```
724 #[cfg(test)]
725 pub fn find_all(&self, text: &str) -> Vec<usize> {
726 let mut positions = Vec::new();
727 let n = text.len();
728 let m = self.pattern.len();
729
730 if m == 0 || n < m {
731 return positions;
732 }
733
734 let text_bytes = text.as_bytes();
735 let pattern_bytes = self.pattern.as_bytes();
736
737 let mut i = 0; // Index for text
738 let mut j = 0; // Index for pattern
739
740 while i < n {
741 if pattern_bytes[j] == text_bytes[i] {
742 i += 1;
743 j += 1;
744
745 if j == m {
746 // Found complete pattern match
747 positions.push(i - j);
748 j = self.failure[j - 1];
749 }
750 } else if j != 0 {
751 j = self.failure[j - 1];
752 } else {
753 i += 1;
754 }
755 }
756
757 positions
758 }
759
760 /// Get the pattern length.
761 #[cfg(test)]
762 pub const fn pattern_len(&self) -> usize {
763 self.pattern.len()
764 }
765
766 /// Check if the pattern is empty.
767 #[cfg(test)]
768 pub const fn is_empty(&self) -> bool {
769 self.pattern.is_empty()
770 }
771}
772
773/// Delta deduplicator using rolling hash and KMP.
774///
775/// Orchestrates the two-phase deduplication approach:
776/// 1. Rolling hash for fast filtering
777/// 2. KMP for exact verification
778///
779/// # Example
780///
781/// ```ignore
782/// let mut dedup = DeltaDeduplicator::new();
783///
784/// // Add accumulated content
785/// dedup.add_accumulated("Hello World");
786///
787/// // Check if a delta is a duplicate
788/// if let Some(new_portion) = dedup.extract_new_content("Hello World!") {
789/// // "!" is the new portion
790/// assert_eq!(new_portion, "!");
791/// }
792/// ```
793#[derive(Debug, Default, Clone)]
794pub struct DeltaDeduplicator {
795 /// Rolling hash window for accumulated content
796 hash_window: RollingHashWindow,
797}
798
799impl DeltaDeduplicator {
800 /// Create a new delta deduplicator.
801 #[cfg(test)]
802 pub fn new() -> Self {
803 Self::default()
804 }
805
806 /// Add accumulated content for deduplication tracking.
807 ///
808 /// # Arguments
809 /// * `content` - The accumulated content to track
810 #[cfg(test)]
811 pub fn add_accumulated(&mut self, content: &str) {
812 self.hash_window.add_content(content);
813 }
814
815 /// Extract new content from a potential snapshot.
816 ///
817 /// Uses rolling hash and KMP to detect if the incoming delta contains
818 /// previously accumulated content (a snapshot) and extracts only the
819 /// new portion.
820 ///
821 /// # Two-Phase Algorithm
822 ///
823 /// 1. **Rolling Hash Filter**: Compute hash of delta and check if it exists
824 /// in any window of the accumulated content. O(n) time.
825 ///
826 /// 2. **KMP Verification**: If hash matches, use KMP to verify actual
827 /// substring match and find the exact position. O(n+m) time.
828 ///
829 /// # Arguments
830 /// * `delta` - The incoming delta to check
831 /// * `accumulated` - The previously accumulated content
832 ///
833 /// # Returns
834 /// * `Some(new_portion)` - The delta starts with accumulated, returns new portion only
835 /// * `None` - The delta is genuinely new, doesn't start with accumulated
836 ///
837 /// # Example
838 /// ```ignore
839 /// let mut dedup = DeltaDeduplicator::new();
840 /// dedup.add_accumulated("Hello");
841 ///
842 /// // Snapshot: "Hello World"
843 /// assert_eq!(
844 /// DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
845 /// Some(" World")
846 /// );
847 ///
848 /// // Genuine delta: " World"
849 /// assert_eq!(
850 /// DeltaDeduplicator::extract_new_content(" World", "Hello"),
851 /// None
852 /// );
853 /// ```
854 #[cfg(test)]
855 pub fn extract_new_content<'a>(delta: &'a str, accumulated: &str) -> Option<&'a str> {
856 // Handle identical content (delta == accumulated) - return empty string
857 if delta == accumulated {
858 return Some("");
859 }
860
861 // Fast rejection: delta must be longer than accumulated
862 if delta.len() <= accumulated.len() {
863 return None;
864 }
865
866 // Phase 1: Rolling hash check
867 let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
868 let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
869
870 // If hashes don't match, definitely not a snapshot
871 if accumulated_hash != delta_prefix_hash {
872 return None;
873 }
874
875 // Phase 2: KMP verification for exact match
876 let kmp = KMPMatcher::new(accumulated);
877 if let Some(pos) = kmp.find(delta) {
878 // Verify it's at position 0 (snapshot starts with accumulated)
879 if pos == 0 {
880 return Some(&delta[accumulated.len()..]);
881 }
882 }
883
884 // Hash collision or match not at start - not a snapshot
885 None
886 }
887
888 /// Check if delta is likely a snapshot using rolling hash only.
889 ///
890 /// This is a faster O(n) check that may have false positives due to
891 /// hash collisions. Use `extract_new_content` for verified results.
892 ///
893 /// # Arguments
894 /// * `delta` - The incoming delta to check
895 /// * `accumulated` - The previously accumulated content
896 ///
897 /// # Returns
898 /// * `true` - Delta may be a snapshot (hash matches)
899 /// * `false` - Delta is definitely not a snapshot (hash doesn't match)
900 pub fn is_likely_snapshot(delta: &str, accumulated: &str) -> bool {
901 // Handle identical content (duplicate delta)
902 if delta == accumulated {
903 return true;
904 }
905
906 // Delta must be longer than accumulated to be a snapshot
907 if delta.len() <= accumulated.len() {
908 return false;
909 }
910
911 let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
912 let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
913
914 accumulated_hash == delta_prefix_hash
915 }
916
917 /// Check if delta is likely a snapshot with strong overlap detection.
918 ///
919 /// This is an enhanced version of `is_likely_snapshot` that applies
920 /// strong overlap thresholds to prevent false positives on intentional
921 /// repetitions.
922 ///
923 /// # Strong Overlap Detection
924 ///
925 /// This method only returns `true` when:
926 /// - The overlap meets minimum character count threshold (default: 30 chars)
927 /// - The overlap meets minimum ratio threshold (default: 50% of delta)
928 /// - The overlap ends at a safe boundary (whitespace/punctuation/newline)
929 /// - Short chunks (< 20 chars) are only deduped if exact match
930 ///
931 /// # Arguments
932 /// * `delta` - The incoming delta to check
933 /// * `accumulated` - The previously accumulated content
934 ///
935 /// # Returns
936 /// * `true` - Delta is a snapshot meeting strong overlap criteria
937 /// * `false` - Delta is either genuine or overlap is too weak
938 pub fn is_likely_snapshot_with_thresholds(delta: &str, accumulated: &str) -> bool {
939 let thresholds = get_overlap_thresholds();
940
941 // Handle short chunks: only dedupe if exact match
942 if delta.len() < thresholds.short_chunk_threshold {
943 return delta == accumulated;
944 }
945
946 // Handle identical content (delta == accumulated)
947 // This is a snapshot where no new content is added
948 if delta == accumulated {
949 return true;
950 }
951
952 // Fast rejection: delta must be longer than accumulated
953 if delta.len() <= accumulated.len() {
954 return false;
955 }
956
957 // First check with basic rolling hash for quick rejection
958 if !Self::is_likely_snapshot(delta, accumulated) {
959 return false;
960 }
961
962 // Score the overlap to check if it meets strong overlap criteria
963 let score = score_overlap(delta, accumulated);
964
965 // Apply threshold checks
966 score.meets_thresholds(&thresholds)
967 }
968
969 /// Extract new content from a snapshot with strong overlap detection.
970 ///
971 /// This is an enhanced version of `extract_new_content` that only extracts
972 /// new content when the overlap meets strong overlap thresholds.
973 ///
974 /// # Arguments
975 /// * `delta` - The incoming delta to check
976 /// * `accumulated` - The previously accumulated content
977 ///
978 /// # Returns
979 /// * `Some(new_portion)` - The overlap meets thresholds, returns new portion
980 /// * `None` - The overlap is too weak or not a snapshot
981 pub fn extract_new_content_with_thresholds<'a>(
982 delta: &'a str,
983 accumulated: &str,
984 ) -> Option<&'a str> {
985 let thresholds = get_overlap_thresholds();
986
987 // Handle short chunks: only dedupe if exact match
988 if delta.len() < thresholds.short_chunk_threshold {
989 if delta == accumulated {
990 return Some("");
991 }
992 return None;
993 }
994
995 // Handle identical content
996 if delta == accumulated {
997 return Some("");
998 }
999
1000 // Fast rejection: delta must be longer than accumulated
1001 if delta.len() <= accumulated.len() {
1002 return None;
1003 }
1004
1005 // Score the overlap
1006 let score = score_overlap(delta, accumulated);
1007
1008 // Check if overlap meets thresholds
1009 if !score.meets_thresholds(&thresholds) {
1010 return None;
1011 }
1012
1013 // Extract new content using the overlap length from the score
1014 if score.char_count > 0 && delta.len() > score.char_count {
1015 Some(&delta[score.char_count..])
1016 } else {
1017 None
1018 }
1019 }
1020
1021 /// Clear all tracked content.
1022 pub fn clear(&mut self) {
1023 self.hash_window.clear();
1024 }
1025}
1026
1027#[cfg(test)]
1028mod tests {
1029 use super::*;
1030
1031 // Tests for RollingHashWindow
1032
1033 #[test]
1034 fn test_rolling_hash_compute_hash() {
1035 let hash1 = RollingHashWindow::compute_hash("Hello");
1036 let hash2 = RollingHashWindow::compute_hash("Hello");
1037 let hash3 = RollingHashWindow::compute_hash("World");
1038
1039 // Same input produces same hash
1040 assert_eq!(hash1, hash2);
1041 // Different input likely produces different hash
1042 assert_ne!(hash1, hash3);
1043 }
1044
1045 #[test]
1046 fn test_rolling_hash_add_content() {
1047 let mut window = RollingHashWindow::new();
1048 assert!(window.is_empty());
1049
1050 window.add_content("Hello");
1051 assert_eq!(window.len(), 5);
1052 assert!(!window.is_empty());
1053 }
1054
1055 #[test]
1056 fn test_rolling_hash_get_window_hashes() {
1057 let mut window = RollingHashWindow::new();
1058 window.add_content("HelloWorld");
1059
1060 // Get hashes for 5-character windows
1061 let hashes = window.get_window_hashes(5);
1062 assert_eq!(hashes.len(), 6); // "Hello", "elloW", "lloWo", "loWor", "oWorl", "World"
1063
1064 // Verify positions
1065 assert_eq!(hashes[0].0, 0); // First window starts at 0
1066 assert_eq!(hashes[5].0, 5); // Last window starts at 5
1067 }
1068
1069 #[test]
1070 fn test_rolling_hash_contains_hash() {
1071 let mut window = RollingHashWindow::new();
1072 window.add_content("HelloWorld");
1073
1074 let world_hash = RollingHashWindow::compute_hash("World");
1075 let xyz_hash = RollingHashWindow::compute_hash("XYZ");
1076
1077 // "World" exists in the content
1078 assert!(window.contains_hash(world_hash, 5).is_some());
1079 // "XYZ" doesn't exist
1080 assert!(window.contains_hash(xyz_hash, 3).is_none());
1081 }
1082
1083 #[test]
1084 fn test_rolling_hash_clear() {
1085 let mut window = RollingHashWindow::new();
1086 window.add_content("Hello");
1087 assert_eq!(window.len(), 5);
1088
1089 window.clear();
1090 assert!(window.is_empty());
1091 }
1092
1093 // Tests for KMPMatcher
1094
1095 #[test]
1096 fn test_kmp_find_pattern_exists() {
1097 let kmp = KMPMatcher::new("World");
1098 assert_eq!(kmp.find("Hello World"), Some(6));
1099 assert_eq!(kmp.find("WorldHello"), Some(0));
1100 }
1101
1102 #[test]
1103 fn test_kmp_find_pattern_not_exists() {
1104 let kmp = KMPMatcher::new("XYZ");
1105 assert_eq!(kmp.find("Hello World"), None);
1106 }
1107
1108 #[test]
1109 fn test_kmp_find_pattern_empty() {
1110 let kmp = KMPMatcher::new("");
1111 assert_eq!(kmp.find("Hello"), None);
1112 }
1113
1114 #[test]
1115 fn test_kmp_find_text_shorter_than_pattern() {
1116 let kmp = KMPMatcher::new("Hello World");
1117 assert_eq!(kmp.find("Hello"), None);
1118 }
1119
1120 #[test]
1121 fn test_kmp_find_all() {
1122 let kmp = KMPMatcher::new("ab");
1123 let positions = kmp.find_all("ababab");
1124 assert_eq!(positions, vec![0, 2, 4]);
1125 }
1126
1127 #[test]
1128 fn test_kmp_find_all_no_matches() {
1129 let kmp = KMPMatcher::new("xyz");
1130 let positions = kmp.find_all("abcabc");
1131 assert!(positions.is_empty());
1132 }
1133
1134 #[test]
1135 fn test_kmp_find_overlapping_patterns() {
1136 let kmp = KMPMatcher::new("aa");
1137 let positions = kmp.find_all("aaa");
1138 assert_eq!(positions, vec![0, 1]);
1139 }
1140
1141 #[test]
1142 fn test_kmp_failure_function() {
1143 let kmp = KMPMatcher::new("abab");
1144 // lps = [0, 0, 1, 2]
1145 assert_eq!(kmp.failure, vec![0, 0, 1, 2]);
1146 }
1147
1148 #[test]
1149 fn test_kmp_pattern_len() {
1150 let kmp = KMPMatcher::new("Hello");
1151 assert_eq!(kmp.pattern_len(), 5);
1152 }
1153
1154 #[test]
1155 fn test_kmp_is_empty() {
1156 let kmp = KMPMatcher::new("");
1157 assert!(kmp.is_empty());
1158
1159 let kmp = KMPMatcher::new("Hello");
1160 assert!(!kmp.is_empty());
1161 }
1162
1163 // Tests for DeltaDeduplicator
1164
1165 #[test]
1166 fn test_dedup_extract_new_content_snapshot() {
1167 let mut dedup = DeltaDeduplicator::new();
1168 dedup.add_accumulated("Hello");
1169
1170 // Snapshot: "Hello World"
1171 assert_eq!(
1172 DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
1173 Some(" World")
1174 );
1175 }
1176
1177 #[test]
1178 fn test_dedup_extract_new_content_genuine_delta() {
1179 let mut dedup = DeltaDeduplicator::new();
1180 dedup.add_accumulated("Hello");
1181
1182 // Genuine delta: " World"
1183 assert_eq!(
1184 DeltaDeduplicator::extract_new_content(" World", "Hello"),
1185 None
1186 );
1187 }
1188
1189 #[test]
1190 fn test_dedup_extract_new_content_shorter_delta() {
1191 let mut dedup = DeltaDeduplicator::new();
1192 dedup.add_accumulated("Hello World");
1193
1194 // Delta shorter than accumulated - can't be snapshot
1195 assert_eq!(
1196 DeltaDeduplicator::extract_new_content("Hello", "Hello World"),
1197 None
1198 );
1199 }
1200
1201 #[test]
1202 fn test_dedup_extract_new_content_equal_length() {
1203 let mut dedup = DeltaDeduplicator::new();
1204 dedup.add_accumulated("Hello");
1205
1206 // Equal length - if identical, return empty string (no new content)
1207 assert_eq!(
1208 DeltaDeduplicator::extract_new_content("Hello", "Hello"),
1209 Some("")
1210 );
1211
1212 // Equal length but different content - not a snapshot
1213 assert_eq!(
1214 DeltaDeduplicator::extract_new_content("World", "Hello"),
1215 None
1216 );
1217 }
1218
1219 #[test]
1220 fn test_dedup_extract_new_content_no_match() {
1221 let mut dedup = DeltaDeduplicator::new();
1222 dedup.add_accumulated("Hello");
1223
1224 // Different content entirely
1225 assert_eq!(
1226 DeltaDeduplicator::extract_new_content("World", "Hello"),
1227 None
1228 );
1229 }
1230
1231 #[test]
1232 fn test_dedup_is_likely_snapshot() {
1233 let mut dedup = DeltaDeduplicator::new();
1234 dedup.add_accumulated("Hello");
1235
1236 // Actual snapshot
1237 assert!(DeltaDeduplicator::is_likely_snapshot(
1238 "Hello World",
1239 "Hello"
1240 ));
1241
1242 // Not a snapshot
1243 assert!(!DeltaDeduplicator::is_likely_snapshot(" World", "Hello"));
1244 }
1245
1246 #[test]
1247 fn test_dedup_clear() {
1248 let mut dedup = DeltaDeduplicator::new();
1249 dedup.add_accumulated("Hello");
1250 assert!(!dedup.hash_window.is_empty());
1251
1252 dedup.clear();
1253 assert!(dedup.hash_window.is_empty());
1254 }
1255
1256 // Integration tests
1257
1258 #[test]
1259 fn test_dedup_two_phase_algorithm() {
1260 let mut dedup = DeltaDeduplicator::new();
1261 dedup.add_accumulated("The quick brown fox");
1262
1263 // Phase 1: Rolling hash will match
1264 assert!(DeltaDeduplicator::is_likely_snapshot(
1265 "The quick brown fox jumps",
1266 "The quick brown fox"
1267 ));
1268
1269 // Phase 2: KMP verification confirms and extracts new portion
1270 assert_eq!(
1271 DeltaDeduplicator::extract_new_content(
1272 "The quick brown fox jumps",
1273 "The quick brown fox"
1274 ),
1275 Some(" jumps")
1276 );
1277 }
1278
1279 #[test]
1280 fn test_dedup_handles_unicode() {
1281 let mut dedup = DeltaDeduplicator::new();
1282 dedup.add_accumulated("Hello 世界");
1283
1284 // Should handle UTF-8 correctly
1285 assert_eq!(
1286 DeltaDeduplicator::extract_new_content("Hello 世界!", "Hello 世界"),
1287 Some("!")
1288 );
1289 }
1290
1291 #[test]
1292 fn test_dedup_empty_accumulated() {
1293 // No accumulated content
1294
1295 // Any delta is genuine
1296 assert_eq!(DeltaDeduplicator::extract_new_content("Hello", ""), None);
1297 }
1298
1299 // Tests for Strong Overlap Detection with Thresholds
1300
1301 #[test]
1302 fn test_strong_overlap_meets_char_threshold() {
1303 // Overlap of 30+ chars with safe boundary should pass
1304 let accumulated = "The quick brown fox jumps over the lazy";
1305 let delta = "The quick brown fox jumps over the lazy dog!";
1306
1307 assert!(
1308 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1309 "Should detect snapshot with 30+ char overlap"
1310 );
1311 }
1312
1313 #[test]
1314 fn test_strong_overlap_meets_ratio_threshold() {
1315 // Overlap is 50%+ of delta length
1316 let accumulated = "The quick brown fox jumps over the lazy dog. ";
1317 let delta = "The quick brown fox jumps over the lazy dog. And more!";
1318
1319 assert!(
1320 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1321 "Should detect snapshot when overlap is 50%+ of delta"
1322 );
1323 }
1324
1325 #[test]
1326 fn test_strong_overlap_fails_char_threshold() {
1327 // Overlap < 30 chars, even if ratio is good
1328 let accumulated = "Hello";
1329 let delta = "Hello World!";
1330
1331 assert!(
1332 !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1333 "Should NOT detect snapshot when overlap < 30 chars"
1334 );
1335 }
1336
1337 #[test]
1338 fn test_strong_overlap_fails_ratio_threshold() {
1339 // Overlap < 50% of delta, even if char count is good
1340 let accumulated = "The quick brown fox jumps over the lazy dog. ";
1341 let delta = "The quick brown fox jumps over the lazy dog. And then a whole lot more text follows to make the ratio low!";
1342
1343 assert!(
1344 !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1345 "Should NOT detect snapshot when overlap < 50% of delta"
1346 );
1347 }
1348
1349 #[test]
1350 fn test_boundary_check_whitespace() {
1351 // Overlap ends at space (safe boundary)
1352 let accumulated = "The quick brown fox jumps over the lazy dog and ";
1353 let delta = "The quick brown fox jumps over the lazy dog and then more!";
1354
1355 assert!(
1356 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1357 "Should detect snapshot when overlap ends at whitespace"
1358 );
1359 }
1360
1361 #[test]
1362 fn test_boundary_check_punctuation() {
1363 // Overlap ends at punctuation (safe boundary)
1364 let accumulated = "The quick brown fox jumps over the lazy dog.";
1365 let delta = "The quick brown fox jumps over the lazy dog. How are you?";
1366
1367 assert!(
1368 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1369 "Should detect snapshot when overlap ends at punctuation"
1370 );
1371 }
1372
1373 #[test]
1374 fn test_boundary_check_mid_word_fails() {
1375 // Overlap ends mid-word (unsafe boundary)
1376 let accumulated = "Hello";
1377 let delta = "HelloWorld! This is a lot of text to ensure we have enough characters.";
1378
1379 // Even though we have 30+ chars, the boundary check should fail
1380 // because the overlap ends mid-word (at 'W' of "World")
1381 assert!(
1382 !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1383 "Should NOT detect snapshot when overlap ends mid-word"
1384 );
1385 }
1386
1387 #[test]
1388 fn test_short_chunk_never_deduped() {
1389 // Short chunks (< 20 chars) never deduped unless exact match
1390 let accumulated = "Hello";
1391 let delta = "Hello World!";
1392
1393 // Even though "Hello World!" starts with "Hello", it's < 20 chars total
1394 // and not an exact match, so it should NOT be deduped
1395 assert!(
1396 !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1397 "Short chunks should NOT be deduped unless exact match"
1398 );
1399 }
1400
1401 #[test]
1402 fn test_short_chunk_exact_match_deduped() {
1403 // Short chunks (< 20 chars) ARE deduped if exact match
1404 let accumulated = "Hello";
1405 let delta = "Hello";
1406
1407 assert!(
1408 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1409 "Short chunk exact match SHOULD be deduped"
1410 );
1411 }
1412
1413 #[test]
1414 fn test_extract_new_content_with_thresholds_strong_overlap() {
1415 let accumulated = "The quick brown fox jumps over the lazy dog. ";
1416 let delta = "The quick brown fox jumps over the lazy dog. More content here!";
1417
1418 let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1419 assert_eq!(result, Some("More content here!"));
1420 }
1421
1422 #[test]
1423 fn test_extract_new_content_with_thresholds_weak_overlap() {
1424 // Weak overlap (< 30 chars) should return None
1425 let accumulated = "Hello";
1426 let delta = "Hello World! This is more content to exceed thresholds.";
1427
1428 let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1429 assert_eq!(result, None, "Weak overlap should return None");
1430 }
1431
1432 #[test]
1433 fn test_extract_new_content_with_thresholds_short_chunk() {
1434 // Short chunk that's not an exact match should return None
1435 let accumulated = "Hi";
1436 let delta = "Hi there!";
1437
1438 let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1439 assert_eq!(
1440 result, None,
1441 "Short chunk should return None unless exact match"
1442 );
1443 }
1444
1445 #[test]
1446 fn test_extract_new_content_with_thresholds_short_chunk_exact() {
1447 // Short chunk exact match should return empty string
1448 let accumulated = "Hello";
1449 let delta = "Hello";
1450
1451 let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1452 assert_eq!(result, Some(""));
1453 }
1454
1455 #[test]
1456 fn test_strong_overlap_with_unicode() {
1457 // Test with Unicode characters
1458 let accumulated = "Hello 世界! This is a long enough string to meet thresholds. ";
1459 let delta = "Hello 世界! This is a long enough string to meet thresholds. More!";
1460
1461 assert!(
1462 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1463 "Should handle Unicode correctly with strong overlap"
1464 );
1465
1466 let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1467 assert_eq!(result, Some("More!"));
1468 }
1469
1470 #[test]
1471 fn test_intentional_repetition_not_deduped() {
1472 // Simulate intentional repetition (e.g., "Hello World! Hello World!")
1473 // where the overlap is small relative to the total delta
1474 let accumulated = "Hello World!";
1475 let delta = "Hello World! Hello World! This is a lot of additional content to make the overlap ratio low enough that it won't be deduplicated!";
1476
1477 assert!(
1478 !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1479 "Intentional repetition should NOT be deduped when overlap ratio is low"
1480 );
1481 }
1482
1483 #[test]
1484 fn test_snapshot_strong_overlap_deduped() {
1485 // Real snapshot scenario: agent sends full accumulated + new content
1486 let accumulated = "The quick brown fox jumps over the lazy dog. ";
1487 let delta = "The quick brown fox jumps over the lazy dog. And then some more!";
1488
1489 assert!(
1490 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1491 "Actual snapshot SHOULD be detected and deduped"
1492 );
1493
1494 let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1495 assert_eq!(result, Some("And then some more!"));
1496 }
1497
1498 #[test]
1499 fn test_overlap_score_meets_thresholds() {
1500 let thresholds = OverlapThresholds::default();
1501
1502 // Strong overlap: 30+ chars, 50%+ ratio, safe boundary
1503 let score = OverlapScore {
1504 char_count: 50,
1505 ratio_met: true,
1506 is_safe_boundary: true,
1507 };
1508
1509 assert!(score.meets_thresholds(&thresholds));
1510 }
1511
1512 #[test]
1513 fn test_overlap_score_fails_char_count() {
1514 let thresholds = OverlapThresholds::default();
1515
1516 // Char count too low
1517 let score = OverlapScore {
1518 char_count: 20,
1519 ratio_met: true,
1520 is_safe_boundary: true,
1521 };
1522
1523 assert!(!score.meets_thresholds(&thresholds));
1524 }
1525
1526 #[test]
1527 fn test_overlap_score_fails_ratio() {
1528 let thresholds = OverlapThresholds::default();
1529
1530 // Ratio too low (met = false)
1531 let score = OverlapScore {
1532 char_count: 50,
1533 ratio_met: false,
1534 is_safe_boundary: true,
1535 };
1536
1537 assert!(!score.meets_thresholds(&thresholds));
1538 }
1539
1540 #[test]
1541 fn test_overlap_score_fails_boundary() {
1542 let thresholds = OverlapThresholds::default();
1543
1544 // Boundary not safe
1545 let score = OverlapScore {
1546 char_count: 50,
1547 ratio_met: true,
1548 is_safe_boundary: false,
1549 };
1550
1551 assert!(!score.meets_thresholds(&thresholds));
1552 }
1553
1554 #[test]
1555 fn test_is_short_delta() {
1556 let thresholds = OverlapThresholds::default();
1557
1558 assert!(OverlapScore::is_short_delta(10, &thresholds));
1559 assert!(!OverlapScore::is_short_delta(30, &thresholds));
1560 }
1561
1562 #[test]
1563 fn test_is_safe_boundary_whitespace() {
1564 assert!(is_safe_boundary("Hello World", 5));
1565 assert!(is_safe_boundary("Hello\nWorld", 5));
1566 assert!(is_safe_boundary("Hello\tWorld", 5));
1567 }
1568
1569 #[test]
1570 fn test_is_safe_boundary_punctuation() {
1571 assert!(is_safe_boundary("Hello, World!", 12)); // After "!"
1572 assert!(is_safe_boundary("Hello. World", 5)); // After "."
1573 assert!(is_safe_boundary("Hello; World", 5)); // After ";"
1574 }
1575
1576 #[test]
1577 fn test_is_safe_boundary_end_of_string() {
1578 assert!(is_safe_boundary("Hello", 5));
1579 assert!(is_safe_boundary("Hello", 10)); // Beyond length
1580 }
1581
1582 #[test]
1583 fn test_is_safe_boundary_mid_word() {
1584 assert!(!is_safe_boundary("HelloWorld", 5));
1585 assert!(!is_safe_boundary("Testing", 3));
1586 }
1587
1588 #[test]
1589 fn test_score_overlap_with_snapshot() {
1590 let accumulated = "The quick brown fox jumps over the lazy dog. ";
1591 let delta = "The quick brown fox jumps over the lazy dog. And more!";
1592
1593 let score = score_overlap(delta, accumulated);
1594
1595 assert!(score.char_count > 30);
1596 assert!(score.ratio_met);
1597 assert!(score.is_safe_boundary);
1598 }
1599
1600 #[test]
1601 fn test_score_overlap_with_genuine_delta() {
1602 let accumulated = "Hello";
1603 let delta = " World";
1604
1605 let score = score_overlap(delta, accumulated);
1606
1607 assert_eq!(score.char_count, 0);
1608 }
1609
1610 #[test]
1611 fn test_get_overlap_thresholds_default() {
1612 let thresholds = get_overlap_thresholds();
1613
1614 assert_eq!(thresholds.min_overlap_chars, 30);
1615 assert_eq!(thresholds.short_chunk_threshold, 20);
1616 assert_eq!(thresholds.consecutive_duplicate_threshold, 3);
1617 }
1618
1619 #[test]
1620 fn test_consecutive_duplicate_threshold_default() {
1621 let thresholds = OverlapThresholds::default();
1622 assert_eq!(
1623 thresholds.consecutive_duplicate_threshold, DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD,
1624 "Default consecutive_duplicate_threshold should match constant"
1625 );
1626 assert_eq!(
1627 thresholds.consecutive_duplicate_threshold, 3,
1628 "Default consecutive_duplicate_threshold should be 3"
1629 );
1630 }
1631
1632 #[test]
1633 fn test_consecutive_duplicate_threshold_bounds() {
1634 // Test minimum allowed value
1635 let min_env = "2";
1636 std::env::set_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", min_env);
1637 // Note: OnceLock caches the value, so we can't test this in the same process
1638 // This test documents the expected behavior
1639
1640 // Test maximum allowed value
1641 let max_env = "10";
1642 std::env::set_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", max_env);
1643
1644 // Clean up
1645 std::env::remove_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD");
1646
1647 // Verify bounds constants are correct
1648 assert_eq!(
1649 MIN_CONSECUTIVE_DUPLICATE_THRESHOLD, 2,
1650 "Minimum threshold should be 2"
1651 );
1652 assert_eq!(
1653 MAX_CONSECUTIVE_DUPLICATE_THRESHOLD, 10,
1654 "Maximum threshold should be 10"
1655 );
1656 }
1657}