ralph_workflow/json_parser/deduplication.rs
1//! Delta deduplication using KMP and Rolling Hash algorithms.
2//!
3//! This module provides efficient deduplication for streaming deltas using:
4//! - **Rolling Hash (Rabin-Karp)**: Fast O(n) filtering to eliminate impossible matches
5//! - **KMP (Knuth-Morris-Pratt)**: O(n+m) verification for exact substring matching
6//! - **Strong Overlap Detection**: Thresholds and boundary checks to prevent false positives
7//!
8//! # Enhanced Deduplication
9//!
10//! The enhanced algorithm uses multiple layers of validation to prevent false positives:
11//!
12//! 1. **Rolling Hash Filter**: Fast O(n) check to eliminate impossible matches
13//! 2. **KMP Verification**: O(n+m) confirmation of actual substring match
14//! 3. **Overlap Threshold**: Only dedupe when overlap >= 30 chars AND >= 50% of delta
15//! 4. **Boundary Sanity**: Ensure overlap ends at whitespace/punctuation/newline
16//! 5. **Short Chunk Protection**: Chunks < 20 chars never deduped unless exact match
17//!
18//! # Architecture
19//!
20//! ```text
21//! Incoming Delta
22//! │
23//! ▼
24//! ┌─────────────────────┐
25//! │ Rolling Hash Check │ ◄── Compute hash of delta, compare against
26//! │ (Rabin-Karp) │ sliding window hashes of accumulated content
27//! └──────────┬──────────┘
28//! │
29//! ┌──────┴──────┐
30//! │ Hash Match? │
31//! └──────┬──────┘
32//! No │ Yes
33//! │ │
34//! ▼ ▼
35//! Accept ┌─────────────────┐
36//! Delta │ KMP Verification│ ◄── Confirm actual substring match
37//! └────────┬────────┘
38//! │
39//! ┌──────┴──────┐
40//! │True Match? │
41//! └──────┬──────┘
42//! No │ Yes
43//! │ │
44//! ▼ ▼
45//! Accept ┌─────────────────────┐
46//! Delta │ Strong Overlap Check│ ◄── >= 30 chars, >= 50%, safe boundary
47//! └──────────┬──────────┘
48//! │
49//! ┌──────┴──────┐
50//! │Measures? │
51//! └──────┬──────┘
52//! No │ Yes
53//! │ │
54//! ▼ ▼
55//! Accept Extract New
56//! Delta Portion Only
57//! ```
58
59use std::collections::HashMap;
60use std::sync::OnceLock;
61
62// ============================================================================
63// Environment Trait for Testability
64// ============================================================================
65
66/// Trait for accessing environment variables.
67///
68/// This trait enables dependency injection for testing without global state pollution.
69pub trait ThresholdEnvironment {
70 /// Get an environment variable by name.
71 fn get_var(&self, name: &str) -> Option<String>;
72}
73
74/// Production implementation that reads from actual environment.
75pub struct RealThresholdEnvironment;
76
77impl ThresholdEnvironment for RealThresholdEnvironment {
78 fn get_var(&self, name: &str) -> Option<String> {
79 std::env::var(name).ok()
80 }
81}
82
83// ============================================================================
84// Configuration Constants for Strong Overlap Detection
85// ============================================================================
86
87/// Default minimum overlap character count for deduplication.
88///
89/// Overlaps must be at least this many characters to be considered for deduplication.
90/// This prevents false positives from short accidental matches (e.g., "the", "and").
91const DEFAULT_MIN_OVERLAP_CHARS: usize = 30;
92
93/// Minimum overlap ratio expressed as integer (50 = 50%).
94/// Used for integer-based ratio comparison to avoid floating point issues.
95const MIN_OVERLAP_RATIO_INT: usize = 50;
96
97/// Default threshold for considering a chunk "short".
98///
99/// Short chunks (< this many chars) are never deduped unless they're exact matches
100/// with the accumulated content. This prevents aggressive deduplication of tokens
101/// like ".", "\n", "Ok" that are legitimately repeated.
102const DEFAULT_SHORT_CHUNK_THRESHOLD: usize = 20;
103
104/// Default threshold for consecutive duplicate detection.
105///
106/// If the exact same chunk arrives this many times in a row, it's treated as a
107/// resend glitch and dropped entirely.
108const DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 3;
109
110/// Minimum allowed value for `MIN_OVERLAP_CHARS`.
111const MIN_MIN_OVERLAP_CHARS: usize = 10;
112
113/// Maximum allowed value for `MIN_OVERLAP_CHARS`.
114const MAX_MIN_OVERLAP_CHARS: usize = 100;
115
116/// Minimum allowed value for `SHORT_CHUNK_THRESHOLD`.
117const MIN_SHORT_CHUNK_THRESHOLD: usize = 5;
118
119/// Maximum allowed value for `SHORT_CHUNK_THRESHOLD`.
120const MAX_SHORT_CHUNK_THRESHOLD: usize = 50;
121
122/// Minimum allowed value for `CONSECUTIVE_DUPLICATE_THRESHOLD`.
123const MIN_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 2;
124
125/// Maximum allowed value for `CONSECUTIVE_DUPLICATE_THRESHOLD`.
126const MAX_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 10;
127
128/// Configuration for strong overlap detection.
129///
130/// This struct holds the tunable thresholds that determine when an overlap
131/// is "strong enough" to warrant deduplication.
132#[derive(Debug, Clone, Copy)]
133pub struct OverlapThresholds {
134 /// Minimum character count for overlap
135 pub min_overlap_chars: usize,
136 /// Threshold below which chunks are considered "short"
137 pub short_chunk_threshold: usize,
138 /// Number of consecutive duplicates before aggressive dedupe
139 pub consecutive_duplicate_threshold: usize,
140}
141
142impl Default for OverlapThresholds {
143 fn default() -> Self {
144 Self {
145 min_overlap_chars: DEFAULT_MIN_OVERLAP_CHARS,
146 short_chunk_threshold: DEFAULT_SHORT_CHUNK_THRESHOLD,
147 consecutive_duplicate_threshold: DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD,
148 }
149 }
150}
151
152/// Testable variant that accepts an environment trait for dependency injection.
153///
154/// This allows tests to mock environment variables without global state pollution.
155///
156/// Reads the following environment variables:
157/// - `RALPH_STREAMING_MIN_OVERLAP_CHARS`: Minimum overlap characters (default: 30, range: 10-100)
158/// - `RALPH_STREAMING_SHORT_CHUNK_THRESHOLD`: Short chunk threshold (default: 20, range: 5-50)
159/// - `RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD`: Consecutive duplicate threshold (default: 3, range: 2-10)
160pub fn get_overlap_thresholds_with_env(env: &dyn ThresholdEnvironment) -> OverlapThresholds {
161 let min_overlap_chars = env
162 .get_var("RALPH_STREAMING_MIN_OVERLAP_CHARS")
163 .and_then(|s| s.parse::<usize>().ok())
164 .and_then(|v| {
165 if (MIN_MIN_OVERLAP_CHARS..=MAX_MIN_OVERLAP_CHARS).contains(&v) {
166 Some(v)
167 } else {
168 None
169 }
170 })
171 .unwrap_or(DEFAULT_MIN_OVERLAP_CHARS);
172
173 let short_chunk_threshold = env
174 .get_var("RALPH_STREAMING_SHORT_CHUNK_THRESHOLD")
175 .and_then(|s| s.parse::<usize>().ok())
176 .and_then(|v| {
177 if (MIN_SHORT_CHUNK_THRESHOLD..=MAX_SHORT_CHUNK_THRESHOLD).contains(&v) {
178 Some(v)
179 } else {
180 None
181 }
182 })
183 .unwrap_or(DEFAULT_SHORT_CHUNK_THRESHOLD);
184
185 let consecutive_duplicate_threshold = env
186 .get_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD")
187 .and_then(|s| s.parse::<usize>().ok())
188 .and_then(|v| {
189 if (MIN_CONSECUTIVE_DUPLICATE_THRESHOLD..=MAX_CONSECUTIVE_DUPLICATE_THRESHOLD)
190 .contains(&v)
191 {
192 Some(v)
193 } else {
194 None
195 }
196 })
197 .unwrap_or(DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD);
198
199 OverlapThresholds {
200 min_overlap_chars,
201 short_chunk_threshold,
202 consecutive_duplicate_threshold,
203 }
204}
205
206pub fn get_overlap_thresholds() -> OverlapThresholds {
207 static THRESHOLDS: OnceLock<OverlapThresholds> = OnceLock::new();
208 *THRESHOLDS.get_or_init(|| get_overlap_thresholds_with_env(&RealThresholdEnvironment))
209}
210
211// ============================================================================
212// Boundary Detection
213// ============================================================================
214
215/// Check if a character position is at a safe boundary for deduplication.
216///
217/// A "safe boundary" is where the overlap ends at a natural break point in text:
218/// - Whitespace (space, tab, newline, etc.)
219/// - ASCII punctuation (.,!?;:, etc.)
220/// - End of string
221///
222/// This prevents deduplication from splitting words or tokens mid-way through,
223/// which could cause incorrect rendering of intentional repetitions.
224///
225/// # Arguments
226/// * `text` - The text to check
227/// * `pos` - The position in the text (byte offset)
228///
229/// # Returns
230/// * `true` - The position is at a safe boundary for deduplication
231/// * `false` - The position is NOT at a safe boundary (mid-word, etc.)
232///
233/// # Examples
234///
235/// ```ignore
236/// // Safe: overlap ends at space
237/// assert!(is_safe_boundary("Hello World", 11)); // After "World"
238///
239/// // Safe: overlap ends at punctuation
240/// assert!(is_safe_boundary("Hello, World!", 12)); // After "!"
241///
242/// // Unsafe: overlap ends mid-word
243/// assert!(!is_safe_boundary("HelloWorld", 5)); // After "Hello"
244/// ```
245fn is_safe_boundary(text: &str, pos: usize) -> bool {
246 // End of string is always safe
247 if pos >= text.len() {
248 return true;
249 }
250
251 // Get the character at the boundary position
252 // We need to use character iteration for Unicode safety
253 let char_at_pos = text[pos..].chars().next();
254
255 char_at_pos
256 .is_none_or(|c| c.is_whitespace() || c.is_ascii_punctuation() || c.is_ascii_control())
257}
258
259// ============================================================================
260// Overlap Quality Scoring
261// ============================================================================
262
263/// Score representing the "strength" of an overlap.
264///
265/// This struct captures multiple metrics about an overlap to determine
266/// if it's strong enough to warrant deduplication.
267#[derive(Debug, Clone, Copy, PartialEq, Eq)]
268pub struct OverlapScore {
269 /// Character count of the overlap
270 pub char_count: usize,
271 /// Whether the overlap meets the minimum ratio threshold
272 pub ratio_met: bool,
273 /// Whether the overlap ends at a safe boundary
274 pub is_safe_boundary: bool,
275}
276
277impl OverlapScore {
278 /// Check if this overlap meets all thresholds for deduplication.
279 ///
280 /// # Arguments
281 /// * `thresholds` - The overlap thresholds to check against
282 ///
283 /// # Returns
284 /// * `true` - The overlap is strong enough for deduplication
285 /// * `false` - The overlap is too weak
286 #[must_use]
287 pub const fn meets_thresholds(&self, thresholds: &OverlapThresholds) -> bool {
288 self.char_count >= thresholds.min_overlap_chars && self.ratio_met && self.is_safe_boundary
289 }
290
291 /// Check if the delta is short (below short chunk threshold).
292 ///
293 /// # Arguments
294 /// * `delta_len` - The length of the delta
295 /// * `thresholds` - The overlap thresholds
296 ///
297 /// # Returns
298 /// * `true` - The delta is considered short
299 /// * `false` - The delta is normal length
300 #[must_use]
301 #[cfg(test)]
302 pub const fn is_short_delta(delta_len: usize, thresholds: &OverlapThresholds) -> bool {
303 delta_len < thresholds.short_chunk_threshold
304 }
305}
306
307/// Score the quality of an overlap between delta and accumulated content.
308///
309/// This function computes multiple metrics about an overlap to determine
310/// if it's strong enough to warrant deduplication.
311///
312/// # Arguments
313/// * `delta` - The incoming delta
314/// * `accumulated` - The previously accumulated content
315///
316/// # Returns
317/// An `OverlapScore` containing:
318/// - `char_count`: The length of the overlap in characters
319/// - `ratio`: The overlap as a fraction of delta length
320/// - `is_safe_boundary`: Whether the overlap ends at a safe boundary
321///
322/// # Examples
323///
324/// ```ignore
325/// // Strong overlap (30+ chars, 50%+ ratio, safe boundary)
326/// let score = score_overlap("Hello World! More text here", "Hello World!");
327/// assert!(score.char_count >= 30);
328/// assert!(score.ratio_met);
329/// assert!(score.is_safe_boundary);
330/// ```
331fn score_overlap(delta: &str, accumulated: &str) -> OverlapScore {
332 // Check if delta starts with accumulated (snapshot detection)
333 let overlap_len = if delta.starts_with(accumulated) {
334 accumulated.len()
335 } else {
336 0
337 };
338
339 // Calculate ratio as integer to avoid floating point precision issues
340 // We'll compare overlap * 100 >= delta * MIN_OVERLAP_RATIO_INT
341 // This avoids f64 casting entirely
342 let ratio_met = if delta.is_empty() {
343 false
344 } else {
345 // Check if overlap/delta >= MIN_OVERLAP_RATIO without floating point
346 // By cross-multiplying: overlap * 100 >= delta * MIN_OVERLAP_RATIO_INT
347 let overlap_scaled = overlap_len.saturating_mul(100);
348 let threshold = delta.len().saturating_mul(MIN_OVERLAP_RATIO_INT);
349 overlap_scaled >= threshold
350 };
351
352 // Check if the accumulated string ends at a safe boundary
353 // This is important because we don't want to dedupe if the accumulated
354 // string ends mid-word (e.g., accumulated="Hello" and delta="HelloWorld")
355 let is_safe_boundary = if overlap_len > 0 {
356 // Check if the last character of accumulated is a safe boundary
357 is_safe_boundary(accumulated, accumulated.len())
358 } else {
359 false
360 };
361
362 OverlapScore {
363 char_count: overlap_len,
364 ratio_met,
365 is_safe_boundary,
366 }
367}
368
369/// Rolling hash window for fast substring detection.
370///
371/// Maintains rolling hash values over accumulated content using the Rabin-Karp
372/// algorithm. Provides O(1) hash computation for sliding windows.
373///
374/// # Algorithm
375///
376/// Uses polynomial rolling hash with base 256 (byte values) and modulus
377/// from a large prime to minimize collisions.
378///
379/// Hash computation:
380/// ```text
381/// hash(s[0..n]) = (s[0] * b^(n-1) + s[1] * b^(n-2) + ... + s[n-1]) mod m
382/// ```
383///
384/// Rolling update:
385/// ```text
386/// hash(s[i+1..i+n+1]) = ((hash(s[i..i+n]) - s[i] * b^(n-1)) * b + s[i+n]) mod m
387/// ```
388///
389/// # Example
390///
391/// ```ignore
392/// let mut window = RollingHashWindow::new();
393/// window.add_content("Hello World");
394///
395/// // Check if "World" exists in the accumulated content
396/// let hashes = window.get_window_hashes(5); // 5 = length of "World"
397/// let world_hash = RollingHashWindow::compute_hash("World");
398///
399/// if hashes.contains(&world_hash) {
400/// // Potential match - verify with KMP
401/// }
402/// ```
403#[derive(Debug, Default, Clone)]
404pub struct RollingHashWindow {
405 /// Accumulated content for hash computation
406 content: String,
407 /// Cached hash values for different window sizes
408 /// Maps `window_size` -> Vec<(position, hash)>
409 cached_hashes: HashMap<usize, Vec<(usize, u64)>>,
410}
411
412impl RollingHashWindow {
413 /// Base for polynomial rolling hash (256 for byte values)
414 const BASE: u64 = 256;
415 /// Modulus for hash computation (large prime to minimize collisions)
416 const MODULUS: u64 = 2_147_483_647; // 2^31 - 1 (Mersenne prime)
417
418 /// Create a new rolling hash window.
419 #[cfg(test)]
420 pub fn new() -> Self {
421 Self::default()
422 }
423
424 /// Compute rolling hash of a string slice.
425 ///
426 /// Uses polynomial rolling hash with base 256 and a large prime modulus.
427 ///
428 /// # Arguments
429 /// * `text` - The text to hash
430 ///
431 /// # Returns
432 /// The hash value as a u64
433 ///
434 /// # Example
435 /// ```ignore
436 /// let hash = RollingHashWindow::compute_hash("Hello");
437 /// ```
438 pub fn compute_hash(text: &str) -> u64 {
439 let mut hash: u64 = 0;
440 for byte in text.bytes() {
441 hash = (hash * Self::BASE + u64::from(byte)) % Self::MODULUS;
442 }
443 hash
444 }
445
446 /// Compute base^(n-1) mod MODULUS for rolling hash updates.
447 ///
448 /// This is used to efficiently remove the leftmost character when
449 /// sliding the window.
450 #[cfg(test)]
451 fn compute_power(power: usize) -> u64 {
452 let mut result = 1u64;
453 for _ in 0..power {
454 result = (result * Self::BASE) % Self::MODULUS;
455 }
456 result
457 }
458
459 /// Add content to the window and update cached hashes.
460 ///
461 /// This appends new content and recomputes hash values for all
462 /// cached window sizes.
463 ///
464 /// # Arguments
465 /// * `text` - The new content to add
466 #[cfg(test)]
467 pub fn add_content(&mut self, text: &str) {
468 if text.is_empty() {
469 return;
470 }
471
472 let _start_pos = self.content.len();
473 self.content.push_str(text);
474
475 // Invalidate and recompute all cached hashes
476 self.cached_hashes.clear();
477 }
478
479 /// Get all window hashes of a specific size.
480 ///
481 /// Computes rolling hash values for all windows of the given size
482 /// in the accumulated content.
483 ///
484 /// # Arguments
485 /// * `window_size` - The size of each window
486 ///
487 /// # Returns
488 /// A vector of (position, hash) tuples for each window
489 ///
490 /// # Example
491 /// ```ignore
492 /// // Get hashes for all 5-character windows
493 /// let hashes = window.get_window_hashes(5);
494 /// ```
495 #[cfg(test)]
496 pub fn get_window_hashes(&mut self, window_size: usize) -> Vec<(usize, u64)> {
497 // Return cached copy if available
498 if let Some(hashes) = self.cached_hashes.get(&window_size) {
499 return hashes.clone();
500 }
501
502 // Compute hashes for this window size
503 let content_bytes = self.content.as_bytes();
504 let content_len = content_bytes.len();
505
506 if content_len < window_size {
507 // Not enough content for even one window
508 let empty: Vec<(usize, u64)> = Vec::new();
509 self.cached_hashes.insert(window_size, empty.clone());
510 return empty;
511 }
512
513 let mut hashes = Vec::new();
514 let mut hash: u64 = 0;
515
516 // Compute initial window hash
517 for byte in content_bytes.iter().take(window_size) {
518 hash = (hash * Self::BASE + u64::from(*byte)) % Self::MODULUS;
519 }
520 hashes.push((0, hash));
521
522 // Precompute base^(window_size-1) mod MODULUS for rolling updates
523 let power = Self::compute_power(window_size - 1);
524
525 // Slide window and compute rolling hashes
526 for i in 1..=(content_len - window_size) {
527 // Remove leftmost character contribution
528 let leftmost = u64::from(content_bytes[i - 1]);
529 let removed = (leftmost * power) % Self::MODULUS;
530 hash = (hash + Self::MODULUS - removed) % Self::MODULUS;
531
532 // Shift and add new character
533 hash = (hash * Self::BASE) % Self::MODULUS;
534 let new_char = u64::from(content_bytes[i + window_size - 1]);
535 hash = (hash + new_char) % Self::MODULUS;
536
537 hashes.push((i, hash));
538 }
539
540 // Cache for future use
541 self.cached_hashes.insert(window_size, hashes.clone());
542 hashes
543 }
544
545 /// Check if a hash exists in any window of the given size.
546 ///
547 /// This is a fast O(1) check after hashes have been computed.
548 ///
549 /// # Arguments
550 /// * `hash` - The hash value to search for
551 /// * `window_size` - The window size to check
552 ///
553 /// # Returns
554 /// * `Some(position)` - The position where the hash was found
555 /// * `None` - Hash not found
556 #[cfg(test)]
557 pub fn contains_hash(&mut self, hash: u64, window_size: usize) -> Option<usize> {
558 let hashes = self.get_window_hashes(window_size);
559 hashes
560 .into_iter()
561 .find(|(_, h)| *h == hash)
562 .map(|(pos, _)| pos)
563 }
564
565 /// Clear all content and cached hashes.
566 pub fn clear(&mut self) {
567 self.content.clear();
568 self.cached_hashes.clear();
569 }
570
571 /// Get the current content length.
572 #[cfg(test)]
573 pub const fn len(&self) -> usize {
574 self.content.len()
575 }
576
577 /// Check if the window is empty.
578 #[cfg(test)]
579 pub const fn is_empty(&self) -> bool {
580 self.content.is_empty()
581 }
582}
583
584/// KMP (Knuth-Morris-Pratt) matcher for exact substring matching.
585///
586/// Provides linear-time substring search by precomputing a failure function
587/// that allows skipping already-matched characters.
588///
589/// # Algorithm
590///
591/// The failure function (also called "prefix function" or "lps array") stores
592/// the length of the longest proper prefix of the pattern that is also a suffix
593/// for each position in the pattern.
594///
595/// # Example
596///
597/// ```ignore
598/// let mut kmp = KMPMatcher::new("abc");
599/// let text = "xyzabcuvw";
600///
601/// if let Some(pos) = kmp.find(text) {
602/// // Found "abc" at position 3
603/// }
604/// ```
605#[derive(Debug, Clone)]
606#[cfg(test)]
607pub struct KMPMatcher {
608 /// The pattern to search for
609 pattern: String,
610 /// Failure function (longest proper prefix which is also suffix)
611 failure: Vec<usize>,
612}
613
614#[cfg(test)]
615impl KMPMatcher {
616 /// Create a new KMP matcher for the given pattern.
617 ///
618 /// Precomputes the failure function for efficient searching.
619 ///
620 /// # Arguments
621 /// * `pattern` - The pattern to search for
622 ///
623 /// # Example
624 /// ```ignore
625 /// let matcher = KMPMatcher::new("hello");
626 /// ```
627 pub fn new(pattern: &str) -> Self {
628 let pattern = pattern.to_string();
629 let failure = Self::compute_failure(&pattern);
630 Self { pattern, failure }
631 }
632
633 /// Compute the failure function for KMP.
634 ///
635 /// The failure function `lps[i]` stores the length of the longest proper
636 /// prefix of `pattern[0..=i]` that is also a suffix.
637 ///
638 /// # Example
639 ///
640 /// For pattern "abab":
641 /// - lps[0] = 0 (no proper prefix for single char)
642 /// - lps[1] = 0 ("ab" has no matching prefix/suffix)
643 /// - lps[2] = 1 ("aba" has "a" as prefix and suffix)
644 /// - lps[3] = 2 ("abab" has "ab" as prefix and suffix)
645 fn compute_failure(pattern: &str) -> Vec<usize> {
646 let m = pattern.len();
647 if m == 0 {
648 return Vec::new();
649 }
650
651 let mut lps = vec![0; m];
652 let mut len = 0; // Length of the previous longest prefix suffix
653 let mut i = 1;
654
655 let pattern_bytes = pattern.as_bytes();
656
657 while i < m {
658 if pattern_bytes[i] == pattern_bytes[len] {
659 len += 1;
660 lps[i] = len;
661 i += 1;
662 } else if len != 0 {
663 // Fall back to previous longest prefix suffix
664 len = lps[len - 1];
665 // Note: we don't increment i here
666 } else {
667 // No proper prefix suffix found
668 lps[i] = 0;
669 i += 1;
670 }
671 }
672
673 lps
674 }
675
676 /// Find the pattern in text, returning the first position.
677 ///
678 /// Uses the precomputed failure function for O(n) search time.
679 ///
680 /// # Arguments
681 /// * `text` - The text to search in
682 ///
683 /// # Returns
684 /// * `Some(position)` - The starting position of the pattern
685 /// * `None` - Pattern not found
686 ///
687 /// # Example
688 /// ```ignore
689 /// let matcher = KMPMatcher::new("world");
690 /// assert_eq!(matcher.find("hello world"), Some(6));
691 /// assert_eq!(matcher.find("hello"), None);
692 /// ```
693 pub fn find(&self, text: &str) -> Option<usize> {
694 let n = text.len();
695 let m = self.pattern.len();
696
697 if m == 0 || n < m {
698 return None;
699 }
700
701 let text_bytes = text.as_bytes();
702 let pattern_bytes = self.pattern.as_bytes();
703
704 let mut i = 0; // Index for text
705 let mut j = 0; // Index for pattern
706
707 while i < n {
708 if pattern_bytes[j] == text_bytes[i] {
709 i += 1;
710 j += 1;
711
712 if j == m {
713 // Found complete pattern match
714 return Some(i - j);
715 }
716 } else if j != 0 {
717 // Use failure function to skip ahead
718 j = self.failure[j - 1];
719 } else {
720 // No match at all, move to next character
721 i += 1;
722 }
723 }
724
725 None
726 }
727
728 /// Find all occurrences of the pattern in text.
729 ///
730 /// Returns all positions where the pattern appears in the text.
731 ///
732 /// # Arguments
733 /// * `text` - The text to search in
734 ///
735 /// # Returns
736 /// A vector of positions where the pattern was found
737 ///
738 /// # Example
739 /// ```ignore
740 /// let matcher = KMPMatcher::new("ab");
741 /// let positions = matcher.find_all("ababab");
742 /// assert_eq!(positions, vec![0, 2, 4]);
743 /// ```
744 #[cfg(test)]
745 pub fn find_all(&self, text: &str) -> Vec<usize> {
746 let mut positions = Vec::new();
747 let n = text.len();
748 let m = self.pattern.len();
749
750 if m == 0 || n < m {
751 return positions;
752 }
753
754 let text_bytes = text.as_bytes();
755 let pattern_bytes = self.pattern.as_bytes();
756
757 let mut i = 0; // Index for text
758 let mut j = 0; // Index for pattern
759
760 while i < n {
761 if pattern_bytes[j] == text_bytes[i] {
762 i += 1;
763 j += 1;
764
765 if j == m {
766 // Found complete pattern match
767 positions.push(i - j);
768 j = self.failure[j - 1];
769 }
770 } else if j != 0 {
771 j = self.failure[j - 1];
772 } else {
773 i += 1;
774 }
775 }
776
777 positions
778 }
779
780 /// Get the pattern length.
781 #[cfg(test)]
782 pub const fn pattern_len(&self) -> usize {
783 self.pattern.len()
784 }
785
786 /// Check if the pattern is empty.
787 #[cfg(test)]
788 pub const fn is_empty(&self) -> bool {
789 self.pattern.is_empty()
790 }
791}
792
793/// Delta deduplicator using rolling hash and KMP.
794///
795/// Orchestrates the two-phase deduplication approach:
796/// 1. Rolling hash for fast filtering
797/// 2. KMP for exact verification
798///
799/// # Example
800///
801/// ```ignore
802/// let mut dedup = DeltaDeduplicator::new();
803///
804/// // Add accumulated content
805/// dedup.add_accumulated("Hello World");
806///
807/// // Check if a delta is a duplicate
808/// if let Some(new_portion) = dedup.extract_new_content("Hello World!") {
809/// // "!" is the new portion
810/// assert_eq!(new_portion, "!");
811/// }
812/// ```
813#[derive(Debug, Default, Clone)]
814pub struct DeltaDeduplicator {
815 /// Rolling hash window for accumulated content
816 hash_window: RollingHashWindow,
817}
818
819impl DeltaDeduplicator {
820 /// Create a new delta deduplicator.
821 #[cfg(test)]
822 pub fn new() -> Self {
823 Self::default()
824 }
825
826 /// Add accumulated content for deduplication tracking.
827 ///
828 /// # Arguments
829 /// * `content` - The accumulated content to track
830 #[cfg(test)]
831 pub fn add_accumulated(&mut self, content: &str) {
832 self.hash_window.add_content(content);
833 }
834
835 /// Extract new content from a potential snapshot.
836 ///
837 /// Uses rolling hash and KMP to detect if the incoming delta contains
838 /// previously accumulated content (a snapshot) and extracts only the
839 /// new portion.
840 ///
841 /// # Two-Phase Algorithm
842 ///
843 /// 1. **Rolling Hash Filter**: Compute hash of delta and check if it exists
844 /// in any window of the accumulated content. O(n) time.
845 ///
846 /// 2. **KMP Verification**: If hash matches, use KMP to verify actual
847 /// substring match and find the exact position. O(n+m) time.
848 ///
849 /// # Arguments
850 /// * `delta` - The incoming delta to check
851 /// * `accumulated` - The previously accumulated content
852 ///
853 /// # Returns
854 /// * `Some(new_portion)` - The delta starts with accumulated, returns new portion only
855 /// * `None` - The delta is genuinely new, doesn't start with accumulated
856 ///
857 /// # Example
858 /// ```ignore
859 /// let mut dedup = DeltaDeduplicator::new();
860 /// dedup.add_accumulated("Hello");
861 ///
862 /// // Snapshot: "Hello World"
863 /// assert_eq!(
864 /// DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
865 /// Some(" World")
866 /// );
867 ///
868 /// // Genuine delta: " World"
869 /// assert_eq!(
870 /// DeltaDeduplicator::extract_new_content(" World", "Hello"),
871 /// None
872 /// );
873 /// ```
874 #[cfg(test)]
875 pub fn extract_new_content<'a>(delta: &'a str, accumulated: &str) -> Option<&'a str> {
876 // Handle identical content (delta == accumulated) - return empty string
877 if delta == accumulated {
878 return Some("");
879 }
880
881 // Fast rejection: delta must be longer than accumulated
882 if delta.len() <= accumulated.len() {
883 return None;
884 }
885
886 // Phase 1: Rolling hash check
887 let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
888 let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
889
890 // If hashes don't match, definitely not a snapshot
891 if accumulated_hash != delta_prefix_hash {
892 return None;
893 }
894
895 // Phase 2: KMP verification for exact match
896 let kmp = KMPMatcher::new(accumulated);
897 if let Some(pos) = kmp.find(delta) {
898 // Verify it's at position 0 (snapshot starts with accumulated)
899 if pos == 0 {
900 return Some(&delta[accumulated.len()..]);
901 }
902 }
903
904 // Hash collision or match not at start - not a snapshot
905 None
906 }
907
908 /// Check if delta is likely a snapshot using rolling hash only.
909 ///
910 /// This is a faster O(n) check that may have false positives due to
911 /// hash collisions. Use `extract_new_content` for verified results.
912 ///
913 /// # Arguments
914 /// * `delta` - The incoming delta to check
915 /// * `accumulated` - The previously accumulated content
916 ///
917 /// # Returns
918 /// * `true` - Delta may be a snapshot (hash matches)
919 /// * `false` - Delta is definitely not a snapshot (hash doesn't match)
920 pub fn is_likely_snapshot(delta: &str, accumulated: &str) -> bool {
921 // Handle identical content (duplicate delta)
922 if delta == accumulated {
923 return true;
924 }
925
926 // Delta must be longer than accumulated to be a snapshot
927 if delta.len() <= accumulated.len() {
928 return false;
929 }
930
931 let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
932 let delta_prefix_hash = RollingHashWindow::compute_hash(&delta[..accumulated.len()]);
933
934 accumulated_hash == delta_prefix_hash
935 }
936
937 /// Check if delta is likely a snapshot with strong overlap detection.
938 ///
939 /// This is an enhanced version of `is_likely_snapshot` that applies
940 /// strong overlap thresholds to prevent false positives on intentional
941 /// repetitions.
942 ///
943 /// # Strong Overlap Detection
944 ///
945 /// This method only returns `true` when:
946 /// - The overlap meets minimum character count threshold (default: 30 chars)
947 /// - The overlap meets minimum ratio threshold (default: 50% of delta)
948 /// - The overlap ends at a safe boundary (whitespace/punctuation/newline)
949 /// - Short chunks (< 20 chars) are only deduped if exact match
950 ///
951 /// # Arguments
952 /// * `delta` - The incoming delta to check
953 /// * `accumulated` - The previously accumulated content
954 ///
955 /// # Returns
956 /// * `true` - Delta is a snapshot meeting strong overlap criteria
957 /// * `false` - Delta is either genuine or overlap is too weak
958 pub fn is_likely_snapshot_with_thresholds(delta: &str, accumulated: &str) -> bool {
959 let thresholds = get_overlap_thresholds();
960
961 // Handle short chunks: only dedupe if exact match
962 if delta.len() < thresholds.short_chunk_threshold {
963 return delta == accumulated;
964 }
965
966 // Handle identical content (delta == accumulated)
967 // This is a snapshot where no new content is added
968 if delta == accumulated {
969 return true;
970 }
971
972 // Fast rejection: delta must be longer than accumulated
973 if delta.len() <= accumulated.len() {
974 return false;
975 }
976
977 // First check with basic rolling hash for quick rejection
978 if !Self::is_likely_snapshot(delta, accumulated) {
979 return false;
980 }
981
982 // Score the overlap to check if it meets strong overlap criteria
983 let score = score_overlap(delta, accumulated);
984
985 // Apply threshold checks
986 score.meets_thresholds(&thresholds)
987 }
988
989 /// Extract new content from a snapshot with strong overlap detection.
990 ///
991 /// This is an enhanced version of `extract_new_content` that only extracts
992 /// new content when the overlap meets strong overlap thresholds.
993 ///
994 /// # Arguments
995 /// * `delta` - The incoming delta to check
996 /// * `accumulated` - The previously accumulated content
997 ///
998 /// # Returns
999 /// * `Some(new_portion)` - The overlap meets thresholds, returns new portion
1000 /// * `None` - The overlap is too weak or not a snapshot
1001 pub fn extract_new_content_with_thresholds<'a>(
1002 delta: &'a str,
1003 accumulated: &str,
1004 ) -> Option<&'a str> {
1005 let thresholds = get_overlap_thresholds();
1006
1007 // Handle short chunks: only dedupe if exact match
1008 if delta.len() < thresholds.short_chunk_threshold {
1009 if delta == accumulated {
1010 return Some("");
1011 }
1012 return None;
1013 }
1014
1015 // Handle identical content
1016 if delta == accumulated {
1017 return Some("");
1018 }
1019
1020 // Fast rejection: delta must be longer than accumulated
1021 if delta.len() <= accumulated.len() {
1022 return None;
1023 }
1024
1025 // Score the overlap
1026 let score = score_overlap(delta, accumulated);
1027
1028 // Check if overlap meets thresholds
1029 if !score.meets_thresholds(&thresholds) {
1030 return None;
1031 }
1032
1033 // Extract new content using the overlap length from the score
1034 if score.char_count > 0 && delta.len() > score.char_count {
1035 Some(&delta[score.char_count..])
1036 } else {
1037 None
1038 }
1039 }
1040
1041 /// Clear all tracked content.
1042 pub fn clear(&mut self) {
1043 self.hash_window.clear();
1044 }
1045}
1046
1047#[cfg(test)]
1048mod tests {
1049 use super::*;
1050
1051 // Tests for RollingHashWindow
1052
1053 #[test]
1054 fn test_rolling_hash_compute_hash() {
1055 let hash1 = RollingHashWindow::compute_hash("Hello");
1056 let hash2 = RollingHashWindow::compute_hash("Hello");
1057 let hash3 = RollingHashWindow::compute_hash("World");
1058
1059 // Same input produces same hash
1060 assert_eq!(hash1, hash2);
1061 // Different input likely produces different hash
1062 assert_ne!(hash1, hash3);
1063 }
1064
1065 #[test]
1066 fn test_rolling_hash_add_content() {
1067 let mut window = RollingHashWindow::new();
1068 assert!(window.is_empty());
1069
1070 window.add_content("Hello");
1071 assert_eq!(window.len(), 5);
1072 assert!(!window.is_empty());
1073 }
1074
1075 #[test]
1076 fn test_rolling_hash_get_window_hashes() {
1077 let mut window = RollingHashWindow::new();
1078 window.add_content("HelloWorld");
1079
1080 // Get hashes for 5-character windows
1081 let hashes = window.get_window_hashes(5);
1082 assert_eq!(hashes.len(), 6); // "Hello", "elloW", "lloWo", "loWor", "oWorl", "World"
1083
1084 // Verify positions
1085 assert_eq!(hashes[0].0, 0); // First window starts at 0
1086 assert_eq!(hashes[5].0, 5); // Last window starts at 5
1087 }
1088
1089 #[test]
1090 fn test_rolling_hash_contains_hash() {
1091 let mut window = RollingHashWindow::new();
1092 window.add_content("HelloWorld");
1093
1094 let world_hash = RollingHashWindow::compute_hash("World");
1095 let xyz_hash = RollingHashWindow::compute_hash("XYZ");
1096
1097 // "World" exists in the content
1098 assert!(window.contains_hash(world_hash, 5).is_some());
1099 // "XYZ" doesn't exist
1100 assert!(window.contains_hash(xyz_hash, 3).is_none());
1101 }
1102
1103 #[test]
1104 fn test_rolling_hash_clear() {
1105 let mut window = RollingHashWindow::new();
1106 window.add_content("Hello");
1107 assert_eq!(window.len(), 5);
1108
1109 window.clear();
1110 assert!(window.is_empty());
1111 }
1112
1113 // Tests for KMPMatcher
1114
1115 #[test]
1116 fn test_kmp_find_pattern_exists() {
1117 let kmp = KMPMatcher::new("World");
1118 assert_eq!(kmp.find("Hello World"), Some(6));
1119 assert_eq!(kmp.find("WorldHello"), Some(0));
1120 }
1121
1122 #[test]
1123 fn test_kmp_find_pattern_not_exists() {
1124 let kmp = KMPMatcher::new("XYZ");
1125 assert_eq!(kmp.find("Hello World"), None);
1126 }
1127
1128 #[test]
1129 fn test_kmp_find_pattern_empty() {
1130 let kmp = KMPMatcher::new("");
1131 assert_eq!(kmp.find("Hello"), None);
1132 }
1133
1134 #[test]
1135 fn test_kmp_find_text_shorter_than_pattern() {
1136 let kmp = KMPMatcher::new("Hello World");
1137 assert_eq!(kmp.find("Hello"), None);
1138 }
1139
1140 #[test]
1141 fn test_kmp_find_all() {
1142 let kmp = KMPMatcher::new("ab");
1143 let positions = kmp.find_all("ababab");
1144 assert_eq!(positions, vec![0, 2, 4]);
1145 }
1146
1147 #[test]
1148 fn test_kmp_find_all_no_matches() {
1149 let kmp = KMPMatcher::new("xyz");
1150 let positions = kmp.find_all("abcabc");
1151 assert!(positions.is_empty());
1152 }
1153
1154 #[test]
1155 fn test_kmp_find_overlapping_patterns() {
1156 let kmp = KMPMatcher::new("aa");
1157 let positions = kmp.find_all("aaa");
1158 assert_eq!(positions, vec![0, 1]);
1159 }
1160
1161 #[test]
1162 fn test_kmp_failure_function() {
1163 let kmp = KMPMatcher::new("abab");
1164 // lps = [0, 0, 1, 2]
1165 assert_eq!(kmp.failure, vec![0, 0, 1, 2]);
1166 }
1167
1168 #[test]
1169 fn test_kmp_pattern_len() {
1170 let kmp = KMPMatcher::new("Hello");
1171 assert_eq!(kmp.pattern_len(), 5);
1172 }
1173
1174 #[test]
1175 fn test_kmp_is_empty() {
1176 let kmp = KMPMatcher::new("");
1177 assert!(kmp.is_empty());
1178
1179 let kmp = KMPMatcher::new("Hello");
1180 assert!(!kmp.is_empty());
1181 }
1182
1183 // Tests for DeltaDeduplicator
1184
1185 #[test]
1186 fn test_dedup_extract_new_content_snapshot() {
1187 let mut dedup = DeltaDeduplicator::new();
1188 dedup.add_accumulated("Hello");
1189
1190 // Snapshot: "Hello World"
1191 assert_eq!(
1192 DeltaDeduplicator::extract_new_content("Hello World", "Hello"),
1193 Some(" World")
1194 );
1195 }
1196
1197 #[test]
1198 fn test_dedup_extract_new_content_genuine_delta() {
1199 let mut dedup = DeltaDeduplicator::new();
1200 dedup.add_accumulated("Hello");
1201
1202 // Genuine delta: " World"
1203 assert_eq!(
1204 DeltaDeduplicator::extract_new_content(" World", "Hello"),
1205 None
1206 );
1207 }
1208
1209 #[test]
1210 fn test_dedup_extract_new_content_shorter_delta() {
1211 let mut dedup = DeltaDeduplicator::new();
1212 dedup.add_accumulated("Hello World");
1213
1214 // Delta shorter than accumulated - can't be snapshot
1215 assert_eq!(
1216 DeltaDeduplicator::extract_new_content("Hello", "Hello World"),
1217 None
1218 );
1219 }
1220
1221 #[test]
1222 fn test_dedup_extract_new_content_equal_length() {
1223 let mut dedup = DeltaDeduplicator::new();
1224 dedup.add_accumulated("Hello");
1225
1226 // Equal length - if identical, return empty string (no new content)
1227 assert_eq!(
1228 DeltaDeduplicator::extract_new_content("Hello", "Hello"),
1229 Some("")
1230 );
1231
1232 // Equal length but different content - not a snapshot
1233 assert_eq!(
1234 DeltaDeduplicator::extract_new_content("World", "Hello"),
1235 None
1236 );
1237 }
1238
1239 #[test]
1240 fn test_dedup_extract_new_content_no_match() {
1241 let mut dedup = DeltaDeduplicator::new();
1242 dedup.add_accumulated("Hello");
1243
1244 // Different content entirely
1245 assert_eq!(
1246 DeltaDeduplicator::extract_new_content("World", "Hello"),
1247 None
1248 );
1249 }
1250
1251 #[test]
1252 fn test_dedup_is_likely_snapshot() {
1253 let mut dedup = DeltaDeduplicator::new();
1254 dedup.add_accumulated("Hello");
1255
1256 // Actual snapshot
1257 assert!(DeltaDeduplicator::is_likely_snapshot(
1258 "Hello World",
1259 "Hello"
1260 ));
1261
1262 // Not a snapshot
1263 assert!(!DeltaDeduplicator::is_likely_snapshot(" World", "Hello"));
1264 }
1265
1266 #[test]
1267 fn test_dedup_clear() {
1268 let mut dedup = DeltaDeduplicator::new();
1269 dedup.add_accumulated("Hello");
1270 assert!(!dedup.hash_window.is_empty());
1271
1272 dedup.clear();
1273 assert!(dedup.hash_window.is_empty());
1274 }
1275
1276 // Integration tests
1277
1278 #[test]
1279 fn test_dedup_two_phase_algorithm() {
1280 let mut dedup = DeltaDeduplicator::new();
1281 dedup.add_accumulated("The quick brown fox");
1282
1283 // Phase 1: Rolling hash will match
1284 assert!(DeltaDeduplicator::is_likely_snapshot(
1285 "The quick brown fox jumps",
1286 "The quick brown fox"
1287 ));
1288
1289 // Phase 2: KMP verification confirms and extracts new portion
1290 assert_eq!(
1291 DeltaDeduplicator::extract_new_content(
1292 "The quick brown fox jumps",
1293 "The quick brown fox"
1294 ),
1295 Some(" jumps")
1296 );
1297 }
1298
1299 #[test]
1300 fn test_dedup_handles_unicode() {
1301 let mut dedup = DeltaDeduplicator::new();
1302 dedup.add_accumulated("Hello 世界");
1303
1304 // Should handle UTF-8 correctly
1305 assert_eq!(
1306 DeltaDeduplicator::extract_new_content("Hello 世界!", "Hello 世界"),
1307 Some("!")
1308 );
1309 }
1310
1311 #[test]
1312 fn test_dedup_empty_accumulated() {
1313 // No accumulated content
1314
1315 // Any delta is genuine
1316 assert_eq!(DeltaDeduplicator::extract_new_content("Hello", ""), None);
1317 }
1318
1319 // Tests for Strong Overlap Detection with Thresholds
1320
1321 #[test]
1322 fn test_strong_overlap_meets_char_threshold() {
1323 // Overlap of 30+ chars with safe boundary should pass
1324 let accumulated = "The quick brown fox jumps over the lazy";
1325 let delta = "The quick brown fox jumps over the lazy dog!";
1326
1327 assert!(
1328 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1329 "Should detect snapshot with 30+ char overlap"
1330 );
1331 }
1332
1333 #[test]
1334 fn test_strong_overlap_meets_ratio_threshold() {
1335 // Overlap is 50%+ of delta length
1336 let accumulated = "The quick brown fox jumps over the lazy dog. ";
1337 let delta = "The quick brown fox jumps over the lazy dog. And more!";
1338
1339 assert!(
1340 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1341 "Should detect snapshot when overlap is 50%+ of delta"
1342 );
1343 }
1344
1345 #[test]
1346 fn test_strong_overlap_fails_char_threshold() {
1347 // Overlap < 30 chars, even if ratio is good
1348 let accumulated = "Hello";
1349 let delta = "Hello World!";
1350
1351 assert!(
1352 !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1353 "Should NOT detect snapshot when overlap < 30 chars"
1354 );
1355 }
1356
1357 #[test]
1358 fn test_strong_overlap_fails_ratio_threshold() {
1359 // Overlap < 50% of delta, even if char count is good
1360 let accumulated = "The quick brown fox jumps over the lazy dog. ";
1361 let delta = "The quick brown fox jumps over the lazy dog. And then a whole lot more text follows to make the ratio low!";
1362
1363 assert!(
1364 !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1365 "Should NOT detect snapshot when overlap < 50% of delta"
1366 );
1367 }
1368
1369 #[test]
1370 fn test_boundary_check_whitespace() {
1371 // Overlap ends at space (safe boundary)
1372 let accumulated = "The quick brown fox jumps over the lazy dog and ";
1373 let delta = "The quick brown fox jumps over the lazy dog and then more!";
1374
1375 assert!(
1376 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1377 "Should detect snapshot when overlap ends at whitespace"
1378 );
1379 }
1380
1381 #[test]
1382 fn test_boundary_check_punctuation() {
1383 // Overlap ends at punctuation (safe boundary)
1384 let accumulated = "The quick brown fox jumps over the lazy dog.";
1385 let delta = "The quick brown fox jumps over the lazy dog. How are you?";
1386
1387 assert!(
1388 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1389 "Should detect snapshot when overlap ends at punctuation"
1390 );
1391 }
1392
1393 #[test]
1394 fn test_boundary_check_mid_word_fails() {
1395 // Overlap ends mid-word (unsafe boundary)
1396 let accumulated = "Hello";
1397 let delta = "HelloWorld! This is a lot of text to ensure we have enough characters.";
1398
1399 // Even though we have 30+ chars, the boundary check should fail
1400 // because the overlap ends mid-word (at 'W' of "World")
1401 assert!(
1402 !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1403 "Should NOT detect snapshot when overlap ends mid-word"
1404 );
1405 }
1406
1407 #[test]
1408 fn test_short_chunk_never_deduped() {
1409 // Short chunks (< 20 chars) never deduped unless exact match
1410 let accumulated = "Hello";
1411 let delta = "Hello World!";
1412
1413 // Even though "Hello World!" starts with "Hello", it's < 20 chars total
1414 // and not an exact match, so it should NOT be deduped
1415 assert!(
1416 !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1417 "Short chunks should NOT be deduped unless exact match"
1418 );
1419 }
1420
1421 #[test]
1422 fn test_short_chunk_exact_match_deduped() {
1423 // Short chunks (< 20 chars) ARE deduped if exact match
1424 let accumulated = "Hello";
1425 let delta = "Hello";
1426
1427 assert!(
1428 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1429 "Short chunk exact match SHOULD be deduped"
1430 );
1431 }
1432
1433 #[test]
1434 fn test_extract_new_content_with_thresholds_strong_overlap() {
1435 let accumulated = "The quick brown fox jumps over the lazy dog. ";
1436 let delta = "The quick brown fox jumps over the lazy dog. More content here!";
1437
1438 let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1439 assert_eq!(result, Some("More content here!"));
1440 }
1441
1442 #[test]
1443 fn test_extract_new_content_with_thresholds_weak_overlap() {
1444 // Weak overlap (< 30 chars) should return None
1445 let accumulated = "Hello";
1446 let delta = "Hello World! This is more content to exceed thresholds.";
1447
1448 let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1449 assert_eq!(result, None, "Weak overlap should return None");
1450 }
1451
1452 #[test]
1453 fn test_extract_new_content_with_thresholds_short_chunk() {
1454 // Short chunk that's not an exact match should return None
1455 let accumulated = "Hi";
1456 let delta = "Hi there!";
1457
1458 let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1459 assert_eq!(
1460 result, None,
1461 "Short chunk should return None unless exact match"
1462 );
1463 }
1464
1465 #[test]
1466 fn test_extract_new_content_with_thresholds_short_chunk_exact() {
1467 // Short chunk exact match should return empty string
1468 let accumulated = "Hello";
1469 let delta = "Hello";
1470
1471 let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1472 assert_eq!(result, Some(""));
1473 }
1474
1475 #[test]
1476 fn test_strong_overlap_with_unicode() {
1477 // Test with Unicode characters
1478 let accumulated = "Hello 世界! This is a long enough string to meet thresholds. ";
1479 let delta = "Hello 世界! This is a long enough string to meet thresholds. More!";
1480
1481 assert!(
1482 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1483 "Should handle Unicode correctly with strong overlap"
1484 );
1485
1486 let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1487 assert_eq!(result, Some("More!"));
1488 }
1489
1490 #[test]
1491 fn test_intentional_repetition_not_deduped() {
1492 // Simulate intentional repetition (e.g., "Hello World! Hello World!")
1493 // where the overlap is small relative to the total delta
1494 let accumulated = "Hello World!";
1495 let delta = "Hello World! Hello World! This is a lot of additional content to make the overlap ratio low enough that it won't be deduplicated!";
1496
1497 assert!(
1498 !DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1499 "Intentional repetition should NOT be deduped when overlap ratio is low"
1500 );
1501 }
1502
1503 #[test]
1504 fn test_snapshot_strong_overlap_deduped() {
1505 // Real snapshot scenario: agent sends full accumulated + new content
1506 let accumulated = "The quick brown fox jumps over the lazy dog. ";
1507 let delta = "The quick brown fox jumps over the lazy dog. And then some more!";
1508
1509 assert!(
1510 DeltaDeduplicator::is_likely_snapshot_with_thresholds(delta, accumulated),
1511 "Actual snapshot SHOULD be detected and deduped"
1512 );
1513
1514 let result = DeltaDeduplicator::extract_new_content_with_thresholds(delta, accumulated);
1515 assert_eq!(result, Some("And then some more!"));
1516 }
1517
1518 #[test]
1519 fn test_overlap_score_meets_thresholds() {
1520 let thresholds = OverlapThresholds::default();
1521
1522 // Strong overlap: 30+ chars, 50%+ ratio, safe boundary
1523 let score = OverlapScore {
1524 char_count: 50,
1525 ratio_met: true,
1526 is_safe_boundary: true,
1527 };
1528
1529 assert!(score.meets_thresholds(&thresholds));
1530 }
1531
1532 #[test]
1533 fn test_overlap_score_fails_char_count() {
1534 let thresholds = OverlapThresholds::default();
1535
1536 // Char count too low
1537 let score = OverlapScore {
1538 char_count: 20,
1539 ratio_met: true,
1540 is_safe_boundary: true,
1541 };
1542
1543 assert!(!score.meets_thresholds(&thresholds));
1544 }
1545
1546 #[test]
1547 fn test_overlap_score_fails_ratio() {
1548 let thresholds = OverlapThresholds::default();
1549
1550 // Ratio too low (met = false)
1551 let score = OverlapScore {
1552 char_count: 50,
1553 ratio_met: false,
1554 is_safe_boundary: true,
1555 };
1556
1557 assert!(!score.meets_thresholds(&thresholds));
1558 }
1559
1560 #[test]
1561 fn test_overlap_score_fails_boundary() {
1562 let thresholds = OverlapThresholds::default();
1563
1564 // Boundary not safe
1565 let score = OverlapScore {
1566 char_count: 50,
1567 ratio_met: true,
1568 is_safe_boundary: false,
1569 };
1570
1571 assert!(!score.meets_thresholds(&thresholds));
1572 }
1573
1574 #[test]
1575 fn test_is_short_delta() {
1576 let thresholds = OverlapThresholds::default();
1577
1578 assert!(OverlapScore::is_short_delta(10, &thresholds));
1579 assert!(!OverlapScore::is_short_delta(30, &thresholds));
1580 }
1581
1582 #[test]
1583 fn test_is_safe_boundary_whitespace() {
1584 assert!(is_safe_boundary("Hello World", 5));
1585 assert!(is_safe_boundary("Hello\nWorld", 5));
1586 assert!(is_safe_boundary("Hello\tWorld", 5));
1587 }
1588
1589 #[test]
1590 fn test_is_safe_boundary_punctuation() {
1591 assert!(is_safe_boundary("Hello, World!", 12)); // After "!"
1592 assert!(is_safe_boundary("Hello. World", 5)); // After "."
1593 assert!(is_safe_boundary("Hello; World", 5)); // After ";"
1594 }
1595
1596 #[test]
1597 fn test_is_safe_boundary_end_of_string() {
1598 assert!(is_safe_boundary("Hello", 5));
1599 assert!(is_safe_boundary("Hello", 10)); // Beyond length
1600 }
1601
1602 #[test]
1603 fn test_is_safe_boundary_mid_word() {
1604 assert!(!is_safe_boundary("HelloWorld", 5));
1605 assert!(!is_safe_boundary("Testing", 3));
1606 }
1607
1608 #[test]
1609 fn test_score_overlap_with_snapshot() {
1610 let accumulated = "The quick brown fox jumps over the lazy dog. ";
1611 let delta = "The quick brown fox jumps over the lazy dog. And more!";
1612
1613 let score = score_overlap(delta, accumulated);
1614
1615 assert!(score.char_count > 30);
1616 assert!(score.ratio_met);
1617 assert!(score.is_safe_boundary);
1618 }
1619
1620 #[test]
1621 fn test_score_overlap_with_genuine_delta() {
1622 let accumulated = "Hello";
1623 let delta = " World";
1624
1625 let score = score_overlap(delta, accumulated);
1626
1627 assert_eq!(score.char_count, 0);
1628 }
1629
1630 #[test]
1631 fn test_get_overlap_thresholds_default() {
1632 let thresholds = get_overlap_thresholds();
1633
1634 assert_eq!(thresholds.min_overlap_chars, 30);
1635 assert_eq!(thresholds.short_chunk_threshold, 20);
1636 assert_eq!(thresholds.consecutive_duplicate_threshold, 3);
1637 }
1638
1639 #[test]
1640 fn test_consecutive_duplicate_threshold_default() {
1641 let thresholds = OverlapThresholds::default();
1642 assert_eq!(
1643 thresholds.consecutive_duplicate_threshold, DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD,
1644 "Default consecutive_duplicate_threshold should match constant"
1645 );
1646 assert_eq!(
1647 thresholds.consecutive_duplicate_threshold, 3,
1648 "Default consecutive_duplicate_threshold should be 3"
1649 );
1650 }
1651
1652 /// Mock environment for testing threshold parsing.
1653 struct MockThresholdEnv {
1654 vars: std::collections::HashMap<String, String>,
1655 }
1656
1657 impl MockThresholdEnv {
1658 fn new() -> Self {
1659 Self {
1660 vars: std::collections::HashMap::new(),
1661 }
1662 }
1663
1664 fn with_var(mut self, key: &str, value: &str) -> Self {
1665 self.vars.insert(key.to_string(), value.to_string());
1666 self
1667 }
1668 }
1669
1670 impl ThresholdEnvironment for MockThresholdEnv {
1671 fn get_var(&self, name: &str) -> Option<String> {
1672 self.vars.get(name).cloned()
1673 }
1674 }
1675
1676 #[test]
1677 fn test_threshold_env_parsing_min_overlap_chars() {
1678 // Test valid custom value
1679 let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_MIN_OVERLAP_CHARS", "50");
1680 let thresholds = get_overlap_thresholds_with_env(&env);
1681 assert_eq!(thresholds.min_overlap_chars, 50);
1682
1683 // Test out of range (too low) - should use default
1684 let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_MIN_OVERLAP_CHARS", "5");
1685 let thresholds = get_overlap_thresholds_with_env(&env);
1686 assert_eq!(thresholds.min_overlap_chars, DEFAULT_MIN_OVERLAP_CHARS);
1687
1688 // Test out of range (too high) - should use default
1689 let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_MIN_OVERLAP_CHARS", "200");
1690 let thresholds = get_overlap_thresholds_with_env(&env);
1691 assert_eq!(thresholds.min_overlap_chars, DEFAULT_MIN_OVERLAP_CHARS);
1692
1693 // Test invalid value - should use default
1694 let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_MIN_OVERLAP_CHARS", "invalid");
1695 let thresholds = get_overlap_thresholds_with_env(&env);
1696 assert_eq!(thresholds.min_overlap_chars, DEFAULT_MIN_OVERLAP_CHARS);
1697 }
1698
1699 #[test]
1700 fn test_threshold_env_parsing_short_chunk_threshold() {
1701 // Test valid custom value
1702 let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_SHORT_CHUNK_THRESHOLD", "10");
1703 let thresholds = get_overlap_thresholds_with_env(&env);
1704 assert_eq!(thresholds.short_chunk_threshold, 10);
1705
1706 // Test boundary values
1707 let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_SHORT_CHUNK_THRESHOLD", "5");
1708 let thresholds = get_overlap_thresholds_with_env(&env);
1709 assert_eq!(thresholds.short_chunk_threshold, 5); // Min boundary
1710
1711 let env = MockThresholdEnv::new().with_var("RALPH_STREAMING_SHORT_CHUNK_THRESHOLD", "50");
1712 let thresholds = get_overlap_thresholds_with_env(&env);
1713 assert_eq!(thresholds.short_chunk_threshold, 50); // Max boundary
1714 }
1715
1716 #[test]
1717 fn test_threshold_env_parsing_consecutive_duplicate() {
1718 // Test valid custom value
1719 let env = MockThresholdEnv::new()
1720 .with_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", "5");
1721 let thresholds = get_overlap_thresholds_with_env(&env);
1722 assert_eq!(thresholds.consecutive_duplicate_threshold, 5);
1723
1724 // Test min boundary
1725 let env = MockThresholdEnv::new()
1726 .with_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", "2");
1727 let thresholds = get_overlap_thresholds_with_env(&env);
1728 assert_eq!(thresholds.consecutive_duplicate_threshold, 2);
1729
1730 // Test max boundary
1731 let env = MockThresholdEnv::new()
1732 .with_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", "10");
1733 let thresholds = get_overlap_thresholds_with_env(&env);
1734 assert_eq!(thresholds.consecutive_duplicate_threshold, 10);
1735
1736 // Test out of range - should use default
1737 let env = MockThresholdEnv::new()
1738 .with_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", "1");
1739 let thresholds = get_overlap_thresholds_with_env(&env);
1740 assert_eq!(
1741 thresholds.consecutive_duplicate_threshold,
1742 DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD
1743 );
1744
1745 let env = MockThresholdEnv::new()
1746 .with_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD", "15");
1747 let thresholds = get_overlap_thresholds_with_env(&env);
1748 assert_eq!(
1749 thresholds.consecutive_duplicate_threshold,
1750 DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD
1751 );
1752 }
1753
1754 #[test]
1755 fn test_threshold_env_empty_returns_defaults() {
1756 let env = MockThresholdEnv::new();
1757 let thresholds = get_overlap_thresholds_with_env(&env);
1758 assert_eq!(thresholds.min_overlap_chars, DEFAULT_MIN_OVERLAP_CHARS);
1759 assert_eq!(
1760 thresholds.short_chunk_threshold,
1761 DEFAULT_SHORT_CHUNK_THRESHOLD
1762 );
1763 assert_eq!(
1764 thresholds.consecutive_duplicate_threshold,
1765 DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD
1766 );
1767 }
1768
1769 #[test]
1770 fn test_threshold_bounds_constants() {
1771 // Verify bounds constants are correct (pure constant tests, no env var manipulation)
1772 assert_eq!(
1773 MIN_CONSECUTIVE_DUPLICATE_THRESHOLD, 2,
1774 "Minimum threshold should be 2"
1775 );
1776 assert_eq!(
1777 MAX_CONSECUTIVE_DUPLICATE_THRESHOLD, 10,
1778 "Maximum threshold should be 10"
1779 );
1780 assert_eq!(MIN_MIN_OVERLAP_CHARS, 10);
1781 assert_eq!(MAX_MIN_OVERLAP_CHARS, 100);
1782 assert_eq!(MIN_SHORT_CHUNK_THRESHOLD, 5);
1783 assert_eq!(MAX_SHORT_CHUNK_THRESHOLD, 50);
1784 }
1785}