ralph_workflow/json_parser/deduplication/thresholds.rs
1// Threshold configuration and overlap detection for deduplication.
2//
3// Contains:
4// - ThresholdEnvironment trait for testability
5// - Configuration constants
6// - OverlapThresholds struct
7// - Boundary detection (is_safe_boundary)
8// - OverlapScore struct and scoring functions
9
10use std::sync::OnceLock;
11
12// ============================================================================
13// Environment Trait for Testability
14// ============================================================================
15
/// Source of environment-variable values used for threshold configuration.
///
/// Putting the lookup behind a trait lets tests inject their own values
/// instead of mutating the real (process-global) environment.
pub trait ThresholdEnvironment {
    /// Look up the variable called `name`.
    ///
    /// Returns `Some(value)` when the implementation has a value for `name`,
    /// and `None` otherwise.
    fn get_var(&self, name: &str) -> Option<String>;
}
23
24/// Production implementation that reads from actual environment.
25pub struct RealThresholdEnvironment;
26
27impl ThresholdEnvironment for RealThresholdEnvironment {
28 fn get_var(&self, name: &str) -> Option<String> {
29 std::env::var(name).ok()
30 }
31}
32
33// ============================================================================
34// Configuration Constants for Strong Overlap Detection
35// ============================================================================
36
/// Fallback for `min_overlap_chars` when no valid override is configured.
///
/// An overlap shorter than this is not considered for deduplication, which
/// guards against false positives from short accidental matches such as
/// "the" or "and".
const DEFAULT_MIN_OVERLAP_CHARS: usize = 30;

/// Minimum overlap-to-delta ratio, expressed as a whole percentage (50 = 50%).
///
/// Kept as an integer so the ratio test can be done by cross-multiplication
/// without any floating-point arithmetic.
const MIN_OVERLAP_RATIO_INT: usize = 50;

/// Fallback for `short_chunk_threshold` when no valid override is configured.
///
/// A chunk whose length is strictly below this value counts as "short".
/// Design intent (enforced by deduplication logic outside this file): short
/// chunks such as ".", "\n" or "Ok" are legitimately repeated and should only
/// be dropped when they exactly match the accumulated content.
const DEFAULT_SHORT_CHUNK_THRESHOLD: usize = 20;

/// Fallback for `consecutive_duplicate_threshold` when no valid override is
/// configured.
///
/// Design intent (enforced by deduplication logic outside this file): the same
/// chunk arriving this many times in a row is treated as a resend glitch and
/// dropped.
const DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 3;

/// Smallest accepted override for the minimum-overlap character count.
const MIN_MIN_OVERLAP_CHARS: usize = 10;

/// Largest accepted override for the minimum-overlap character count.
const MAX_MIN_OVERLAP_CHARS: usize = 100;

/// Smallest accepted override for the short-chunk threshold.
const MIN_SHORT_CHUNK_THRESHOLD: usize = 5;

/// Largest accepted override for the short-chunk threshold.
const MAX_SHORT_CHUNK_THRESHOLD: usize = 50;

/// Smallest accepted override for the consecutive-duplicate threshold.
const MIN_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 2;

/// Largest accepted override for the consecutive-duplicate threshold.
const MAX_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 10;
77
/// Tunable thresholds for strong overlap detection.
///
/// These values decide when an overlap between an incoming delta and the
/// accumulated content is "strong enough" to warrant deduplication.
///
/// The type is a small `Copy` value and supports equality comparison, so
/// resolved configurations can be compared directly (for example with
/// `assert_eq!` in tests).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OverlapThresholds {
    /// Minimum overlap length required before an overlap is considered.
    pub min_overlap_chars: usize,
    /// Length below which a chunk is considered "short".
    pub short_chunk_threshold: usize,
    /// Number of consecutive duplicates before aggressive dedupe.
    pub consecutive_duplicate_threshold: usize,
}
91
92impl Default for OverlapThresholds {
93 fn default() -> Self {
94 Self {
95 min_overlap_chars: DEFAULT_MIN_OVERLAP_CHARS,
96 short_chunk_threshold: DEFAULT_SHORT_CHUNK_THRESHOLD,
97 consecutive_duplicate_threshold: DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD,
98 }
99 }
100}
101
102/// Testable variant that accepts an environment trait for dependency injection.
103///
104/// This allows tests to mock environment variables without global state pollution.
105///
106/// Reads the following environment variables:
107/// - `RALPH_STREAMING_MIN_OVERLAP_CHARS`: Minimum overlap characters (default: 30, range: 10-100)
108/// - `RALPH_STREAMING_SHORT_CHUNK_THRESHOLD`: Short chunk threshold (default: 20, range: 5-50)
109/// - `RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD`: Consecutive duplicate threshold (default: 3, range: 2-10)
110pub fn get_overlap_thresholds_with_env(env: &dyn ThresholdEnvironment) -> OverlapThresholds {
111 let min_overlap_chars = env
112 .get_var("RALPH_STREAMING_MIN_OVERLAP_CHARS")
113 .and_then(|s| s.parse::<usize>().ok())
114 .and_then(|v| {
115 if (MIN_MIN_OVERLAP_CHARS..=MAX_MIN_OVERLAP_CHARS).contains(&v) {
116 Some(v)
117 } else {
118 None
119 }
120 })
121 .unwrap_or(DEFAULT_MIN_OVERLAP_CHARS);
122
123 let short_chunk_threshold = env
124 .get_var("RALPH_STREAMING_SHORT_CHUNK_THRESHOLD")
125 .and_then(|s| s.parse::<usize>().ok())
126 .and_then(|v| {
127 if (MIN_SHORT_CHUNK_THRESHOLD..=MAX_SHORT_CHUNK_THRESHOLD).contains(&v) {
128 Some(v)
129 } else {
130 None
131 }
132 })
133 .unwrap_or(DEFAULT_SHORT_CHUNK_THRESHOLD);
134
135 let consecutive_duplicate_threshold = env
136 .get_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD")
137 .and_then(|s| s.parse::<usize>().ok())
138 .and_then(|v| {
139 if (MIN_CONSECUTIVE_DUPLICATE_THRESHOLD..=MAX_CONSECUTIVE_DUPLICATE_THRESHOLD)
140 .contains(&v)
141 {
142 Some(v)
143 } else {
144 None
145 }
146 })
147 .unwrap_or(DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD);
148
149 OverlapThresholds {
150 min_overlap_chars,
151 short_chunk_threshold,
152 consecutive_duplicate_threshold,
153 }
154}
155
156pub fn get_overlap_thresholds() -> OverlapThresholds {
157 static THRESHOLDS: OnceLock<OverlapThresholds> = OnceLock::new();
158 *THRESHOLDS.get_or_init(|| get_overlap_thresholds_with_env(&RealThresholdEnvironment))
159}
160
161// ============================================================================
162// Boundary Detection
163// ============================================================================
164
/// Check whether byte offset `pos` in `text` is a safe place to end an overlap.
///
/// A "safe boundary" is a natural break point in the text. The position is
/// safe when:
/// - it is at or past the end of `text`, or
/// - the character starting at `pos` is whitespace (space, tab, newline, etc.),
///   ASCII punctuation (`.`, `,`, `!`, `?`, `;`, `:`, etc.) or an ASCII control
///   character.
///
/// Everything else is unsafe, including a `pos` that falls inside a multi-byte
/// UTF-8 character. Rejecting such positions keeps deduplication from
/// splitting words or tokens mid-way, which could cause incorrect rendering of
/// intentional repetitions.
///
/// # Arguments
/// * `text` - The text to check
/// * `pos` - The position in the text (byte offset)
///
/// # Returns
/// * `true` - The position is at a safe boundary for deduplication
/// * `false` - The position is NOT at a safe boundary (mid-word, mid-character)
///
/// # Examples
///
/// ```ignore
/// // Safe: end of string
/// assert!(is_safe_boundary("Hello World", 11));
///
/// // Safe: the character at the position is a space
/// assert!(is_safe_boundary("Hello World", 5));
///
/// // Safe: the character at the position is punctuation ("!")
/// assert!(is_safe_boundary("Hello, World!", 12));
///
/// // Unsafe: the position is in the middle of a word (at "W")
/// assert!(!is_safe_boundary("HelloWorld", 5));
/// ```
fn is_safe_boundary(text: &str, pos: usize) -> bool {
    // Nothing follows the position, so nothing can be split.
    if pos >= text.len() {
        return true;
    }

    // `str::get` returns `None` when `pos` is not on a character boundary,
    // where plain indexing (`text[pos..]`) would panic. A position inside a
    // multi-byte character is never a safe place to cut.
    let Some(rest) = text.get(pos..) else {
        return false;
    };

    rest.chars()
        .next()
        .is_none_or(|c| c.is_whitespace() || c.is_ascii_punctuation() || c.is_ascii_control())
}
208
209// ============================================================================
210// Overlap Quality Scoring
211// ============================================================================
212
/// Measured "strength" of an overlap between a delta and accumulated content.
///
/// The score bundles several independent metrics; together they decide
/// whether the overlap is strong enough to warrant deduplication.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OverlapScore {
    /// Size of the overlap.
    ///
    /// Note: `score_overlap` fills this from `str::len`, so it is a UTF-8 byte
    /// length; that equals the character count only for ASCII text.
    pub char_count: usize,
    /// `true` when the overlap is large enough relative to the delta, i.e. the
    /// minimum overlap ratio is reached.
    pub ratio_met: bool,
    /// `true` when the overlap ends at a safe boundary.
    pub is_safe_boundary: bool,
}
226
227impl OverlapScore {
228 /// Check if this overlap meets all thresholds for deduplication.
229 ///
230 /// # Arguments
231 /// * `thresholds` - The overlap thresholds to check against
232 ///
233 /// # Returns
234 /// * `true` - The overlap is strong enough for deduplication
235 /// * `false` - The overlap is too weak
236 #[must_use]
237 pub const fn meets_thresholds(&self, thresholds: &OverlapThresholds) -> bool {
238 self.char_count >= thresholds.min_overlap_chars && self.ratio_met && self.is_safe_boundary
239 }
240
241 /// Check if the delta is short (below short chunk threshold).
242 ///
243 /// # Arguments
244 /// * `delta_len` - The length of the delta
245 /// * `thresholds` - The overlap thresholds
246 ///
247 /// # Returns
248 /// * `true` - The delta is considered short
249 /// * `false` - The delta is normal length
250 #[must_use]
251 #[cfg(test)]
252 pub const fn is_short_delta(delta_len: usize, thresholds: &OverlapThresholds) -> bool {
253 delta_len < thresholds.short_chunk_threshold
254 }
255}
256
257/// Score the quality of an overlap between delta and accumulated content.
258///
259/// This function computes multiple metrics about an overlap to determine
260/// if it's strong enough to warrant deduplication.
261///
262/// # Arguments
263/// * `delta` - The incoming delta
264/// * `accumulated` - The previously accumulated content
265///
266/// # Returns
267/// An `OverlapScore` containing:
268/// - `char_count`: The length of the overlap in characters
269/// - `ratio`: The overlap as a fraction of delta length
270/// - `is_safe_boundary`: Whether the overlap ends at a safe boundary
271///
272/// # Examples
273///
274/// ```ignore
275/// // Strong overlap (30+ chars, 50%+ ratio, safe boundary)
276/// let score = score_overlap("Hello World! More text here", "Hello World!");
277/// assert!(score.char_count >= 30);
278/// assert!(score.ratio_met);
279/// assert!(score.is_safe_boundary);
280/// ```
281pub(super) fn score_overlap(delta: &str, accumulated: &str) -> OverlapScore {
282 // Check if delta starts with accumulated (snapshot detection)
283 let overlap_len = if delta.starts_with(accumulated) {
284 accumulated.len()
285 } else {
286 0
287 };
288
289 // Calculate ratio as integer to avoid floating point precision issues
290 // We'll compare overlap * 100 >= delta * MIN_OVERLAP_RATIO_INT
291 // This avoids f64 casting entirely
292 let ratio_met = if delta.is_empty() {
293 false
294 } else {
295 // Check if overlap/delta >= MIN_OVERLAP_RATIO without floating point
296 // By cross-multiplying: overlap * 100 >= delta * MIN_OVERLAP_RATIO_INT
297 let overlap_scaled = overlap_len.saturating_mul(100);
298 let threshold = delta.len().saturating_mul(MIN_OVERLAP_RATIO_INT);
299 overlap_scaled >= threshold
300 };
301
302 // Check if the accumulated string ends at a safe boundary
303 // This is important because we don't want to dedupe if the accumulated
304 // string ends mid-word (e.g., accumulated="Hello" and delta="HelloWorld")
305 let is_safe_boundary = if overlap_len > 0 {
306 // Check if the last character of accumulated is a safe boundary
307 is_safe_boundary(accumulated, accumulated.len())
308 } else {
309 false
310 };
311
312 OverlapScore {
313 char_count: overlap_len,
314 ratio_met,
315 is_safe_boundary,
316 }
317}