// ralph_workflow/json_parser/deduplication/thresholds.rs
//
// Threshold configuration and overlap detection for deduplication.
//
// Contains:
// - ThresholdEnvironment trait for testability
// - Configuration constants
// - OverlapThresholds struct
// - Boundary detection (is_safe_boundary)
// - OverlapScore struct and scoring functions
9
/// Source of environment-variable values used when loading thresholds.
///
/// Putting the lookup behind a trait lets tests inject fixed values instead of
/// mutating the real (global) process environment.
pub trait ThresholdEnvironment {
    /// Look up the environment variable called `name`.
    ///
    /// Returns `Some(value)` when a value is available and `None` otherwise.
    fn get_var(&self, name: &str) -> Option<String>;
}
17
18/// Production implementation that reads from actual environment.
19pub struct RealThresholdEnvironment;
20
21impl ThresholdEnvironment for RealThresholdEnvironment {
22 fn get_var(&self, name: &str) -> Option<String> {
23 std::env::var(name).ok()
24 }
25}
26
// ============================================================================
// Configuration Constants for Strong Overlap Detection
// ============================================================================
30
/// Minimum overlap length used when no valid override is configured.
///
/// An overlap shorter than this is never treated as a deduplication candidate,
/// which filters out short accidental matches such as "the" or "and".
const DEFAULT_MIN_OVERLAP_CHARS: usize = 30;

/// Minimum overlap ratio, stored as a whole-number percentage (50 means 50%).
///
/// Keeping it as an integer lets the ratio test be done with integer arithmetic
/// only, avoiding floating point.
const MIN_OVERLAP_RATIO_INT: usize = 50;

/// Length below which a chunk counts as "short" when no valid override is configured.
///
/// Short chunks are never deduplicated unless they match the accumulated content
/// exactly, so legitimately repeated tokens such as ".", "\n" or "Ok" survive.
const DEFAULT_SHORT_CHUNK_THRESHOLD: usize = 20;

/// Consecutive-duplicate limit used when no valid override is configured.
///
/// When the exact same chunk arrives this many times in a row it is treated as a
/// resend glitch and dropped entirely.
const DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 3;

/// Lower bound (inclusive) accepted for a configured `MIN_OVERLAP_CHARS`.
const MIN_MIN_OVERLAP_CHARS: usize = 10;

/// Upper bound (inclusive) accepted for a configured `MIN_OVERLAP_CHARS`.
const MAX_MIN_OVERLAP_CHARS: usize = 100;

/// Lower bound (inclusive) accepted for a configured `SHORT_CHUNK_THRESHOLD`.
const MIN_SHORT_CHUNK_THRESHOLD: usize = 5;

/// Upper bound (inclusive) accepted for a configured `SHORT_CHUNK_THRESHOLD`.
const MAX_SHORT_CHUNK_THRESHOLD: usize = 50;

/// Lower bound (inclusive) accepted for a configured `CONSECUTIVE_DUPLICATE_THRESHOLD`.
const MIN_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 2;

/// Upper bound (inclusive) accepted for a configured `CONSECUTIVE_DUPLICATE_THRESHOLD`.
const MAX_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 10;
71
/// Tunable limits that decide when an overlap is strong enough to deduplicate.
///
/// The type is a plain bundle of three `usize` values and is cheap to copy.
#[derive(Debug, Clone, Copy)]
pub struct OverlapThresholds {
    /// Smallest overlap length that may be considered for deduplication.
    pub min_overlap_chars: usize,
    /// Chunks shorter than this are classified as "short".
    pub short_chunk_threshold: usize,
    /// How many identical chunks in a row trigger aggressive deduplication.
    pub consecutive_duplicate_threshold: usize,
}
85
86impl Default for OverlapThresholds {
87 fn default() -> Self {
88 Self {
89 min_overlap_chars: DEFAULT_MIN_OVERLAP_CHARS,
90 short_chunk_threshold: DEFAULT_SHORT_CHUNK_THRESHOLD,
91 consecutive_duplicate_threshold: DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD,
92 }
93 }
94}
95
96/// Testable variant that accepts an environment trait for dependency injection.
97///
98/// This allows tests to mock environment variables without global state pollution.
99///
100/// Reads the following environment variables:
101/// - `RALPH_STREAMING_MIN_OVERLAP_CHARS`: Minimum overlap characters (default: 30, range: 10-100)
102/// - `RALPH_STREAMING_SHORT_CHUNK_THRESHOLD`: Short chunk threshold (default: 20, range: 5-50)
103/// - `RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD`: Consecutive duplicate threshold (default: 3, range: 2-10)
104pub fn get_overlap_thresholds_with_env(env: &dyn ThresholdEnvironment) -> OverlapThresholds {
105 let min_overlap_chars = env
106 .get_var("RALPH_STREAMING_MIN_OVERLAP_CHARS")
107 .and_then(|s| s.parse::<usize>().ok())
108 .and_then(|v| {
109 if (MIN_MIN_OVERLAP_CHARS..=MAX_MIN_OVERLAP_CHARS).contains(&v) {
110 Some(v)
111 } else {
112 None
113 }
114 })
115 .unwrap_or(DEFAULT_MIN_OVERLAP_CHARS);
116
117 let short_chunk_threshold = env
118 .get_var("RALPH_STREAMING_SHORT_CHUNK_THRESHOLD")
119 .and_then(|s| s.parse::<usize>().ok())
120 .and_then(|v| {
121 if (MIN_SHORT_CHUNK_THRESHOLD..=MAX_SHORT_CHUNK_THRESHOLD).contains(&v) {
122 Some(v)
123 } else {
124 None
125 }
126 })
127 .unwrap_or(DEFAULT_SHORT_CHUNK_THRESHOLD);
128
129 let consecutive_duplicate_threshold = env
130 .get_var("RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD")
131 .and_then(|s| s.parse::<usize>().ok())
132 .and_then(|v| {
133 if (MIN_CONSECUTIVE_DUPLICATE_THRESHOLD..=MAX_CONSECUTIVE_DUPLICATE_THRESHOLD)
134 .contains(&v)
135 {
136 Some(v)
137 } else {
138 None
139 }
140 })
141 .unwrap_or(DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD);
142
143 OverlapThresholds {
144 min_overlap_chars,
145 short_chunk_threshold,
146 consecutive_duplicate_threshold,
147 }
148}
149
150pub fn get_overlap_thresholds() -> OverlapThresholds {
151 get_overlap_thresholds_with_env(&RealThresholdEnvironment)
152}
153
// ============================================================================
// Boundary Detection
// ============================================================================
157
/// Check whether byte offset `pos` in `text` is a safe boundary for deduplication.
///
/// A "safe boundary" is a natural break point in the text. The offset is safe when:
/// - it is at or past the end of `text`, or
/// - the character starting at `pos` is whitespace (space, tab, newline, etc.),
///   ASCII punctuation (`.`, `,`, `!`, `?`, `;`, `:`, etc.), or an ASCII control
///   character.
///
/// This prevents deduplication from splitting words or tokens mid-way through,
/// which could cause incorrect rendering of intentional repetitions.
///
/// # Arguments
/// * `text` - The text to check
/// * `pos` - The position in the text (byte offset)
///
/// # Returns
/// * `true` - The position is at a safe boundary for deduplication
/// * `false` - The position is NOT at a safe boundary: it is mid-word, or it falls
///   inside a multi-byte character (this case returns `false` instead of panicking)
///
/// # Examples
///
/// ```ignore
/// // Safe: end of string
/// assert!(is_safe_boundary("Hello World", 11)); // After "World"
///
/// // Safe: the character at the offset is a space
/// assert!(is_safe_boundary("Hello World", 5)); // After "Hello"
///
/// // Safe: the character at the offset is punctuation
/// assert!(is_safe_boundary("Hello, World!", 12)); // At "!"
///
/// // Unsafe: the offset is mid-word
/// assert!(!is_safe_boundary("HelloWorld", 5)); // After "Hello", at "W"
/// ```
fn is_safe_boundary(text: &str, pos: usize) -> bool {
    // End of string is always safe.
    if pos >= text.len() {
        return true;
    }

    // `str::get` yields `None` instead of panicking when `pos` is not on a
    // character boundary. Because `pos < text.len()`, a valid slice is never
    // empty, so `None` below can only mean "inside a multi-byte character".
    match text.get(pos..).and_then(|rest| rest.chars().next()) {
        Some(c) => c.is_whitespace() || c.is_ascii_punctuation() || c.is_ascii_control(),
        None => false,
    }
}
201
// ============================================================================
// Overlap Quality Scoring
// ============================================================================
205
/// Measurements describing how "strong" an overlap is.
///
/// The fields together decide whether an overlap is strong enough to justify
/// deduplication.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OverlapScore {
    /// Length of the overlap.
    pub char_count: usize,
    /// `true` when the overlap reaches the minimum ratio threshold.
    pub ratio_met: bool,
    /// `true` when the overlap ends at a safe boundary.
    pub is_safe_boundary: bool,
}
219
220impl OverlapScore {
221 /// Check if this overlap meets all thresholds for deduplication.
222 ///
223 /// # Arguments
224 /// * `thresholds` - The overlap thresholds to check against
225 ///
226 /// # Returns
227 /// * `true` - The overlap is strong enough for deduplication
228 /// * `false` - The overlap is too weak
229 #[must_use]
230 pub const fn meets_thresholds(&self, thresholds: &OverlapThresholds) -> bool {
231 self.char_count >= thresholds.min_overlap_chars && self.ratio_met && self.is_safe_boundary
232 }
233
234 /// Check if the delta is short (below short chunk threshold).
235 ///
236 /// # Arguments
237 /// * `delta_len` - The length of the delta
238 /// * `thresholds` - The overlap thresholds
239 ///
240 /// # Returns
241 /// * `true` - The delta is considered short
242 /// * `false` - The delta is normal length
243 #[must_use]
244 #[cfg(test)]
245 pub const fn is_short_delta(delta_len: usize, thresholds: &OverlapThresholds) -> bool {
246 delta_len < thresholds.short_chunk_threshold
247 }
248}
249
250/// Score the quality of an overlap between delta and accumulated content.
251///
252/// This function computes multiple metrics about an overlap to determine
253/// if it's strong enough to warrant deduplication.
254///
255/// # Arguments
256/// * `delta` - The incoming delta
257/// * `accumulated` - The previously accumulated content
258///
259/// # Returns
260/// An `OverlapScore` containing:
261/// - `char_count`: The length of the overlap in characters
262/// - `ratio`: The overlap as a fraction of delta length
263/// - `is_safe_boundary`: Whether the overlap ends at a safe boundary
264///
265/// # Examples
266///
267/// ```ignore
268/// // Strong overlap (30+ chars, 50%+ ratio, safe boundary)
269/// let score = score_overlap("Hello World! More text here", "Hello World!");
270/// assert!(score.char_count >= 30);
271/// assert!(score.ratio_met);
272/// assert!(score.is_safe_boundary);
273/// ```
274pub(super) fn score_overlap(delta: &str, accumulated: &str) -> OverlapScore {
275 let overlap_len = compute_overlap_len(delta, accumulated);
276 let ratio_met = compute_ratio_met(delta, overlap_len);
277 let is_safe_boundary = overlap_len > 0 && is_safe_boundary(accumulated, accumulated.len());
278
279 OverlapScore {
280 char_count: overlap_len,
281 ratio_met,
282 is_safe_boundary,
283 }
284}
285
/// Compute the overlap length between `delta` and `accumulated`.
///
/// Returns the byte length of `accumulated` when `delta` begins with it, and 0
/// otherwise (including when `accumulated` is empty).
fn compute_overlap_len(delta: &str, accumulated: &str) -> usize {
    match delta.strip_prefix(accumulated) {
        Some(_) => accumulated.len(),
        None => 0,
    }
}
294
295/// Check if the overlap ratio meets the minimum threshold using integer arithmetic.
296fn compute_ratio_met(delta: &str, overlap_len: usize) -> bool {
297 if delta.is_empty() {
298 return false;
299 }
300 // Cross-multiply to avoid floating point: overlap/delta >= MIN_OVERLAP_RATIO_INT/100
301 let overlap_scaled = overlap_len.saturating_mul(100);
302 let threshold = delta.len().saturating_mul(MIN_OVERLAP_RATIO_INT);
303 overlap_scaled >= threshold
304}