/// Source of named string variables used when loading overlap thresholds.
///
/// `get_overlap_thresholds_with_env` accepts any implementation of this trait,
/// so the lookup does not have to go through the real process environment.
pub trait ThresholdEnvironment {
/// Returns the value stored under `name`, or `None` when this environment
/// has no usable value for it.
fn get_var(&self, name: &str) -> Option<String>;
}
/// [`ThresholdEnvironment`] implementation backed by the process environment.
pub struct RealThresholdEnvironment;

impl ThresholdEnvironment for RealThresholdEnvironment {
    /// Looks up `name` via `std::env::var`.
    ///
    /// Any error reported by `std::env::var` is collapsed into `None`; the
    /// caller cannot tell the failure reasons apart.
    fn get_var(&self, name: &str) -> Option<String> {
        match std::env::var(name) {
            Ok(value) => Some(value),
            Err(_) => None,
        }
    }
}
// Fallback for `OverlapThresholds::min_overlap_chars` when the environment
// gives no valid override.
const DEFAULT_MIN_OVERLAP_CHARS: usize = 30;
// Required overlap-to-delta ratio, expressed as an integer percentage:
// `compute_ratio_met` checks `overlap_len * 100 >= delta.len() * 50`.
const MIN_OVERLAP_RATIO_INT: usize = 50;
// Fallback for `OverlapThresholds::short_chunk_threshold`.
const DEFAULT_SHORT_CHUNK_THRESHOLD: usize = 20;
// Fallback for `OverlapThresholds::consecutive_duplicate_threshold`.
const DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 3;
// Inclusive range accepted for an overridden `min_overlap_chars`; values
// outside it are ignored in favour of the default.
const MIN_MIN_OVERLAP_CHARS: usize = 10;
const MAX_MIN_OVERLAP_CHARS: usize = 100;
// Inclusive range accepted for an overridden `short_chunk_threshold`.
const MIN_SHORT_CHUNK_THRESHOLD: usize = 5;
const MAX_SHORT_CHUNK_THRESHOLD: usize = 50;
// Inclusive range accepted for an overridden `consecutive_duplicate_threshold`.
const MIN_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 2;
const MAX_CONSECUTIVE_DUPLICATE_THRESHOLD: usize = 10;
/// Tunable limits consulted when deciding how to treat an incoming delta.
#[derive(Debug, Clone, Copy)]
pub struct OverlapThresholds {
    /// Smallest `OverlapScore::char_count` that `meets_thresholds` accepts.
    pub min_overlap_chars: usize,
    /// Deltas strictly shorter than this are reported as short by
    /// `OverlapScore::is_short_delta`.
    pub short_chunk_threshold: usize,
    /// Limit on consecutive duplicates.
    // NOTE(review): not read anywhere in the visible code — confirm exact
    // semantics against the consumer.
    pub consecutive_duplicate_threshold: usize,
}

impl Default for OverlapThresholds {
    /// Builds the thresholds from the compiled-in `DEFAULT_*` constants.
    fn default() -> Self {
        let min_overlap_chars = DEFAULT_MIN_OVERLAP_CHARS;
        let short_chunk_threshold = DEFAULT_SHORT_CHUNK_THRESHOLD;
        let consecutive_duplicate_threshold = DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD;
        Self {
            min_overlap_chars,
            short_chunk_threshold,
            consecutive_duplicate_threshold,
        }
    }
}
/// Loads the overlap thresholds from `env`, falling back to defaults.
///
/// Each threshold is read from its own variable:
///
/// * `RALPH_STREAMING_MIN_OVERLAP_CHARS`
/// * `RALPH_STREAMING_SHORT_CHUNK_THRESHOLD`
/// * `RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD`
///
/// A value is used only if it parses as `usize` and lies within that
/// threshold's inclusive `MIN_*..=MAX_*` range. A missing, unparsable or
/// out-of-range value silently yields the matching `DEFAULT_*` constant.
pub fn get_overlap_thresholds_with_env(env: &dyn ThresholdEnvironment) -> OverlapThresholds {
    // One shared rule for all three settings: read, parse, range-check, default.
    let bounded_or = |var_name: &str, lower: usize, upper: usize, fallback: usize| -> usize {
        match env.get_var(var_name).map(|raw| raw.parse::<usize>()) {
            Some(Ok(parsed)) if (lower..=upper).contains(&parsed) => parsed,
            _ => fallback,
        }
    };

    // Field expressions run in written order, so the variables are read in
    // the same sequence as before.
    OverlapThresholds {
        min_overlap_chars: bounded_or(
            "RALPH_STREAMING_MIN_OVERLAP_CHARS",
            MIN_MIN_OVERLAP_CHARS,
            MAX_MIN_OVERLAP_CHARS,
            DEFAULT_MIN_OVERLAP_CHARS,
        ),
        short_chunk_threshold: bounded_or(
            "RALPH_STREAMING_SHORT_CHUNK_THRESHOLD",
            MIN_SHORT_CHUNK_THRESHOLD,
            MAX_SHORT_CHUNK_THRESHOLD,
            DEFAULT_SHORT_CHUNK_THRESHOLD,
        ),
        consecutive_duplicate_threshold: bounded_or(
            "RALPH_STREAMING_CONSECUTIVE_DUPLICATE_THRESHOLD",
            MIN_CONSECUTIVE_DUPLICATE_THRESHOLD,
            MAX_CONSECUTIVE_DUPLICATE_THRESHOLD,
            DEFAULT_CONSECUTIVE_DUPLICATE_THRESHOLD,
        ),
    }
}
/// Loads the overlap thresholds from the real process environment.
///
/// Convenience wrapper that passes a [`RealThresholdEnvironment`] to
/// [`get_overlap_thresholds_with_env`].
pub fn get_overlap_thresholds() -> OverlapThresholds {
    let process_env = RealThresholdEnvironment;
    get_overlap_thresholds_with_env(&process_env)
}
/// Reports whether byte offset `pos` in `text` is a safe place to split.
///
/// * An offset at or past the end of `text` is always safe.
/// * Otherwise the offset is safe when the character starting there is
///   whitespace, ASCII punctuation or an ASCII control character.
/// * An offset that falls inside a multi-byte UTF-8 character is never safe.
///   Slicing with `text[pos..]` would panic there, so the lookup goes
///   through the checked `str::get` instead.
fn is_safe_boundary(text: &str, pos: usize) -> bool {
    if pos >= text.len() {
        return true;
    }
    // `get` returns `None` when `pos` is not on a char boundary. For an
    // in-range boundary the remainder is non-empty, so `next()` is `Some`.
    match text.get(pos..).and_then(|rest| rest.chars().next()) {
        Some(c) => c.is_whitespace() || c.is_ascii_punctuation() || c.is_ascii_control(),
        None => false,
    }
}
/// Result of comparing an incoming delta against the text accumulated so far.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OverlapScore {
    /// Size of the detected overlap. `score_overlap` fills this with a byte
    /// length (`str::len`), which differs from a character count for
    /// non-ASCII text.
    pub char_count: usize,
    /// Whether the overlap is large enough relative to the delta.
    pub ratio_met: bool,
    /// Whether the overlap was judged to end at a safe split point.
    pub is_safe_boundary: bool,
}

impl OverlapScore {
    /// Returns `true` only when all three criteria hold: `char_count` is at
    /// least `thresholds.min_overlap_chars`, the ratio requirement is met and
    /// the boundary is safe.
    #[must_use]
    pub const fn meets_thresholds(&self, thresholds: &OverlapThresholds) -> bool {
        if self.char_count < thresholds.min_overlap_chars {
            return false;
        }
        self.ratio_met && self.is_safe_boundary
    }

    /// Returns `true` when `delta_len` is strictly below
    /// `thresholds.short_chunk_threshold`. Compiled for tests only.
    #[must_use]
    #[cfg(test)]
    pub const fn is_short_delta(delta_len: usize, thresholds: &OverlapThresholds) -> bool {
        thresholds.short_chunk_threshold > delta_len
    }
}
/// Scores how much of `delta` repeats the already `accumulated` text.
///
/// An overlap is recognised only when `delta` begins with the whole of
/// `accumulated`. The returned [`OverlapScore`] holds:
///
/// * `char_count` — the overlap size in bytes (`accumulated.len()`), or `0`
///   when `delta` does not start with `accumulated`;
/// * `ratio_met` — whether that overlap reaches the minimum share of `delta`;
/// * `is_safe_boundary` — whether a non-empty overlap is followed, inside
///   `delta`, by the end of the string or by a whitespace / ASCII punctuation /
///   ASCII control character.
pub(super) fn score_overlap(delta: &str, accumulated: &str) -> OverlapScore {
    let overlap_len = compute_overlap_len(delta, accumulated);
    let ratio_met = compute_ratio_met(delta, overlap_len);
    // Look at `delta` just past the overlap. Probing `accumulated` at its own
    // length always reports "safe" (end of text), which made this flag true
    // for every non-empty overlap. `overlap_len` is a valid char boundary in
    // `delta` because it is the length of a prefix that `delta` starts with.
    let ends_on_safe_boundary = overlap_len > 0 && is_safe_boundary(delta, overlap_len);
    OverlapScore {
        char_count: overlap_len,
        ratio_met,
        is_safe_boundary: ends_on_safe_boundary,
    }
}
/// Returns the byte length of `accumulated` when `delta` starts with it, and
/// `0` otherwise (including when `accumulated` is empty).
fn compute_overlap_len(delta: &str, accumulated: &str) -> usize {
    match delta.strip_prefix(accumulated) {
        Some(_) => accumulated.len(),
        None => 0,
    }
}
/// Checks whether `overlap_len` covers at least `MIN_OVERLAP_RATIO_INT`
/// percent of `delta`'s byte length.
///
/// An empty `delta` never meets the ratio. The comparison uses integer math
/// with saturating multiplication, so it cannot overflow or hit floating-point
/// rounding.
fn compute_ratio_met(delta: &str, overlap_len: usize) -> bool {
    let delta_len = delta.len();
    // overlap / delta >= RATIO / 100, cross-multiplied to stay in integers.
    delta_len != 0
        && overlap_len.saturating_mul(100) >= delta_len.saturating_mul(MIN_OVERLAP_RATIO_INT)
}