use crate::layout::TextSpan;
use log::debug;
/// Descriptive statistics for a collection of gap measurements.
///
/// `calculate_statistics` in this module is the producer: it fills every
/// field from a single `Vec<f32>` of gaps.
#[derive(Debug, Clone)]
pub struct GapStatistics {
    /// The gap values the statistics describe. `calculate_statistics` stores
    /// them after sorting with `crate::utils::safe_float_cmp`.
    pub gaps: Vec<f32>,
    /// Number of gap values.
    pub count: usize,
    /// Smallest gap.
    pub min: f32,
    /// Largest gap.
    pub max: f32,
    /// Arithmetic mean of the gaps.
    pub mean: f32,
    /// 50th percentile.
    pub median: f32,
    /// Standard deviation (`calculate_statistics` divides by `count`, i.e.
    /// the population form).
    pub std_dev: f32,
    /// 25th percentile.
    pub p25: f32,
    /// 75th percentile.
    pub p75: f32,
    /// 10th percentile.
    pub p10: f32,
    /// 90th percentile.
    pub p90: f32,
}

impl GapStatistics {
    /// Interquartile range: the distance from `p25` up to `p75`.
    pub fn iqr(&self) -> f32 {
        let (lower_quartile, upper_quartile) = (self.p25, self.p75);
        upper_quartile - lower_quartile
    }

    /// Full spread of the data: `max - min`.
    pub fn range(&self) -> f32 {
        let (smallest, largest) = (self.min, self.max);
        largest - smallest
    }

    /// Ratio of the standard deviation to the mean.
    ///
    /// Returns `0.0` whenever the mean is not strictly positive (zero,
    /// negative, or NaN), which also avoids dividing by zero.
    pub fn coefficient_of_variation(&self) -> f32 {
        if self.mean > 0.0 {
            return self.std_dev / self.mean;
        }
        0.0
    }
}
/// Parameters controlling how a gap threshold is derived from gap statistics.
#[derive(Debug, Clone, PartialEq)]
pub struct AdaptiveThresholdConfig {
    /// Factor applied to the base statistic (the median, or the IQR when
    /// `use_iqr` is set) to obtain the raw threshold.
    pub median_multiplier: f32,
    /// Lower bound applied to the computed threshold.
    pub min_threshold_pt: f32,
    /// Upper bound applied to the computed threshold.
    pub max_threshold_pt: f32,
    /// Use the interquartile range instead of the median as the base statistic.
    pub use_iqr: bool,
    /// Minimum number of gaps required before statistics are computed.
    pub min_samples: usize,
}

impl Default for AdaptiveThresholdConfig {
    /// Multiplier 1.5, bounds 0.05..100.0, median-based, at least 10 samples.
    fn default() -> Self {
        AdaptiveThresholdConfig {
            median_multiplier: 1.5,
            min_threshold_pt: 0.05,
            max_threshold_pt: 100.0,
            use_iqr: false,
            min_samples: 10,
        }
    }
}

impl AdaptiveThresholdConfig {
    /// Same as [`Default::default`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Named alias for the default configuration.
    pub fn balanced() -> Self {
        Self::new()
    }

    /// Preset with a multiplier of 1.2; all other values are the defaults.
    pub fn aggressive() -> Self {
        Self::with_multiplier(1.2)
    }

    /// Preset with a multiplier of 2.0; all other values are the defaults.
    pub fn conservative() -> Self {
        Self::with_multiplier(2.0)
    }

    /// Preset with a multiplier of 1.3 and a minimum threshold of 0.08.
    pub fn policy_documents() -> Self {
        Self::with_multiplier(1.3).with_min_threshold(0.08)
    }

    /// Preset with a multiplier of 1.6 and a minimum threshold of 0.2.
    pub fn academic() -> Self {
        Self::with_multiplier(1.6).with_min_threshold(0.2)
    }

    /// Default configuration with `median_multiplier` replaced by `multiplier`.
    pub fn with_multiplier(multiplier: f32) -> Self {
        Self {
            median_multiplier: multiplier,
            ..Self::default()
        }
    }

    /// Builder step: choose between the IQR (`true`) and the median (`false`)
    /// as the base statistic.
    pub fn with_iqr(self, use_iqr: bool) -> Self {
        Self { use_iqr, ..self }
    }

    /// Builder step: set the lower bound of the threshold.
    pub fn with_min_threshold(self, min_pt: f32) -> Self {
        Self {
            min_threshold_pt: min_pt,
            ..self
        }
    }

    /// Builder step: set the upper bound of the threshold.
    pub fn with_max_threshold(self, max_pt: f32) -> Self {
        Self {
            max_threshold_pt: max_pt,
            ..self
        }
    }

    /// Builder step: set the minimum number of gaps needed for analysis.
    pub fn with_min_samples(self, count: usize) -> Self {
        Self {
            min_samples: count,
            ..self
        }
    }
}
/// Outcome of a gap-threshold analysis: the chosen threshold together with
/// the data and explanation behind it.
#[derive(Debug, Clone)]
pub struct AdaptiveThresholdResult {
/// The threshold selected by the analysis.
pub threshold_pt: f32,
/// Statistics the threshold was derived from; `None` when the threshold
/// came from a fallback or shortcut path that computed no statistics.
pub stats: Option<GapStatistics>,
/// Human-readable explanation of how `threshold_pt` was chosen.
pub reason: String,
}
/// Measures the horizontal gap between every pair of neighbouring spans.
///
/// For each adjacent pair, the gap is the left edge of the later span minus
/// the right edge of the earlier one, in slice order. No filtering is done,
/// so the result can contain zero or negative values. Fewer than two spans
/// produce an empty vector; otherwise the result has `spans.len() - 1`
/// entries.
pub fn extract_gaps(spans: &[TextSpan]) -> Vec<f32> {
    spans
        .windows(2)
        .map(|pair| {
            let trailing_edge = pair[0].bbox.right();
            let leading_edge = pair[1].bbox.left();
            leading_edge - trailing_edge
        })
        .collect()
}
/// Builds a [`GapStatistics`] summary from a list of gap values.
///
/// Returns `None` for an empty input. Otherwise the minimum, maximum, mean
/// and standard deviation (dividing by the number of values) are computed
/// from the values in the order given; the vector is then sorted with
/// `crate::utils::safe_float_cmp` and the 10th/25th/50th/75th/90th
/// percentiles are read from it. The sorted vector is stored in the result.
pub fn calculate_statistics(mut gaps: Vec<f32>) -> Option<GapStatistics> {
    let count = gaps.len();
    if count == 0 {
        return None;
    }
    let sample_size = count as f32;

    // Single pass for both extremes.
    let (min, max) = gaps.iter().fold(
        (f32::INFINITY, f32::NEG_INFINITY),
        |(lowest, highest), &gap| (lowest.min(gap), highest.max(gap)),
    );

    // Sum before sorting so the accumulation order matches the caller's order.
    let total: f32 = gaps.iter().sum();
    let mean = total / sample_size;
    let squared_deviation_total = gaps
        .iter()
        .map(|&gap| (gap - mean).powi(2))
        .sum::<f32>();
    let std_dev = (squared_deviation_total / sample_size).sqrt();

    gaps.sort_by(|a, b| crate::utils::safe_float_cmp(*a, *b));

    // Scoped so the borrow of `gaps` ends before it is moved into the result.
    let (p10, p25, median, p75, p90) = {
        let at = |fraction: f32| percentile(&gaps, fraction);
        (at(0.10), at(0.25), at(0.50), at(0.75), at(0.90))
    };

    Some(GapStatistics {
        gaps,
        count,
        min,
        max,
        mean,
        median,
        std_dev,
        p25,
        p75,
        p10,
        p90,
    })
}
/// Derives the threshold from precomputed statistics.
///
/// The base statistic is the IQR when `config.use_iqr` is set and the median
/// otherwise. It is multiplied by `config.median_multiplier`, then raised to
/// at least `config.min_threshold_pt`, then lowered to at most
/// `config.max_threshold_pt`. Because the lower bound is applied first, a
/// configuration whose minimum exceeds its maximum yields the maximum.
pub fn determine_adaptive_threshold(
    stats: &GapStatistics,
    config: &AdaptiveThresholdConfig,
) -> f32 {
    let basis = if config.use_iqr {
        stats.iqr()
    } else {
        stats.median
    };
    let unclamped = basis * config.median_multiplier;
    let floored = f32::max(unclamped, config.min_threshold_pt);
    f32::min(floored, config.max_threshold_pt)
}
/// Tries to derive a threshold directly from the 75th percentile of the
/// strictly positive gaps between neighbouring spans.
///
/// Returns `None` when fewer than 10 positive gaps exist, or when the 75th
/// percentile lies outside the inclusive range 2.0..=10.0; otherwise returns
/// that percentile.
fn detect_word_boundary_threshold(spans: &[TextSpan]) -> Option<f32> {
    let mut positive_gaps: Vec<f32> = Vec::new();
    for pair in spans.windows(2) {
        let gap = pair[1].bbox.left() - pair[0].bbox.right();
        if gap > 0.0 {
            positive_gaps.push(gap);
        }
    }

    if positive_gaps.len() < 10 {
        return None;
    }

    positive_gaps.sort_by(|a, b| crate::utils::safe_float_cmp(*a, *b));
    let p75 = percentile(&positive_gaps, 0.75);

    let within_bounds = (2.0..=10.0).contains(&p75);
    if !within_bounds {
        debug!("Percentile-based threshold: P75 = {:.4}pt (out of bounds 2-10pt)", p75);
        return None;
    }

    debug!("Percentile-based threshold: P75 = {:.4}pt", p75);
    Some(p75)
}
/// Chooses a gap threshold for a sequence of spans and records how it was
/// chosen.
///
/// `config` falls back to `AdaptiveThresholdConfig::default()` when `None`.
/// The steps are tried in this order:
///
/// 1. Fewer than two spans: return the fixed default threshold `0.1`.
/// 2. If `detect_word_boundary_threshold` produces a value, return it
///    unchanged (the bounds in `config` are not applied on this path).
/// 3. Fewer gaps than `config.min_samples`: return the default `0.1`.
/// 4. Otherwise compute statistics over the strictly positive gaps when at
///    least 10 exist, or over all gaps if not, and derive the threshold
///    with `determine_adaptive_threshold`.
///
/// Only step 4 attaches statistics to the result; every other path returns
/// `stats: None`. The `reason` field describes the path taken.
pub fn analyze_document_gaps(
    spans: &[TextSpan],
    config: Option<AdaptiveThresholdConfig>,
) -> AdaptiveThresholdResult {
    // All bail-out paths share the same fixed threshold and carry no stats.
    let default_result = |reason: String| AdaptiveThresholdResult {
        threshold_pt: 0.1,
        stats: None,
        reason,
    };

    let config = config.unwrap_or_default();
    let span_count = spans.len();
    debug!(
        "Analyzing {} spans with config: multiplier={}, min={}pt, max={}pt, iqr={}",
        span_count,
        config.median_multiplier,
        config.min_threshold_pt,
        config.max_threshold_pt,
        config.use_iqr
    );

    if span_count < 2 {
        let reason = match span_count {
            0 => "No spans provided",
            _ => "Single span: no gaps to analyze",
        }
        .to_string();
        debug!("{}, using default threshold", reason);
        return default_result(reason);
    }

    if let Some(boundary_pt) = detect_word_boundary_threshold(spans) {
        let reason =
            format!("Bimodal detection: identified word boundary at {:.4}pt", boundary_pt);
        debug!("Using bimodal threshold: {}", reason);
        return AdaptiveThresholdResult {
            threshold_pt: boundary_pt,
            stats: None,
            reason,
        };
    }

    let all_gaps = extract_gaps(spans);
    let gap_count = all_gaps.len();
    debug!("Extracted {} gaps from {} spans", gap_count, span_count);

    if gap_count < config.min_samples {
        let reason = format!(
            "Insufficient samples: {} gaps < min_samples ({}), using default",
            gap_count, config.min_samples
        );
        debug!("{}", reason);
        return default_result(reason);
    }

    // Prefer the strictly positive gaps, but only when enough of them exist.
    let positive_only: Vec<f32> = all_gaps.iter().copied().filter(|gap| *gap > 0.0).collect();
    let positive_count = positive_only.len();
    let sample = if positive_count < 10 {
        debug!("Not enough positive gaps ({}) to filter, using all gaps", positive_count);
        all_gaps
    } else {
        debug!(
            "Filtered to {} positive gaps (from {} total gaps)",
            positive_count, gap_count
        );
        positive_only
    };

    let stats = match calculate_statistics(sample) {
        Some(computed) => computed,
        None => {
            let reason = String::from("Failed to calculate statistics");
            debug!("{}", reason);
            return default_result(reason);
        }
    };

    let threshold_pt = determine_adaptive_threshold(&stats, &config);

    // Label and pre-clamp value of the base statistic, for the explanation.
    let (base_value, unclamped) = if config.use_iqr {
        let iqr = stats.iqr();
        (format!("IQR={:.3}pt", iqr), iqr * config.median_multiplier)
    } else {
        (
            format!("median={:.3}pt", stats.median),
            stats.median * config.median_multiplier,
        )
    };
    let reason = format!(
        "Computed from {} gaps: {} * {:.1} = {:.3}pt (clamped to {:.3}pt)",
        stats.count, base_value, config.median_multiplier, unclamped, threshold_pt
    );
    debug!("Threshold analysis: {}", reason);

    AdaptiveThresholdResult {
        threshold_pt,
        stats: Some(stats),
        reason,
    }
}
/// Returns the value at the given percentile of a sorted slice, linearly
/// interpolating between the two nearest ranks.
///
/// `percentile` is a fraction in `[0.0, 1.0]` (for example `0.75` for P75).
/// Fractions outside that interval are clamped to it: a request below 0
/// yields the first element and a request above 1 yields the last element,
/// rather than extrapolating or indexing past the end of the slice. For
/// slices with at least two elements a NaN fraction propagates and the
/// result is NaN.
///
/// Returns `0.0` for an empty slice and the sole element for a one-element
/// slice. The slice is expected to be sorted in ascending order; this is
/// not checked.
fn percentile(sorted_values: &[f32], percentile: f32) -> f32 {
    let last = match sorted_values.len() {
        0 => return 0.0,
        1 => return sorted_values[0],
        len => len - 1,
    };

    // Hand-written clamp: both comparisons are false for NaN, so NaN passes
    // through unchanged instead of being silently mapped to a bound.
    let bounded = if percentile < 0.0 {
        0.0
    } else if percentile > 1.0 {
        1.0
    } else {
        percentile
    };

    let index = bounded * last as f32;
    // The float-to-usize cast saturates (NaN becomes 0). The extra `min`
    // keeps the index valid even if `last as f32` rounded upwards for a very
    // long slice.
    let lower_index = (index.floor() as usize).min(last);
    let upper_index = (lower_index + 1).min(last);

    if lower_index == upper_index {
        sorted_values[lower_index]
    } else {
        let fraction = index - lower_index as f32;
        sorted_values[lower_index] * (1.0 - fraction) + sorted_values[upper_index] * fraction
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::geometry::Rect;

    /// Builds a span with the given text and bounding box. Every other field
    /// carries the fixed value shared by all fixtures in this module
    /// (`sequence` is 0).
    fn span_at(text: &str, bbox: Rect) -> TextSpan {
        TextSpan {
            artifact_type: None,
            text: text.to_string(),
            bbox,
            font_name: "Arial".to_string(),
            font_size: 12.0,
            font_weight: crate::layout::FontWeight::Normal,
            is_italic: false,
            is_monospace: false,
            color: crate::layout::Color::new(0.0, 0.0, 0.0),
            mcid: None,
            sequence: 0,
            split_boundary_before: false,
            offset_semantic: false,
            char_spacing: 0.0,
            word_spacing: 0.0,
            horizontal_scaling: 100.0,
            primary_detected: false,
            char_widths: vec![],
        }
    }

    /// Two consecutive spans with sequence numbers 0 and 1.
    fn span_pair(first: (&str, Rect), second: (&str, Rect)) -> Vec<TextSpan> {
        vec![
            span_at(first.0, first.1),
            TextSpan {
                sequence: 1,
                ..span_at(second.0, second.1)
            },
        ]
    }

    #[test]
    fn test_percentile_single_value() {
        let only = vec![5.0];
        assert_eq!(percentile(&only, 0.5), 5.0);
    }

    #[test]
    fn test_percentile_two_values() {
        let pair = vec![1.0, 3.0];
        assert_eq!(percentile(&pair, 0.0), 1.0);
        assert_eq!(percentile(&pair, 1.0), 3.0);
        assert_eq!(percentile(&pair, 0.5), 2.0);
    }

    #[test]
    fn test_percentile_many_values() {
        let one_to_ten = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
        assert_eq!(percentile(&one_to_ten, 0.0), 1.0);
        assert_eq!(percentile(&one_to_ten, 1.0), 10.0);
        assert_eq!(percentile(&one_to_ten, 0.5), 5.5);
    }

    #[test]
    fn test_extract_gaps() {
        let spans = span_pair(
            ("Hello", Rect::new(0.0, 0.0, 30.0, 12.0)),
            ("World", Rect::new(35.0, 0.0, 30.0, 12.0)),
        );
        let gaps = extract_gaps(&spans);
        assert_eq!(gaps.len(), 1);
        assert_eq!(gaps[0], 5.0);
    }

    #[test]
    fn test_extract_gaps_empty() {
        assert!(extract_gaps(&[]).is_empty());
    }

    #[test]
    fn test_calculate_statistics() {
        let stats = calculate_statistics(vec![0.1, 0.2, 0.15, 0.25, 0.3]).unwrap();
        assert_eq!(stats.count, 5);
        assert_eq!(stats.min, 0.1);
        assert_eq!(stats.max, 0.3);
        assert!(stats.mean > 0.19 && stats.mean < 0.21);
    }

    #[test]
    fn test_calculate_statistics_empty() {
        assert!(calculate_statistics(vec![]).is_none());
    }

    #[test]
    fn test_gap_statistics_iqr() {
        let stats = calculate_statistics(vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
        assert!(stats.iqr() > 0.0);
    }

    #[test]
    fn test_adaptive_threshold_config_defaults() {
        let defaults = AdaptiveThresholdConfig::default();
        assert_eq!(defaults.median_multiplier, 1.5);
        assert_eq!(defaults.min_threshold_pt, 0.05);
        assert_eq!(defaults.max_threshold_pt, 100.0);
        assert!(!defaults.use_iqr);
        assert_eq!(defaults.min_samples, 10);
    }

    #[test]
    fn test_adaptive_threshold_config_aggressive() {
        assert_eq!(AdaptiveThresholdConfig::aggressive().median_multiplier, 1.2);
    }

    #[test]
    fn test_adaptive_threshold_config_conservative() {
        assert_eq!(AdaptiveThresholdConfig::conservative().median_multiplier, 2.0);
    }

    #[test]
    fn test_determine_threshold_clamping() {
        // Ten identical tiny gaps: median * multiplier falls below the minimum.
        let stats = calculate_statistics(vec![0.01; 10]).unwrap();
        let config = AdaptiveThresholdConfig::default();
        let threshold = determine_adaptive_threshold(&stats, &config);
        assert!(threshold >= config.min_threshold_pt);
        assert!(threshold <= config.max_threshold_pt);
    }

    #[test]
    fn test_analyze_document_gaps_empty() {
        let result = analyze_document_gaps(&[], None);
        assert_eq!(result.threshold_pt, 0.1);
        assert!(result.stats.is_none());
    }

    #[test]
    fn test_analyze_document_gaps_insufficient_samples() {
        // Two spans give a single gap, below the default `min_samples`.
        let spans = span_pair(
            ("A", Rect::new(0.0, 0.0, 10.0, 12.0)),
            ("B", Rect::new(15.0, 0.0, 10.0, 12.0)),
        );
        let result = analyze_document_gaps(&spans, None);
        assert_eq!(result.threshold_pt, 0.1);
        assert!(result.stats.is_none());
    }
}