use crate::layout::TextSpan;
use std::fmt::Write as FmtWrite;
/// One recorded space-insertion decision for the gap between two adjacent spans.
///
/// Values are produced by `SpanMergingDebugger::record_gap_decision`, which
/// stores the caller's inputs, shortens long span texts, and derives `reason`.
#[derive(Debug, Clone)]
pub struct GapDecision {
    /// Caller-supplied index of the gap.
    pub gap_index: usize,
    /// Text of the span on the left of the gap (shortened with `...` when long).
    pub left_text: String,
    /// Text of the span on the right of the gap (shortened with `...` when long).
    pub right_text: String,
    /// Size of the gap; reports print it with a `pt` suffix. Negative values
    /// are classified as `SpaceInsertReason::NegativeGap`.
    pub gap_pt: f32,
    /// Font size supplied by the caller for this decision.
    pub font_size: f32,
    /// Threshold the caller associated with the `needs_space_by_gap` test.
    pub space_threshold_pt: f32,
    /// Threshold the caller associated with the `needs_space_by_adaptive` test.
    pub adaptive_threshold_pt: f32,
    /// Outcome of the caller's plain gap-threshold test.
    pub needs_space_by_gap: bool,
    /// Outcome of the caller's heuristic test.
    pub needs_space_by_heuristic: bool,
    /// Outcome of the caller's adaptive-threshold test.
    pub needs_space_by_adaptive: bool,
    /// Whether a space was actually inserted for this gap.
    pub space_inserted: bool,
    /// Classification derived from the fields above when the decision was recorded.
    pub reason: SpaceInsertReason,
}
/// Classification attached to every recorded gap decision.
///
/// `SpanMergingDebugger::record_gap_decision` picks the variant in this order:
/// negative gap first, then "no space inserted", then which tests asked for
/// the space.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SpaceInsertReason {
    /// A space was inserted and only the adaptive-threshold test asked for it.
    AdaptiveThreshold,
    /// A space was inserted and only the heuristic test asked for it.
    Heuristic,
    /// A space was inserted and both the adaptive and heuristic tests asked for it.
    AdaptiveAndHeuristic,
    /// No space was inserted; also the fallback when a space was inserted but
    /// neither the adaptive nor the heuristic test asked for it.
    BelowThreshold,
    /// The gap was negative; this takes precedence over every other variant.
    NegativeGap,
}
/// Short lowercase labels used when a reason is printed in reports.
impl std::fmt::Display for SpaceInsertReason {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Resolve the label first, then emit it with a single write.
        let label = match *self {
            SpaceInsertReason::AdaptiveThreshold => "adaptive",
            SpaceInsertReason::Heuristic => "heuristic",
            SpaceInsertReason::AdaptiveAndHeuristic => "adaptive+heuristic",
            SpaceInsertReason::BelowThreshold => "below-threshold",
            SpaceInsertReason::NegativeGap => "negative-gap",
        };
        f.write_str(label)
    }
}
/// Summary statistics about the horizontal gaps between consecutive spans on a page.
///
/// `compute_page_gap_stats` fills this in; the default value has every field
/// at zero.
#[derive(Debug, Clone, Default)]
pub struct PageGapStats {
    /// Page the statistics belong to.
    pub page_num: usize,
    /// Number of spans that were examined.
    pub span_count: usize,
    /// Number of gaps between consecutive spans.
    pub gap_count: usize,
    /// Number of gaps strictly greater than zero.
    pub positive_gaps: usize,
    /// Number of gaps strictly less than zero (reported as overlaps).
    pub negative_gaps: usize,
    /// Smallest gap over all gaps.
    pub min_gap: f32,
    /// Largest gap over all gaps.
    pub max_gap: f32,
    /// Arithmetic mean over all gaps.
    pub mean_gap: f32,
    /// Median gap; `compute_page_gap_stats` takes it over positive gaps only.
    pub median_gap: f32,
    /// 25th-percentile gap; taken over positive gaps only by `compute_page_gap_stats`.
    pub p25: f32,
    /// 75th-percentile gap; taken over positive gaps only by `compute_page_gap_stats`.
    pub p75: f32,
}
/// Record of how one adaptive space threshold was computed.
///
/// Created by `SpanMergingDebugger::record_threshold`, which stores the
/// caller's values unchanged and tags them with the debugger's current page.
#[derive(Debug, Clone)]
pub struct ThresholdComputation {
    /// Page the computation was recorded for.
    pub page_num: usize,
    /// Name of the configuration that was in effect.
    pub config_name: String,
    /// Multiplier from the configuration.
    pub multiplier: f32,
    /// Lower bound from the configuration.
    pub min_threshold: f32,
    /// Upper bound from the configuration.
    pub max_threshold: f32,
    /// Median gap the computation was based on.
    pub median_gap: f32,
    /// Threshold value before clamping, as reported by the caller.
    pub computed_raw: f32,
    /// Final threshold value; reports label it as clamped to the min/max bounds.
    pub computed_final: f32,
    /// Whether the caller used bimodal detection instead of median * multiplier.
    pub used_bimodal: bool,
    /// Free-form explanation supplied by the caller.
    pub reason: String,
}
/// Collects span-merging diagnostics and running counters for later reporting.
///
/// The default value is empty: no records and all counters at zero.
#[derive(Debug, Default)]
pub struct SpanMergingDebugger {
    /// Page number applied to threshold records as they are recorded.
    pub current_page: usize,
    /// Every gap decision recorded so far, in recording order.
    pub gap_decisions: Vec<GapDecision>,
    /// Every threshold computation recorded so far, in recording order.
    pub threshold_computations: Vec<ThresholdComputation>,
    /// Every per-page statistics record stored so far.
    pub page_stats: Vec<PageGapStats>,
    /// Number of recorded decisions where a space was inserted.
    pub total_spaces_inserted: usize,
    /// Inserted spaces classified as `SpaceInsertReason::AdaptiveThreshold`.
    pub spaces_by_adaptive: usize,
    /// Inserted spaces classified as `SpaceInsertReason::Heuristic`.
    pub spaces_by_heuristic: usize,
    /// Inserted spaces classified as `SpaceInsertReason::AdaptiveAndHeuristic`.
    pub spaces_by_both: usize,
}
impl SpanMergingDebugger {
    /// Span text longer than this many bytes is shortened before being stored.
    const MAX_TEXT_BYTES: usize = 20;
    /// Upper bound on the bytes of original text kept in front of the `...` suffix.
    const KEPT_TEXT_BYTES: usize = 17;

    /// Creates a debugger with no recorded data and all counters at zero.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the page number that later `record_threshold` calls are tagged with.
    pub fn set_page(&mut self, page_num: usize) {
        self.current_page = page_num;
    }

    /// Stores one threshold computation, tagged with the current page.
    ///
    /// Every argument is stored as given; nothing is validated or recomputed.
    pub fn record_threshold(
        &mut self,
        config_name: &str,
        multiplier: f32,
        min_threshold: f32,
        max_threshold: f32,
        median_gap: f32,
        computed_raw: f32,
        computed_final: f32,
        used_bimodal: bool,
        reason: &str,
    ) {
        self.threshold_computations.push(ThresholdComputation {
            page_num: self.current_page,
            config_name: config_name.to_string(),
            multiplier,
            min_threshold,
            max_threshold,
            median_gap,
            computed_raw,
            computed_final,
            used_bimodal,
            reason: reason.to_string(),
        });
    }

    /// Stores a per-page statistics record as given.
    pub fn record_page_stats(&mut self, stats: PageGapStats) {
        self.page_stats.push(stats);
    }

    /// Records the outcome of one gap decision and updates the counters.
    ///
    /// The stored `reason` is derived in this order:
    /// 1. `NegativeGap` when `gap_pt < 0.0`;
    /// 2. `BelowThreshold` when no space was inserted;
    /// 3. `AdaptiveAndHeuristic`, `AdaptiveThreshold` or `Heuristic` according
    ///    to which of the two flags are set;
    /// 4. `BelowThreshold` as the fallback when a space was inserted but
    ///    neither flag is set.
    ///
    /// `total_spaces_inserted` grows whenever `space_inserted` is true; the
    /// per-reason counters only grow for the three "inserted" reasons.
    ///
    /// `left_text` and `right_text` longer than 20 bytes are stored as their
    /// first (at most) 17 bytes followed by `...`. The cut never splits a
    /// multi-byte character.
    pub fn record_gap_decision(
        &mut self,
        gap_index: usize,
        left_text: &str,
        right_text: &str,
        gap_pt: f32,
        font_size: f32,
        space_threshold_pt: f32,
        adaptive_threshold_pt: f32,
        needs_space_by_gap: bool,
        needs_space_by_heuristic: bool,
        needs_space_by_adaptive: bool,
        space_inserted: bool,
    ) {
        let reason = if gap_pt < 0.0 {
            SpaceInsertReason::NegativeGap
        } else if !space_inserted {
            SpaceInsertReason::BelowThreshold
        } else {
            match (needs_space_by_adaptive, needs_space_by_heuristic) {
                (true, true) => SpaceInsertReason::AdaptiveAndHeuristic,
                (true, false) => SpaceInsertReason::AdaptiveThreshold,
                (false, true) => SpaceInsertReason::Heuristic,
                (false, false) => SpaceInsertReason::BelowThreshold,
            }
        };

        if space_inserted {
            self.total_spaces_inserted += 1;
            // Variants are listed explicitly so that adding a new reason
            // forces a decision about which counter it belongs to.
            match reason {
                SpaceInsertReason::AdaptiveThreshold => self.spaces_by_adaptive += 1,
                SpaceInsertReason::Heuristic => self.spaces_by_heuristic += 1,
                SpaceInsertReason::AdaptiveAndHeuristic => self.spaces_by_both += 1,
                SpaceInsertReason::BelowThreshold | SpaceInsertReason::NegativeGap => {}
            }
        }

        self.gap_decisions.push(GapDecision {
            gap_index,
            left_text: Self::truncate_for_display(left_text),
            right_text: Self::truncate_for_display(right_text),
            gap_pt,
            font_size,
            space_threshold_pt,
            adaptive_threshold_pt,
            needs_space_by_gap,
            needs_space_by_heuristic,
            needs_space_by_adaptive,
            space_inserted,
            reason,
        });
    }

    /// Shortens `text` for storage in a `GapDecision`.
    ///
    /// Text of at most `MAX_TEXT_BYTES` bytes is returned unchanged. Longer
    /// text is cut at the last character boundary at or before
    /// `KEPT_TEXT_BYTES` and suffixed with `...`.
    fn truncate_for_display(text: &str) -> String {
        if text.len() <= Self::MAX_TEXT_BYTES {
            return text.to_string();
        }
        // Slicing a `str` at a byte offset inside a multi-byte character
        // panics, so back off to the nearest boundary. Offset 0 is always a
        // boundary, which bounds the loop.
        let mut cut = Self::KEPT_TEXT_BYTES;
        while !text.is_char_boundary(cut) {
            cut -= 1;
        }
        format!("{}...", &text[..cut])
    }

    /// Builds a human-readable report for `page_num`.
    ///
    /// The report contains, in order: a header, the first stored statistics
    /// record for the page (if any), the first stored threshold computation
    /// for the page (if any), and up to 30 gap decisions.
    ///
    /// NOTE: gap decisions carry no page number, so the decision section lists
    /// the first 30 decisions recorded on this debugger regardless of
    /// `page_num`. The printed gap number is the 1-based position in that
    /// list, not `GapDecision::gap_index`.
    pub fn generate_page_report(&self, page_num: usize) -> String {
        // Formatting into a `String` cannot fail, so the `unwrap`s below never fire.
        let mut report = String::new();
        let yes_no = |flag: bool| if flag { "YES" } else { "NO" };
        let comparison = |flag: bool| if flag { ">" } else { "<" };

        writeln!(report, "=== PAGE {} SPAN MERGING ANALYSIS ===", page_num).unwrap();
        writeln!(report).unwrap();

        if let Some(stats) = self.page_stats.iter().find(|s| s.page_num == page_num) {
            writeln!(report, "Extracted {} spans from page {}", stats.span_count, page_num)
                .unwrap();
            writeln!(report).unwrap();
            writeln!(report, "Gap Statistics:").unwrap();
            writeln!(report, " Total gaps: {}", stats.gap_count).unwrap();
            writeln!(report, " Positive gaps: {}", stats.positive_gaps).unwrap();
            writeln!(report, " Negative gaps (overlaps): {}", stats.negative_gaps).unwrap();
            writeln!(report, " Min: {:.2}pt", stats.min_gap).unwrap();
            writeln!(report, " Max: {:.2}pt", stats.max_gap).unwrap();
            writeln!(report, " Mean: {:.2}pt", stats.mean_gap).unwrap();
            writeln!(report, " Median: {:.2}pt", stats.median_gap).unwrap();
            writeln!(report, " P25: {:.2}pt, P75: {:.2}pt", stats.p25, stats.p75).unwrap();
            writeln!(report).unwrap();
        }

        if let Some(thresh) = self
            .threshold_computations
            .iter()
            .find(|t| t.page_num == page_num)
        {
            writeln!(report, "Adaptive Threshold Computation:").unwrap();
            writeln!(
                report,
                " Config: {} [multiplier={}, min={}pt, max={}pt]",
                thresh.config_name, thresh.multiplier, thresh.min_threshold, thresh.max_threshold
            )
            .unwrap();
            if thresh.used_bimodal {
                writeln!(report, " Method: Bimodal detection").unwrap();
            } else {
                writeln!(report, " Median gap: {:.2}pt", thresh.median_gap).unwrap();
                writeln!(
                    report,
                    " Computed: {:.2}pt * {} = {:.2}pt",
                    thresh.median_gap, thresh.multiplier, thresh.computed_raw
                )
                .unwrap();
            }
            writeln!(
                report,
                " Clamped to: {:.2}pt (within [{}, {}])",
                thresh.computed_final, thresh.min_threshold, thresh.max_threshold
            )
            .unwrap();
            writeln!(report, " Reason: {}", thresh.reason).unwrap();
            writeln!(report).unwrap();
        }

        if !self.gap_decisions.is_empty() {
            writeln!(report, "Space Insertion Analysis (first 30 gaps):").unwrap();
            for (i, decision) in self.gap_decisions.iter().take(30).enumerate() {
                writeln!(
                    report,
                    " Gap {}: {:.2}pt (span \"{}\" -> \"{}\")",
                    i + 1,
                    decision.gap_pt,
                    decision.left_text,
                    decision.right_text
                )
                .unwrap();
                writeln!(
                    report,
                    " - needs_space_by_gap ({:.2}pt): {} ({:.2} {} {:.2})",
                    decision.space_threshold_pt,
                    yes_no(decision.needs_space_by_gap),
                    decision.gap_pt,
                    comparison(decision.needs_space_by_gap),
                    decision.space_threshold_pt
                )
                .unwrap();
                writeln!(
                    report,
                    " - needs_space_by_heuristic: {}",
                    yes_no(decision.needs_space_by_heuristic)
                )
                .unwrap();
                writeln!(
                    report,
                    " - needs_space_by_adaptive ({:.2}pt): {} ({:.2} {} {:.2})",
                    decision.adaptive_threshold_pt,
                    yes_no(decision.needs_space_by_adaptive),
                    decision.gap_pt,
                    comparison(decision.needs_space_by_adaptive),
                    decision.adaptive_threshold_pt
                )
                .unwrap();
                let marker = if decision.space_inserted {
                    "SPACE INSERTED"
                } else {
                    "NO SPACE"
                };
                writeln!(report, " -> {} ({})", marker, decision.reason).unwrap();
                writeln!(report).unwrap();
            }
        }

        report
    }

    /// Builds a summary: the space-insertion counters followed by one line per
    /// recorded threshold computation, in recording order.
    pub fn generate_summary(&self) -> String {
        // Formatting into a `String` cannot fail, so the `unwrap`s below never fire.
        let mut report = String::new();
        writeln!(report, "=== SPAN MERGING SUMMARY ===").unwrap();
        writeln!(report).unwrap();
        writeln!(report, "Total Spaces Inserted: {}", self.total_spaces_inserted).unwrap();
        writeln!(report, " - By adaptive threshold: {} spaces", self.spaces_by_adaptive).unwrap();
        writeln!(report, " - By heuristic: {} spaces", self.spaces_by_heuristic).unwrap();
        writeln!(report, " - By both (adaptive+heuristic): {} spaces", self.spaces_by_both)
            .unwrap();
        writeln!(report).unwrap();
        writeln!(report, "Per-Page Adaptive Thresholds:").unwrap();
        for thresh in &self.threshold_computations {
            let method = if thresh.used_bimodal {
                "bimodal"
            } else {
                "median*multiplier"
            };
            writeln!(
                report,
                " Page {}: {:.2}pt (median: {:.2}pt, {})",
                thresh.page_num, thresh.computed_final, thresh.median_gap, method
            )
            .unwrap();
        }
        report
    }
}
/// Computes gap statistics for the spans of one page.
///
/// A gap is `next.bbox.left() - previous.bbox.right()` for each pair of
/// consecutive spans, in the order given. With fewer than two spans only
/// `page_num` and `span_count` are set and every other field keeps its
/// default.
///
/// `min_gap`, `max_gap` and `mean_gap` cover all gaps. `median_gap`, `p25`
/// and `p75` are picked by index (`len / 2`, `len / 4`, `3 * len / 4`) from
/// the sorted positive gaps only, and are `0.0` when there are none.
pub fn compute_page_gap_stats(page_num: usize, spans: &[TextSpan]) -> PageGapStats {
    let span_count = spans.len();
    if span_count < 2 {
        return PageGapStats {
            page_num,
            span_count,
            ..Default::default()
        };
    }

    let gaps: Vec<f32> = spans
        .windows(2)
        .map(|pair| pair[1].bbox.left() - pair[0].bbox.right())
        .collect();

    // One pass gathers the extremes, the overlap count and the positive gaps.
    let mut min_gap = f32::INFINITY;
    let mut max_gap = f32::NEG_INFINITY;
    let mut negative_gaps = 0usize;
    let mut positives: Vec<f32> = Vec::new();
    for &gap in &gaps {
        min_gap = min_gap.min(gap);
        max_gap = max_gap.max(gap);
        if gap > 0.0 {
            positives.push(gap);
        }
        if gap < 0.0 {
            negative_gaps += 1;
        }
    }
    let mean_gap = gaps.iter().sum::<f32>() / gaps.len() as f32;

    // Percentiles only consider positive gaps, so overlaps do not skew them.
    positives.sort_by(|a, b| crate::utils::safe_float_cmp(*a, *b));
    let n = positives.len();
    let (median_gap, p25, p75) = if n == 0 {
        (0.0, 0.0, 0.0)
    } else {
        (positives[n / 2], positives[n / 4], positives[3 * n / 4])
    };

    PageGapStats {
        page_num,
        span_count,
        gap_count: gaps.len(),
        positive_gaps: n,
        negative_gaps,
        min_gap,
        max_gap,
        mean_gap,
        median_gap,
        p25,
        p75,
    }
}