use crate::geometry::Rect;
use crate::layout::TextChar;
/// Page-level statistics measured from extracted characters.
///
/// Instances are produced by `DocumentProperties::analyze`; all lengths are in
/// the same coordinate units as the character bounding boxes.
#[derive(Debug, Clone)]
pub struct DocumentProperties {
    /// Median of the characters' `font_size` values.
    pub median_font_size: f32,
    /// Median of the characters' bounding-box widths.
    pub median_char_width: f32,
    /// Median distance between neighbouring text-line positions, where lines
    /// are approximated by bucketing character `y` coordinates.
    pub median_line_spacing: f32,
    /// Width of the page bounding box handed to the analysis.
    pub page_width: f32,
    /// Height of the page bounding box handed to the analysis.
    pub page_height: f32,
    /// Estimated number of text columns (the analysis reports 1 to 4).
    pub column_count: usize,
    /// Number of characters divided by the number of detected text lines.
    pub avg_chars_per_line: f32,
    /// Population variance of the detected text-line `y` positions.
    pub line_y_variance: f32,
}
impl DocumentProperties {
    /// Measures document-level statistics from the characters of one page.
    ///
    /// `page_bbox` supplies the page width and height; its width is also used
    /// to size the histogram for column detection.
    ///
    /// # Errors
    ///
    /// Returns `Err("Cannot analyze empty page")` when `chars` is empty.
    pub fn analyze(chars: &[TextChar], page_bbox: Rect) -> Result<Self, String> {
        if chars.is_empty() {
            return Err("Cannot analyze empty page".into());
        }

        let (median_line_spacing, avg_chars_per_line, line_y_variance) =
            Self::estimate_line_properties(chars);

        Ok(Self {
            median_font_size: Self::compute_median_font_size(chars),
            median_char_width: Self::compute_median_char_width(chars),
            median_line_spacing,
            page_width: page_bbox.width,
            page_height: page_bbox.height,
            column_count: Self::detect_column_count(chars, page_bbox.width),
            avg_chars_per_line,
            line_y_variance,
        })
    }

    /// Sorts `values` and returns the element at index `len / 2` (the upper
    /// median for even lengths), or `fallback` when `values` is empty.
    fn upper_median(mut values: Vec<f32>, fallback: f32) -> f32 {
        values.sort_by(|a, b| crate::utils::safe_float_cmp(*a, *b));
        values.get(values.len() / 2).copied().unwrap_or(fallback)
    }

    /// Median `font_size` across `chars`; `12.0` when `chars` is empty.
    fn compute_median_font_size(chars: &[TextChar]) -> f32 {
        Self::upper_median(chars.iter().map(|c| c.font_size).collect(), 12.0)
    }

    /// Median bounding-box width across `chars`; `6.0` when `chars` is empty.
    fn compute_median_char_width(chars: &[TextChar]) -> f32 {
        Self::upper_median(chars.iter().map(|c| c.bbox.width).collect(), 6.0)
    }

    /// Estimates `(median_line_spacing, avg_chars_per_line, line_y_variance)`.
    ///
    /// Text lines are approximated by rounding each character's `bbox.y` to
    /// the nearest multiple of 5 units; every distinct bucket counts as one
    /// line. Spacing is the distance between neighbouring occupied buckets,
    /// and the variance is the population variance of the bucket positions.
    /// Returns `(12.0, 50.0, 0.0)` for empty input, and a spacing of `12.0`
    /// when only one line is found.
    fn estimate_line_properties(chars: &[TextChar]) -> (f32, f32, f32) {
        use std::collections::BTreeSet;

        const Y_BIN_SIZE: f32 = 5.0;

        if chars.is_empty() {
            return (12.0, 50.0, 0.0);
        }

        // Only the set of occupied buckets matters, not which characters
        // landed in each one, so a set is enough.
        let occupied_bins: BTreeSet<i32> = chars
            .iter()
            .map(|ch| (ch.bbox.y / Y_BIN_SIZE).round() as i32)
            .collect();

        // Line positions from the largest y to the smallest.
        let line_ys: Vec<f32> = occupied_bins
            .iter()
            .rev()
            .map(|&bin| bin as f32 * Y_BIN_SIZE)
            .collect();

        let spacings: Vec<f32> = line_ys
            .windows(2)
            .map(|pair| (pair[0] - pair[1]).abs())
            .filter(|&spacing| spacing > 0.1)
            .collect();
        let median_line_spacing = Self::upper_median(spacings, 12.0);

        // `chars` is non-empty here, so there is at least one line.
        let line_count = line_ys.len() as f32;
        let avg_chars_per_line = chars.len() as f32 / line_count;

        let mean_y = line_ys.iter().sum::<f32>() / line_count;
        let variance = line_ys
            .iter()
            .map(|&y| (y - mean_y).powi(2))
            .sum::<f32>()
            / line_count;

        (median_line_spacing, avg_chars_per_line, variance)
    }

    /// Estimates the number of text columns (1 to 4) from a histogram of
    /// character left edges.
    ///
    /// The page width is split into 10-unit bins. A bin is "empty" when its
    /// count is at most 10% of the fullest bin. Every run of at least three
    /// empty bins that lies between two non-empty bins counts as one column
    /// gap; leading and trailing margins are not counted.
    ///
    /// Characters whose `bbox.x` is negative, NaN, or beyond `page_width` are
    /// ignored. A non-finite or non-positive `page_width` yields `1`.
    fn detect_column_count(chars: &[TextChar], page_width: f32) -> usize {
        const BIN_WIDTH: f32 = 10.0;
        const MIN_GAP_BINS: usize = 3;
        const MAX_COLUMNS: usize = 4;

        // A non-finite width would saturate the bin count to `usize::MAX` and
        // make the histogram allocation panic.
        if chars.is_empty() || !page_width.is_finite() || page_width <= 0.0 {
            return 1;
        }

        let bin_count = (page_width / BIN_WIDTH).ceil() as usize;
        let mut bins = vec![0usize; bin_count];
        for ch in chars {
            let x = ch.bbox.x;
            // Float-to-usize casts saturate, so negative or NaN positions
            // would otherwise be counted in bin 0 as if they were content at
            // the left edge of the page.
            if x.is_nan() || x < 0.0 {
                continue;
            }
            let bin = (x / BIN_WIDTH).floor() as usize;
            if let Some(count) = bins.get_mut(bin) {
                *count += 1;
            }
        }

        let max_density = bins.iter().copied().max().unwrap_or(0);
        let gap_threshold = (max_density as f32 * 0.1) as usize;

        let mut gap_count = 0usize;
        let mut empty_run = 0usize;
        let mut seen_content = false;
        for &density in &bins {
            if density <= gap_threshold {
                empty_run += 1;
            } else {
                // A gap is only counted once content resumes after it.
                if seen_content && empty_run >= MIN_GAP_BINS {
                    gap_count += 1;
                }
                empty_run = 0;
                seen_content = true;
            }
        }

        (gap_count + 1).min(MAX_COLUMNS)
    }
}
/// Tunable thresholds for layout analysis.
///
/// Values are either derived from measured document statistics
/// (`AdaptiveLayoutParams::from_properties`) or taken from fixed fallbacks
/// (`AdaptiveLayoutParams::default_for_letter_pdf`).
///
/// NOTE(review): the code that consumes these parameters is not in this file;
/// the field descriptions below only state how the constructors here fill them.
#[derive(Debug, Clone)]
pub struct AdaptiveLayoutParams {
    /// Minimum gap ratio for XY-cut; both constructors set `0.05`.
    pub xy_cut_min_gap_ratio: f32,
    /// Word gap threshold; derived as 0.3 x the median character width.
    pub word_gap_threshold: f32,
    /// Line gap threshold; derived as the smaller of 1.3 x the median line
    /// spacing and 0.8 x the median font size.
    pub line_gap_threshold: f32,
    /// Column gap threshold; derived as 2 x the median font size.
    pub column_gap_threshold: f32,
    /// Maximum XY-cut depth; both constructors set `10`.
    pub xy_cut_max_depth: u32,
    /// Minimum XY-cut region size; derived as the square root of 5% of the
    /// page area.
    pub xy_cut_min_region_size: f32,
    /// Gaussian sigma; derived from the average characters per line in three
    /// tiers (0.5, 1.5, 2.5).
    pub gaussian_sigma: f32,
}
impl AdaptiveLayoutParams {
    /// Derives layout thresholds from measured document statistics.
    ///
    /// Gap thresholds scale with the median character width, line spacing and
    /// font size; the minimum region size scales with the page area; the
    /// Gaussian sigma is picked from three tiers by average characters per
    /// line (below 30, below 60, otherwise).
    pub fn from_properties(props: &DocumentProperties) -> Self {
        let gaussian_sigma = match props.avg_chars_per_line {
            density if density < 30.0 => 0.5,
            density if density < 60.0 => 1.5,
            _ => 2.5,
        };

        // The line threshold is capped by whichever estimate is smaller.
        let spacing_based_gap = props.median_line_spacing * 1.3;
        let font_based_gap = props.median_font_size * 0.8;

        let page_area = props.page_width * props.page_height;

        Self {
            xy_cut_min_gap_ratio: 0.05,
            word_gap_threshold: props.median_char_width * 0.3,
            line_gap_threshold: spacing_based_gap.min(font_based_gap),
            column_gap_threshold: props.median_font_size * 2.0,
            xy_cut_max_depth: 10,
            xy_cut_min_region_size: (page_area * 0.05).sqrt(),
            gaussian_sigma,
        }
    }

    /// Returns fixed fallback parameters that do not depend on any measured
    /// document statistics.
    pub fn default_for_letter_pdf() -> Self {
        Self {
            xy_cut_min_gap_ratio: 0.05,
            word_gap_threshold: 3.0,
            line_gap_threshold: 15.0,
            column_gap_threshold: 24.0,
            xy_cut_max_depth: 10,
            xy_cut_min_region_size: 50.0,
            gaussian_sigma: 2.0,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::layout::{Color, FontWeight};

    /// Builds an upright, normal-weight `'x'` glyph whose bounding box is
    /// `Rect::new(x, y, 6.0, font_size)`.
    fn mock_char(x: f32, y: f32, font_size: f32) -> TextChar {
        let bbox = Rect::new(x, y, 6.0, font_size);
        TextChar {
            char: 'x',
            bbox,
            font_name: "Times".to_string(),
            font_size,
            font_weight: FontWeight::Normal,
            color: Color::black(),
            mcid: None,
            is_italic: false,
            is_monospace: false,
            origin_x: bbox.x,
            origin_y: bbox.y,
            rotation_degrees: 0.0,
            advance_width: bbox.width,
            matrix: None,
        }
    }

    /// The middle of five font sizes is reported as the median.
    #[test]
    fn test_median_font_size() {
        let chars: Vec<TextChar> = [10.0, 12.0, 12.0, 14.0, 16.0]
            .iter()
            .enumerate()
            .map(|(idx, &size)| mock_char(idx as f32 * 10.0, 100.0, size))
            .collect();

        assert_eq!(DocumentProperties::compute_median_font_size(&chars), 12.0);
    }

    /// One contiguous band of glyphs is a single column.
    #[test]
    fn test_column_detection_single() {
        let chars: Vec<TextChar> = (0..100)
            .map(|i| mock_char(100.0 + (i % 10) as f32 * 10.0, 100.0, 12.0))
            .collect();

        assert_eq!(DocumentProperties::detect_column_count(&chars, 612.0), 1);
    }

    /// Two bands of glyphs separated by a wide empty stretch are two columns.
    #[test]
    fn test_column_detection_double() {
        let chars: Vec<TextChar> = [50.0, 350.0]
            .iter()
            .flat_map(|&left| {
                (0..50).map(move |i| mock_char(left + (i % 15) as f32 * 10.0, 100.0, 12.0))
            })
            .collect();

        assert_eq!(DocumentProperties::detect_column_count(&chars, 612.0), 2);
    }

    /// Derived thresholds are positive and the word gap is below the column gap.
    #[test]
    fn test_adaptive_params_from_properties() {
        let chars = vec![
            mock_char(0.0, 100.0, 12.0),
            mock_char(10.0, 100.0, 12.0),
            mock_char(20.0, 85.0, 12.0),
            mock_char(30.0, 85.0, 12.0),
        ];
        let props =
            DocumentProperties::analyze(&chars, Rect::new(0.0, 0.0, 612.0, 792.0)).unwrap();
        let params = AdaptiveLayoutParams::from_properties(&props);

        assert!(params.word_gap_threshold > 0.0);
        assert!(params.line_gap_threshold > 0.0);
        assert!(params.column_gap_threshold > 0.0);
        assert!(params.word_gap_threshold < params.column_gap_threshold);
    }
}