use super::TextSpan;
#[derive(Debug, Clone, PartialEq)]
pub struct PageFontStats {
pub dominant_em: f32,
pub dominant_line_height: f32,
pub dominant_char_width: f32,
pub body_font_name: String,
}
impl Default for PageFontStats {
fn default() -> Self {
Self {
dominant_em: 12.0,
dominant_line_height: 14.4,
dominant_char_width: 6.0,
body_font_name: String::new(),
}
}
}
impl PageFontStats {
pub fn from_spans(spans: &[TextSpan]) -> Self {
if spans.is_empty() {
return Self::default();
}
let mut size_buckets: std::collections::HashMap<u32, usize> =
std::collections::HashMap::new();
let mut font_buckets: std::collections::HashMap<&str, usize> =
std::collections::HashMap::new();
for s in spans {
let chars = s.text.chars().count();
if chars == 0 || !s.font_size.is_finite() || s.font_size <= 0.0 {
continue;
}
let bucket = (s.font_size * 4.0).round() as u32;
*size_buckets.entry(bucket).or_insert(0) += chars;
*font_buckets.entry(s.font_name.as_str()).or_insert(0) += chars;
}
if size_buckets.is_empty() {
return Self::default();
}
let dominant_em = {
let (bucket, _) = size_buckets
.iter()
.max_by_key(|(_, &count)| count)
.expect("size_buckets non-empty checked above");
(*bucket as f32) / 4.0
};
let body_font_name = font_buckets
.iter()
.max_by_key(|(_, &count)| count)
.map(|(name, _)| (*name).to_string())
.unwrap_or_default();
let dominant_line_height =
compute_line_height(spans, &body_font_name, dominant_em).unwrap_or(dominant_em * 1.2);
let mut total_width = 0.0_f64;
let mut total_chars = 0_usize;
let dominant_size_min = dominant_em - 0.25;
let dominant_size_max = dominant_em + 0.25;
for s in spans {
if s.font_name == body_font_name
&& s.font_size >= dominant_size_min
&& s.font_size <= dominant_size_max
&& s.bbox.width > 0.0
{
let chars = s.text.chars().count();
if chars > 0 {
total_width += s.bbox.width as f64;
total_chars += chars;
}
}
}
let dominant_char_width = if total_chars > 0 {
(total_width / total_chars as f64) as f32
} else {
dominant_em * 0.5
};
Self {
dominant_em,
dominant_line_height,
dominant_char_width,
body_font_name,
}
}
}
fn compute_line_height(spans: &[TextSpan], body_font: &str, dominant_em: f32) -> Option<f32> {
let mut ys: Vec<f32> = spans
.iter()
.filter(|s| {
s.font_name == body_font
&& (s.font_size - dominant_em).abs() < 0.5
&& s.bbox.y.is_finite()
})
.map(|s| s.bbox.y)
.collect();
if ys.len() < 4 {
return None;
}
ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
ys.dedup_by(|later, earlier| (*earlier - *later).abs() < 0.5);
let mut gaps: Vec<f32> = ys
.windows(2)
.map(|w| w[0] - w[1])
.filter(|&g| g >= dominant_em * 0.5 && g <= dominant_em * 3.0)
.collect();
if gaps.is_empty() {
return None;
}
gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
Some(gaps[gaps.len() / 2])
}
#[cfg(test)]
mod tests {
use super::*;
use crate::geometry::Rect;
fn span(text: &str, x: f32, y: f32, font: &str, size: f32) -> TextSpan {
TextSpan {
text: text.to_string(),
bbox: Rect::new(x, y, text.chars().count() as f32 * size * 0.5, size),
font_name: font.to_string(),
font_size: size,
..Default::default()
}
}
#[test]
fn empty_spans_returns_default() {
let stats = PageFontStats::from_spans(&[]);
assert_eq!(stats.dominant_em, 12.0);
assert_eq!(stats.dominant_line_height, 14.4);
assert_eq!(stats.dominant_char_width, 6.0);
assert!(stats.body_font_name.is_empty());
}
#[test]
fn single_uniform_block_picks_size_and_font() {
let mut spans = Vec::new();
let mut y = 720.0;
for i in 0..20 {
spans.push(span(&format!("body line {i:02}"), 72.0, y, "Helvetica", 12.0));
y -= 14.4;
}
let stats = PageFontStats::from_spans(&spans);
assert_eq!(stats.dominant_em, 12.0);
assert_eq!(stats.body_font_name, "Helvetica");
assert!(
(stats.dominant_line_height - 14.4).abs() < 0.01,
"expected ~14.4, got {}",
stats.dominant_line_height
);
assert!(
(stats.dominant_char_width - 6.0).abs() < 0.01,
"expected ~6.0, got {}",
stats.dominant_char_width
);
}
#[test]
fn mode_not_mean_when_outliers_present() {
let mut spans: Vec<TextSpan> = (0..15)
.map(|i| span("two body line", 72.0, 720.0 - i as f32 * 14.4, "Helvetica", 12.0))
.collect();
spans.push(span("X", 72.0, 720.0, "Helvetica", 72.0));
let stats = PageFontStats::from_spans(&spans);
assert_eq!(stats.dominant_em, 12.0);
}
#[test]
fn body_font_is_majority_by_char_count() {
let mut spans: Vec<TextSpan> = (0..15)
.map(|i| span("body line text", 72.0, 720.0 - i as f32 * 14.4, "Helvetica", 12.0))
.collect();
spans.push(span("Fig 1", 200.0, 400.0, "Times", 9.0));
spans.push(span("Fig 2", 200.0, 300.0, "Times", 9.0));
let stats = PageFontStats::from_spans(&spans);
assert_eq!(stats.body_font_name, "Helvetica");
}
#[test]
fn quantized_size_bucket_collapses_near_duplicates() {
let spans = vec![
span("aaaaa", 72.0, 720.0, "Helvetica", 11.97),
span("bbbbb", 72.0, 706.0, "Helvetica", 12.00),
span("ccccc", 72.0, 692.0, "Helvetica", 12.03),
span("ddddd", 72.0, 678.0, "Helvetica", 12.00),
];
let stats = PageFontStats::from_spans(&spans);
assert!(
(stats.dominant_em - 12.0).abs() < 0.05,
"expected 12.0, got {}",
stats.dominant_em
);
}
#[test]
fn line_height_falls_back_to_1_2_em_when_unmeasurable() {
let spans = vec![span("solo", 72.0, 720.0, "Helvetica", 12.0)];
let stats = PageFontStats::from_spans(&spans);
assert!(
(stats.dominant_line_height - 14.4).abs() < 0.01,
"fallback expected, got {}",
stats.dominant_line_height
);
}
#[test]
fn char_width_uses_dominant_font_only() {
let mut spans: Vec<TextSpan> = (0..15)
.map(|i| span("body line", 72.0, 720.0 - i as f32 * 14.4, "Helvetica", 12.0))
.collect();
spans.push(span("CAPTION CAPTION CAPTION", 200.0, 400.0, "Times", 9.0));
let stats = PageFontStats::from_spans(&spans);
assert!(
(stats.dominant_char_width - 6.0).abs() < 0.01,
"expected ~6.0, got {}",
stats.dominant_char_width
);
}
#[test]
fn wide_column_line_height_measured_correctly() {
let mut spans = Vec::new();
let mut y = 720.0f32;
for i in 0..20 {
let w = 10.0 + (i as f32 * 20.0);
spans.push(TextSpan {
text: format!("line {i:02}"),
bbox: crate::geometry::Rect::new(72.0, y, w, 12.0),
font_name: "Helvetica".into(),
font_size: 12.0,
..Default::default()
});
y -= 14.4;
}
let stats = PageFontStats::from_spans(&spans);
assert!(
(stats.dominant_line_height - 14.4).abs() < 0.5,
"wide-column line height must be ~14.4, got {}",
stats.dominant_line_height
);
}
#[test]
fn two_column_line_height_measured_correctly() {
let mut spans = Vec::new();
for col_x in [72.0_f32, 320.0] {
let mut y = 720.0;
for i in 0..10 {
spans.push(TextSpan {
text: format!("col{col_x:.0} line {i:02}"),
bbox: crate::geometry::Rect::new(col_x, y, 100.0, 12.0),
font_name: "Helvetica".into(),
font_size: 12.0,
..Default::default()
});
y -= 14.4;
}
}
let stats = PageFontStats::from_spans(&spans);
assert!(
(stats.dominant_line_height - 14.4).abs() < 0.5,
"two-column line height must be ~14.4, got {}",
stats.dominant_line_height
);
}
#[test]
fn ignores_non_finite_or_zero_size() {
let spans = vec![
span("ok", 72.0, 720.0, "Helvetica", 12.0),
span("zero", 72.0, 700.0, "Helvetica", 0.0),
TextSpan {
text: "nan".into(),
bbox: Rect::new(72.0, 680.0, 10.0, 12.0),
font_name: "Helvetica".into(),
font_size: f32::NAN,
..Default::default()
},
];
let stats = PageFontStats::from_spans(&spans);
assert_eq!(stats.dominant_em, 12.0);
}
}