use super::TextSpan;
/// Dominant typographic measurements of the text on a single page.
///
/// Values are expressed in the same units as the `font_size` and `bbox`
/// fields of the spans they were derived from.
#[derive(Clone, Debug, PartialEq)]
pub struct PageFontStats {
    /// Most common font size, weighted by character count and quantised to
    /// quarter units.
    pub dominant_em: f32,
    /// Typical vertical distance between consecutive lines of body text.
    pub dominant_line_height: f32,
    /// Average horizontal advance per character of body text.
    pub dominant_char_width: f32,
    /// Name of the font that carries the most characters on the page; empty
    /// when no font could be determined.
    pub body_font_name: String,
}
impl Default for PageFontStats {
fn default() -> Self {
Self {
dominant_em: 12.0,
dominant_line_height: 14.4,
dominant_char_width: 6.0,
body_font_name: String::new(),
}
}
}
impl PageFontStats {
    /// Derives page-level font statistics from the text spans of one page.
    ///
    /// Spans with no characters, or with a non-finite or non-positive
    /// `font_size`, are ignored when choosing the dominant size and font. If
    /// nothing usable remains (including an empty `spans`), the result is
    /// [`PageFontStats::default`].
    ///
    /// * `dominant_em` is the font size, quantised to quarter units, that
    ///   covers the most characters.
    /// * `body_font_name` is the font name that covers the most characters.
    /// * `dominant_line_height` comes from `compute_line_height`, falling back
    ///   to `1.2 * dominant_em` when it cannot be measured.
    /// * `dominant_char_width` is total box width divided by total characters
    ///   over spans in the body font whose size is within 0.25 of
    ///   `dominant_em` and whose width is finite and positive, falling back to
    ///   `0.5 * dominant_em`.
    ///
    /// Ties are resolved deterministically: the smaller size and the
    /// lexicographically smaller font name win.
    pub fn from_spans(spans: &[TextSpan]) -> Self {
        if spans.is_empty() {
            return Self::default();
        }

        // Character-weighted histograms of quantised font size and font name.
        let mut size_buckets: std::collections::HashMap<u16, usize> =
            std::collections::HashMap::new();
        let mut font_buckets: std::collections::HashMap<&str, usize> =
            std::collections::HashMap::new();
        for s in spans {
            let chars = s.text.chars().count();
            if chars == 0 || !s.font_size.is_finite() || s.font_size <= 0.0 {
                continue;
            }
            // Quarter-unit buckets merge near-identical sizes such as 11.97
            // and 12.03. The float-to-int cast saturates at `u16::MAX`.
            let bucket = (s.font_size * 4.0).round() as u16;
            *size_buckets.entry(bucket).or_insert(0) += chars;
            *font_buckets.entry(s.font_name.as_str()).or_insert(0) += chars;
        }

        // `HashMap` iteration order is unspecified, so equal counts must be
        // broken on the key or the result would vary from run to run.
        let dominant_em = match size_buckets
            .iter()
            .max_by(|a, b| a.1.cmp(b.1).then_with(|| b.0.cmp(a.0)))
        {
            Some((bucket, _)) => f32::from(*bucket) / 4.0,
            // Every span was filtered out above.
            None => return Self::default(),
        };
        let body_font_name = font_buckets
            .iter()
            .max_by(|a, b| a.1.cmp(b.1).then_with(|| b.0.cmp(a.0)))
            .map(|(name, _)| (*name).to_string())
            .unwrap_or_default();

        let dominant_line_height =
            compute_line_height(spans, &body_font_name, dominant_em).unwrap_or(dominant_em * 1.2);

        // Average advance per character over body-text spans only. Widths are
        // accumulated in f64; non-finite widths are skipped so that a single
        // bad box cannot turn the average into infinity.
        let dominant_size_min = dominant_em - 0.25;
        let dominant_size_max = dominant_em + 0.25;
        let (total_width, total_chars) = spans
            .iter()
            .filter(|s| {
                s.font_name == body_font_name
                    && s.font_size >= dominant_size_min
                    && s.font_size <= dominant_size_max
                    && s.bbox.width.is_finite()
                    && s.bbox.width > 0.0
            })
            .map(|s| (f64::from(s.bbox.width), s.text.chars().count()))
            .filter(|&(_, chars)| chars > 0)
            .fold((0.0_f64, 0_usize), |(width, count), (w, c)| {
                (width + w, count + c)
            });
        let dominant_char_width = if total_chars > 0 {
            (total_width / total_chars as f64) as f32
        } else {
            dominant_em * 0.5
        };

        Self {
            dominant_em,
            dominant_line_height,
            dominant_char_width,
            body_font_name,
        }
    }
}
/// Estimates the vertical distance between consecutive lines of body text.
///
/// Only spans set in `body_font` whose size is within 0.5 of `dominant_em`
/// take part, and only when both their horizontal centre
/// (`bbox.x + bbox.width / 2`) and `bbox.y` are finite. Spans are grouped into
/// columns by centre: a column extends at most `max(6 * dominant_em, 40)` to
/// the right of its left-most centre. Inside each column the differences
/// between neighbouring `bbox.y` values are collected, keeping only those
/// between `0.5 * dominant_em` and `3 * dominant_em`.
///
/// Returns `None` when fewer than four spans qualify or no gap survives the
/// filter; otherwise returns the median gap (the upper of the two middle
/// values when the count is even).
fn compute_line_height(spans: &[TextSpan], body_font: &str, dominant_em: f32) -> Option<f32> {
    // (horizontal centre, y) per body-text span. Filtering on the computed
    // centre also rejects NaN/infinite widths, which would otherwise break
    // the ordering below and stall the bucketing loop.
    let mut body_spans: Vec<(f32, f32)> = spans
        .iter()
        .filter(|s| s.font_name == body_font && (s.font_size - dominant_em).abs() < 0.5)
        .map(|s| (s.bbox.x + s.bbox.width * 0.5, s.bbox.y))
        .filter(|(center, y)| center.is_finite() && y.is_finite())
        .collect();
    if body_spans.len() < 4 {
        return None;
    }
    // All centres are finite here, so `partial_cmp` never yields `None`.
    body_spans.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));

    let bucket_w = (dominant_em * 6.0).max(40.0);
    let min_gap = dominant_em * 0.5;
    let max_gap = dominant_em * 3.0;
    let mut gaps: Vec<f32> = Vec::new();

    let mut remaining: &[(f32, f32)] = &body_spans;
    while let Some(&(bucket_start, _)) = remaining.first() {
        // `max(1)` guarantees that every pass consumes at least one span.
        let column_len = remaining
            .iter()
            .take_while(|span| span.0 - bucket_start <= bucket_w)
            .count()
            .max(1);
        let (column, rest) = remaining.split_at(column_len);
        remaining = rest;
        if column.len() < 2 {
            continue;
        }

        // Sort descending by y so each window yields a non-negative gap;
        // spans sharing a y produce a zero gap, which the range filter drops.
        let mut col_ys: Vec<f32> = column.iter().map(|span| span.1).collect();
        col_ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
        gaps.extend(
            col_ys
                .windows(2)
                .map(|pair| pair[0] - pair[1])
                .filter(|gap| *gap >= min_gap && *gap <= max_gap),
        );
    }

    gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    // `get` returns `None` for an empty list, covering the "no gaps" case.
    gaps.get(gaps.len() / 2).copied()
}
/// Unit tests for `PageFontStats::from_spans`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::geometry::Rect;

    /// Builds a span whose box is half an em wide per character and one em
    /// tall, positioned at (`x`, `y`).
    fn span(text: &str, x: f32, y: f32, font: &str, size: f32) -> TextSpan {
        let width = text.chars().count() as f32 * size * 0.5;
        TextSpan {
            text: text.to_string(),
            bbox: Rect::new(x, y, width, size),
            font_name: font.to_string(),
            font_size: size,
            ..Default::default()
        }
    }

    /// Fifteen lines of `text` in 12-unit Helvetica at x = 72, starting at
    /// y = 720 and stepping down by 14.4 per line.
    fn body_column(text: &str) -> Vec<TextSpan> {
        (0..15)
            .map(|row| span(text, 72.0, 720.0 - row as f32 * 14.4, "Helvetica", 12.0))
            .collect()
    }

    #[test]
    fn empty_spans_returns_default() {
        let PageFontStats {
            dominant_em,
            dominant_line_height,
            dominant_char_width,
            body_font_name,
        } = PageFontStats::from_spans(&[]);
        assert_eq!(dominant_em, 12.0);
        assert_eq!(dominant_line_height, 14.4);
        assert_eq!(dominant_char_width, 6.0);
        assert!(body_font_name.is_empty());
    }

    #[test]
    fn single_uniform_block_picks_size_and_font() {
        // The y position is accumulated by repeated subtraction, line by line.
        let spans: Vec<TextSpan> = std::iter::successors(Some(720.0_f32), |y| Some(*y - 14.4))
            .take(20)
            .enumerate()
            .map(|(i, y)| span(&format!("body line {i:02}"), 72.0, y, "Helvetica", 12.0))
            .collect();

        let stats = PageFontStats::from_spans(&spans);
        assert_eq!(stats.dominant_em, 12.0);
        assert_eq!(stats.body_font_name, "Helvetica");

        let line_height_error = (stats.dominant_line_height - 14.4).abs();
        assert!(
            line_height_error < 0.01,
            "expected ~14.4, got {}",
            stats.dominant_line_height
        );
        let char_width_error = (stats.dominant_char_width - 6.0).abs();
        assert!(
            char_width_error < 0.01,
            "expected ~6.0, got {}",
            stats.dominant_char_width
        );
    }

    #[test]
    fn mode_not_mean_when_outliers_present() {
        // One huge single-character span must not drag the dominant size up.
        let mut spans = body_column("two body line");
        spans.push(span("X", 72.0, 720.0, "Helvetica", 72.0));

        let stats = PageFontStats::from_spans(&spans);
        assert_eq!(stats.dominant_em, 12.0);
    }

    #[test]
    fn body_font_is_majority_by_char_count() {
        let mut spans = body_column("body line text");
        spans.extend(vec![
            span("Fig 1", 200.0, 400.0, "Times", 9.0),
            span("Fig 2", 200.0, 300.0, "Times", 9.0),
        ]);

        let stats = PageFontStats::from_spans(&spans);
        assert_eq!(stats.body_font_name, "Helvetica");
    }

    #[test]
    fn quantized_size_bucket_collapses_near_duplicates() {
        let rows = [
            ("aaaaa", 720.0, 11.97),
            ("bbbbb", 706.0, 12.00),
            ("ccccc", 692.0, 12.03),
            ("ddddd", 678.0, 12.00),
        ];
        let spans: Vec<TextSpan> = rows
            .iter()
            .map(|&(text, y, size)| span(text, 72.0, y, "Helvetica", size))
            .collect();

        let stats = PageFontStats::from_spans(&spans);
        let em_error = (stats.dominant_em - 12.0).abs();
        assert!(em_error < 0.05, "expected 12.0, got {}", stats.dominant_em);
    }

    #[test]
    fn line_height_falls_back_to_1_2_em_when_unmeasurable() {
        // A single span offers no line-to-line gap to measure.
        let spans = [span("solo", 72.0, 720.0, "Helvetica", 12.0)];

        let stats = PageFontStats::from_spans(&spans);
        let line_height_error = (stats.dominant_line_height - 14.4).abs();
        assert!(
            line_height_error < 0.01,
            "fallback expected, got {}",
            stats.dominant_line_height
        );
    }

    #[test]
    fn char_width_uses_dominant_font_only() {
        let mut spans = body_column("body line");
        spans.push(span("CAPTION CAPTION CAPTION", 200.0, 400.0, "Times", 9.0));

        let stats = PageFontStats::from_spans(&spans);
        let char_width_error = (stats.dominant_char_width - 6.0).abs();
        assert!(
            char_width_error < 0.01,
            "expected ~6.0, got {}",
            stats.dominant_char_width
        );
    }

    #[test]
    fn ignores_non_finite_or_zero_size() {
        let nan_sized = TextSpan {
            text: String::from("nan"),
            bbox: Rect::new(72.0, 680.0, 10.0, 12.0),
            font_name: String::from("Helvetica"),
            font_size: f32::NAN,
            ..Default::default()
        };
        let spans = vec![
            span("ok", 72.0, 720.0, "Helvetica", 12.0),
            span("zero", 72.0, 700.0, "Helvetica", 0.0),
            nan_sized,
        ];

        let stats = PageFontStats::from_spans(&spans);
        assert_eq!(stats.dominant_em, 12.0);
    }
}