pdf_oxide 0.3.38

The fastest Rust PDF library with text extraction: 0.8ms mean, 100% pass rate on 3,830 PDFs. 5× faster than pdf_extract, 17× faster than oxidize_pdf. Extract, create, and edit PDFs.
Documentation
//! Per-page font statistics derived from extracted text spans.
//!
//! Every layout heuristic in the multi-column / reading-order pipeline
//! used to depend on hardcoded absolute thresholds (a 100 pt minimum
//! column width, a 6 pt column-boundary gutter, etc.). That breaks on
//! documents whose body font isn't 12 pt — a 6 pt fax sets one set of
//! gutter sizes, a 24 pt poster sets another, and a hardcoded threshold
//! is necessarily wrong on at least one of them.
//!
//! [`PageFontStats`] holds the four measurements every downstream
//! threshold should be derived from. They are computed once per page in
//! a single pass over the spans and passed by reference into the
//! layout heuristics that need them.
//!
//! ```ignore
//! use pdf_oxide::layout::{PageFontStats, TextSpan};
//! let stats = PageFontStats::from_spans(&spans);
//! let body_em = stats.dominant_em;          // ≈ font size of the body text
//! let line_h  = stats.dominant_line_height;  // baseline-to-baseline distance
//! ```

use super::TextSpan;

/// Per-page font statistics. Computed in O(n) over the spans.
#[derive(Debug, Clone, PartialEq)]
pub struct PageFontStats {
    /// Mode of `font_size` weighted by character count. The "1 em"
    /// for the page's body text. Defaults to 12.0 on an empty page.
    pub dominant_em: f32,

    /// Median baseline-to-baseline distance for adjacent same-column
    /// same-font spans. Approximates the page's set leading. Defaults
    /// to `1.2 × dominant_em` when fewer than two same-font spans are
    /// available to derive it.
    pub dominant_line_height: f32,

    /// Average advance width per character in the dominant font. Used
    /// to convert "N characters wide" thresholds into points. Defaults
    /// to `0.5 × dominant_em` (Helvetica's per-em mean width).
    pub dominant_char_width: f32,

    /// Identifier of the font used by the largest share of characters
    /// on the page. Used by region-distinctness heuristics (caption
    /// vs body) to detect font-discontinuity boundaries.
    pub body_font_name: String,
}

impl Default for PageFontStats {
    fn default() -> Self {
        Self {
            dominant_em: 12.0,
            dominant_line_height: 14.4,
            dominant_char_width: 6.0,
            body_font_name: String::new(),
        }
    }
}

impl PageFontStats {
    /// Compute font statistics from a slice of spans.
    ///
    /// Empty input → returns [`PageFontStats::default`].
    pub fn from_spans(spans: &[TextSpan]) -> Self {
        if spans.is_empty() {
            return Self::default();
        }

        // Mode of font_size weighted by character count.
        // Bin font sizes into 0.25 pt buckets so 11.97 / 12.00 / 12.03
        // (which all came from the same nominal 12 pt body) collapse to
        // one bucket.
        let mut size_buckets: std::collections::HashMap<u16, usize> =
            std::collections::HashMap::new();
        let mut font_buckets: std::collections::HashMap<&str, usize> =
            std::collections::HashMap::new();
        for s in spans {
            let chars = s.text.chars().count();
            if chars == 0 || !s.font_size.is_finite() || s.font_size <= 0.0 {
                continue;
            }
            // Quantize to 0.25 pt buckets.
            let bucket = (s.font_size * 4.0).round() as u16;
            *size_buckets.entry(bucket).or_insert(0) += chars;
            *font_buckets.entry(s.font_name.as_str()).or_insert(0) += chars;
        }
        if size_buckets.is_empty() {
            return Self::default();
        }

        let dominant_em = {
            let (bucket, _) = size_buckets
                .iter()
                .max_by_key(|(_, &count)| count)
                .expect("size_buckets non-empty checked above");
            (*bucket as f32) / 4.0
        };

        let body_font_name = font_buckets
            .iter()
            .max_by_key(|(_, &count)| count)
            .map(|(name, _)| (*name).to_string())
            .unwrap_or_default();

        // Dominant line height: median of baseline-to-baseline distances
        // between vertically-adjacent same-font same-size spans.
        // Sort spans by (font_name, x_center, y descending) and walk
        // adjacent pairs.
        let dominant_line_height =
            compute_line_height(spans, &body_font_name, dominant_em).unwrap_or(dominant_em * 1.2);

        // Dominant char width: total width of dominant-font spans
        // divided by their total character count. Falls back to
        // 0.5 × em when no dominant-font spans have positive width.
        let mut total_width = 0.0_f64;
        let mut total_chars = 0_usize;
        let dominant_size_min = dominant_em - 0.25;
        let dominant_size_max = dominant_em + 0.25;
        for s in spans {
            if s.font_name == body_font_name
                && s.font_size >= dominant_size_min
                && s.font_size <= dominant_size_max
                && s.bbox.width > 0.0
            {
                let chars = s.text.chars().count();
                if chars > 0 {
                    total_width += s.bbox.width as f64;
                    total_chars += chars;
                }
            }
        }
        let dominant_char_width = if total_chars > 0 {
            (total_width / total_chars as f64) as f32
        } else {
            dominant_em * 0.5
        };

        Self {
            dominant_em,
            dominant_line_height,
            dominant_char_width,
            body_font_name,
        }
    }
}

fn compute_line_height(spans: &[TextSpan], body_font: &str, dominant_em: f32) -> Option<f32> {
    // Filter to dominant-font dominant-size spans with finite y.
    let mut body_spans: Vec<(f32, f32)> = spans
        .iter()
        .filter(|s| {
            s.font_name == body_font
                && (s.font_size - dominant_em).abs() < 0.5
                && s.bbox.y.is_finite()
                && s.bbox.x.is_finite()
        })
        .map(|s| (s.bbox.x + s.bbox.width * 0.5, s.bbox.y))
        .collect();
    if body_spans.len() < 4 {
        return None;
    }

    // Group by x-center bucket so vertically adjacent spans in the
    // same column are compared to each other, not across columns.
    body_spans.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));

    let bucket_w = (dominant_em * 6.0).max(40.0); // wide enough that one column lands in one bucket

    let mut gaps: Vec<f32> = Vec::new();
    let mut i = 0;
    while i < body_spans.len() {
        let bucket_start = body_spans[i].0;
        let mut col_ys: Vec<f32> = Vec::new();
        while i < body_spans.len() && body_spans[i].0 - bucket_start <= bucket_w {
            col_ys.push(body_spans[i].1);
            i += 1;
        }
        if col_ys.len() >= 2 {
            // Sort by y descending (top to bottom in PDF coordinates).
            col_ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
            for w in col_ys.windows(2) {
                let g = w[0] - w[1];
                // Only accept gaps in the plausible line-height range
                // [0.5em, 3em] — wider gaps are paragraph breaks or
                // section dividers, narrower are subscripts / inline
                // math.
                if g >= dominant_em * 0.5 && g <= dominant_em * 3.0 {
                    gaps.push(g);
                }
            }
        }
    }

    if gaps.is_empty() {
        return None;
    }
    gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    Some(gaps[gaps.len() / 2])
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::geometry::Rect;

    fn span(text: &str, x: f32, y: f32, font: &str, size: f32) -> TextSpan {
        TextSpan {
            text: text.to_string(),
            bbox: Rect::new(x, y, text.chars().count() as f32 * size * 0.5, size),
            font_name: font.to_string(),
            font_size: size,
            ..Default::default()
        }
    }

    #[test]
    fn empty_spans_returns_default() {
        let stats = PageFontStats::from_spans(&[]);
        assert_eq!(stats.dominant_em, 12.0);
        assert_eq!(stats.dominant_line_height, 14.4);
        assert_eq!(stats.dominant_char_width, 6.0);
        assert!(stats.body_font_name.is_empty());
    }

    #[test]
    fn single_uniform_block_picks_size_and_font() {
        let mut spans = Vec::new();
        let mut y = 720.0;
        for i in 0..20 {
            spans.push(span(&format!("body line {i:02}"), 72.0, y, "Helvetica", 12.0));
            y -= 14.4;
        }
        let stats = PageFontStats::from_spans(&spans);
        assert_eq!(stats.dominant_em, 12.0);
        assert_eq!(stats.body_font_name, "Helvetica");
        // Line height derived from the actual 14.4 pt baseline gap.
        assert!(
            (stats.dominant_line_height - 14.4).abs() < 0.01,
            "expected ~14.4, got {}",
            stats.dominant_line_height
        );
        // Char width ≈ size × 0.5 by our synthetic bbox formula.
        assert!(
            (stats.dominant_char_width - 6.0).abs() < 0.01,
            "expected ~6.0, got {}",
            stats.dominant_char_width
        );
    }

    #[test]
    fn mode_not_mean_when_outliers_present() {
        // 30 chars of 12 pt body + 1 char of 72 pt heading.
        // Mean would be skewed up; mode-by-char-count stays at 12.
        let mut spans: Vec<TextSpan> = (0..15)
            .map(|i| span("two body line", 72.0, 720.0 - i as f32 * 14.4, "Helvetica", 12.0))
            .collect();
        spans.push(span("X", 72.0, 720.0, "Helvetica", 72.0));
        let stats = PageFontStats::from_spans(&spans);
        assert_eq!(stats.dominant_em, 12.0);
    }

    #[test]
    fn body_font_is_majority_by_char_count() {
        // Body in Helvetica (lots of chars), captions in Times (few chars).
        let mut spans: Vec<TextSpan> = (0..15)
            .map(|i| span("body line text", 72.0, 720.0 - i as f32 * 14.4, "Helvetica", 12.0))
            .collect();
        spans.push(span("Fig 1", 200.0, 400.0, "Times", 9.0));
        spans.push(span("Fig 2", 200.0, 300.0, "Times", 9.0));
        let stats = PageFontStats::from_spans(&spans);
        assert_eq!(stats.body_font_name, "Helvetica");
    }

    #[test]
    fn quantized_size_bucket_collapses_near_duplicates() {
        // Sizes 11.97, 12.00, 12.03 should all bucket to 12.0.
        let spans = vec![
            span("aaaaa", 72.0, 720.0, "Helvetica", 11.97),
            span("bbbbb", 72.0, 706.0, "Helvetica", 12.00),
            span("ccccc", 72.0, 692.0, "Helvetica", 12.03),
            span("ddddd", 72.0, 678.0, "Helvetica", 12.00),
        ];
        let stats = PageFontStats::from_spans(&spans);
        assert!(
            (stats.dominant_em - 12.0).abs() < 0.05,
            "expected 12.0, got {}",
            stats.dominant_em
        );
    }

    #[test]
    fn line_height_falls_back_to_1_2_em_when_unmeasurable() {
        // Single span has no neighbour to measure baseline gap from.
        let spans = vec![span("solo", 72.0, 720.0, "Helvetica", 12.0)];
        let stats = PageFontStats::from_spans(&spans);
        assert!(
            (stats.dominant_line_height - 14.4).abs() < 0.01,
            "fallback expected, got {}",
            stats.dominant_line_height
        );
    }

    #[test]
    fn char_width_uses_dominant_font_only() {
        // Body 12 pt Helvetica with char-width 6.0; an outlier Times
        // span at 9 pt should NOT pull the dominant_char_width away.
        let mut spans: Vec<TextSpan> = (0..15)
            .map(|i| span("body line", 72.0, 720.0 - i as f32 * 14.4, "Helvetica", 12.0))
            .collect();
        spans.push(span("CAPTION CAPTION CAPTION", 200.0, 400.0, "Times", 9.0));
        let stats = PageFontStats::from_spans(&spans);
        assert!(
            (stats.dominant_char_width - 6.0).abs() < 0.01,
            "expected ~6.0, got {}",
            stats.dominant_char_width
        );
    }

    #[test]
    fn ignores_non_finite_or_zero_size() {
        let spans = vec![
            span("ok", 72.0, 720.0, "Helvetica", 12.0),
            span("zero", 72.0, 700.0, "Helvetica", 0.0),
            TextSpan {
                text: "nan".into(),
                bbox: Rect::new(72.0, 680.0, 10.0, 12.0),
                font_name: "Helvetica".into(),
                font_size: f32::NAN,
                ..Default::default()
            },
        ];
        let stats = PageFontStats::from_spans(&spans);
        // Only the valid 12.0 span counts → dominant_em = 12.
        assert_eq!(stats.dominant_em, 12.0);
    }
}