use crate::document::PdfDocument;
use crate::error::Result;
use crate::geometry::Rect;
use crate::pipeline::{OrderedTextSpan, ReadingOrderContext, TextPipeline, TextPipelineConfig};
/// Returns the text spans of page `page_index` in reading order, keeping
/// spans that carry an artifact type.
///
/// Thin wrapper over `page_reading_order_inner` with artifact spans included.
///
/// # Errors
///
/// Propagates any error returned by span extraction or by the text pipeline.
pub fn page_reading_order(doc: &PdfDocument, page_index: usize) -> Result<Vec<OrderedTextSpan>> {
    let include_artifacts = true;
    page_reading_order_inner(doc, page_index, include_artifacts)
}
/// Returns the text spans of page `page_index` in reading order, dropping
/// every span whose `artifact_type` is set.
///
/// Thin wrapper over `page_reading_order_inner` with artifact spans excluded.
///
/// # Errors
///
/// Propagates any error returned by span extraction or by the text pipeline.
pub fn page_reading_order_no_artifacts(
    doc: &PdfDocument,
    page_index: usize,
) -> Result<Vec<OrderedTextSpan>> {
    let include_artifacts = false;
    page_reading_order_inner(doc, page_index, include_artifacts)
}
/// Shared implementation behind the public reading-order entry points.
///
/// Steps:
/// 1. Extract the spans of `page_index`.
/// 2. When `include_artifacts` is `false`, discard spans whose
///    `artifact_type` is set.
/// 3. Return an empty vector right away if no spans remain.
/// 4. Build the ordering context; when it reports no structure tree, attach
///    article-bead rectangles if `page_article_bead_rects` yields any.
/// 5. Run a `TextPipeline` with the default configuration.
///
/// # Errors
///
/// Propagates errors from `extract_spans` and from `TextPipeline::process`.
fn page_reading_order_inner(
    doc: &PdfDocument,
    page_index: usize,
    include_artifacts: bool,
) -> Result<Vec<OrderedTextSpan>> {
    let mut page_spans = doc.extract_spans(page_index)?;
    if !include_artifacts {
        page_spans.retain(|span| span.artifact_type.is_none());
    }
    if page_spans.is_empty() {
        return Ok(Vec::new());
    }

    let base = build_context(doc, page_index);
    // Bead rectangles are only consulted when the context has no structure tree.
    let context = if base.has_structure_tree {
        base
    } else {
        match page_article_bead_rects(doc, page_index, &page_spans) {
            Some(bead_rects) => base.with_bead_rects(bead_rects),
            None => base,
        }
    };

    TextPipeline::with_config(TextPipelineConfig::default()).process(page_spans, context)
}
/// Returns the article-bead rectangles of `page_index`, in thread order, when
/// they look like a usable reading-order hint; otherwise `None`.
///
/// The rectangles are returned only if every check below passes:
///
/// * the document has at least one article thread;
/// * at least two beads lie on this page;
/// * at least one span has non-whitespace text, and at least 80% of such
///   spans have their bounding-box centre inside some bead rectangle
///   (edges inclusive);
/// * the beads' horizontal extents form at least two groups separated by a
///   gap (presumably distinct columns);
/// * the thread order differs from the plain geometric order (descending
///   `y`, then ascending `x`), so the beads add information beyond a
///   geometric sort.
fn page_article_bead_rects(
    doc: &PdfDocument,
    page_index: usize,
    spans: &[crate::layout::TextSpan],
) -> Option<Vec<Rect>> {
    let threads = crate::structure::parse_article_threads(doc);
    if threads.is_empty() {
        return None;
    }

    // Gather this page's bead rectangles, preserving thread order.
    let mut beads: Vec<Rect> = Vec::new();
    for thread in threads.iter() {
        beads.extend(
            thread
                .beads
                .iter()
                .filter(|bead| bead.page_index == page_index)
                .map(|bead| bead.rect),
        );
    }
    if beads.len() < 2 {
        return None;
    }

    // Count the non-blank spans and how many of them are centred inside a bead.
    let mut body_total = 0usize;
    let mut body_covered = 0usize;
    for span in spans {
        if span.text.trim().is_empty() {
            continue;
        }
        body_total += 1;
        let cx = span.bbox.x + span.bbox.width * 0.5;
        let cy = span.bbox.y + span.bbox.height * 0.5;
        let centred_in_bead = beads.iter().any(|r| {
            cx >= r.x && cx <= r.x + r.width && cy >= r.y && cy <= r.y + r.height
        });
        if centred_in_bead {
            body_covered += 1;
        }
    }
    if body_total == 0 {
        return None;
    }
    if (body_covered as f32) < 0.8 * body_total as f32 {
        return None;
    }

    // Sweep the horizontal extents left to right; a new band starts whenever
    // an extent begins strictly to the right of everything seen so far.
    let mut extents: Vec<(f32, f32)> = beads.iter().map(|r| (r.x, r.x + r.width)).collect();
    extents.sort_by(|a, b| crate::utils::safe_float_cmp(a.0, b.0));
    // `extents` has at least two entries here, so `split_first` always succeeds.
    let (first, rest) = extents.split_first()?;
    let (band_count, _) = rest
        .iter()
        .fold((1usize, first.1), |(count, edge), &(left, right)| {
            (count + usize::from(left > edge), edge.max(right))
        });
    if band_count < 2 {
        return None;
    }

    // Compare thread order with the geometric order (descending y, then
    // ascending x); identical orders mean the beads carry no extra information.
    let mut geometric: Vec<Rect> = beads.clone();
    geometric.sort_by(|a, b| {
        crate::utils::safe_float_cmp(b.y, a.y)
            .then_with(|| crate::utils::safe_float_cmp(a.x, b.x))
    });
    let differs_from_geometric = beads
        .iter()
        .zip(geometric.iter())
        .any(|(a, b)| a.x != b.x || a.y != b.y);

    differs_from_geometric.then_some(beads)
}
/// Builds the `ReadingOrderContext` for page `page_index`.
///
/// The context always carries the page number and the page's media box as its
/// bounding box. If the media box cannot be obtained, `(0, 0, 612, 792)` is
/// used instead (the dimensions of US Letter in points).
///
/// When `struct_tree_trustworthy` yields a tree, the context additionally
/// receives the page's MCID order (only if non-empty) and has its suspects
/// flag set to `false`. Without such a tree the basic context is returned
/// unchanged.
///
/// Note: `page_index` is narrowed to `u32` with an `as` cast.
pub(crate) fn build_context(doc: &PdfDocument, page_index: usize) -> ReadingOrderContext {
    let (x0, y0, x1, y1) = doc
        .get_page_media_box(page_index)
        .unwrap_or((0.0, 0.0, 612.0, 792.0));
    let page_bbox = Rect::from_points(x0, y0, x1, y1);
    let page_no = page_index as u32;

    let base = ReadingOrderContext::new()
        .with_page(page_no)
        .with_bbox(page_bbox);

    let Some(tree) = doc.struct_tree_trustworthy() else {
        return base;
    };

    // Keep only the MCID from each (scope, mcid) pair.
    let mcid_order: Vec<u32> = doc
        .cached_mcid_order_for_page(&tree, page_no)
        .into_iter()
        .map(|(_scope, mcid)| mcid)
        .collect();

    let with_order = if mcid_order.is_empty() {
        base
    } else {
        base.with_mcid_order(mcid_order)
    };
    with_order.with_suspects(false)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Resolves a regression fixture under
    /// `$HOME/projects/pdf_oxide_tests/pdfs_issue_regression`.
    ///
    /// Returns `None` when `HOME` is unset or the file does not exist (a skip
    /// notice is printed in the latter case), so tests can bail out quietly on
    /// machines without the fixture corpus.
    fn issue_211_fixture(name: &str) -> Option<PathBuf> {
        let home = std::env::var("HOME").ok()?;
        let path = PathBuf::from(home)
            .join("projects/pdf_oxide_tests/pdfs_issue_regression")
            .join(name);
        if !path.exists() {
            eprintln!("Skipping: {} not found", path.display());
            return None;
        }
        Some(path)
    }

    /// Loads the named fixture into a `PdfDocument`; `None` if the fixture is
    /// missing, unreadable, or fails to parse.
    fn open(name: &str) -> Option<PdfDocument> {
        let path = issue_211_fixture(name)?;
        let bytes = std::fs::read(&path).ok()?;
        PdfDocument::from_bytes(bytes).ok()
    }

    /// Page 0 of the structure fixture has content, so ordering it must yield
    /// at least one span.
    #[test]
    fn page_with_content_returns_spans() {
        let Some(doc) = open("issue_211_pdf_structure.pdf") else {
            return;
        };
        let result = page_reading_order(&doc, 0).expect("page 0 should resolve");
        assert!(!result.is_empty(), "page 0 of pdf_structure has spans");
    }

    /// The title span must come before the body span in the ordered output.
    #[test]
    fn tagged_pdf_uses_structure_tree_first() {
        let Some(doc) = open("issue_211_municipal_minutes.pdf") else {
            return;
        };
        let ordered = page_reading_order(&doc, 0).expect("ordering succeeds");
        let title_pos = ordered
            .iter()
            .position(|s| s.span.text.contains("COMITÉ"))
            .expect("title must appear");
        let body_pos = ordered
            .iter()
            .position(|s| s.span.text.contains("Séance"))
            .expect("body must appear");
        assert!(
            title_pos < body_pos,
            "title (COMITÉ at index {}) must precede body (Séance at index {}) \
             in canonical reading order",
            title_pos,
            body_pos,
        );
    }

    /// The title must show up within the first three ordered spans.
    #[test]
    fn untagged_pdf_falls_back_to_geometric() {
        let Some(doc) = open("issue_211_pdf_structure.pdf") else {
            return;
        };
        let ordered = page_reading_order(&doc, 0).expect("ordering succeeds");
        assert!(!ordered.is_empty());
        // `take(3)` already covers index 0, so no separate first-element check.
        assert!(
            ordered
                .iter()
                .take(3)
                .any(|s| s.span.text.contains("Titre")),
            "title 'Titre' must appear among the first few ordered spans; \
             got first 5: {:?}",
            ordered
                .iter()
                .take(5)
                .map(|s| &s.span.text)
                .collect::<Vec<_>>(),
        );
    }
}