use crate::document::PdfDocument;
use crate::error::Result;
use crate::geometry::Rect;
use crate::pipeline::{OrderedTextSpan, ReadingOrderContext, TextPipeline, TextPipelineConfig};
/// Returns the text spans of page `page_index` of `doc` in reading order.
///
/// Every extracted span is kept, including spans whose `artifact_type` is
/// set; use [`page_reading_order_no_artifacts`] to drop those.
///
/// # Errors
///
/// Propagates any error reported while extracting spans or running the text
/// pipeline.
pub fn page_reading_order(doc: &PdfDocument, page_index: usize) -> Result<Vec<OrderedTextSpan>> {
    let include_artifacts = true;
    page_reading_order_inner(doc, page_index, include_artifacts)
}
/// Returns the text spans of page `page_index` of `doc` in reading order,
/// excluding every span whose `artifact_type` is set.
///
/// # Errors
///
/// Propagates any error reported while extracting spans or running the text
/// pipeline.
pub fn page_reading_order_no_artifacts(
    doc: &PdfDocument,
    page_index: usize,
) -> Result<Vec<OrderedTextSpan>> {
    let include_artifacts = false;
    page_reading_order_inner(doc, page_index, include_artifacts)
}
/// Shared implementation behind the public reading-order entry points.
///
/// Extracts the spans of page `page_index`, drops those whose
/// `artifact_type` is set when `include_artifacts` is `false`, and feeds the
/// remainder to a [`TextPipeline`] built from the default
/// [`TextPipelineConfig`], together with the context from [`build_context`].
///
/// When no spans remain, an empty vector is returned without building a
/// context or a pipeline.
///
/// # Errors
///
/// Propagates errors from span extraction and from the pipeline itself.
fn page_reading_order_inner(
    doc: &PdfDocument,
    page_index: usize,
    include_artifacts: bool,
) -> Result<Vec<OrderedTextSpan>> {
    let spans = {
        let mut extracted = doc.extract_spans(page_index)?;
        if !include_artifacts {
            extracted.retain(|span| span.artifact_type.is_none());
        }
        extracted
    };

    // Nothing to order: skip context construction entirely.
    if spans.is_empty() {
        return Ok(Vec::new());
    }

    let context = build_context(doc, page_index);
    TextPipeline::with_config(TextPipelineConfig::default()).process(spans, context)
}
/// Builds the [`ReadingOrderContext`] for page `page_index` of `doc`.
///
/// The context always carries the page number and a bounding box derived
/// from the page's media box. If the media box cannot be obtained,
/// `(0.0, 0.0, 612.0, 792.0)` is used instead, which matches US Letter in
/// PDF points.
///
/// When `doc.struct_tree_trustworthy()` yields a structure tree, the tree is
/// traversed for this page and the MCIDs found are attached as the MCID
/// order (only if there is at least one), after which `with_suspects(false)`
/// is applied. A traversal that yields no result falls back to that type's
/// default value rather than failing. Without a trustworthy tree the basic
/// page/bbox context is returned unchanged.
pub(crate) fn build_context(doc: &PdfDocument, page_index: usize) -> ReadingOrderContext {
    // NOTE: `as` truncates silently should `page_index` ever exceed `u32::MAX`.
    let page = page_index as u32;

    let (x0, y0, x1, y1) = doc
        .get_page_media_box(page_index)
        .unwrap_or((0.0, 0.0, 612.0, 792.0));
    let base = ReadingOrderContext::new()
        .with_page(page)
        .with_bbox(Rect::from_points(x0, y0, x1, y1));

    let Some(tree) = doc.struct_tree_trustworthy() else {
        return base;
    };

    let traversal = crate::structure::traverse_structure_tree(&tree, page).unwrap_or_default();
    let mcid_order: Vec<u32> = traversal.iter().filter_map(|item| item.mcid).collect();

    let with_order = if mcid_order.is_empty() {
        base
    } else {
        base.with_mcid_order(mcid_order)
    };
    with_order.with_suspects(false)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Resolves the regression fixture `name` under
    /// `$HOME/projects/pdf_oxide_tests/pdfs_issue_regression`.
    ///
    /// Returns `None` when `HOME` is not set, or (after printing a skip
    /// notice to stderr) when the file does not exist, so callers can bail
    /// out instead of failing on machines without the fixtures.
    fn issue_211_fixture(name: &str) -> Option<PathBuf> {
        let home = std::env::var("HOME").ok()?;
        let path = PathBuf::from(home)
            .join("projects/pdf_oxide_tests/pdfs_issue_regression")
            .join(name);
        if !path.exists() {
            eprintln!("Skipping: {} not found", path.display());
            return None;
        }
        Some(path)
    }

    /// Loads the fixture `name` as a [`PdfDocument`], or `None` when the
    /// fixture is unavailable, unreadable, or fails to parse.
    fn open(name: &str) -> Option<PdfDocument> {
        let path = issue_211_fixture(name)?;
        let bytes = std::fs::read(&path).ok()?;
        PdfDocument::from_bytes(bytes).ok()
    }

    /// Smoke test: a page known to contain text yields at least one span.
    ///
    /// Previously named `empty_page_returns_empty_vec`, which contradicted
    /// the assertion below (it requires a NON-empty result).
    #[test]
    fn page_with_spans_returns_non_empty_vec() {
        let Some(doc) = open("issue_211_pdf_structure.pdf") else {
            return;
        };
        let result = page_reading_order(&doc, 0).expect("page 0 should resolve");
        assert!(!result.is_empty(), "page 0 of pdf_structure has spans");
    }

    /// The span containing "COMITÉ" must be ordered before the first span
    /// containing "Séance" on page 0 of the municipal-minutes fixture.
    #[test]
    fn tagged_pdf_uses_structure_tree_first() {
        let Some(doc) = open("issue_211_municipal_minutes.pdf") else {
            return;
        };
        let ordered = page_reading_order(&doc, 0).expect("ordering succeeds");
        let title_pos = ordered
            .iter()
            .position(|s| s.span.text.contains("COMITÉ"))
            .expect("title must appear");
        let body_pos = ordered
            .iter()
            .position(|s| s.span.text.contains("Séance"))
            .expect("body must appear");
        assert!(
            title_pos < body_pos,
            "title (COMITÉ at index {}) must precede body (Séance at index {}) \
             in canonical reading order",
            title_pos,
            body_pos,
        );
    }

    /// "Titre" must show up within the first three ordered spans of page 0
    /// of the pdf-structure fixture.
    #[test]
    fn untagged_pdf_falls_back_to_geometric() {
        let Some(doc) = open("issue_211_pdf_structure.pdf") else {
            return;
        };
        let ordered = page_reading_order(&doc, 0).expect("ordering succeeds");
        assert!(!ordered.is_empty());
        // `take(3)` already covers index 0, so no separate `ordered[0]` check
        // is needed.
        assert!(
            ordered
                .iter()
                .take(3)
                .any(|s| s.span.text.contains("Titre")),
            "title 'Titre' must appear among the first few ordered spans; \
             got first 5: {:?}",
            ordered
                .iter()
                .take(5)
                .map(|s| &s.span.text)
                .collect::<Vec<_>>(),
        );
    }
}