use std::borrow::Cow;
use std::collections::HashMap;
use std::io::Read;
use std::sync::Arc;
use flate2::read::ZlibDecoder;
#[cfg(feature = "parallel")]
use rayon::prelude::*;
use sha2::{Digest, Sha256};
use crate::engine::ExtractionEngine;
use crate::error::{DonglerError, Result};
use crate::ir::{
Asset, BBox, Block, Confidence, Document, FigureBlock, ImageObject, Line, Metadata, Page,
SourceAnchor, Span, TableBlock, TableCell, TextBlock, Warning, SCHEMA_VERSION,
};
use crate::source::Source;
/// PDF extraction engine; its `ExtractionEngine` implementation reports the
/// name `"pdf-native"` and delegates the actual work to `extract_pdf`.
#[derive(Debug, Default, Clone, Copy)]
pub struct PdfEngine;
impl ExtractionEngine for PdfEngine {
    /// Identifier of this engine; it is passed to `extract_pdf` as the engine name.
    fn name(&self) -> &'static str {
        "pdf-native"
    }

    /// Extracts a [`Document`] from `source`, using its raw bytes when they are
    /// present and the bytes of its textual content otherwise.
    fn extract(&self, source: &Source) -> Result<Document> {
        let engine_name = self.name();
        let raw = match source.bytes.as_deref() {
            Some(raw) => raw,
            None => source.content.as_bytes(),
        };
        extract_pdf(raw, source, engine_name)
    }
}
/// One indirect PDF object as held in the parser's object list.
#[derive(Debug, Clone)]
struct PdfObject {
/// Object number; used as the key of the object lookup map.
object_number: u32,
/// Generation number; formatted into `"<number> <generation> R"` object ids.
generation: u16,
/// Bytes of the object body.
body: Vec<u8>,
}
/// A page candidate found while scanning objects, before its content is interpreted.
#[derive(Debug, Clone)]
struct PageSeed {
/// 1-based page number, assigned from discovery order in `parse_pdf_pages`.
number: usize,
/// Text that page keys such as `/MediaBox`, `/Rotate` and `/Contents` are read from.
body: String,
}
/// Result of extracting a single page.
#[derive(Debug, Clone)]
struct PageExtraction {
/// The assembled IR page.
page: Page,
/// Plain text of the page: the non-empty block texts joined with `\n`.
text: String,
/// Geometry of every non-blank text run on the page.
spans: Vec<SpanGeom>,
}
/// Bounding box and text of a single extracted text span.
#[derive(Debug, Clone, PartialEq)]
pub struct SpanGeom {
/// Bounding box of the span.
pub bbox: BBox,
/// Text content of the span.
pub text: String,
}
/// Per-page span geometry, as returned by `extract_pdf_spans`.
#[derive(Debug, Clone, PartialEq)]
pub struct PageSpans {
/// Number of the page the spans belong to.
pub page_number: usize,
/// Page width; `extract_pdf_spans` stores `0.0` when the page has no width.
pub width: f32,
/// Page height; `extract_pdf_spans` stores `0.0` when the page has no height.
pub height: f32,
/// Spans found on the page.
pub spans: Vec<SpanGeom>,
}
/// A piece of text emitted by one text-showing operator, with the state it was shown in.
#[derive(Debug, Clone)]
struct TextRun {
/// Decoded text of the run.
text: String,
/// Bounding box of the run after applying the text matrix and the CTM.
bbox: BBox,
/// Y coordinate of the run's baseline after applying the text matrix and the CTM.
baseline_y: f32,
/// Resource name of the font that was active, if any.
font: Option<String>,
/// Font size that was active when the run was shown.
size: f32,
/// Advance width of a space character in the active font and size.
space_width: f32,
/// Bold flag copied from the font decoder (`false` when the font is unknown).
bold: bool,
/// Italic flag copied from the font decoder (`false` when the font is unknown).
italic: bool,
/// Ids of the content-stream objects the run was read from.
source_object_ids: Vec<String>,
}
/// A group of text runs treated as one visual line.
#[derive(Debug, Clone)]
struct TextLine {
/// Runs belonging to the line.
runs: Vec<TextRun>,
/// Bounding box of the line (`text_line_from_runs` uses the union of the run boxes).
bbox: BBox,
/// Baseline of the line (`text_line_from_runs` uses the mean of the run baselines).
baseline_y: f32,
}
/// A table found on a page together with the text lines it was built from.
#[derive(Debug, Clone)]
struct DetectedTable {
/// The table block to emit.
table: TableBlock,
/// Indices of the consumed lines; these lines are excluded from ordinary text blocks.
line_indices: Vec<usize>,
}
/// A candidate table row.
// NOTE(review): this type is not used in the visible part of the file; the
// field descriptions below are inferred from names and types only.
#[derive(Debug, Clone)]
struct TableRowCandidate {
/// Presumably the index of the text line the row was derived from.
line_index: usize,
/// Presumably the runs treated as the cells of the row.
cells: Vec<TextRun>,
}
/// A straight path segment; `graphic_edge_from_points` stores both endpoints
/// after mapping them through a transformation matrix.
#[derive(Debug, Clone, Copy)]
struct GraphicEdge {
/// X coordinate of the start point.
x0: f32,
/// Y coordinate of the start point.
y0: f32,
/// X coordinate of the end point.
x1: f32,
/// Y coordinate of the end point.
y1: f32,
}
/// Kind of raised or lowered script.
// NOTE(review): not used in the visible part of the file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ScriptKind {
/// Superscript text.
Superscript,
/// Subscript text.
Subscript,
}
/// Lines of a page partitioned for column-aware reading order; consumed by
/// `order_column_layout`.
#[derive(Debug, Clone)]
struct ColumnLayout<'a> {
/// Lines emitted before any column.
leading: Vec<&'a TextLine>,
/// The columns; each one is emitted in full before the next.
columns: Vec<Vec<&'a TextLine>>,
/// Lines emitted after all columns.
trailing: Vec<&'a TextLine>,
}
/// Everything collected while interpreting content streams.
#[derive(Debug, Clone)]
struct ContentExtraction {
/// Non-blank text runs in the order they were shown.
text_runs: Vec<TextRun>,
/// Path segments recorded when a stroke operator was seen.
edges: Vec<GraphicEdge>,
/// Image XObjects painted with `Do`.
images: Vec<ImageObject>,
/// Asset records created alongside `images`.
assets: Vec<Asset>,
/// Warnings raised during interpretation.
warnings: Vec<Warning>,
}
/// Decoding tables and metrics for one font resource.
#[derive(Debug, Clone, Default)]
struct FontDecoder {
/// Maps character-code byte sequences to decoded text.
cmap: HashMap<Vec<u8>, String>,
/// Maps single-byte codes to decoded text; consulted by `decode_byte`.
encoding: HashMap<u8, String>,
/// Glyph widths per character, in thousandths of the font size.
widths: HashMap<char, f32>,
/// Presumably the longest code length in `cmap`, in bytes — TODO confirm where it is set.
max_code_len: usize,
/// Whether the font is treated as bold.
bold: bool,
/// Whether the font is treated as italic.
italic: bool,
/// Ascent as a multiple of the font size (used for the top of run boxes).
ascent: f32,
/// Descent as a multiple of the font size (used for the bottom of run boxes).
descent: f32,
}
impl FontDecoder {
    /// Decodes a single-byte character code.
    ///
    /// Returns the entry from `encoding` when one exists; otherwise the byte is
    /// interpreted directly as the Unicode scalar value with the same number.
    fn decode_byte(&self, byte: u8) -> String {
        match self.encoding.get(&byte) {
            Some(mapped) => mapped.clone(),
            None => char::from(byte).to_string(),
        }
    }
}
/// A parsed content-stream operand.
#[derive(Debug, Clone)]
enum Operand {
/// Numeric operand.
Number(f32),
/// Name operand (for example a font or XObject resource name).
Name(String),
/// Bytes of a literal string operand.
Literal(Vec<u8>),
/// Bytes of a hexadecimal string operand.
Hex(Vec<u8>),
/// Array operand, such as the argument of `TJ`.
Array(Vec<Operand>),
/// Any operand that none of the other variants represents.
Other,
}
/// One content-stream operation: an operator and the operands that preceded it.
#[derive(Debug, Clone)]
struct ContentOp {
/// Operands of the operation, in stream order.
operands: Vec<Operand>,
/// Operator keyword, for example `"Tj"` or `"cm"`.
operator: String,
}
/// Graphics and text state tracked while interpreting a content stream.
/// The whole struct is saved by `q` and restored by `Q`.
#[derive(Debug, Clone)]
struct GraphicsState {
/// Current transformation matrix, updated by `cm`.
ctm: Matrix,
/// Text matrix; advanced after each shown run.
text_matrix: Matrix,
/// Text line matrix; marks the start of the current line.
line_matrix: Matrix,
/// Resource name of the font selected by `Tf`.
font_name: Option<String>,
/// Font size selected by `Tf`.
font_size: f32,
/// Leading used when moving to the next line.
leading: f32,
/// Character spacing set by `Tc`.
char_spacing: f32,
/// Word spacing set by `Tw`.
word_spacing: f32,
/// Horizontal scaling as a fraction (`Tz` operand divided by 100).
horizontal_scaling: f32,
/// Text rise set by `Ts`.
text_rise: f32,
}
impl Default for GraphicsState {
    /// Initial state: identity matrices, no font, size and leading of 12,
    /// no extra spacing, unscaled text and no rise.
    fn default() -> Self {
        let identity = Matrix::identity();
        let initial_size = 12.0;
        Self {
            ctm: identity,
            text_matrix: identity,
            line_matrix: identity,
            font_name: None,
            font_size: initial_size,
            leading: initial_size,
            char_spacing: 0.0,
            word_spacing: 0.0,
            horizontal_scaling: 1.0,
            text_rise: 0.0,
        }
    }
}
/// 2-D affine transform stored as six coefficients.
/// `point` maps `(x, y)` to `(a*x + c*y + e, b*x + d*y + f)`.
#[derive(Debug, Clone, Copy)]
struct Matrix {
/// Contribution of input x to output x.
a: f32,
/// Contribution of input x to output y.
b: f32,
/// Contribution of input y to output x.
c: f32,
/// Contribution of input y to output y.
d: f32,
/// Translation added to output x.
e: f32,
/// Translation added to output y.
f: f32,
}
impl Matrix {
    /// The transform that leaves every point unchanged.
    fn identity() -> Self {
        Self {
            a: 1.0,
            b: 0.0,
            c: 0.0,
            d: 1.0,
            e: 0.0,
            f: 0.0,
        }
    }

    /// Returns the product `self x other`: the transform that applies `self`
    /// first and `other` second when used with [`Matrix::point`].
    fn multiply(self, other: Self) -> Self {
        let Self { a, b, c, d, e, f } = self;
        Self {
            a: a * other.a + b * other.c,
            b: a * other.b + b * other.d,
            c: c * other.a + d * other.c,
            d: c * other.b + d * other.d,
            e: e * other.a + f * other.c + other.e,
            f: e * other.b + f * other.d + other.f,
        }
    }

    /// Maps the point `(x, y)` through this transform.
    fn point(self, x: f32, y: f32) -> (f32, f32) {
        let mapped_x = self.a * x + self.c * y + self.e;
        let mapped_y = self.b * x + self.d * y + self.f;
        (mapped_x, mapped_y)
    }

    /// Returns this transform preceded by a translation of `(x, y)`; only the
    /// translation coefficients change.
    fn translate(self, x: f32, y: f32) -> Self {
        let shifted_e = self.e + self.a * x + self.c * y;
        let shifted_f = self.f + self.b * x + self.d * y;
        Self {
            e: shifted_e,
            f: shifted_f,
            ..self
        }
    }

    /// Box anchored at the translation `(e, f)` with extents `|a|` by `|d|`;
    /// the `b` and `c` coefficients are ignored.
    fn bbox(self) -> BBox {
        let (x, y) = (self.e, self.f);
        let (width, height) = (self.a.abs(), self.d.abs());
        BBox {
            x,
            y,
            width,
            height,
        }
    }
}
/// Output of `parse_pdf_pages`: per-page results plus document-level facts.
struct ParsedPdf {
/// One extraction per discovered page, in page-number order.
page_extractions: Vec<PageExtraction>,
/// Warnings that apply to the document as a whole.
document_warnings: Vec<crate::ir::Warning>,
/// Value of the `Title` info string, when one was found.
title: Option<String>,
/// Whether the file contains the `/Encrypt` name.
encrypted: bool,
}
pub fn extract_pdf(bytes: &[u8], source: &Source, engine_name: &str) -> Result<Document> {
let parsed = parse_pdf_pages(bytes)?;
let ParsedPdf {
page_extractions,
document_warnings,
title,
encrypted,
} = parsed;
let mut pages = Vec::with_capacity(page_extractions.len());
let mut all_text = String::new();
let mut assets = Vec::new();
for extraction in page_extractions {
all_text.push_str(&extraction.text);
all_text.push('\n');
assets.extend(extraction.page.assets.clone());
pages.push(extraction.page);
}
Ok(Document {
schema_version: SCHEMA_VERSION.to_owned(),
metadata: Metadata {
format: "pdf".to_owned(),
engine: engine_name.to_owned(),
source: source.path.clone(),
title,
character_count: all_text.chars().count(),
word_count: all_text.split_whitespace().count(),
block_count: pages.iter().map(|page| page.blocks.len()).sum(),
file_size_bytes: Some(bytes.len() as u64),
pdf_version: pdf_version(bytes),
encrypted,
},
pages,
assets,
warnings: document_warnings,
})
}
pub fn extract_pdf_spans(bytes: &[u8]) -> Result<Vec<PageSpans>> {
let parsed = parse_pdf_pages(bytes)?;
Ok(parsed
.page_extractions
.into_iter()
.map(|e| PageSpans {
page_number: e.page.number,
width: e.page.width.unwrap_or(0.0),
height: e.page.height.unwrap_or(0.0),
spans: e.spans,
})
.collect())
}
/// Scans `bytes` for indirect objects, discovers the pages among them and
/// extracts every page.
///
/// Pages are numbered from 1 in the order their objects appear in the parsed
/// object list. Font decoders are built once per referenced font object and
/// shared across pages. With the `parallel` feature, font decoding and page
/// extraction run on rayon parallel iterators.
///
/// # Errors
///
/// Returns a PDF error when the `%PDF-` header is missing, when no indirect
/// objects are found, or when no page objects are found.
fn parse_pdf_pages(bytes: &[u8]) -> Result<ParsedPdf> {
if !bytes.starts_with(b"%PDF-") {
return Err(DonglerError::pdf("missing %PDF header"));
}
let mut objects = parse_indirect_objects(bytes);
expand_object_streams(&mut objects);
if objects.is_empty() {
return Err(DonglerError::pdf("no indirect objects found"));
}
let title = extract_info_string(&objects, "Title");
// Objects are shared through `Arc` so the lookup map can hold them without copying bodies.
let objects: Vec<Arc<PdfObject>> = objects.into_iter().map(Arc::new).collect();
let object_map: HashMap<u32, Arc<PdfObject>> = objects
.iter()
.map(|object| (object.object_number, Arc::clone(object)))
.collect();
// Page numbers are assigned here, after filtering, so they are contiguous from 1.
let page_seeds = objects
.iter()
.filter_map(|object| page_seed(object.as_ref(), &object_map))
.enumerate()
.map(|(index, mut seed)| {
seed.number = index + 1;
seed
})
.collect::<Vec<_>>();
if page_seeds.is_empty() {
return Err(DonglerError::pdf("no page objects found"));
}
let mut document_warnings = Vec::new();
let encrypted = contains_name(bytes, b"/Encrypt");
if encrypted {
document_warnings.push(warning(
"pdf.encrypted",
"warning",
"document declares encryption; extraction may be incomplete",
None,
));
}
if contains_name(bytes, b"/ObjStm") {
document_warnings.push(warning(
"pdf.object_stream",
"info",
"object streams detected and expanded by the native scanner",
None,
));
}
// Collect the object numbers of every font referenced by any page's resources.
let mut font_object_numbers: Vec<u32> = page_seeds
.iter()
.flat_map(|seed| {
let resource_body = resolve_resource_body(&seed.body, &object_map);
let resource_text = resource_body.as_deref().unwrap_or(&seed.body);
resolve_named_resource_refs(resource_text, "/Font", &object_map)
.into_values()
.collect::<Vec<_>>()
})
.collect();
// Sort and dedup so each font object is decoded exactly once.
font_object_numbers.sort_unstable();
font_object_numbers.dedup();
// Font numbers with no matching object are skipped.
let decode_font = |number: u32| {
object_map
.get(&number)
.map(|font| (number, Arc::new(font_decoder(font.as_ref(), &object_map))))
};
#[cfg(feature = "parallel")]
let font_cache: HashMap<u32, Arc<FontDecoder>> = font_object_numbers
.into_par_iter()
.filter_map(decode_font)
.collect();
#[cfg(not(feature = "parallel"))]
let font_cache: HashMap<u32, Arc<FontDecoder>> = font_object_numbers
.into_iter()
.filter_map(decode_font)
.collect();
let extract_one = |seed: &PageSeed| extract_page(seed, &object_map, &font_cache);
#[cfg(feature = "parallel")]
let page_extractions = page_seeds.par_iter().map(extract_one).collect::<Vec<_>>();
#[cfg(not(feature = "parallel"))]
let page_extractions = page_seeds.iter().map(extract_one).collect::<Vec<_>>();
Ok(ParsedPdf {
page_extractions,
document_warnings,
title,
encrypted,
})
}
/// Extracts one page: interprets its content streams, applies page rotation
/// to the collected geometry, groups text into lines and builds the blocks.
///
/// * `seed` – the page number and the text its page keys are read from.
/// * `object_map` – lookup of all parsed objects by object number.
/// * `font_cache` – font decoders shared across pages.
///
/// Missing or undecodable content streams are reported as page warnings
/// rather than errors, so this function always returns a page.
fn extract_page(
seed: &PageSeed,
object_map: &HashMap<u32, Arc<PdfObject>>,
font_cache: &HashMap<u32, Arc<FontDecoder>>,
) -> PageExtraction {
// Fall back to a 612 x 792 page when `/MediaBox` is absent.
let media_box = parse_number_array_after(&seed.body, "/MediaBox")
.unwrap_or_else(|| vec![0.0, 0.0, 612.0, 792.0]);
let width =
media_box.get(2).copied().unwrap_or(612.0) - media_box.first().copied().unwrap_or(0.0);
let height =
media_box.get(3).copied().unwrap_or(792.0) - media_box.get(1).copied().unwrap_or(0.0);
let rotation = parse_number_after(&seed.body, "/Rotate").map(|value| value as i32);
let contents = parse_refs_after_key(&seed.body, "/Contents");
// Resources may live in a separate object; otherwise they are read from the page text itself.
let resource_body = resolve_resource_body(&seed.body, object_map);
let resource_text = resource_body.as_deref().unwrap_or(&seed.body);
let xobjects = resolve_named_resource_refs(resource_text, "/XObject", object_map);
let fonts = load_font_decoders(resource_text, object_map, font_cache);
let mut warnings = Vec::new();
let mut extraction = ContentExtraction {
text_runs: Vec::new(),
edges: Vec::new(),
images: Vec::new(),
assets: Vec::new(),
warnings: Vec::new(),
};
// Each content stream is interpreted separately and its results are appended in order.
for content_ref in contents {
match object_map
.get(&(content_ref as u32))
.map(|object| decode_stream_object(object.as_ref()))
{
Some(Ok(Some(stream))) => {
// NOTE(review): the object id is formatted with generation 0 regardless of the
// referenced object's actual generation.
let object_id = format!("{content_ref} 0 R");
let mut content = interpret_content_stream(
&stream,
seed.number,
&[object_id],
&xobjects,
&fonts,
object_map,
);
extraction.text_runs.append(&mut content.text_runs);
extraction.edges.append(&mut content.edges);
extraction.images.append(&mut content.images);
extraction.assets.append(&mut content.assets);
extraction.warnings.append(&mut content.warnings);
}
Some(Ok(None)) | None => warnings.push(warning(
"pdf.missing_content",
"warning",
"page content stream is missing",
Some(seed.number),
)),
Some(Err(error)) => warnings.push(warning(
"pdf.stream_decode",
"warning",
&error.to_string(),
Some(seed.number),
)),
}
}
// Page-level warnings come first, followed by warnings from the content streams.
warnings.append(&mut extraction.warnings);
// Normalise the rotation into 0..360; a missing `/Rotate` counts as 0.
let normalized_rotation = rotation.map(|value| value.rem_euclid(360)).unwrap_or(0);
if normalized_rotation != 0 {
// NOTE(review): only `bbox` is rotated here; `baseline_y` of each run is left as-is.
for run in &mut extraction.text_runs {
run.bbox = rotate_bbox(run.bbox, normalized_rotation, width, height);
}
for image in &mut extraction.images {
if let Some(bbox) = image.bbox {
image.bbox = Some(rotate_bbox(bbox, normalized_rotation, width, height));
}
}
for edge in &mut extraction.edges {
let (x0, y0) = rotate_point(edge.x0, edge.y0, normalized_rotation, width, height);
let (x1, y1) = rotate_point(edge.x1, edge.y1, normalized_rotation, width, height);
edge.x0 = x0;
edge.y0 = y0;
edge.x1 = x1;
edge.y1 = y1;
}
}
// Quarter turns swap the reported page dimensions.
let (page_width, page_height) = if matches!(normalized_rotation, 90 | 270) {
(height, width)
} else {
(width, height)
};
// The media box origin is kept only for unrotated pages; rotated pages report (0, 0).
let (page_x, page_y) = if normalized_rotation == 0 {
(
media_box.first().copied().unwrap_or(0.0),
media_box.get(1).copied().unwrap_or(0.0),
)
} else {
(0.0, 0.0)
};
let lines = group_text_runs(extraction.text_runs);
// Span geometry covers every non-blank run, in line order.
let spans: Vec<SpanGeom> = lines
.iter()
.flat_map(|line| line.runs.iter())
.filter(|run| !run.text.trim().is_empty())
.map(|run| SpanGeom {
bbox: run.bbox,
text: run.text.clone(),
})
.collect();
let mut blocks = build_blocks(seed.number, &lines, &extraction.edges);
// Image-only pages get one figure block per image so they are not empty.
if blocks.is_empty() && !extraction.images.is_empty() {
blocks.extend(image_figure_blocks(seed.number, &extraction.images));
}
let text = blocks
.iter()
.map(block_text)
.filter(|text| !text.is_empty())
.collect::<Vec<_>>()
.join("\n");
let page = Page {
number: seed.number,
width: Some(page_width),
height: Some(page_height),
rotation,
bbox: Some(BBox {
x: page_x,
y: page_y,
width: page_width,
height: page_height,
}),
blocks,
images: extraction.images,
assets: extraction.assets,
warnings, ..Default::default()
};
PageExtraction { page, text, spans }
}
/// Interprets one decoded content stream and collects its text runs, stroked
/// path edges, images and assets.
///
/// * `bytes` – decoded content-stream data.
/// * `page_number` – used to build image ids of the form `image-<page>-<name>`.
/// * `source_object_ids` – object ids recorded on every emitted text run.
/// * `xobjects` – XObject resource names mapped to object numbers.
/// * `fonts` – font resource names mapped to their decoders.
/// * `object_map` – lookup of all parsed objects by object number.
///
/// Operators that are not listed in the `match` are ignored. Operations whose
/// operands do not have the expected shape are skipped without a warning.
fn interpret_content_stream(
    bytes: &[u8],
    page_number: usize,
    source_object_ids: &[String],
    xobjects: &HashMap<String, u32>,
    fonts: &HashMap<String, Arc<FontDecoder>>,
    object_map: &HashMap<u32, Arc<PdfObject>>,
) -> ContentExtraction {
    let mut state = GraphicsState::default();
    let mut graphics_stack = Vec::new();
    let mut current_path_point: Option<(f32, f32)> = None;
    // Edges of the path under construction; they only become real edges on a stroke.
    let mut pending_edges = Vec::new();
    let mut extraction = ContentExtraction {
        text_runs: Vec::new(),
        edges: Vec::new(),
        images: Vec::new(),
        assets: Vec::new(),
        warnings: Vec::new(),
    };
    for op in parse_content_ops(bytes) {
        match op.operator.as_str() {
            // Save / restore the whole graphics state; an unbalanced `Q` is ignored.
            "q" => graphics_stack.push(state.clone()),
            "Q" => {
                if let Some(previous) = graphics_stack.pop() {
                    state = previous;
                }
            }
            "cm" => {
                if let Some(values) = numbers(&op.operands, 6) {
                    let transform = Matrix {
                        a: values[0],
                        b: values[1],
                        c: values[2],
                        d: values[3],
                        e: values[4],
                        f: values[5],
                    };
                    // `cm` concatenates in front of the CTM (CTM' = M x CTM): the new
                    // matrix acts on user-space coordinates first and the previous CTM
                    // second. `multiply` is `self x other` in the row-vector convention
                    // used by `Matrix::point`, so the operand matrix is the receiver.
                    state.ctm = transform.multiply(state.ctm);
                }
            }
            "BT" => {
                state.text_matrix = Matrix::identity();
                state.line_matrix = Matrix::identity();
            }
            "Tf" => {
                if let [Operand::Name(name), Operand::Number(size)] = op.operands.as_slice() {
                    state.font_name = Some(name.clone());
                    state.font_size = *size;
                    // Selecting a font also resets the leading to 1.2 x the font size.
                    state.leading = *size * 1.2;
                }
            }
            "Tc" => {
                if let Some(values) = numbers(&op.operands, 1) {
                    state.char_spacing = values[0];
                }
            }
            "Tw" => {
                if let Some(values) = numbers(&op.operands, 1) {
                    state.word_spacing = values[0];
                }
            }
            "Tz" => {
                if let Some(values) = numbers(&op.operands, 1) {
                    // The operand is a percentage; the scale is floored at 1%.
                    state.horizontal_scaling = (values[0] / 100.0).max(0.01);
                }
            }
            "TL" => {
                if let Some(values) = numbers(&op.operands, 1) {
                    state.leading = values[0];
                }
            }
            "Ts" => {
                if let Some(values) = numbers(&op.operands, 1) {
                    state.text_rise = values[0];
                }
            }
            "Td" | "TD" => {
                if let Some(values) = numbers(&op.operands, 2) {
                    let next_line = state.line_matrix.translate(values[0], values[1]);
                    state.line_matrix = next_line;
                    state.text_matrix = next_line;
                    // `TD` additionally sets the leading to the negated vertical offset.
                    if op.operator == "TD" {
                        state.leading = -values[1];
                    }
                }
            }
            "Tm" => {
                if let Some(values) = numbers(&op.operands, 6) {
                    let matrix = Matrix {
                        a: values[0],
                        b: values[1],
                        c: values[2],
                        d: values[3],
                        e: values[4],
                        f: values[5],
                    };
                    state.line_matrix = matrix;
                    state.text_matrix = matrix;
                }
            }
            "T*" => {
                move_to_next_text_line(&mut state);
            }
            "Tj" => {
                if let Some(text) = first_text_operand(&op.operands, &state, fonts) {
                    push_text_run(&mut extraction, &mut state, source_object_ids, text, fonts);
                }
            }
            "TJ" => {
                if let Some(Operand::Array(items)) = op.operands.first() {
                    let text = text_from_array(items, &state, fonts);
                    push_text_run(&mut extraction, &mut state, source_object_ids, text, fonts);
                }
            }
            // `'`: move to the next line, then show the string.
            "'" => {
                move_to_next_text_line(&mut state);
                if let Some(text) = first_text_operand(&op.operands, &state, fonts) {
                    push_text_run(&mut extraction, &mut state, source_object_ids, text, fonts);
                }
            }
            // `"`: set word and character spacing, move to the next line, show the string.
            "\"" => {
                if let [Operand::Number(word_spacing), Operand::Number(char_spacing), ..] =
                    op.operands.as_slice()
                {
                    state.word_spacing = *word_spacing;
                    state.char_spacing = *char_spacing;
                }
                move_to_next_text_line(&mut state);
                if let Some(text) = op
                    .operands
                    .last()
                    .and_then(|operand| operand_text(operand, &state, fonts))
                {
                    push_text_run(&mut extraction, &mut state, source_object_ids, text, fonts);
                }
            }
            "Do" => {
                if let Some(Operand::Name(name)) = op.operands.first() {
                    if let Some(object_number) = xobjects.get(name) {
                        if let Some(object) = object_map.get(object_number) {
                            let object_body = lossy(&object.body);
                            // Only image XObjects are recorded; other XObjects are skipped.
                            if object_body.contains("/Subtype /Image") {
                                let bbox = state.ctm.bbox();
                                let id = format!("image-{}-{name}", page_number);
                                let object_id = Some(format!(
                                    "{} {} R",
                                    object.object_number, object.generation
                                ));
                                let width = parse_number_after(&object_body, "/Width")
                                    .map(|value| value as u32);
                                let height = parse_number_after(&object_body, "/Height")
                                    .map(|value| value as u32);
                                extraction.images.push(ImageObject {
                                    id: id.clone(),
                                    object_id: object_id.clone(),
                                    bbox: Some(bbox),
                                    width,
                                    height,
                                });
                                extraction.assets.push(Asset {
                                    id,
                                    kind: "image".to_owned(),
                                    object_id,
                                    bbox: Some(bbox),
                                    width,
                                    height,
                                });
                            }
                        }
                    }
                }
            }
            "m" => {
                if let Some(values) = numbers(&op.operands, 2) {
                    current_path_point = Some((values[0], values[1]));
                }
            }
            "l" => {
                if let (Some(start), Some(values)) = (current_path_point, numbers(&op.operands, 2))
                {
                    let end = (values[0], values[1]);
                    pending_edges.push(graphic_edge_from_points(state.ctm, start, end));
                    current_path_point = Some(end);
                }
            }
            "re" => {
                if let Some(values) = numbers(&op.operands, 4) {
                    pending_edges.extend(graphic_edges_from_rect(
                        state.ctm, values[0], values[1], values[2], values[3],
                    ));
                    current_path_point = Some((values[0], values[1]));
                }
            }
            // Stroke: the pending path becomes real edges.
            "S" | "s" => {
                extraction.edges.append(&mut pending_edges);
                current_path_point = None;
            }
            // End the path without painting: drop the pending edges.
            "n" => {
                pending_edges.clear();
                current_path_point = None;
            }
            // NOTE(review): the fill and fill-and-stroke operators (`f`, `F`, `f*`, `B`,
            // `B*`, `b`, `b*`) fall through here, so a filled path's edges stay pending
            // and are emitted by the next stroke. Confirm whether that is intended.
            _ => {}
        }
    }
    extraction
}
/// Builds an edge whose endpoints are `start` and `end` mapped through `matrix`.
fn graphic_edge_from_points(matrix: Matrix, start: (f32, f32), end: (f32, f32)) -> GraphicEdge {
    let (start_x, start_y) = start;
    let (end_x, end_y) = end;
    let (x0, y0) = matrix.point(start_x, start_y);
    let (x1, y1) = matrix.point(end_x, end_y);
    GraphicEdge { x0, y0, x1, y1 }
}
/// Returns the four sides of the rectangle `(x, y, width, height)`, each mapped
/// through `matrix`, walking the corners from `(x, y)` along the first side to
/// `(x + width, y)` and onwards back to the start.
fn graphic_edges_from_rect(
    matrix: Matrix,
    x: f32,
    y: f32,
    width: f32,
    height: f32,
) -> Vec<GraphicEdge> {
    let right = x + width;
    let top = y + height;
    let corners = [(x, y), (right, y), (right, top), (x, top)];
    (0..corners.len())
        .map(|index| {
            let next = (index + 1) % corners.len();
            graphic_edge_from_points(matrix, corners[index], corners[next])
        })
        .collect()
}
/// Moves the line start down by the current leading and resets the text
/// matrix to that new line start.
fn move_to_next_text_line(state: &mut GraphicsState) {
    let line_start = state.line_matrix.translate(0.0, -state.leading);
    state.text_matrix = line_start;
    state.line_matrix = line_start;
}
/// Records `text` as a run at the current text position and advances the text
/// matrix by the run's width.
///
/// Whitespace-only text still advances the text matrix but produces no run.
/// When the active font is unknown, the run is neither bold nor italic and an
/// ascent of 0.75 and a descent of -0.25 are assumed.
fn push_text_run(
    extraction: &mut ContentExtraction,
    state: &mut GraphicsState,
    source_object_ids: &[String],
    text: String,
    fonts: &HashMap<String, Arc<FontDecoder>>,
) {
    let advance = text_advance_width(&text, state, fonts);
    if !text.trim().is_empty() {
        let decoder = state.font_name.as_ref().and_then(|name| fonts.get(name));
        let (bold, italic, ascent, descent) = match decoder {
            Some(font) => (font.bold, font.italic, font.ascent, font.descent),
            None => (false, false, 0.75, -0.25),
        };
        let bbox = text_run_bbox(state, advance, ascent, descent);
        // Baseline: the text-space origin lifted by the rise, mapped through both matrices.
        let (origin_x, origin_y) = state.text_matrix.point(0.0, state.text_rise);
        let baseline_y = state.ctm.point(origin_x, origin_y).1;
        let space_width = space_advance_width(state, fonts);
        extraction.text_runs.push(TextRun {
            text,
            bbox,
            baseline_y,
            font: state.font_name.clone(),
            size: state.font_size,
            space_width,
            bold,
            italic,
            source_object_ids: source_object_ids.to_vec(),
        });
    }
    state.text_matrix = state.text_matrix.translate(advance, 0.0);
}
/// Estimates the horizontal advance of `text` in text space.
///
/// Each character contributes its font width (or `default_glyph_width` when
/// the font has no entry) scaled from thousandths to the font size. Character
/// spacing is added per character and word spacing per space character; the
/// sum is multiplied by the horizontal scaling and clamped to be non-negative.
fn text_advance_width(
    text: &str,
    state: &GraphicsState,
    fonts: &HashMap<String, Arc<FontDecoder>>,
) -> f32 {
    if text.is_empty() {
        return 0.0;
    }
    let decoder = state.font_name.as_ref().and_then(|name| fonts.get(name));
    let glyph_units = |glyph: char| match decoder.and_then(|font| font.widths.get(&glyph)) {
        Some(units) => *units,
        None => default_glyph_width(glyph),
    };

    let mut glyph_count = 0usize;
    let mut space_count = 0usize;
    for glyph in text.chars() {
        glyph_count += 1;
        if glyph == ' ' {
            space_count += 1;
        }
    }

    let glyph_total = text
        .chars()
        .map(|glyph| glyph_units(glyph) / 1000.0 * state.font_size)
        .sum::<f32>();
    let spacing_total =
        glyph_count as f32 * state.char_spacing + space_count as f32 * state.word_spacing;
    ((glyph_total + spacing_total) * state.horizontal_scaling).max(0.0)
}
/// Fallback glyph width, in thousandths of the font size, for characters the
/// font supplies no width for.
///
/// The classes are checked in order, so a character listed in an earlier
/// class (such as `I`, `M` or `W`) never reaches the generic capital class.
fn default_glyph_width(character: char) -> f32 {
    const NARROW: &str = " !,./:;I[\\]ijl|'";
    const SLIM: &str = "\"()*`-frt{}";
    const WIDE: &str = "mMWw@";
    const CAPITAL_LIKE: &str = "$+<=>?_~";

    if NARROW.contains(character) {
        250.0
    } else if SLIM.contains(character) {
        333.0
    } else if WIDE.contains(character) {
        850.0
    } else if character.is_ascii_digit() {
        556.0
    } else if character.is_ascii_uppercase() || CAPITAL_LIKE.contains(character) {
        650.0
    } else {
        500.0
    }
}
/// Advance width of a space in the active font and size, after horizontal
/// scaling. Falls back to `default_glyph_width(' ')` when the font is
/// unknown or reports a non-positive space width.
fn space_advance_width(state: &GraphicsState, fonts: &HashMap<String, Arc<FontDecoder>>) -> f32 {
    let font_units = state
        .font_name
        .as_ref()
        .and_then(|name| fonts.get(name))
        .and_then(|font| font.widths.get(&' '))
        .copied()
        .filter(|units| *units > 0.0);
    let units = match font_units {
        Some(units) => units,
        None => default_glyph_width(' '),
    };
    let unscaled = units / 1000.0 * state.font_size;
    (unscaled * state.horizontal_scaling).max(0.0)
}
/// Computes the axis-aligned bounding box of a run `advance` wide.
///
/// The four corners of the run's text-space rectangle (from `descent` to
/// `ascent`, both scaled by the font size and lifted by the text rise) are
/// mapped through the text matrix and then the CTM. Width and height are each
/// at least a quarter of the font size.
fn text_run_bbox(state: &GraphicsState, advance: f32, ascent: f32, descent: f32) -> BBox {
    let bottom = state.text_rise + descent * state.font_size;
    let top = state.text_rise + ascent * state.font_size;

    let mut min_x = f32::INFINITY;
    let mut min_y = f32::INFINITY;
    let mut max_x = f32::NEG_INFINITY;
    let mut max_y = f32::NEG_INFINITY;
    for (local_x, local_y) in [(0.0, bottom), (advance, bottom), (0.0, top), (advance, top)] {
        let (text_x, text_y) = state.text_matrix.point(local_x, local_y);
        let (page_x, page_y) = state.ctm.point(text_x, text_y);
        min_x = min_x.min(page_x);
        min_y = min_y.min(page_y);
        max_x = max_x.max(page_x);
        max_y = max_y.max(page_y);
    }

    let min_extent = state.font_size * 0.25;
    BBox {
        x: min_x,
        y: min_y,
        width: (max_x - min_x).max(min_extent),
        height: (max_y - min_y).max(min_extent),
    }
}
/// Turns the lines of a page into blocks.
///
/// When tables are detected the work is handed to `build_blocks_with_tables`;
/// otherwise the lines are split at wide gaps, put into reading order,
/// converted to text blocks and merged where they wrap.
fn build_blocks(page_number: usize, lines: &[TextLine], edges: &[GraphicEdge]) -> Vec<Block> {
    let body_size = page_body_size(lines);
    let tables = detect_page_tables(page_number, lines, edges);
    if !tables.is_empty() {
        return build_blocks_with_tables(page_number, lines, tables, body_size);
    }

    let split_lines = split_wide_text_lines(lines);
    let mut text_blocks = Vec::new();
    for line in text_lines_in_reading_order(&split_lines) {
        if let Some(block) = text_block_from_line(page_number, line, body_size) {
            text_blocks.push(block);
        }
    }
    merge_wrapped_text_blocks(text_blocks)
        .into_iter()
        .map(Block::Text)
        .collect()
}
/// Repeatedly runs `detect_table` over the lines no earlier table has claimed.
///
/// Stops after eight tables, when fewer than two lines remain, when no table
/// is found, or when a found table maps back to no original line. The
/// returned `line_indices` refer to positions in `lines`.
fn detect_page_tables(
    page_number: usize,
    lines: &[TextLine],
    edges: &[GraphicEdge],
) -> Vec<DetectedTable> {
    const MAX_TABLES: usize = 8;

    let mut consumed = vec![false; lines.len()];
    let mut tables: Vec<DetectedTable> = Vec::new();
    for _ in 0..MAX_TABLES {
        // Positions in `lines` of the lines still available for detection.
        let mut mapping: Vec<usize> = Vec::new();
        for (index, taken) in consumed.iter().enumerate() {
            if !*taken {
                mapping.push(index);
            }
        }
        if mapping.len() < 2 {
            break;
        }

        let mut subset: Vec<TextLine> = Vec::with_capacity(mapping.len());
        for &index in &mapping {
            subset.push(lines[index].clone());
        }
        let Some(mut detected) = detect_table(page_number, &subset, edges) else {
            break;
        };

        // Translate subset positions back to positions in `lines`.
        let mut original: Vec<usize> = Vec::new();
        for subset_index in &detected.line_indices {
            if let Some(&line_index) = mapping.get(*subset_index) {
                original.push(line_index);
            }
        }
        if original.is_empty() {
            break;
        }

        for &line_index in &original {
            consumed[line_index] = true;
        }
        detected.line_indices = original;
        tables.push(detected);
    }
    tables
}
fn build_blocks_with_tables(
page_number: usize,
lines: &[TextLine],
mut tables: Vec<DetectedTable>,
body_size: f32,
) -> Vec<Block> {
let mut consumed = vec![false; lines.len()];
for table in &tables {
for &index in &table.line_indices {
if let Some(slot) = consumed.get_mut(index) {
*slot = true;
}
}
}
let remaining_lines = lines
.iter()
.enumerate()
.filter(|(line_index, _)| !consumed[*line_index])
.map(|(_, line)| line.clone())
.collect::<Vec<_>>();
let split_lines = split_wide_text_lines(&remaining_lines);
let text_blocks = merge_wrapped_text_blocks(
text_lines_in_reading_order(&split_lines)
.into_iter()
.filter_map(|line| text_block_from_line(page_number, line, body_size))
.collect(),
);
let table_top = |table: &DetectedTable| {
table
.table
.bbox
.map(|bbox| bbox.y + bbox.height)
.unwrap_or(f32::NEG_INFINITY)
};
tables.sort_by(|left, right| table_top(right).total_cmp(&table_top(left)));
let mut blocks = Vec::new();
let mut next_table = 0usize;
for text_block in text_blocks {
let block_top = text_block
.bbox
.map(|bbox| bbox.y + bbox.height)
.unwrap_or(f32::NEG_INFINITY);
while next_table < tables.len() && table_top(&tables[next_table]) > block_top {
blocks.push(Block::Table(tables[next_table].table.clone()));
next_table += 1;
}
blocks.push(Block::Text(text_block));
}
for table in tables.into_iter().skip(next_table) {
blocks.push(Block::Table(table.table));
}
blocks
}
/// Creates one figure block per image, in order.
///
/// Each block references the image by id, uses `Image <id>` as alternative
/// text, anchors to the image's box and object id, and carries an
/// uncalibrated confidence of 0.6.
fn image_figure_blocks(page_number: usize, images: &[ImageObject]) -> Vec<Block> {
    let mut figures = Vec::with_capacity(images.len());
    for image in images {
        let source_anchor = anchor(
            page_number,
            image.bbox,
            image.object_id.iter().cloned().collect(),
        );
        let figure = FigureBlock {
            alt_text: Some(format!("Image {}", image.id)),
            caption: None,
            bbox: image.bbox,
            image_ref: Some(image.id.clone()),
            source_anchors: vec![source_anchor],
            confidence: Some(Confidence {
                score: 0.6,
                calibrated: false,
            }),
            ..Default::default()
        };
        figures.push(Block::Figure(figure));
    }
    figures
}
/// Splits every line that `split_text_line_at_wide_gap` can divide into its
/// left and right halves; other lines are copied unchanged. Output order
/// follows input order, with the left half before the right half.
fn split_wide_text_lines(lines: &[TextLine]) -> Vec<TextLine> {
    // Tight column-band splitting is only enabled when several lines show evidence of it.
    let tight_band_enabled = has_repeated_tight_column_band_evidence(lines);
    let mut output = Vec::new();
    for line in lines {
        if let Some((left, right)) = split_text_line_at_wide_gap(line, tight_band_enabled) {
            output.extend([left, right]);
        } else {
            output.push(line.clone());
        }
    }
    output
}
/// Reports whether the runs are in non-decreasing order of their left edge.
fn line_runs_x_sorted(runs: &[TextRun]) -> bool {
    runs.iter()
        .zip(runs.iter().skip(1))
        .all(|(earlier, later)| earlier.bbox.x <= later.bbox.x)
}
/// Returns the runs of `line` ordered by left edge, borrowing them when they
/// are already in order and allocating a sorted copy otherwise.
fn runs_sorted_by_x(line: &TextLine) -> Cow<'_, [TextRun]> {
    if !line_runs_x_sorted(&line.runs) {
        let mut sorted = line.runs.to_vec();
        sorted.sort_by(|left, right| left.bbox.x.total_cmp(&right.bbox.x));
        return Cow::Owned(sorted);
    }
    Cow::Borrowed(line.runs.as_slice())
}
/// Tries to split a line into a left and a right part at a column boundary.
///
/// Two candidate split points are considered: the tight column-band split
/// (only when `enable_tight_column_band` is set) and the largest qualifying
/// gap between adjacent runs. Returns `None` when the line has fewer than two
/// runs, when it contains math notation and no tight split exists, when no
/// candidate exists, or when the right part looks like a row of figures
/// following a long leader gap.
fn split_text_line_at_wide_gap(
line: &TextLine,
enable_tight_column_band: bool,
) -> Option<(TextLine, TextLine)> {
if line.runs.len() < 2 {
return None;
}
let runs = runs_sorted_by_x(line);
let contains_math = runs
.iter()
.any(|run| looks_like_pdf_math_notation(&normalize_pdf_token(&run.text)));
let tight_column_split_index = enable_tight_column_band
.then(|| tight_column_band_split_index_for_runs(&runs[..]))
.flatten();
let largest_gap_split = largest_run_gap(&runs[..]);
// Lines with math are only split on tight column-band evidence, never on a plain gap.
if contains_math && tight_column_split_index.is_none() {
return None;
}
// The tight split wins unless the wide gap is explicitly preferred over it.
let split_index = match (tight_column_split_index, largest_gap_split) {
(Some(tight_index), Some((wide_index, gap, x_jump)))
if prefers_wide_gap_before_tight_band(&runs[..], wide_index, tight_index, gap, x_jump) =>
{
wide_index
}
(Some(tight_index), _) => tight_index,
(None, Some((wide_index, _, _))) => wide_index,
(None, None) => return None,
};
let left_runs = runs[..split_index].to_vec();
let right_runs = runs[split_index..].to_vec();
if left_runs.is_empty() || right_runs.is_empty() {
return None;
}
let right_value_cells = right_runs
.iter()
.filter(|run| is_numeric_value(&run.text))
.count();
let right_all_figures = right_runs.iter().all(|run| {
let text = run.text.trim();
text.is_empty()
|| is_value_cell(text)
|| matches!(text, "$" | "€" | "£" | "¥" | "(" | ")" | "($")
});
// Horizontal distance between the end of the left part and the start of the right part.
let leader_gap = right_runs.first().map_or(0.0, |run| run.bbox.x)
- left_runs
.last()
.map_or(0.0, |run| run.bbox.x + run.bbox.width);
// A label followed, after a long gap, by three or more numeric values is kept as one line.
if right_value_cells >= 3 && right_all_figures && leader_gap >= 100.0 {
return None;
}
Some((
text_line_from_runs(left_runs)?,
text_line_from_runs(right_runs)?,
))
}
/// Reports whether at least two lines have a tight column-band split point.
/// Scanning stops as soon as the second such line is found.
fn has_repeated_tight_column_band_evidence(lines: &[TextLine]) -> bool {
    let mut hits = 0usize;
    for line in lines {
        let runs = runs_sorted_by_x(line);
        if tight_column_band_split_index_for_runs(&runs[..]).is_some() {
            hits += 1;
            if hits >= 2 {
                return true;
            }
        }
    }
    false
}
/// Returns the right-column band split index for `runs`, unless the line
/// contains math notation and its left part does not permit a math split.
fn tight_column_band_split_index_for_runs(runs: &[TextRun]) -> Option<usize> {
    let split_index = right_column_band_split_index(runs)?;
    let has_math = runs
        .iter()
        .any(|run| looks_like_pdf_math_notation(&normalize_pdf_token(&run.text)));
    let blocked_by_math = has_math && !allows_math_column_split(&runs[..split_index]);
    if blocked_by_math {
        None
    } else {
        Some(split_index)
    }
}
/// Finds the first run index (at least 2) at which a right-hand column
/// appears to begin inside the expected horizontal band.
///
/// Requires at least three runs and a first run starting at x <= 120. A
/// candidate index must start within 300..=340 (or 280..=340 when the left
/// part is algorithm-like), leave at least two runs on the right unless the
/// left part is algorithm-like, overlap the previous run by no more than 35,
/// and have at least 18 bytes of trimmed text on the right.
fn right_column_band_split_index(runs: &[TextRun]) -> Option<usize> {
    if runs.len() < 3 {
        return None;
    }
    if runs.first()?.bbox.x > 120.0 {
        return None;
    }

    (2..runs.len()).find(|&index| {
        let algorithm_like_left = allows_math_column_split(&runs[..index]);
        let right_x = runs[index].bbox.x;

        let in_band = (300.0..=340.0).contains(&right_x)
            || (algorithm_like_left && (280.0..=340.0).contains(&right_x));
        if !in_band {
            return false;
        }

        let right_run_count = runs.len() - index;
        if right_run_count < 2 && !algorithm_like_left {
            return false;
        }

        let previous = &runs[index - 1].bbox;
        let gap = right_x - (previous.x + previous.width);
        if gap < -35.0 {
            return false;
        }

        let right_text_len: usize = runs[index..]
            .iter()
            .map(|run| run.text.trim().len())
            .sum();
        !(right_text_len < 18)
    })
}
/// Reports whether the left part of a line reads like an algorithm listing:
/// its space-joined text starts with a numbered step, `Require:`, `Ensure:`
/// or `Algorithm `.
fn allows_math_column_split(left_runs: &[TextRun]) -> bool {
    let mut joined = String::new();
    for piece in left_runs.iter().map(|run| run.text.trim()) {
        if piece.is_empty() {
            continue;
        }
        // Pieces are non-empty, so a non-empty buffer means this is not the first piece.
        if !joined.is_empty() {
            joined.push(' ');
        }
        joined.push_str(piece);
    }
    let head = joined.trim_start();
    if starts_with_numbered_step(head) {
        return true;
    }
    ["Require:", "Ensure:", "Algorithm "]
        .iter()
        .any(|prefix| head.starts_with(*prefix))
}
/// Finds the most prominent column-like gap between adjacent runs.
///
/// Returns `(split_index, gap, x_jump)`, where `split_index` is the index of
/// the run that starts the right part. Candidates are ranked by the larger of
/// `gap` and `x_jump`; among equally ranked candidates the last one wins.
fn largest_run_gap(runs: &[TextRun]) -> Option<(usize, f32, f32)> {
    let prominence = |candidate: &(usize, f32, f32)| candidate.1.max(candidate.2);
    runs.windows(2)
        .enumerate()
        .filter_map(|(index, pair)| {
            let (left, right) = (&pair[0].bbox, &pair[1].bbox);
            let gap = right.x - (left.x + left.width);
            let x_jump = right.x - left.x;
            if is_likely_column_split_gap(left, right, gap, x_jump) {
                Some((index + 1, gap, x_jump))
            } else {
                None
            }
        })
        .max_by(|left, right| prominence(left).total_cmp(&prominence(right)))
}
/// Reports whether the space between two boxes looks like a column boundary:
/// either the gap is at least 18, or the left edges are at least 110 apart
/// with `left` starting before x = 280 and `right` starting after it.
fn is_likely_column_split_gap(left: &BBox, right: &BBox, gap: f32, x_jump: f32) -> bool {
    let wide_gap = gap >= 18.0;
    wide_gap || (x_jump >= 110.0 && left.x < 280.0 && right.x > 280.0)
}
fn column_gutter_is_clear(lines: &[TextLine], midpoint: f32, min_y: f32, max_y: f32) -> bool {
let band = 4.0;
let mut region = 0usize;
let mut crossing = 0usize;
for line in lines {
if line.bbox.y < min_y - line.bbox.height || line.bbox.y > max_y + line.bbox.height {
continue;
}
region += 1;
if line.bbox.x < midpoint - band && line.bbox.x + line.bbox.width > midpoint + band {
crossing += 1;
}
}
region == 0 || (crossing as f32) <= (region as f32) * 0.25
}
/// Builds a line from `runs`: its box is the union of the run boxes and its
/// baseline is the mean run baseline. Returns `None` when `union_boxes`
/// yields no box.
fn text_line_from_runs(runs: Vec<TextRun>) -> Option<TextLine> {
    let bbox = union_boxes(runs.iter().map(|run| run.bbox))?;
    let baseline_total = runs.iter().map(|run| run.baseline_y).sum::<f32>();
    let run_count = runs.len() as f32;
    Some(TextLine {
        runs,
        bbox,
        baseline_y: baseline_total / run_count,
    })
}
/// Decides whether the wide-gap split should replace the tight-band split.
///
/// That is the case only when the wide gap comes first, the gap crosses
/// x = 280 with an x jump of at least 110 (and a gap no smaller than -160),
/// and every run between the two split points starts at x >= 280 with at most
/// two characters of trimmed text.
fn prefers_wide_gap_before_tight_band(
    runs: &[TextRun],
    wide_index: usize,
    tight_index: usize,
    gap: f32,
    x_jump: f32,
) -> bool {
    let indices_usable = wide_index != 0 && wide_index < tight_index && tight_index <= runs.len();
    if !indices_usable {
        return false;
    }

    let between = &runs[wide_index..tight_index];
    let stranded_right_glyphs = between
        .iter()
        .all(|run| run.bbox.x >= 280.0 && run.text.trim().chars().count() <= 2);
    if !stranded_right_glyphs {
        return false;
    }

    let before_gap = &runs[wide_index - 1].bbox;
    let after_gap = &runs[wide_index].bbox;
    before_gap.x < 280.0 && after_gap.x >= 280.0 && x_jump >= 110.0 && gap >= -160.0
}
fn text_lines_in_reading_order(lines: &[TextLine]) -> Vec<&TextLine> {
if let Some(layout) = detect_paired_text_columns(lines) {
return order_column_layout(layout);
}
if let Some(mut columns) = detect_text_columns(lines) {
columns.sort_by(|left, right| column_x(left).total_cmp(&column_x(right)));
return columns
.into_iter()
.flat_map(|mut column| {
column.sort_by(|left, right| {
right
.bbox
.y
.total_cmp(&left.bbox.y)
.then(left.bbox.x.total_cmp(&right.bbox.x))
});
column
})
.collect();
}
lines.iter().collect()
}
fn order_column_layout(mut layout: ColumnLayout<'_>) -> Vec<&TextLine> {
let mut ordered = Vec::new();
sort_lines_top_down(&mut layout.leading);
ordered.extend(layout.leading);
layout
.columns
.sort_by(|left, right| column_x(left).total_cmp(&column_x(right)));
for mut column in layout.columns {
sort_lines_top_down(&mut column);
ordered.extend(column);
}
sort_lines_top_down(&mut layout.trailing);
ordered.extend(layout.trailing);
ordered
}
/// Sorts lines by descending `y` (larger `y` first); ties are broken by
/// ascending `x`.
fn sort_lines_top_down(lines: &mut [&TextLine]) {
    lines.sort_by(|left, right| {
        let vertical = right.bbox.y.total_cmp(&left.bbox.y);
        vertical.then_with(|| left.bbox.x.total_cmp(&right.bbox.x))
    });
}
/// Detects a two-column layout from pairs of lines that sit side by side.
///
/// Seed pairs are lines at nearly the same `y` separated by a column-like
/// gap. At least two distinct left seeds and two distinct right seeds are
/// required, their average left edges must be at least 90 apart, and the
/// gutter between them must be mostly clear. Every line is then assigned to
/// the leading group, the trailing group, or the left or right column.
/// Returns `None` when there are fewer than four lines or when any of the
/// checks fails, including either column ending up with fewer than two lines.
fn detect_paired_text_columns(lines: &[TextLine]) -> Option<ColumnLayout<'_>> {
if lines.len() < 4 {
return None;
}
let mut left_seed_indices = Vec::new();
let mut right_seed_indices = Vec::new();
// Compare every ordered pair of lines; only pairs with `left` strictly to the left are kept.
for (left_index, left) in lines.iter().enumerate() {
for (right_index, right) in lines.iter().enumerate() {
if left_index == right_index || left.bbox.x >= right.bbox.x {
continue;
}
if (left.bbox.y - right.bbox.y).abs() > column_pair_y_tolerance(left, right) {
continue;
}
let gap = right.bbox.x - (left.bbox.x + left.bbox.width);
let x_jump = right.bbox.x - left.bbox.x;
if !is_likely_column_split_gap(&left.bbox, &right.bbox, gap, x_jump) {
continue;
}
left_seed_indices.push(left_index);
right_seed_indices.push(right_index);
}
}
dedupe_indices(&mut left_seed_indices);
dedupe_indices(&mut right_seed_indices);
if left_seed_indices.len() < 2 || right_seed_indices.len() < 2 {
return None;
}
let left_x = average_x(lines, &left_seed_indices)?;
let right_x = average_x(lines, &right_seed_indices)?;
if right_x - left_x < 90.0 {
return None;
}
// Vertical extent of the column region, taken from the `y` of all seed lines.
let column_min_y = left_seed_indices
.iter()
.chain(&right_seed_indices)
.map(|index| lines[*index].bbox.y)
.reduce(f32::min)?;
let column_max_y = left_seed_indices
.iter()
.chain(&right_seed_indices)
.map(|index| lines[*index].bbox.y)
.reduce(f32::max)?;
let abstract_y = abstract_heading_y(lines);
let midpoint = (left_x + right_x) / 2.0;
if !column_gutter_is_clear(lines, midpoint, column_min_y, column_max_y) {
return None;
}
let mut leading = Vec::new();
let mut trailing = Vec::new();
let mut left_column = Vec::new();
let mut right_column = Vec::new();
for line in lines {
// Front matter and anything more than one line height above the region leads.
if is_likely_front_matter_line(line, abstract_y)
|| line.bbox.y > column_max_y + line.bbox.height
{
leading.push(line);
// Only page numbers and footnotes well below the region are treated as trailing.
} else if line.bbox.y < column_min_y - line.bbox.height * 1.8
&& (is_likely_page_number_line(line) || is_likely_bottom_footnote_line(line))
{
trailing.push(line);
// Everything else goes to a column according to which side of the midpoint it starts on.
} else if line.bbox.x < midpoint {
left_column.push(line);
} else {
right_column.push(line);
}
}
if left_column.len() < 2 || right_column.len() < 2 {
return None;
}
Some(ColumnLayout {
leading,
columns: vec![left_column, right_column],
trailing,
})
}
/// Maximum `y` difference for two lines to count as sitting on the same row: 45% of the taller
/// line's bounding-box height.
fn column_pair_y_tolerance(left: &TextLine, right: &TextLine) -> f32 {
    let taller = f32::max(left.bbox.height, right.bbox.height);
    taller * 0.45
}
/// Returns the `bbox.y` of the first line whose plain text equals "abstract" (ASCII
/// case-insensitive), or `None` when no such line exists.
fn abstract_heading_y(lines: &[TextLine]) -> Option<f32> {
    for line in lines {
        if text_line_plain_text(line).eq_ignore_ascii_case("abstract") {
            return Some(line.bbox.y);
        }
    }
    None
}
/// True when an abstract heading position is known and the line's `bbox.y` is more than 36 units
/// greater than it. Always false when `abstract_y` is `None`.
fn is_likely_front_matter_line(line: &TextLine, abstract_y: Option<f32>) -> bool {
    match abstract_y {
        Some(heading_y) => line.bbox.y > heading_y + 36.0,
        None => false,
    }
}
/// Footnote heuristic: the line's average run size is at most 10.0 and its plain text is longer
/// than four bytes.
fn is_likely_bottom_footnote_line(line: &TextLine) -> bool {
    let small_type = average_run_size(line) <= 10.0;
    small_type && text_line_plain_text(line).len() > 4
}
/// Mean `size` of the line's runs. A line without runs falls back to its bounding-box height.
fn average_run_size(line: &TextLine) -> f32 {
    match line.runs.len() {
        0 => line.bbox.height,
        count => {
            let total = line.runs.iter().map(|run| run.size).sum::<f32>();
            total / count as f32
        }
    }
}
/// True when the line's plain text is one to four ASCII digits and nothing else.
fn is_likely_page_number_line(line: &TextLine) -> bool {
    let text = text_line_plain_text(line);
    // Any non-ASCII character contributes bytes >= 0x80, none of which is an ASCII digit.
    (1..=4).contains(&text.len()) && text.bytes().all(|byte| byte.is_ascii_digit())
}
/// Joins the line's runs (as ordered by `runs_sorted_by_x`) with `join_runs_spaced` and trims
/// surrounding whitespace.
fn text_line_plain_text(line: &TextLine) -> String {
    let ordered_runs = runs_sorted_by_x(line);
    let joined = join_runs_spaced(&ordered_runs);
    joined.trim().to_owned()
}
/// Sorts the indices ascending and removes duplicates, in place.
fn dedupe_indices(indices: &mut Vec<usize>) {
    indices.sort_unstable();
    // After sorting, duplicates are adjacent: keep an element only if it differs from the one
    // visited just before it.
    let mut last_seen = None;
    indices.retain(|&index| last_seen.replace(index) != Some(index));
}
/// Mean `bbox.x` of the lines selected by `indices`, or `None` when `indices` is empty.
///
/// # Panics
/// Panics if an index is out of bounds for `lines`.
fn average_x(lines: &[TextLine], indices: &[usize]) -> Option<f32> {
    let count = indices.len();
    (count > 0).then(|| {
        let total = indices
            .iter()
            .map(|&index| lines[index].bbox.x)
            .sum::<f32>();
        total / count as f32
    })
}
/// Splits `lines` into a left and a right column at the widest gap between horizontal centres.
///
/// Returns `None` unless all of the following hold: at least four lines; the widest centre gap is
/// at least 90 units; both sides hold at least two lines; the sides' vertical extents overlap
/// by at least the average line height; and the right side's smallest left edge is at least 15
/// units beyond the left side's largest right edge.
fn detect_text_columns(lines: &[TextLine]) -> Option<Vec<Vec<&TextLine>>> {
    if lines.len() < 4 {
        return None;
    }
    // Pair every line with its horizontal centre and order by that centre (stable sort).
    let mut by_center: Vec<(f32, &TextLine)> = lines
        .iter()
        .map(|line| (line.bbox.x + line.bbox.width / 2.0, line))
        .collect();
    by_center.sort_by(|a, b| a.0.total_cmp(&b.0));

    let (split_at, widest_gap) = by_center
        .windows(2)
        .enumerate()
        .map(|(position, pair)| (position + 1, pair[1].0 - pair[0].0))
        .max_by(|a, b| a.1.total_cmp(&b.1))?;
    if widest_gap < 90.0 {
        return None;
    }

    let (left_part, right_part) = by_center.split_at(split_at);
    if left_part.len() < 2 || right_part.len() < 2 {
        return None;
    }
    let left: Vec<&TextLine> = left_part.iter().map(|&(_, line)| line).collect();
    let right: Vec<&TextLine> = right_part.iter().map(|&(_, line)| line).collect();

    let shared_height = y_overlap(&left, &right)?;
    if shared_height < average_line_height(lines) {
        return None;
    }

    let left_extent = left
        .iter()
        .map(|line| line.bbox.x + line.bbox.width)
        .fold(f32::MIN, f32::max);
    let right_start = right
        .iter()
        .map(|line| line.bbox.x)
        .fold(f32::MAX, f32::min);
    if right_start - left_extent < 15.0 {
        return None;
    }
    Some(vec![left, right])
}
/// Mean `bbox.x` of a column's lines; `0.0` for an empty column.
fn column_x(lines: &[&TextLine]) -> f32 {
    let count = lines.len();
    if count == 0 {
        0.0
    } else {
        lines.iter().map(|line| line.bbox.x).sum::<f32>() / count as f32
    }
}
/// Length of the overlap between the two groups' `y` ranges, where a group's range runs from its
/// smallest `bbox.y` to its largest `bbox.y + bbox.height`. Disjoint ranges give `0.0`; an empty
/// group gives `None`.
fn y_overlap(left: &[&TextLine], right: &[&TextLine]) -> Option<f32> {
    let y_range = |group: &[&TextLine]| -> Option<(f32, f32)> {
        let min_y = group.iter().map(|line| line.bbox.y).reduce(f32::min)?;
        let max_y = group
            .iter()
            .map(|line| line.bbox.y + line.bbox.height)
            .reduce(f32::max)?;
        Some((min_y, max_y))
    };
    let (left_min, left_max) = y_range(left)?;
    let (right_min, right_max) = y_range(right)?;
    let shared = left_max.min(right_max) - left_min.max(right_min);
    Some(shared.max(0.0))
}
/// Mean bounding-box height over `lines`.
///
/// There is no empty-slice guard: zero lines divides a zero sum by zero, which is NaN.
fn average_line_height(lines: &[TextLine]) -> f32 {
    lines.iter().map(|line| line.bbox.height).sum::<f32>() / lines.len() as f32
}
/// Builds a single-line `TextBlock` from a `TextLine`.
///
/// The block text is the line's run text passed through `clean_pdf_line_text`; `None` is returned
/// when that text is empty. Each run becomes a `Span` unless its cleaned text is empty. The block
/// carries one source anchor for the line and an uncalibrated confidence score of 0.82.
fn text_block_from_line(page_number: usize, line: &TextLine, body_size: f32) -> Option<TextBlock> {
    let text = clean_pdf_line_text(&text_from_line_runs(line));
    if text.is_empty() {
        return None;
    }
    let kind = classify_text_line(&text, line_dominant_size(line), body_size);

    let mut spans = Vec::with_capacity(line.runs.len());
    for run in &line.runs {
        let span_text = clean_pdf_span_text(&run.text);
        if span_text.is_empty() {
            continue;
        }
        spans.push(Span {
            text: span_text,
            bbox: Some(run.bbox),
            font: run.font.clone(),
            size: Some(run.size),
            bold: run.bold,
            italic: run.italic,
        });
    }

    let line_anchor = anchor(page_number, Some(line.bbox), source_ids_for_line(line));
    Some(TextBlock {
        text: text.clone(),
        kind,
        bbox: Some(line.bbox),
        lines: vec![Line {
            text,
            bbox: Some(line.bbox),
            spans,
        }],
        source_anchors: vec![line_anchor],
        confidence: Some(Confidence {
            score: 0.82,
            calibrated: false,
        }),
        ..Default::default()
    })
}
fn adaptive_single_glyph_gap(runs: &[TextRun]) -> Option<f32> {
let mut gaps: Vec<f32> = Vec::new();
let mut space_w = 0.0f32;
let mut prev_end: Option<f32> = None;
for run in runs {
if run.text.is_empty() {
continue;
}
space_w = space_w.max(run.space_width);
if let Some(end) = prev_end {
let gap = run.bbox.x - end;
if gap.is_finite() && gap > 0.0 {
gaps.push(gap);
}
}
prev_end = Some(run.bbox.x + run.bbox.width);
}
if gaps.len() < 3 || space_w <= 0.0 {
return None;
}
gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let median = gaps[gaps.len() / 2];
Some((median * 1.8).clamp(space_w * 0.08, space_w * 0.4))
}
/// Concatenates run texts in the given order, inserting a single space between two runs when the
/// geometry suggests a word break.
///
/// Empty runs are skipped. A space is inserted only if the output is non-empty, neither side of
/// the boundary already has whitespace, and at least one of these holds: the horizontal gap
/// reaches the threshold, the baselines differ noticeably, or the runs overlap strongly while
/// being treated as separate tokens.
fn join_runs_spaced(runs: &[TextRun]) -> String {
let mut out = String::new();
let adaptive_glyph_gap = adaptive_single_glyph_gap(runs);
// State of the previously emitted run: (right edge x, space width, baseline y, had >= 2 chars).
let mut previous: Option<(f32, f32, f32, bool)> = None;
for run in runs {
if run.text.is_empty() {
continue;
}
let multi_char = run.text.trim().chars().count() >= 2;
if let Some((prev_end_x, prev_space_width, prev_baseline_y, prev_multi)) = previous {
let boundary_has_space = out.ends_with(char::is_whitespace)
|| run.text.starts_with(char::is_whitespace);
let gap = run.bbox.x - prev_end_x;
// Digit followed by digit is treated as one number being continued.
let numeric_continuation = out.trim_end().ends_with(|c: char| c.is_ascii_digit())
&& run.text.trim_start().starts_with(|c: char| c.is_ascii_digit());
// Runs count as separate tokens when either side has two or more characters, unless
// they continue a number.
let tokens_separate = (prev_multi || multi_char) && !numeric_continuation;
// The adaptive threshold applies only to boundaries that are not token boundaries;
// everything else uses the space-width based threshold.
let threshold = match adaptive_glyph_gap {
Some(adaptive) if !tokens_separate => adaptive,
_ => word_gap_threshold(prev_space_width, run.space_width, run.size, tokens_separate),
};
// Baseline shift of at least 18% of the run size (size floored at 1.0).
let baseline_break =
(prev_baseline_y - run.baseline_y).abs() >= run.size.max(1.0) * 0.18;
// Negative gap (overlap) of at least 60% of the larger space width, minimum 0.5.
let overlap_break =
tokens_separate && gap <= -(prev_space_width.max(run.space_width) * 0.6).max(0.5);
if !out.is_empty()
&& !boundary_has_space
&& (gap >= threshold || baseline_break || overlap_break)
{
out.push(' ');
}
}
out.push_str(&run.text);
previous = Some((
run.bbox.x + run.bbox.width,
run.space_width,
run.baseline_y,
multi_char,
));
}
out
}
/// Minimum horizontal gap that counts as a word break.
///
/// The reference space width is the largest of the two runs' space widths, a quarter of `size`,
/// and `0.1`. It is scaled by 0.1 when the runs are separate tokens and by 0.4 otherwise.
fn word_gap_threshold(
    left_space_width: f32,
    right_space_width: f32,
    size: f32,
    tokens_separate: bool,
) -> f32 {
    let factor = if tokens_separate { 0.1 } else { 0.4 };
    let widest_space = f32::max(left_space_width, right_space_width);
    let floor = f32::max(size * 0.25, 0.1);
    f32::max(widest_space, floor) * factor
}
/// Produces the text of a line from its runs.
///
/// Lines without math-script context, or without a usable dominant baseline, are joined with
/// `join_runs_spaced`. Otherwise each trimmed, non-empty run becomes a piece; a run classified as
/// super/subscript is appended to the preceding piece in `^`/`_` notation when
/// `can_attach_math_script` allows it. Pieces are joined with single spaces.
fn text_from_line_runs(line: &TextLine) -> String {
    let runs = runs_sorted_by_x(line);
    let math_baseline = if line_has_math_script_context(&runs[..]) {
        dominant_baseline_y(&runs[..])
    } else {
        None
    };
    let Some(baseline_y) = math_baseline else {
        return join_runs_spaced(&runs[..]);
    };

    let mut pieces: Vec<String> = Vec::new();
    for run in runs.iter() {
        let token = run.text.trim();
        if token.is_empty() {
            continue;
        }
        let script = script_kind_for_run(run, baseline_y);
        if let (Some(script), Some(previous)) = (script, pieces.last_mut()) {
            if can_attach_math_script(previous, token) {
                previous.push_str(&format_math_script(script, token));
                continue;
            }
        }
        pieces.push(token.to_owned());
    }
    pieces.join(" ")
}
/// Median baseline of the runs whose size is at least 80% of the largest run size.
///
/// Returns `None` when there are no runs or the largest size is not strictly positive. If the
/// size filter leaves nothing, every run's baseline is used instead.
fn dominant_baseline_y(runs: &[TextRun]) -> Option<f32> {
    let largest = runs.iter().map(|run| run.size).reduce(f32::max)?;
    if largest.is_nan() || largest <= 0.0 {
        return None;
    }
    let cutoff = largest * 0.8;
    let mut baselines: Vec<f32> = runs
        .iter()
        .filter(|run| run.size >= cutoff)
        .map(|run| run.baseline_y)
        .collect();
    if baselines.is_empty() {
        baselines.extend(runs.iter().map(|run| run.baseline_y));
    }
    baselines.sort_by(f32::total_cmp);
    baselines.get(baselines.len() / 2).copied()
}
/// Classifies a run by how far its baseline is from `baseline_y`.
///
/// The required offset is a quarter of the run size, clamped to `2.0..=4.0`. A baseline at least
/// that much greater than `baseline_y` is a superscript, at least that much smaller is a
/// subscript, anything in between is neither.
fn script_kind_for_run(run: &TextRun, baseline_y: f32) -> Option<ScriptKind> {
    let offset = run.baseline_y - baseline_y;
    let limit = (run.size * 0.25).clamp(2.0, 4.0);
    let raised = offset >= limit;
    let lowered = offset <= -limit;
    match (raised, lowered) {
        (true, _) => Some(ScriptKind::Superscript),
        (false, true) => Some(ScriptKind::Subscript),
        (false, false) => None,
    }
}
/// Decides whether a line should get math super/subscript handling.
///
/// True when any run contains one of a fixed set of math marker characters, or when two adjacent
/// runs have baselines at least 20% of the larger run size apart, with the first looking like a
/// script base and the second like script text.
fn line_has_math_script_context(runs: &[TextRun]) -> bool {
    const MATH_MARKERS: &[char] = &[
        '=', '+', '−', '×', '*', '^', '_', '∈', '≤', '≥', '≠', 'λ', 'θ', 'ρ', 'τ', 'Σ', '∑',
    ];
    let has_marker = runs
        .iter()
        .flat_map(|run| run.text.chars())
        .any(|character| MATH_MARKERS.contains(&character));
    if has_marker {
        return true;
    }
    runs.windows(2).any(|pair| {
        let (base, script) = (&pair[0], &pair[1]);
        let shift = (base.baseline_y - script.baseline_y).abs();
        let required = base.size.max(script.size) * 0.2;
        shift >= required
            && is_math_script_base(base.text.trim())
            && is_math_script_text(script.text.trim())
    })
}
/// Whether `token` may be appended to `previous` as a super/subscript: `previous` must not
/// already end in `^` or `_`, `token` must look like script text, and `previous` must end in
/// something that can carry a script.
fn can_attach_math_script(previous: &str, token: &str) -> bool {
    if previous.ends_with(|character: char| matches!(character, '^' | '_')) {
        return false;
    }
    is_math_script_text(token) && previous_has_math_script_base(previous)
}
/// True when the token, after stripping `(`, `[` and `{` from both ends, is a single
/// alphanumeric character or starts with a backslash.
fn is_math_script_base(token: &str) -> bool {
    let core = token.trim_matches(&['(', '[', '{'][..]);
    let mut characters = core.chars();
    match (characters.next(), characters.next()) {
        (Some(only), None) => only.is_alphanumeric() || only == '\\',
        (Some(first), Some(_)) => first == '\\',
        _ => false,
    }
}
/// Whether the accumulated piece ends in something a script can attach to.
///
/// If it ends (ignoring trailing whitespace) with a closing bracket, it must contain `\`, `_` or
/// `^` somewhere. Otherwise the last character that is not `*`, `'` or `′` must be alphabetic or
/// a backslash.
fn previous_has_math_script_base(previous: &str) -> bool {
    let tail = previous.trim_end();
    match tail.chars().next_back() {
        Some('}' | ']' | ')') => tail
            .chars()
            .any(|character| matches!(character, '\\' | '_' | '^')),
        _ => tail
            .chars()
            .rev()
            .skip_while(|character| matches!(character, '*' | '\'' | '′'))
            .next()
            .is_some_and(|character| character.is_alphabetic() || character == '\\'),
    }
}
/// True when the token, after stripping round and square brackets from both ends, is non-empty
/// and consists only of alphanumerics and the characters `+ - − = , . \`.
fn is_math_script_text(token: &str) -> bool {
    let core = token.trim_matches(&['(', ')', '[', ']'][..]);
    if core.is_empty() {
        return false;
    }
    for character in core.chars() {
        let allowed = character.is_alphanumeric()
            || matches!(character, '+' | '-' | '−' | '=' | ',' | '.' | '\\');
        if !allowed {
            return false;
        }
    }
    true
}
/// Renders a script token as `^token` / `_token`. The trimmed token is wrapped in braces unless it
/// is a single character or consists solely of ASCII alphanumerics.
fn format_math_script(kind: ScriptKind, token: &str) -> String {
    let marker = match kind {
        ScriptKind::Superscript => '^',
        ScriptKind::Subscript => '_',
    };
    let body = token.trim();
    let single_character = body.chars().count() == 1;
    let needs_braces = !single_character
        && !body
            .chars()
            .all(|character| character.is_ascii_alphanumeric());
    let mut formatted = String::with_capacity(body.len() + 3);
    formatted.push(marker);
    if needs_braces {
        formatted.push('{');
        formatted.push_str(body);
        formatted.push('}');
    } else {
        formatted.push_str(body);
    }
    formatted
}
/// Walks the blocks in order and folds each one into the block before it whenever
/// `should_merge_text_blocks` says it continues that block; otherwise it starts a new entry.
fn merge_wrapped_text_blocks(blocks: Vec<TextBlock>) -> Vec<TextBlock> {
    let mut merged: Vec<TextBlock> = Vec::with_capacity(blocks.len());
    for block in blocks {
        let continues_previous = merged
            .last()
            .is_some_and(|previous| should_merge_text_blocks(previous, &block));
        if continues_previous {
            if let Some(previous) = merged.last_mut() {
                merge_text_block(previous, block);
            }
        } else {
            merged.push(block);
        }
    }
    merged
}
/// Decides whether `next` is a wrapped continuation of `previous`.
///
/// Both blocks need a bounding box; `next.y` must be strictly below `previous.y` by at most 1.8
/// times the taller box height; and the left edges must be within 18 units. Given that, a
/// trailing hyphen followed by a lowercase start merges unconditionally. Otherwise two numbered
/// steps never merge, both blocks must be of kind "paragraph", and `next` must start lowercase
/// while `previous` does not end a sentence.
fn should_merge_text_blocks(previous: &TextBlock, next: &TextBlock) -> bool {
    let (Some(previous_bbox), Some(next_bbox)) = (previous.bbox, next.bbox) else {
        return false;
    };
    let vertical_step = previous_bbox.y - next_bbox.y;
    let max_step = previous_bbox.height.max(next_bbox.height) * 1.8;
    if vertical_step <= 0.0 || vertical_step > max_step {
        return false;
    }
    // Every merging outcome requires aligned left edges.
    let x_aligned = (previous_bbox.x - next_bbox.x).abs() <= 18.0;
    if !x_aligned {
        return false;
    }
    if previous.text.ends_with('-') && starts_with_lowercase(&next.text) {
        return true;
    }
    if starts_with_numbered_step(&previous.text) && starts_with_numbered_step(&next.text) {
        return false;
    }
    if previous.kind != "paragraph" || next.kind != "paragraph" {
        return false;
    }
    starts_with_lowercase(&next.text) && !ends_sentence(&previous.text)
}
fn merge_text_block(previous: &mut TextBlock, next: TextBlock) {
previous.text = join_wrapped_text(&previous.text, &next.text);
previous.bbox = union_boxes(previous.bbox.into_iter().chain(next.bbox)).or(previous.bbox);
previous.lines.extend(next.lines);
for anchor in next.source_anchors {
previous.source_anchors.push(anchor);
}
}
/// Joins two wrapped lines. A trailing `-` on `previous` is dropped and the texts are glued
/// together; otherwise they are joined with exactly one space. Leading whitespace of `next` is
/// removed in both cases.
fn join_wrapped_text(previous: &str, next: &str) -> String {
    let continuation = next.trim_start();
    match previous.strip_suffix('-') {
        Some(stem) => [stem, continuation].concat(),
        None => [previous.trim_end(), continuation].join(" "),
    }
}
/// True when the first alphabetic character of `text` is lowercase; false when there is no
/// alphabetic character at all.
fn starts_with_lowercase(text: &str) -> bool {
    for character in text.chars() {
        if character.is_alphabetic() {
            return character.is_lowercase();
        }
    }
    false
}
/// True when `text`, after leading whitespace, begins with one or more ASCII digits immediately
/// followed by `:` or `.`.
fn starts_with_numbered_step(text: &str) -> bool {
    let trimmed = text.trim_start();
    let after_digits = trimmed.trim_start_matches(|character: char| character.is_ascii_digit());
    let has_digits = after_digits.len() < trimmed.len();
    has_digits && after_digits.starts_with(|character: char| matches!(character, ':' | '.'))
}
/// True when the last non-whitespace character of `text` is `.`, `!` or `?`.
fn ends_sentence(text: &str) -> bool {
    matches!(
        text.trim_end().chars().next_back(),
        Some('.' | '!' | '?')
    )
}
/// Cleans the text of one extracted line.
///
/// The text is split on whitespace, each token is normalised with `normalize_pdf_token`, and the
/// tokens are re-assembled while gluing together pieces that extraction split apart (closing
/// punctuation, apostrophes, hyphens, known word pieces, runs of single letters). The result is
/// joined with single spaces and passed through `repair_pdf_word_fragment_phrases` and
/// `repair_pdf_math_notation`.
fn clean_pdf_line_text(text: &str) -> String {
// U+0085 must become "..." before tokenising.
// NOTE(review): presumably because `split_whitespace` would otherwise treat it as a separator — confirm.
let text = repair_windows_1252_ellipsis_before_tokenizing(text);
let tokens = text
.split_whitespace()
.map(normalize_pdf_token)
.filter(|token| !token.is_empty())
.collect::<Vec<_>>();
let mut cleaned: Vec<String> = Vec::new();
let mut index = 0;
// The rules below are tried in order; the first one that applies consumes the token(s).
while index < tokens.len() {
let token = tokens[index].as_str();
// Rule 1: a lone closing punctuation token is appended to the previous word.
if is_closing_punctuation_token(token) && !cleaned.is_empty() {
let previous = cleaned.last_mut().expect("checked non-empty");
previous.push_str(token);
index += 1;
continue;
}
// Rule 2: word ' piece  ->  word'piece (the apostrophe is always written as ASCII ').
if is_joining_apostrophe(token) && !cleaned.is_empty() && index + 1 < tokens.len() {
let next = tokens[index + 1].as_str();
if is_word_piece(next) {
let previous = cleaned.last_mut().expect("checked non-empty");
previous.push('\'');
previous.push_str(next);
index += 2;
continue;
}
}
// Rule 3: word - piece  ->  word-piece (the hyphen is always written as ASCII -).
if is_joining_hyphen(token) && !cleaned.is_empty() && index + 1 < tokens.len() {
let next = tokens[index + 1].as_str();
if is_word_piece(next) {
let previous = cleaned.last_mut().expect("checked non-empty");
previous.push('-');
previous.push_str(next);
index += 2;
continue;
}
}
// Rules 4 and 5: glue onto a previous token that ends in a hyphen, or that forms one of
// the known split words with this token.
if let Some(previous) = cleaned.last_mut() {
if should_join_after_trailing_hyphen(previous, token) {
previous.push_str(token);
index += 1;
continue;
}
if should_join_pdf_word_piece(previous, token) {
previous.push_str(token);
index += 1;
continue;
}
}
// Rule 6: two or more consecutive letter fragments are concatenated into one token.
// A single isolated fragment falls through and is kept as-is.
if is_letter_fragment(token) {
let mut merged = String::new();
let mut end = index;
while end < tokens.len() && is_letter_fragment(tokens[end].as_str()) {
merged.push_str(tokens[end].as_str());
end += 1;
}
if end - index >= 2 {
cleaned.push(merged);
index = end;
continue;
}
}
cleaned.push(token.to_owned());
index += 1;
}
repair_pdf_math_notation(&repair_pdf_word_fragment_phrases(&cleaned.join(" ")))
}
/// Cleans the text of a single span: token normalisation followed by math-notation repair.
fn clean_pdf_span_text(text: &str) -> String {
    let normalized = normalize_pdf_token(text);
    repair_pdf_math_notation(&normalized)
}
/// Repairs a fixed list of words that PDF extraction is known to split with a stray space
/// (for example "th e" -> "the").
///
/// A fragment is only repaired when it stands on its own: the character before the match and the
/// character after it must not be alphanumeric (or the match touches the start/end of the text).
/// A plain substring replacement would also rewrite ordinary text such as "with eggs"
/// ("th e" inside it) into "witheggs" or "turn south" into "turnsouth".
///
/// Repairs are applied in list order, each on the result of the previous one.
fn repair_pdf_word_fragment_phrases(text: &str) -> String {
    const REPAIRS: [(&str, &str); 24] = [
        ("a c onversatio n", "a conversation"),
        ("ac onversatio n", "a conversation"),
        ("an other", "another"),
        ("ce nters", "centers"),
        ("prod uction", "production"),
        ("de mands", "demands"),
        ("turn s", "turns"),
        ("coordinate s", "coordinates"),
        ("coordinat e", "coordinate"),
        ("facilitat e", "facilitate"),
        ("speake rs", "speakers"),
        ("listener s'", "listeners'"),
        ("th e", "the"),
        ("p resent", "present"),
        ("linguisti c", "linguistic"),
        ("an d", "and"),
        ("inferen ces", "inferences"),
        ("attentio n", "attention"),
        ("B eyond", "Beyond"),
        ("variabilit y", "variability"),
        ("l essons", "lessons"),
        ("re peating", "repeating"),
        ("import ant", "important"),
        ("sp ecified", "specified"),
    ];
    let mut repaired = text.to_owned();
    for (broken, fixed) in REPAIRS {
        // Skip the rebuild entirely when the fragment does not occur.
        if repaired.contains(broken) {
            repaired = replace_fragment_at_word_boundaries(&repaired, broken, fixed);
        }
    }
    repaired
}

/// Replaces every occurrence of `broken` in `text` with `fixed`, but only where the occurrence is
/// not directly preceded or followed by an alphanumeric character.
fn replace_fragment_at_word_boundaries(text: &str, broken: &str, fixed: &str) -> String {
    let mut output = String::with_capacity(text.len());
    // `copied_to` marks how much of `text` has been written to `output`;
    // `search_from` is where the next search starts.
    let mut copied_to = 0;
    let mut search_from = 0;
    while let Some(found) = text[search_from..].find(broken) {
        let start = search_from + found;
        let end = start + broken.len();
        let starts_word = text[..start]
            .chars()
            .next_back()
            .map_or(true, |character| !character.is_alphanumeric());
        let ends_word = text[end..]
            .chars()
            .next()
            .map_or(true, |character| !character.is_alphanumeric());
        if starts_word && ends_word {
            output.push_str(&text[copied_to..start]);
            output.push_str(fixed);
            copied_to = end;
            search_from = end;
        } else {
            // Not a standalone fragment: resume one character further on.
            let step = text[start..].chars().next().map_or(1, char::len_utf8);
            search_from = start + step;
        }
    }
    output.push_str(&text[copied_to..]);
    output
}
/// Normalises one extracted PDF token.
///
/// Steps, in order: repair UTF-8 text that was decoded as Latin-1/Windows-1252 (mojibake),
/// straighten curly quotes, expand Latin ligatures, map stray C1 control characters to
/// punctuation, and repair embedded control glyphs.
///
/// The mojibake patterns are written with `\u{..}` escapes on purpose. Spelled as literal
/// characters they are indistinguishable from their repaired form once the source file itself
/// passes through an encoding fixer, which turns the replacement into a no-op (pattern equal to
/// replacement). For the three letters whose second UTF-8 byte falls in 0x80..=0x9F, both the
/// Latin-1 (C1 control) and the Windows-1252 (printable) decodings are handled.
fn normalize_pdf_token(token: &str) -> String {
    const MOJIBAKE_REPAIRS: [(&str, &str); 22] = [
        ("\u{e2}\u{80}\u{98}", "'"),        // left single quote
        ("\u{e2}\u{80}\u{99}", "'"),        // right single quote
        ("\u{c2}\u{b7}", "\u{b7}"),         // middle dot
        ("\u{e2}\u{84}\u{93}", "\u{2113}"), // script small l
        ("\u{ce}\u{93}", "\u{393}"),        // Gamma (Latin-1 decoding)
        ("\u{ce}\u{201c}", "\u{393}"),      // Gamma (Windows-1252 decoding)
        ("\u{ce}\u{98}", "\u{398}"),        // Theta (Latin-1 decoding)
        ("\u{ce}\u{2dc}", "\u{398}"),       // Theta (Windows-1252 decoding)
        ("\u{ce}\u{9b}", "\u{39b}"),        // Lambda (Latin-1 decoding)
        ("\u{ce}\u{203a}", "\u{39b}"),      // Lambda (Windows-1252 decoding)
        ("\u{ce}\u{a0}", "\u{3a0}"),        // Pi
        ("\u{ce}\u{a3}", "\u{3a3}"),        // Sigma
        ("\u{ce}\u{a6}", "\u{3a6}"),        // Phi
        ("\u{ce}\u{a9}", "\u{3a9}"),        // Omega
        ("\u{ce}\u{bb}", "\u{3bb}"),        // small lambda
        ("\u{cf}\u{84}", "\u{3c4}"),        // small tau
        ("\u{c3}\u{97}", "\u{d7}"),         // multiplication sign
        ("\u{e2}\u{86}\u{92}", "\u{2192}"), // rightwards arrow
        ("\u{e2}\u{89}\u{a5}", "\u{2265}"), // greater-than or equal
        ("\u{e2}\u{89}\u{a4}", "\u{2264}"), // less-than or equal
        ("\u{e2}\u{88}\u{88}", "\u{2208}"), // element of
        ("\u{e2}\u{88}\u{91}", "\u{2211}"), // n-ary summation
    ];
    let mut normalized = token.to_owned();
    for (garbled, repaired) in MOJIBAKE_REPAIRS {
        if normalized.contains(garbled) {
            normalized = normalized.replace(garbled, repaired);
        }
    }
    // Quote straightening must run after the mojibake pass: the Windows-1252 decoding of Gamma
    // contains a left double quotation mark.
    let normalized = normalized
        .replace(['‘', '’'], "'")
        .replace(['“', '”'], "\"");
    let normalized = expand_latin_ligatures(&normalized);
    let normalized = repair_windows_1252_control_punctuation(&normalized);
    repair_embedded_pdf_control_glyphs(&normalized)
}
/// Replaces the Latin ligature code points U+FB00..=U+FB06 with their ASCII letter sequences
/// (`ff`, `fi`, `fl`, `ffi`, `ffl`, `st`, `st`); every other character is copied unchanged.
fn expand_latin_ligatures(text: &str) -> String {
    let mut output = String::with_capacity(text.len());
    for character in text.chars() {
        let expansion = match character {
            '\u{FB00}' => "ff",
            '\u{FB01}' => "fi",
            '\u{FB02}' => "fl",
            '\u{FB03}' => "ffi",
            '\u{FB04}' => "ffl",
            '\u{FB05}' | '\u{FB06}' => "st",
            _ => {
                output.push(character);
                continue;
            }
        };
        output.push_str(expansion);
    }
    output
}
/// Maps C1 control characters (U+0080..=U+009F) to a printable replacement for the Windows-1252
/// character at the same byte value. Characters without a mapping are copied unchanged.
fn repair_windows_1252_control_punctuation(text: &str) -> String {
    let mut output = String::with_capacity(text.len());
    // Scratch space for re-encoding characters that pass through untouched.
    let mut utf8 = [0u8; 4];
    for character in text.chars() {
        let replacement: &str = match character {
            '\u{80}' => "EUR",
            '\u{82}' => ",",
            '\u{83}' => "f",
            '\u{84}' | '\u{93}' | '\u{94}' => "\"",
            '\u{85}' => "...",
            '\u{86}' => "†",
            '\u{87}' => "‡",
            '\u{88}' => "^",
            '\u{89}' => "‰",
            '\u{8a}' => "Š",
            '\u{8b}' => "<",
            '\u{8c}' => "OE",
            '\u{8e}' => "Ž",
            '\u{91}' | '\u{92}' => "'",
            '\u{95}' => "*",
            '\u{96}' => "–",
            '\u{97}' => "—",
            '\u{98}' => "~",
            '\u{99}' => "(TM)",
            '\u{9a}' => "š",
            '\u{9b}' => ">",
            '\u{9c}' => "oe",
            '\u{9e}' => "ž",
            '\u{9f}' => "Ÿ",
            other => &*other.encode_utf8(&mut utf8),
        };
        output.push_str(replacement);
    }
    output
}
/// Replaces every U+0085 character with three ASCII dots.
fn repair_windows_1252_ellipsis_before_tokenizing(text: &str) -> String {
    text.split('\u{85}').collect::<Vec<_>>().join("...")
}
fn repair_embedded_pdf_control_glyphs(token: &str) -> String {
let characters = token.chars().collect::<Vec<_>>();
let mut output = String::with_capacity(token.len());
for (index, character) in characters.iter().enumerate() {
match character {
'\u{2}' if has_following_alphabetic(&characters, index + 1) => {
output.push_str("fi");
}
'\u{2}' => {}
'\u{3}' if has_following_alphabetic(&characters, index + 1) => {
output.push_str("fl");
}
_ => output.push(*character),
}
}
output
}
/// True when `characters[index]` exists and is alphabetic.
fn has_following_alphabetic(characters: &[char], index: usize) -> bool {
    match characters.get(index) {
        Some(character) => character.is_alphabetic(),
        None => false,
    }
}
/// True when the token is exactly one of `. , : ; ! ? ) ] }`.
fn is_closing_punctuation_token(token: &str) -> bool {
    const CLOSERS: [&str; 9] = [".", ",", ":", ";", "!", "?", ")", "]", "}"];
    CLOSERS.iter().any(|closer| *closer == token)
}
/// True when `previous` ends with `-`, contains at least one ASCII alphanumeric character, and
/// `token` starts with an ASCII alphanumeric character.
fn should_join_after_trailing_hyphen(previous: &str, token: &str) -> bool {
    if !previous.ends_with('-') {
        return false;
    }
    let token_starts_word = token.starts_with(|character: char| character.is_ascii_alphanumeric());
    token_starts_word && previous.contains(|character: char| character.is_ascii_alphanumeric())
}
/// True only for a fixed set of (previous, token) pairs that are known split words. Both parts
/// must also be purely alphabetic, with `previous` ending and `token` starting in lowercase.
fn should_join_pdf_word_piece(previous: &str, token: &str) -> bool {
    const KNOWN_SPLITS: [(&str, &str); 4] = [
        ("coordina", "ting"),
        ("de", "scribe"),
        ("foc", "i"),
        ("pro", "posed"),
    ];
    let both_words = is_alphabetic_word(previous) && is_alphabetic_word(token);
    if !both_words {
        return false;
    }
    let lowercase_seam = previous
        .chars()
        .next_back()
        .is_some_and(char::is_lowercase)
        && starts_with_lowercase(token);
    lowercase_seam
        && KNOWN_SPLITS
            .iter()
            .any(|&(head, tail)| head == previous && tail == token)
}
/// True when the token is non-empty and every character is alphabetic.
fn is_alphabetic_word(token: &str) -> bool {
    let mut characters = token.chars().peekable();
    characters.peek().is_some() && characters.all(char::is_alphabetic)
}
/// Converts math glyphs in extracted text to LaTeX-style notation when the text looks like math.
///
/// Two mojibake sequences (middle dot and script small l) are repaired first. Text that does not
/// look like math only has PDF control glyphs stripped. Math-like text additionally gets
/// combining-negation sequences repaired, symbols replaced and subscript spacing fixed.
///
/// The mojibake patterns use `\u{..}` escapes: written as a literal, the middle-dot pattern is
/// indistinguishable from its replacement once the source file passes through an encoding fixer,
/// which turns the call into a no-op.
fn repair_pdf_math_notation(text: &str) -> String {
    let normalized = text
        .replace("\u{c2}\u{b7}", "\u{b7}")
        .replace("\u{e2}\u{84}\u{93}", "\u{2113}");
    if !looks_like_pdf_math_notation(&normalized) {
        return strip_pdf_control_glyphs(&normalized);
    }
    let normalized = repair_combining_math_operator_sequences(&normalized);
    let symbols = replace_math_symbols(&normalized);
    strip_pdf_control_glyphs(&repair_math_subscript_spacing(&symbols))
}
/// Replaces an equals sign combined with U+0338 (combining long solidus overlay) by `≠`. The
/// three spellings handled, in this order, are overlay-space-equals, overlay-equals and
/// equals-overlay.
fn repair_combining_math_operator_sequences(text: &str) -> String {
    const NEGATED_EQUALS: [&str; 3] = ["\u{338} =", "\u{338}=", "=\u{338}"];
    let mut repaired = text.to_owned();
    for sequence in NEGATED_EQUALS {
        if repaired.contains(sequence) {
            repaired = repaired.replace(sequence, "≠");
        }
    }
    repaired
}
/// Heuristic for "this text contains math": true when it contains any of a fixed set of math
/// glyphs, an ellipsis in a math-like context, the substring "Fq", or the substring " 6 =".
fn looks_like_pdf_math_notation(text: &str) -> bool {
    const MATH_GLYPHS: &[char] = &[
        'ℓ', 'λ', 'θ', 'ρ', 'τ', '∆', 'Δ', '≤', '≥', '∈', '∪', '∑', '∅', '·', '−', '±', '⊆', '∼',
        '≠', '→',
    ];
    if text.contains(MATH_GLYPHS) || has_math_ellipsis_context(text) {
        return true;
    }
    ["Fq", " 6 ="].iter().any(|marker| text.contains(*marker))
}
/// True when the text contains "..." and either, with all whitespace removed, one of
/// `,...,` / `),...` / `...,(` occurs, or the text contains one of a fixed set of
/// operator/Greek characters.
fn has_math_ellipsis_context(text: &str) -> bool {
    if !text.contains("...") {
        return false;
    }
    let compact: String = text
        .chars()
        .filter(|character| !character.is_whitespace())
        .collect();
    let list_ellipsis = [",...,", "),...", "...,("]
        .iter()
        .any(|needle| compact.contains(*needle));
    list_ellipsis
        || text.contains(
            &[
                '=', '+', '_', '^', '\\', '∈', '≤', '≥', '≠', 'λ', 'θ', 'ρ', 'τ',
            ][..],
        )
}
/// Replaces math glyphs and a few multi-character sequences with LaTeX commands.
///
/// Sequence replacements run first, in this order: spaced middle dots -> `\cdots`, `...` ->
/// `\ldots`, a detached `6 =` -> `\neq`, `Fq` -> `\mathbb{F}_q`. Then single characters are mapped
/// one by one; unmapped characters are copied unchanged.
///
/// `6 =` is only rewritten when the `6` is at the start of the text or follows whitespace.
/// Rewriting it anywhere would turn "16 = 20" into "1\neq 20" and "x_6 = 3" into "x_\neq 3".
fn replace_math_symbols(text: &str) -> String {
    let collapsed = text
        .replace("· · ·", r"\cdots")
        .replace("...", r"\ldots");
    let collapsed = replace_detached_six_equals(&collapsed).replace("Fq", r"\mathbb{F}_q");
    let mut output = String::with_capacity(collapsed.len());
    for character in collapsed.chars() {
        match character {
            '\u{3}' => output.push_str(r"\Lambda"),
            'Γ' => output.push_str(r"\Gamma"),
            'Θ' => output.push_str(r"\Theta"),
            'ℓ' => output.push_str(r"\ell"),
            'λ' => output.push_str(r"\lambda"),
            'Λ' => output.push_str(r"\Lambda"),
            'Π' => output.push_str(r"\Pi"),
            'Σ' => output.push_str(r"\Sigma"),
            'Φ' => output.push_str(r"\Phi"),
            'Ω' => output.push_str(r"\Omega"),
            'θ' => output.push_str(r"\theta"),
            'ρ' => output.push_str(r"\rho"),
            'τ' => output.push_str(r"\tau"),
            '∆' | 'Δ' => output.push_str(r"\Delta"),
            '≤' => output.push_str(r"\leq"),
            '≥' => output.push_str(r"\geq"),
            '∈' => output.push_str(r"\in"),
            '∪' => output.push_str(r"\cup"),
            '∑' => output.push_str(r"\sum"),
            '∅' => output.push_str(r"\varnothing"),
            '−' => output.push('-'),
            '±' => output.push_str(r"\pm"),
            '⊆' => output.push_str(r"\subseteq"),
            '∼' => output.push_str(r"\sim"),
            '≠' => output.push_str(r"\neq"),
            '×' => output.push_str(r"\times"),
            '→' => output.push_str(r"\to"),
            '·' => output.push_str(r"\cdot"),
            _ => output.push(character),
        }
    }
    output
}

/// Replaces `6 =` with `\neq` wherever the `6` starts the text or follows whitespace, and leaves
/// every other occurrence (for example inside "16 =") untouched.
fn replace_detached_six_equals(text: &str) -> String {
    const SEQUENCE: &str = "6 =";
    let mut output = String::with_capacity(text.len());
    let mut copied_to = 0;
    for (start, _) in text.match_indices(SEQUENCE) {
        let detached = text[..start]
            .chars()
            .next_back()
            .map_or(true, char::is_whitespace);
        if detached {
            output.push_str(&text[copied_to..start]);
            output.push_str(r"\neq");
            copied_to = start + SEQUENCE.len();
        }
    }
    output.push_str(&text[copied_to..]);
    output
}
/// Turns every non-printing control character into a space, then collapses all whitespace runs
/// to single spaces and trims both ends.
fn strip_pdf_control_glyphs(text: &str) -> String {
    let spaced: String = text
        .chars()
        .map(|character| {
            if is_nonprinting_pdf_control(character) {
                ' '
            } else {
                character
            }
        })
        .collect();
    // Whitespace normalisation makes it irrelevant how many spaces a control run produced.
    spaced.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// True for control characters other than line feed, carriage return and tab.
fn is_nonprinting_pdf_control(character: char) -> bool {
    match character {
        '\n' | '\r' | '\t' => false,
        other => other.is_control(),
    }
}
/// Re-attaches subscripts that extraction separated from their base by a space.
///
/// Tokens are split on whitespace. When a token is a known math base and the following token
/// either starts with `_` or can be split into a subscript by `split_math_subscript_token`, the
/// two are fused into one token. Every other token goes through `repair_compact_math_subscript`.
/// The result is joined with single spaces.
fn repair_math_subscript_spacing(text: &str) -> String {
    let mut tokens = text.split_whitespace().peekable();
    let mut repaired: Vec<String> = Vec::new();
    while let Some(token) = tokens.next() {
        let fused = if is_math_base_token(token) {
            tokens.peek().and_then(|&following| {
                if following.starts_with('_') {
                    Some(format!("{token}{following}"))
                } else {
                    split_math_subscript_token(following).map(|(subscript, suffix)| {
                        format!("{token}{}{suffix}", format_math_subscript(subscript))
                    })
                }
            })
        } else {
            None
        };
        match fused {
            Some(joined) => {
                repaired.push(joined);
                // The following token was consumed by the fusion.
                tokens.next();
            }
            None => repaired.push(repair_compact_math_subscript(token)),
        }
    }
    repaired.join(" ")
}
/// Inserts a `_` into a token that is a known base immediately followed by a subscript
/// (for example `x1` -> `x_1`).
///
/// Purely alphabetic tokens of three or more characters are returned unchanged. Otherwise the
/// bases are tried in a fixed order (single letters first, then `\lambda`, `\theta`, `\rho`);
/// the first base whose remainder is non-empty, does not already start with `_`, and splits into
/// a subscript wins. If none matches the token is returned unchanged.
///
/// NOTE(review): two-letter words are not exempt; "in" matches base `i` with subscript `n` and
/// becomes "i_n". Confirm this is intended for prose inside math-like lines.
fn repair_compact_math_subscript(token: &str) -> String {
    const BASES: [&str; 15] = [
        "m", "n", "N", "T", "V", "C", "x", "t", "i", "k", "h", "g", r"\lambda", r"\theta", r"\rho",
    ];
    let plain_word = token.chars().count() > 2 && token.chars().all(char::is_alphabetic);
    if plain_word {
        return token.to_owned();
    }
    BASES
        .iter()
        .filter_map(|base| token.strip_prefix(*base).map(|rest| (*base, rest)))
        .filter(|(_, rest)| !rest.is_empty() && !rest.starts_with('_'))
        .find_map(|(base, rest)| {
            split_math_subscript_token(rest).map(|(subscript, suffix)| {
                format!("{base}{}{suffix}", format_math_subscript(subscript))
            })
        })
        .unwrap_or_else(|| token.to_owned())
}
/// True when the token is exactly one of the recognised subscript bases: the letters
/// `m n N T V C x t i k h g` or the commands `\lambda`, `\theta`, `\rho`.
fn is_math_base_token(token: &str) -> bool {
    const BASES: [&str; 15] = [
        "m", "n", "N", "T", "V", "C", "x", "t", "i", "k", "h", "g", r"\lambda", r"\theta", r"\rho",
    ];
    BASES.iter().any(|base| *base == token)
}
/// Splits a token into `(subscript, remainder)` when it starts with something usable as a
/// subscript.
///
/// Tried in order: one of the commands `\ell`, `\lambda`, `\theta`, `\rho`; one of the words
/// `init`, `cl`; a run of leading ASCII digits; a single leading letter out of `i j k l n r s`.
/// Returns `None` when nothing matches (including for an empty token).
fn split_math_subscript_token(token: &str) -> Option<(&str, &str)> {
    const LEADING_WORDS: [&str; 6] = [r"\ell", r"\lambda", r"\theta", r"\rho", "init", "cl"];
    for word in LEADING_WORDS {
        if let Some(suffix) = token.strip_prefix(word) {
            return Some((word, suffix));
        }
    }
    let digits_end = token
        .find(|character: char| !character.is_ascii_digit())
        .unwrap_or(token.len());
    if digits_end > 0 {
        return Some(token.split_at(digits_end));
    }
    let first = token.chars().next()?;
    matches!(first, 'i' | 'j' | 'k' | 'l' | 'n' | 'r' | 's')
        .then(|| token.split_at(first.len_utf8()))
}
/// Formats a subscript: `init` becomes `_{\text{init}}`, anything else is prefixed with `_`.
fn format_math_subscript(subscript: &str) -> String {
    if subscript == "init" {
        return String::from(r"_{\text{init}}");
    }
    let mut formatted = String::with_capacity(subscript.len() + 1);
    formatted.push('_');
    formatted.push_str(subscript);
    formatted
}
/// True when the token is a single ASCII letter, optionally followed by exactly one `-`.
fn is_letter_fragment(token: &str) -> bool {
    let mut characters = token.chars();
    match (characters.next(), characters.next(), characters.next()) {
        (Some(letter), None, _) | (Some(letter), Some('-'), None) => letter.is_ascii_alphabetic(),
        _ => false,
    }
}
/// True when the token contains at least one alphabetic character.
fn is_word_piece(token: &str) -> bool {
    token.contains(char::is_alphabetic)
}
/// True when the token is exactly a straight or a curly apostrophe.
fn is_joining_apostrophe(token: &str) -> bool {
    token == "'" || token == "’"
}
/// True when the token is exactly one of the three hyphen characters accepted as a word joiner.
fn is_joining_hyphen(token: &str) -> bool {
    ["-", "‐", "‑"].iter().any(|hyphen| *hyphen == token)
}
/// Runs the table detectors in priority order and returns the first hit: ruled grid, exact run
/// table, columnar numeric table, implied alignment table. Later detectors are not invoked once
/// one has succeeded.
fn detect_table(
    page_number: usize,
    lines: &[TextLine],
    edges: &[GraphicEdge],
) -> Option<DetectedTable> {
    if let Some(table) = detect_ruled_grid_table(page_number, lines, edges) {
        return Some(table);
    }
    if let Some(table) = detect_exact_run_table(page_number, lines) {
        return Some(table);
    }
    if let Some(table) = detect_columnar_numeric_table(page_number, lines) {
        return Some(table);
    }
    detect_implied_alignment_table(page_number, lines)
}
/// Detects a table made of right-aligned value columns.
///
/// Value cells from rows with at least two values vote for column positions via their right
/// edges. Returns `None` when fewer than four such rows exist, fewer than two columns survive,
/// no line aligns with a column, or fewer than four lines in the selected row span align with a
/// column. Otherwise the table is assembled by `build_columnar_table`.
fn detect_columnar_numeric_table(page_number: usize, lines: &[TextLine]) -> Option<DetectedTable> {
// Cells per line, with currency symbols merged into the value they prefix.
let line_cells: Vec<Vec<TextRun>> = lines
.iter()
.map(|line| coalesce_currency_prefixes(implied_table_cells(line)))
.collect();
let mut right_edges: Vec<f32> = Vec::new();
let mut data_rows = 0usize;
// Collect the right edge of every value cell on lines that are not prose and hold >= 2 values.
for cells in &line_cells {
if cells_contain_prose(cells) {
continue;
}
let values = cells.iter().filter(|cell| is_value_cell(&cell.text)).count();
if values >= 2 {
data_rows += 1;
for cell in cells.iter().filter(|cell| is_value_cell(&cell.text)) {
right_edges.push(cell.bbox.x + cell.bbox.width);
}
}
}
if data_rows < 4 {
return None;
}
// A column needs support from at least 35% of the data rows, and never fewer than 3.
let min_support = ((data_rows as f32) * 0.35).ceil().max(3.0) as usize;
let all_clusters = cluster_column_right_edges_with_support(&right_edges, 8.0);
let mut columns: Vec<f32> = all_clusters
.iter()
.filter(|(_, support)| *support >= min_support)
.map(|(position, _)| *position)
.collect();
// Weaker clusters may still be added when they repeat at the pitch of the kept columns.
columns.extend(rescue_periodic_subcolumns(
&all_clusters,
&columns,
min_support,
data_rows,
));
columns.sort_by(f32::total_cmp);
if columns.len() < 2 {
return None;
}
// Left boundary of the first value column, derived from the cell width at that column and
// half the distance to the second column.
let cell_width = column_cell_width(&line_cells, columns[0]);
let half_gap = columns
.get(1)
.map_or(cell_width * 2.5, |next| (next - columns[0]) / 2.0);
let first_column_left = columns[0] - (cell_width * 2.5).min(half_gap.max(cell_width * 1.5));
let table_right = columns.last().copied().unwrap_or_default();
// Indices of lines with at least one value cell whose right edge matches a column.
let aligned: Vec<usize> = (0..lines.len())
.filter(|&index| {
line_cells[index]
.iter()
.filter(|cell| is_value_cell(&cell.text))
.any(|cell| nearest_column(cell.bbox.x + cell.bbox.width, &columns).is_some())
})
.collect();
let (first, last) = (*aligned.first()?, *aligned.last()?);
let mut row_indices: Vec<usize> = Vec::new();
let mut previous_y: Option<f32> = None;
// Walk from the first to the last aligned line, stopping at the first line that does not
// belong to the table or that is too far (vertically) from the previous accepted row.
for index in first..=last {
let line = &lines[index];
let cells = &line_cells[index];
let aligned_here = cells
.iter()
.filter(|cell| is_value_cell(&cell.text))
.any(|cell| nearest_column(cell.bbox.x + cell.bbox.width, &columns).is_some());
let numeric_here = cells.iter().any(|cell| is_numeric_value(&cell.text));
// A line without numeric cells is accepted as a label row if it starts at or left of the
// last column position.
let label_only = !numeric_here && line.bbox.x <= table_right;
if !aligned_here && !label_only {
break;
}
if let Some(prev) = previous_y {
if (prev - line.bbox.y).abs() > average_run_size(line).max(line.bbox.height) * 3.5 {
break;
}
}
row_indices.push(index);
previous_y = Some(line.bbox.y);
}
// The accepted span must itself contain at least four column-aligned rows.
let aligned_in_span = row_indices
.iter()
.filter(|&&index| {
line_cells[index]
.iter()
.filter(|cell| is_value_cell(&cell.text))
.any(|cell| nearest_column(cell.bbox.x + cell.bbox.width, &columns).is_some())
})
.count();
if aligned_in_span < 4 {
return None;
}
build_columnar_table(page_number, lines, &line_cells, &columns, first_column_left, &row_indices)
}
/// Merges stand-alone currency symbols into the cell that follows them.
///
/// A cell consisting of a single currency symbol is held back and prepended (text and bounding
/// box) to the next cell. A cell that ends with a currency symbol after other content has that
/// symbol split off and carried over to the next cell in the same way. A symbol still pending
/// after the last cell is emitted as its own cell. Cell texts are trimmed.
fn coalesce_currency_prefixes(cells: Vec<TextRun>) -> Vec<TextRun> {
const SYMBOLS: [char; 4] = ['$', '€', '£', '¥'];
let mut out: Vec<TextRun> = Vec::with_capacity(cells.len());
// Currency symbol waiting to be attached to the next cell.
let mut pending: Option<TextRun> = None;
for mut cell in cells {
let mut text = cell.text.trim().to_string();
if let Some(prefix) = pending.take() {
cell.bbox = union_boxes([prefix.bbox, cell.bbox]).unwrap_or(cell.bbox);
text = format!("{}{}", prefix.text.trim(), text);
}
// A lone symbol is not emitted; it becomes the prefix of whatever comes next.
if text.chars().count() == 1 && text.chars().all(|c| SYMBOLS.contains(&c)) {
cell.text = text;
pending = Some(cell);
continue;
}
if let Some(last) = text.chars().last() {
if SYMBOLS.contains(&last) {
// Byte-safe split: `last` is the final character, so removing its UTF-8 length
// lands on a character boundary.
let stripped = text[..text.len() - last.len_utf8()].trim_end();
if !stripped.is_empty() {
// The carried symbol keeps this cell's geometry and other fields.
let mut carry = cell.clone();
carry.text = last.to_string();
text = stripped.to_string();
pending = Some(carry);
}
}
}
cell.text = text;
out.push(cell);
}
if let Some(prefix) = pending {
out.push(prefix);
}
out
}
/// True when the trimmed text contains at least one ASCII digit and otherwise only characters
/// that commonly decorate numbers: currency symbols, parentheses, `,`, `.`, `%`, `-`, `+`,
/// spaces, and em/en dashes.
///
/// The accepted currency symbols are `$`, `€`, `£` and `¥`, the same set that
/// `coalesce_currency_prefixes` glues onto amounts. Accepting only `$` would make a merged cell
/// such as "€100" stop counting as a value.
fn is_numeric_value(text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return false;
    }
    let mut digits = 0usize;
    for character in trimmed.chars() {
        match character {
            '0'..='9' => digits += 1,
            // Currency symbols: dollar, euro, pound, yen.
            '$' | '\u{20ac}' | '\u{a3}' | '\u{a5}' => {}
            '(' | ')' | ',' | '.' | '%' | '-' | '+' | ' ' | '\u{2014}' | '\u{2013}' => {}
            _ => return false,
        }
    }
    digits >= 1
}
/// True when the cell text is a numeric value or, once trimmed, a lone em dash or en dash.
fn is_value_cell(text: &str) -> bool {
    let is_dash_placeholder = matches!(text.trim(), "—" | "–");
    is_dash_placeholder || is_numeric_value(text)
}
/// True when the row looks like running text: it has fewer than two value cells and some cell
/// holds more than twelve whitespace-separated words that contain an alphabetic character.
fn cells_contain_prose(cells: &[TextRun]) -> bool {
    let value_cells = cells
        .iter()
        .filter(|cell| is_value_cell(&cell.text))
        .count();
    if value_cells >= 2 {
        return false;
    }
    for cell in cells {
        let wordy_tokens = cell
            .text
            .split_whitespace()
            .filter(|word| word.contains(char::is_alphabetic))
            .count();
        if wordy_tokens > 12 {
            return true;
        }
    }
    false
}
/// Groups sorted values into clusters, starting a new cluster whenever two neighbouring values
/// are more than `tol` apart. Each cluster is reported as `(middle element, member count)`, where
/// the middle element is the one at index `len / 2`. Clusters come out in ascending order.
fn cluster_column_right_edges_with_support(values: &[f32], tol: f32) -> Vec<(f32, usize)> {
    let mut sorted = values.to_vec();
    sorted.sort_by(f32::total_cmp);
    let mut clusters: Vec<(f32, usize)> = Vec::new();
    let mut remaining: &[f32] = &sorted;
    while !remaining.is_empty() {
        // Length of the leading run whose neighbouring values stay within `tol`.
        let run_len = remaining
            .windows(2)
            .position(|pair| pair[1] - pair[0] > tol)
            .map_or(remaining.len(), |gap_index| gap_index + 1);
        let (cluster, rest) = remaining.split_at(run_len);
        clusters.push((cluster[cluster.len() / 2], cluster.len()));
        remaining = rest;
    }
    clusters
}
/// Recovers weakly supported column edges that repeat at the pitch of the kept columns.
///
/// The pitch is the median gap between consecutive `kept` positions. Clusters
/// with support in `[floor, min_support)`, where `floor` is
/// `max(3, ceil(0.15 * data_rows))`, and lying within one pitch of the kept
/// range are candidates. Candidates whose offset modulo the pitch agrees
/// within 8.0 units form a class. A class is rescued when it has at least two
/// members that fall into at least two different pitch periods.
///
/// Returns an empty vector when fewer than two columns were kept, when the
/// support floor is not below `min_support`, or when the pitch is not positive.
fn rescue_periodic_subcolumns(
    all_clusters: &[(f32, usize)],
    kept: &[f32],
    min_support: usize,
    data_rows: usize,
) -> Vec<f32> {
    if kept.len() < 2 {
        return Vec::new();
    }
    let support_floor = ((data_rows as f32) * 0.15).ceil().max(3.0) as usize;
    if support_floor >= min_support {
        return Vec::new();
    }
    let mut gaps: Vec<f32> = kept.windows(2).map(|pair| pair[1] - pair[0]).collect();
    gaps.sort_by(f32::total_cmp);
    let pitch = gaps[gaps.len() / 2];
    if pitch <= 0.0 {
        return Vec::new();
    }
    let origin = kept[0];
    let lower_bound = origin - pitch;
    let upper_bound = kept[kept.len() - 1] + pitch;
    let mut candidates: Vec<f32> = Vec::new();
    for &(position, support) in all_clusters {
        let weakly_supported = (support_floor..min_support).contains(&support);
        if weakly_supported && position >= lower_bound && position <= upper_bound {
            candidates.push(position);
        }
    }
    // Offset within a pitch period (always non-negative) and the period index.
    let phase = |position: f32| ((position - origin) % pitch + pitch) % pitch;
    let period_of = |position: f32| ((position - origin) / pitch).round() as i32;
    let mut taken = vec![false; candidates.len()];
    let mut rescued = Vec::new();
    for (index, &seed) in candidates.iter().enumerate() {
        if taken[index] {
            continue;
        }
        let seed_phase = phase(seed);
        let members: Vec<usize> = std::iter::once(index)
            .chain((index + 1..candidates.len()).filter(|&other| {
                if taken[other] {
                    return false;
                }
                // Compare phases circularly so offsets near 0 and near `pitch` match.
                let delta = (seed_phase - phase(candidates[other])).abs();
                delta.min(pitch - delta) <= 8.0
            }))
            .collect();
        let periods: std::collections::HashSet<i32> = members
            .iter()
            .map(|&member| period_of(candidates[member]))
            .collect();
        if members.len() >= 2 && periods.len() >= 2 {
            for member in members {
                taken[member] = true;
                rescued.push(candidates[member]);
            }
        }
    }
    rescued
}
/// Finds the column edge closest to `right_edge`, ignoring edges more than
/// 14.0 units away. On a tie the earliest column wins.
fn nearest_column(right_edge: f32, columns: &[f32]) -> Option<usize> {
    let mut best: Option<(usize, f32)> = None;
    for (index, edge) in columns.iter().enumerate() {
        let distance = (right_edge - edge).abs();
        if distance <= 14.0 {
            let closer = best.map_or(true, |(_, closest)| distance.total_cmp(&closest).is_lt());
            if closer {
                best = Some((index, distance));
            }
        }
    }
    best.map(|(index, _)| index)
}
/// Estimates the width of cells in the column whose right edge is `first_column`.
///
/// Takes the widths of numeric cells whose right edge lies within 14.0 units
/// of `first_column` and returns the middle element of the sorted widths,
/// floored at 20.0. Falls back to 40.0 when no cell qualifies.
fn column_cell_width(line_cells: &[Vec<TextRun>], first_column: f32) -> f32 {
    let mut widths: Vec<f32> = Vec::new();
    for cell in line_cells.iter().flatten() {
        let right_edge = cell.bbox.x + cell.bbox.width;
        if is_numeric_value(&cell.text) && (right_edge - first_column).abs() <= 14.0 {
            widths.push(cell.bbox.width);
        }
    }
    if widths.is_empty() {
        return 40.0;
    }
    widths.sort_by(f32::total_cmp);
    widths[widths.len() / 2].max(20.0)
}
/// Collects the lines that continue the label of row `row_index`.
///
/// Starting from the row, the search repeatedly takes the unused line with the
/// smallest `bbox.y` that is still greater than the current one and keeps it
/// while it looks like a wrapped label fragment. The walk stops at the first
/// line that fails any check. The returned indices are in reverse order of
/// discovery, so the line found last comes first.
fn wrapped_label_above(
    lines: &[TextLine],
    line_cells: &[Vec<TextRun>],
    row_index: usize,
    first_column_left: f32,
    used: &[usize],
) -> Vec<usize> {
    let row = &lines[row_index];
    let label_x = row.bbox.x;
    let line_height = average_run_size(row).max(row.bbox.height);
    let mut chain: Vec<usize> = Vec::new();
    let mut cursor_y = row.bbox.y;
    loop {
        let next = lines
            .iter()
            .enumerate()
            .filter(|(index, line)| {
                *index != row_index
                    && !used.contains(index)
                    && !chain.contains(index)
                    && line.bbox.y > cursor_y
            })
            .min_by(|left, right| left.1.bbox.y.total_cmp(&right.1.bbox.y));
        let Some((above, line)) = next else { break };
        let text = text_line_plain_text(line);
        // Too far from the previous line, or not aligned with the label.
        if line.bbox.y - cursor_y > line_height * 1.8 {
            break;
        }
        if (line.bbox.x - label_x).abs() > 16.0 {
            break;
        }
        // A wrapped fragment is either long or reaches the value columns.
        let reaches_columns = line.bbox.x + line.bbox.width >= first_column_left - 12.0;
        if text.chars().count() < 28 && !reaches_columns {
            break;
        }
        let mut letters = text.chars().filter(|c| c.is_alphabetic()).peekable();
        let all_caps_heading = letters.peek().is_some() && letters.all(char::is_uppercase);
        if all_caps_heading || text.trim().is_empty() || text.trim_end().ends_with(':') {
            break;
        }
        // Lines carrying their own numeric cells are not label fragments.
        if line_cells[above]
            .iter()
            .any(|cell| is_numeric_value(&cell.text))
        {
            break;
        }
        chain.push(above);
        cursor_y = line.bbox.y;
    }
    chain.reverse();
    chain
}
/// Returns `true` when every non-empty cell after the first is a four-digit
/// year between 1900 and 2100, and at least one such cell exists.
///
/// The first cell (the label column) is ignored. An empty row yields `false`
/// rather than panicking on the `row[1..]` slice.
fn is_period_header_row(row: &[String]) -> bool {
    let mut values = row
        .iter()
        .skip(1)
        .map(|cell| cell.trim())
        .filter(|cell| !cell.is_empty())
        .peekable();
    values.peek().is_some()
        && values.all(|cell| {
            cell.len() == 4
                && cell.bytes().all(|byte| byte.is_ascii_digit())
                && cell
                    .parse::<i32>()
                    .is_ok_and(|year| (1900..=2100).contains(&year))
        })
}
/// Builds a `DetectedTable` from lines whose cells are assigned to columns.
///
/// `columns` holds one position per value column; column 0 of the output is an
/// extra leading column, so the table has `columns.len() + 1` columns.
/// `row_indices` are the indices into `lines` / `line_cells` that make up the
/// table span. Returns `None` when `row_indices` is empty, when no rows
/// survive, or when the result fails the size checks near the end.
fn build_columnar_table(
page_number: usize,
lines: &[TextLine],
line_cells: &[Vec<TextRun>],
columns: &[f32],
first_column_left: f32,
row_indices: &[usize],
) -> Option<DetectedTable> {
// `assign_row` turns the cells of one line into one string per output column.
let column_count = columns.len() + 1; let assign_row = |index: usize| -> Vec<String> {
let mut row = vec![String::new(); column_count];
for cell in &line_cells[index] {
let column = assign_cell_column(cell, columns, first_column_left);
push_table_cell_text(&mut row[column], &cell.text);
}
row
};
let span_top_y = lines[*row_indices.first()?].bbox.y;
// Header candidates outside the span: lines with a larger y than the first
// row, within five line heights of it, reaching the value columns, not a
// "table " caption, not a data row, not prose, and with text in at least one
// column other than column 0.
let mut header_indices: Vec<usize> = (0..lines.len())
.filter(|&index| {
let line = &lines[index];
!row_indices.contains(&index)
&& line.bbox.y > span_top_y
&& line.bbox.y - span_top_y
<= average_run_size(line).max(line.bbox.height) * 5.0
&& line.bbox.x + line.bbox.width >= first_column_left - 24.0
&& !text_line_plain_text(line).to_ascii_lowercase().starts_with("table ")
&& !line_is_data_row(line, column_count)
&& !cells_contain_prose(&line_cells[index])
&& assign_row(index)[1..].iter().any(|cell| !cell.trim().is_empty())
})
.collect();
// Leading span rows with an empty column 0, or made only of years, are
// headers too; `data_start` ends up at the first row that is neither.
let mut data_start = 0usize;
for (position, &index) in row_indices.iter().enumerate() {
let row = assign_row(index);
if row[0].trim().is_empty() || is_period_header_row(&row) {
header_indices.push(index);
data_start = position + 1;
} else {
data_start = position;
break;
}
}
// Merge header lines in descending-y order into one header string per column.
header_indices.sort_by(|left, right| lines[*right].bbox.y.total_cmp(&lines[*left].bbox.y));
let mut header_cells: Vec<String> = vec![String::new(); column_count];
for &index in &header_indices {
for (column, text) in assign_row(index).into_iter().enumerate() {
push_table_cell_text(&mut header_cells[column], &text);
}
}
let header_has_text = header_cells.iter().any(|cell| !cell.is_empty());
let mut rows: Vec<Vec<String>> = Vec::new();
let mut cell_records: Vec<TableCell> = Vec::new();
if header_has_text {
for (column, text) in header_cells.iter().enumerate() {
cell_records.push(table_cell(0, column, text.clone(), true));
}
}
// First pass over data rows: for rows with a numeric cell and a column-0
// text of at most 11 characters, look for wrapped label lines and remember
// their joined text as a prefix. The lines used this way are `consumed`.
let mut consumed: Vec<usize> = Vec::new();
let mut prefixes: Vec<(usize, String)> = Vec::new();
for &index in &row_indices[data_start..] {
if !line_cells[index].iter().any(|cell| is_numeric_value(&cell.text)) {
continue;
}
if assign_row(index)[0].trim().chars().count() > 11 {
continue;
}
let mut search_used = header_indices.clone();
search_used.extend_from_slice(&consumed);
let chain = wrapped_label_above(lines, line_cells, index, first_column_left, &search_used);
if !chain.is_empty() {
let prefix = chain
.iter()
.map(|&line| text_line_plain_text(&lines[line]))
.collect::<Vec<_>>()
.join(" ");
prefixes.push((index, prefix));
consumed.extend(chain);
}
}
// Second pass: emit table rows, skipping consumed label lines and prose.
let mut prose_skipped: Vec<usize> = Vec::new();
for &index in &row_indices[data_start..] {
if consumed.contains(&index) {
continue;
}
if cells_contain_prose(&line_cells[index]) {
prose_skipped.push(index);
continue;
}
let mut row = assign_row(index);
if let Some((_, prefix)) = prefixes.iter().find(|(line, _)| *line == index) {
row[0] = if row[0].trim().is_empty() {
prefix.clone()
} else {
format!("{prefix} {}", row[0])
};
}
if row.iter().all(|cell| cell.is_empty()) {
continue;
}
// Body rows are numbered after the header row when one was emitted.
let table_row = rows.len() + usize::from(header_has_text);
for (column, text) in row.iter().enumerate() {
cell_records.push(table_cell(table_row, column, text.clone(), false));
}
rows.push(row);
}
if rows.is_empty() {
return None;
}
// Acceptance checks: at least six rows with a value, and either a
// multi-section layout (two or more label-only rows and eight or more
// labelled rows) or a wide table (five or more value columns and six or more
// labelled rows).
let value_rows = rows.iter().filter(|row| !row[0].trim().is_empty()).count();
let label_only_rows = rows
.iter()
.filter(|row| !row[0].trim().is_empty() && row[1..].iter().all(|cell| cell.trim().is_empty()))
.count();
let data_with_figures = rows
.iter()
.filter(|row| row[1..].iter().any(|cell| !cell.trim().is_empty()))
.count();
let multi_section = label_only_rows >= 2 && value_rows >= 8;
let wide_table = columns.len() >= 5 && value_rows >= 6;
if data_with_figures < 6 || !(multi_section || wide_table) {
return None;
}
// Lines claimed by the table: span rows, headers and consumed label lines,
// minus rows skipped as prose.
let mut line_index_set: Vec<usize> = row_indices.to_vec();
line_index_set.extend(header_indices.iter().copied());
line_index_set.extend(consumed.iter().copied());
line_index_set.retain(|index| !prose_skipped.contains(index));
line_index_set.sort_unstable();
line_index_set.dedup();
let bbox = union_boxes(line_index_set.iter().map(|&index| lines[index].bbox))?;
Some(DetectedTable {
table: TableBlock {
headers: if header_has_text {
header_cells
} else {
Vec::new()
},
rows,
caption: None,
bbox: Some(bbox),
cells: cell_records,
source_anchors: vec![anchor(page_number, Some(bbox), Vec::new())],
confidence: Some(Confidence {
score: 0.7,
calibrated: false,
}), ..Default::default()
},
line_indices: line_index_set,
})
}
/// Picks the output column for a cell.
///
/// Numeric cells snap to the nearest column by their right edge (shifted by one
/// for the leading column). Everything else, and numeric cells with no column
/// in range, falls back to `column_band`.
fn assign_cell_column(cell: &TextRun, columns: &[f32], first_column_left: f32) -> usize {
    let snapped = if is_numeric_value(&cell.text) {
        nearest_column(cell.bbox.x + cell.bbox.width, columns)
    } else {
        None
    };
    match snapped {
        Some(column) => column + 1,
        None => column_band(cell, columns, first_column_left),
    }
}
/// Assigns a cell to a column band by its horizontal centre.
///
/// Centres left of `first_column_left` map to column 0. Otherwise the cell
/// goes to the first column whose upper bound (the midpoint between that
/// column and the next; unbounded for the last) is at or beyond the centre.
fn column_band(cell: &TextRun, columns: &[f32], first_column_left: f32) -> usize {
    let center = cell.bbox.x + cell.bbox.width / 2.0;
    if center < first_column_left {
        return 0;
    }
    let band = (0..columns.len()).find(|&index| match columns.get(index + 1) {
        Some(next) => center <= (columns[index] + next) / 2.0,
        None => center <= f32::INFINITY,
    });
    band.map_or(columns.len(), |index| index + 1)
}
/// Appends trimmed `text` to `target`, separated by a single space when
/// `target` already has content. Blank text is ignored.
fn push_table_cell_text(target: &mut String, text: &str) {
    match (target.is_empty(), text.trim()) {
        (_, "") => {}
        (true, fragment) => target.push_str(fragment),
        (false, fragment) => {
            target.push(' ');
            target.push_str(fragment);
        }
    }
}
/// Creates a `TableCell` with no bounding box that spans one row and one column.
fn table_cell(row: usize, column: usize, text: String, is_header: bool) -> TableCell {
    let (col_span, row_span) = (1, 1);
    TableCell {
        row,
        column,
        text,
        bbox: None,
        is_header,
        col_span,
        row_span,
    }
}
fn sort_runs_reading_order(runs: &mut [TextRun]) {
runs.sort_by(|a, b| {
let line_a = (a.baseline_y / 3.0).round();
let line_b = (b.baseline_y / 3.0).round();
line_b
.total_cmp(&line_a)
.then(a.bbox.x.total_cmp(&b.bbox.x))
});
}
/// Decides whether a grid row reads as prose rather than tabular data.
///
/// A row is prose when any cell has 12 or more words, or when it has at least
/// five non-empty cells and 25 words in total while fewer than 30% of the
/// non-empty cell count are value cells.
fn row_is_prose(cells: &[String]) -> bool {
    let mut longest_cell = 0usize;
    let mut total_words = 0usize;
    for cell in cells {
        let words = cell.split_whitespace().count();
        longest_cell = longest_cell.max(words);
        total_words += words;
    }
    if longest_cell >= 12 {
        return true;
    }
    let nonempty = cells.iter().filter(|c| !c.trim().is_empty()).count();
    let numeric = cells.iter().filter(|c| is_value_cell(c)).count();
    nonempty >= 5 && total_words >= 25 && (numeric as f32) < nonempty as f32 * 0.3
}
/// Detects a table drawn with ruling lines and fills it with the page's text runs.
///
/// The grid comes from the vertical and horizontal `edges`; each text run is
/// placed into the grid cell containing the centre of its bounding box. The
/// first grid row becomes the header row. Returns `None` when the grid is too
/// small, lacks supporting evidence, or holds too little text.
fn detect_ruled_grid_table(
page_number: usize,
lines: &[TextLine],
edges: &[GraphicEdge],
) -> Option<DetectedTable> {
let verticals = grid_axis_values(edges, EdgeOrientation::Vertical);
let horizontals = grid_axis_values(edges, EdgeOrientation::Horizontal);
if verticals.len() < 2 || horizontals.len() < 2 {
return None;
}
// N axis values bound N - 1 bands; require at least a 2 x 2 grid.
let columns = verticals.len() - 1;
let rows = horizontals.len() - 1;
if columns < 2 || rows < 2 {
return None;
}
// Accept the grid only with a nearby "table" label or enough rows.
if !has_nearby_ruled_table_label(lines, &verticals, &horizontals)
&& !has_multirow_ruled_grid_evidence(columns, rows)
{
return None;
}
let mut grid_runs: Vec<Vec<Vec<TextRun>>> = vec![vec![Vec::new(); columns]; rows];
let mut cell_boxes = vec![vec![None; columns]; rows];
let mut line_indices = Vec::new();
// Drop each run into the cell containing its centre; runs outside the grid
// are ignored. A line is claimed by the table if any of its runs landed.
for (line_index, line) in lines.iter().enumerate() {
let mut used_line = false;
for run in &line.runs {
let center_x = run.bbox.x + run.bbox.width / 2.0;
let center_y = run.bbox.y + run.bbox.height / 2.0;
let Some(column) = grid_column_for(center_x, &verticals) else {
continue;
};
let Some(row) = grid_row_for(center_y, &horizontals) else {
continue;
};
grid_runs[row][column].push(run.clone());
// Grow the cell's content box to cover this run as well.
cell_boxes[row][column] = Some(
cell_boxes[row][column]
.and_then(|bbox| union_boxes([bbox, run.bbox]))
.unwrap_or(run.bbox),
);
used_line = true;
}
if used_line {
line_indices.push(line_index);
}
}
let mut grid = vec![vec![String::new(); columns]; rows];
let mut prose_rows = vec![false; rows];
for row in 0..rows {
let mut cell_texts = vec![String::new(); columns];
for column in 0..columns {
if grid_runs[row][column].is_empty() {
continue;
}
let mut runs = grid_runs[row][column].clone();
sort_runs_reading_order(&mut runs);
cell_texts[column] = clean_pdf_line_text(&join_runs_spaced(&runs));
}
// A prose row is collapsed: all of its runs are joined into column 0
// and the remaining columns stay empty.
if row_is_prose(&cell_texts) {
prose_rows[row] = true;
let mut all: Vec<TextRun> = grid_runs[row].iter().flatten().cloned().collect();
sort_runs_reading_order(&mut all);
grid[row][0] = clean_pdf_line_text(&join_runs_spaced(&all));
} else {
grid[row] = cell_texts;
}
}
// Fewer than three non-empty cells is not worth calling a table.
if grid
.iter()
.flatten()
.filter(|text| !text.trim().is_empty())
.count()
< 3
{
return None;
}
// Row 0 is the header; both the header and the body must contain text.
let headers = grid[0].clone();
let body_rows = grid.iter().skip(1).cloned().collect::<Vec<_>>();
if headers.iter().all(|text| text.trim().is_empty())
|| body_rows
.iter()
.flatten()
.all(|text| text.trim().is_empty())
{
return None;
}
let (mut col_span, mut covered) = merged_cell_col_spans(&cell_boxes, &verticals);
// Prose rows become one cell spanning every column.
for row in 0..rows {
if prose_rows[row] {
covered[row][0] = false;
col_span[row][0] = columns;
for column in 1..columns {
covered[row][column] = true;
}
}
}
// Emit one cell record per grid slot not covered by a spanning neighbour.
let mut cells = Vec::new();
for row in 0..rows {
for column in 0..columns {
if covered[row][column] {
continue;
}
cells.push(TableCell {
row,
column,
text: grid[row][column].clone(),
bbox: cell_boxes[row][column],
is_header: row == 0,
col_span: col_span[row][column],
row_span: 1,
});
}
}
// The table box spans from the first to the last axis value on each axis.
let bbox = BBox {
x: *verticals.first()?,
y: *horizontals.first()?,
width: *verticals.last()? - *verticals.first()?,
height: *horizontals.last()? - *horizontals.first()?,
};
Some(DetectedTable {
table: TableBlock {
headers,
rows: body_rows,
caption: None,
bbox: Some(bbox),
cells,
source_anchors: vec![anchor(page_number, Some(bbox), Vec::new())],
confidence: Some(Confidence {
score: 0.7,
calibrated: false,
}), ..Default::default()
},
line_indices,
})
}
/// Infers horizontal cell merges from how far each cell's content extends.
///
/// For every cell with content, the empty cells to its right are absorbed
/// while the content's right edge passes the next cell's left ruling by more
/// than `SPAN_MARGIN`. Returns `(col_span, covered)`: the span of each cell,
/// and a flag for cells swallowed by a neighbour to their left.
fn merged_cell_col_spans(
    cell_boxes: &[Vec<Option<BBox>>],
    verticals: &[f32],
) -> (Vec<Vec<usize>>, Vec<Vec<bool>>) {
    const SPAN_MARGIN: f32 = 2.0;
    let columns = cell_boxes.first().map_or(0, Vec::len);
    let mut col_span = Vec::with_capacity(cell_boxes.len());
    let mut covered = Vec::with_capacity(cell_boxes.len());
    for row_boxes in cell_boxes {
        let mut spans = vec![1usize; columns];
        let mut hidden = vec![false; columns];
        for column in 0..columns {
            if hidden[column] {
                continue;
            }
            let Some(bbox) = row_boxes[column] else {
                continue;
            };
            let content_right = bbox.x + bbox.width;
            let mut end = column + 1;
            while end < columns
                && row_boxes[end].is_none()
                && !hidden[end]
                && verticals
                    .get(end)
                    .is_some_and(|edge| content_right > edge + SPAN_MARGIN)
            {
                hidden[end] = true;
                end += 1;
            }
            spans[column] = end - column;
        }
        col_span.push(spans);
        covered.push(hidden);
    }
    (col_span, covered)
}
/// Looks for a line starting with "table" (case-insensitive) near the grid.
///
/// The line's `bbox.y` must lie between the last horizontal value and 96.0
/// units beyond it, and the line must overlap the grid's horizontal extent
/// with a 24.0 unit allowance on each side. Returns `false` when either axis
/// list is empty.
fn has_nearby_ruled_table_label(
    lines: &[TextLine],
    verticals: &[f32],
    horizontals: &[f32],
) -> bool {
    let (Some(&left), Some(&right), Some(&top)) =
        (verticals.first(), verticals.last(), horizontals.last())
    else {
        return false;
    };
    for line in lines {
        let text = text_line_plain_text(line).to_ascii_lowercase();
        let labelled = text.starts_with("table");
        let within_band = line.bbox.y >= top && line.bbox.y <= top + 96.0;
        let overlaps_grid =
            line.bbox.x <= right + 24.0 && line.bbox.x + line.bbox.width >= left - 24.0;
        if labelled && within_band && overlaps_grid {
            return true;
        }
    }
    false
}
/// A grid is accepted without a label when it has at least two columns and
/// at least four rows.
fn has_multirow_ruled_grid_evidence(columns: usize, rows: usize) -> bool {
    let enough_columns = columns >= 2;
    let enough_rows = rows >= 4;
    enough_columns && enough_rows
}
/// Selects which family of graphic edges `grid_axis_values` collects.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum EdgeOrientation {
// Edges accepted by `is_horizontal_edge`; they contribute y positions.
Horizontal,
// Edges accepted by `is_vertical_edge`; they contribute x positions.
Vertical,
}
/// Collects the sorted, de-duplicated axis positions of the edges matching
/// `orientation`: midpoint y for horizontal edges, midpoint x for vertical
/// ones. Positions within 2.0 units of each other are merged.
fn grid_axis_values(edges: &[GraphicEdge], orientation: EdgeOrientation) -> Vec<f32> {
    let mut values: Vec<f32> = match orientation {
        EdgeOrientation::Horizontal => edges
            .iter()
            .filter(|edge| is_horizontal_edge(edge))
            .map(|edge| (edge.y0 + edge.y1) / 2.0)
            .collect(),
        EdgeOrientation::Vertical => edges
            .iter()
            .filter(|edge| is_vertical_edge(edge))
            .map(|edge| (edge.x0 + edge.x1) / 2.0)
            .collect(),
    };
    values.sort_by(f32::total_cmp);
    dedup_axis_values(values, 2.0)
}
/// An edge is horizontal when its endpoints differ by at most 1.0 in y and by
/// at least 12.0 in x.
fn is_horizontal_edge(edge: &GraphicEdge) -> bool {
    let rise = (edge.y0 - edge.y1).abs();
    let run = (edge.x0 - edge.x1).abs();
    rise <= 1.0 && run >= 12.0
}
/// An edge is vertical when its endpoints differ by at most 1.0 in x and by
/// at least 12.0 in y.
fn is_vertical_edge(edge: &GraphicEdge) -> bool {
    let run = (edge.x0 - edge.x1).abs();
    let rise = (edge.y0 - edge.y1).abs();
    run <= 1.0 && rise >= 12.0
}
/// Merges neighbouring values that lie within `tolerance` of the running
/// representative, replacing the representative with the average of the two.
///
/// The input is processed in the order given, so callers are expected to pass
/// sorted values.
fn dedup_axis_values(values: Vec<f32>, tolerance: f32) -> Vec<f32> {
    values
        .into_iter()
        .fold(Vec::new(), |mut merged: Vec<f32>, value| {
            let close = merged
                .last()
                .is_some_and(|previous| (value - *previous).abs() <= tolerance);
            if close {
                let last = merged.len() - 1;
                merged[last] = (merged[last] + value) / 2.0;
            } else {
                merged.push(value);
            }
            merged
        })
}
/// Returns the index of the first pair of adjacent verticals that contains
/// `x`, allowing 1.0 unit of slack on both sides.
fn grid_column_for(x: f32, verticals: &[f32]) -> Option<usize> {
    for (index, pair) in verticals.windows(2).enumerate() {
        if (pair[0] - 1.0..=pair[1] + 1.0).contains(&x) {
            return Some(index);
        }
    }
    None
}
/// Maps `y` to a grid row index, numbering the bands in reverse so that the
/// band between the two largest horizontals is row 0.
///
/// The band is the first adjacent pair of horizontals containing `y` with 1.0
/// unit of slack; `None` when no band contains it.
fn grid_row_for(y: f32, horizontals: &[f32]) -> Option<usize> {
    let last_band = horizontals.len().saturating_sub(2);
    horizontals
        .windows(2)
        .enumerate()
        .find(|(_, pair)| y >= pair[0] - 1.0 && y <= pair[1] + 1.0)
        .map(|(band, _)| last_band.saturating_sub(band))
}
/// Detects a table formed by lines whose runs line up exactly, one run per column.
///
/// Every line with at least two runs is a candidate. All candidates must have
/// the same number of runs, with each run's left edge within the
/// `columns_align` tolerance of the first candidate's runs, and
/// `has_table_evidence` must accept them. The first candidate becomes the
/// header row and the rest become body rows.
///
/// Cell record text is trimmed, so `cells[i].text` matches the corresponding
/// `headers` / `rows` entry, as it does in the other table builders.
fn detect_exact_run_table(page_number: usize, lines: &[TextLine]) -> Option<DetectedTable> {
    let candidate_lines = lines
        .iter()
        .enumerate()
        .filter(|(_, line)| line.runs.len() >= 2)
        .collect::<Vec<_>>();
    if candidate_lines.len() < 2 {
        return None;
    }
    let width = candidate_lines[0].1.runs.len();
    if !candidate_lines.iter().all(|(_, line)| {
        line.runs.len() == width && columns_align(&candidate_lines[0].1.runs, &line.runs)
    }) {
        return None;
    }
    if !has_table_evidence(&candidate_lines) {
        return None;
    }
    let trimmed_row = |line: &TextLine| -> Vec<String> {
        line.runs
            .iter()
            .map(|run| run.text.trim().to_owned())
            .collect()
    };
    let headers = trimmed_row(candidate_lines[0].1);
    let rows = candidate_lines
        .iter()
        .skip(1)
        .map(|&(_, line)| trimmed_row(line))
        .collect::<Vec<_>>();
    let bbox = union_boxes(candidate_lines.iter().map(|(_, line)| line.bbox))?;
    let mut cells = Vec::new();
    for (row_index, (_, line)) in candidate_lines.iter().enumerate() {
        for (column_index, run) in line.runs.iter().enumerate() {
            cells.push(TableCell {
                row: row_index,
                column: column_index,
                // Trimmed so the record agrees with `headers` and `rows`.
                text: run.text.trim().to_owned(),
                bbox: Some(run.bbox),
                is_header: row_index == 0,
                col_span: 1,
                row_span: 1,
            });
        }
    }
    Some(DetectedTable {
        table: TableBlock {
            headers,
            rows,
            caption: None,
            bbox: Some(bbox),
            cells,
            source_anchors: vec![anchor(page_number, Some(bbox), Vec::new())],
            confidence: Some(Confidence {
                score: 0.72,
                calibrated: false,
            }),
            ..Default::default()
        },
        line_indices: candidate_lines
            .iter()
            .map(|(line_index, _)| *line_index)
            .collect(),
    })
}
fn detect_implied_alignment_table(page_number: usize, lines: &[TextLine]) -> Option<DetectedTable> {
let row_candidates = lines
.iter()
.enumerate()
.filter_map(|(line_index, line)| {
let cells = implied_table_cells(line);
(cells.len() >= 3 && row_has_numeric_table_evidence(&cells))
.then_some(TableRowCandidate { line_index, cells })
})
.collect::<Vec<_>>();
let group = best_aligned_table_row_group(&row_candidates)?;
if !has_nearby_table_label(lines, &group) && !has_strong_numeric_table_evidence(&group) {
return None;
}
build_implied_alignment_table(page_number, lines, &group)
}
/// Requires at least four rows of three or more columns (judged by the first
/// row), of which at least three quarters show numeric evidence.
fn has_strong_numeric_table_evidence(rows: &[TableRowCandidate]) -> bool {
    let Some(first) = rows.first() else {
        return false;
    };
    if rows.len() < 4 || first.cells.len() < 3 {
        return false;
    }
    let mut numeric_rows = 0usize;
    for row in rows {
        if row_has_numeric_table_evidence(&row.cells) {
            numeric_rows += 1;
        }
    }
    numeric_rows * 4 >= rows.len() * 3
}
/// Looks for a line starting with "table" (case-insensitive) near the first row.
///
/// The line's `bbox.y` must lie between the first row's largest cell y and
/// 96.0 units beyond it, and the line must overlap the first row's horizontal
/// extent with a 24.0 unit allowance on each side.
fn has_nearby_table_label(lines: &[TextLine], rows: &[TableRowCandidate]) -> bool {
    let Some(first_row) = rows.first() else {
        return false;
    };
    let cells = &first_row.cells;
    let first_y = cells
        .iter()
        .map(|cell| cell.bbox.y)
        .reduce(f32::max)
        .unwrap_or_default();
    let table_left = cells
        .iter()
        .map(|cell| cell.bbox.x)
        .reduce(f32::min)
        .unwrap_or_default();
    let table_right = cells
        .iter()
        .map(|cell| cell.bbox.x + cell.bbox.width)
        .reduce(f32::max)
        .unwrap_or_default();
    for line in lines {
        let text = text_line_plain_text(line).to_ascii_lowercase();
        let labelled = text.starts_with("table");
        let within_band = line.bbox.y >= first_y && line.bbox.y <= first_y + 96.0;
        let overlaps_table = line.bbox.x <= table_right + 24.0
            && line.bbox.x + line.bbox.width >= table_left - 24.0;
        if labelled && within_band && overlaps_table {
            return true;
        }
    }
    false
}
/// Splits a line's runs into table cells using horizontal gaps.
///
/// Runs are ordered by `bbox.x`. A new cell starts when the gap to the
/// previous run reaches the line's gap threshold, or when a run starts with
/// `$` and the previous run is not one of `$`, `(`, `($`. Each group of runs
/// is merged into one `TextRun`; groups that produce no text are dropped.
/// Lines with fewer than two runs are returned unchanged.
fn implied_table_cells(line: &TextLine) -> Vec<TextRun> {
    if line.runs.len() < 2 {
        return line.runs.clone();
    }
    let mut ordered = line.runs.clone();
    ordered.sort_by(|left, right| left.bbox.x.total_cmp(&right.bbox.x));
    let threshold = implied_cell_gap_threshold(line);
    let mut groups: Vec<Vec<TextRun>> = Vec::new();
    for run in ordered {
        let starts_new_cell = match groups.last().and_then(|group| group.last()) {
            None => true,
            Some(previous) => {
                let gap = run.bbox.x - (previous.bbox.x + previous.bbox.width);
                let starts_currency = run.text.trim_start().starts_with('$');
                let previous_attaches_currency =
                    matches!(previous.text.trim(), "$" | "(" | "($");
                gap >= threshold || (starts_currency && !previous_attaches_currency)
            }
        };
        if starts_new_cell {
            groups.push(vec![run]);
        } else if let Some(group) = groups.last_mut() {
            group.push(run);
        }
    }
    groups
        .iter()
        .filter_map(|runs| text_run_from_cell_runs(runs))
        .collect()
}
/// Gap that separates two cells on a line: 1.5 times the larger of the line's
/// average run size and its box height, clamped to `[10.0, 18.0]`.
fn implied_cell_gap_threshold(line: &TextLine) -> f32 {
    let run_size = average_run_size(line);
    let height = run_size.max(line.bbox.height);
    let scaled = height * 1.5;
    scaled.clamp(10.0, 18.0)
}
/// Merges the runs of one cell into a single `TextRun`.
///
/// The text is the cleaned, space-joined run text and the box is the union of
/// the run boxes. Baseline and size are averaged, `space_width` is the maximum
/// (never below 0.0), the font is the first one present, and bold/italic are
/// set only when every run has them. Returns `None` when the boxes cannot be
/// unioned or the text is empty.
fn text_run_from_cell_runs(runs: &[TextRun]) -> Option<TextRun> {
    let bbox = union_boxes(runs.iter().map(|run| run.bbox))?;
    let text = clean_pdf_line_text(&join_runs_spaced(runs));
    if text.is_empty() {
        return None;
    }
    let count = runs.len() as f32;
    let has_runs = !runs.is_empty();
    let baseline_y = runs.iter().map(|run| run.baseline_y).sum::<f32>() / count;
    let size = runs.iter().map(|run| run.size).sum::<f32>() / count;
    let space_width = runs.iter().map(|run| run.space_width).fold(0.0, f32::max);
    let font = runs.iter().find_map(|run| run.font.clone());
    Some(TextRun {
        text,
        bbox,
        baseline_y,
        font,
        size,
        space_width,
        bold: has_runs && runs.iter().all(|run| run.bold),
        italic: has_runs && runs.iter().all(|run| run.italic),
        source_object_ids: source_ids_for_runs(runs),
    })
}
/// Returns `true` when any cell after the first contains an ASCII digit.
fn row_has_numeric_table_evidence(cells: &[TextRun]) -> bool {
    for cell in cells.iter().skip(1) {
        if cell.text.bytes().any(|byte| byte.is_ascii_digit()) {
            return true;
        }
    }
    false
}
/// Finds the highest-scoring run of consecutive, mutually aligned row candidates.
///
/// Rows are scanned in order. A row extends the current group when it aligns
/// with the group's first row and sits at most 28.0 units from the previous
/// row. Otherwise the current group is offered to `record_table_row_group`
/// and a new group starts with this row. Returns `None` when no group is
/// recorded.
fn best_aligned_table_row_group(rows: &[TableRowCandidate]) -> Option<Vec<TableRowCandidate>> {
    let mut best: Option<Vec<TableRowCandidate>> = None;
    let mut current: Vec<TableRowCandidate> = Vec::new();
    for row in rows {
        let breaks_group = match (current.first(), current.last()) {
            (Some(first), Some(previous)) => {
                !(table_rows_align(first, row) && table_row_vertical_gap(previous, row) <= 28.0)
            }
            // An empty group accepts any row.
            _ => false,
        };
        if breaks_group {
            record_table_row_group(&mut best, &current);
            current.clear();
        }
        current.push(row.clone());
    }
    // The trailing group has not been scored yet.
    record_table_row_group(&mut best, &current);
    best
}
/// Replaces `best` with `candidate` when the candidate scores strictly higher.
///
/// The score is row count times the first row's cell count. Candidates with
/// fewer than two rows or fewer than three columns are ignored.
fn record_table_row_group(
    best: &mut Option<Vec<TableRowCandidate>>,
    candidate: &[TableRowCandidate],
) {
    let width = match candidate.first() {
        Some(row) if candidate.len() >= 2 => row.cells.len(),
        _ => return,
    };
    if width < 3 {
        return;
    }
    let score = candidate.len() * width;
    let best_score = match best.as_deref() {
        Some(rows) => rows
            .first()
            .map_or(0, |row| rows.len() * row.cells.len()),
        None => 0,
    };
    if score > best_score {
        *best = Some(candidate.to_vec());
    }
}
/// Two rows align when they have the same number of cells and every pair of
/// cells in the same position is column-aligned.
fn table_rows_align(first: &TableRowCandidate, next: &TableRowCandidate) -> bool {
    if first.cells.len() != next.cells.len() {
        return false;
    }
    for (left, right) in first.cells.iter().zip(&next.cells) {
        if !cells_column_aligned(left, right) {
            return false;
        }
    }
    true
}
/// Cells are column-aligned when either their left edges or their right edges
/// are within 14.0 units of each other.
fn cells_column_aligned(left: &TextRun, right: &TextRun) -> bool {
    let left_edges_match = (left.bbox.x - right.bbox.x).abs() <= 14.0;
    let left_end = left.bbox.x + left.bbox.width;
    let right_end = right.bbox.x + right.bbox.width;
    let right_edges_match = (left_end - right_end).abs() <= 14.0;
    left_edges_match || right_edges_match
}
/// Absolute difference between the largest cell `bbox.y` of two rows. A row
/// with no cells counts as 0.0.
fn table_row_vertical_gap(previous: &TableRowCandidate, next: &TableRowCandidate) -> f32 {
    let top_of = |row: &TableRowCandidate| {
        row.cells
            .iter()
            .map(|cell| cell.bbox.y)
            .reduce(f32::max)
            .unwrap_or_default()
    };
    (top_of(previous) - top_of(next)).abs()
}
/// Builds a `DetectedTable` from a group of aligned row candidates.
///
/// When `implied_table_header` finds header text in the lines around the
/// group, that text becomes the header row and every candidate is a body row.
/// Otherwise the first candidate is promoted to header and the remaining
/// candidates form the body. Returns `None` when `rows` is empty or the cell
/// boxes cannot be unioned.
fn build_implied_alignment_table(
page_number: usize,
lines: &[TextLine],
rows: &[TableRowCandidate],
) -> Option<DetectedTable> {
let columns = rows.first()?.cells.len();
let bbox = union_boxes(
rows.iter()
.flat_map(|row| row.cells.iter().map(|cell| cell.bbox)),
)?;
let header = implied_table_header(lines, rows, columns);
let has_explicit_header = header.has_text();
// The table claims its candidate lines plus any lines used for the header.
let mut line_indices = rows.iter().map(|row| row.line_index).collect::<Vec<_>>();
line_indices.extend(header.line_indices.iter().copied());
line_indices.sort_unstable();
line_indices.dedup();
// `header_cells` keeps the source runs (if any) behind each header string so
// the header cell records can carry a bounding box.
let (headers, body_rows, header_cells) = if has_explicit_header {
(
header
.cells
.iter()
.map(|cell| {
cell.as_ref()
.map(|cell| cell.text.clone())
.unwrap_or_default()
})
.collect::<Vec<_>>(),
rows.iter()
.map(|row| row.cells.iter().map(|cell| cell.text.clone()).collect())
.collect::<Vec<Vec<_>>>(),
header.cells,
)
} else {
(
rows.first()?
.cells
.iter()
.map(|cell| cell.text.clone())
.collect::<Vec<_>>(),
rows.iter()
.skip(1)
.map(|row| row.cells.iter().map(|cell| cell.text.clone()).collect())
.collect::<Vec<Vec<_>>>(),
rows.first()?.cells.iter().cloned().map(Some).collect(),
)
};
let mut cells = Vec::new();
// Header records always occupy table row 0.
for (column, cell) in header_cells.into_iter().enumerate() {
let text = headers.get(column).cloned().unwrap_or_default();
cells.push(TableCell {
row: 0,
column,
text,
bbox: cell.map(|cell| cell.bbox),
is_header: true,
col_span: 1,
row_span: 1,
});
}
for (row_index, row) in rows.iter().enumerate() {
// With an explicit header the candidates shift down by one row. Without
// one, candidate 0 was already emitted as the header and is skipped.
let table_row = if has_explicit_header {
row_index + 1
} else {
row_index
};
if !has_explicit_header && row_index == 0 {
continue;
}
for (column, cell) in row.cells.iter().enumerate() {
cells.push(TableCell {
row: table_row,
column,
text: cell.text.clone(),
bbox: Some(cell.bbox),
is_header: false,
col_span: 1,
row_span: 1,
});
}
}
Some(DetectedTable {
table: TableBlock {
headers,
rows: body_rows,
caption: None,
bbox: Some(bbox),
cells,
source_anchors: vec![anchor(page_number, Some(bbox), Vec::new())],
confidence: Some(Confidence {
score: 0.68,
calibrated: false,
}), ..Default::default()
},
line_indices,
})
}
/// Header text gathered for an implied-alignment table.
#[derive(Debug, Clone)]
struct ImpliedTableHeader {
    /// One slot per table column; `None` when no header fragment landed there.
    cells: Vec<Option<TextRun>>,
    /// Indices of the lines that contributed at least one header fragment.
    line_indices: Vec<usize>,
}

impl ImpliedTableHeader {
    /// Returns `true` when at least one column has non-empty header text.
    fn has_text(&self) -> bool {
        self.cells
            .iter()
            .flatten()
            .any(|cell| !cell.text.is_empty())
    }
}
/// Gathers header fragments from lines near the first row of an implied table.
///
/// Candidate lines are not table rows, have a `bbox.y` greater than the first
/// row's by at most 80.0 units, overlap the first row horizontally (12.0 unit
/// allowance), do not start with "table ", and are not data rows themselves.
/// They are visited in descending y order. Each of their cells (40 characters
/// at most) is appended to the header slot of the nearest column.
fn implied_table_header(
    lines: &[TextLine],
    rows: &[TableRowCandidate],
    columns: usize,
) -> ImpliedTableHeader {
    let mut header = ImpliedTableHeader {
        cells: vec![None; columns],
        line_indices: Vec::new(),
    };
    let Some(first_row) = rows.first() else {
        return header;
    };
    let reference_cells = &first_row.cells;
    let first_y = reference_cells
        .iter()
        .map(|cell| cell.bbox.y)
        .reduce(f32::max)
        .unwrap_or_default();
    let table_left = reference_cells
        .iter()
        .map(|cell| cell.bbox.x)
        .reduce(f32::min)
        .unwrap_or_default();
    let table_right = reference_cells
        .iter()
        .map(|cell| cell.bbox.x + cell.bbox.width)
        .reduce(f32::max)
        .unwrap_or_default();
    // Left and right edge of every column, taken from the first row.
    let column_refs: Vec<(f32, f32)> = reference_cells
        .iter()
        .map(|cell| (cell.bbox.x, cell.bbox.x + cell.bbox.width))
        .collect();
    let mut candidates: Vec<(usize, &TextLine)> = Vec::new();
    for (line_index, line) in lines.iter().enumerate() {
        let is_table_row = rows.iter().any(|row| row.line_index == line_index);
        let eligible = !is_table_row
            && line.bbox.y > first_y
            && line.bbox.y <= first_y + 80.0
            && line.bbox.x <= table_right + 12.0
            && line.bbox.x + line.bbox.width >= table_left - 12.0
            && !text_line_plain_text(line)
                .to_ascii_lowercase()
                .starts_with("table ")
            && !line_is_data_row(line, columns);
        if eligible {
            candidates.push((line_index, line));
        }
    }
    candidates.sort_by(|left, right| right.1.bbox.y.total_cmp(&left.1.bbox.y));
    for (line_index, line) in candidates {
        let mut contributed = false;
        for cell in implied_table_cells(line) {
            if cell.text.chars().count() > 40 {
                continue;
            }
            if let Some(column) = nearest_table_column(&cell, &column_refs) {
                append_header_cell(&mut header.cells[column], cell);
                contributed = true;
            }
        }
        if contributed {
            header.line_indices.push(line_index);
        }
    }
    header
}
/// A line is a data row when it splits into at least `columns` cells and
/// shows numeric evidence outside its first cell.
fn line_is_data_row(line: &TextLine, columns: usize) -> bool {
    let cells = implied_table_cells(line);
    if cells.len() < columns {
        return false;
    }
    row_has_numeric_table_evidence(&cells)
}
/// Finds the column whose centre is closest to the cell's centre.
///
/// The match is accepted only when the distance is within half the column
/// width plus 18.0 units, with a minimum tolerance of 24.0. On a tie the
/// earliest column wins.
fn nearest_table_column(cell: &TextRun, column_refs: &[(f32, f32)]) -> Option<usize> {
    let cell_center = cell.bbox.x + cell.bbox.width / 2.0;
    let mut closest: Option<(usize, f32)> = None;
    for (index, &(left, right)) in column_refs.iter().enumerate() {
        let distance = (cell_center - (left + right) / 2.0).abs();
        let improves = closest.map_or(true, |(_, best)| distance.total_cmp(&best).is_lt());
        if improves {
            closest = Some((index, distance));
        }
    }
    let (column, distance) = closest?;
    let (left, right) = column_refs[column];
    let tolerance = ((right - left) / 2.0 + 18.0).max(24.0);
    if distance <= tolerance {
        Some(column)
    } else {
        None
    }
}
/// Adds a header fragment to a column slot.
///
/// An empty slot takes the fragment as is. Otherwise the fragment's text is
/// appended after a space, the boxes are unioned, and source object ids not
/// yet present are added.
fn append_header_cell(target: &mut Option<TextRun>, fragment: TextRun) {
    match target {
        None => *target = Some(fragment),
        Some(existing) => {
            if !existing.text.is_empty() {
                existing.text.push(' ');
            }
            existing.text.push_str(&fragment.text);
            existing.bbox =
                union_boxes([existing.bbox, fragment.bbox]).unwrap_or(existing.bbox);
            for id in fragment.source_object_ids {
                let already_known = existing.source_object_ids.contains(&id);
                if !already_known {
                    existing.source_object_ids.push(id);
                }
            }
        }
    }
}
/// Three or more candidate lines are evidence enough. With fewer, some run
/// after the first line must contain an ASCII digit.
fn has_table_evidence(candidate_lines: &[(usize, &TextLine)]) -> bool {
    match candidate_lines {
        [] => false,
        [_, body @ ..] if candidate_lines.len() < 3 => body.iter().any(|(_, line)| {
            line.runs
                .iter()
                .any(|run| run.text.bytes().any(|byte| byte.is_ascii_digit()))
        }),
        _ => true,
    }
}
/// Runs align when each pair in the same position starts within 6.0 units
/// horizontally. Extra runs on either side are not compared.
fn columns_align(first: &[TextRun], next: &[TextRun]) -> bool {
    for (left, right) in first.iter().zip(next) {
        let aligned = (left.bbox.x - right.bbox.x).abs() <= 6.0;
        if !aligned {
            return false;
        }
    }
    true
}
/// Maps a point for a page rotation of 90, 180 or 270 degrees (taken modulo
/// 360). Any other rotation leaves the point unchanged.
fn rotate_point(x: f32, y: f32, rotation: i32, width: f32, height: f32) -> (f32, f32) {
    let quarter_turns = rotation.rem_euclid(360);
    if quarter_turns == 90 {
        (y, width - x)
    } else if quarter_turns == 180 {
        (width - x, height - y)
    } else if quarter_turns == 270 {
        (height - y, x)
    } else {
        (x, y)
    }
}
/// Rotates a box by rotating its two opposite corners and re-normalising so
/// that `x`/`y` is the minimum corner. A rotation of 0 (modulo 360) returns
/// the box unchanged.
fn rotate_bbox(bbox: BBox, rotation: i32, width: f32, height: f32) -> BBox {
    if rotation.rem_euclid(360) == 0 {
        return bbox;
    }
    let near = rotate_point(bbox.x, bbox.y, rotation, width, height);
    let far = rotate_point(
        bbox.x + bbox.width,
        bbox.y + bbox.height,
        rotation,
        width,
        height,
    );
    BBox {
        x: near.0.min(far.0),
        y: near.1.min(far.1),
        width: (far.0 - near.0).abs(),
        height: (far.1 - near.1).abs(),
    }
}
/// Groups runs into lines by baseline.
///
/// Runs are visited in descending baseline order (ties by ascending x). A run
/// joins the first existing line whose baseline is within 3.0 units;
/// otherwise it starts a new line. A line's baseline tracks the smallest
/// baseline of its runs, and its runs end up sorted left to right.
fn group_text_runs(mut runs: Vec<TextRun>) -> Vec<TextLine> {
    runs.sort_by(|left, right| {
        right
            .baseline_y
            .total_cmp(&left.baseline_y)
            .then(left.bbox.x.total_cmp(&right.bbox.x))
    });
    let mut lines: Vec<TextLine> = Vec::new();
    for run in runs {
        let slot = lines
            .iter()
            .position(|line| (line.baseline_y - run.baseline_y).abs() <= 3.0);
        match slot {
            Some(index) => {
                let line = &mut lines[index];
                line.bbox = union_boxes([line.bbox, run.bbox]).unwrap_or(line.bbox);
                line.baseline_y = line.baseline_y.min(run.baseline_y);
                line.runs.push(run);
            }
            None => lines.push(TextLine {
                baseline_y: run.baseline_y,
                bbox: run.bbox,
                runs: vec![run],
            }),
        }
    }
    lines.iter_mut().for_each(|line| {
        line.runs
            .sort_by(|left, right| left.bbox.x.total_cmp(&right.bbox.x));
    });
    lines
}
/// Tokenizes a content stream into operations.
///
/// Operands accumulate until an operator token arrives; the operator then
/// takes all pending operands. Operands left over after the last operator are
/// discarded.
fn parse_content_ops(bytes: &[u8]) -> Vec<ContentOp> {
    let mut parser = ContentParser::new(bytes);
    let mut pending = Vec::new();
    let mut ops = Vec::new();
    for token in std::iter::from_fn(|| parser.next_operand_or_operator()) {
        match token {
            ContentToken::Operator(operator) => {
                let operands = std::mem::take(&mut pending);
                ops.push(ContentOp { operands, operator });
            }
            ContentToken::Operand(operand) => pending.push(operand),
        }
    }
    ops
}
/// One token produced by `ContentParser::next_operand_or_operator`.
#[derive(Debug)]
enum ContentToken {
// A value token (name, string, array, number, ...).
Operand(Operand),
// A bare word that is not an operand; `parse_content_ops` treats it as the
// operator that consumes the pending operands.
Operator(String),
}
/// Cursor-based tokenizer over a borrowed byte slice.
struct ContentParser<'a> {
// The bytes being tokenized.
bytes: &'a [u8],
// Index into `bytes` of the next unread byte; `new` starts it at 0.
pos: usize,
}
impl<'a> ContentParser<'a> {
fn new(bytes: &'a [u8]) -> Self {
Self { bytes, pos: 0 }
}
/// Reads the next token, or returns `None` at end of input.
///
/// The first byte after whitespace and comments selects the token kind:
/// `/` name, `(` literal string, `[` array, `<` (not followed by another `<`)
/// hex string, and a sign, dot or digit a number. Anything else is read as a
/// word: a non-empty word is an operator, otherwise one byte is skipped and
/// `Operand::Other` is returned. `None` is also returned when the number
/// reader yields nothing.
fn next_operand_or_operator(&mut self) -> Option<ContentToken> {
    self.skip_ws_and_comments();
    let lead = *self.bytes.get(self.pos)?;
    let operand = match lead {
        b'/' => Operand::Name(self.read_name()),
        b'(' => Operand::Literal(self.read_literal()),
        b'[' => Operand::Array(self.read_array()),
        b'<' if self.peek(1) != Some(b'<') => Operand::Hex(self.read_hex_string()),
        b'+' | b'-' | b'.' | b'0'..=b'9' => Operand::Number(self.read_number()?),
        _ => {
            let word = self.read_word();
            if !word.is_empty() {
                return Some(ContentToken::Operator(word));
            }
            // Nothing readable here; step over the byte so parsing advances.
            self.pos += 1;
            Operand::Other
        }
    };
    Some(ContentToken::Operand(operand))
}
fn read_array(&mut self) -> Vec<Operand> {
self.pos += 1;
let mut items = Vec::new();
loop {
self.skip_ws_and_comments();
if self.pos >= self.bytes.len() || self.bytes[self.pos] == b']' {
self.pos = (self.pos + 1).min(self.bytes.len());
break;
}
match self.next_operand_or_operator() {
Some(ContentToken::Operand(operand)) => items.push(operand),
Some(ContentToken::Operator(_)) | None => {}
}
}
items
}
fn read_name(&mut self) -> String {
self.pos += 1;
let start = self.pos;
while self.pos < self.bytes.len() && !is_delimiter_or_ws(self.bytes[self.pos]) {
self.pos += 1;
}
lossy(&self.bytes[start..self.pos])
}
fn read_literal(&mut self) -> Vec<u8> {
self.pos += 1;
let mut depth = 1;
let mut output = Vec::new();
while self.pos < self.bytes.len() && depth > 0 {
let byte = self.bytes[self.pos];
self.pos += 1;
match byte {
b'\\' => {
if self.pos < self.bytes.len() {
match self.bytes[self.pos] {
b'n' => {
output.push(b'\n');
self.pos += 1;
}
b'r' => {
output.push(b'\r');
self.pos += 1;
}
b't' => {
output.push(b'\t');
self.pos += 1;
}
b'b' => {
output.push(0x08);
self.pos += 1;
}
b'f' => {
output.push(0x0c);
self.pos += 1;
}
b'\n' => {
self.pos += 1;
}
b'\r' => {
self.pos += 1;
if self.bytes.get(self.pos) == Some(&b'\n') {
self.pos += 1;
}
}
b'0'..=b'7' => output.push(self.read_octal_escape()),
other => {
output.push(other);
self.pos += 1;
}
}
}
}
b'(' => {
depth += 1;
output.push(byte);
}
b')' => {
depth -= 1;
if depth > 0 {
output.push(byte);
}
}
_ => output.push(byte),
}
}
output
}
fn read_octal_escape(&mut self) -> u8 {
let mut value = 0u16;
let mut digits = 0;
while self.pos < self.bytes.len()
&& digits < 3
&& matches!(self.bytes[self.pos], b'0'..=b'7')
{
value = (value << 3) + u16::from(self.bytes[self.pos] - b'0');
self.pos += 1;
digits += 1;
}
value.min(u16::from(u8::MAX)) as u8
}
fn read_hex_string(&mut self) -> Vec<u8> {
self.pos += 1;
let start = self.pos;
while self.pos < self.bytes.len() && self.bytes[self.pos] != b'>' {
self.pos += 1;
}
let raw = self.bytes[start..self.pos].to_vec();
self.pos = (self.pos + 1).min(self.bytes.len());
decode_hex(&raw)
}
fn read_number(&mut self) -> Option<f32> {
let start = self.pos;
while self.pos < self.bytes.len()
&& matches!(self.bytes[self.pos], b'+' | b'-' | b'.' | b'0'..=b'9')
{
self.pos += 1;
}
std::str::from_utf8(&self.bytes[start..self.pos])
.ok()
.and_then(|text| text.parse().ok())
}
fn read_word(&mut self) -> String {
let start = self.pos;
while self.pos < self.bytes.len() && !is_delimiter_or_ws(self.bytes[self.pos]) {
self.pos += 1;
}
lossy(&self.bytes[start..self.pos])
}
fn skip_ws_and_comments(&mut self) {
loop {
while self.pos < self.bytes.len() && is_ws(self.bytes[self.pos]) {
self.pos += 1;
}
if self.pos < self.bytes.len() && self.bytes[self.pos] == b'%' {
while self.pos < self.bytes.len() && !matches!(self.bytes[self.pos], b'\n' | b'\r')
{
self.pos += 1;
}
} else {
break;
}
}
}
fn peek(&self, offset: usize) -> Option<u8> {
self.bytes.get(self.pos + offset).copied()
}
}
/// Scans `bytes` for `<number> <generation> obj ... endobj` spans and returns
/// one `PdfObject` per span, with `body` holding the bytes between `obj` and
/// `endobj`.
///
/// A header is only attempted at offset 0 or where `is_ws_or_line_start`
/// accepts the position. Scanning stops at the first header that has no
/// matching `endobj`.
fn parse_indirect_objects(bytes: &[u8]) -> Vec<PdfObject> {
    /// Parses an object header at `at`; yields the object number, the
    /// generation and the offset just past the `obj` keyword.
    fn header_at(bytes: &[u8], at: usize) -> Option<(usize, usize, usize)> {
        let (object_number, cursor) = parse_unsigned_at(bytes, at)?;
        let cursor = skip_required_ws(bytes, cursor)?;
        let (generation, cursor) = parse_unsigned_at(bytes, cursor)?;
        let cursor = skip_required_ws(bytes, cursor)?;
        bytes[cursor..]
            .starts_with(b"obj")
            .then_some((object_number, generation, cursor + 3))
    }

    let mut objects = Vec::new();
    let mut pos = 0;
    while pos < bytes.len() {
        let candidate = if is_ws_or_line_start(bytes, pos) || pos == 0 {
            header_at(bytes, pos)
        } else {
            None
        };
        let Some((object_number, generation, body_start)) = candidate else {
            pos += 1;
            continue;
        };
        let Some(relative_end) = find_subslice(&bytes[body_start..], b"endobj") else {
            break;
        };
        let body_end = body_start + relative_end;
        objects.push(PdfObject {
            object_number: object_number as u32,
            generation: generation as u16,
            body: bytes[body_start..body_end].to_vec(),
        });
        pos = body_end + b"endobj".len();
    }
    objects
}
/// Unpacks objects stored inside `/Type /ObjStm` streams and appends them to
/// `objects` with generation 0.
///
/// For each object stream, `/N` gives the entry count and `/First` the offset
/// of the first object within the decoded data. The bytes before `/First` hold
/// `<object number> <offset>` pairs. An entry's body runs from its offset to
/// the next entry's offset, or to the end of the data for the last entry.
///
/// Object numbers already present in `objects` before expansion are not
/// replaced. Entries whose offsets are inverted or out of range, or whose
/// object number does not fit in `u32`, are skipped. Streams that fail to
/// decode are ignored.
fn expand_object_streams(objects: &mut Vec<PdfObject>) {
    let existing = objects
        .iter()
        .map(|object| object.object_number)
        .collect::<std::collections::HashSet<_>>();
    let mut expanded = Vec::new();
    for object_stream in objects.iter() {
        let object_body = lossy(&object_stream.body);
        // Compare with whitespace removed so `/Type /ObjStm` and `/Type/ObjStm` both match.
        let is_object_stream = object_body
            .split_whitespace()
            .collect::<String>()
            .contains("/Type/ObjStm");
        if !is_object_stream {
            continue;
        }
        let Some(count) = parse_number_after(&object_body, "/N").map(|value| value as usize) else {
            continue;
        };
        let Some(first) = parse_number_after(&object_body, "/First").map(|value| value as usize)
        else {
            continue;
        };
        let Ok(Some(decoded)) = decode_stream_object(object_stream) else {
            continue;
        };
        if first > decoded.len() {
            continue;
        }
        let (header_bytes, payload) = decoded.split_at(first);
        let header = lossy(header_bytes);
        let header_numbers = header
            .split_whitespace()
            .filter_map(|part| part.parse::<usize>().ok())
            .collect::<Vec<_>>();
        let entries = header_numbers
            .chunks_exact(2)
            .take(count)
            .map(|pair| (pair[0], pair[1]))
            .collect::<Vec<_>>();
        for (index, (raw_number, offset)) in entries.iter().enumerate() {
            // Skip numbers that do not fit instead of truncating them onto another object.
            let Ok(object_number) = u32::try_from(*raw_number) else {
                continue;
            };
            if existing.contains(&object_number) {
                continue;
            }
            let next_offset = entries
                .get(index + 1)
                .map_or(payload.len(), |(_, next_offset)| *next_offset);
            // Checked slicing rejects inverted and out-of-range offsets without
            // doing arithmetic on untrusted values.
            let Some(body) = payload.get(*offset..next_offset) else {
                continue;
            };
            expanded.push(PdfObject {
                object_number,
                generation: 0,
                body: body.to_vec(),
            });
        }
    }
    objects.extend(expanded);
}
/// Builds a `PageSeed` when `object` is a page.
///
/// With all whitespace removed, the body must contain `/Type/Page` and must
/// not contain `/Type/Pages`. The seed's `number` is left at 0 and its body is
/// the page body followed by the bodies of its page-tree ancestors.
fn page_seed(object: &PdfObject, object_map: &HashMap<u32, Arc<PdfObject>>) -> Option<PageSeed> {
    let body = lossy(&object.body);
    let compact: String = body.split_whitespace().collect();
    let is_leaf_page = compact.contains("/Type/Page") && !compact.contains("/Type/Pages");
    is_leaf_page.then(|| PageSeed {
        number: 0,
        body: body_with_inherited_page_tree_entries(&body, object_map),
    })
}
/// Returns `page_body` followed by the bodies of the objects reached by
/// following `/Parent` references, so entries defined on ancestors can be
/// found by text search.
fn body_with_inherited_page_tree_entries(
    page_body: &str,
    object_map: &HashMap<u32, Arc<PdfObject>>,
) -> String {
    let mut combined = String::from(page_body);
    append_parent_page_tree_entries(page_body, object_map, &mut combined, 0);
    combined
}
/// Appends to `output` the body of each ancestor reached by following
/// `/Parent` from `body`, nearest ancestor first, each preceded by a newline.
///
/// The walk stops when no `/Parent` reference is found, when the referenced
/// object is missing, or once `depth` reaches 16. The depth cap bounds the
/// walk even if the references form a cycle.
fn append_parent_page_tree_entries(
    body: &str,
    object_map: &HashMap<u32, Arc<PdfObject>>,
    output: &mut String,
    depth: usize,
) {
    // Holds the most recently visited ancestor body; `None` means we are
    // still looking at the caller's `body`.
    let mut inherited: Option<String> = None;
    for _ in depth..16 {
        let current = inherited.as_deref().unwrap_or(body);
        let Some(parent_ref) = parse_direct_ref_after_key(current, "/Parent") else {
            return;
        };
        let Some(parent) = object_map.get(&(parent_ref as u32)) else {
            return;
        };
        let parent_body = lossy(&parent.body);
        output.push('\n');
        output.push_str(&parent_body);
        inherited = Some(parent_body);
    }
}
/// Extracts and decodes the stream carried by `object`.
///
/// Returns `Ok(None)` when the body has no `stream` keyword. Otherwise the
/// bytes between the first `stream` and the first `endstream` are trimmed with
/// `trim_stream_edges` and passed through each filter named in the preceding
/// dictionary, in order.
///
/// # Errors
///
/// Fails when `endstream` is missing, when it does not come after `stream`,
/// or when a filter fails to decode.
fn decode_stream_object(object: &PdfObject) -> Result<Option<Vec<u8>>> {
    let stream_marker = match find_subslice(&object.body, b"stream") {
        Some(offset) => offset,
        None => return Ok(None),
    };
    let end_marker = find_subslice(&object.body, b"endstream")
        .ok_or_else(|| DonglerError::pdf("stream is missing endstream marker"))?;
    if end_marker <= stream_marker {
        return Err(DonglerError::pdf("stream markers are malformed"));
    }
    let dict = lossy(&object.body[..stream_marker]);
    let mut data = object.body[stream_marker + b"stream".len()..end_marker].to_vec();
    trim_stream_edges(&mut data);
    let decoded = stream_filters(&dict)
        .iter()
        .try_fold(data, |current, filter| decode_stream_filter(filter, &current))?;
    Ok(Some(decoded))
}
fn decode_stream_filter(filter: &str, stream: &[u8]) -> Result<Vec<u8>> {
match filter {
"FlateDecode" | "Fl" => {
let mut decoder = ZlibDecoder::new(stream);
let mut decoded = Vec::new();
decoder
.read_to_end(&mut decoded)
.map_err(|error| DonglerError::pdf(format!("FlateDecode failed: {error}")))?;
Ok(decoded)
}
"ASCII85Decode" | "A85" => ascii85_decode(stream),
other => Err(DonglerError::pdf(format!(
"unsupported stream filter: {other}"
))),
}
}
/// Returns the filter names listed after the first `/Filter` in `dict`,
/// without their leading slashes.
///
/// The value may be a single name or an array of names. Non-name array
/// elements are skipped, and any other value (or a missing key) yields an
/// empty list.
fn stream_filters(dict: &str) -> Vec<String> {
    let bytes = dict.as_bytes();
    let Some(key_at) = dict.find("/Filter") else {
        return Vec::new();
    };
    let mut cursor = key_at + "/Filter".len();
    skip_pdf_whitespace(bytes, &mut cursor);

    // Reads the name whose `/` sits at `*cursor`, leaving the cursor on the
    // delimiter that ends it. Yields `None` for an empty name.
    let read_name = |cursor: &mut usize| -> Option<String> {
        *cursor += 1;
        let start = *cursor;
        while *cursor < bytes.len() && !is_pdf_name_delimiter(bytes[*cursor]) {
            *cursor += 1;
        }
        (start < *cursor).then(|| dict[start..*cursor].to_owned())
    };

    match bytes.get(cursor) {
        Some(b'[') => {
            cursor += 1;
            let mut filters = Vec::new();
            while cursor < bytes.len() && bytes[cursor] != b']' {
                skip_pdf_whitespace(bytes, &mut cursor);
                match bytes.get(cursor) {
                    Some(b']') => break,
                    Some(b'/') => filters.extend(read_name(&mut cursor)),
                    _ => cursor += 1,
                }
            }
            filters
        }
        Some(b'/') => read_name(&mut cursor).into_iter().collect(),
        _ => Vec::new(),
    }
}
/// Advances `*index` past any run of NUL, tab, line feed, form feed, carriage
/// return or space bytes. Leaves it unchanged when it is already out of range.
fn skip_pdf_whitespace(bytes: &[u8], index: &mut usize) {
    while let Some(b'\0' | b'\t' | b'\n' | b'\x0c' | b'\r' | b' ') = bytes.get(*index) {
        *index += 1;
    }
}
/// Reports whether `byte` ends a name: one of the whitespace bytes (NUL, tab,
/// line feed, form feed, carriage return, space) or one of `( ) < > [ ] { } / %`.
fn is_pdf_name_delimiter(byte: u8) -> bool {
    const WHITESPACE: &[u8] = b"\0\t\n\x0c\r ";
    const DELIMITERS: &[u8] = b"()<>[]{}/%";
    WHITESPACE.contains(&byte) || DELIMITERS.contains(&byte)
}
/// Decodes ASCII85 data.
///
/// Whitespace is ignored, a `<~` marker is skipped, and `~>` ends the data.
/// `z` between groups stands for four zero bytes. A final partial group of
/// `k` digits (2 to 4) is padded with the highest digit and yields `k - 1`
/// bytes.
///
/// # Errors
///
/// Fails on a byte outside the alphabet (including `z` inside a group), on a
/// group whose value exceeds 32 bits, and on a final group of a single digit.
fn ascii85_decode(bytes: &[u8]) -> Result<Vec<u8>> {
    let mut decoded = Vec::new();
    let mut pending: Vec<u8> = Vec::new();
    let mut cursor = 0;
    while let Some(&byte) = bytes.get(cursor) {
        let following = bytes.get(cursor + 1).copied();
        cursor += 1;
        if matches!(byte, b'\0' | b'\t' | b'\n' | b'\x0c' | b'\r' | b' ') {
            continue;
        }
        if byte == b'<' && following == Some(b'~') {
            // Skip the second byte of the marker as well.
            cursor += 1;
            continue;
        }
        if byte == b'~' && following == Some(b'>') {
            break;
        }
        if byte == b'z' && pending.is_empty() {
            decoded.extend_from_slice(&[0, 0, 0, 0]);
            continue;
        }
        if !(b'!'..=b'u').contains(&byte) {
            return Err(DonglerError::pdf(format!(
                "ASCII85Decode failed: invalid byte 0x{byte:02x}"
            )));
        }
        pending.push(byte - b'!');
        if pending.len() == 5 {
            decoded.extend_from_slice(&ascii85_group_to_bytes(&pending)?);
            pending.clear();
        }
    }
    match pending.len() {
        0 => {}
        1 => {
            return Err(DonglerError::pdf(
                "ASCII85Decode failed: dangling single digit",
            ));
        }
        digits => {
            pending.resize(5, b'u' - b'!');
            decoded.extend_from_slice(&ascii85_group_to_bytes(&pending)?[..digits - 1]);
        }
    }
    Ok(decoded)
}
/// Converts base-85 digits (already reduced to `0..=84`) into four big-endian
/// bytes.
///
/// # Errors
///
/// Fails when the digits encode a value larger than `u32::MAX`.
fn ascii85_group_to_bytes(group: &[u8]) -> Result<[u8; 4]> {
    let value = group
        .iter()
        .fold(0u64, |acc, digit| acc * 85 + u64::from(*digit));
    match u32::try_from(value) {
        Ok(word) => Ok(word.to_be_bytes()),
        Err(_) => Err(DonglerError::pdf("ASCII85Decode failed: invalid group")),
    }
}
/// Removes every leading and trailing `\n` / `\r` byte from `stream`.
/// Line-break bytes in the interior are kept.
///
/// The leading run is removed with a single `drain`, so the cost is linear in
/// the stream length rather than quadratic in the number of leading bytes.
fn trim_stream_edges(stream: &mut Vec<u8>) {
    let is_eol = |byte: &u8| matches!(byte, b'\n' | b'\r');
    // Trim the tail first so the leading scan only sees bytes that are kept.
    let kept_end = stream
        .iter()
        .rposition(|byte| !is_eol(byte))
        .map_or(0, |last| last + 1);
    stream.truncate(kept_end);
    let leading = stream
        .iter()
        .position(|byte| !is_eol(byte))
        .unwrap_or(stream.len());
    stream.drain(..leading);
}
/// Returns the object numbers referenced after the first occurrence of `key`.
///
/// When only whitespace separates the key from a `[`, every reference up to
/// the next `]` is returned. Otherwise (including an array with no `]`) the
/// result is at most the first reference found anywhere after the key.
fn parse_refs_after_key(text: &str, key: &str) -> Vec<usize> {
    let Some((_, rest)) = text.split_once(key) else {
        return Vec::new();
    };
    let array_body = rest
        .find('[')
        .filter(|open| rest[..*open].trim().is_empty())
        .and_then(|open| {
            rest[open..]
                .find(']')
                .map(|length| &rest[open..open + length])
        });
    match array_body {
        Some(inner) => parse_refs(inner),
        None => parse_refs(rest).into_iter().take(1).collect(),
    }
}
/// Returns the object number of an indirect reference (`<object> <generation> R`)
/// that directly follows the first occurrence of `key`, allowing only
/// whitespace in between. Returns `None` when the key is missing or its value
/// is not such a reference.
fn parse_direct_ref_after_key(text: &str, key: &str) -> Option<usize> {
    let bytes = text.as_bytes();
    let value_start = text.find(key)? + key.len();
    let leading_ws = bytes[value_start..]
        .iter()
        .take_while(|byte| is_ws(**byte))
        .count();
    let (object, cursor) = parse_unsigned_at(bytes, value_start + leading_ws)?;
    let cursor = skip_required_ws(bytes, cursor)?;
    let (_generation, cursor) = parse_unsigned_at(bytes, cursor)?;
    let cursor = skip_required_ws(bytes, cursor)?;
    (bytes.get(cursor) == Some(&b'R')).then_some(object)
}
/// Parses the named references in the dictionary that follows `key`.
///
/// The dictionary is taken as the text between the first `<<` after the key
/// and the first `>>` after that; nesting is not tracked. Returns an empty map
/// when the key or either marker is missing.
fn parse_resource_refs(text: &str, key: &str) -> HashMap<String, u32> {
    let dict = text.find(key).and_then(|key_at| {
        let rest = &text[key_at + key.len()..];
        let inner = &rest[rest.find("<<")? + 2..];
        let close = inner.find(">>")?;
        Some(&inner[..close])
    });
    match dict {
        Some(dict) => parse_named_refs(dict),
        None => HashMap::new(),
    }
}
/// Returns the body of the object that `/Resources` refers to, when the page
/// body gives it as an indirect reference to a known object.
fn resolve_resource_body(page_body: &str, object_map: &HashMap<u32, Arc<PdfObject>>) -> Option<String> {
    let resource_ref = parse_direct_ref_after_key(page_body, "/Resources")?;
    let resources = object_map.get(&(resource_ref as u32))?;
    Some(lossy(&resources.body))
}
fn load_font_decoders(
resource_text: &str,
object_map: &HashMap<u32, Arc<PdfObject>>,
font_cache: &HashMap<u32, Arc<FontDecoder>>,
) -> HashMap<String, Arc<FontDecoder>> {
resolve_named_resource_refs(resource_text, "/Font", object_map)
.into_iter()
.map(|(name, object_number)| {
let decoder = font_cache.get(&object_number).cloned().unwrap_or_else(|| {
Arc::new(
object_map
.get(&object_number)
.map(|font| font_decoder(font.as_ref(), object_map))
.unwrap_or_default(),
)
});
(name, decoder)
})
.collect()
}
/// Resolves the named references stored under `key`, whether the value is an
/// inline dictionary or an indirect reference to an object holding them.
///
/// The inline form wins whenever it yields at least one entry.
fn resolve_named_resource_refs(
    resource_text: &str,
    key: &str,
    object_map: &HashMap<u32, Arc<PdfObject>>,
) -> HashMap<String, u32> {
    let inline = parse_resource_refs(resource_text, key);
    if !inline.is_empty() {
        return inline;
    }
    let Some(object_number) = parse_direct_ref_after_key(resource_text, key) else {
        return HashMap::new();
    };
    match object_map.get(&(object_number as u32)) {
        Some(object) => parse_named_refs(&lossy(&object.body)),
        None => HashMap::new(),
    }
}
/// Builds the `FontDecoder` for a font object.
///
/// Encoding differences, simple-font widths, style flags and vertical metrics
/// always come from the font (and the objects it references). When a
/// `/ToUnicode` stream can be found and decoded, its mappings are added; in
/// that case, if the simple-font widths are empty, widths are derived from the
/// CID width table instead. Without a usable `/ToUnicode` stream the decoder
/// has an empty `cmap` and `max_code_len` 1.
fn font_decoder(font: &PdfObject, object_map: &HashMap<u32, Arc<PdfObject>>) -> FontDecoder {
    let font_body = lossy(&font.body);
    let encoding = font_encoding_differences(&font_body, object_map);
    let widths = font_widths(&font_body, &encoding);
    let (bold, italic) = font_style(&font_body, object_map);
    let (ascent, descent) = font_vertical_metrics(&font_body, object_map);

    // Reference, then object, then decoded stream. Any missing link means no CMap.
    let cmap_stream = parse_refs_after_key(&font_body, "/ToUnicode")
        .into_iter()
        .next()
        .and_then(|reference| object_map.get(&(reference as u32)))
        .and_then(|object| decode_stream_object(object.as_ref()).ok().flatten());

    let mut decoder = match cmap_stream {
        Some(stream) => parse_to_unicode_cmap(&lossy(&stream)),
        None => {
            return FontDecoder {
                cmap: HashMap::new(),
                encoding,
                widths,
                max_code_len: 1,
                bold,
                italic,
                ascent,
                descent,
            };
        }
    };
    decoder.widths = if widths.is_empty() {
        cid_char_widths(&decoder.cmap, &font_cid_widths(&font_body, object_map))
    } else {
        widths
    };
    decoder.encoding = encoding;
    decoder.bold = bold;
    decoder.italic = italic;
    decoder.ascent = ascent;
    decoder.descent = descent;
    decoder
}
/// Returns `(ascent, descent)` for a font, each divided by 1000.
///
/// Values come from `/Ascent` and `/Descent` of the referenced
/// `/FontDescriptor`. A missing descriptor, a missing entry, or an entry equal
/// to zero falls back to 0.75 and -0.25 respectively.
fn font_vertical_metrics(font_body: &str, object_map: &HashMap<u32, Arc<PdfObject>>) -> (f32, f32) {
    const DEFAULT_ASCENT: f32 = 0.75;
    const DEFAULT_DESCENT: f32 = -0.25;
    let descriptor = parse_direct_ref_after_key(font_body, "/FontDescriptor")
        .and_then(|reference| object_map.get(&(reference as u32)));
    let Some(descriptor) = descriptor else {
        return (DEFAULT_ASCENT, DEFAULT_DESCENT);
    };
    let body = lossy(&descriptor.body);
    let scaled = |key: &str, fallback: f32| match parse_number_after(&body, key) {
        Some(value) if value != 0.0 => value / 1000.0,
        _ => fallback,
    };
    (
        scaled("/Ascent", DEFAULT_ASCENT),
        scaled("/Descent", DEFAULT_DESCENT),
    )
}
/// Returns `(bold, italic)` for a font.
///
/// Either flag is set when the lower-cased `/BaseFont` name (after the last
/// `+`) contains a matching marker substring, or when the referenced
/// `/FontDescriptor` says so: `/Flags` bit 64 for italic, bit 262144 for bold,
/// or a non-zero `/ItalicAngle` for italic.
fn font_style(font_body: &str, object_map: &HashMap<u32, Arc<PdfObject>>) -> (bool, bool) {
    const BOLD_MARKERS: [&str; 7] = [
        "bold", "black", "heavy", "semibold", "demibold", "-bd", "demi",
    ];
    const ITALIC_MARKERS: [&str; 3] = ["italic", "oblique", "-it"];

    let (mut bold, mut italic) = (false, false);
    if let Some(name) = parse_name_after(font_body, "/BaseFont") {
        let bare = name
            .rsplit('+')
            .next()
            .unwrap_or(name.as_str())
            .to_ascii_lowercase();
        bold = BOLD_MARKERS.iter().any(|marker| bare.contains(*marker));
        italic = ITALIC_MARKERS.iter().any(|marker| bare.contains(*marker));
    }

    let descriptor = parse_direct_ref_after_key(font_body, "/FontDescriptor")
        .and_then(|reference| object_map.get(&(reference as u32)));
    if let Some(object) = descriptor {
        let body = lossy(&object.body);
        if let Some(flags) = parse_number_after(&body, "/Flags") {
            let flags = flags as i64;
            italic |= (flags & 64) != 0;
            bold |= (flags & 262_144) != 0;
        }
        if let Some(angle) = parse_number_after(&body, "/ItalicAngle") {
            italic |= angle.abs() > f32::EPSILON;
        }
    }
    (bold, italic)
}
/// Returns the name value that follows the first occurrence of `key`, without
/// its leading slash.
///
/// Only whitespace may separate the key from the name's `/`. The name ends at
/// whitespace or at one of `/ [ ] < > ( )`. Returns `None` when the key is
/// missing, the value is not a name, or the name is empty.
fn parse_name_after(text: &str, key: &str) -> Option<String> {
    let (_, tail) = text.split_once(key)?;
    let candidate = tail.trim_start().strip_prefix('/')?;
    let end = candidate
        .find(|character: char| {
            character.is_whitespace()
                || matches!(character, '/' | '[' | ']' | '<' | '>' | '(' | ')')
        })
        .unwrap_or(candidate.len());
    let name = &candidate[..end];
    if name.is_empty() {
        None
    } else {
        Some(name.to_owned())
    }
}
/// Maps characters to glyph widths using `/FirstChar` and `/Widths`.
///
/// Entry `i` of `/Widths` belongs to byte code `FirstChar + i` (wrapping at
/// 256). The code becomes a character through `encoding`, or is used directly
/// as a character when `encoding` has no entry. Codes whose text is not
/// exactly one character are skipped. Returns an empty map when either key is
/// missing.
fn font_widths(font_body: &str, encoding: &HashMap<u8, String>) -> HashMap<char, f32> {
    let first_char = match parse_number_after(font_body, "/FirstChar") {
        Some(value) => value as u8,
        None => return HashMap::new(),
    };
    let Some(widths) = parse_number_array_after(font_body, "/Widths") else {
        return HashMap::new();
    };
    let mut by_char = HashMap::new();
    for (index, width) in widths.into_iter().enumerate() {
        let code = first_char.wrapping_add(index as u8);
        let text = match encoding.get(&code) {
            Some(mapped) => mapped.clone(),
            None => (code as char).to_string(),
        };
        let mut chars = text.chars();
        if let (Some(character), None) = (chars.next(), chars.next()) {
            by_char.insert(character, width);
        }
    }
    by_char
}
/// Reads the `/W` width table of a Type0 font's descendant font into a map
/// from CID to width.
///
/// Two entry forms are understood: `first [w1 w2 ...]`, which assigns
/// consecutive CIDs starting at `first`, and `first last w`, which assigns `w`
/// to every CID in `first..=last` (ranges of 70000 CIDs or more are ignored).
/// Returns an empty map when the font is not Type0 or any link is missing.
///
/// CIDs are computed with checked arithmetic and must fit in `u32`.
/// Out-of-range values are skipped rather than wrapped onto unrelated CIDs.
fn font_cid_widths(font_body: &str, object_map: &HashMap<u32, Arc<PdfObject>>) -> HashMap<u32, f32> {
    let mut widths = HashMap::new();
    if parse_name_after(font_body, "/Subtype").as_deref() != Some("Type0") {
        return widths;
    }
    let Some(descendant) = parse_refs_after_key(font_body, "/DescendantFonts")
        .into_iter()
        .next()
    else {
        return widths;
    };
    let Some(cidfont) = object_map.get(&(descendant as u32)) else {
        return widths;
    };
    let body = lossy(&cidfont.body);
    let Some((open, close)) = find_w_array(&body) else {
        return widths;
    };
    let mut parser = ContentParser::new(&body.as_bytes()[open..=close]);
    let Some(ContentToken::Operand(Operand::Array(items))) = parser.next_operand_or_operator() else {
        return widths;
    };
    let mut index = 0;
    while index < items.len() {
        match (&items[index], items.get(index + 1)) {
            (Operand::Number(first), Some(Operand::Array(list))) => {
                // The float-to-int cast saturates, so `base` may be `i64::MAX`.
                let base = *first as i64;
                for (offset, width) in list.iter().enumerate() {
                    let Operand::Number(width) = width else {
                        continue;
                    };
                    let cid = i64::try_from(offset)
                        .ok()
                        .and_then(|offset| base.checked_add(offset))
                        .and_then(|cid| u32::try_from(cid).ok());
                    if let Some(cid) = cid {
                        widths.insert(cid, *width);
                    }
                }
                index += 2;
            }
            (Operand::Number(first), Some(Operand::Number(last))) => {
                if let Some(Operand::Number(width)) = items.get(index + 2) {
                    let (lo, hi) = (*first as i64, *last as i64);
                    // The span cap bounds the work a hostile range can cause.
                    if lo >= 0 && hi >= lo && hi - lo < 70_000 {
                        for cid in lo..=hi {
                            if let Ok(cid) = u32::try_from(cid) {
                                widths.insert(cid, *width);
                            }
                        }
                    }
                    index += 3;
                } else {
                    index += 1;
                }
            }
            _ => index += 1,
        }
    }
    widths
}
/// Locates the array value of a `/W` key in `body` and returns the byte
/// offsets of its opening `[` and matching `]`.
///
/// An occurrence of `/W` counts only when the next byte is whitespace or `[`,
/// so longer keys starting with `/W` are not mistaken for it. Occurrences
/// whose value is not a balanced array are passed over.
fn find_w_array(body: &str) -> Option<(usize, usize)> {
    let bytes = body.as_bytes();
    for (key_at, _) in body.match_indices("/W") {
        let key_end = key_at + 2;
        let key_is_complete = bytes
            .get(key_end)
            .is_some_and(|byte| is_ws(*byte) || *byte == b'[');
        if !key_is_complete {
            continue;
        }
        let open = key_end
            + bytes[key_end..]
                .iter()
                .take_while(|byte| is_ws(**byte))
                .count();
        if bytes.get(open) != Some(&b'[') {
            continue;
        }
        if let Some(close) = matching_array_close(body, open) {
            return Some((open, close));
        }
    }
    None
}
/// Translates CID-keyed widths into character-keyed widths using `cmap`.
///
/// Each source code of 1 to 4 bytes is read as a big-endian CID. When that
/// CID has a width and the code maps to exactly one character, the character
/// receives the width. Returns an empty map when `cid_widths` is empty.
fn cid_char_widths(
    cmap: &HashMap<Vec<u8>, String>,
    cid_widths: &HashMap<u32, f32>,
) -> HashMap<char, f32> {
    if cid_widths.is_empty() {
        return HashMap::new();
    }
    cmap.iter()
        .filter(|(code, _)| (1..=4).contains(&code.len()))
        .filter_map(|(code, text)| {
            let mut chars = text.chars();
            let character = chars.next()?;
            if chars.next().is_some() {
                return None;
            }
            let cid = code
                .iter()
                .fold(0u32, |acc, byte| (acc << 8) | u32::from(*byte));
            cid_widths.get(&cid).map(|width| (character, *width))
        })
        .collect()
}
/// Returns the `/Differences` overrides for a font.
///
/// A non-empty table found in the object referenced by `/Encoding` is
/// preferred; otherwise the font body itself is searched.
fn font_encoding_differences(
    font_body: &str,
    object_map: &HashMap<u32, Arc<PdfObject>>,
) -> HashMap<u8, String> {
    parse_direct_ref_after_key(font_body, "/Encoding")
        .and_then(|reference| object_map.get(&(reference as u32)))
        .map(|object| parse_encoding_differences(&lossy(&object.body)))
        .filter(|differences| !differences.is_empty())
        .unwrap_or_else(|| parse_encoding_differences(font_body))
}
/// Parses the first `/Differences` array in `text` into a map from byte code
/// to text.
///
/// A non-negative number sets the current code; each following glyph name is
/// assigned to the current code, which then advances by one. Names before any
/// number are ignored. Names that `glyph_name_to_text` does not recognise, and
/// codes above 255, still advance the code but add no entry.
fn parse_encoding_differences(text: &str) -> HashMap<u8, String> {
    let Some((_, rest)) = text.split_once("/Differences") else {
        return HashMap::new();
    };
    let bounds = rest
        .find('[')
        .and_then(|open| matching_array_close(rest, open).map(|close| (open, close)));
    let Some((open, close)) = bounds else {
        return HashMap::new();
    };
    let mut parser = ContentParser::new(rest[open..=close].as_bytes());
    let items = match parser.next_operand_or_operator() {
        Some(ContentToken::Operand(Operand::Array(items))) => items,
        _ => return HashMap::new(),
    };

    let mut differences = HashMap::new();
    let mut next_code: Option<u16> = None;
    for item in items {
        match item {
            Operand::Number(value) if value >= 0.0 => next_code = Some(value as u16),
            Operand::Name(name) => {
                if let Some(current) = next_code {
                    if let Ok(byte_code) = u8::try_from(current) {
                        if let Some(text) = glyph_name_to_text(&name) {
                            differences.insert(byte_code, text);
                        }
                    }
                    next_code = current.checked_add(1);
                }
            }
            _ => {}
        }
    }
    differences
}
/// Scans `text` from byte offset `open` and returns the offset of the `]`
/// that brings the bracket depth back to zero.
///
/// Returns `None` when a `]` appears before any `[`, when the brackets never
/// balance, or when `open` is past the end of the text.
fn matching_array_close(text: &str, open: usize) -> Option<usize> {
    let tail = text.as_bytes().get(open..)?;
    let mut depth = 0usize;
    for (index, byte) in tail.iter().enumerate() {
        if *byte == b'[' {
            depth += 1;
        } else if *byte == b']' {
            depth = depth.checked_sub(1)?;
            if depth == 0 {
                return Some(open + index);
            }
        }
    }
    None
}
/// Parses the text of a ToUnicode CMap into a `FontDecoder` whose `cmap` maps
/// source code bytes to decoded text.
///
/// Parsing is line-oriented. A line ending in `beginbfchar` / `beginbfrange`
/// opens a section and a line equal to `endbfchar` / `endbfrange` closes it;
/// marker lines themselves contribute no mappings. Inside a bfchar section the
/// first two hex strings on a line form one mapping. Inside a bfrange section
/// each line is one range entry, except that an entry whose `[` is not closed
/// on the same line is accumulated across lines until the brackets balance.
///
/// `max_code_len` is the longest source code seen (1 when there are none); the
/// remaining fields are set to fixed defaults here.
fn parse_to_unicode_cmap(text: &str) -> FontDecoder {
let mut cmap = HashMap::new();
// Which section, if any, the current line belongs to.
let mut in_bfchar = false;
let mut in_bfrange = false;
// Accumulator and bracket depth for a bfrange array entry spanning several lines.
let mut bfrange_array_entry = String::new();
let mut bfrange_array_depth = 0i32;
for line in text.lines() {
let trimmed = line.trim();
match trimmed {
// `ends_with` lets the entry count precede the keyword on the same line.
value if value.ends_with("beginbfchar") => {
in_bfchar = true;
continue;
}
"endbfchar" => {
in_bfchar = false;
continue;
}
value if value.ends_with("beginbfrange") => {
in_bfrange = true;
continue;
}
"endbfrange" => {
in_bfrange = false;
// Drop any array entry left unterminated when the section closes.
bfrange_array_entry.clear();
bfrange_array_depth = 0;
continue;
}
_ => {}
}
if in_bfrange {
if bfrange_array_depth > 0 {
// Continuation of a multi-line array entry.
bfrange_array_entry.push(' ');
bfrange_array_entry.push_str(trimmed);
bfrange_array_depth += bracket_delta(trimmed);
if bfrange_array_depth <= 0 {
add_bfrange_entry(&mut cmap, &bfrange_array_entry);
bfrange_array_entry.clear();
bfrange_array_depth = 0;
}
continue;
}
let depth = bracket_delta(trimmed);
if depth > 0 {
// The line opens an array it does not close: start accumulating.
bfrange_array_entry.clear();
bfrange_array_entry.push_str(trimmed);
bfrange_array_depth = depth;
continue;
}
// A complete entry on a single line.
add_bfrange_entry(&mut cmap, trimmed);
continue;
}
let hexes = hex_strings_in_line(trimmed);
// Only the first source/destination pair on a bfchar line is used.
if in_bfchar && hexes.len() >= 2 {
cmap.insert(
hexes[0].clone(),
cmap_text_for_mapping(&hexes[0], &hexes[1]),
);
}
}
let max_code_len = cmap.keys().map(Vec::len).max().unwrap_or(1);
FontDecoder {
cmap,
encoding: HashMap::new(),
widths: HashMap::new(),
max_code_len,
bold: false,
italic: false,
ascent: 0.75,
descent: -0.25,
}
}
/// Returns the number of `[` in `text` minus the number of `]`.
fn bracket_delta(text: &str) -> i32 {
    let mut delta = 0;
    for character in text.chars() {
        match character {
            '[' => delta += 1,
            ']' => delta -= 1,
            _ => {}
        }
    }
    delta
}
/// Adds the mappings described by one bfrange entry.
///
/// Entries with fewer than three hex strings are ignored. An entry containing
/// `[` is treated as the array form, any other as the single-destination form.
fn add_bfrange_entry(cmap: &mut HashMap<Vec<u8>, String>, line: &str) {
    let hexes = hex_strings_in_line(line);
    match (hexes.len() >= 3, line.contains('[')) {
        (false, _) => {}
        (true, true) => add_bfrange_array(cmap, &hexes),
        (true, false) => add_bfrange(cmap, &hexes),
    }
}
fn add_bfrange(cmap: &mut HashMap<Vec<u8>, String>, hexes: &[Vec<u8>]) {
let Some(start) = hex_to_u32(&hexes[0]) else {
return;
};
let Some(end) = hex_to_u32(&hexes[1]) else {
return;
};
let Some(destination) = hex_to_u32(&hexes[2]) else {
return;
};
let source_len = hexes[0].len();
for offset in 0..=(end.saturating_sub(start)).min(512) {
let source = start + offset;
let destination = destination + offset;
cmap.insert(
number_to_be_bytes(source, source_len),
cmap_text_for_codes(source, destination),
);
}
}
/// Adds the mappings of an array-form bfrange entry
/// (`<start> <end> [<d0> <d1> ...]`): source code `start + i` maps to `d_i`.
///
/// At most `end - start + 1` destinations are used, capped at 512.
fn add_bfrange_array(cmap: &mut HashMap<Vec<u8>, String>, hexes: &[Vec<u8>]) {
    let Some(start) = hex_to_u32(&hexes[0]) else {
        return;
    };
    let Some(end) = hex_to_u32(&hexes[1]) else {
        return;
    };
    let source_len = hexes[0].len();
    let limit = (end.saturating_sub(start).saturating_add(1) as usize).min(512);
    let destinations = hexes.iter().skip(2).take(limit);
    for (offset, destination) in (0u32..).zip(destinations) {
        let key = number_to_be_bytes(start + offset, source_len);
        let text = cmap_text_for_mapping(&key, destination);
        cmap.insert(key, text);
    }
}
/// Returns the text for one source-to-destination mapping.
///
/// A destination longer than two bytes is decoded as UTF-16BE text. A shorter
/// one is treated as a single code and goes through `cmap_text_for_codes`,
/// falling back to UTF-16BE decoding if either value cannot be read as a
/// number.
fn cmap_text_for_mapping(source: &[u8], destination: &[u8]) -> String {
    if destination.len() <= 2 {
        if let Some(source_code) = hex_to_u32(source) {
            if let Some(destination_code) = hex_to_u32(destination) {
                return cmap_text_for_codes(source_code, destination_code);
            }
        }
    }
    utf16be_hex_to_string(destination)
}
/// Returns the text for a mapping from `source` to code point `destination`.
///
/// When the destination is in the private-use range and
/// `private_use_source_ascii` yields a character for the source, that
/// character is used instead. Otherwise the destination code point is used,
/// or an empty string when it is not a valid character.
fn cmap_text_for_codes(source: u32, destination: u32) -> String {
    let substitute = if is_private_use_text_code(destination) {
        private_use_source_ascii(source)
    } else {
        None
    };
    match substitute.or_else(|| char::from_u32(destination)) {
        Some(character) => character.to_string(),
        None => String::new(),
    }
}
/// Reports whether `code` lies in `U+E000..=U+F8FF`.
fn is_private_use_text_code(code: u32) -> bool {
    matches!(code, 0xe000..=0xf8ff)
}
/// Maps a source code to the printable ASCII character at `source + 28`, or
/// `None` when that value falls outside `0x20..=0x7e`.
///
/// The addition is checked, so source codes near `u32::MAX` (reachable from
/// four-byte codes in untrusted input) yield `None` instead of overflowing.
// NOTE(review): the offset of 28 presumably reflects a glyph ordering used by
// some subset fonts; nothing in this file establishes where it comes from.
fn private_use_source_ascii(source: u32) -> Option<char> {
    let ascii = source.checked_add(28)?;
    if (0x20..=0x7e).contains(&ascii) {
        char::from_u32(ascii)
    } else {
        None
    }
}
/// Returns the decoded contents of every `<...>` span in `line`, in order.
///
/// A `<` immediately followed by another `<` does not open a span, and a `<`
/// with no later `>` is ignored.
fn hex_strings_in_line(line: &str) -> Vec<Vec<u8>> {
    let bytes = line.as_bytes();
    let mut found = Vec::new();
    let mut cursor = 0;
    while let Some(&byte) = bytes.get(cursor) {
        let opens_hex = byte == b'<' && bytes.get(cursor + 1) != Some(&b'<');
        let length = if opens_hex {
            bytes[cursor + 1..].iter().position(|candidate| *candidate == b'>')
        } else {
            None
        };
        match length {
            Some(length) => {
                let start = cursor + 1;
                found.push(decode_hex(&bytes[start..start + length]));
                // Resume just past the closing `>`.
                cursor = start + length + 1;
            }
            None => cursor += 1,
        }
    }
    found
}
/// Decodes `bytes` as UTF-16BE, replacing invalid sequences and ignoring a
/// trailing odd byte. Input shorter than two bytes is mapped byte-for-byte to
/// the characters U+0000..=U+00FF.
fn utf16be_hex_to_string(bytes: &[u8]) -> String {
    if bytes.len() < 2 {
        return bytes.iter().map(|byte| char::from(*byte)).collect();
    }
    let units: Vec<u16> = bytes
        .chunks_exact(2)
        .map(|pair| (u16::from(pair[0]) << 8) | u16::from(pair[1]))
        .collect();
    String::from_utf16_lossy(&units)
}
/// Interprets `bytes` as a big-endian unsigned integer.
///
/// Always returns `Some`. With more than four bytes the higher-order bytes are
/// shifted out, so only the last four contribute.
fn hex_to_u32(bytes: &[u8]) -> Option<u32> {
    let value = bytes
        .iter()
        .fold(0u32, |acc, byte| (acc << 8) | u32::from(*byte));
    Some(value)
}
/// Encodes `value` as exactly `len` big-endian bytes.
///
/// With `len` below four the high-order bytes are dropped. With `len` above
/// four the result is left-padded with zeros; shifting a `u32` by 32 bits or
/// more to produce those bytes would be an overflow.
fn number_to_be_bytes(value: u32, len: usize) -> Vec<u8> {
    let bytes = value.to_be_bytes();
    (0..len)
        .rev()
        // `index` counts bytes from the least significant end.
        .map(|index| if index < 4 { bytes[3 - index] } else { 0 })
        .collect()
}
/// Collects `/Name <object> <generation> R` entries from `text` into a map
/// from name (without the slash) to object number.
///
/// Names whose value is not an indirect reference are skipped. The generation
/// number is parsed but not kept, and the object number is narrowed to `u32`
/// with `as`. If a name occurs more than once, the last reference wins.
fn parse_named_refs(text: &str) -> HashMap<String, u32> {
let mut refs = HashMap::new();
let bytes = text.as_bytes();
let mut pos = 0;
while pos < bytes.len() {
// Look for a `/` that starts a name; a `/` directly followed by another `/` is passed over.
if bytes[pos] != b'/' || bytes.get(pos + 1) == Some(&b'/') {
pos += 1;
continue;
}
pos += 1;
let name_start = pos;
while pos < bytes.len() && !is_delimiter_or_ws(bytes[pos]) {
pos += 1;
}
let name = lossy(&bytes[name_start..pos]);
while pos < bytes.len() && is_ws(bytes[pos]) {
pos += 1;
}
// The cursor already moved past the `/`, so retrying from here still makes progress.
let Some((object, after_object)) = parse_unsigned_at(bytes, pos) else {
continue;
};
let Some(after_space) = skip_required_ws(bytes, after_object) else {
pos += 1;
continue;
};
let Some((_generation, after_generation)) = parse_unsigned_at(bytes, after_space) else {
pos += 1;
continue;
};
let Some(after_space) = skip_required_ws(bytes, after_generation) else {
pos += 1;
continue;
};
if bytes.get(after_space) == Some(&b'R') {
refs.insert(name, object as u32);
pos = after_space + 1;
}
// Without the `R`, `pos` stays on the first digit; the next iteration steps past it.
}
refs
}
/// Returns the object number of every indirect reference
/// (`<object> <generation> R`) found in `text`, in order of appearance.
fn parse_refs(text: &str) -> Vec<usize> {
    /// Parses a reference starting exactly at `at`; yields the object number
    /// and the offset just past the `R`.
    fn reference_at(bytes: &[u8], at: usize) -> Option<(usize, usize)> {
        let (object, cursor) = parse_unsigned_at(bytes, at)?;
        let cursor = skip_required_ws(bytes, cursor)?;
        let (_generation, cursor) = parse_unsigned_at(bytes, cursor)?;
        let cursor = skip_required_ws(bytes, cursor)?;
        (bytes.get(cursor) == Some(&b'R')).then_some((object, cursor + 1))
    }

    let bytes = text.as_bytes();
    let mut refs = Vec::new();
    let mut pos = 0;
    while pos < bytes.len() {
        match reference_at(bytes, pos) {
            Some((object, next)) => {
                refs.push(object);
                pos = next;
            }
            // No reference starts here; try the next byte.
            None => pos += 1,
        }
    }
    refs
}
/// Returns the numbers in the first `[...]` that follows the first occurrence
/// of `key`.
///
/// Whitespace-separated tokens that do not parse as `f32` are skipped.
/// Returns `None` when the key, the `[`, or the closing `]` is missing.
fn parse_number_array_after(text: &str, key: &str) -> Option<Vec<f32>> {
    let (_, rest) = text.split_once(key)?;
    let (_, after_open) = rest.split_once('[')?;
    let (inner, _) = after_open.split_once(']')?;
    let mut numbers = Vec::new();
    for token in inner.split_whitespace() {
        if let Ok(value) = token.parse::<f32>() {
            numbers.push(value);
        }
    }
    Some(numbers)
}
/// Returns the number that follows the first occurrence of `key`.
///
/// Whitespace and `[` / `]` between the key and the number are skipped. The
/// number is the maximal run of sign, dot and digit characters; `None` is
/// returned when that run is empty or does not parse as `f32`.
fn parse_number_after(text: &str, key: &str) -> Option<f32> {
    let bytes = text.as_bytes();
    let after_key = text.find(key)? + key.len();
    let number_start = after_key
        + bytes[after_key..]
            .iter()
            .take_while(|byte| is_ws(**byte) || matches!(**byte, b'[' | b']'))
            .count();
    let number_len = bytes[number_start..]
        .iter()
        .take_while(|byte| matches!(**byte, b'+' | b'-' | b'.' | b'0'..=b'9'))
        .count();
    if number_len == 0 {
        return None;
    }
    text[number_start..number_start + number_len].parse().ok()
}
/// Decodes the first operand as text, when there is one and it is a string.
fn first_text_operand(
    operands: &[Operand],
    state: &GraphicsState,
    fonts: &HashMap<String, Arc<FontDecoder>>,
) -> Option<String> {
    let first = operands.first()?;
    operand_text(first, state, fonts)
}
/// Decodes a literal or hex string operand using the decoder of the font
/// currently selected in `state`, if that font is known. Returns `None` for
/// any other kind of operand.
fn operand_text(
    operand: &Operand,
    state: &GraphicsState,
    fonts: &HashMap<String, Arc<FontDecoder>>,
) -> Option<String> {
    let (Operand::Literal(bytes) | Operand::Hex(bytes)) = operand else {
        return None;
    };
    let font: Option<&FontDecoder> = state
        .font_name
        .as_ref()
        .and_then(|font_name| fonts.get(font_name))
        .map(|font| font.as_ref());
    Some(decode_pdf_text(bytes, font))
}
fn text_from_array(
items: &[Operand],
state: &GraphicsState,
fonts: &HashMap<String, Arc<FontDecoder>>,
) -> String {
let space_width = space_advance_width(state, fonts).max(state.font_size * 0.04);
let gap_threshold = space_width * SPACE_GAP_FRACTION;
let mut text = String::new();
for item in items {
match item {
Operand::Number(value) => {
let gap = -value / 1000.0 * state.font_size * state.horizontal_scaling;
if gap >= gap_threshold && !text.ends_with(' ') {
text.push(' ');
}
}
_ => {
if let Some(part) = operand_text(item, state, fonts) {
text.push_str(&part);
}
}
}
}
text
}
/// Fraction of the space width that a numeric adjustment inside a text array
/// must reach before `text_from_array` inserts a space.
const SPACE_GAP_FRACTION: f32 = 0.3;
/// Decodes string bytes to text.
///
/// In order of preference: the font's `cmap` when it has entries; the font's
/// per-byte decoding when it has encoding overrides; UTF-16BE when the bytes
/// start with the `FE FF` byte-order mark; otherwise each byte becomes the
/// character with the same code point.
fn decode_pdf_text(bytes: &[u8], font: Option<&FontDecoder>) -> String {
    match font {
        Some(font) if !font.cmap.is_empty() => return decode_with_cmap(bytes, font),
        Some(font) if !font.encoding.is_empty() => {
            return bytes.iter().map(|byte| font.decode_byte(*byte)).collect();
        }
        _ => {}
    }
    match bytes.strip_prefix(&[0xfe, 0xff]) {
        Some(payload) => {
            let units: Vec<u16> = payload
                .chunks_exact(2)
                .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
                .collect();
            String::from_utf16_lossy(&units)
        }
        None => bytes.iter().map(|byte| char::from(*byte)).collect(),
    }
}
/// Decodes `bytes` with the font's `cmap`, preferring the longest matching
/// source code at each position (up to `max_code_len` bytes).
///
/// A byte that starts no known code is decoded on its own with
/// `FontDecoder::decode_byte`.
fn decode_with_cmap(bytes: &[u8], font: &FontDecoder) -> String {
    let mut output = String::new();
    let mut rest = bytes;
    while let Some(&first) = rest.first() {
        let longest = font.max_code_len.min(rest.len()).max(1);
        let hit = (1..=longest)
            .rev()
            .find_map(|len| font.cmap.get(&rest[..len]).map(|text| (len, text)));
        match hit {
            Some((len, text)) => {
                output.push_str(text);
                rest = &rest[len..];
            }
            None => {
                output.push_str(&font.decode_byte(first));
                rest = &rest[1..];
            }
        }
    }
    output
}
/// Maps a font glyph name to the text it represents.
///
/// Lookup order:
/// 1. the built-in table below (ASCII punctuation and digits, ligatures,
///    Greek letters, and common math symbols; sized or extensible variants
///    such as `parenleftBig` collapse to their base character);
/// 2. a name that is exactly one character long maps to itself;
/// 3. anything else is handed to [`unicode_glyph_name_to_text`].
///
/// Returns `None` when the name is not recognised by any of these steps.
fn glyph_name_to_text(name: &str) -> Option<String> {
    let text = match name {
        "space" => " ",
        "exclam" => "!",
        "quotedbl" => "\"",
        "numbersign" => "#",
        "dollar" => "$",
        "percent" => "%",
        "ampersand" => "&",
        "quotesingle" | "quoteright" | "quoteleft" => "'",
        "parenleft" | "parenleftbig" | "parenleftBig" | "parenleftbigg" | "parenleftBigg" => "(",
        "parenright" | "parenrightbig" | "parenrightBig" | "parenrightbigg" | "parenrightBigg" => {
            ")"
        }
        "asterisk" | "asteriskmath" => "*",
        "plus" => "+",
        "comma" => ",",
        "hyphen" => "-",
        "period" => ".",
        "slash" => "/",
        "zero" => "0",
        "one" => "1",
        "two" => "2",
        "three" => "3",
        "four" => "4",
        "five" => "5",
        "six" => "6",
        "seven" => "7",
        "eight" => "8",
        "nine" => "9",
        "colon" => ":",
        "semicolon" => ";",
        "less" => "<",
        "equal" => "=",
        "greater" => ">",
        "question" => "?",
        "at" => "@",
        "bracketleft" => "[",
        "backslash" => "\\",
        "bracketright" => "]",
        "circumflex" | "hatwide" | "hatwider" | "hatwidest" => "^",
        "underscore" => "_",
        "braceleft" | "braceleftBig" | "braceleftBigg" | "bracelefttp" | "braceleftbt"
        | "braceleftmid" => "{",
        "bar" | "vextendsingle" | "braceex" => "|",
        "braceright" | "bracerightBig" => "}",
        "tilde" | "tildewide" => "~",
        "ff" => "ff",
        "fi" => "fi",
        "fl" => "fl",
        "ffi" => "ffi",
        "ffl" => "ffl",
        "Gamma" => "Γ",
        "Theta" => "Θ",
        "Lambda" => "Λ",
        "Pi" => "Π",
        "Sigma" => "Σ",
        "Phi" => "Φ",
        "Omega" => "Ω",
        "alpha" => "α",
        "beta" => "β",
        "gamma" => "γ",
        "delta" => "δ",
        "epsilon" => "ε",
        "zeta" => "ζ",
        "lambda" => "λ",
        "mu" => "μ",
        "pi" | "pi1" => "π",
        "rho" => "ρ",
        "sigma" => "σ",
        "tau" => "τ",
        "phi" => "φ",
        "chi" => "χ",
        "omega" => "ω",
        "partialdiff" => "∂",
        "minus" => "−",
        "periodcentered" => "·",
        "multiply" => "×",
        "plusminus" => "±",
        "circlemultiply" => "⊗",
        "openbullet" | "bullet" => "•",
        "lessequal" => "≤",
        "greaterequal" => "≥",
        "similar" => "∼",
        "arrowright" => "→",
        "mapsto" => "↦",
        "prime" => "′",
        "infinity" => "∞",
        "element" => "∈",
        "universal" => "∀",
        "union" | "uniontext" | "uniondisplay" => "∪",
        "intersection" | "intersectiontext" | "intersectiondisplay" => "∩",
        // Adobe Glyph List: reflexsubset is U+2286, reflexsuperset is U+2287.
        "reflexsubset" => "\u{2286}",
        "reflexsuperset" => "\u{2287}",
        "summationtext" | "summationdisplay" => "∑",
        "productdisplay" => "∏",
        "integraldisplay" => "∫",
        "circleplusdisplay" => "⊕",
        "unionsqdisplay" => "⊔",
        // U+0338 COMBINING LONG SOLIDUS OVERLAY.
        "negationslash" => "\u{0338}",
        _ if name.chars().count() == 1 => name,
        _ => return unicode_glyph_name_to_text(name),
    };
    Some(text.to_owned())
}

/// Decodes the `uniXXXX…` and `uXXXX[XX]` glyph-name forms into text.
///
/// * `uni` followed by one or more groups of exactly four hex digits yields
///   one character per group.
/// * `u` followed by four to six hex digits yields a single character.
///
/// Returns `None` for any other shape, for non-hex digits, and for values
/// that are not valid Unicode scalar values (e.g. surrogates).
fn unicode_glyph_name_to_text(name: &str) -> Option<String> {
    // `u32::from_str_radix` tolerates a leading `+`, which would let names such
    // as `uni+041` decode successfully; require every digit to be hexadecimal.
    let is_hex = |digits: &str| digits.bytes().all(|byte| byte.is_ascii_hexdigit());

    if let Some(hex) = name.strip_prefix("uni") {
        if hex.len() >= 4 && hex.len() % 4 == 0 {
            if !is_hex(hex) {
                return None;
            }
            return hex
                .as_bytes()
                .chunks_exact(4)
                .map(|chunk| {
                    let chunk = std::str::from_utf8(chunk).ok()?;
                    let code = u32::from_str_radix(chunk, 16).ok()?;
                    char::from_u32(code)
                })
                .collect();
        }
    }
    if let Some(hex) = name.strip_prefix('u') {
        if (4..=6).contains(&hex.len()) {
            if !is_hex(hex) {
                return None;
            }
            let code = u32::from_str_radix(hex, 16).ok()?;
            return char::from_u32(code).map(|character| character.to_string());
        }
    }
    None
}
/// Returns the last `count` operands as numbers.
///
/// Yields `None` when fewer than `count` operands are available or when any
/// of the trailing `count` operands is not an `Operand::Number`.
fn numbers(operands: &[Operand], count: usize) -> Option<Vec<f32>> {
    let first = operands.len().checked_sub(count)?;
    operands[first..]
        .iter()
        .map(|operand| {
            if let Operand::Number(value) = operand {
                Some(*value)
            } else {
                None
            }
        })
        .collect()
}
/// Renders a block as plain text.
///
/// * Text blocks return a copy of their text.
/// * Tables return one line per row with cells joined by a single space; a
///   non-empty header row comes first.
/// * Figures return their caption, or an empty string when there is none.
fn block_text(block: &Block) -> String {
    match block {
        Block::Text(text) => text.text.clone(),
        Block::Table(table) => {
            let header_line = if table.headers.is_empty() {
                None
            } else {
                Some(table.headers.join(" "))
            };
            header_line
                .into_iter()
                .chain(table.rows.iter().map(|row| row.join(" ")))
                .collect::<Vec<_>>()
                .join("\n")
        }
        Block::Figure(figure) => figure.caption.clone().unwrap_or_default(),
    }
}
/// Classifies a line as a heading level or a paragraph from its font size
/// relative to the body size.
///
/// Empty lines, lines of 200 or more characters, and non-positive sizes are
/// always `"paragraph"`. Otherwise the ratio `line_size / body_size` selects
/// `"heading_1"` (>= 1.5), `"heading_2"` (>= 1.3), `"heading_3"` (>= 1.12),
/// or `"paragraph"`.
fn classify_text_line(text: &str, line_size: f32, body_size: f32) -> String {
    let length = text.chars().count();
    let heading_candidate = (1..200).contains(&length);
    if !heading_candidate || body_size <= 0.0 || line_size <= 0.0 {
        return "paragraph".to_owned();
    }
    let label = match line_size / body_size {
        ratio if ratio >= 1.5 => "heading_1",
        ratio if ratio >= 1.3 => "heading_2",
        ratio if ratio >= 1.12 => "heading_3",
        _ => "paragraph",
    };
    label.to_owned()
}
/// Returns the font size of the run that contributes the most characters to
/// the line.
///
/// Runs with a non-positive size are ignored. When several runs tie on
/// character count, the later run wins. Returns `0.0` when no run qualifies.
fn line_dominant_size(line: &TextLine) -> f32 {
    let (_, dominant) = line
        .runs
        .iter()
        .fold((0usize, 0.0f32), |best, run| {
            if run.size <= 0.0 {
                return best;
            }
            let chars = run.text.chars().count();
            if chars >= best.0 {
                (chars, run.size)
            } else {
                best
            }
        });
    dominant
}
/// Estimates the body font size of a page as the size carrying the most
/// characters.
///
/// Run sizes are bucketed to the nearest half unit and weighted by character
/// count; runs with a non-positive size are skipped. Returns `0.0` when there
/// is nothing to measure.
fn page_body_size(lines: &[TextLine]) -> f32 {
    // An insertion-ordered list keeps tie-breaking between buckets stable.
    let mut tallies: Vec<(u32, usize)> = Vec::new();
    for run in lines.iter().flat_map(|line| line.runs.iter()) {
        if run.size <= 0.0 {
            continue;
        }
        let bucket = (run.size * 2.0).round() as u32;
        let chars = run.text.chars().count();
        match tallies.iter().position(|(key, _)| *key == bucket) {
            Some(slot) => tallies[slot].1 += chars,
            None => tallies.push((bucket, chars)),
        }
    }
    tallies
        .into_iter()
        .max_by_key(|(_, chars)| *chars)
        .map_or(0.0, |(bucket, _)| bucket as f32 / 2.0)
}
/// Collects the source object ids referenced by the runs of `line`.
///
/// Thin wrapper that forwards `line.runs` to `source_ids_for_runs`.
fn source_ids_for_line(line: &TextLine) -> Vec<String> {
source_ids_for_runs(&line.runs)
}
fn source_ids_for_runs(runs: &[TextRun]) -> Vec<String> {
let mut ids = Vec::new();
for run in runs {
for id in &run.source_object_ids {
if !ids.contains(id) {
ids.push(id.clone());
}
}
}
ids
}
/// Builds a `SourceAnchor` for the given page, optional bounding box, and
/// object ids, with the extraction method fixed to `"native_pdf"`.
fn anchor(page_number: usize, bbox: Option<BBox>, pdf_object_ids: Vec<String>) -> SourceAnchor {
    let extraction_method = String::from("native_pdf");
    SourceAnchor {
        extraction_method,
        bbox,
        pdf_object_ids,
        page_number,
    }
}
/// Builds a `Warning` from borrowed strings.
///
/// When `page_number` is given, the warning carries an anchor for that page
/// with no bounding box and no object ids; otherwise it has no anchor.
fn warning(code: &str, severity: &str, message: &str, page_number: Option<usize>) -> Warning {
    let source_anchor = page_number.map(|page| anchor(page, None, Vec::new()));
    Warning {
        code: String::from(code),
        severity: String::from(severity),
        message: String::from(message),
        source_anchor,
    }
}
// Unit tests for assembling the text of a line from its runs.
#[cfg(test)]
mod tests {
use super::*;
// Checks that `text_from_line_runs` joins three runs into plain,
// space-separated text even though the middle run ("24") is set at size 8
// and sits 2 units away in y from its size-12 neighbours.
#[test]
fn text_from_line_runs_does_not_treat_slash_prose_page_number_as_script() {
let line = TextLine {
runs: vec![
test_run("Art Cutting / Bates Technical College", 72.0, 720.0, 12.0),
test_run("24", 300.0, 722.0, 8.0),
test_run("Core Competencies", 315.0, 720.0, 12.0),
],
bbox: BBox {
x: 72.0,
y: 720.0,
width: 360.0,
height: 12.0,
},
baseline_y: 720.0,
};
assert_eq!(
text_from_line_runs(&line),
"Art Cutting / Bates Technical College 24 Core Competencies"
);
}
// Builds a plain (non-bold, non-italic, font-less) run at `(x, y)`.
// Width is approximated as 0.4 * size per byte of text and the space
// width as a quarter of the size; the baseline is set to `y`.
fn test_run(text: &str, x: f32, y: f32, size: f32) -> TextRun {
TextRun {
text: text.to_owned(),
bbox: BBox {
x,
y,
width: text.len() as f32 * size * 0.4,
height: size,
},
baseline_y: y,
font: None,
size,
space_width: size * 0.25,
bold: false,
italic: false,
source_object_ids: Vec::new(),
}
}
}
/// Returns the smallest box that encloses every box in `boxes`, or `None`
/// when the iterator is empty.
fn union_boxes(boxes: impl IntoIterator<Item = BBox>) -> Option<BBox> {
    let mut remaining = boxes.into_iter();
    let seed = remaining.next()?;
    // Track the extents as (min_x, min_y, max_x, max_y).
    let start = (seed.x, seed.y, seed.x + seed.width, seed.y + seed.height);
    let (lo_x, lo_y, hi_x, hi_y) = remaining.fold(start, |(lo_x, lo_y, hi_x, hi_y), bbox| {
        (
            lo_x.min(bbox.x),
            lo_y.min(bbox.y),
            hi_x.max(bbox.x + bbox.width),
            hi_y.max(bbox.y + bbox.height),
        )
    });
    Some(BBox {
        x: lo_x,
        y: lo_y,
        width: hi_x - lo_x,
        height: hi_y - lo_y,
    })
}
fn extract_info_string(objects: &[PdfObject], key: &str) -> Option<String> {
let needle = format!("/{key}");
objects.iter().find_map(|object| {
let body = lossy(&object.body);
if !(body.contains("/Producer") || body.contains("/Creator") || body.contains("/Author")) {
return None;
}
let start = body.find(&needle)?;
let rest = &object.body[start + needle.len()..];
let open = rest.iter().position(|byte| *byte == b'(')?;
let mut parser = ContentParser::new(&rest[open..]);
match parser.next_operand_or_operator()? {
ContentToken::Operand(Operand::Literal(bytes)) => Some(decode_pdf_text(&bytes, None)),
_ => None,
}
})
}
/// Extracts the version from a `%PDF-x.y` header.
///
/// Only the first line (up to the first `\n` or `\r`) is inspected; it must
/// be valid UTF-8 and start with `%PDF-`. Everything after that prefix is
/// returned verbatim.
fn pdf_version(bytes: &[u8]) -> Option<String> {
    let line_end = bytes
        .iter()
        .position(|byte| matches!(*byte, b'\n' | b'\r'))
        .unwrap_or(bytes.len());
    let header = std::str::from_utf8(&bytes[..line_end]).ok()?;
    let version = header.strip_prefix("%PDF-")?;
    Some(version.to_owned())
}
/// Decodes a run of hexadecimal digits into bytes.
///
/// Whitespace (as defined by `is_ws`) is dropped first. Digits are then
/// consumed in pairs; a missing final digit and any non-hex character both
/// count as `0`.
fn decode_hex(bytes: &[u8]) -> Vec<u8> {
    let digits: Vec<u8> = bytes
        .iter()
        .copied()
        .filter(|byte| !is_ws(*byte))
        .collect();
    let nibble = |digit: Option<&u8>| digit.and_then(|byte| hex_value(*byte)).unwrap_or(0);
    digits
        .chunks(2)
        .map(|pair| (nibble(pair.first()) << 4) | nibble(pair.get(1)))
        .collect()
}
/// Returns the numeric value (0–15) of an ASCII hexadecimal digit, accepting
/// both upper- and lower-case letters, or `None` for any other byte.
fn hex_value(byte: u8) -> Option<u8> {
    // `to_digit(16)` yields at most 15, so narrowing to `u8` cannot truncate.
    char::from(byte).to_digit(16).map(|digit| digit as u8)
}
/// Parses a run of ASCII decimal digits starting at `pos`.
///
/// Returns the parsed value together with the index just past the last
/// digit. Returns `None` when there is no digit at `pos` (including when
/// `pos` is at or beyond the end of `bytes`) or when the value does not fit
/// in `usize`.
fn parse_unsigned_at(bytes: &[u8], pos: usize) -> Option<(usize, usize)> {
    let tail = bytes.get(pos..)?;
    let digit_count = tail
        .iter()
        .take_while(|byte| byte.is_ascii_digit())
        .count();
    if digit_count == 0 {
        return None;
    }
    let digits = std::str::from_utf8(&tail[..digit_count]).ok()?;
    let value = digits.parse::<usize>().ok()?;
    Some((value, pos + digit_count))
}
fn skip_required_ws(bytes: &[u8], mut pos: usize) -> Option<usize> {
if pos >= bytes.len() || !is_ws(bytes[pos]) {
return None;
}
while pos < bytes.len() && is_ws(bytes[pos]) {
pos += 1;
}
Some(pos)
}
/// Reports whether `pos` is at the start of a line: either at offset 0 or
/// immediately after a `\n` or `\r`.
///
/// Despite the name, a preceding space or tab does not qualify. A position
/// whose predecessor lies outside `bytes` is reported as `false` rather than
/// panicking.
fn is_ws_or_line_start(bytes: &[u8], pos: usize) -> bool {
    match pos.checked_sub(1) {
        None => true,
        Some(previous) => matches!(bytes.get(previous), Some(b'\n' | b'\r')),
    }
}
/// Reports whether `byte` ends a token: whitespace (per `is_ws`) or one of
/// `[`, `]`, `<`, `>`, `/`, `(`, `)`.
///
/// Note that `{`, `}`, and `%` are not treated as delimiters here.
fn is_delimiter_or_ws(byte: u8) -> bool {
    const DELIMITERS: &[u8] = b"[]<>/()";
    DELIMITERS.contains(&byte) || is_ws(byte)
}
/// Reports whether `byte` is NUL, horizontal tab, line feed, form feed,
/// carriage return, or space.
fn is_ws(byte: u8) -> bool {
    const WHITESPACE: [u8; 6] = [0x00, 0x09, 0x0a, 0x0c, 0x0d, 0x20];
    WHITESPACE.contains(&byte)
}
/// Returns the index of the first occurrence of `needle` in `haystack`.
///
/// An empty `needle` matches at index 0, mirroring `str::find("")`. Without
/// that guard `slice::windows` would panic, because it rejects a window
/// size of zero.
fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
/// Reports whether `name` occurs anywhere in `bytes`.
///
/// This is a plain byte-substring test: `name` also matches when it is only
/// a prefix of a longer token.
fn contains_name(bytes: &[u8], name: &[u8]) -> bool {
    matches!(find_subslice(bytes, name), Some(_))
}
/// Converts `bytes` to an owned `String`, replacing invalid UTF-8 sequences
/// with U+FFFD.
fn lossy(bytes: &[u8]) -> String {
    let text = String::from_utf8_lossy(bytes);
    text.into_owned()
}
/// Returns the SHA-256 digest of `bytes` as a lower-case hexadecimal string.
#[allow(dead_code)]
fn sha256_hex(bytes: &[u8]) -> String {
    let mut hex = String::new();
    for byte in Sha256::digest(bytes).iter() {
        hex.push_str(&format!("{byte:02x}"));
    }
    hex
}