use std::collections::{BTreeMap, HashMap};
use lopdf::{Dictionary, Object, ObjectId};
use crate::error::Result;
/// A positioned run of text produced by the extraction functions in this module.
///
/// The geometry helpers in this file read `x` as the left edge, `x + width` as
/// the right edge, and sort larger `y` values first, so coordinates presumably
/// follow PDF user space with y growing upward — TODO confirm against the
/// content-stream parser, which is not part of this excerpt.
#[non_exhaustive]
#[derive(Debug, Clone, PartialEq)]
pub struct TextFragment {
/// Text content of the run.
pub text: String,
/// Horizontal position; treated as the left edge by the bounds/column helpers.
pub x: f32,
/// Vertical position; `text_fragment_bounds` extends the box from
/// `y - 0.25 * font_size` to `y + 0.75 * font_size`, i.e. treats it like a baseline.
pub y: f32,
/// Horizontal extent; consumers clamp negative values to `0.0`.
pub width: f32,
/// Vertical extent; consumers clamp negative values to `0.0`.
pub height: f32,
/// Font size used for vertical bounds and for line/row tolerances.
pub font_size: f32,
/// Presumably the font resource name selected by the content stream — TODO confirm.
pub font_name: String,
/// Three colour components; presumably RGB fill colour — TODO confirm.
pub color: [f32; 3],
/// Fragments with this flag set are ignored by `detect_text_columns` and
/// `extract_table_cells`.
pub invisible: bool,
/// Presumably mirrors `FontInfo::is_bold` — TODO confirm.
pub is_bold: bool,
/// Presumably mirrors `FontInfo::is_italic` — TODO confirm.
pub is_italic: bool,
/// Presumably mirrors `FontInfo::font_family` — TODO confirm.
pub font_family: String,
/// Presumably mirrors `FontInfo::base_font` — TODO confirm.
pub base_font: String,
/// NOTE(review): looks like the advance of a space glyph; units not visible here.
pub space_advance: f32,
/// NOTE(review): presumably the raw `Tf` operand size before matrix scaling — confirm.
pub tf_font_size: f32,
/// NOTE(review): presumably the vertical scale taken from the text matrix — confirm.
pub tm_y_scale: f32,
/// Index of the originating page content stream; the page-level extractors pass
/// `Some(stream_idx)` to the parser for page streams and `None` for XObject content.
pub source_stream: Option<usize>,
/// NOTE(review): presumably the first operator index of the run in its stream — confirm.
pub source_op_start: Option<usize>,
/// NOTE(review): presumably the last operator index of the run in its stream — confirm.
pub source_op_end: Option<usize>,
/// Object id (number, generation) of the originating Form XObject; the extractors
/// pass `Some(xobj_id)` to the parser for XObject content and `None` for page streams.
pub source_xobject: Option<(u32, u16)>,
/// NOTE(review): presumably the text-matrix origin x — confirm.
pub tm_origin_x: Option<f32>,
/// NOTE(review): presumably the text-matrix origin y — confirm.
pub tm_origin_y: Option<f32>,
/// NOTE(review): presumably the horizontal scale from the text matrix — confirm.
pub tm_x_scale: Option<f32>,
/// Line-matrix x, when known. `extract_table_cells` uses it as the column anchor
/// when more than half of the fragments carry it.
pub tm_lm_x: Option<f32>,
/// NOTE(review): presumably the line-matrix y counterpart of `tm_lm_x` — confirm.
pub tm_lm_y: Option<f32>,
}
/// Category of a non-fatal problem reported by the `*_verbose` extraction functions.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub enum WarningKind {
/// A page content stream could not be decompressed and its raw bytes were used instead.
StreamDecompressFailed,
/// An XObject could not be decoded as a Form XObject and was not parsed for text.
XObjectSkipped,
}
/// A non-fatal diagnostic collected while extracting text from a page.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct ExtractionWarning {
/// What went wrong.
pub kind: WarningKind,
/// Object id (number, generation) of the stream or XObject concerned, when known.
pub stream_id: Option<(u32, u16)>,
/// Human-readable description of the problem.
pub message: String,
}
/// Decoding and metrics data collected for one font resource.
pub(crate) struct FontInfo {
    /// Character code → Unicode character mapping (from `ToUnicode` or an encoding table).
    pub(crate) to_unicode: BTreeMap<u16, char>,
    /// Width returned by [`FontInfo::advance_width`] when no run covers the glyph.
    pub(crate) dw: u32,
    /// Explicit width runs, searched in order.
    pub(crate) w_runs: Vec<WidthRun>,
    /// Bytes per character code: `2` for Type0 fonts, `1` for simple fonts.
    pub(crate) bytes_per_char: u8,
    /// Set for Type0 fonts that have no usable `ToUnicode` map but an identity encoding.
    pub(crate) identity_fallback: bool,
    /// `BaseFont` name with any subset prefix (`ABCDEF+`) removed.
    pub(crate) base_font: String,
    /// Whether the font name contains a bold-like keyword.
    pub(crate) is_bold: bool,
    /// Whether the font name contains an italic-like keyword.
    pub(crate) is_italic: bool,
    /// Font name up to the first `-` or `,`.
    pub(crate) font_family: String,
}

/// A contiguous range of glyph widths starting at `start_gid`.
pub(crate) struct WidthRun {
    /// Glyph id / character code of `widths[0]`.
    pub(crate) start_gid: u16,
    /// Widths for `start_gid`, `start_gid + 1`, …
    pub(crate) widths: Vec<u32>,
}

impl FontInfo {
    /// Returns the advance width for `gid`.
    ///
    /// The first run (in `w_runs` order) that covers `gid` wins; when no run
    /// covers it, the default width `dw` is returned.
    pub(crate) fn advance_width(&self, gid: u16) -> u32 {
        self.w_runs
            .iter()
            .find_map(|run| {
                // `checked_sub` yields `None` when the run starts after `gid`.
                let offset = usize::from(gid.checked_sub(run.start_gid)?);
                run.widths.get(offset).copied()
            })
            .unwrap_or(self.dw)
    }
}
/// Computes the bounding box `[x, y, width, height]` enclosing `fragments`.
///
/// Fragments whose `x`, `y` or `font_size` is not finite are ignored. Each
/// remaining fragment spans `x ..= x + max(width, 0)` horizontally and
/// `y - 0.25 * font_size ..= y + 0.75 * font_size` vertically.
///
/// Returns `None` when no fragment contributes a finite left edge.
pub fn text_fragment_bounds(fragments: &[TextFragment]) -> Option<[f32; 4]> {
    let usable = fragments
        .iter()
        .filter(|f| f.x.is_finite() && f.y.is_finite() && f.font_size.is_finite());

    // (left, right, bottom, top), seeded so that the first fragment always wins.
    let seed = (
        f32::INFINITY,
        f32::NEG_INFINITY,
        f32::INFINITY,
        f32::NEG_INFINITY,
    );
    let (left, right, bottom, top) = usable.fold(seed, |(l, r, b, t), f| {
        (
            l.min(f.x),
            r.max(f.x + f.width.max(0.0)),
            b.min(f.y - f.font_size * 0.25),
            t.max(f.y + f.font_size * 0.75),
        )
    });

    if left.is_finite() {
        Some([left, bottom, (right - left).max(0.0), (top - bottom).max(0.0)])
    } else {
        None
    }
}
/// An axis-aligned rectangle that has been placed on a page.
#[non_exhaustive]
#[derive(Debug, Clone, Default)]
pub struct PlacedBox {
    /// Rectangle as `[x, y, width, height]` (the layout read by `detect_collisions`).
    pub rect: [f32; 4],
}

impl PlacedBox {
    /// Creates a box from an `[x, y, width, height]` rectangle.
    pub fn new(rect: [f32; 4]) -> Self {
        PlacedBox { rect }
    }
}
/// An overlap between two boxes, as reported by `detect_collisions`.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct Collision {
/// Index of the first box in the input slice (always smaller than `index_b`).
pub index_a: usize,
/// Index of the second box in the input slice.
pub index_b: usize,
/// Intersection of the two rectangles as `[x, y, width, height]`.
pub overlap_rect: [f32; 4],
}
/// Finds every pair of boxes whose rectangles overlap with positive area.
///
/// Each unordered pair is tested once (`index_a < index_b`); rectangles that
/// merely touch along an edge are not reported. The check is pairwise, so the
/// cost grows quadratically with `boxes.len()`.
pub fn detect_collisions(boxes: &[PlacedBox]) -> Vec<Collision> {
    let mut collisions = Vec::new();
    for (index_a, first) in boxes.iter().enumerate() {
        let [a_left, a_bottom, a_width, a_height] = first.rect;
        let (a_right, a_top) = (a_left + a_width, a_bottom + a_height);

        for (offset, second) in boxes[index_a + 1..].iter().enumerate() {
            let [b_left, b_bottom, b_width, b_height] = second.rect;
            let (b_right, b_top) = (b_left + b_width, b_bottom + b_height);

            // Intersection rectangle: the innermost edge on every side.
            let left = a_left.max(b_left);
            let bottom = a_bottom.max(b_bottom);
            let right = a_right.min(b_right);
            let top = a_top.min(b_top);

            if right > left && top > bottom {
                collisions.push(Collision {
                    index_a,
                    index_b: index_a + 1 + offset,
                    overlap_rect: [left, bottom, right - left, top - bottom],
                });
            }
        }
    }
    collisions
}
/// Sorts fragments in place into reading order: larger `y` first, then smaller `x`.
///
/// On either axis a finite coordinate sorts before a non-finite one, and two
/// non-finite coordinates compare equal. The sort is stable, so fragments
/// that compare equal keep their relative order.
pub fn sort_by_reading_order(fragments: &mut [TextFragment]) {
    /// Orders two coordinates with finite values first; `descending` only
    /// affects how two finite values compare.
    fn axis_order(lhs: f32, rhs: f32, descending: bool) -> std::cmp::Ordering {
        use std::cmp::Ordering;
        match (lhs.is_finite(), rhs.is_finite()) {
            (true, true) => {
                let ascending = lhs.partial_cmp(&rhs).unwrap_or(Ordering::Equal);
                if descending {
                    ascending.reverse()
                } else {
                    ascending
                }
            }
            (true, false) => Ordering::Less,
            (false, true) => Ordering::Greater,
            (false, false) => Ordering::Equal,
        }
    }

    fragments.sort_by(|a, b| {
        axis_order(a.y, b.y, true).then_with(|| axis_order(a.x, b.x, false))
    });
}
/// A horizontal band of the page that contains text, as found by `detect_text_columns`.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnZone {
/// Left boundary of the zone (inclusive when fragments are assigned to columns).
pub x_start: f32,
/// Right boundary of the zone (exclusive when fragments are assigned to columns).
pub x_end: f32,
}
/// Splits the page horizontally into text columns separated by empty gutters.
///
/// The page width is divided into 5 pt buckets; every visible fragment marks
/// the buckets it covers. Runs of at least 15 pt of unmarked buckets are
/// treated as gutters, and the marked stretches between them become
/// [`ColumnZone`]s.
///
/// Returns an empty vector when `fragments` is empty, when `page_width` is not
/// a finite positive value no larger than 1,000,000 pt, or when no bucket is
/// occupied. Returns a single full-width zone when no gutter is found.
///
/// Fragments that are invisible, have a non-finite `x`, or lie entirely to the
/// left of the page do not mark any bucket.
pub fn detect_text_columns(fragments: &[TextFragment], page_width: f32) -> Vec<ColumnZone> {
    const BUCKET_PT: f32 = 5.0;
    const MIN_GAP_PT: f32 = 15.0;
    // Generous sanity limit: keeps the bucket count (and the allocation below)
    // bounded; float→usize casts saturate, so an unbounded width would make
    // `+ 1` overflow.
    const MAX_PAGE_WIDTH_PT: f32 = 1_000_000.0;

    if fragments.is_empty()
        || !page_width.is_finite()
        || page_width <= 0.0
        || page_width > MAX_PAGE_WIDTH_PT
    {
        return vec![];
    }

    let n = (page_width / BUCKET_PT).ceil() as usize + 1;
    let mut occupied = vec![false; n];
    for frag in fragments {
        // A NaN `x` would cast to bucket 0 and fake text at the left margin.
        if frag.invisible || !frag.x.is_finite() {
            continue;
        }
        let right = frag.x + frag.width.max(0.0);
        if right < 0.0 {
            // Entirely left of the page; the saturating casts would mark bucket 0.
            continue;
        }
        // Negative `x` saturates to bucket 0, which is the intended clamp for
        // fragments that start left of the page but reach into it.
        let lo = (frag.x / BUCKET_PT).floor() as usize;
        let hi = ((right / BUCKET_PT).ceil() as usize).min(n - 1);
        for bucket in occupied.iter_mut().take(hi + 1).skip(lo) {
            *bucket = true;
        }
    }

    // Collect gutters as half-open bucket ranges `[start, end)`.
    let min_gap_buckets = (MIN_GAP_PT / BUCKET_PT).ceil() as usize;
    let mut gaps: Vec<(usize, usize)> = Vec::new();
    let mut gap_start: Option<usize> = None;
    for (i, &occ) in occupied.iter().enumerate() {
        if !occ {
            gap_start.get_or_insert(i);
        } else if let Some(gs) = gap_start.take() {
            if i - gs >= min_gap_buckets {
                gaps.push((gs, i));
            }
        }
    }
    if let Some(gs) = gap_start {
        if n - gs >= min_gap_buckets {
            gaps.push((gs, n));
        }
    }

    if gaps.is_empty() {
        return vec![ColumnZone {
            x_start: 0.0,
            x_end: page_width,
        }];
    }

    // Everything between consecutive gutters is a column.
    let mut zones = Vec::new();
    let mut col_start = 0usize;
    for &(gap_s, gap_e) in &gaps {
        if col_start < gap_s {
            zones.push(ColumnZone {
                x_start: col_start as f32 * BUCKET_PT,
                x_end: gap_s as f32 * BUCKET_PT,
            });
        }
        col_start = gap_e;
    }
    if col_start < n {
        zones.push(ColumnZone {
            x_start: col_start as f32 * BUCKET_PT,
            x_end: page_width,
        });
    }
    zones
}
/// How `group_text_fragments` combines fragments.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GroupingStrategy {
/// One group per input fragment, in input order.
Raw,
/// Fragments on (approximately) the same `y` are merged into one line.
Line,
/// Lines are additionally merged into paragraphs based on their vertical gap.
Paragraph,
}
/// A group of fragments (single fragment, line or paragraph) with combined text and extent.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct TextGroup {
/// Combined text; fragments on a line are joined with a space, lines with `\n`.
pub text: String,
/// The fragments that make up the group.
pub fragments: Vec<TextFragment>,
/// Smallest `x` of the group.
pub x: f32,
/// `y` of the fragment (or first line) that started the group.
pub y: f32,
/// Horizontal extent from `x` to the right-most edge in the group.
pub width: f32,
/// Vertical extent of the group.
pub height: f32,
}
/// Groups fragments according to `strategy`.
///
/// * [`GroupingStrategy::Raw`] — one group per fragment, input order preserved.
/// * [`GroupingStrategy::Line`] — fragments are sorted into reading order and
///   merged while their `y` stays within `max(0.5 * font_size, 2.0)` of the
///   line's first fragment. Texts are joined with a single space.
/// * [`GroupingStrategy::Paragraph`] — consecutive lines are merged (joined
///   with `\n`) while the distance between *adjacent* baselines is at most
///   1.5 × the taller of the two lines.
///
/// A paragraph's `height` is the distance from its first to its last baseline
/// plus the height of its tallest line. Lines whose vertical gap cannot be
/// computed (non-finite `y`) always start a new paragraph.
pub fn group_text_fragments(
    fragments: &[TextFragment],
    strategy: GroupingStrategy,
) -> Vec<TextGroup> {
    /// Builds the group that wraps exactly one fragment.
    fn single_fragment_group(frag: &TextFragment) -> TextGroup {
        TextGroup {
            text: frag.text.clone(),
            fragments: vec![frag.clone()],
            x: frag.x,
            y: frag.y,
            width: frag.width.max(0.0),
            height: frag.height.max(0.0),
        }
    }

    if fragments.is_empty() {
        return vec![];
    }
    if matches!(strategy, GroupingStrategy::Raw) {
        return fragments.iter().map(single_fragment_group).collect();
    }

    let mut sorted = fragments.to_vec();
    sort_by_reading_order(&mut sorted);

    // Pass 1: merge fragments that share a baseline into lines.
    let mut lines: Vec<TextGroup> = Vec::new();
    for frag in &sorted {
        let tol = (frag.font_size * 0.5).max(2.0);
        if let Some(last) = lines.last_mut()
            && last.y.is_finite()
            && (frag.y - last.y).abs() <= tol
        {
            if !last.text.is_empty() && !last.text.ends_with(' ') {
                last.text.push(' ');
            }
            last.text.push_str(&frag.text);
            last.fragments.push(frag.clone());
            let frag_right = frag.x + frag.width.max(0.0);
            let line_right = last.x + last.width;
            last.x = last.x.min(frag.x);
            last.width = frag_right.max(line_right) - last.x;
            last.height = last.height.max(frag.height);
            continue;
        }
        lines.push(single_fragment_group(frag));
    }
    if matches!(strategy, GroupingStrategy::Line) {
        return lines;
    }

    // Pass 2: merge lines into paragraphs. The gap test must look at the
    // previous *line*, not at the paragraph's first baseline or accumulated
    // height; otherwise the threshold grows with every merged line and
    // paragraph breaks stop being detected.
    let mut paragraphs: Vec<TextGroup> = Vec::new();
    let mut prev_line_y = f32::NAN;
    let mut prev_line_height = 0.0f32;
    let mut tallest_line = 0.0f32;
    for line in lines {
        let gap = (prev_line_y - line.y).abs();
        let reference_height = prev_line_height.max(line.height);
        // A NaN gap compares false, so un-positioned lines never merge.
        let continues_paragraph = gap <= reference_height * 1.5;
        prev_line_y = line.y;
        prev_line_height = line.height;

        if continues_paragraph {
            if let Some(para) = paragraphs.last_mut() {
                para.text.push('\n');
                para.text.push_str(&line.text);
                let line_right = line.x + line.width;
                let para_right = para.x + para.width;
                para.x = para.x.min(line.x);
                para.width = line_right.max(para_right) - para.x;
                tallest_line = tallest_line.max(line.height);
                para.height = (para.y - line.y) + tallest_line;
                para.fragments.extend(line.fragments);
                continue;
            }
        }
        tallest_line = line.height;
        paragraphs.push(line);
    }
    paragraphs
}
/// One cell of a table reconstructed by `extract_table_cells`.
#[non_exhaustive]
#[derive(Debug, Clone, PartialEq)]
pub struct TableCell {
    /// Zero-based row index (rows follow reading order).
    pub row: usize,
    /// Zero-based column index.
    pub col: usize,
    /// Trimmed fragment texts of the cell joined with single spaces.
    pub text: String,
    /// Smallest fragment `x` in the cell.
    pub x: f32,
    /// Largest fragment `y` in the cell.
    pub y: f32,
    /// Distance from `x` to the right-most fragment edge.
    pub width: f32,
    /// Largest fragment height in the cell.
    pub height: f32,
    /// The fragments assigned to this cell, ordered by `x`.
    pub fragments: Vec<TextFragment>,
}

impl TableCell {
    /// Returns the cell geometry as `[x, y, width, height]`.
    pub fn bbox(&self) -> [f32; 4] {
        let Self {
            x,
            y,
            width,
            height,
            ..
        } = self;
        [*x, *y, *width, *height]
    }
}
pub fn merge_short_cjk_tails(
fragments: &[TextFragment],
max_chars: usize,
line_height_ratio: f32,
) -> Vec<TextFragment> {
if max_chars == 0 || fragments.is_empty() {
return fragments.to_vec();
}
let mut out: Vec<TextFragment> = Vec::with_capacity(fragments.len());
for frag in fragments {
let non_ws = frag.text.chars().filter(|c| !c.is_whitespace()).count();
let is_tail = non_ws > 0 && non_ws <= max_chars;
if is_tail && let Some(prev) = out.last_mut() {
let y_dist = (prev.y - frag.y).abs();
let threshold = (prev.font_size * line_height_ratio).max(2.0);
if y_dist <= threshold {
prev.text.push_str(&frag.text);
let new_right = (frag.x + frag.width).max(prev.x + prev.width);
prev.width = new_right - prev.x;
prev.height = prev.height.max(frag.height);
continue;
}
}
out.push(frag.clone());
}
out
}
pub fn extract_table_cells(
fragments: &[TextFragment],
page_width: f32,
_page_height: f32,
) -> Vec<TableCell> {
if fragments.is_empty() || page_width <= 0.0 {
return vec![];
}
let mut sorted: Vec<TextFragment> = fragments
.iter()
.filter(|f| !f.invisible && !f.text.trim().is_empty())
.cloned()
.collect();
if sorted.is_empty() {
return vec![];
}
sort_by_reading_order(&mut sorted);
let tm_lm_count = sorted.iter().filter(|f| f.tm_lm_x.is_some()).count();
let use_tm_lm_cols = tm_lm_count > sorted.len() / 2;
let tm_lm_anchors: Vec<f32> = if use_tm_lm_cols {
let mut v: Vec<f32> = sorted.iter().filter_map(|f| f.tm_lm_x).collect();
v.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
v.dedup_by(|a, b| (*a - *b).abs() < 2.0);
v
} else {
vec![]
};
let col_zones: Vec<ColumnZone> = if !use_tm_lm_cols {
let z = detect_text_columns(fragments, page_width);
if z.is_empty() {
return vec![];
}
z
} else {
vec![]
};
let col_for_frag = |frag: &TextFragment| -> usize {
if use_tm_lm_cols {
let lm = frag.tm_lm_x.unwrap_or(frag.x);
tm_lm_anchors
.iter()
.enumerate()
.min_by(|(_, a), (_, b)| {
(lm - *a).abs().partial_cmp(&(lm - *b).abs())
.unwrap_or(std::cmp::Ordering::Equal)
})
.map(|(i, _)| i)
.unwrap_or(0)
} else {
for (i, zone) in col_zones.iter().enumerate() {
if frag.x >= zone.x_start && frag.x < zone.x_end {
return i;
}
}
col_zones
.iter()
.enumerate()
.min_by(|(_, a), (_, b)| {
let da = (frag.x - (a.x_start + a.x_end) * 0.5).abs();
let db = (frag.x - (b.x_start + b.x_end) * 0.5).abs();
da.partial_cmp(&db).unwrap_or(std::cmp::Ordering::Equal)
})
.map(|(i, _)| i)
.unwrap_or(0)
}
};
let row_tol = {
let first_fs = sorted
.iter()
.find(|f| f.font_size.is_finite() && f.font_size > 0.0)
.map(|f| f.font_size)
.unwrap_or(12.0);
(first_fs * 0.5).max(2.0)
};
let mut rows: Vec<Vec<&TextFragment>> = Vec::new();
for frag in &sorted {
let in_current_row = rows
.last()
.map(|r| (r[0].y - frag.y).abs() <= row_tol);
if in_current_row == Some(true) {
rows.last_mut().unwrap().push(frag);
} else {
rows.push(vec![frag]);
}
}
let mut cell_map: std::collections::BTreeMap<(usize, usize), Vec<&TextFragment>> =
std::collections::BTreeMap::new();
for (row_idx, row_frags) in rows.iter().enumerate() {
for frag in row_frags {
let col_idx = col_for_frag(frag);
cell_map.entry((row_idx, col_idx)).or_default().push(frag);
}
}
cell_map
.into_iter()
.map(|((row, col), mut frags)| {
frags.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal));
let text = frags
.iter()
.map(|f| f.text.trim())
.filter(|t| !t.is_empty())
.collect::<Vec<_>>()
.join(" ");
let x = frags.iter().map(|f| f.x).fold(f32::INFINITY, f32::min);
let y = frags.iter().map(|f| f.y).fold(f32::NEG_INFINITY, f32::max);
let right = frags
.iter()
.map(|f| f.x + f.width.max(0.0))
.fold(f32::NEG_INFINITY, f32::max);
let height = frags.iter().map(|f| f.height.max(0.0)).fold(0.0f32, f32::max);
let fragments_owned: Vec<TextFragment> = frags.iter().map(|f| (*f).clone()).collect();
TableCell {
row,
col,
text,
x,
y,
width: (right - x).max(0.0),
height,
fragments: fragments_owned,
}
})
.collect()
}
/// Extracts all text fragments of a page.
///
/// Every page content stream is parsed in order with one shared carry state,
/// tagging fragments with the stream's index; afterwards the text of the
/// page's XObjects is appended. As written this function always returns `Ok`.
pub(crate) fn extract_text_runs_from_page(
    doc: &lopdf::Document,
    page_id: ObjectId,
) -> Result<Vec<TextFragment>> {
    let content_streams = page_content_streams(doc, page_id);
    let page_fonts = collect_fonts(doc, page_id);

    let mut carry = ParseCarryState::default();
    let mut collected = Vec::new();
    for (index, bytes) in content_streams.iter().enumerate() {
        parse_content_stream(bytes, &page_fonts, &mut carry, &mut collected, Some(index), None);
    }
    extract_text_from_xobjects(doc, page_id, &mut carry, &mut collected, 0);

    Ok(collected)
}
fn extract_text_from_xobjects(
doc: &lopdf::Document,
page_id: ObjectId,
carry: &mut ParseCarryState,
out: &mut Vec<TextFragment>,
_depth: u8,
) {
let saved_ctm = carry.ctm;
let saved_ctm_stack = carry.ctm_stack.clone();
if !carry.do_ctm_map.is_empty() {
let xobj_name_map = collect_inherited_xobject_name_map(doc, page_id);
let do_ctm_map = std::mem::take(&mut carry.do_ctm_map);
for (xobj_name, do_ctm) in &do_ctm_map {
let Some(&xobj_id) = xobj_name_map.get(xobj_name.as_slice()) else { continue };
if let Some(content) = decode_form_xobject(doc, xobj_id) {
let xobj_fonts = xobject_fonts(doc, page_id, xobj_id);
let xobj_matrix = xobject_matrix(doc, xobj_id);
carry.ctm = multiply_ctm(*do_ctm, xobj_matrix);
carry.ctm_stack = vec![carry.ctm];
parse_content_stream(&content, &xobj_fonts, carry, out, None, Some(xobj_id));
}
}
carry.do_ctm_map = do_ctm_map;
} else {
let xobj_ids = collect_inherited_xobject_ids(doc, page_id);
for xobj_id in xobj_ids {
if let Some(content) = decode_form_xobject(doc, xobj_id) {
let xobj_fonts = xobject_fonts(doc, page_id, xobj_id);
let xobj_matrix = xobject_matrix(doc, xobj_id);
carry.ctm = multiply_ctm(saved_ctm, xobj_matrix);
carry.ctm_stack = vec![carry.ctm];
parse_content_stream(&content, &xobj_fonts, carry, out, None, Some(xobj_id));
}
}
}
carry.ctm = saved_ctm;
carry.ctm_stack = saved_ctm_stack;
}
/// Returns the decoded content bytes of a Form XObject.
///
/// Yields `None` when the object cannot be loaded, is not a stream, or its
/// `/Subtype` is not `Form`. For filtered streams the decompressed content is
/// returned; if decompression fails the raw bytes are used as a fallback,
/// unless they are empty, in which case `None` is returned.
fn decode_form_xobject(doc: &lopdf::Document, xobj_id: ObjectId) -> Option<Vec<u8>> {
    let stream = doc.get_object(xobj_id).ok()?.as_stream().ok()?;

    let is_form = matches!(
        stream.dict.get(b"Subtype"),
        Ok(Object::Name(kind)) if kind.as_slice() == b"Form"
    );
    if !is_form {
        return None;
    }

    // Unfiltered streams can be used as they are.
    if stream.dict.get(b"Filter").is_err() {
        return Some(stream.content.clone());
    }

    let mut inflated = stream.clone();
    match inflated.decompress() {
        Ok(_) => Some(inflated.content),
        Err(_) if stream.content.is_empty() => None,
        Err(_) => Some(stream.content.clone()),
    }
}
/// Builds the font table to use while parsing a Form XObject.
///
/// Starts from the page's fonts and overlays the fonts declared in the
/// XObject's own `/Resources`, so XObject-local names shadow page-level ones.
fn xobject_fonts(
    doc: &lopdf::Document,
    page_id: ObjectId,
    xobj_id: ObjectId,
) -> HashMap<Vec<u8>, crate::extract::FontInfo> {
    let mut fonts = collect_fonts(doc, page_id);

    let own_fonts = doc
        .get_object(xobj_id)
        .ok()
        .and_then(|obj| obj.as_stream().ok())
        .and_then(|stream| stream.dict.get(b"Resources").ok())
        .and_then(|resources| resolve_dict(doc, resources))
        .map(|resources| collect_fonts_from_resources(doc, resources));

    // Extending with an empty map is a no-op, so no emptiness check is needed.
    if let Some(own_fonts) = own_fonts {
        fonts.extend(own_fonts);
    }
    fonts
}
/// Reads the matrix of an XObject stream via `read_matrix`, falling back to
/// `IDENTITY_CTM` when the object is missing or is not a stream.
fn xobject_matrix(doc: &lopdf::Document, xobj_id: ObjectId) -> [f32; 6] {
    let Some(stream) = doc
        .get_object(xobj_id)
        .ok()
        .and_then(|obj| obj.as_stream().ok())
    else {
        return IDENTITY_CTM;
    };
    read_matrix(&stream.dict)
}
pub(crate) fn page_content_streams(doc: &lopdf::Document, page_id: ObjectId) -> Vec<Vec<u8>> {
let Ok(page_obj) = doc.get_object(page_id) else {
return vec![];
};
let Ok(page_dict) = page_obj.as_dict() else {
return vec![];
};
let Ok(contents_obj) = page_dict.get(b"Contents") else {
return vec![];
};
let ids: Vec<ObjectId> = match contents_obj {
Object::Reference(id) => vec![*id],
Object::Array(arr) => arr
.iter()
.filter_map(|o| {
if let Object::Reference(id) = o {
Some(*id)
} else {
None
}
})
.collect(),
_ => return vec![],
};
let mut result = Vec::new();
for id in ids {
let Ok(stream_obj) = doc.get_object(id) else {
continue;
};
let Ok(stream) = stream_obj.as_stream() else {
continue;
};
let has_filter = stream.dict.get(b"Filter").is_ok();
if has_filter {
let mut owned = stream.clone();
if owned.decompress().is_ok() {
result.push(owned.content);
} else if !stream.content.is_empty() {
result.push(stream.content.clone());
}
} else {
result.push(stream.content.clone());
}
}
result
}
/// Lists the object ids referenced by a page's `/Contents` entry.
///
/// `/Contents` may be a single reference or an array; array items that are
/// not references are ignored. Returns an empty vector when the page cannot
/// be loaded, is not a dictionary, or has no usable `/Contents`.
pub(crate) fn page_content_stream_ids(
    doc: &lopdf::Document,
    page_id: ObjectId,
) -> Vec<ObjectId> {
    let contents = doc
        .get_object(page_id)
        .ok()
        .and_then(|page| page.as_dict().ok())
        .and_then(|page_dict| page_dict.get(b"Contents").ok());

    match contents {
        Some(Object::Reference(id)) => vec![*id],
        Some(Object::Array(items)) => items
            .iter()
            .filter_map(|item| match item {
                Object::Reference(id) => Some(*id),
                _ => None,
            })
            .collect(),
        _ => vec![],
    }
}
pub(crate) fn page_content_streams_verbose(
doc: &lopdf::Document,
page_id: ObjectId,
) -> (Vec<Vec<u8>>, Vec<ExtractionWarning>) {
let Ok(page_obj) = doc.get_object(page_id) else {
return (vec![], vec![]);
};
let Ok(page_dict) = page_obj.as_dict() else {
return (vec![], vec![]);
};
let Ok(contents_obj) = page_dict.get(b"Contents") else {
return (vec![], vec![]);
};
let ids: Vec<ObjectId> = match contents_obj {
Object::Reference(id) => vec![*id],
Object::Array(arr) => arr
.iter()
.filter_map(|o| if let Object::Reference(id) = o { Some(*id) } else { None })
.collect(),
_ => return (vec![], vec![]),
};
let mut result = Vec::new();
let mut warnings = Vec::new();
for id in ids {
let Ok(stream_obj) = doc.get_object(id) else { continue };
let Ok(stream) = stream_obj.as_stream() else { continue };
let has_filter = stream.dict.get(b"Filter").is_ok();
if has_filter {
let mut owned = stream.clone();
if owned.decompress().is_ok() {
result.push(owned.content);
} else if !stream.content.is_empty() {
warnings.push(ExtractionWarning {
kind: WarningKind::StreamDecompressFailed,
stream_id: Some((id.0, id.1)),
message: format!(
"decompress() failed for content stream {id:?}; using raw content as fallback"
),
});
result.push(stream.content.clone());
}
} else {
result.push(stream.content.clone());
}
}
(result, warnings)
}
/// Extracts all text fragments of a page and reports non-fatal problems.
///
/// Works like `extract_text_runs_from_page`, but additionally returns the
/// warnings gathered while decoding the content streams and the page's
/// XObjects. As written this function always returns `Ok`.
pub(crate) fn extract_text_runs_from_page_verbose(
    doc: &lopdf::Document,
    page_id: ObjectId,
) -> Result<(Vec<TextFragment>, Vec<ExtractionWarning>)> {
    let (content_streams, mut warnings) = page_content_streams_verbose(doc, page_id);
    let page_fonts = collect_fonts(doc, page_id);

    let mut carry = ParseCarryState::default();
    let mut collected = Vec::new();
    for (index, bytes) in content_streams.iter().enumerate() {
        parse_content_stream(bytes, &page_fonts, &mut carry, &mut collected, Some(index), None);
    }
    extract_text_from_xobjects_verbose(doc, page_id, &mut carry, &mut collected, 0, &mut warnings);

    Ok((collected, warnings))
}
/// Parses the page's Form XObjects for text, appending fragments to `out` and
/// recording a warning for every XObject that could not be decoded.
///
/// Two modes:
/// * If `carry.do_ctm_map` is non-empty, only the XObjects named in that map
///   are parsed, each starting from the matrix stored with its name combined
///   with the XObject's own matrix.
/// * Otherwise every XObject referenced from the page's inherited resources is
///   parsed, starting from the CTM on entry combined with the XObject's matrix.
///
/// `carry.ctm` and `carry.ctm_stack` are restored to their entry values before
/// returning. `_depth` is currently unused.
fn extract_text_from_xobjects_verbose(
doc: &lopdf::Document,
page_id: ObjectId,
carry: &mut ParseCarryState,
out: &mut Vec<TextFragment>,
_depth: u8,
warnings: &mut Vec<ExtractionWarning>,
) {
// Snapshot the transform state so that parsing XObjects cannot leak into the caller.
let saved_ctm = carry.ctm;
let saved_ctm_stack = carry.ctm_stack.clone();
if !carry.do_ctm_map.is_empty() {
let xobj_name_map = collect_inherited_xobject_name_map(doc, page_id);
// Move the map out of `carry` so it can be iterated while `carry` is
// mutably borrowed by the parser; it is put back after the loop.
let do_ctm_map = std::mem::take(&mut carry.do_ctm_map);
for (xobj_name, do_ctm) in &do_ctm_map {
// Names that do not resolve to an XObject reference are ignored.
let Some(&xobj_id) = xobj_name_map.get(xobj_name.as_slice()) else { continue };
match decode_form_xobject_verbose(doc, xobj_id) {
Ok(content) => {
let xobj_fonts = xobject_fonts(doc, page_id, xobj_id);
let xobj_matrix = xobject_matrix(doc, xobj_id);
// Start the XObject from the recorded matrix combined with its own matrix.
carry.ctm = multiply_ctm(*do_ctm, xobj_matrix);
carry.ctm_stack = vec![carry.ctm];
parse_content_stream(&content, &xobj_fonts, carry, out, None, Some(xobj_id));
}
Err(warn) => { warnings.push(warn); }
}
}
// NOTE(review): this overwrites anything the parser stored in
// `carry.do_ctm_map` while handling the XObjects above — confirm intended.
carry.do_ctm_map = do_ctm_map;
} else {
// No recorded invocations: fall back to every XObject in the page resources.
let xobj_ids = collect_inherited_xobject_ids(doc, page_id);
for xobj_id in xobj_ids {
match decode_form_xobject_verbose(doc, xobj_id) {
Ok(content) => {
let xobj_fonts = xobject_fonts(doc, page_id, xobj_id);
let xobj_matrix = xobject_matrix(doc, xobj_id);
carry.ctm = multiply_ctm(saved_ctm, xobj_matrix);
carry.ctm_stack = vec![carry.ctm];
parse_content_stream(&content, &xobj_fonts, carry, out, None, Some(xobj_id));
}
Err(warn) => { warnings.push(warn); }
}
}
}
// Restore the caller's transform state.
carry.ctm = saved_ctm;
carry.ctm_stack = saved_ctm_stack;
}
/// Like `decode_form_xobject`, but turns a failure into an
/// `XObjectSkipped` warning that carries the object id.
///
/// NOTE(review): `decode_form_xobject` also returns `None` for XObjects whose
/// subtype is not `Form`, so such objects are reported with the same
/// "could not be decoded" message — confirm this is intended.
fn decode_form_xobject_verbose(
    doc: &lopdf::Document,
    xobj_id: ObjectId,
) -> std::result::Result<Vec<u8>, ExtractionWarning> {
    decode_form_xobject(doc, xobj_id).ok_or_else(|| ExtractionWarning {
        kind: WarningKind::XObjectSkipped,
        stream_id: Some((xobj_id.0, xobj_id.1)),
        message: format!("Form XObject {xobj_id:?} could not be decoded"),
    })
}
/// Resolves `obj` to a dictionary.
///
/// Accepts either an inline dictionary or a reference to one; any other
/// object kind, a dangling reference, or a reference to a non-dictionary
/// yields `None`.
pub(crate) fn resolve_dict<'a>(
    doc: &'a lopdf::Document,
    obj: &'a Object,
) -> Option<&'a Dictionary> {
    if let Object::Dictionary(inline) = obj {
        return Some(inline);
    }
    let Object::Reference(target) = obj else {
        return None;
    };
    doc.get_object(*target)
        .ok()
        .and_then(|resolved| resolved.as_dict().ok())
}
/// Derives `(name, is_bold, is_italic, family)` from a raw `BaseFont` value.
///
/// * `name` is the text after the last `+`, which strips a subset prefix such
///   as `ABCDEF+`.
/// * `is_bold` / `is_italic` are keyword matches on the lower-cased name.
/// * `family` is the name up to the first `-` or `,`.
fn parse_font_attributes(raw: &str) -> (String, bool, bool, String) {
    const BOLD_MARKERS: [&str; 6] = ["bold", "heavy", "black", "semibold", "demibold", "extrabold"];
    const ITALIC_MARKERS: [&str; 3] = ["italic", "oblique", "slanted"];

    let name = raw.rsplit('+').next().unwrap_or(raw);
    let folded = name.to_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| folded.contains(*marker));

    let is_bold = mentions_any(&BOLD_MARKERS);
    let is_italic = mentions_any(&ITALIC_MARKERS);

    let family_len = name
        .find(|c: char| c == '-' || c == ',')
        .unwrap_or(name.len());
    let family = name[..family_len].to_string();

    (name.to_string(), is_bold, is_italic, family)
}
/// Collects the fonts available to a page, keyed by resource name.
///
/// Returns an empty map when the page's resources cannot be located.
pub(crate) fn collect_fonts(
    doc: &lopdf::Document,
    page_id: ObjectId,
) -> HashMap<Vec<u8>, FontInfo> {
    match collect_fonts_inner(doc, page_id) {
        Some(fonts) => fonts,
        None => HashMap::new(),
    }
}
/// Collects the fonts declared in the `/Font` entry of a resources dictionary.
///
/// Returns an empty map when there is no `/Font` entry or it does not resolve
/// to a dictionary.
pub(crate) fn collect_fonts_from_resources(
    doc: &lopdf::Document,
    resources_dict: &Dictionary,
) -> HashMap<Vec<u8>, FontInfo> {
    let mut fonts = HashMap::new();
    let font_dict = resources_dict
        .get(b"Font")
        .ok()
        .and_then(|entry| resolve_dict(doc, entry));
    if let Some(font_dict) = font_dict {
        collect_font_dict_entries(doc, font_dict, &mut fonts);
    }
    fonts
}
/// Finds the nearest `/Resources` dictionary for a page and collects its fonts.
///
/// Starts at the page and follows `/Parent` references until a node with a
/// `/Resources` entry is found. Returns `None` when a node cannot be loaded,
/// the resources do not resolve to a dictionary, the chain ends without
/// resources, or the `/Parent` chain loops back on itself.
fn collect_fonts_inner(
    doc: &lopdf::Document,
    page_id: ObjectId,
) -> Option<HashMap<Vec<u8>, FontInfo>> {
    // A malformed file can contain a cyclic /Parent chain; remember visited
    // nodes so the walk always terminates.
    let mut visited = std::collections::HashSet::new();
    let mut current_id = page_id;
    while visited.insert(current_id) {
        let dict = doc.get_object(current_id).ok()?.as_dict().ok()?;
        if let Ok(resources_obj) = dict.get(b"Resources") {
            let resources_dict = resolve_dict(doc, resources_obj)?;
            return Some(collect_fonts_from_resources(doc, resources_dict));
        }
        let Object::Reference(parent_id) = dict.get(b"Parent").ok()? else {
            return None;
        };
        current_id = *parent_id;
    }
    None
}
/// Lists the object ids in the `/XObject` sub-dictionary of the nearest
/// `/Resources` dictionary of a page.
///
/// Follows `/Parent` references until a node with `/Resources` is found; only
/// that resources dictionary is consulted. Entries that are not references
/// are ignored. Returns an empty vector when nothing is found, when a node
/// cannot be loaded, or when the `/Parent` chain loops back on itself.
pub(crate) fn collect_inherited_xobject_ids(
    doc: &lopdf::Document,
    page_id: ObjectId,
) -> Vec<ObjectId> {
    // Guards against cyclic /Parent chains in malformed files.
    let mut visited = std::collections::HashSet::new();
    let mut current_id = page_id;
    while visited.insert(current_id) {
        let Ok(obj) = doc.get_object(current_id) else { break };
        let Ok(dict) = obj.as_dict() else { break };

        if let Ok(res_obj) = dict.get(b"Resources") {
            // The first /Resources found decides the result, even when it has
            // no /XObject entry.
            return resolve_dict(doc, res_obj)
                .and_then(|res_dict| res_dict.get(b"XObject").ok())
                .and_then(|xobj_ref| resolve_dict(doc, xobj_ref))
                .map(|xobj_dict| {
                    xobj_dict
                        .iter()
                        .filter_map(|(_, value)| match value {
                            Object::Reference(id) => Some(*id),
                            _ => None,
                        })
                        .collect::<Vec<_>>()
                })
                .unwrap_or_default();
        }

        let Ok(Object::Reference(parent_id)) = dict.get(b"Parent") else { break };
        current_id = *parent_id;
    }
    vec![]
}
/// Maps XObject resource names to object ids, using the nearest `/Resources`
/// dictionary of a page.
///
/// Follows `/Parent` references until a node with `/Resources` is found; only
/// that resources dictionary is consulted. Entries that are not references
/// are ignored. Returns an empty map when nothing is found, when a node
/// cannot be loaded, or when the `/Parent` chain loops back on itself.
fn collect_inherited_xobject_name_map(
    doc: &lopdf::Document,
    page_id: ObjectId,
) -> HashMap<Vec<u8>, ObjectId> {
    // Guards against cyclic /Parent chains in malformed files.
    let mut visited = std::collections::HashSet::new();
    let mut current_id = page_id;
    while visited.insert(current_id) {
        let Ok(obj) = doc.get_object(current_id) else { break };
        let Ok(dict) = obj.as_dict() else { break };

        if let Ok(res_obj) = dict.get(b"Resources") {
            // The first /Resources found decides the result, even when it has
            // no /XObject entry.
            return resolve_dict(doc, res_obj)
                .and_then(|res_dict| res_dict.get(b"XObject").ok())
                .and_then(|xobj_ref| resolve_dict(doc, xobj_ref))
                .map(|xobj_dict| {
                    xobj_dict
                        .iter()
                        .filter_map(|(name, value)| match value {
                            Object::Reference(id) => Some((name.clone(), *id)),
                            _ => None,
                        })
                        .collect::<HashMap<Vec<u8>, ObjectId>>()
                })
                .unwrap_or_default();
        }

        let Ok(Object::Reference(parent_id)) = dict.get(b"Parent") else { break };
        current_id = *parent_id;
    }
    HashMap::new()
}
fn collect_font_dict_entries(
doc: &lopdf::Document,
font_dict: &Dictionary,
fonts: &mut HashMap<Vec<u8>, FontInfo>,
) {
for (name, font_ref) in font_dict.iter() {
let Object::Reference(font_id) = font_ref else {
continue;
};
let Ok(font_obj) = doc.get_object(*font_id) else {
continue;
};
let Ok(fd) = font_obj.as_dict() else { continue };
let subtype = fd.get(b"Subtype").ok().and_then(|o| {
if let Object::Name(n) = o {
Some(n.as_slice())
} else {
None
}
});
let raw_base_font = fd
.get(b"BaseFont")
.ok()
.and_then(|o| match o {
Object::Name(n) => std::str::from_utf8(n).ok().map(|s| s.to_string()),
_ => None,
})
.unwrap_or_default();
let (base_font, is_bold, is_italic, font_family) = parse_font_attributes(&raw_base_font);
let font_info = match subtype {
Some(b"Type0") => match collect_type0_font(fd, doc, base_font, is_bold, is_italic, font_family) {
Some(fi) => fi,
None => continue,
},
Some(b"Type1") | Some(b"MMType1") | Some(b"TrueType") | Some(b"Type3") => {
collect_simple_font(fd, doc, base_font, is_bold, is_italic, font_family)
}
_ => continue,
};
fonts.insert(name.clone(), font_info);
}
}
fn collect_type0_font(
fd: &Dictionary,
doc: &lopdf::Document,
base_font: String,
is_bold: bool,
is_italic: bool,
font_family: String,
) -> Option<FontInfo> {
let to_unicode = try_parse_to_unicode(fd, doc).unwrap_or_default();
let identity_fallback = to_unicode.is_empty() && is_identity_cmap(fd);
let desc_obj = fd.get(b"DescendantFonts").ok()?;
let Object::Array(desc_arr) = desc_obj else {
return None;
};
let Some(Object::Reference(cid_id)) = desc_arr.first() else {
return None;
};
let Ok(cid_obj) = doc.get_object(*cid_id) else {
return None;
};
let Ok(cid_dict) = cid_obj.as_dict() else {
return None;
};
let dw = cid_dict
.get(b"DW")
.ok()
.and_then(|o| o.as_i64().ok())
.map(|n| n as u32)
.unwrap_or(1000);
let w_runs = cid_dict
.get(b"W")
.ok()
.and_then(|o| {
if let Object::Array(a) = o {
Some(a.as_slice())
} else {
None
}
})
.map(parse_w_array)
.unwrap_or_default();
Some(FontInfo {
to_unicode,
dw,
w_runs,
bytes_per_char: 2,
identity_fallback,
base_font,
is_bold,
is_italic,
font_family,
})
}
/// Reports whether a Type0 font dictionary uses an identity encoding.
///
/// True when `/Encoding` is absent or is the name `Identity-H` / `Identity-V`;
/// false for any other encoding value.
fn is_identity_cmap(fd: &Dictionary) -> bool {
    let Ok(encoding) = fd.get(b"Encoding") else {
        return true;
    };
    matches!(
        encoding,
        Object::Name(name) if matches!(name.as_slice(), b"Identity-H" | b"Identity-V")
    )
}
/// Builds the [`FontInfo`] for a simple (single-byte) font dictionary.
///
/// The Unicode map comes from `/ToUnicode` when it parses to a non-empty map,
/// otherwise from the font's encoding. Widths come from
/// `collect_simple_font_widths`.
fn collect_simple_font(
    fd: &Dictionary,
    doc: &lopdf::Document,
    base_font: String,
    is_bold: bool,
    is_italic: bool,
    font_family: String,
) -> FontInfo {
    let to_unicode =
        try_parse_to_unicode(fd, doc).unwrap_or_else(|| build_encoding_map(fd, doc));
    let (w_runs, dw) = collect_simple_font_widths(fd, doc);

    FontInfo {
        to_unicode,
        dw,
        w_runs,
        bytes_per_char: 1,
        identity_fallback: false,
        base_font,
        is_bold,
        is_italic,
        font_family,
    }
}
/// Parses the `/ToUnicode` CMap referenced by a font dictionary.
///
/// Returns `None` when the entry is missing, is not a reference to a stream,
/// a filtered stream fails to decompress, or the parsed map is empty.
fn try_parse_to_unicode(fd: &Dictionary, doc: &lopdf::Document) -> Option<BTreeMap<u16, char>> {
    let Object::Reference(cmap_id) = fd.get(b"ToUnicode").ok()? else {
        return None;
    };
    let stream = doc.get_object(*cmap_id).ok()?.as_stream().ok()?;

    let cmap_bytes = match stream.dict.get(b"Filter") {
        Ok(_) => {
            let mut inflated = stream.clone();
            inflated.decompress().ok()?;
            inflated.content
        }
        Err(_) => stream.content.clone(),
    };

    let map = parse_to_unicode_cmap(&cmap_bytes);
    (!map.is_empty()).then_some(map)
}
/// Reads `/FirstChar` and `/Widths` of a simple font.
///
/// Returns `(runs, default_width)`, where `default_width` comes from
/// `missing_width_from_descriptor`. `runs` holds a single run starting at
/// `/FirstChar`, or is empty when `/FirstChar` is missing or outside the
/// `u16` range, or when `/Widths` is missing, not an array, or empty.
///
/// `/Widths` may be an inline array or a reference to one. Every array entry
/// keeps its position so that widths stay aligned with character codes:
/// integers are clamped to the `u32` range, reals are rounded, and anything
/// else is replaced by the default width.
fn collect_simple_font_widths(fd: &Dictionary, doc: &lopdf::Document) -> (Vec<WidthRun>, u32) {
    let dw = missing_width_from_descriptor(fd, doc);

    let first_char = fd
        .get(b"FirstChar")
        .ok()
        .and_then(|o| o.as_i64().ok())
        .and_then(|n| u16::try_from(n).ok());
    let Some(first_char) = first_char else {
        return (vec![], dw);
    };

    let widths_arr = match fd.get(b"Widths") {
        Ok(Object::Array(entries)) => entries,
        // The array is frequently stored as an indirect object.
        Ok(Object::Reference(id)) => match doc.get_object(*id) {
            Ok(Object::Array(entries)) => entries,
            _ => return (vec![], dw),
        },
        _ => return (vec![], dw),
    };

    // `map`, not `filter_map`: dropping an entry would shift every later width
    // onto the wrong character code.
    let widths: Vec<u32> = widths_arr
        .iter()
        .map(|entry| match entry {
            Object::Integer(n) => (*n).clamp(0, i64::from(u32::MAX)) as u32,
            // Float→int `as` saturates and maps NaN to 0.
            Object::Real(r) => (*r as f64).round().max(0.0) as u32,
            _ => dw,
        })
        .collect();
    if widths.is_empty() {
        return (vec![], dw);
    }

    (
        vec![WidthRun {
            start_gid: first_char,
            widths,
        }],
        dw,
    )
}
/// Returns the `/MissingWidth` of the font's `/FontDescriptor`.
///
/// Falls back to `1000` when the descriptor or the entry is missing, the
/// value is not an integer, or it does not fit in a `u32` (for example a
/// negative value, which a plain `as u32` cast would wrap to a huge width).
///
/// NOTE(review): the PDF specification lists `0` as the default for
/// `/MissingWidth`; the `1000` fallback is kept as found — confirm intent.
fn missing_width_from_descriptor(fd: &Dictionary, doc: &lopdf::Document) -> u32 {
    fd.get(b"FontDescriptor")
        .ok()
        .and_then(|o| resolve_dict(doc, o))
        .and_then(|descriptor| descriptor.get(b"MissingWidth").ok())
        .and_then(|o| o.as_i64().ok())
        .and_then(|n| u32::try_from(n).ok())
        .unwrap_or(1000)
}
/// Builds the code → character map for a simple font from its `/Encoding`.
///
/// * No `/Encoding`: the standard encoding table.
/// * A name: the table for that name.
/// * A dictionary (inline or referenced): the `/BaseEncoding` table (standard
///   when absent or not a name) with `/Differences` applied on top.
/// * Anything else: the standard encoding table.
fn build_encoding_map(fd: &Dictionary, doc: &lopdf::Document) -> BTreeMap<u16, char> {
    let encoding = match fd.get(b"Encoding") {
        Ok(Object::Name(name)) => return encoding_name_to_btree(name),
        Ok(other) => other,
        Err(_) => return encoding_table_to_btree(&STANDARD_ENCODING),
    };

    let Some(enc_dict) = resolve_dict(doc, encoding) else {
        return encoding_table_to_btree(&STANDARD_ENCODING);
    };

    let base = match enc_dict.get(b"BaseEncoding") {
        Ok(Object::Name(name)) => encoding_name_to_btree(name.as_slice()),
        _ => encoding_table_to_btree(&STANDARD_ENCODING),
    };
    apply_differences(enc_dict, base)
}
/// Returns the code → character map for a named encoding.
///
/// Recognises `WinAnsiEncoding` and `MacRomanEncoding`; every other name,
/// including `StandardEncoding`, maps to the standard encoding table.
fn encoding_name_to_btree(name: &[u8]) -> BTreeMap<u16, char> {
    let table = match name {
        b"WinAnsiEncoding" => &WIN_ANSI_ENCODING,
        b"MacRomanEncoding" => &MAC_ROMAN_ENCODING,
        _ => &STANDARD_ENCODING,
    };
    encoding_table_to_btree(table)
}
/// Converts a 256-entry encoding table into a map from character code to
/// character, leaving out the codes that have no assignment.
fn encoding_table_to_btree(table: &[Option<char>; 256]) -> BTreeMap<u16, char> {
    let mut map = BTreeMap::new();
    for (code, slot) in (0u16..).zip(table.iter()) {
        if let Some(ch) = *slot {
            map.insert(code, ch);
        }
    }
    map
}
/// Applies an encoding dictionary's `/Differences` array on top of `map`.
///
/// The array is a sequence of integers and glyph names: an integer sets the
/// next character code, and each following name is assigned to that code,
/// which then advances by one. Names that `glyph_name_to_char` does not
/// recognise still advance the code. Names seen before any integer start at
/// code 0.
///
/// An integer outside the `u16` range disables assignment until the next
/// valid integer, rather than being truncated onto an unrelated code.
/// Returns `map` unchanged when there is no `/Differences` array.
fn apply_differences(enc_dict: &Dictionary, mut map: BTreeMap<u16, char>) -> BTreeMap<u16, char> {
    let Ok(Object::Array(diffs)) = enc_dict.get(b"Differences") else {
        return map;
    };

    // `None` means "no valid code position": names are skipped until the
    // next in-range integer.
    let mut next_code: Option<u16> = Some(0);
    for obj in diffs {
        match obj {
            Object::Integer(n) => {
                next_code = u16::try_from(*n).ok();
            }
            Object::Name(glyph_name) => {
                let Some(code) = next_code else { continue };
                if let Some(ch) = glyph_name_to_char(glyph_name) {
                    map.insert(code, ch);
                }
                next_code = code.checked_add(1);
            }
            _ => {}
        }
    }
    map
}
/// Code → character table used for fonts whose encoding is `WinAnsiEncoding`.
///
/// The array index is the single-byte character code; `None` marks codes that
/// have no assignment in this table (0–31, 127 and a few codes in 128–159).
/// Code 173 is mapped to a plain hyphen.
#[rustfmt::skip]
const WIN_ANSI_ENCODING: [Option<char>; 256] = [
None, None, None, None, None, None, None, None,
None, None, None, None, None, None, None, None,
None, None, None, None, None, None, None, None,
None, None, None, None, None, None, None, None,
Some(' '), Some('!'), Some('"'), Some('#'),
Some('$'), Some('%'), Some('&'), Some('\''),
Some('('), Some(')'), Some('*'), Some('+'),
Some(','), Some('-'), Some('.'), Some('/'),
Some('0'), Some('1'), Some('2'), Some('3'),
Some('4'), Some('5'), Some('6'), Some('7'),
Some('8'), Some('9'), Some(':'), Some(';'),
Some('<'), Some('='), Some('>'), Some('?'),
Some('@'), Some('A'), Some('B'), Some('C'),
Some('D'), Some('E'), Some('F'), Some('G'),
Some('H'), Some('I'), Some('J'), Some('K'),
Some('L'), Some('M'), Some('N'), Some('O'),
Some('P'), Some('Q'), Some('R'), Some('S'),
Some('T'), Some('U'), Some('V'), Some('W'),
Some('X'), Some('Y'), Some('Z'), Some('['),
Some('\\'), Some(']'), Some('^'), Some('_'),
Some('`'), Some('a'), Some('b'), Some('c'),
Some('d'), Some('e'), Some('f'), Some('g'),
Some('h'), Some('i'), Some('j'), Some('k'),
Some('l'), Some('m'), Some('n'), Some('o'),
Some('p'), Some('q'), Some('r'), Some('s'),
Some('t'), Some('u'), Some('v'), Some('w'),
Some('x'), Some('y'), Some('z'), Some('{'),
// Codes 124–131 (eight entries on one line).
Some('|'), Some('}'), Some('~'), None, Some('€'), None, Some('‚'), Some('ƒ'),
Some('„'), Some('…'), Some('†'), Some('‡'),
Some('ˆ'), Some('‰'), Some('Š'), Some('‹'),
Some('Œ'), None, Some('Ž'), None,
None, Some('\u{2018}'), Some('\u{2019}'), Some('\u{201C}'),
Some('\u{201D}'), Some('•'), Some('–'), Some('—'),
Some('˜'), Some('™'), Some('š'), Some('›'),
Some('œ'), None, Some('ž'), Some('Ÿ'),
Some('\u{00A0}'), Some('¡'), Some('¢'), Some('£'),
Some('¤'), Some('¥'), Some('¦'), Some('§'),
Some('¨'), Some('©'), Some('ª'), Some('«'),
// Codes 172–179 (eight entries on one line).
Some('¬'), Some('-'), Some('®'), Some('¯'), Some('°'), Some('±'), Some('²'), Some('³'),
Some('´'), Some('µ'), Some('¶'), Some('·'),
Some('¸'), Some('¹'), Some('º'), Some('»'),
Some('¼'), Some('½'), Some('¾'), Some('¿'),
Some('À'), Some('Á'), Some('Â'), Some('Ã'),
Some('Ä'), Some('Å'), Some('Æ'), Some('Ç'),
Some('È'), Some('É'), Some('Ê'), Some('Ë'),
Some('Ì'), Some('Í'), Some('Î'), Some('Ï'),
Some('Ð'), Some('Ñ'), Some('Ò'), Some('Ó'),
Some('Ô'), Some('Õ'), Some('Ö'), Some('×'),
Some('Ø'), Some('Ù'), Some('Ú'), Some('Û'),
Some('Ü'), Some('Ý'), Some('Þ'), Some('ß'),
Some('à'), Some('á'), Some('â'), Some('ã'),
Some('ä'), Some('å'), Some('æ'), Some('ç'),
Some('è'), Some('é'), Some('ê'), Some('ë'),
Some('ì'), Some('í'), Some('î'), Some('ï'),
Some('ð'), Some('ñ'), Some('ò'), Some('ó'),
Some('ô'), Some('õ'), Some('ö'), Some('÷'),
Some('ø'), Some('ù'), Some('ú'), Some('û'),
Some('ü'), Some('ý'), Some('þ'), Some('ÿ'),
];
/// Byte-to-character table for the Mac OS Roman single-byte encoding.
///
/// Index `n` holds the character for byte value `n`, or `None` when the byte
/// has no mapping (control codes and 0x7F). Typographic quotes are written as
/// `\u{…}` escapes so they cannot be confused with the ASCII `"` / `'` at
/// 0x22 / 0x27.
#[rustfmt::skip]
const MAC_ROMAN_ENCODING: [Option<char>; 256] = [
    // 0x00-0x1F: control codes, unmapped
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    // 0x20-0x7F: ASCII
    Some(' '), Some('!'), Some('"'), Some('#'),
    Some('$'), Some('%'), Some('&'), Some('\''),
    Some('('), Some(')'), Some('*'), Some('+'),
    Some(','), Some('-'), Some('.'), Some('/'),
    Some('0'), Some('1'), Some('2'), Some('3'),
    Some('4'), Some('5'), Some('6'), Some('7'),
    Some('8'), Some('9'), Some(':'), Some(';'),
    Some('<'), Some('='), Some('>'), Some('?'),
    Some('@'), Some('A'), Some('B'), Some('C'),
    Some('D'), Some('E'), Some('F'), Some('G'),
    Some('H'), Some('I'), Some('J'), Some('K'),
    Some('L'), Some('M'), Some('N'), Some('O'),
    Some('P'), Some('Q'), Some('R'), Some('S'),
    Some('T'), Some('U'), Some('V'), Some('W'),
    Some('X'), Some('Y'), Some('Z'), Some('['),
    Some('\\'), Some(']'), Some('^'), Some('_'),
    Some('`'), Some('a'), Some('b'), Some('c'),
    Some('d'), Some('e'), Some('f'), Some('g'),
    Some('h'), Some('i'), Some('j'), Some('k'),
    Some('l'), Some('m'), Some('n'), Some('o'),
    Some('p'), Some('q'), Some('r'), Some('s'),
    Some('t'), Some('u'), Some('v'), Some('w'),
    Some('x'), Some('y'), Some('z'), Some('{'),
    Some('|'), Some('}'), Some('~'), None,
    // 0x80-0xFF: Mac OS Roman upper half
    Some('Ä'), Some('Å'), Some('Ç'), Some('É'),
    Some('Ñ'), Some('Ö'), Some('Ü'), Some('á'),
    Some('à'), Some('â'), Some('ä'), Some('ã'),
    Some('å'), Some('ç'), Some('é'), Some('è'),
    Some('ê'), Some('ë'), Some('í'), Some('ì'),
    Some('î'), Some('ï'), Some('ñ'), Some('ó'),
    Some('ò'), Some('ô'), Some('ö'), Some('õ'),
    Some('ú'), Some('ù'), Some('û'), Some('ü'),
    Some('†'), Some('°'), Some('¢'), Some('£'),
    Some('§'), Some('•'), Some('¶'), Some('ß'),
    Some('®'), Some('©'), Some('™'), Some('´'),
    Some('¨'), Some('≠'), Some('Æ'), Some('Ø'),
    Some('∞'), Some('±'), Some('≤'), Some('≥'),
    Some('¥'), Some('µ'), Some('∂'), Some('∑'),
    Some('∏'), Some('π'), Some('∫'), Some('ª'),
    Some('º'), Some('\u{2126}'), Some('æ'), Some('ø'),
    Some('¿'), Some('¡'), Some('¬'), Some('√'),
    Some('ƒ'), Some('≈'), Some('∆'), Some('«'),
    Some('»'), Some('…'), Some('\u{00A0}'), Some('À'),
    Some('Ã'), Some('Õ'), Some('Œ'), Some('œ'),
    // 0xD2 / 0xD3 are the curly double quotes, not ASCII '"'.
    Some('–'), Some('—'), Some('\u{201C}'), Some('\u{201D}'),
    Some('\u{2018}'), Some('\u{2019}'), Some('÷'), Some('\u{25CA}'),
    Some('ÿ'), Some('Ÿ'), Some('⁄'), Some('¤'),
    Some('‹'), Some('›'), Some('\u{FB01}'), Some('\u{FB02}'),
    Some('‡'), Some('·'), Some('‚'), Some('„'),
    Some('‰'), Some('Â'), Some('Ê'), Some('Á'),
    Some('Ë'), Some('È'), Some('Í'), Some('Î'),
    Some('Ï'), Some('Ì'), Some('Ó'), Some('Ô'),
    Some('\u{F8FF}'), Some('Ò'), Some('Ú'), Some('Û'),
    Some('Ù'), Some('ı'), Some('ˆ'), Some('˜'),
    Some('¯'), Some('˘'), Some('˙'), Some('˚'),
    Some('¸'), Some('˝'), Some('˛'), Some('ˇ'),
];
/// Byte-to-character table for Adobe StandardEncoding.
///
/// Index `n` holds the character for byte value `n`, or `None` for codes the
/// encoding leaves undefined. In this encoding 0x27 and 0x60 are the curly
/// single quotes, while the straight single quote sits at 0xA9 and the curly
/// double quotes at 0xAA / 0xBA.
#[rustfmt::skip]
const STANDARD_ENCODING: [Option<char>; 256] = [
    // 0x00-0x1F: undefined
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    // 0x20-0x7F
    Some(' '), Some('!'), Some('"'), Some('#'),
    Some('$'), Some('%'), Some('&'), Some('\u{2019}'),
    Some('('), Some(')'), Some('*'), Some('+'),
    Some(','), Some('-'), Some('.'), Some('/'),
    Some('0'), Some('1'), Some('2'), Some('3'),
    Some('4'), Some('5'), Some('6'), Some('7'),
    Some('8'), Some('9'), Some(':'), Some(';'),
    Some('<'), Some('='), Some('>'), Some('?'),
    Some('@'), Some('A'), Some('B'), Some('C'),
    Some('D'), Some('E'), Some('F'), Some('G'),
    Some('H'), Some('I'), Some('J'), Some('K'),
    Some('L'), Some('M'), Some('N'), Some('O'),
    Some('P'), Some('Q'), Some('R'), Some('S'),
    Some('T'), Some('U'), Some('V'), Some('W'),
    Some('X'), Some('Y'), Some('Z'), Some('['),
    Some('\\'), Some(']'), Some('^'), Some('_'),
    Some('\u{2018}'), Some('a'), Some('b'), Some('c'),
    Some('d'), Some('e'), Some('f'), Some('g'),
    Some('h'), Some('i'), Some('j'), Some('k'),
    Some('l'), Some('m'), Some('n'), Some('o'),
    Some('p'), Some('q'), Some('r'), Some('s'),
    Some('t'), Some('u'), Some('v'), Some('w'),
    Some('x'), Some('y'), Some('z'), Some('{'),
    Some('|'), Some('}'), Some('~'), None,
    // 0x80-0x9F: undefined
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    // 0xA0-0xFF
    None, Some('¡'), Some('¢'), Some('£'),
    Some('⁄'), Some('¥'), Some('ƒ'), Some('§'),
    // 0xA9 quotesingle (straight), 0xAA quotedblleft (curly)
    Some('¤'), Some('\''), Some('\u{201C}'), Some('«'),
    Some('‹'), Some('›'), Some('\u{FB01}'), Some('\u{FB02}'),
    None, Some('–'), Some('†'), Some('‡'),
    Some('·'), None, Some('¶'), Some('•'),
    // 0xBA quotedblright (curly)
    Some('‚'), Some('„'), Some('\u{201D}'), Some('»'),
    Some('…'), Some('‰'), None, Some('¿'),
    None, Some('`'), Some('´'), Some('ˆ'),
    Some('˜'), Some('¯'), Some('˘'), Some('˙'),
    Some('¨'), None, Some('˚'), Some('¸'),
    None, Some('˝'), Some('˛'), Some('ˇ'),
    Some('—'), None, None, None,
    None, None, None, None,
    None, None, None, None,
    None, None, None, None,
    None, Some('Æ'), None, Some('ª'),
    None, None, None, None,
    Some('Ł'), Some('Ø'), Some('Œ'), Some('º'),
    None, None, None, None,
    None, Some('æ'), None, None,
    None, Some('ı'), None, None,
    Some('ł'), Some('ø'), Some('œ'), Some('ß'),
    None, None, None, None,
];
/// Resolves a PostScript glyph name to a Unicode character.
///
/// The name is first looked up in `AGL_TABLE` (binary search, so the table
/// must stay sorted by name). If it is not listed, names of the form
/// `uni<hex>` or `u<hex>` with 1 to 8 hex digits are decoded as a code point.
/// Returns `None` for non-UTF-8 names, unknown names, malformed hex, or a
/// value that is not a valid `char`.
fn glyph_name_to_char(name: &[u8]) -> Option<char> {
    let glyph = std::str::from_utf8(name).ok()?;

    match AGL_TABLE.binary_search_by_key(&glyph, |&(key, _)| key) {
        Ok(pos) => Some(AGL_TABLE[pos].1),
        Err(_) => {
            // Try the longer "uni" prefix first; fall back to a bare "u".
            let digits = match glyph.strip_prefix("uni") {
                Some(rest) => rest,
                None => glyph.strip_prefix('u')?,
            };
            if !(1..=8).contains(&digits.len()) {
                return None;
            }
            u32::from_str_radix(digits, 16)
                .ok()
                .and_then(char::from_u32)
        }
    }
}
/// Glyph-name to Unicode lookup table (a subset of the Adobe Glyph List).
///
/// Entries MUST stay sorted by name in byte order: the table is queried with
/// a binary search. Typographic quotes are written as `\u{…}` escapes so they
/// cannot be confused with the ASCII `"` / `'` used for `quotedbl` and
/// `quotesingle`.
static AGL_TABLE: &[(&str, char)] = &[
    ("A", 'A'),
    ("AE", 'Æ'),
    ("Aacute", 'Á'),
    ("Abreve", 'Ă'),
    ("Acircumflex", 'Â'),
    ("Adieresis", 'Ä'),
    ("Agrave", 'À'),
    ("Amacron", 'Ā'),
    ("Aogonek", 'Ą'),
    ("Aring", 'Å'),
    ("Atilde", 'Ã'),
    ("B", 'B'),
    ("C", 'C'),
    ("Cacute", 'Ć'),
    ("Ccaron", 'Č'),
    ("Ccedilla", 'Ç'),
    ("D", 'D'),
    ("Dcaron", 'Ď'),
    ("Dcroat", 'Đ'),
    ("Delta", '∆'),
    ("E", 'E'),
    ("Eacute", 'É'),
    ("Ecaron", 'Ě'),
    ("Ecircumflex", 'Ê'),
    ("Edieresis", 'Ë'),
    ("Egrave", 'È'),
    ("Emacron", 'Ē'),
    ("Eogonek", 'Ę'),
    ("Eth", 'Ð'),
    ("Euro", '€'),
    ("F", 'F'),
    ("G", 'G'),
    ("Gbreve", 'Ğ'),
    ("H", 'H'),
    ("I", 'I'),
    ("Iacute", 'Í'),
    ("Icircumflex", 'Î'),
    ("Idieresis", 'Ï'),
    ("Idotaccent", 'İ'),
    ("Igrave", 'Ì'),
    ("Imacron", 'Ī'),
    ("Iogonek", 'Į'),
    ("J", 'J'),
    ("K", 'K'),
    ("L", 'L'),
    ("Lacute", 'Ĺ'),
    ("Lcaron", 'Ľ'),
    ("Lcommaaccent", 'Ļ'),
    ("Lslash", 'Ł'),
    ("M", 'M'),
    ("N", 'N'),
    ("Nacute", 'Ń'),
    ("Ncaron", 'Ň'),
    ("Ncommaaccent", 'Ņ'),
    ("Ntilde", 'Ñ'),
    ("O", 'O'),
    ("OE", 'Œ'),
    ("Oacute", 'Ó'),
    ("Ocircumflex", 'Ô'),
    ("Odblacute", 'Ő'),
    ("Odieresis", 'Ö'),
    ("Ograve", 'Ò'),
    ("Omacron", 'Ō'),
    ("Omega", '\u{2126}'),
    ("Oslash", 'Ø'),
    ("Otilde", 'Õ'),
    ("P", 'P'),
    ("Q", 'Q'),
    ("R", 'R'),
    ("Racute", 'Ŕ'),
    ("Rcaron", 'Ř'),
    ("Rcommaaccent", 'Ŗ'),
    ("S", 'S'),
    ("Sacute", 'Ś'),
    ("Scaron", 'Š'),
    ("Scedilla", 'Ş'),
    ("Scommaaccent", 'Ș'),
    ("T", 'T'),
    ("Tcaron", 'Ť'),
    ("Tcedilla", 'Ţ'),
    ("Tcommaaccent", 'Ț'),
    ("Thorn", 'Þ'),
    ("U", 'U'),
    ("Uacute", 'Ú'),
    ("Ucircumflex", 'Û'),
    ("Udblacute", 'Ű'),
    ("Udieresis", 'Ü'),
    ("Ugrave", 'Ù'),
    ("Umacron", 'Ū'),
    ("Uogonek", 'Ų'),
    ("Uring", 'Ů'),
    ("V", 'V'),
    ("W", 'W'),
    ("X", 'X'),
    ("Y", 'Y'),
    ("Yacute", 'Ý'),
    ("Ydieresis", 'Ÿ'),
    ("Z", 'Z'),
    ("Zacute", 'Ź'),
    ("Zcaron", 'Ž'),
    ("Zdotaccent", 'Ż'),
    ("a", 'a'),
    ("aacute", 'á'),
    ("abreve", 'ă'),
    ("acircumflex", 'â'),
    ("adieresis", 'ä'),
    ("ae", 'æ'),
    ("agrave", 'à'),
    ("amacron", 'ā'),
    ("ampersand", '&'),
    ("aogonek", 'ą'),
    ("approxequal", '≈'),
    ("aring", 'å'),
    ("asciicircum", '^'),
    ("asciitilde", '~'),
    ("asterisk", '*'),
    ("at", '@'),
    ("atilde", 'ã'),
    ("b", 'b'),
    ("backslash", '\\'),
    ("bar", '|'),
    ("braceleft", '{'),
    ("braceright", '}'),
    ("bracketleft", '['),
    ("bracketright", ']'),
    ("breve", '˘'),
    ("brokenbar", '¦'),
    ("bullet", '•'),
    ("c", 'c'),
    ("cacute", 'ć'),
    ("caron", 'ˇ'),
    ("ccaron", 'č'),
    ("ccedilla", 'ç'),
    ("cedilla", '¸'),
    ("cent", '¢'),
    ("circumflex", 'ˆ'),
    ("colon", ':'),
    ("comma", ','),
    ("copyright", '©'),
    ("currency", '¤'),
    ("d", 'd'),
    ("dagger", '†'),
    ("daggerdbl", '‡'),
    ("dcaron", 'ď'),
    ("dcroat", 'đ'),
    ("degree", '°'),
    ("dieresis", '¨'),
    ("divide", '÷'),
    ("dollar", '$'),
    ("dotaccent", '˙'),
    ("dotlessi", 'ı'),
    ("e", 'e'),
    ("eacute", 'é'),
    ("ecaron", 'ě'),
    ("ecircumflex", 'ê'),
    ("edieresis", 'ë'),
    ("egrave", 'è'),
    ("eight", '8'),
    ("ellipsis", '…'),
    ("emacron", 'ē'),
    ("emdash", '—'),
    ("endash", '–'),
    ("eogonek", 'ę'),
    ("equal", '='),
    ("eth", 'ð'),
    ("euro", '€'),
    ("exclam", '!'),
    ("exclamdown", '¡'),
    ("f", 'f'),
    ("ff", '\u{FB00}'),
    ("ffi", '\u{FB03}'),
    ("ffl", '\u{FB04}'),
    ("fi", '\u{FB01}'),
    ("five", '5'),
    ("fl", '\u{FB02}'),
    ("florin", 'ƒ'),
    ("four", '4'),
    ("fraction", '⁄'),
    ("g", 'g'),
    ("gbreve", 'ğ'),
    ("germandbls", 'ß'),
    ("grave", '`'),
    ("greater", '>'),
    ("greaterequal", '≥'),
    ("guillemotleft", '«'),
    ("guillemotright", '»'),
    ("guilsinglleft", '‹'),
    ("guilsinglright", '›'),
    ("h", 'h'),
    ("hungarumlaut", '˝'),
    ("hyphen", '-'),
    ("i", 'i'),
    ("iacute", 'í'),
    ("icircumflex", 'î'),
    ("idieresis", 'ï'),
    ("idotaccent", 'ı'),
    ("igrave", 'ì'),
    ("imacron", 'ī'),
    ("infinity", '∞'),
    ("integral", '∫'),
    ("iogonek", 'į'),
    ("j", 'j'),
    ("k", 'k'),
    ("l", 'l'),
    ("lacute", 'ĺ'),
    ("lcaron", 'ľ'),
    ("lcommaaccent", 'ļ'),
    ("less", '<'),
    ("lessequal", '≤'),
    ("logicalnot", '¬'),
    ("lozenge", '◊'),
    ("lslash", 'ł'),
    ("m", 'm'),
    ("macron", '¯'),
    ("mu", 'µ'),
    ("multiply", '×'),
    ("n", 'n'),
    ("nacute", 'ń'),
    ("ncaron", 'ň'),
    ("ncommaaccent", 'ņ'),
    ("nine", '9'),
    ("notequal", '≠'),
    ("ntilde", 'ñ'),
    ("numbersign", '#'),
    ("o", 'o'),
    ("oacute", 'ó'),
    ("ocircumflex", 'ô'),
    ("odblacute", 'ő'),
    ("odieresis", 'ö'),
    ("oe", 'œ'),
    ("ogonek", '˛'),
    ("ograve", 'ò'),
    ("omacron", 'ō'),
    ("one", '1'),
    ("onehalf", '½'),
    ("onequarter", '¼'),
    ("onesuperior", '¹'),
    ("ordfeminine", 'ª'),
    ("ordmasculine", 'º'),
    ("oslash", 'ø'),
    ("otilde", 'õ'),
    ("p", 'p'),
    ("paragraph", '¶'),
    ("parenleft", '('),
    ("parenright", ')'),
    ("partialdiff", '∂'),
    ("percent", '%'),
    ("period", '.'),
    ("periodcentered", '·'),
    ("perthousand", '‰'),
    ("pi", 'π'),
    ("plus", '+'),
    ("plusminus", '±'),
    ("product", '∏'),
    ("q", 'q'),
    ("question", '?'),
    ("questiondown", '¿'),
    ("quotedbl", '"'),
    ("quotedblbase", '„'),
    // Curly double quotes (U+201C / U+201D), not the ASCII '"' of `quotedbl`.
    ("quotedblleft", '\u{201C}'),
    ("quotedblright", '\u{201D}'),
    ("quoteleft", '\u{2018}'),
    ("quoteright", '\u{2019}'),
    ("quotesinglbase", '‚'),
    ("quotesingle", '\''),
    ("r", 'r'),
    ("racute", 'ŕ'),
    ("radical", '√'),
    ("rcaron", 'ř'),
    ("rcommaaccent", 'ŗ'),
    ("registered", '®'),
    ("ring", '˚'),
    ("s", 's'),
    ("sacute", 'ś'),
    ("scaron", 'š'),
    ("scedilla", 'ş'),
    ("scommaaccent", 'ș'),
    ("section", '§'),
    ("semicolon", ';'),
    ("seven", '7'),
    ("six", '6'),
    ("slash", '/'),
    ("space", ' '),
    ("sterling", '£'),
    ("summation", '∑'),
    ("t", 't'),
    ("tcaron", 'ť'),
    ("tcedilla", 'ţ'),
    ("tcommaaccent", 'ț'),
    ("thorn", 'þ'),
    ("three", '3'),
    ("threequarters", '¾'),
    ("threesuperior", '³'),
    ("tilde", '˜'),
    ("trademark", '™'),
    ("two", '2'),
    ("twosuperior", '²'),
    ("u", 'u'),
    ("uacute", 'ú'),
    ("ucircumflex", 'û'),
    ("udblacute", 'ű'),
    ("udieresis", 'ü'),
    ("ugrave", 'ù'),
    ("umacron", 'ū'),
    ("underscore", '_'),
    ("uogonek", 'ų'),
    ("uring", 'ů'),
    ("v", 'v'),
    ("w", 'w'),
    ("x", 'x'),
    ("y", 'y'),
    ("yacute", 'ý'),
    ("ydieresis", 'ÿ'),
    ("yen", '¥'),
    ("z", 'z'),
    ("zacute", 'ź'),
    ("zcaron", 'ž'),
    ("zdotaccent", 'ż'),
    ("zero", '0'),
];
fn parse_to_unicode_cmap(bytes: &[u8]) -> BTreeMap<u16, char> {
let mut map = BTreeMap::new();
let text = match std::str::from_utf8(bytes) {
Ok(s) => s,
Err(_) => return map,
};
enum Section {
None,
BfChar,
BfRange,
}
let mut section = Section::None;
for line in text.lines() {
let line = line.trim();
if line.ends_with("beginbfchar") {
section = Section::BfChar;
continue;
}
if line == "endbfchar" {
section = Section::None;
continue;
}
if line.ends_with("beginbfrange") {
section = Section::BfRange;
continue;
}
if line == "endbfrange" {
section = Section::None;
continue;
}
match section {
Section::BfChar => parse_bfchar_line(line, &mut map),
Section::BfRange => parse_bfrange_line(line, &mut map),
Section::None => {}
}
}
map
}
/// Parses one `bfchar` entry of the form `<src> <dst>` and records it in `map`.
///
/// The first two whitespace-separated tokens are used; surrounding angle
/// brackets are stripped. Lines with fewer than two tokens, a source code
/// that is not a 16-bit hex value, or a destination `hex_to_char` cannot
/// decode are ignored.
fn parse_bfchar_line(line: &str, map: &mut BTreeMap<u16, char>) {
    fn unbracket(tok: &str) -> &str {
        tok.trim_start_matches('<').trim_end_matches('>')
    }

    let mut fields = line.split_ascii_whitespace();
    let (Some(src_tok), Some(dst_tok)) = (fields.next(), fields.next()) else {
        return;
    };

    let Ok(code) = u16::from_str_radix(unbracket(src_tok), 16) else {
        return;
    };
    if let Some(ch) = hex_to_char(unbracket(dst_tok)) {
        map.insert(code, ch);
    }
}
/// Parses one `bfrange` entry and records the resulting mappings in `map`.
///
/// Two forms are accepted:
/// * `<lo> <hi> [<d0> <d1> ...]`: each destination is assigned to
///   consecutive source codes starting at `lo`, stopping at `hi`.
/// * `<lo> <hi> <dst>`: code `lo + i` maps to code point `dst + i`.
///
/// In the scalar form an 8-digit destination that is a UTF-16 surrogate pair
/// (e.g. `<D835DC00>`) is decoded to its supplementary-plane code point
/// before incrementing. Malformed lines and ranges with `lo > hi` are
/// ignored; destinations that are not valid `char`s are skipped.
fn parse_bfrange_line(line: &str, map: &mut BTreeMap<u16, char>) {
    fn unbracket(tok: &str) -> &str {
        tok.trim_start_matches('<').trim_end_matches('>')
    }

    /// Decodes exactly eight ASCII hex digits forming a high/low surrogate pair.
    fn surrogate_pair(hex: &str) -> Option<u32> {
        if hex.len() != 8 || !hex.is_ascii() {
            return None;
        }
        let (hi_hex, lo_hex) = hex.split_at(4);
        let hi = u32::from_str_radix(hi_hex, 16).ok()?;
        let lo = u32::from_str_radix(lo_hex, 16).ok()?;
        if (0xD800..=0xDBFF).contains(&hi) && (0xDC00..=0xDFFF).contains(&lo) {
            Some(0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00))
        } else {
            None
        }
    }

    let mut toks = line.split_ascii_whitespace();
    let (Some(lo_tok), Some(hi_tok)) = (toks.next(), toks.next()) else {
        return;
    };

    // Everything after the second token: either `<dst>` or `[<d0> <d1> ...]`.
    let rest = line
        .trim_start()
        .trim_start_matches(|c: char| !c.is_ascii_whitespace())
        .trim_start_matches(|c: char| c.is_ascii_whitespace())
        .trim_start_matches(|c: char| !c.is_ascii_whitespace())
        .trim_start();
    if rest.is_empty() {
        return;
    }

    let Ok(lo) = u16::from_str_radix(unbracket(lo_tok), 16) else {
        return;
    };
    let Ok(hi) = u16::from_str_radix(unbracket(hi_tok), 16) else {
        return;
    };
    if lo > hi {
        return;
    }

    if rest.starts_with('[') {
        let inner = rest.trim_start_matches('[').trim_end_matches(']');
        // `None` once the code would pass 0xFFFF, so surplus destinations can
        // never be written over the last valid code.
        let mut next_code = Some(lo);
        for tok in inner.split_whitespace() {
            let Some(code) = next_code.filter(|&c| c <= hi) else {
                break;
            };
            if let Some(ch) = hex_to_char(unbracket(tok)) {
                map.insert(code, ch);
            }
            next_code = code.checked_add(1);
        }
    } else {
        let dst_hex = unbracket(rest);
        let dst_start = match surrogate_pair(dst_hex) {
            Some(cp) => cp,
            None => match u32::from_str_radix(dst_hex, 16) {
                Ok(cp) => cp,
                Err(_) => return,
            },
        };
        for (offset, code) in (lo..=hi).enumerate() {
            // `offset` is at most 0xFFFF, so the cast is lossless.
            let Some(cp) = dst_start.checked_add(offset as u32) else {
                break;
            };
            if let Some(ch) = char::from_u32(cp) {
                map.insert(code, ch);
            }
        }
    }
}
/// Decodes a hex string (without angle brackets) into a single character.
///
/// * 1 to 4 digits: the value is taken as a code point.
/// * 8 digits: a UTF-16 high/low surrogate pair is combined into one code
///   point; otherwise the whole value is taken as a code point.
/// * Any other length: `None`.
///
/// Returns `None` for non-ASCII input, malformed hex, or a value that is not
/// a valid `char`.
fn hex_to_char(hex: &str) -> Option<char> {
    // Hex digits are ASCII. Rejecting anything else up front also keeps the
    // 4-byte split below on a character boundary, so it cannot panic.
    if !hex.is_ascii() {
        return None;
    }
    match hex.len() {
        1..=4 => char::from_u32(u32::from_str_radix(hex, 16).ok()?),
        8 => {
            let (hi_hex, lo_hex) = hex.split_at(4);
            let hi = u16::from_str_radix(hi_hex, 16).ok()?;
            let lo = u16::from_str_radix(lo_hex, 16).ok()?;
            let cp = if (0xD800..=0xDBFF).contains(&hi) && (0xDC00..=0xDFFF).contains(&lo) {
                0x10000u32 + ((u32::from(hi) - 0xD800) << 10) + (u32::from(lo) - 0xDC00)
            } else {
                u32::from_str_radix(hex, 16).ok()?
            };
            char::from_u32(cp)
        }
        _ => None,
    }
}
/// Parses a CID font `/W` array into a list of width runs.
///
/// Two entry forms are recognised:
/// * `first [w0 w1 ...]`: individual widths for consecutive codes from `first`.
/// * `first last w`: one width shared by every code in `first..=last`.
///
/// Widths may be integers or reals (reals are rounded), so a fractional width
/// no longer drops out of a list and shifts the widths that follow it.
/// Negative widths clamp to 0. Entries whose start code does not fit in `u16`
/// are skipped instead of wrapping onto an unrelated code. Elements that fit
/// neither form are skipped one at a time.
fn parse_w_array(arr: &[Object]) -> Vec<WidthRun> {
    /// Reads a numeric object as a non-negative width.
    fn width_value(obj: &Object) -> Option<u32> {
        match obj {
            Object::Integer(n) => Some((*n).clamp(0, i64::from(u32::MAX)) as u32),
            // Float-to-int `as` saturates: negatives and NaN become 0.
            Object::Real(r) => Some(r.round() as u32),
            _ => None,
        }
    }

    let mut runs = Vec::new();
    let mut i = 0;
    while i < arr.len() {
        let first = match arr[i].as_i64() {
            Ok(n) => n,
            Err(_) => {
                i += 1;
                continue;
            }
        };
        i += 1;
        let Some(next) = arr.get(i) else {
            break;
        };
        match next {
            Object::Array(widths_arr) => {
                i += 1;
                if let Ok(start_gid) = u16::try_from(first) {
                    let widths: Vec<u32> = widths_arr.iter().filter_map(width_value).collect();
                    runs.push(WidthRun { start_gid, widths });
                }
            }
            Object::Integer(last) => {
                let last = *last;
                i += 1;
                let Some(width_obj) = arr.get(i) else {
                    break;
                };
                i += 1;
                let Some(w) = width_value(width_obj) else {
                    continue;
                };
                let Ok(start_gid) = u16::try_from(first) else {
                    continue;
                };
                // Clamp the end of the range to the largest representable code.
                let end = last.clamp(0, i64::from(u16::MAX)) as usize;
                let count = end.saturating_sub(usize::from(start_gid)) + 1;
                runs.push(WidthRun {
                    start_gid,
                    widths: vec![w; count],
                });
            }
            _ => {
                i += 1;
            }
        }
    }
    runs
}
/// A lexical token of a PDF content stream, as produced by `tokenize` and
/// `parse_array_tokens`.
#[derive(Debug)]
enum Token {
    /// Decoded bytes of a `<...>` hex string.
    HexStr(Vec<u8>),
    /// Decoded bytes of a `(...)` literal string.
    LitStr(Vec<u8>),
    /// Bytes of a `/Name`, without the leading slash.
    Name(Vec<u8>),
    /// A numeric operand.
    Number(f32),
    /// Any other bare word, i.e. an operator such as `Tj` or `BT`.
    Keyword(Vec<u8>),
    /// Contents of a `[...]` array.
    Array(Vec<Token>),
}
/// Splits a content stream into tokens, each paired with the byte offset at
/// which it starts.
///
/// Whitespace, `%` comments, `<< ... >>` dictionaries and stray `]` are
/// skipped. Hex strings, literal strings, names and arrays become their
/// respective tokens. Any other word is a `Number` when it parses as a finite
/// `f32`, otherwise a `Keyword`.
fn tokenize(input: &[u8]) -> Vec<(Token, usize)> {
    /// Length of the leading run of bytes that are neither whitespace nor delimiters.
    fn regular_len(bytes: &[u8]) -> usize {
        bytes
            .iter()
            .take_while(|&&c| !is_pdf_whitespace(c) && !is_pdf_delimiter(c))
            .count()
    }

    let len = input.len();
    let mut out: Vec<(Token, usize)> = Vec::new();
    let mut pos = 0;

    while pos < len {
        let origin = pos;
        match input[pos] {
            ws if is_pdf_whitespace(ws) => pos += 1,
            b'%' => {
                // Comment: runs up to (not including) the end-of-line byte.
                while pos < len && !matches!(input[pos], b'\r' | b'\n') {
                    pos += 1;
                }
            }
            b'<' if input.get(pos + 1) == Some(&b'<') => {
                // Dictionary: skipped up to and including the first `>>`.
                pos += 2;
                while pos + 1 < len && !(input[pos] == b'>' && input[pos + 1] == b'>') {
                    pos += 1;
                }
                if pos + 1 < len {
                    pos += 2;
                }
            }
            b'<' => {
                let body_start = pos + 1;
                let body_end = input[body_start..]
                    .iter()
                    .position(|&c| c == b'>')
                    .map_or(len, |offset| body_start + offset);
                // Step over the closing `>` when there is one.
                pos = if body_end < len { body_end + 1 } else { body_end };
                let decoded = decode_hex_bytes(&input[body_start..body_end]);
                out.push((Token::HexStr(decoded), origin));
            }
            b'/' => {
                let name_start = pos + 1;
                pos = name_start + regular_len(&input[name_start..]);
                out.push((Token::Name(input[name_start..pos].to_vec()), origin));
            }
            b'[' => {
                let (items, used) = parse_array_tokens(&input[pos + 1..]);
                pos += 1 + used;
                out.push((Token::Array(items), origin));
            }
            b']' => pos += 1,
            b'(' => {
                let (bytes, next) = parse_literal_string(input, pos + 1);
                pos = next;
                out.push((Token::LitStr(bytes), origin));
            }
            _ => {
                let word_len = regular_len(&input[pos..]);
                if word_len == 0 {
                    // A lone delimiter we do not interpret; step over it.
                    pos += 1;
                    continue;
                }
                let word = &input[pos..pos + word_len];
                pos += word_len;

                let number = std::str::from_utf8(word)
                    .ok()
                    .and_then(|s| s.parse::<f32>().ok())
                    .filter(|n| n.is_finite());
                let token = match number {
                    Some(n) => Token::Number(n),
                    None => Token::Keyword(word.to_vec()),
                };
                out.push((token, origin));
            }
        }
    }
    out
}
/// Tokenizes the inside of a `[...]` array.
///
/// `input` starts just after the opening `[`. Returns the collected tokens
/// and the number of bytes consumed, including the closing `]` when present.
/// Only hex strings, literal strings and finite numbers are kept; every other
/// word is dropped. Non-finite values such as `nan` or `inf` are rejected,
/// matching `tokenize`, so they cannot poison position arithmetic downstream.
fn parse_array_tokens(input: &[u8]) -> (Vec<Token>, usize) {
    let mut tokens = Vec::new();
    let mut i = 0;
    while i < input.len() {
        let b = input[i];
        if is_pdf_whitespace(b) {
            i += 1;
            continue;
        }
        if b == b']' {
            return (tokens, i + 1);
        }
        // A single `<` opens a hex string; `<<` is not handled here.
        if b == b'<' && input.get(i + 1) != Some(&b'<') {
            i += 1;
            let start = i;
            while i < input.len() && input[i] != b'>' {
                i += 1;
            }
            let hex = &input[start..i];
            if i < input.len() {
                i += 1;
            }
            tokens.push(Token::HexStr(decode_hex_bytes(hex)));
            continue;
        }
        if b == b'(' {
            let (bytes, end_i) = parse_literal_string(input, i + 1);
            i = end_i;
            tokens.push(Token::LitStr(bytes));
            continue;
        }
        let start = i;
        while i < input.len() && !is_pdf_whitespace(input[i]) && !is_pdf_delimiter(input[i]) {
            i += 1;
        }
        let word = &input[start..i];
        if word.is_empty() {
            // A delimiter we do not interpret; step over it.
            i += 1;
            continue;
        }
        let number = std::str::from_utf8(word)
            .ok()
            .and_then(|s| s.parse::<f32>().ok())
            .filter(|n| n.is_finite());
        if let Some(n) = number {
            tokens.push(Token::Number(n));
        }
    }
    (tokens, i)
}
/// Decodes a PDF literal string.
///
/// `i` is the index of the first byte after the opening `(`. Returns the
/// decoded bytes and the index just past the matching `)`, or `input.len()`
/// when the string is unterminated.
///
/// Balanced unescaped parentheses are kept as part of the string. Supported
/// escapes: `\n`, `\r`, `\t`, `\b`, `\f`, `\\`, `\(`, `\)`, one to three
/// octal digits (reduced modulo 256), and a backslash followed by a line
/// break (line continuation, which emits nothing). A backslash before any
/// other byte is dropped and the byte kept.
pub(crate) fn parse_literal_string(input: &[u8], mut i: usize) -> (Vec<u8>, usize) {
    let mut depth = 1i32;
    let mut out = Vec::new();
    while i < input.len() && depth > 0 {
        match input[i] {
            b'\\' => {
                i += 1;
                if i >= input.len() {
                    break;
                }
                match input[i] {
                    b'n' => {
                        out.push(b'\n');
                        i += 1;
                    }
                    b'r' => {
                        out.push(b'\r');
                        i += 1;
                    }
                    b't' => {
                        out.push(b'\t');
                        i += 1;
                    }
                    b'b' => {
                        // Backspace
                        out.push(0x08);
                        i += 1;
                    }
                    b'f' => {
                        // Form feed
                        out.push(0x0C);
                        i += 1;
                    }
                    b'\\' | b'(' | b')' => {
                        out.push(input[i]);
                        i += 1;
                    }
                    b'\r' => {
                        // Line continuation; CRLF counts as one line break.
                        i += 1;
                        if i < input.len() && input[i] == b'\n' {
                            i += 1;
                        }
                    }
                    b'\n' => {
                        i += 1;
                    }
                    d @ b'0'..=b'7' => {
                        let mut val = u16::from(d - b'0');
                        i += 1;
                        let mut count = 1;
                        while count < 3 && i < input.len() && (b'0'..=b'7').contains(&input[i]) {
                            val = val * 8 + u16::from(input[i] - b'0');
                            i += 1;
                            count += 1;
                        }
                        // High-order overflow of a three-digit escape is ignored.
                        out.push((val & 0xFF) as u8);
                    }
                    other => {
                        out.push(other);
                        i += 1;
                    }
                }
            }
            b'(' => {
                depth += 1;
                out.push(b'(');
                i += 1;
            }
            b')' => {
                depth -= 1;
                // The outermost `)` terminates the string and is not emitted.
                if depth > 0 {
                    out.push(b')');
                }
                i += 1;
            }
            b => {
                out.push(b);
                i += 1;
            }
        }
    }
    (out, i)
}
/// Decodes the body of a PDF hex string (without the angle brackets).
///
/// PDF whitespace is ignored. An odd number of digits is padded with a
/// trailing `0`. Digit pairs that do not parse as hex are dropped.
pub(crate) fn decode_hex_bytes(hex: &[u8]) -> Vec<u8> {
    let mut digits: Vec<u8> = hex
        .iter()
        .copied()
        .filter(|&b| !is_pdf_whitespace(b))
        .collect();
    if digits.len() % 2 == 1 {
        digits.push(b'0');
    }

    let mut decoded = Vec::new();
    for pair in digits.chunks(2) {
        let byte = std::str::from_utf8(pair)
            .ok()
            .and_then(|text| u8::from_str_radix(text, 16).ok());
        if let Some(byte) = byte {
            decoded.push(byte);
        }
    }
    decoded
}
/// Returns `true` for the PDF whitespace bytes: space, tab, CR, LF, form
/// feed (0x0C) and NUL (0x00).
pub(crate) fn is_pdf_whitespace(b: u8) -> bool {
    const WHITESPACE: [u8; 6] = [b' ', b'\t', b'\r', b'\n', 0x0C, 0x00];
    WHITESPACE.contains(&b)
}
/// Returns `true` for the PDF delimiter bytes `( ) < > [ ] { } / %`.
pub(crate) fn is_pdf_delimiter(b: u8) -> bool {
    b"()<>[]{}/%".contains(&b)
}
/// The identity transform in `[a, b, c, d, e, f]` form; `apply_ctm` with this
/// matrix returns the point unchanged.
const IDENTITY_CTM: [f32; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
/// Composes two affine transforms given in `[a, b, c, d, e, f]` form.
///
/// The result applies `b` first and then `a`, in the sense of `apply_ctm`:
/// `apply_ctm(multiply_ctm(a, b), p) == apply_ctm(a, apply_ctm(b, p))` up to
/// rounding.
fn multiply_ctm(a: [f32; 6], b: [f32; 6]) -> [f32; 6] {
    let [a0, a1, a2, a3, a4, a5] = a;
    let [b0, b1, b2, b3, b4, b5] = b;
    [
        a0 * b0 + a2 * b1,
        a1 * b0 + a3 * b1,
        a0 * b2 + a2 * b3,
        a1 * b2 + a3 * b3,
        a0 * b4 + a2 * b5 + a4,
        a1 * b4 + a3 * b5 + a5,
    ]
}
/// Transforms the point `(x, y)` by the affine matrix `m` in
/// `[a, b, c, d, e, f]` form: `(a*x + c*y + e, b*x + d*y + f)`.
fn apply_ctm(m: [f32; 6], x: f32, y: f32) -> (f32, f32) {
    let [a, b, c, d, e, f] = m;
    let out_x = a * x + c * y + e;
    let out_y = b * x + d * y + f;
    (out_x, out_y)
}
/// Returns the length of the matrix's first column `(a, b)`, used as the
/// scale factor of the transform.
fn ctm_scale(m: [f32; 6]) -> f32 {
    let [a, b, ..] = m;
    let squared = a * a + b * b;
    squared.sqrt()
}
/// Reads the `/Matrix` entry of `dict` as a six-element transform.
///
/// Falls back to `IDENTITY_CTM` when the entry is missing, is not an array,
/// has fewer than six elements, or any of the first six is not numeric.
/// Elements past the sixth are ignored.
fn read_matrix(dict: &lopdf::Dictionary) -> [f32; 6] {
    let parse = || -> Option<[f32; 6]> {
        let entries = dict.get(b"Matrix").ok()?.as_array().ok()?;
        let first_six = entries.get(..6)?;
        let mut matrix = [0f32; 6];
        for (slot, value) in matrix.iter_mut().zip(first_six) {
            *slot = value.as_float().ok()?;
        }
        Some(matrix)
    };
    parse().unwrap_or(IDENTITY_CTM)
}
/// Interpreter state that `parse_content_stream` reads on entry and writes
/// back on exit, so it can be carried from one call to the next.
struct ParseCarryState {
    /// Current fill colour as RGB; set by the `rg` and `g` operators.
    cur_color: [f32; 3],
    /// Text rendering mode from the `Tr` operator (3 marks a fragment invisible).
    cur_render_mode: u8,
    /// CTM that was in effect at the most recent `Do` operator.
    ctm: [f32; 6],
    /// One `(XObject name, CTM)` pair recorded for every `Do` operator seen.
    do_ctm_map: Vec<(Vec<u8>, [f32; 6])>,
    /// Graphics-state CTM stack: `q` duplicates the top, `Q` pops (never
    /// below one entry), `cm` multiplies into the top.
    ctm_stack: Vec<[f32; 6]>,
    /// Whether parsing is currently inside a `BT` ... `ET` text object.
    in_bt: bool,
    /// Font resource name from the most recent `Tf`.
    font_name: Vec<u8>,
    /// Font size operand of the most recent `Tf`.
    tf_font_size: f32,
    /// Effective font size: `tf_font_size` scaled by `tm_y_scale`.
    font_size: f32,
    /// Vertical scale derived from the `c`/`d` components of the last `Tm`.
    tm_y_scale: f32,
    /// Current text position, x coordinate.
    text_x: f32,
    /// Current text position, y coordinate.
    text_y: f32,
    /// Translation (`e`) of the most recent `Tm`.
    tm_origin_x: f32,
    /// Translation (`f`) of the most recent `Tm`.
    tm_origin_y: f32,
    /// Whether a `Tm` has been seen since the last `BT`.
    tm_origin_set: bool,
    /// Horizontal scale derived from the `a`/`b` components of the last `Tm`.
    tm_x_scale: f32,
    /// Start-of-line x position; updated by `Tm`, `Td`, `TD` and `T*`.
    tm_lm_x: f32,
    /// Start-of-line y position; updated by `Tm`, `Td`, `TD` and `T*`.
    tm_lm_y: f32,
    /// Text leading, set by `TL` (and by `TD` as the negated y offset).
    text_leading: f32,
    /// Character spacing from `Tc`.
    char_spacing: f32,
    /// Word spacing from `Tw`.
    word_spacing: f32,
}
impl Default for ParseCarryState {
    /// Initial interpreter state: black fill, render mode 0, identity CTM with
    /// a one-entry CTM stack, outside any text object, no font selected, a
    /// 12 pt font size, unit text-matrix scales, and all positions and
    /// spacings at zero.
    fn default() -> Self {
        Self {
            // Graphics state
            cur_color: [0.0; 3],
            cur_render_mode: 0,
            ctm: IDENTITY_CTM,
            ctm_stack: vec![IDENTITY_CTM],
            do_ctm_map: vec![],
            // Text object and font
            in_bt: false,
            font_name: vec![],
            tf_font_size: 12.0,
            font_size: 12.0,
            // Text matrix
            tm_x_scale: 1.0,
            tm_y_scale: 1.0,
            tm_origin_set: false,
            tm_origin_x: 0.0,
            tm_origin_y: 0.0,
            tm_lm_x: 0.0,
            tm_lm_y: 0.0,
            text_x: 0.0,
            text_y: 0.0,
            // Spacing
            text_leading: 0.0,
            char_spacing: 0.0,
            word_spacing: 0.0,
        }
    }
}
/// Interprets one content stream and appends a `TextFragment` to `out` for
/// every string shown with `Tj` or `TJ`.
///
/// * `bytes`: the decoded content stream.
/// * `fonts`: font table keyed by resource name (the `Tf` operand).
/// * `state`: interpreter state, read on entry and written back on exit so it
///   can carry over to the next call.
/// * `stream_idx`, `xobj_id`: copied verbatim into each fragment's
///   `source_stream` / `source_xobject`.
///
/// Operands are collected on a stack that is cleared after every operator.
/// Handled operators: `BT`, `ET`, `TL`, `Tc`, `Tw`, `Tf`, `Td`, `TD`, `T*`,
/// `Tm`, `Tr`, `rg`, `g`, `q`, `Q`, `Do`, `cm`, `Tj`, `TJ`. Everything else
/// only clears the operand stack.
///
/// NOTE(review): the `'` and `"` show-text operators are not handled, so text
/// painted with them produces no fragments.
fn parse_content_stream(
    bytes: &[u8],
    fonts: &HashMap<Vec<u8>, FontInfo>,
    state: &mut ParseCarryState,
    out: &mut Vec<TextFragment>,
    stream_idx: Option<usize>,
    xobj_id: Option<(u32, u16)>,
) {
    let tokens = tokenize(bytes);
    let mut stack: Vec<(Token, usize)> = Vec::new();

    // Work on local copies of the carried text state; written back at the end.
    let mut in_bt = state.in_bt;
    let mut font_name = state.font_name.clone();
    let mut tf_font_size = state.tf_font_size;
    let mut font_size = state.font_size;
    let mut tm_y_scale = state.tm_y_scale;
    let mut tm_x_scale = state.tm_x_scale;
    let mut tm_lm_x = state.tm_lm_x;
    let mut tm_lm_y = state.tm_lm_y;
    let mut x = state.text_x;
    let mut y = state.text_y;
    let mut tm_origin_set = state.tm_origin_set;
    let mut text_leading = state.text_leading;
    let mut char_spacing = state.char_spacing;
    let mut word_spacing = state.word_spacing;

    for (token, tok_pos) in tokens {
        match token {
            Token::Keyword(kw) => match kw.as_slice() {
                b"BT" => {
                    in_bt = true;
                    x = 0.0;
                    y = 0.0;
                    tm_origin_set = false;
                    tm_x_scale = 1.0;
                    tm_y_scale = 1.0;
                    // The scale was just reset to 1, so the effective size is
                    // the plain Tf size again; keeping the value scaled by a
                    // previous text object's Tm would be stale.
                    font_size = tf_font_size;
                    tm_lm_x = 0.0;
                    tm_lm_y = 0.0;
                    stack.clear();
                }
                b"ET" => {
                    in_bt = false;
                    stack.clear();
                }
                b"TL" => {
                    if let Some((Token::Number(tl), _)) = stack.pop() {
                        text_leading = tl;
                    }
                    stack.clear();
                }
                b"Tc" => {
                    if let Some((Token::Number(v), _)) = stack.pop() {
                        char_spacing = v;
                    }
                    stack.clear();
                }
                b"Tw" => {
                    if let Some((Token::Number(v), _)) = stack.pop() {
                        word_spacing = v;
                    }
                    stack.clear();
                }
                // Tf is a text-state operator like Tc/Tw/TL/Tr and may appear
                // outside BT ... ET, so it is honoured everywhere.
                b"Tf" => {
                    let top = stack.pop();
                    let second = stack.pop();
                    if let (Some((Token::Number(size), _)), Some((Token::Name(name), _))) =
                        (top, second)
                    {
                        font_name = name;
                        tf_font_size = size;
                        font_size = size * tm_y_scale;
                    }
                    stack.clear();
                }
                b"Td" | b"TD" if in_bt => {
                    let top = stack.pop();
                    let second = stack.pop();
                    if let (Some((Token::Number(ty), _)), Some((Token::Number(tx), _))) =
                        (top, second)
                    {
                        // Offsets are in text space; scale them by the Tm scales.
                        let new_lm_x = tx * tm_x_scale + tm_lm_x;
                        let new_lm_y = ty * tm_y_scale + tm_lm_y;
                        tm_lm_x = new_lm_x;
                        tm_lm_y = new_lm_y;
                        x = new_lm_x;
                        y = new_lm_y;
                        if kw.as_slice() == b"TD" {
                            // TD additionally sets the leading to -ty.
                            text_leading = -ty;
                        }
                    }
                    stack.clear();
                }
                b"T*" if in_bt => {
                    // Move to the start of the next line using the current leading.
                    let new_lm_x = tm_lm_x;
                    let new_lm_y = -text_leading * tm_y_scale + tm_lm_y;
                    tm_lm_x = new_lm_x;
                    tm_lm_y = new_lm_y;
                    x = new_lm_x;
                    y = new_lm_y;
                    stack.clear();
                }
                b"Tm" if in_bt => {
                    // Operands are `a b c d e f`; pop them in reverse order.
                    let pop_f = stack.pop();
                    let pop_e = stack.pop();
                    let pop_d = stack.pop();
                    let pop_c = stack.pop();
                    let pop_b = stack.pop();
                    let pop_a = stack.pop();
                    if let (Some((Token::Number(fy), _)), Some((Token::Number(ex), _))) =
                        (pop_f, pop_e)
                    {
                        x = ex;
                        y = fy;
                        state.tm_origin_x = ex;
                        state.tm_origin_y = fy;
                        tm_origin_set = true;
                        tm_lm_x = ex;
                        tm_lm_y = fy;
                    }
                    if let (Some((Token::Number(dv), _)), Some((Token::Number(cv), _))) =
                        (pop_d, pop_c)
                    {
                        let y_scale = (cv * cv + dv * dv).sqrt();
                        if y_scale > 0.0 {
                            font_size = tf_font_size * y_scale;
                            tm_y_scale = y_scale;
                        }
                    }
                    if let (Some((Token::Number(av), _)), Some((Token::Number(bv), _))) =
                        (pop_a, pop_b)
                    {
                        let x_scale = (av * av + bv * bv).sqrt();
                        if x_scale > 0.0 {
                            tm_x_scale = x_scale;
                            state.tm_x_scale = x_scale;
                        }
                    }
                    stack.clear();
                }
                b"Tr" => {
                    if let Some((Token::Number(mode), _)) = stack.pop() {
                        state.cur_render_mode = mode as u8;
                    }
                    stack.clear();
                }
                b"rg" => {
                    let b_val = stack.pop();
                    let g_val = stack.pop();
                    let r_val = stack.pop();
                    if let (
                        Some((Token::Number(bv), _)),
                        Some((Token::Number(gv), _)),
                        Some((Token::Number(rv), _)),
                    ) = (b_val, g_val, r_val)
                    {
                        state.cur_color = [rv, gv, bv];
                    }
                    stack.clear();
                }
                b"g" => {
                    if let Some((Token::Number(gray), _)) = stack.pop() {
                        state.cur_color = [gray, gray, gray];
                    }
                    stack.clear();
                }
                b"q" => {
                    state
                        .ctm_stack
                        .push(*state.ctm_stack.last().unwrap_or(&IDENTITY_CTM));
                    stack.clear();
                }
                b"Q" => {
                    // Never pop the base entry, even on unbalanced input.
                    if state.ctm_stack.len() > 1 {
                        state.ctm_stack.pop();
                    }
                    stack.clear();
                }
                b"Do" => {
                    let ctm = *state.ctm_stack.last().unwrap_or(&IDENTITY_CTM);
                    state.ctm = ctm;
                    if let Some((Token::Name(name), _)) = stack.last() {
                        state.do_ctm_map.push((name.clone(), ctm));
                    }
                    stack.clear();
                }
                b"cm" => {
                    let fv = stack.pop();
                    let ev = stack.pop();
                    let dv = stack.pop();
                    let cv = stack.pop();
                    let bv = stack.pop();
                    let av = stack.pop();
                    if let (
                        Some((Token::Number(f), _)),
                        Some((Token::Number(e), _)),
                        Some((Token::Number(d), _)),
                        Some((Token::Number(c), _)),
                        Some((Token::Number(b), _)),
                        Some((Token::Number(a), _)),
                    ) = (fv, ev, dv, cv, bv, av)
                    {
                        let mat = [a, b, c, d, e, f];
                        // An empty stack is treated as identity instead of panicking.
                        match state.ctm_stack.last_mut() {
                            Some(top) => *top = multiply_ctm(*top, mat),
                            None => state.ctm_stack.push(mat),
                        }
                    }
                    stack.clear();
                }
                b"Tj" if in_bt => {
                    let op_start = Some(tok_pos);
                    // Two bytes: the length of the `Tj` keyword.
                    let op_end = Some(tok_pos + 2);
                    let bytes_opt = match stack.pop() {
                        Some((Token::HexStr(b), _)) => Some(b),
                        Some((Token::LitStr(b), _)) => Some(b),
                        _ => None,
                    };
                    if let Some(char_bytes) = bytes_opt {
                        let ctm = *state.ctm_stack.last().unwrap_or(&IDENTITY_CTM);
                        let (px, py) = apply_ctm(ctm, x, y);
                        let scale = ctm_scale(ctm);
                        // Tm-derived metadata is only reported once a Tm has
                        // been seen in this text object.
                        let (tm_ox, tm_oy) = if tm_origin_set {
                            let (ox, oy) = apply_ctm(ctm, state.tm_origin_x, state.tm_origin_y);
                            (Some(ox), Some(oy))
                        } else {
                            (None, None)
                        };
                        let tm_xs = if tm_origin_set { Some(tm_x_scale) } else { None };
                        let (tm_lm_ox, tm_lm_oy) = if tm_origin_set {
                            let (lx, ly) = apply_ctm(ctm, tm_lm_x, tm_lm_y);
                            (Some(lx), Some(ly))
                        } else {
                            (None, None)
                        };
                        let x_font_size = tf_font_size * tm_x_scale * scale;
                        if let Some(frag) = decode_chars_to_fragment(
                            &char_bytes,
                            &font_name,
                            font_size * scale,
                            x_font_size,
                            px,
                            py,
                            fonts,
                            state.cur_color,
                            state.cur_render_mode,
                            tf_font_size,
                            tm_y_scale,
                            stream_idx,
                            op_start,
                            op_end,
                            xobj_id,
                            tm_ox,
                            tm_oy,
                            tm_xs,
                            tm_lm_ox,
                            tm_lm_oy,
                        ) {
                            // The fragment width includes the CTM scale; undo
                            // it to advance the local text position.
                            let local_advance =
                                if scale > 0.0 { frag.width / scale } else { frag.width };
                            let n_chars = frag.text.chars().count() as f32;
                            let n_spaces = frag.text.chars().filter(|&c| c == ' ').count() as f32;
                            x += local_advance
                                + char_spacing * tm_x_scale * n_chars
                                + word_spacing * tm_x_scale * n_spaces;
                            out.push(frag);
                        }
                    }
                    stack.clear();
                }
                b"TJ" if in_bt => {
                    let op_start = Some(tok_pos);
                    // Two bytes: the length of the `TJ` keyword.
                    let op_end = Some(tok_pos + 2);
                    if let Some((Token::Array(items), _)) = stack.pop() {
                        let ctm = *state.ctm_stack.last().unwrap_or(&IDENTITY_CTM);
                        let scale = ctm_scale(ctm);
                        let (tm_ox, tm_oy) = if tm_origin_set {
                            let (ox, oy) = apply_ctm(ctm, state.tm_origin_x, state.tm_origin_y);
                            (Some(ox), Some(oy))
                        } else {
                            (None, None)
                        };
                        let tm_xs = if tm_origin_set { Some(tm_x_scale) } else { None };
                        let (tm_lm_ox, tm_lm_oy) = if tm_origin_set {
                            let (lx, ly) = apply_ctm(ctm, tm_lm_x, tm_lm_y);
                            (Some(lx), Some(ly))
                        } else {
                            (None, None)
                        };
                        let x_font_size = tf_font_size * tm_x_scale * scale;
                        // Running x position across the array's strings and kerns.
                        let mut cur_x = x;
                        for item in items {
                            match item {
                                Token::HexStr(ref b) | Token::LitStr(ref b) => {
                                    let (px, py) = apply_ctm(ctm, cur_x, y);
                                    if let Some(frag) = decode_chars_to_fragment(
                                        b,
                                        &font_name,
                                        font_size * scale,
                                        x_font_size,
                                        px,
                                        py,
                                        fonts,
                                        state.cur_color,
                                        state.cur_render_mode,
                                        tf_font_size,
                                        tm_y_scale,
                                        stream_idx,
                                        op_start,
                                        op_end,
                                        xobj_id,
                                        tm_ox,
                                        tm_oy,
                                        tm_xs,
                                        tm_lm_ox,
                                        tm_lm_oy,
                                    ) {
                                        let local_advance = if scale > 0.0 {
                                            frag.width / scale
                                        } else {
                                            frag.width
                                        };
                                        let n_chars = frag.text.chars().count() as f32;
                                        let n_spaces =
                                            frag.text.chars().filter(|&c| c == ' ').count() as f32;
                                        cur_x += local_advance
                                            + char_spacing * tm_x_scale * n_chars
                                            + word_spacing * tm_x_scale * n_spaces;
                                        out.push(frag);
                                    }
                                }
                                Token::Number(kern) => {
                                    // Kerns are in thousandths of text space
                                    // and move the position backwards.
                                    cur_x -= kern / 1000.0 * tf_font_size * tm_x_scale;
                                }
                                _ => {}
                            }
                        }
                        x = cur_x;
                    }
                    stack.clear();
                }
                _ => {
                    stack.clear();
                }
            },
            other => {
                stack.push((other, tok_pos));
            }
        }
    }

    // Write the local text state back so the next stream continues from here.
    state.in_bt = in_bt;
    state.font_name = font_name;
    state.tf_font_size = tf_font_size;
    state.font_size = font_size;
    state.tm_y_scale = tm_y_scale;
    state.tm_x_scale = tm_x_scale;
    state.tm_lm_x = tm_lm_x;
    state.tm_lm_y = tm_lm_y;
    state.text_x = x;
    state.text_y = y;
    state.tm_origin_set = tm_origin_set;
    state.text_leading = text_leading;
    state.char_spacing = char_spacing;
    state.word_spacing = word_spacing;
}
/// Decodes the bytes of one shown string into a `TextFragment`.
///
/// Character codes are two big-endian bytes when the font's `bytes_per_char`
/// is 2 and one byte otherwise. Each code is mapped through the font's
/// `to_unicode` table; two-byte fonts with `identity_fallback` additionally
/// treat an unmapped code as a code point (control characters other than tab,
/// LF and CR are rejected). Codes with no mapping contribute neither text nor
/// width.
///
/// Returns `None` when the input is empty, the font is unknown, a two-byte
/// string has odd length, or no character could be decoded. A total width of
/// zero is replaced by an estimate of half of `x_font_size` per character.
/// The remaining arguments are copied into the fragment unchanged.
// The argument list mirrors the fields of `TextFragment`.
#[allow(clippy::too_many_arguments)]
fn decode_chars_to_fragment(
    char_bytes: &[u8],
    font_name: &[u8],
    font_size: f32,
    x_font_size: f32,
    x: f32,
    y: f32,
    fonts: &HashMap<Vec<u8>, FontInfo>,
    color: [f32; 3],
    render_mode: u8,
    tf_font_size: f32,
    tm_y_scale: f32,
    source_stream: Option<usize>,
    source_op_start: Option<usize>,
    source_op_end: Option<usize>,
    source_xobject: Option<(u32, u16)>,
    tm_origin_x: Option<f32>,
    tm_origin_y: Option<f32>,
    tm_x_scale: Option<f32>,
    tm_lm_x: Option<f32>,
    tm_lm_y: Option<f32>,
) -> Option<TextFragment> {
    if char_bytes.is_empty() {
        return None;
    }
    let font_info = fonts.get(font_name)?;

    let mut text = String::new();
    let mut total_width = 0.0f32;
    // Appends one decoded character and adds its advance (1/1000 em units).
    let mut emit = |code: u16, ch: char| {
        text.push(ch);
        total_width += font_info.advance_width(code) as f32 / 1000.0 * x_font_size;
    };

    if font_info.bytes_per_char == 2 {
        if char_bytes.len() % 2 != 0 {
            return None;
        }
        for pair in char_bytes.chunks_exact(2) {
            let gid = u16::from_be_bytes([pair[0], pair[1]]);
            let resolved = match font_info.to_unicode.get(&gid) {
                Some(&ch) => Some(ch),
                None if font_info.identity_fallback => char::from_u32(u32::from(gid))
                    .filter(|c| !c.is_control() || matches!(c, '\t' | '\n' | '\r')),
                None => None,
            };
            if let Some(ch) = resolved {
                emit(gid, ch);
            }
        }
    } else {
        for &byte in char_bytes {
            let code = u16::from(byte);
            if let Some(&ch) = font_info.to_unicode.get(&code) {
                emit(code, ch);
            }
        }
    }

    if text.is_empty() {
        return None;
    }
    if total_width == 0.0 {
        // No usable metrics: estimate half of the horizontal font size per character.
        total_width = text.chars().count() as f32 * x_font_size * 0.5;
    }

    // Advance of the first code that maps to a space, or 0 when there is none.
    let space_advance = font_info
        .to_unicode
        .iter()
        .find_map(|(&gid, &ch)| {
            (ch == ' ').then(|| font_info.advance_width(gid) as f32 / 1000.0 * x_font_size)
        })
        .unwrap_or(0.0);

    let font_name = String::from_utf8_lossy(font_name).into_owned();
    let invisible = render_mode == 3;
    Some(TextFragment {
        text,
        x,
        y,
        width: total_width,
        height: font_size,
        font_size,
        font_name,
        color,
        invisible,
        is_bold: font_info.is_bold,
        is_italic: font_info.is_italic,
        font_family: font_info.font_family.clone(),
        base_font: font_info.base_font.clone(),
        space_advance,
        tf_font_size,
        tm_y_scale,
        source_stream,
        source_op_start,
        source_op_end,
        source_xobject,
        tm_origin_x,
        tm_origin_y,
        tm_x_scale,
        tm_lm_x,
        tm_lm_y,
    })
}
/// Classification of a [`LayoutRegion`].
#[non_exhaustive]
#[derive(Debug, Clone, PartialEq)]
pub enum LayoutRegionKind {
    /// A heading. NOTE(review): the `u8` is presumably the heading level;
    /// confirm against the code that assigns it.
    Heading(u8),
    /// A paragraph of body text.
    Paragraph,
    /// A cell of a table.
    TableCell,
    /// A region that was not classified.
    Unknown,
}
/// A region of page text together with the fragments it was built from.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct LayoutRegion {
    /// Classification of this region.
    pub kind: LayoutRegionKind,
    /// Row index, when one is known.
    pub row: Option<usize>,
    /// Column index, when one is known.
    pub col: Option<usize>,
    /// Text content of the region.
    pub text: String,
    /// Bounding box of the source fragments, in the same four-value layout
    /// that `text_fragment_bounds` returns.
    pub source_bbox: [f32; 4],
    /// Rectangle available to the region's content.
    /// NOTE(review): component order is not visible here; confirm against
    /// `extract_layout_regions`.
    pub usable_rect: [f32; 4],
    /// The text fragments that make up this region.
    pub fragments: Vec<TextFragment>,
}
/// Options controlling `extract_layout_regions`.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct LayoutRegionOptions {
    /// Whether row heights should be inferred.
    pub infer_row_heights: bool,
    /// Whether column widths should be inferred from detected text columns.
    pub infer_column_widths: bool,
    /// Margin subtracted when computing a column's usable width.
    pub margin: f32,
}

impl Default for LayoutRegionOptions {
    /// Both inference switches enabled, with a margin of 2.0.
    fn default() -> Self {
        Self {
            margin: 2.0,
            infer_column_widths: true,
            infer_row_heights: true,
        }
    }
}
/// Bundles a [`LayoutRegion`] with a fit result and a list of collisions.
///
/// NOTE(review): `FitResult` and `Collision` are defined outside this section;
/// presumably this describes how replacement text fits the region — confirm
/// against the code that constructs it.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct RegionFitPlan {
/// The region the plan refers to.
pub region: LayoutRegion,
/// Fit result associated with the region.
pub fit: crate::document::FitResult,
/// Collisions associated with the region.
pub collisions: Vec<Collision>,
}
/// Groups the visible text fragments of a page into classified layout regions.
///
/// Fragments that are invisible, whitespace-only, or carry a non-finite font
/// size are dropped. The survivors are handed to `detect_text_columns` and
/// `extract_table_cells`; each resulting cell becomes one [`LayoutRegion`]
/// with a heading/paragraph/table-cell classification and a usable rectangle.
///
/// * `options.infer_column_widths` — take the usable x/width from the cell's
///   detected column instead of its source bounds.
/// * `options.infer_row_heights` — take the usable y/height from the row tops
///   instead of the source bounds.
/// * `options.margin` — subtracted from inferred column widths and used as the
///   minimum y for a region without a following row.
///
/// Returns an empty vector when `fragments` is empty, `page_width` is zero or
/// negative, no fragment survives filtering, or no cells are found. Regions
/// are returned ordered by `(row, col)`.
pub fn extract_layout_regions(
    fragments: &[TextFragment],
    page_width: f32,
    page_height: f32,
    options: LayoutRegionOptions,
) -> Vec<LayoutRegion> {
    if page_width <= 0.0 || fragments.is_empty() {
        return Vec::new();
    }

    let visible: Vec<TextFragment> = fragments
        .iter()
        .filter(|frag| {
            let has_text = !frag.text.trim().is_empty();
            !frag.invisible && has_text && frag.font_size.is_finite()
        })
        .cloned()
        .collect();
    if visible.is_empty() {
        return Vec::new();
    }

    // Each column extends to the start of the next one; the last column
    // extends to the page width.
    let zones = detect_text_columns(&visible, page_width);
    let right_edges = zones
        .iter()
        .skip(1)
        .map(|next| next.x_start)
        .chain(std::iter::once(page_width));
    let col_usable_widths: Vec<f32> = zones
        .iter()
        .zip(right_edges)
        .map(|(zone, right_edge)| (right_edge - zone.x_start - options.margin).max(1.0))
        .collect();

    let cells = extract_table_cells(&visible, page_width, page_height);
    if cells.is_empty() {
        return Vec::new();
    }

    // Median font size over sizes in the 4..=48 range; 10.0 when none qualify.
    let mut font_sizes: Vec<f32> = visible
        .iter()
        .map(|frag| frag.font_size)
        .filter(|size| (4.0_f32..=48.0).contains(size))
        .collect();
    font_sizes.sort_unstable_by(f32::total_cmp);
    let median_fs = font_sizes
        .get(font_sizes.len() / 2)
        .copied()
        .unwrap_or(10.0_f32);

    // Highest "top" (y plus 0.75 of the font size) seen in each row.
    let mut row_top_map = std::collections::BTreeMap::<usize, f32>::new();
    for cell in &cells {
        let top = cell
            .fragments
            .iter()
            .filter(|frag| frag.font_size.is_finite())
            .map(|frag| frag.y + frag.font_size * 0.75)
            .fold(f32::NEG_INFINITY, f32::max);
        if !top.is_finite() {
            continue;
        }
        row_top_map
            .entry(cell.row)
            .and_modify(|best| {
                if top > *best {
                    *best = top;
                }
            })
            .or_insert(top);
    }

    let mut regions: Vec<LayoutRegion> = cells
        .into_iter()
        .map(|cell| {
            let source_bbox = text_fragment_bounds(&cell.fragments).unwrap_or(cell.bbox());

            // Horizontal extent: detected column when requested and known,
            // otherwise the source bounds.
            let (usable_x, usable_w) = match col_usable_widths.get(cell.col) {
                Some(&width) if options.infer_column_widths => (zones[cell.col].x_start, width),
                _ => (source_bbox[0], source_bbox[2]),
            };

            // Vertical extent: from the next row's top up to this row's top,
            // never shorter than the source height.
            let (usable_y, usable_h) = if !options.infer_row_heights {
                (source_bbox[1], source_bbox[3])
            } else {
                let row_top = row_top_map
                    .get(&cell.row)
                    .copied()
                    .filter(|top| top.is_finite())
                    .unwrap_or(source_bbox[1] + source_bbox[3]);
                let following_top = cell
                    .row
                    .checked_add(1)
                    .and_then(|next_row| row_top_map.get(&next_row))
                    .copied();
                match following_top {
                    Some(next_top) => (next_top, (row_top - next_top).max(source_bbox[3])),
                    None => {
                        // No following row: allow 1.5x the source height below the top.
                        let height = (source_bbox[3] * 1.5).max(source_bbox[3]);
                        ((row_top - height).max(options.margin), height)
                    }
                }
            };

            // Average of the finite, positive font sizes; the page median when
            // the cell has none.
            let (size_sum, size_count) = cell
                .fragments
                .iter()
                .map(|frag| frag.font_size)
                .filter(|size| size.is_finite() && *size > 0.0)
                .fold((0.0_f32, 0_usize), |(sum, count), size| (sum + size, count + 1));
            let avg_fs = if size_count == 0 {
                median_fs
            } else {
                size_sum / size_count as f32
            };
            let ratio = if median_fs > 0.0 { avg_fs / median_fs } else { 1.0 };
            let is_bold = cell.fragments.iter().any(|frag| frag.is_bold);

            let kind = match ratio {
                r if r >= 1.8 || (r >= 1.5 && is_bold) => LayoutRegionKind::Heading(1),
                r if r >= 1.5 => LayoutRegionKind::Heading(2),
                r if r >= 1.3 => LayoutRegionKind::Heading(3),
                r if r >= 1.15 || (r >= 1.05 && is_bold) => LayoutRegionKind::Heading(4),
                _ if zones.len() <= 1 && cell.col == 0 => LayoutRegionKind::Paragraph,
                _ => LayoutRegionKind::TableCell,
            };

            LayoutRegion {
                kind,
                row: Some(cell.row),
                col: Some(cell.col),
                text: cell.text,
                source_bbox,
                usable_rect: [usable_x, usable_y, usable_w, usable_h],
                fragments: cell.fragments,
            }
        })
        .collect();

    regions.sort_by_key(|region| {
        let row = region.row.unwrap_or(usize::MAX);
        let col = region.col.unwrap_or(usize::MAX);
        (row, col)
    });
    regions
}
// Unit tests for this module live in `extract_tests.rs` and are compiled only
// for test builds.
#[cfg(test)]
#[path = "extract_tests.rs"]
mod tests;