use crate::graphics::Color;
use crate::parser::content::{ContentOperation, ContentParser, TextElement};
use crate::parser::document::PdfDocument;
use crate::parser::objects::PdfObject;
use crate::parser::page_tree::ParsedPage;
use crate::parser::ParseResult;
use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
use std::collections::HashMap;
use std::io::{Read, Seek};
/// Tunable settings that control how text is pulled out of a page.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// When set, positioned fragments are collected and the page text is
    /// rebuilt from those fragments instead of the raw operator stream order.
    pub preserve_layout: bool,
    /// Horizontal gap, expressed as a multiple of the font size, above which
    /// a space is inserted between two pieces of text.
    pub space_threshold: f64,
    /// Same idea as `space_threshold`, but applied to the numeric
    /// adjustments found inside a text array.
    pub tj_space_threshold: f64,
    /// Vertical distance above which two pieces of text are treated as being
    /// on different lines; also used as the band height when sorting.
    pub newline_threshold: f64,
    /// Sort fragments by position before merging (skipped when
    /// `reconstruct_paragraphs` is enabled).
    pub sort_by_position: bool,
    /// Run column detection after the positional sort.
    pub detect_columns: bool,
    /// Minimum horizontal gap between neighbouring fragments on a line that
    /// is interpreted as a column separator.
    pub column_threshold: f64,
    /// Join a line ending in `-` with the following line, dropping the hyphen.
    pub merge_hyphenated: bool,
    /// NOTE(review): not read by the code in this part of the file;
    /// presumably enables recording of `SpaceDecision` entries - confirm.
    pub track_space_decisions: bool,
    /// Merge fragments into lines and then into paragraphs.
    pub reconstruct_paragraphs: bool,
    /// Keep text that sits inside `Artifact` marked-content sections.
    pub include_artifacts: bool,
}

impl Default for ExtractionOptions {
    /// Plain-text extraction with positional sorting and hyphen merging
    /// enabled; every other feature switch is off.
    fn default() -> Self {
        ExtractionOptions {
            // Feature switches.
            preserve_layout: false,
            sort_by_position: true,
            detect_columns: false,
            merge_hyphenated: true,
            track_space_decisions: false,
            reconstruct_paragraphs: false,
            include_artifacts: false,
            // Numeric thresholds.
            space_threshold: 0.3,
            tj_space_threshold: 0.2,
            newline_threshold: 10.0,
            column_threshold: 50.0,
        }
    }
}
/// Text extracted from a single page.
#[derive(Debug, Clone)]
pub struct ExtractedText {
/// The page text as one string.
pub text: String,
/// Positioned pieces of text; only filled in when layout preservation is
/// enabled in the extraction options.
pub fragments: Vec<TextFragment>,
}
/// Record describing one decision about inserting a space.
///
/// NOTE(review): nothing in the visible part of this file constructs or reads
/// this type, so the field notes below are inferred from the names only -
/// confirm against the code that fills it in.
#[derive(Debug, Clone)]
pub struct SpaceDecision {
// presumably the position in the fragment text the decision applies to
pub offset: usize,
// presumably the measured horizontal gap
pub dx: f64,
// presumably the threshold the gap was compared against
pub threshold: f64,
// presumably a score for how certain the decision was
pub confidence: f64,
// presumably whether a space was actually inserted
pub inserted: bool,
}
/// A positioned piece of text together with its styling and tagging data.
#[derive(Debug, Clone)]
pub struct TextFragment {
/// Decoded text of the fragment.
pub text: String,
/// X coordinate of the fragment origin (text matrix combined with the CTM).
pub x: f64,
/// Y coordinate of the fragment origin (text matrix combined with the CTM).
pub y: f64,
/// Width of the fragment after scaling by the combined matrix.
pub width: f64,
/// Height of the fragment; emitted fragments use the scaled font size.
pub height: f64,
/// Font size after scaling by the combined matrix.
pub font_size: f64,
/// Name of the font that was active when the text was shown, if any.
pub font_name: Option<String>,
/// Bold flag guessed from the font name.
pub is_bold: bool,
/// Italic flag guessed from the font name.
pub is_italic: bool,
/// Fill colour that was active when the text was shown, if one was set.
pub color: Option<Color>,
/// Space-insertion records; the code visible here always leaves this empty.
pub space_decisions: Vec<SpaceDecision>,
/// MCID of the innermost enclosing marked-content entry that carries one.
pub mcid: Option<u32>,
/// Tag of the marked-content entry that supplied `mcid`.
pub struct_tag: Option<String>,
}
/// One level of the marked-content stack maintained while walking a page.
#[derive(Debug, Clone)]
struct MarkedContentEntry {
// Tag given to the begin-marked-content operator.
tag: String,
// MCID resolved from the entry's properties, if present.
mcid: Option<u32>,
// ActualText resolved from the entry's properties; stored but not read back.
#[allow(dead_code)] actual_text: Option<String>,
// True when this entry or any enclosing entry is tagged "Artifact".
is_artifact: bool,
}
/// Accumulator for a marked-content section that carries replacement text
/// (`ActualText`). Glyph runs shown inside the section only contribute
/// geometry; the replacement text is emitted as one fragment when the
/// section ends.
#[derive(Debug, Clone)]
struct PendingActualText {
// Replacement text taken from the section's properties.
text: String,
// Origin of the first glyph run seen inside the section.
first_x: f64,
first_y: f64,
// Sum of the scaled widths of all glyph runs seen so far.
width: f64,
// Styling captured from the first glyph run (initialised from the state
// at section start and overwritten when the first run arrives).
font_size: f64,
font_name: Option<String>,
is_bold: bool,
is_italic: bool,
color: Option<Color>,
// Depth of the marked-content stack just before the section was pushed.
stack_depth: usize,
// Set once the first glyph run has been recorded.
populated: bool,
}
/// Mutable interpreter state tracked while walking a page's content operators.
struct TextState {
// Current text matrix; advanced after each piece of shown text.
text_matrix: [f64; 6],
// Matrix at the start of the current line; base for line-move operators.
text_line_matrix: [f64; 6],
// Current transformation matrix.
ctm: [f64; 6],
// Line leading used by the next-line operators.
leading: f64,
// Character and word spacing as set by their operators.
char_space: f64,
word_space: f64,
// Horizontal scaling in percent (100.0 means unscaled).
horizontal_scale: f64,
// Text rise as set by its operator.
text_rise: f64,
// Size and resource name of the currently selected font.
font_size: f64,
font_name: Option<String>,
// Text render mode as set by its operator.
render_mode: u8,
// Current non-stroking colour, if one has been set.
fill_color: Option<Color>,
// Stack filled by save/restore graphics state operators.
saved_states: Vec<SavedGraphicsState>,
// Stack of open marked-content sections.
mc_stack: Vec<MarkedContentEntry>,
// Replacement-text run currently being accumulated, if any.
pending_actualtext: Option<PendingActualText>,
}
/// Snapshot taken by the save-graphics-state operator. Only the
/// transformation matrix and the fill colour are captured.
#[derive(Clone)]
struct SavedGraphicsState {
// Transformation matrix at the time of the save.
ctm: [f64; 6],
// Fill colour at the time of the save.
fill_color: Option<Color>,
}
impl Default for TextState {
fn default() -> Self {
Self {
text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
leading: 0.0,
char_space: 0.0,
word_space: 0.0,
horizontal_scale: 100.0,
text_rise: 0.0,
font_size: 0.0,
font_name: None,
render_mode: 0,
fill_color: None,
saved_states: Vec::new(),
mc_stack: Vec::new(),
pending_actualtext: None,
}
}
}
/// Guesses `(is_bold, is_italic)` from a font name.
///
/// The check is case-insensitive. Bold is reported for names containing
/// `bold`, `-b`, ` b ` or ending in ` b`; italic for names containing
/// `italic`, `oblique`, `-i`, ` i ` or ending in ` i`.
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    let lowered = font_name.to_lowercase();
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

    let bold = mentions_any(&["bold", "-b", " b "]) || lowered.ends_with(" b");
    let italic = mentions_any(&["italic", "oblique", "-i", " i "]) || lowered.ends_with(" i");
    (bold, italic)
}
/// Extracts text (and optionally positioned fragments) from PDF pages.
pub struct TextExtractor {
// Settings controlling spacing, sorting and merging.
options: ExtractionOptions,
// Fonts of the page being processed, keyed by resource name; cleared at
// the start of each page's font-resource scan.
font_cache: HashMap<String, FontInfo>,
// Parsed fonts keyed by their indirect object reference, so a font shared
// between pages is only parsed once.
font_object_cache: HashMap<(u32, u16), FontInfo>,
}
impl TextExtractor {
pub fn new() -> Self {
Self {
options: ExtractionOptions::default(),
font_cache: HashMap::new(),
font_object_cache: HashMap::new(),
}
}
/// Creates an extractor that uses `options`; both font caches start empty.
pub fn with_options(options: ExtractionOptions) -> Self {
    let font_cache = HashMap::new();
    let font_object_cache = HashMap::new();
    Self {
        options,
        font_cache,
        font_object_cache,
    }
}
/// Merges fragments for downstream partitioning.
///
/// Closely spaced fragments are always merged. When paragraph
/// reconstruction is enabled the result is additionally grouped into lines
/// and then into paragraphs.
pub fn merge_fragments_for_partition(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
    let merged = self.merge_close_fragments(fragments);
    if self.options.reconstruct_paragraphs {
        let lines = self.merge_into_lines(&merged);
        self.merge_into_paragraphs(&lines)
    } else {
        merged
    }
}
/// Groups fragments into lines and collapses each group into one fragment.
///
/// Fragments are ordered by row id, then by descending `y`, then by input
/// order (if any fragment carries an MCID) or by `x` (otherwise). A fragment
/// joins the most recent line only when it has the same row id, the same
/// MCID as the line's first fragment, and a `y` within 20% of the smaller of
/// the two heights.
fn merge_into_lines(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
    if fragments.is_empty() {
        return Vec::new();
    }

    let tagged = fragments.iter().any(|f| f.mcid.is_some());
    let mut ordered: Vec<(u32, usize, &TextFragment)> = assign_row_ids(fragments)
        .into_iter()
        .zip(fragments)
        .enumerate()
        .map(|(position, (row, frag))| (row, position, frag))
        .collect();
    ordered.sort_by(|lhs, rhs| {
        let within_row = if tagged {
            lhs.1.cmp(&rhs.1)
        } else {
            lhs.2.x.total_cmp(&rhs.2.x)
        };
        lhs.0
            .cmp(&rhs.0)
            .then(rhs.2.y.total_cmp(&lhs.2.y))
            .then(within_row)
    });

    let mut groups: Vec<Vec<&TextFragment>> = Vec::new();
    let mut open_row: Option<u32> = None;
    for (row, _, frag) in ordered {
        let joins_last = open_row == Some(row)
            && groups.last().is_some_and(|group| {
                let first = group[0];
                let tolerance = first.height.min(frag.height) * 0.2;
                (first.y - frag.y).abs() < tolerance && first.mcid == frag.mcid
            });
        match groups.last_mut() {
            Some(open) if joins_last => open.push(frag),
            _ => {
                groups.push(vec![frag]);
                open_row = Some(row);
            }
        }
    }

    groups
        .into_iter()
        .map(|group| build_line_fragment(group, self.options.space_threshold))
        .collect()
}
fn merge_into_paragraphs(&self, lines: &[TextFragment]) -> Vec<TextFragment> {
if lines.is_empty() {
return Vec::new();
}
let mut heights: Vec<f64> = lines.iter().map(|l| l.height).collect();
heights.sort_by(f64::total_cmp);
let median_h = heights[heights.len() / 2];
let max_paragraph_gap = median_h * 1.5;
let mut paragraphs: Vec<TextFragment> = Vec::new();
let mut current = lines[0].clone();
for line in &lines[1..] {
let prev_bottom = current.y;
let line_top = line.y + line.height;
let gap = prev_bottom - line_top;
if gap < 0.0 || gap > max_paragraph_gap || current.mcid != line.mcid {
paragraphs.push(current);
current = line.clone();
continue;
}
let joined_text = if self.options.merge_hyphenated && current.text.ends_with('-') {
let mut s = current.text.clone();
s.pop(); s.push_str(&line.text);
s
} else {
format!("{}\n{}", current.text, line.text)
};
let x_min = current.x.min(line.x);
let x_max = (current.x + current.width).max(line.x + line.width);
let y_min = current.y.min(line.y);
let y_max = (current.y + current.height).max(line.y + line.height);
current = TextFragment {
text: joined_text,
x: x_min,
y: y_min,
width: x_max - x_min,
height: y_max - y_min,
font_size: current.font_size,
font_name: current.font_name.clone(),
is_bold: current.is_bold,
is_italic: current.is_italic,
color: current.color,
space_decisions: Vec::new(),
mcid: current.mcid,
struct_tag: current.struct_tag.clone(),
};
}
paragraphs.push(current);
paragraphs
}
/// Extracts text from every page of `document`, in page order.
///
/// # Errors
/// Returns the first error produced while counting pages or extracting a
/// page; pages after the failing one are not processed.
pub fn extract_from_document<R: Read + Seek>(
    &mut self,
    document: &PdfDocument<R>,
) -> ParseResult<Vec<ExtractedText>> {
    let total_pages = document.page_count()?;
    (0..total_pages)
        .map(|index| self.extract_from_page(document, index))
        .collect()
}
/// Extracts the text of the page at `page_index`.
///
/// The page's font resources are cached first, then every content stream is
/// parsed and its operators are interpreted in order while a `TextState` is
/// kept up to date. Decoded text is appended to a plain string; when
/// `preserve_layout` is enabled, positioned fragments are also collected and
/// the final text is rebuilt from them.
///
/// # Errors
/// Propagates failures from fetching the page, scanning font resources,
/// obtaining the content streams, and decoding text. A content stream that
/// fails to parse is logged at debug level and skipped.
pub fn extract_from_page<R: Read + Seek>(
&mut self,
document: &PdfDocument<R>,
page_index: u32,
) -> ParseResult<ExtractedText> {
let page = document.get_page(page_index)?;
{
let _span = tracing::info_span!("font_resources").entered();
self.extract_font_resources(&page, document)?;
}
let streams = {
let _span = tracing::info_span!("stream_decompress").entered();
page.content_streams_with_document(document)?
};
let mut extracted_text = String::new();
let mut fragments = Vec::new();
let mut state = TextState::default();
let mut in_text_object = false;
// End position of the most recently shown text; drives the space/newline
// heuristic of the plain show-text operator.
let mut last_x = 0.0;
let mut last_y = 0.0;
// Named property lists of the page, used to resolve marked-content
// properties that are given by resource name.
let page_properties: Option<&crate::parser::objects::PdfDictionary> = page
.get_resources()
.and_then(|res| match res.get("Properties") {
Some(crate::parser::objects::PdfObject::Dictionary(d)) => Some(d),
_ => None,
});
for (stream_idx, stream_data) in streams.iter().enumerate() {
let operations = match {
let _span = tracing::info_span!("content_parse").entered();
ContentParser::parse_content(stream_data)
} {
Ok(ops) => ops,
Err(e) => {
// Best effort: report the broken stream and carry on with the next one.
tracing::debug!(
"Warning: Failed to parse content stream on page {}, stream {}/{}",
page_index + 1,
stream_idx + 1,
streams.len()
);
tracing::debug!(" Error: {}", e);
tracing::debug!(" Stream size: {} bytes", stream_data.len());
let preview_len = stream_data.len().min(100);
let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
tracing::debug!(
" Stream preview (first {} bytes): {:?}",
preview_len,
preview.chars().take(80).collect::<String>()
);
continue;
}
};
let _ops_span = tracing::info_span!("text_ops_loop").entered();
for op in operations {
match op {
ContentOperation::BeginText => {
// A new text object starts with identity text and line matrices.
in_text_object = true;
state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
}
ContentOperation::EndText => {
in_text_object = false;
}
ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
// Replaces both the text matrix and the line matrix.
state.text_matrix =
[a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
state.text_line_matrix =
[a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
}
ContentOperation::MoveText(tx, ty) => {
// Translation relative to the start of the current line.
let new_matrix = multiply_matrix(
&[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
&state.text_line_matrix,
);
state.text_matrix = new_matrix;
state.text_line_matrix = new_matrix;
}
ContentOperation::NextLine => {
// Moves down by the current leading.
let new_matrix = multiply_matrix(
&[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
&state.text_line_matrix,
);
state.text_matrix = new_matrix;
state.text_line_matrix = new_matrix;
}
ContentOperation::ShowText(text) => {
if in_text_object {
let text_bytes = &text;
let decoded = self.decode_text(text_bytes, &state)?;
let (x, y) = text_origin(&state);
// Separator heuristic against the end of the previously shown text:
// a large vertical jump yields a newline, otherwise a large enough
// horizontal gap yields a space.
if !extracted_text.is_empty() {
let dx = x - last_x;
let dy = (y - last_y).abs();
if dy > self.options.newline_threshold {
extracted_text.push('\n');
} else if dx > self.options.space_threshold * state.font_size {
extracted_text.push(' ');
}
}
extracted_text.push_str(&decoded);
let text_width = {
let font_info = state
.font_name
.as_ref()
.and_then(|name| self.font_cache.get(name));
calculate_text_width(&decoded, state.font_size, font_info)
};
if self.options.preserve_layout {
emit_text_fragment(
&mut fragments,
&decoded,
text_width,
x,
y,
&mut state,
self.options.include_artifacts,
);
}
last_x = x + text_width;
last_y = y;
// Advance the text matrix past the text that was just shown.
let tx = text_width * state.horizontal_scale / 100.0;
state.text_matrix =
multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
}
}
ContentOperation::ShowTextArray(array) => {
if in_text_object {
for item in array {
match item {
TextElement::Text(text_bytes) => {
// NOTE(review): unlike ShowText, this arm neither applies the
// last_x/last_y separator heuristic nor updates last_x/last_y.
let decoded = self.decode_text(&text_bytes, &state)?;
extracted_text.push_str(&decoded);
let text_width = {
let font_info = state
.font_name
.as_ref()
.and_then(|name| self.font_cache.get(name));
calculate_text_width(
&decoded,
state.font_size,
font_info,
)
};
if self.options.preserve_layout {
let (x, y) = text_origin(&state);
emit_text_fragment(
&mut fragments,
&decoded,
text_width,
x,
y,
&mut state,
self.options.include_artifacts,
);
}
let tx = text_width * state.horizontal_scale / 100.0;
state.text_matrix = multiply_matrix(
&[1.0, 0.0, 0.0, 1.0, tx, 0.0],
&state.text_matrix,
);
}
TextElement::Spacing(adjustment) => {
// The adjustment is in thousandths of the font size; a negative
// adjustment yields a positive displacement.
let tx = -(adjustment as f64) / 1000.0 * state.font_size;
// A sufficiently large displacement counts as a word gap, unless
// the text already ends with a space.
if tx > self.options.tj_space_threshold * state.font_size
&& !extracted_text.is_empty()
&& !extracted_text.ends_with(' ')
{
extracted_text.push(' ');
// No space fragment while a replacement-text run is accumulating.
if self.options.preserve_layout
&& state.pending_actualtext.is_none()
{
let (sx, sy) = text_origin(&state);
emit_text_fragment(
&mut fragments,
" ",
tx,
sx,
sy,
&mut state,
self.options.include_artifacts,
);
}
}
state.text_matrix = multiply_matrix(
&[1.0, 0.0, 0.0, 1.0, tx, 0.0],
&state.text_matrix,
);
}
}
}
}
}
ContentOperation::NextLineShowText(text) => {
if in_text_object {
// Move to the next line first, then show the text; a newline is
// always inserted before it when text already exists.
let new_matrix = multiply_matrix(
&[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
&state.text_line_matrix,
);
state.text_matrix = new_matrix;
state.text_line_matrix = new_matrix;
let decoded = self.decode_text(&text, &state)?;
let (x, y) = text_origin(&state);
if !extracted_text.is_empty() {
extracted_text.push('\n');
}
extracted_text.push_str(&decoded);
let text_width = {
let font_info = state
.font_name
.as_ref()
.and_then(|name| self.font_cache.get(name));
calculate_text_width(&decoded, state.font_size, font_info)
};
if self.options.preserve_layout {
emit_text_fragment(
&mut fragments,
&decoded,
text_width,
x,
y,
&mut state,
self.options.include_artifacts,
);
}
last_x = x + text_width;
last_y = y;
let tx = text_width * state.horizontal_scale / 100.0;
state.text_matrix =
multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
}
}
ContentOperation::SetSpacingNextLineShowText(word_space, char_space, text) => {
if in_text_object {
// Same as NextLineShowText, after storing the two spacing values.
state.word_space = word_space as f64;
state.char_space = char_space as f64;
let new_matrix = multiply_matrix(
&[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
&state.text_line_matrix,
);
state.text_matrix = new_matrix;
state.text_line_matrix = new_matrix;
let decoded = self.decode_text(&text, &state)?;
let (x, y) = text_origin(&state);
if !extracted_text.is_empty() {
extracted_text.push('\n');
}
extracted_text.push_str(&decoded);
let text_width = {
let font_info = state
.font_name
.as_ref()
.and_then(|name| self.font_cache.get(name));
calculate_text_width(&decoded, state.font_size, font_info)
};
if self.options.preserve_layout {
emit_text_fragment(
&mut fragments,
&decoded,
text_width,
x,
y,
&mut state,
self.options.include_artifacts,
);
}
last_x = x + text_width;
last_y = y;
let tx = text_width * state.horizontal_scale / 100.0;
state.text_matrix =
multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
}
}
ContentOperation::SetFont(name, size) => {
state.font_name = Some(name);
state.font_size = size as f64;
}
ContentOperation::SetLeading(leading) => {
state.leading = leading as f64;
}
ContentOperation::SetCharSpacing(spacing) => {
state.char_space = spacing as f64;
}
ContentOperation::SetWordSpacing(spacing) => {
state.word_space = spacing as f64;
}
ContentOperation::SetHorizontalScaling(scale) => {
state.horizontal_scale = scale as f64;
}
ContentOperation::SetTextRise(rise) => {
state.text_rise = rise as f64;
}
ContentOperation::SetTextRenderMode(mode) => {
state.render_mode = mode as u8;
}
ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
// Concatenates the operand matrix with the current CTM
// (operand matrix on the left, current CTM on the right).
let [a0, b0, c0, d0, e0, f0] = state.ctm;
let a = a as f64;
let b = b as f64;
let c = c as f64;
let d = d as f64;
let e = e as f64;
let f = f as f64;
state.ctm = [
a * a0 + b * c0,
a * b0 + b * d0,
c * a0 + d * c0,
c * b0 + d * d0,
e * a0 + f * c0 + e0,
e * b0 + f * d0 + f0,
];
}
ContentOperation::SaveGraphicsState => {
// Only the CTM and the fill colour are saved and restored here.
state.saved_states.push(SavedGraphicsState {
ctm: state.ctm,
fill_color: state.fill_color,
});
}
ContentOperation::RestoreGraphicsState => {
// A restore without a matching save is ignored.
if let Some(saved) = state.saved_states.pop() {
state.ctm = saved.ctm;
state.fill_color = saved.fill_color;
}
}
ContentOperation::SetNonStrokingGray(gray) => {
state.fill_color = Some(Color::gray(gray as f64));
}
ContentOperation::SetNonStrokingRGB(r, g, b) => {
state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
}
ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
state.fill_color =
Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
}
ContentOperation::BeginMarkedContent(tag) => {
// Artifact status is inherited from the enclosing section.
let parent_artifact = state.mc_stack.last().is_some_and(|e| e.is_artifact);
state.mc_stack.push(MarkedContentEntry {
is_artifact: tag == "Artifact" || parent_artifact,
tag,
mcid: None,
actual_text: None,
});
}
ContentOperation::BeginMarkedContentWithProps(tag, props) => {
let parent_artifact = state.mc_stack.last().is_some_and(|e| e.is_artifact);
let (mcid, actual_text) = resolve_props(&props, page_properties);
// Replacement text opens a pending run; `stack_depth` is the depth
// before this section is pushed, so the matching end operator can be
// recognised later. A run that is already pending is overwritten.
if let Some(ref text) = actual_text {
state.pending_actualtext = Some(PendingActualText {
text: text.clone(),
first_x: 0.0,
first_y: 0.0,
width: 0.0,
font_size: state.font_size,
font_name: state.font_name.clone(),
is_bold: false, is_italic: false,
color: state.fill_color,
stack_depth: state.mc_stack.len(), populated: false,
});
}
state.mc_stack.push(MarkedContentEntry {
is_artifact: tag == "Artifact" || parent_artifact,
tag,
mcid,
actual_text,
});
}
ContentOperation::EndMarkedContent => {
// Depth before popping; equals the opener's recorded depth plus one
// when this end operator closes the section that owns the pending run.
let popped_depth = state.mc_stack.len();
if state.mc_stack.pop().is_none() {
tracing::debug!(
"extraction: EMC with empty marked-content stack on page {}",
page_index + 1
);
} else if let Some(pending) = state.pending_actualtext.as_ref() {
if pending.stack_depth + 1 == popped_depth {
let run = state.pending_actualtext.take().unwrap();
// Emit the replacement text as one fragment, but only if some glyph
// run supplied a position. The MCID/tag and artifact status are
// looked up on the stack after the pop, i.e. from enclosing sections.
if run.populated && self.options.preserve_layout {
let (mcid, struct_tag) = innermost_mc_tag(&state.mc_stack);
let in_artifact = state.mc_stack.iter().any(|e| e.is_artifact);
if !in_artifact || self.options.include_artifacts {
fragments.push(TextFragment {
text: run.text,
x: run.first_x,
y: run.first_y,
width: run.width,
height: run.font_size,
font_size: run.font_size,
font_name: run.font_name,
is_bold: run.is_bold,
is_italic: run.is_italic,
color: run.color,
space_decisions: Vec::new(),
mcid,
struct_tag,
});
}
}
}
}
}
// Every other operator is irrelevant to text extraction.
_ => {
}
}
}
}
{
let _span = tracing::info_span!("layout_finalize").entered();
// Positional sorting is skipped when paragraphs are reconstructed.
if self.options.sort_by_position
&& !self.options.reconstruct_paragraphs
&& !fragments.is_empty()
{
self.sort_and_merge_fragments(&mut fragments);
}
if self.options.preserve_layout && !fragments.is_empty() {
fragments = self.merge_close_fragments(&fragments);
}
if self.options.reconstruct_paragraphs && !fragments.is_empty() {
let lines = self.merge_into_lines(&fragments);
fragments = self.merge_into_paragraphs(&lines);
}
// With layout preservation the text built from the operator stream is
// replaced by text rebuilt from the final fragments.
if self.options.preserve_layout && !fragments.is_empty() {
extracted_text = self.reconstruct_text_from_fragments(&fragments);
}
}
Ok(ExtractedText {
text: extracted_text,
fragments,
})
}
/// Sorts fragments top-to-bottom, then left-to-right.
///
/// Vertical positions are bucketed into bands of `newline_threshold` height
/// (negated so that larger `y` sorts first); a non-positive threshold falls
/// back to comparing `-y` directly. Column detection runs afterwards when
/// enabled.
fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
    let band_height = self.options.newline_threshold;
    let band_of = |y: f64| {
        if band_height > 0.0 {
            (-y / band_height).round()
        } else {
            -y
        }
    };
    fragments.sort_by(|lhs, rhs| {
        band_of(lhs.y)
            .total_cmp(&band_of(rhs.y))
            .then_with(|| lhs.x.total_cmp(&rhs.x))
    });

    if self.options.detect_columns {
        self.detect_and_sort_columns(fragments);
    }
}
fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
let mut current_line: Vec<&mut TextFragment> = Vec::new();
let mut last_y = f64::INFINITY;
for fragment in fragments.iter_mut() {
let fragment_y = fragment.y;
if (last_y - fragment_y).abs() > self.options.newline_threshold
&& !current_line.is_empty()
{
lines.push(current_line);
current_line = Vec::new();
}
current_line.push(fragment);
last_y = fragment_y;
}
if !current_line.is_empty() {
lines.push(current_line);
}
let mut column_boundaries = vec![0.0];
for line in &lines {
if line.len() > 1 {
for i in 0..line.len() - 1 {
let gap = line[i + 1].x - (line[i].x + line[i].width);
if gap > self.options.column_threshold {
let boundary = line[i].x + line[i].width + gap / 2.0;
if !column_boundaries
.iter()
.any(|&b| (b - boundary).abs() < 10.0)
{
column_boundaries.push(boundary);
}
}
}
}
}
column_boundaries.sort_by(|a, b| a.total_cmp(b));
if column_boundaries.len() > 1 {
fragments.sort_by(|a, b| {
let col_a = column_boundaries
.iter()
.position(|&boundary| a.x < boundary)
.unwrap_or(column_boundaries.len())
- 1;
let col_b = column_boundaries
.iter()
.position(|&boundary| b.x < boundary)
.unwrap_or(column_boundaries.len())
- 1;
if col_a != col_b {
col_a.cmp(&col_b)
} else {
b.y.total_cmp(&a.y)
}
});
}
}
/// Rebuilds a plain-text string from positioned fragments.
///
/// Close fragments are merged first. Between consecutive fragments a
/// newline is inserted when the vertical distance exceeds
/// `newline_threshold`, or a space when the horizontal gap exceeds
/// `space_threshold` times the font size. With hyphen merging enabled, a
/// line break after a fragment ending in `-` removes the hyphen and joins
/// the lines instead.
fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
    let mut text = String::new();
    let mut prev_y = f64::INFINITY;
    let mut prev_right = 0.0;
    let mut prev_hyphenated = false;

    for frag in self.merge_close_fragments(fragments) {
        if !text.is_empty() {
            let line_changed = (prev_y - frag.y).abs() > self.options.newline_threshold;
            if line_changed {
                if self.options.merge_hyphenated && prev_hyphenated {
                    if text.ends_with('-') {
                        text.pop();
                    }
                } else {
                    text.push('\n');
                }
            } else if frag.x - prev_right > self.options.space_threshold * frag.font_size {
                text.push(' ');
            }
        }

        text.push_str(&frag.text);
        prev_hyphenated = frag.text.ends_with('-');
        prev_y = frag.y;
        prev_right = frag.x + frag.width;
    }
    text
}
/// Merges consecutive fragments that sit next to each other on one baseline.
///
/// A fragment is appended to its predecessor when their `y` values differ by
/// less than the vertical tolerance, the horizontal gap is non-negative and
/// smaller than half the fragment's font size, and both share the same MCID.
/// The tolerance is `1.0`, or half the smaller font size (falling back to
/// `1.0` if that is not positive) when paragraph reconstruction is enabled.
/// A space is inserted when the gap exceeds `space_threshold` times the font
/// size.
fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
    let Some((first, rest)) = fragments.split_first() else {
        return Vec::new();
    };

    let mut out = Vec::new();
    let mut open = first.clone();
    for next in rest {
        let vertical_tolerance = if self.options.reconstruct_paragraphs {
            let half_size = 0.5 * open.font_size.min(next.font_size);
            if half_size > 0.0 {
                half_size
            } else {
                1.0
            }
        } else {
            1.0
        };
        let gap = next.x - (open.x + open.width);
        let same_baseline = (open.y - next.y).abs() < vertical_tolerance;
        let adjacent = gap >= 0.0 && gap < next.font_size * 0.5;

        if same_baseline && adjacent && open.mcid == next.mcid {
            if gap > self.options.space_threshold * next.font_size {
                open.text.push(' ');
            }
            open.text.push_str(&next.text);
            open.width = (next.x + next.width) - open.x;
        } else {
            out.push(std::mem::replace(&mut open, next.clone()));
        }
    }
    out.push(open);
    out
}
/// Refreshes the per-page font cache from the page's `Font` resources.
///
/// The cache is cleared first. If the page dictionary's `Resources` entry is
/// an indirect reference, that object is loaded from the document; otherwise
/// the resources returned by the page itself are used. Only font entries
/// that are indirect references are cached. Always returns `Ok(())`.
fn extract_font_resources<R: Read + Seek>(
    &mut self,
    page: &ParsedPage,
    document: &PdfDocument<R>,
) -> ParseResult<()> {
    self.font_cache.clear();

    let indirect_resources = page.dict.get("Resources").and_then(|obj| obj.as_reference());
    match indirect_resources {
        Some(target) => {
            if let Ok(PdfObject::Dictionary(resources)) = document.get_object(target.0, target.1)
            {
                if let Some(PdfObject::Dictionary(fonts)) = resources.get("Font") {
                    for (name, entry) in fonts.0.iter() {
                        if let Some(font_ref) = entry.as_reference() {
                            self.cache_font_by_ref::<R>(&name.0, font_ref, document);
                        }
                    }
                }
            }
        }
        None => {
            if let Some(resources) = page.get_resources() {
                if let Some(PdfObject::Dictionary(fonts)) = resources.get("Font") {
                    for (name, entry) in fonts.0.iter() {
                        if let Some(font_ref) = entry.as_reference() {
                            self.cache_font_by_ref::<R>(&name.0, font_ref, document);
                        }
                    }
                }
            }
        }
    }
    Ok(())
}
/// Registers the font behind `font_ref` under `font_name` in the page cache.
///
/// A font already parsed for the same object reference is reused. Otherwise
/// the font dictionary is loaded and parsed, and the result is stored in
/// both the object-level and the page-level cache. Load or parse failures
/// leave both caches untouched.
fn cache_font_by_ref<R: Read + Seek>(
    &mut self,
    font_name: &str,
    font_ref: (u32, u16),
    document: &PdfDocument<R>,
) {
    let (object_number, generation) = font_ref;

    if let Some(known) = self.font_object_cache.get(&font_ref) {
        self.font_cache.insert(font_name.to_string(), known.clone());
        tracing::debug!(
            "Reused cached font object ({}, {}): {} (ToUnicode: {})",
            object_number,
            generation,
            font_name,
            known.to_unicode.is_some()
        );
        return;
    }

    if let Ok(PdfObject::Dictionary(font_dict)) = document.get_object(object_number, generation) {
        let mut parser: CMapTextExtractor<R> = CMapTextExtractor::new();
        if let Ok(parsed) = parser.extract_font_info(&font_dict, document) {
            let has_to_unicode = parsed.to_unicode.is_some();
            self.font_object_cache.insert(font_ref, parsed.clone());
            self.font_cache.insert(font_name.to_string(), parsed);
            tracing::debug!(
                "Parsed and cached font ({}, {}): {} (ToUnicode: {})",
                object_number,
                generation,
                font_name,
                has_to_unicode
            );
        }
    }
}
fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
use crate::text::encoding::TextEncoding;
if let Some(ref font_name) = state.font_name {
if let Some(font_info) = self.font_cache.get(font_name) {
if let Ok(decoded) =
crate::text::extraction_cmap::decode_text_with_font(text, font_info)
{
if !decoded.trim().is_empty()
&& !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
{
let sanitized = sanitize_extracted_text(&decoded);
tracing::debug!(
"Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
font_name,
text,
sanitized
);
return Ok(sanitized);
}
}
tracing::debug!(
"CMap decoding failed or produced garbage for font {}, falling back to encoding",
font_name
);
}
}
let encoding = if let Some(ref font_name) = state.font_name {
match font_name.to_lowercase().as_str() {
name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
name if name.contains("standard") => TextEncoding::StandardEncoding,
name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
_ => {
if font_name.starts_with("Times")
|| font_name.starts_with("Helvetica")
|| font_name.starts_with("Courier")
{
TextEncoding::WinAnsiEncoding } else {
TextEncoding::PdfDocEncoding }
}
}
} else {
TextEncoding::WinAnsiEncoding };
let fallback_result = encoding.decode(text);
let sanitized = sanitize_extracted_text(&fallback_result);
tracing::debug!(
"Fallback encoding decoding: {:?} -> \"{}\"",
text,
sanitized
);
Ok(sanitized)
}
}
impl Default for TextExtractor {
fn default() -> Self {
Self::new()
}
}
/// Records one shown piece of text as a fragment, or folds it into the
/// pending replacement-text run.
///
/// Nothing happens for empty text, or for text inside an artifact section
/// unless `include_artifacts` is set. Width and font size are scaled by the
/// combined text matrix and CTM. While a replacement-text run is pending,
/// the first call stores position and styling on the run and every call adds
/// its width; no fragment is pushed. Otherwise a new fragment tagged with
/// the innermost MCID is appended to `fragments`.
fn emit_text_fragment(
    fragments: &mut Vec<TextFragment>,
    decoded: &str,
    text_width: f64,
    x: f64,
    y: f64,
    state: &mut TextState,
    include_artifacts: bool,
) {
    let suppressed = !include_artifacts && state.mc_stack.iter().any(|entry| entry.is_artifact);
    if decoded.is_empty() || suppressed {
        return;
    }

    let (is_bold, is_italic) = match state.font_name.as_deref() {
        Some(name) => parse_font_style(name),
        None => (false, false),
    };

    // Scale factors are the lengths of the matrix's two basis vectors.
    let [a, b, c, d, _, _] = multiply_matrix(&state.text_matrix, &state.ctm);
    let effective_width = text_width * (a * a + b * b).sqrt();
    let effective_size = state.font_size * (c * c + d * d).sqrt();

    let font_name = state.font_name.clone();
    let color = state.fill_color;

    if let Some(run) = state.pending_actualtext.as_mut() {
        if !run.populated {
            run.first_x = x;
            run.first_y = y;
            run.font_size = effective_size;
            run.font_name = font_name;
            run.is_bold = is_bold;
            run.is_italic = is_italic;
            run.color = color;
            run.populated = true;
        }
        run.width += effective_width;
        return;
    }

    let (mcid, struct_tag) = innermost_mc_tag(&state.mc_stack);
    fragments.push(TextFragment {
        text: decoded.to_owned(),
        x,
        y,
        width: effective_width,
        height: effective_size,
        font_size: effective_size,
        font_name,
        is_bold,
        is_italic,
        color,
        space_decisions: Vec::new(),
        mcid,
        struct_tag,
    });
}
/// Returns where the text-space origin lands after applying the text matrix
/// followed by the CTM.
fn text_origin(state: &TextState) -> (f64, f64) {
    let text_to_page = multiply_matrix(&state.text_matrix, &state.ctm);
    let (origin_x, origin_y) = transform_point(0.0, 0.0, &text_to_page);
    (origin_x, origin_y)
}
/// Multiplies two affine matrices stored as `[a, b, c, d, e, f]`.
///
/// The result applies `a` first and `b` second.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;
    [
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
/// Decodes the bytes of a PDF text string.
///
/// * A leading `FE FF` byte-order mark selects UTF-16BE; a trailing odd byte
///   is ignored and invalid sequences become U+FFFD.
/// * A leading `EF BB BF` byte-order mark selects UTF-8 (permitted for text
///   strings since PDF 2.0); invalid sequences become U+FFFD.
/// * Anything else is mapped byte-for-byte to the code point of the same
///   value.
///
/// The byte-order mark itself is never part of the returned string.
fn decode_pdf_string(bytes: &[u8]) -> String {
    match bytes {
        [0xFE, 0xFF, utf16 @ ..] => {
            let code_units: Vec<u16> = utf16
                .chunks_exact(2)
                .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
                .collect();
            String::from_utf16_lossy(&code_units)
        }
        [0xEF, 0xBB, 0xBF, utf8 @ ..] => String::from_utf8_lossy(utf8).into_owned(),
        _ => bytes.iter().map(|&b| char::from(b)).collect(),
    }
}
/// Resolves the `MCID` and `ActualText` of a marked-content property list.
///
/// Inline properties are read directly. A property list given by resource
/// name is looked up in `properties`; a missing dictionary, a missing entry
/// or an entry that is not a dictionary yields `(None, None)`. An `MCID` is
/// only accepted when it is an integer that fits in `u32`, and `ActualText`
/// only when it is a string.
fn resolve_props(
    props: &crate::parser::content::MarkedContentProps,
    properties: Option<&crate::parser::objects::PdfDictionary>,
) -> (Option<u32>, Option<String>) {
    use crate::parser::content::{MarkedContentProps, MarkedContentValue};

    // Accepts exactly the values in 0..=u32::MAX.
    let to_mcid = |raw: i64| u32::try_from(raw).ok();

    match props {
        MarkedContentProps::Inline(map) => {
            let mcid = match map.get("MCID") {
                Some(MarkedContentValue::Integer(n)) => to_mcid(*n),
                _ => None,
            };
            let actual_text = match map.get("ActualText") {
                Some(MarkedContentValue::String(bytes)) => Some(decode_pdf_string(bytes)),
                _ => None,
            };
            (mcid, actual_text)
        }
        MarkedContentProps::ResourceRef(name) => {
            let Some(properties) = properties else {
                return (None, None);
            };
            let Some(crate::parser::objects::PdfObject::Dictionary(dict)) = properties.get(name)
            else {
                return (None, None);
            };
            let mcid = match dict.get("MCID") {
                Some(crate::parser::objects::PdfObject::Integer(n)) => to_mcid(*n),
                _ => None,
            };
            let actual_text = match dict.get("ActualText") {
                Some(crate::parser::objects::PdfObject::String(s)) => {
                    Some(decode_pdf_string(s.as_bytes()))
                }
                _ => None,
            };
            (mcid, actual_text)
        }
    }
}
/// Returns the MCID and tag of the innermost stack entry that has an MCID,
/// or `(None, None)` when no entry carries one.
fn innermost_mc_tag(stack: &[MarkedContentEntry]) -> (Option<u32>, Option<String>) {
    for entry in stack.iter().rev() {
        if entry.mcid.is_some() {
            return (entry.mcid, Some(entry.tag.clone()));
        }
    }
    (None, None)
}
/// Applies the affine matrix `[a, b, c, d, e, f]` to the point `(x, y)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
/// Estimates the width of `text` at `font_size`.
///
/// When the font provides a widths table, each character's code point is
/// used as an index relative to `first_char` (default 0); code points
/// outside `first_char..=last_char` (default upper bound 255) or beyond the
/// table use `missing_width` (default 500). Widths are in thousandths of the
/// font size, and kerning for adjacent pairs is added when the font has a
/// kerning table.
///
/// Without a font or without a widths table, every character is assumed to
/// be half the font size wide. Characters are counted, not UTF-8 bytes, so
/// non-ASCII text is not overestimated.
fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
    if let Some(font) = font_info {
        if let Some(ref widths) = font.metrics.widths {
            let first_char = font.metrics.first_char.unwrap_or(0);
            let last_char = font.metrics.last_char.unwrap_or(255);
            let missing_width = font.metrics.missing_width.unwrap_or(500.0);
            let mut total_width = 0.0;
            // Peekable so the kerning lookup can see the following character.
            let mut chars = text.chars().peekable();
            while let Some(ch) = chars.next() {
                let char_code = ch as u32;
                let width = if char_code >= first_char && char_code <= last_char {
                    let index = (char_code - first_char) as usize;
                    widths.get(index).copied().unwrap_or(missing_width)
                } else {
                    missing_width
                };
                total_width += width / 1000.0 * font_size;
                if let Some(ref kerning) = font.metrics.kerning {
                    if let Some(&next_ch) = chars.peek() {
                        let next_char = next_ch as u32;
                        if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
                            total_width += kern_value / 1000.0 * font_size;
                        }
                    }
                }
            }
            return total_width;
        }
    }
    // No metrics available: assume half an em per character.
    text.chars().count() as f64 * font_size * 0.5
}
/// Cleans decoded text for output.
///
/// * NUL becomes a space (a directly following U+0003 is swallowed with it).
/// * Runs of spaces collapse to one; a space directly after a tab is dropped.
/// * Tab, line feed and carriage return are kept.
/// * All other ASCII control characters are removed.
pub fn sanitize_extracted_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    let mut after_space = false;
    let mut remaining = text.chars().peekable();

    while let Some(current) = remaining.next() {
        if current == '\0' || current == ' ' {
            if current == '\0' {
                // A NUL immediately followed by U+0003 counts as one unit.
                remaining.next_if_eq(&'\u{3}');
            }
            if !after_space {
                cleaned.push(' ');
                after_space = true;
            }
            continue;
        }
        if matches!(current, '\t' | '\n' | '\r') {
            cleaned.push(current);
            after_space = current == '\t';
            continue;
        }
        if current.is_ascii_control() {
            // Dropped without affecting space collapsing.
            continue;
        }
        cleaned.push(current);
        after_space = false;
    }
    cleaned
}
/// Assigns a row id to every fragment, in input order.
///
/// The id starts at 0 and is incremented whenever a fragment's `y` exceeds
/// the previous fragment's `y` by more than `max(font_size * 0.5, 2.0)`.
/// The returned vector has one entry per input fragment.
fn assign_row_ids(fragments: &[TextFragment]) -> Vec<u32> {
    let mut current_row: u32 = 0;
    let mut previous_y: Option<f64> = None;

    let ids: Vec<u32> = fragments
        .iter()
        .map(|frag| {
            let jumped_up = previous_y
                .is_some_and(|py| frag.y - py > (frag.font_size * 0.5).max(2.0));
            if jumped_up {
                current_row += 1;
            }
            previous_y = Some(frag.y);
            current_row
        })
        .collect();

    debug_assert_eq!(
        ids.len(),
        fragments.len(),
        "assign_row_ids: output length must equal input length"
    );
    ids
}
/// Collapses the fragments of one line into a single fragment.
///
/// Texts are concatenated in order, with a space wherever the gap to the
/// previous fragment exceeds `space_threshold` times the fragment's font
/// size. The result covers the bounding box of all parts and takes its
/// styling, MCID and tag from the first fragment.
///
/// # Panics
/// Panics if `line` is empty.
fn build_line_fragment(line: Vec<&TextFragment>, space_threshold: f64) -> TextFragment {
    let head = line[0];
    let (mut left, mut right) = (head.x, head.x + head.width);
    let (mut bottom, mut top) = (head.y, head.y + head.height);
    let mut text = String::new();
    let mut previous: Option<&TextFragment> = None;

    for &frag in &line {
        if let Some(prev) = previous {
            let gap = frag.x - (prev.x + prev.width);
            if gap > space_threshold * frag.font_size {
                text.push(' ');
            }
        }
        text.push_str(&frag.text);

        left = left.min(frag.x);
        right = right.max(frag.x + frag.width);
        bottom = bottom.min(frag.y);
        top = top.max(frag.y + frag.height);
        previous = Some(frag);
    }

    TextFragment {
        text,
        x: left,
        y: bottom,
        width: right - left,
        height: top - bottom,
        font_size: head.font_size,
        font_name: head.font_name.clone(),
        is_bold: head.is_bold,
        is_italic: head.is_italic,
        color: head.color,
        space_decisions: Vec::new(),
        mcid: head.mcid,
        struct_tag: head.struct_tag.clone(),
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for the text-extraction helpers of this module: matrix
    //! math, option defaults, font-style parsing, text-width calculation,
    //! kerning-table parsing, line / paragraph merging, row-id assignment,
    //! PDF string decoding and marked-content property resolution.

    use super::*;
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // ------------------------------------------------------------------
    // Shared fixtures
    // ------------------------------------------------------------------

    /// Builds a `FontInfo` named `name` with font type `"Type1"`, the given
    /// `metrics`, and every other optional field left as `None`.
    fn type1_font(name: &str, metrics: FontMetrics) -> FontInfo {
        FontInfo {
            name: name.to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            cid_ordering: None,
            metrics,
            cid_encoding: None,
        }
    }

    /// Builds a plain fragment whose `height` equals `font_size` and whose
    /// style, colour and marked-content fields are all unset.
    fn tf(text: &str, x: f64, y: f64, width: f64, font_size: f64) -> TextFragment {
        TextFragment {
            text: text.to_string(),
            x,
            y,
            width,
            height: font_size,
            font_size,
            font_name: None,
            is_bold: false,
            is_italic: false,
            color: None,
            space_decisions: Vec::new(),
            mcid: None,
            struct_tag: None,
        }
    }

    /// Default options with only `reconstruct_paragraphs` switched on.
    fn reconstruct_options() -> ExtractionOptions {
        ExtractionOptions {
            reconstruct_paragraphs: true,
            ..Default::default()
        }
    }

    /// Two columns of two fragments each, listed one column after the other.
    fn two_column_fragments() -> Vec<TextFragment> {
        vec![
            tf("col1-top", 50.0, 400.0, 80.0, 10.0),
            tf("col1-bot", 50.0, 395.0, 80.0, 10.0),
            tf("col2-top", 200.0, 405.0, 80.0, 10.0),
            tf("col2-bot", 200.0, 400.0, 80.0, 10.0),
        ]
    }

    /// A body fragment followed by a short fragment placed 3.5 units higher.
    fn superscript_pair() -> Vec<TextFragment> {
        vec![
            tf("body-text", 50.0, 400.0, 25.0, 10.0),
            tf("1", 79.0, 403.5, 4.0, 10.0),
        ]
    }

    // ------------------------------------------------------------------
    // Matrix helpers
    // ------------------------------------------------------------------

    /// Multiplying by the identity on either side leaves a translation intact.
    #[test]
    fn test_matrix_multiplication() {
        let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let result = multiply_matrix(&identity, &translation);
        assert_eq!(result, translation);
        let result2 = multiply_matrix(&translation, &identity);
        assert_eq!(result2, translation);
    }

    /// A pure translation shifts the point by its last two components.
    #[test]
    fn test_transform_point() {
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let (x, y) = transform_point(5.0, 5.0, &translation);
        assert_eq!(x, 15.0);
        assert_eq!(y, 25.0);
    }

    /// Rotation, scaling and a combined matrix map points as expected.
    #[test]
    fn test_matrix_operations() {
        // Maps (1, 0) onto (0, 1).
        let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0];
        let (x, y) = transform_point(1.0, 0.0, &rotation);
        assert_eq!(x, 0.0);
        assert_eq!(y, 1.0);

        // Independent scale factors per axis.
        let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
        let (x, y) = transform_point(5.0, 5.0, &scale);
        assert_eq!(x, 10.0);
        assert_eq!(y, 15.0);

        // x' = 2*1 + 1*1 + 10, y' = 1*1 + 2*1 + 20.
        let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
        let (x, y) = transform_point(1.0, 1.0, &complex);
        assert_eq!(x, 13.0);
        assert_eq!(y, 23.0);
    }

    // ------------------------------------------------------------------
    // Options, plain data structs and extractor construction
    // ------------------------------------------------------------------

    /// `ExtractionOptions::default()` carries the documented defaults.
    #[test]
    fn test_extraction_options_default() {
        let options = ExtractionOptions::default();
        assert!(!options.preserve_layout);
        assert_eq!(options.space_threshold, 0.3);
        assert_eq!(options.newline_threshold, 10.0);
        assert!(options.sort_by_position);
        assert!(!options.detect_columns);
        assert_eq!(options.column_threshold, 50.0);
        assert!(options.merge_hyphenated);
    }

    /// Explicitly set option values are stored unchanged.
    #[test]
    fn test_extraction_options_custom() {
        let options = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.5,
            tj_space_threshold: 0.15,
            newline_threshold: 15.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 75.0,
            merge_hyphenated: false,
            track_space_decisions: false,
            reconstruct_paragraphs: false,
            include_artifacts: false,
        };
        assert!(options.preserve_layout);
        assert_eq!(options.space_threshold, 0.5);
        assert_eq!(options.tj_space_threshold, 0.15);
        assert_eq!(options.newline_threshold, 15.0);
        assert!(!options.sort_by_position);
        assert!(options.detect_columns);
        assert_eq!(options.column_threshold, 75.0);
        assert!(!options.merge_hyphenated);
    }

    /// A fully spelled-out `TextFragment` literal keeps its field values.
    #[test]
    fn test_text_fragment() {
        let fragment = TextFragment {
            text: "Hello".to_string(),
            x: 100.0,
            y: 200.0,
            width: 50.0,
            height: 12.0,
            font_size: 10.0,
            font_name: None,
            is_bold: false,
            is_italic: false,
            color: None,
            space_decisions: Vec::new(),
            mcid: None,
            struct_tag: None,
        };
        assert_eq!(fragment.text, "Hello");
        assert_eq!(fragment.x, 100.0);
        assert_eq!(fragment.y, 200.0);
        assert_eq!(fragment.width, 50.0);
        assert_eq!(fragment.height, 12.0);
        assert_eq!(fragment.font_size, 10.0);
    }

    /// `ExtractedText` keeps its text and fragments in the given order.
    #[test]
    fn test_extracted_text() {
        // `tf` sets height = font_size, so the 12.0 height is overridden here.
        let fragments = vec![
            TextFragment {
                height: 12.0,
                ..tf("Hello", 100.0, 200.0, 50.0, 10.0)
            },
            TextFragment {
                height: 12.0,
                ..tf("World", 160.0, 200.0, 50.0, 10.0)
            },
        ];
        let extracted = ExtractedText {
            text: "Hello World".to_string(),
            fragments,
        };
        assert_eq!(extracted.text, "Hello World");
        assert_eq!(extracted.fragments.len(), 2);
        assert_eq!(extracted.fragments[0].text, "Hello");
        assert_eq!(extracted.fragments[1].text, "World");
    }

    /// `TextState::default()` starts from identity matrices and zeroed state.
    #[test]
    fn test_text_state_default() {
        let state = TextState::default();
        assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert_eq!(state.leading, 0.0);
        assert_eq!(state.char_space, 0.0);
        assert_eq!(state.word_space, 0.0);
        assert_eq!(state.horizontal_scale, 100.0);
        assert_eq!(state.text_rise, 0.0);
        assert_eq!(state.font_size, 0.0);
        assert!(state.font_name.is_none());
        assert_eq!(state.render_mode, 0);
    }

    /// `TextExtractor::new()` uses the default extraction options.
    #[test]
    fn test_text_extractor_new() {
        let extractor = TextExtractor::new();
        let options = extractor.options;
        assert!(!options.preserve_layout);
        assert_eq!(options.space_threshold, 0.3);
        assert_eq!(options.newline_threshold, 10.0);
        assert!(options.sort_by_position);
        assert!(!options.detect_columns);
        assert_eq!(options.column_threshold, 50.0);
        assert!(options.merge_hyphenated);
    }

    /// `TextExtractor::with_options` stores the options it was given.
    #[test]
    fn test_text_extractor_with_options() {
        let options = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.3,
            tj_space_threshold: 0.2,
            newline_threshold: 12.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 60.0,
            merge_hyphenated: false,
            track_space_decisions: false,
            reconstruct_paragraphs: false,
            include_artifacts: false,
        };
        let extractor = TextExtractor::with_options(options.clone());
        assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
        assert_eq!(extractor.options.space_threshold, options.space_threshold);
        assert_eq!(
            extractor.options.newline_threshold,
            options.newline_threshold
        );
        assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
        assert_eq!(extractor.options.detect_columns, options.detect_columns);
        assert_eq!(extractor.options.column_threshold, options.column_threshold);
        assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
    }

    // ------------------------------------------------------------------
    // parse_font_style
    // ------------------------------------------------------------------

    /// Bold markers in the font name set only the bold flag.
    #[test]
    fn test_parse_font_style_bold() {
        assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
        assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
        assert_eq!(parse_font_style("Arial Bold"), (true, false));
        assert_eq!(parse_font_style("Calibri Bold"), (true, false));
        assert_eq!(parse_font_style("Helvetica-B"), (true, false));
    }

    /// Italic / oblique markers set only the italic flag.
    #[test]
    fn test_parse_font_style_italic() {
        assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
        assert_eq!(parse_font_style("Times-Oblique"), (false, true));
        assert_eq!(parse_font_style("Arial Italic"), (false, true));
        assert_eq!(parse_font_style("Courier Oblique"), (false, true));
        assert_eq!(parse_font_style("Helvetica-I"), (false, true));
    }

    /// Combined markers set both flags.
    #[test]
    fn test_parse_font_style_bold_italic() {
        assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
        assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
        assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
    }

    /// Names without any style marker yield neither flag.
    #[test]
    fn test_parse_font_style_regular() {
        assert_eq!(parse_font_style("Helvetica"), (false, false));
        assert_eq!(parse_font_style("Times-Roman"), (false, false));
        assert_eq!(parse_font_style("Courier"), (false, false));
        assert_eq!(parse_font_style("Arial"), (false, false));
    }

    /// Empty names, unknown names and mixed case are handled.
    #[test]
    fn test_parse_font_style_edge_cases() {
        assert_eq!(parse_font_style(""), (false, false));
        assert_eq!(parse_font_style("UnknownFont"), (false, false));
        assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
        assert_eq!(parse_font_style("times-ITALIC"), (false, true));
    }

    // ------------------------------------------------------------------
    // calculate_text_width
    // ------------------------------------------------------------------

    /// Without font info the width is `len * font_size * 0.5`.
    #[test]
    fn test_calculate_text_width_with_no_font_info() {
        let width = calculate_text_width("Hello", 12.0, None);
        assert_eq!(
            width, 30.0,
            "Without font info, should use simplified calculation: len * font_size * 0.5"
        );
    }

    /// Font info lacking a widths array falls back to the simplified formula.
    #[test]
    fn test_calculate_text_width_with_empty_metrics() {
        let font_info = type1_font(
            "TestFont",
            FontMetrics {
                first_char: None,
                last_char: None,
                widths: None,
                missing_width: Some(500.0),
                kerning: None,
            },
        );
        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
        assert_eq!(
            width, 30.0,
            "Without widths array, should fall back to simplified calculation"
        );
    }

    /// With a complete widths array the per-glyph widths are summed.
    #[test]
    fn test_calculate_text_width_with_complete_metrics() {
        // Widths are indexed by `char code - first_char` (first_char = 32).
        let mut widths = vec![0.0; 95];
        widths[72 - 32] = 722.0; // 'H'
        widths[101 - 32] = 556.0; // 'e'
        widths[108 - 32] = 278.0; // 'l'
        widths[111 - 32] = 611.0; // 'o'
        let font_info = type1_font(
            "Helvetica",
            FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(widths),
                missing_width: Some(500.0),
                kerning: None,
            },
        );
        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
        let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
        let tolerance = 0.0001;
        assert!(
            (width - expected).abs() < tolerance,
            "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
            expected,
            width,
            (width - expected).abs()
        );
        let simplified = 5.0 * 12.0 * 0.5;
        assert_ne!(
            width, simplified,
            "Metrics-based calculation should differ from simplified (30.0)"
        );
    }

    /// Characters outside `first_char..=last_char` use `missing_width`.
    #[test]
    fn test_calculate_text_width_character_outside_range() {
        // The widths array covers 'A' (65) through 'Z' (90) only.
        let widths = vec![722.0; 26];
        let font_info = type1_font(
            "TestFont",
            FontMetrics {
                first_char: Some(65),
                last_char: Some(90),
                widths: Some(widths),
                missing_width: Some(500.0),
                kerning: None,
            },
        );
        let width = calculate_text_width("A1", 10.0, Some(&font_info));
        let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
        assert_eq!(
            width, expected,
            "Should use missing_width for characters outside range"
        );
    }

    /// A zero entry inside the widths array is honoured, not replaced.
    #[test]
    fn test_calculate_text_width_missing_width_in_array() {
        let mut widths = vec![500.0; 95];
        widths[10] = 0.0; // entry for char code 42 (42 - 32 = 10)
        let font_info = type1_font(
            "TestFont",
            FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(widths),
                missing_width: Some(600.0),
                kerning: None,
            },
        );
        let char_code = 42u8 as char;
        let text = char_code.to_string();
        let width = calculate_text_width(&text, 10.0, Some(&font_info));
        assert_eq!(
            width, 0.0,
            "Should use 0.0 width from array, not missing_width"
        );
    }

    /// An empty string has zero width with or without font info.
    #[test]
    fn test_calculate_text_width_empty_string() {
        let font_info = type1_font(
            "TestFont",
            FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(vec![500.0; 95]),
                missing_width: Some(500.0),
                kerning: None,
            },
        );
        let width = calculate_text_width("", 12.0, Some(&font_info));
        assert_eq!(width, 0.0, "Empty string should have zero width");
        let width_no_font = calculate_text_width("", 12.0, None);
        assert_eq!(
            width_no_font, 0.0,
            "Empty string should have zero width (no font)"
        );
    }

    /// A non-ASCII character outside the widths range uses `missing_width`.
    #[test]
    fn test_calculate_text_width_unicode_characters() {
        let font_info = type1_font(
            "TestFont",
            FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(vec![500.0; 95]),
                missing_width: Some(600.0),
                kerning: None,
            },
        );
        let width = calculate_text_width("ร", 10.0, Some(&font_info));
        assert_eq!(
            width, 6.0,
            "Unicode character outside range should use missing_width"
        );
    }

    /// Width scales linearly with the font size.
    #[test]
    fn test_calculate_text_width_different_font_sizes() {
        // Single-entry widths array covering only 'A' (65).
        let font_info = type1_font(
            "TestFont",
            FontMetrics {
                first_char: Some(65),
                last_char: Some(65),
                widths: Some(vec![722.0]),
                missing_width: Some(500.0),
                kerning: None,
            },
        );
        let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
        let width_20 = calculate_text_width("A", 20.0, Some(&font_info));
        assert_eq!(width_10, 722.0 / 1000.0 * 10.0);
        assert_eq!(width_20, 722.0 / 1000.0 * 20.0);
        assert_eq!(
            width_20,
            width_10 * 2.0,
            "Width should scale linearly with font size"
        );
    }

    /// A narrow proportional glyph is narrower than its monospace counterpart.
    #[test]
    fn test_calculate_text_width_proportional_vs_monospace() {
        // Both fonts cover char codes 105..=107 ('i', 'j', 'k').
        let proportional_widths = vec![278.0, 556.0, 722.0];
        let proportional_font = type1_font(
            "Helvetica",
            FontMetrics {
                first_char: Some(105),
                last_char: Some(107),
                widths: Some(proportional_widths),
                missing_width: Some(500.0),
                kerning: None,
            },
        );
        let monospace_widths = vec![600.0, 600.0, 600.0];
        let monospace_font = type1_font(
            "Courier",
            FontMetrics {
                first_char: Some(105),
                last_char: Some(107),
                widths: Some(monospace_widths),
                missing_width: Some(600.0),
                kerning: None,
            },
        );
        let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
        let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));
        assert!(
            prop_width < mono_width,
            "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
            prop_width,
            mono_width
        );
    }

    /// Kerning pairs adjust the width only for the exact (left, right) order.
    #[test]
    fn test_calculate_text_width_with_kerning() {
        use std::collections::HashMap;
        let mut widths = vec![500.0; 95];
        widths[65 - 32] = 722.0; // 'A'
        widths[86 - 32] = 722.0; // 'V'
        widths[87 - 32] = 944.0; // 'W'
        let mut kerning = HashMap::new();
        kerning.insert((65, 86), -50.0); // A followed by V
        kerning.insert((65, 87), -40.0); // A followed by W
        let font_info = type1_font(
            "Helvetica",
            FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(widths),
                missing_width: Some(500.0),
                kerning: Some(kerning),
            },
        );
        let tolerance = 0.0001;

        let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
        let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
        assert!(
            (width_av - expected_av).abs() < tolerance,
            "AV with kerning: expected {}, got {}, diff {}",
            expected_av,
            width_av,
            (width_av - expected_av).abs()
        );

        let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
        let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
        assert!(
            (width_aw - expected_aw).abs() < tolerance,
            "AW with kerning: expected {}, got {}, diff {}",
            expected_aw,
            width_aw,
            (width_aw - expected_aw).abs()
        );

        // The reversed pair has no kerning entry.
        let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
        let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
        assert!(
            (width_va - expected_va).abs() < tolerance,
            "VA without kerning: expected {}, got {}, diff {}",
            expected_va,
            width_va,
            (width_va - expected_va).abs()
        );
        assert!(
            width_av < width_va,
            "AV with kerning ({}) should be narrower than VA without kerning ({})",
            width_av,
            width_va
        );
    }

    // ------------------------------------------------------------------
    // TrueType kern table parsing
    // ------------------------------------------------------------------

    /// A hand-built font with one `kern` subtable yields both kerning pairs.
    ///
    /// NOTE(review): the field names in the byte comments follow the
    /// OpenType `kern` (format 0) layout; confirm against the parser.
    #[test]
    fn test_parse_truetype_kern_table_minimal() {
        use crate::text::extraction_cmap::parse_truetype_kern_table;
        // Offset table (12 bytes).
        let mut ttf_data = vec![
            0x00, 0x01, 0x00, 0x00, // sfnt version
            0x00, 0x02, // numTables = 2
            0x00, 0x20, // searchRange
            0x00, 0x01, // entrySelector
            0x00, 0x00, // rangeShift
        ];
        // Table record "head": checksum, offset 0x2C (44), length 0x36 (54).
        ttf_data.extend_from_slice(b"head");
        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]);
        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x2C]);
        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]);
        // Table record "kern": checksum, offset 0x62 (98), length 0x1E (30).
        ttf_data.extend_from_slice(b"kern");
        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]);
        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x62]);
        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x1E]);
        // Zero-filled "head" table body (54 bytes).
        ttf_data.extend_from_slice(&[0u8; 54]);
        // "kern" table body (30 bytes).
        ttf_data.extend_from_slice(&[
            0x00, 0x00, // table version
            0x00, 0x01, // nTables = 1
            0x00, 0x00, // subtable version
            0x00, 0x1A, // subtable length = 26
            0x00, 0x00, // coverage
            0x00, 0x02, // nPairs = 2
            0x00, 0x08, // searchRange
            0x00, 0x00, // entrySelector
            0x00, 0x04, // rangeShift
            0x00, 0x41, 0x00, 0x56, 0xFF, 0xCE, // left 65, right 86, value -50
            0x00, 0x41, 0x00, 0x57, 0xFF, 0xD8, // left 65, right 87, value -40
        ]);
        let result = parse_truetype_kern_table(&ttf_data);
        assert!(
            result.is_ok(),
            "Should parse minimal kern table successfully: {:?}",
            result.err()
        );
        let kerning_map = result.unwrap();
        assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");
        assert_eq!(
            kerning_map.get(&(65, 86)),
            Some(&-50.0),
            "Should have A+V kerning pair with value -50"
        );
        assert_eq!(
            kerning_map.get(&(65, 87)),
            Some(&-40.0),
            "Should have A+W kerning pair with value -40"
        );
    }

    /// A font that only contains a `head` table yields an empty kerning map.
    #[test]
    fn test_parse_kern_table_no_kern_table() {
        use crate::text::extraction_cmap::extract_truetype_kerning;
        // Offset table (12 bytes) followed by a single "head" table record.
        let mut ttf_data = vec![
            0x00, 0x01, 0x00, 0x00, // sfnt version
            0x00, 0x01, // numTables = 1
            0x00, 0x10, // searchRange
            0x00, 0x00, // entrySelector
            0x00, 0x00, // rangeShift
            b'h', b'e', b'a', b'd', // tag
            0x00, 0x00, 0x00, 0x00, // checksum
            0x00, 0x00, 0x00, 0x1C, // offset 28
            0x00, 0x00, 0x00, 0x36, // length 54
        ];
        // Zero-filled "head" table body (54 bytes).
        ttf_data.extend_from_slice(&[0u8; 54]);
        let result = extract_truetype_kerning(&ttf_data);
        assert!(
            result.is_ok(),
            "Should gracefully handle missing kern table"
        );
        let kerning_map = result.unwrap();
        assert!(
            kerning_map.is_empty(),
            "Should return empty HashMap when no kern table exists"
        );
    }

    // ------------------------------------------------------------------
    // merge_into_lines
    // ------------------------------------------------------------------

    /// Fragments that share a `y` value are joined into one line each.
    #[test]
    fn merge_into_lines_groups_same_baseline_fragments() {
        let extractor = TextExtractor::with_options(reconstruct_options());
        let input = vec![
            tf("Hello", 50.0, 400.0, 30.0, 12.0),
            tf("world", 90.0, 400.0, 30.0, 12.0),
            tf("now.", 130.0, 400.0, 25.0, 12.0),
            tf("Next", 50.0, 386.0, 30.0, 12.0),
            tf("line.", 90.0, 386.0, 25.0, 12.0),
        ];
        let lines = extractor.merge_into_lines(&input);
        assert_eq!(
            lines.len(),
            2,
            "two distinct baselines must produce two line fragments"
        );
        assert_eq!(
            lines[0].text, "Hello world now.",
            "first line concatenated with spaces"
        );
        assert_eq!(lines[1].text, "Next line.", "second line concatenated");
    }

    /// A space is inserted only when the gap exceeds the space threshold.
    #[test]
    fn merge_into_lines_inserts_space_only_when_gap_exceeds_threshold() {
        let extractor = TextExtractor::with_options(ExtractionOptions {
            reconstruct_paragraphs: true,
            space_threshold: 0.3,
            ..Default::default()
        });
        // "AB" ends at x = 60; a gap of 4.0 is above 0.3 * 12.0.
        let with_gap = vec![
            tf("AB", 50.0, 400.0, 10.0, 12.0),
            tf("CD", 64.0, 400.0, 10.0, 12.0),
        ];
        let lines = extractor.merge_into_lines(&with_gap);
        assert_eq!(
            lines[0].text, "AB CD",
            "gap above threshold must insert space"
        );
        // A gap of 1.0 is below 0.3 * 12.0.
        let tight = vec![
            tf("AB", 50.0, 400.0, 10.0, 12.0),
            tf("CD", 61.0, 400.0, 10.0, 12.0),
        ];
        let lines = extractor.merge_into_lines(&tight);
        assert_eq!(lines[0].text, "ABCD", "tight gap must NOT insert space");
    }

    /// The merged line spans the union of its fragments horizontally.
    #[test]
    fn merge_into_lines_unioned_bounding_box() {
        let extractor = TextExtractor::with_options(reconstruct_options());
        let input = vec![
            tf("A", 50.0, 400.0, 10.0, 12.0),
            tf("B", 100.0, 400.0, 10.0, 12.0),
        ];
        let lines = extractor.merge_into_lines(&input);
        assert_eq!(lines.len(), 1);
        assert!((lines[0].x - 50.0).abs() < 0.01);
        assert!(
            (lines[0].width - 60.0).abs() < 0.01,
            "width must span 50->110"
        );
    }

    /// Two columns listed one after the other stay separate, in input order.
    #[test]
    fn merge_into_lines_splits_two_columns_emitted_sequentially() {
        let extractor = TextExtractor::with_options(reconstruct_options());
        let input = two_column_fragments();
        let lines = extractor.merge_into_lines(&input);
        assert_eq!(
            lines.len(),
            4,
            "two columns at near-identical Y must split into 4 lines"
        );
        assert_eq!(lines[0].text, "col1-top");
        assert_eq!(lines[0].y, 400.0);
        assert_eq!(lines[1].text, "col1-bot");
        assert_eq!(lines[1].y, 395.0);
        assert_eq!(lines[2].text, "col2-top");
        assert_eq!(lines[2].y, 405.0);
        assert_eq!(lines[3].text, "col2-bot");
        assert_eq!(lines[3].y, 400.0);
    }

    /// A single column still collapses same-line fragments together.
    #[test]
    fn merge_into_lines_preserves_single_column_continuation() {
        let extractor = TextExtractor::with_options(reconstruct_options());
        let input = vec![
            tf("Hello", 50.0, 400.0, 30.0, 10.0),
            tf("world", 90.0, 400.0, 30.0, 10.0),
            tf("next-line", 50.0, 395.0, 70.0, 10.0),
        ];
        let lines = extractor.merge_into_lines(&input);
        assert_eq!(
            lines.len(),
            2,
            "single column continuation must collapse to 2 lines"
        );
        assert!(lines[0].text.contains("Hello"));
        assert!(lines[0].text.contains("world"));
        assert_eq!(lines[1].text, "next-line");
    }

    /// Sharing one `mcid` across all fragments must not merge the columns.
    #[test]
    fn merge_into_lines_splits_columns_with_uniform_mcid() {
        let extractor = TextExtractor::with_options(reconstruct_options());
        let mut frags = two_column_fragments();
        for f in &mut frags {
            f.mcid = Some(0);
        }
        let lines = extractor.merge_into_lines(&frags);
        assert_eq!(
            lines.len(),
            4,
            "uniform mcid must not prevent row_id-based column split (NCSC root cause)"
        );
        assert_eq!(lines[0].text, "col1-top");
        assert_eq!(lines[1].text, "col1-bot");
        assert_eq!(lines[2].text, "col2-top");
        assert_eq!(lines[3].text, "col2-bot");
    }

    // ------------------------------------------------------------------
    // assign_row_ids
    // ------------------------------------------------------------------

    /// Steadily decreasing `y` never starts a new row.
    #[test]
    fn assign_row_ids_monotone_y_descending_keeps_zero() {
        let frags = vec![
            tf("A", 50.0, 400.0, 10.0, 9.0),
            tf("B", 50.0, 395.0, 10.0, 9.0),
            tf("C", 50.0, 390.0, 10.0, 9.0),
        ];
        let row_ids = super::assign_row_ids(&frags);
        assert_eq!(row_ids, vec![0u32, 0, 0]);
    }

    /// A positive `y` jump larger than the threshold starts a new row.
    #[test]
    fn assign_row_ids_increments_on_y_up_jump_above_threshold() {
        let frags = vec![
            tf("A", 50.0, 400.0, 10.0, 9.0),
            tf("B", 50.0, 395.0, 10.0, 9.0),
            tf("C", 50.0, 420.0, 10.0, 9.0),
        ];
        let row_ids = super::assign_row_ids(&frags);
        assert_eq!(row_ids, vec![0u32, 0, 1]);
    }

    /// A 2.5 rise at font size 9 stays below the 4.5 threshold.
    #[test]
    fn assign_row_ids_ignores_superscript_within_threshold() {
        let frags = vec![
            tf("A", 50.0, 400.0, 10.0, 9.0),
            tf("^2", 60.0, 402.5, 5.0, 9.0),
            tf("B", 65.0, 395.0, 10.0, 9.0),
        ];
        let row_ids = super::assign_row_ids(&frags);
        assert_eq!(row_ids, vec![0u32, 0, 0]);
    }

    /// At font size 3 the threshold is floored at 2.0, so 2.5 splits.
    #[test]
    fn assign_row_ids_floor_2pt_for_small_fonts() {
        let frags = vec![
            tf("A", 50.0, 100.0, 10.0, 3.0),
            tf("B", 50.0, 102.5, 10.0, 3.0),
        ];
        let row_ids = super::assign_row_ids(&frags);
        assert_eq!(row_ids, vec![0u32, 1]);
    }

    /// An empty slice yields an empty id list.
    #[test]
    fn assign_row_ids_empty_slice_returns_empty() {
        let frags: Vec<TextFragment> = vec![];
        let row_ids = super::assign_row_ids(&frags);
        assert!(row_ids.is_empty(), "empty input must yield empty output");
    }

    // ------------------------------------------------------------------
    // merge_close_fragments
    // ------------------------------------------------------------------

    /// With paragraph reconstruction on, a raised fragment is merged.
    #[test]
    fn merge_close_fragments_superscript_merges_when_reconstruct_paragraphs() {
        let extractor = TextExtractor::with_options(reconstruct_options());
        let frags = superscript_pair();
        let merged = extractor.merge_close_fragments(&frags);
        assert_eq!(
            merged.len(),
            1,
            "superscript within 5pt of baseline must merge in reconstruct path"
        );
        assert!(merged[0].text.contains("body-text"));
        assert!(merged[0].text.contains("1"));
    }

    /// With paragraph reconstruction off, the raised fragment stays separate.
    #[test]
    fn merge_close_fragments_superscript_does_not_merge_in_legacy_path() {
        let extractor = TextExtractor::with_options(ExtractionOptions {
            reconstruct_paragraphs: false,
            ..Default::default()
        });
        let frags = superscript_pair();
        let merged = extractor.merge_close_fragments(&frags);
        assert_eq!(
            merged.len(),
            2,
            "3.5pt Y delta exceeds legacy 1.0pt threshold; superscript stays separate"
        );
    }

    // ------------------------------------------------------------------
    // merge_into_paragraphs
    // ------------------------------------------------------------------

    /// Evenly spaced lines form one paragraph joined by newlines.
    #[test]
    fn merge_into_paragraphs_groups_consecutive_lines() {
        let extractor = TextExtractor::with_options(reconstruct_options());
        let lines = vec![
            tf("Line one.", 50.0, 400.0, 60.0, 12.0),
            tf("Line two.", 50.0, 386.0, 60.0, 12.0),
            tf("Line three.", 50.0, 372.0, 70.0, 12.0),
        ];
        let paragraphs = extractor.merge_into_paragraphs(&lines);
        assert_eq!(paragraphs.len(), 1);
        assert_eq!(paragraphs[0].text, "Line one.\nLine two.\nLine three.");
    }

    /// A large vertical gap starts a new paragraph.
    #[test]
    fn merge_into_paragraphs_splits_on_large_vertical_gap() {
        let extractor = TextExtractor::with_options(reconstruct_options());
        let lines = vec![
            tf("P1L1.", 50.0, 400.0, 40.0, 12.0),
            tf("P1L2.", 50.0, 386.0, 40.0, 12.0),
            tf("P2L1.", 50.0, 300.0, 40.0, 12.0),
        ];
        let paragraphs = extractor.merge_into_paragraphs(&lines);
        assert_eq!(paragraphs.len(), 2);
        assert_eq!(paragraphs[0].text, "P1L1.\nP1L2.");
        assert_eq!(paragraphs[1].text, "P2L1.");
    }

    /// With `merge_hyphenated`, a trailing hyphen is dropped and the word rejoined.
    #[test]
    fn merge_into_paragraphs_drops_hyphen_when_merge_hyphenated() {
        let extractor = TextExtractor::with_options(ExtractionOptions {
            reconstruct_paragraphs: true,
            merge_hyphenated: true,
            ..Default::default()
        });
        let lines = vec![
            tf("Kryp-", 50.0, 400.0, 30.0, 12.0),
            tf("tographie", 50.0, 386.0, 60.0, 12.0),
        ];
        let paragraphs = extractor.merge_into_paragraphs(&lines);
        assert_eq!(paragraphs.len(), 1);
        assert_eq!(
            paragraphs[0].text, "Kryptographie",
            "hyphen elided, no newline inserted"
        );
    }

    // ------------------------------------------------------------------
    // decode_pdf_string
    // ------------------------------------------------------------------

    /// A UTF-16BE byte-order mark switches decoding to UTF-16BE.
    #[test]
    fn decode_pdf_string_utf16be_bom_decodes_fi_ligature() {
        let bytes = [0xFE, 0xFF, 0x00, 0x66, 0x00, 0x69];
        assert_eq!(super::decode_pdf_string(&bytes), "fi");
    }

    /// Plain ASCII bytes pass through unchanged.
    #[test]
    fn decode_pdf_string_ascii_pdfdocencoding_passthrough() {
        let bytes = b"page 12";
        assert_eq!(super::decode_pdf_string(bytes), "page 12");
    }

    /// Empty input decodes to an empty string.
    #[test]
    fn decode_pdf_string_empty_input_returns_empty() {
        assert_eq!(super::decode_pdf_string(&[]), "");
    }

    /// A byte-order mark with no payload decodes to an empty string.
    #[test]
    fn decode_pdf_string_lone_bom_returns_empty() {
        assert_eq!(super::decode_pdf_string(&[0xFE, 0xFF]), "");
    }

    // ------------------------------------------------------------------
    // resolve_props
    // ------------------------------------------------------------------

    /// An inline integer `MCID` is returned; no `ActualText` is reported.
    #[test]
    fn resolve_props_extracts_integer_mcid() {
        use crate::parser::content::{MarkedContentProps, MarkedContentValue};
        use std::collections::HashMap;
        let mut map = HashMap::new();
        map.insert("MCID".to_string(), MarkedContentValue::Integer(7));
        let props = MarkedContentProps::Inline(map);
        let (mcid, actual) = super::resolve_props(&props, None);
        assert_eq!(mcid, Some(7));
        assert_eq!(actual, None);
    }

    /// An inline UTF-16BE `ActualText` string is decoded.
    #[test]
    fn resolve_props_decodes_utf16be_actualtext() {
        use crate::parser::content::{MarkedContentProps, MarkedContentValue};
        use std::collections::HashMap;
        let mut map = HashMap::new();
        map.insert(
            "ActualText".to_string(),
            MarkedContentValue::String(vec![0xFE, 0xFF, 0x00, 0x66, 0x00, 0x69]),
        );
        let props = MarkedContentProps::Inline(map);
        let (mcid, actual) = super::resolve_props(&props, None);
        assert_eq!(mcid, None);
        assert_eq!(actual.as_deref(), Some("fi"));
    }

    /// A resource reference without a properties dictionary resolves to nothing.
    #[test]
    fn resolve_props_returns_none_for_unresolvable_resource_ref() {
        use crate::parser::content::MarkedContentProps;
        let props = MarkedContentProps::ResourceRef("PropsName".to_string());
        let (mcid, actual) = super::resolve_props(&props, None);
        assert_eq!((mcid, actual), (None, None));
    }

    /// A negative inline `MCID` is rejected.
    #[test]
    fn resolve_props_negative_mcid_rejected() {
        use crate::parser::content::{MarkedContentProps, MarkedContentValue};
        use std::collections::HashMap;
        let mut map = HashMap::new();
        map.insert("MCID".to_string(), MarkedContentValue::Integer(-1));
        let props = MarkedContentProps::Inline(map);
        let (mcid, _) = super::resolve_props(&props, None);
        assert_eq!(mcid, None);
    }

    /// An `MCID` of `i64::MAX` reached via a resource reference is rejected.
    #[test]
    fn resolve_props_resource_ref_overflow_mcid_rejected() {
        use crate::parser::content::MarkedContentProps;
        use crate::parser::objects::{PdfDictionary, PdfObject};
        let mut inner = PdfDictionary::new();
        inner.insert("MCID".to_string(), PdfObject::Integer(i64::MAX));
        let mut properties = PdfDictionary::new();
        properties.insert("PropsName".to_string(), PdfObject::Dictionary(inner));
        let props = MarkedContentProps::ResourceRef("PropsName".to_string());
        let (mcid, _) = super::resolve_props(&props, Some(&properties));
        assert_eq!(mcid, None);
    }
}