use crate::bbox::BoundingBox;
use crate::error::{Error, Result};
use lopdf::{
content::{Content, Operation},
Dictionary, Document, Object, ObjectId, Stream,
};
use std::collections::HashMap;
/// Graphics and text parameters tracked while walking a content stream.
///
/// Every matrix uses the six-element `[a, b, c, d, e, f]` layout.
#[derive(Debug, Clone)]
struct GraphicsState {
    /// Current transformation matrix, updated by the `cm` operator.
    ctm: [f64; 6],
    /// Text matrix, set by `Tm` and moved by the text positioning operators.
    text_matrix: [f64; 6],
    /// Text line matrix, i.e. the text matrix at the start of the current line.
    text_line_matrix: [f64; 6],
    /// Translation part (`e`, `f`) of `text_matrix`.
    text_pos: (f64, f64),
    /// Font size set by `Tf`.
    font_size: f64,
    /// Resource name of the font selected by `Tf`, if any.
    font_name: Option<Vec<u8>>,
    /// Character spacing set by `Tc`.
    char_spacing: f64,
    /// Word spacing set by `Tw`.
    word_spacing: f64,
    /// Horizontal scaling as a factor (the `Tz` operand divided by 100).
    horiz_scaling: f64,
    /// Leading set by `TL` (or implicitly by `TD`).
    leading: f64,
    /// Text rise set by `Ts`.
    text_rise: f64,
}
impl Default for GraphicsState {
fn default() -> Self {
GraphicsState {
ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0], text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
text_pos: (0.0, 0.0),
font_size: 12.0,
font_name: None,
char_spacing: 0.0,
word_spacing: 0.0,
horiz_scaling: 1.0,
leading: 0.0,
text_rise: 0.0,
}
}
}
impl GraphicsState {
    /// Concatenates `matrix` onto the current transformation matrix (`cm`).
    ///
    /// `multiply_matrices(m1, m2)` yields the transform that applies `m1` first
    /// and `m2` second (row-vector convention, matching `transform_point`).
    /// A matrix supplied by `cm` acts on coordinates *before* the existing CTM,
    /// so it has to be the left operand: `CTM' = matrix × CTM`.
    fn apply_transform(&mut self, matrix: &[f64; 6]) {
        self.ctm = multiply_matrices(matrix, &self.ctm);
    }

    /// Maps a user-space point through the CTM.
    fn transform_point(&self, x: f64, y: f64) -> (f64, f64) {
        let [a, b, c, d, e, f] = self.ctm;
        (a * x + c * y + e, b * x + d * y + f)
    }

    /// Prepares the state for a new text object (`BT`).
    ///
    /// Only the text matrix and text line matrix are re-initialised. Character
    /// spacing, word spacing, horizontal scaling, leading and rise belong to
    /// the graphics state and stay in effect across text objects.
    fn reset_text_state(&mut self) {
        const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
        self.text_matrix = IDENTITY;
        self.text_line_matrix = IDENTITY;
        self.text_pos = (0.0, 0.0);
    }

    /// Replaces both the text matrix and the text line matrix (`Tm`).
    fn set_text_matrix(&mut self, matrix: [f64; 6]) {
        self.text_matrix = matrix;
        self.text_line_matrix = matrix;
        self.update_text_position();
    }

    /// Moves the text matrix by `(tx, ty)` expressed in text space.
    ///
    /// The translation is applied before the existing text matrix
    /// (`Tm' = T × Tm`) so that any scale or rotation already present in `Tm`
    /// also affects the offset.
    fn translate_text_matrix(&mut self, tx: f64, ty: f64) {
        let translation = [1.0, 0.0, 0.0, 1.0, tx, ty];
        self.text_matrix = multiply_matrices(&translation, &self.text_matrix);
        self.update_text_position();
    }

    /// Moves to a new line offset by `(tx, ty)` from the start of the current
    /// line (`Td`/`TD`): `Tlm' = T × Tlm`, then `Tm = Tlm'`.
    fn translate_text_line_matrix(&mut self, tx: f64, ty: f64) {
        let translation = [1.0, 0.0, 0.0, 1.0, tx, ty];
        self.text_line_matrix = multiply_matrices(&translation, &self.text_line_matrix);
        self.text_matrix = self.text_line_matrix;
        self.update_text_position();
    }

    /// Moves to the start of the next line using the current leading (`T*`).
    fn move_to_next_line(&mut self) {
        let ty = -self.leading;
        self.translate_text_line_matrix(0.0, ty);
    }

    /// Returns the matrix mapping text space to the space the CTM maps into:
    /// the text matrix is applied first, then the CTM.
    fn combined_text_matrix(&self) -> [f64; 6] {
        multiply_matrices(&self.text_matrix, &self.ctm)
    }

    /// Refreshes `text_pos` from the translation part of the text matrix.
    fn update_text_position(&mut self) {
        self.text_pos = (self.text_matrix[4], self.text_matrix[5]);
    }
}
/// Width and vertical-extent information extracted from a font dictionary.
#[derive(Clone, Debug)]
struct FontMetrics {
    /// Advance width per character code (or CID), in thousandths of the font size.
    widths: HashMap<u32, f64>,
    /// Width used for codes that have no entry in `widths`.
    default_width: f64,
    /// Ascent in thousandths of the font size.
    ascent: f64,
    /// Descent in thousandths of the font size.
    descent: f64,
    /// `true` for composite (Type0) fonts.
    is_cid: bool,
    /// Number of bytes forming one character code in a shown string.
    bytes_per_char: usize,
    /// Direction in which text set in this font advances.
    writing_mode: WritingMode,
}
impl FontMetrics {
    /// Looks up the width for `code`, falling back to `default_width` when the
    /// font provides no explicit entry.
    fn glyph_width(&self, code: u32) -> f64 {
        match self.widths.get(&code) {
            Some(&width) => width,
            None => self.default_width,
        }
    }
}
/// Direction in which consecutive glyphs of a font are laid out.
#[derive(Clone, Debug, PartialEq, Eq)]
enum WritingMode {
    /// Glyphs advance along the x axis of text space.
    Horizontal,
    /// Glyphs advance downwards along the y axis of text space.
    Vertical,
}
/// Memoises `FontMetrics` by font resource name so each font dictionary is
/// parsed at most once per successful lookup.
#[derive(Default)]
struct FontCache {
    /// Metrics already loaded, keyed by the font's resource name.
    cache: HashMap<Vec<u8>, FontMetrics>,
}
impl FontCache {
fn new() -> Self {
Self {
cache: HashMap::new(),
}
}
fn get(
&mut self,
doc: &Document,
resources: Option<&Dictionary>,
font_name: &[u8],
) -> Option<FontMetrics> {
if let Some(metrics) = self.cache.get(font_name) {
return Some(metrics.clone());
}
let metrics = load_font_metrics(doc, resources, font_name)?;
self.cache.insert(font_name.to_vec(), metrics.clone());
Some(metrics)
}
}
fn load_font_metrics(
doc: &Document,
resources: Option<&Dictionary>,
font_name: &[u8],
) -> Option<FontMetrics> {
let font_dict = get_font_dictionary(doc, resources, font_name)?;
let subtype = font_dict
.get(b"Subtype")
.ok()
.and_then(|obj| obj.as_name().ok())?;
match subtype {
b"Type0" => parse_type0_font(doc, &font_dict),
b"Type1" | b"TrueType" => parse_type1_font(doc, &font_dict),
b"Type3" => parse_type3_font(doc, &font_dict),
_ => None,
}
}
/// Fetches an owned copy of the font dictionary registered as `font_name`
/// under the `Font` entry of `resources`.
///
/// Both the `Font` entry and the individual font entry may be indirect
/// references. A stream object contributes its dictionary.
fn get_font_dictionary(
    doc: &Document,
    resources: Option<&Dictionary>,
    font_name: &[u8],
) -> Option<Dictionary> {
    let fonts_entry = resources?.get(b"Font").ok()?;
    let fonts_obj = resolve_to_owned(doc, fonts_entry)?;
    let fonts = fonts_obj.as_dict().ok()?;
    let entry = fonts.get(font_name).ok()?;
    match resolve_to_owned(doc, entry)? {
        Object::Dictionary(dict) => Some(dict),
        Object::Stream(stream) => Some(stream.dict),
        _ => None,
    }
}
/// Returns an owned copy of `obj`, following it through `doc` first when it is
/// an indirect reference. Yields `None` if the referenced object is missing.
fn resolve_to_owned(doc: &Document, obj: &Object) -> Option<Object> {
    if let Object::Reference(id) = obj {
        return doc.get_object(*id).ok().cloned();
    }
    Some(obj.clone())
}
/// Builds metrics for a simple (Type1 / TrueType) font from its `FirstChar`,
/// `Widths` and `FontDescriptor` entries.
///
/// Returns `None` when the font has no readable `Widths` array.
fn parse_type1_font(doc: &Document, font_dict: &Dictionary) -> Option<FontMetrics> {
    // A negative or oversized `FirstChar` is malformed; treat it like a missing
    // entry instead of letting an `as` cast wrap it into an arbitrary code.
    let first_char = font_dict
        .get(b"FirstChar")
        .ok()
        .and_then(|obj| obj.as_i64().ok())
        .filter(|value| *value >= 0 && *value <= i64::from(u32::MAX))
        .map(|value| value as u32)
        .unwrap_or(0);
    let widths_obj = font_dict.get(b"Widths").ok()?;
    let widths_array_obj = resolve_to_owned(doc, widths_obj)?;
    let widths_array = widths_array_obj.as_array().ok()?;
    let mut widths = HashMap::new();
    for (idx, value) in widths_array.iter().enumerate() {
        // Compute the code in 64 bits so a hostile array cannot overflow `u32`.
        let code = u64::from(first_char) + idx as u64;
        if code > u64::from(u32::MAX) {
            break;
        }
        if let Some(width) = object_to_f64(value) {
            widths.insert(code as u32, width);
        }
    }
    let descriptor_dict = font_dict
        .get(b"FontDescriptor")
        .ok()
        .and_then(|obj| resolve_to_owned(doc, obj))
        .and_then(|obj| match obj {
            Object::Dictionary(dict) => Some(dict),
            Object::Stream(stream) => Some(stream.dict),
            _ => None,
        });
    let (ascent, descent, missing_width) = descriptor_metrics(descriptor_dict.as_ref());
    Some(FontMetrics {
        widths,
        default_width: missing_width,
        ascent,
        descent,
        is_cid: false,
        bytes_per_char: 1,
        writing_mode: WritingMode::Horizontal,
    })
}
/// Builds metrics for a composite (Type0) font from its first descendant font.
///
/// Only the `Identity-H` and `Identity-V` encodings are handled; any other
/// named encoding yields `None`. A missing or non-name `Encoding` is treated as
/// horizontal. Character codes are assumed to be two bytes wide.
fn parse_type0_font(doc: &Document, font_dict: &Dictionary) -> Option<FontMetrics> {
    let encoding = font_dict.get(b"Encoding").and_then(|obj| obj.as_name());
    let writing_mode = match encoding {
        Err(_) => WritingMode::Horizontal,
        Ok(name) => match name {
            b"Identity-H" => WritingMode::Horizontal,
            b"Identity-V" => WritingMode::Vertical,
            _ => return None,
        },
    };

    let descendants = resolve_to_owned(doc, font_dict.get(b"DescendantFonts").ok()?)?;
    let first_descendant = descendants.as_array().ok()?.first()?;
    let descendant_dict = match resolve_to_owned(doc, first_descendant)? {
        Object::Dictionary(dict) => dict,
        Object::Stream(stream) => stream.dict,
        _ => return None,
    };

    let dw = descendant_dict
        .get(b"DW")
        .ok()
        .and_then(object_to_f64)
        .unwrap_or(1000.0);

    let mut widths = HashMap::new();
    let w_object = descendant_dict
        .get(b"W")
        .ok()
        .and_then(|obj| resolve_to_owned(doc, obj));
    if let Some(Ok(entries)) = w_object.as_ref().map(|obj| obj.as_array()) {
        parse_cid_widths(entries, &mut widths);
    }

    let descriptor_object = descendant_dict
        .get(b"FontDescriptor")
        .ok()
        .and_then(|obj| resolve_to_owned(doc, obj));
    let descriptor_dict = match descriptor_object {
        Some(Object::Dictionary(dict)) => Some(dict),
        Some(Object::Stream(stream)) => Some(stream.dict),
        _ => None,
    };
    let (ascent, descent, missing_width) = descriptor_metrics(descriptor_dict.as_ref());

    // A non-positive DW is unusable; fall back to the descriptor's value.
    let default_width = if dw > 0.0 { dw } else { missing_width };
    Some(FontMetrics {
        widths,
        default_width,
        ascent,
        descent,
        is_cid: true,
        bytes_per_char: 2,
        writing_mode,
    })
}
/// Builds approximate metrics for a Type3 font.
///
/// Every code 0..=255 is given the horizontal extent of `FontBBox` as its
/// width (500 when the box is missing or malformed).
fn parse_type3_font(doc: &Document, font_dict: &Dictionary) -> Option<FontMetrics> {
    let bbox_width = font_dict
        .get(b"FontBBox")
        .ok()
        .and_then(|obj| resolve_to_owned(doc, obj))
        .and_then(|obj| match obj.as_array() {
            Ok(vals) if vals.len() == 4 => {
                let left = object_to_f64(&vals[0])?;
                let right = object_to_f64(&vals[2])?;
                Some((right - left).abs())
            }
            _ => None,
        })
        .unwrap_or(500.0);

    let widths: HashMap<u32, f64> = (0..=255u32).map(|code| (code, bbox_width)).collect();

    let descriptor_object = font_dict
        .get(b"FontDescriptor")
        .ok()
        .and_then(|obj| resolve_to_owned(doc, obj));
    let descriptor_dict = match descriptor_object {
        Some(Object::Dictionary(dict)) => Some(dict),
        Some(Object::Stream(stream)) => Some(stream.dict),
        _ => None,
    };
    let (ascent, descent, missing_width) = descriptor_metrics(descriptor_dict.as_ref());

    Some(FontMetrics {
        widths,
        default_width: missing_width.max(bbox_width),
        ascent,
        descent,
        is_cid: false,
        bytes_per_char: 1,
        writing_mode: WritingMode::Horizontal,
    })
}
/// Reads `(ascent, descent, missing_width)` from a font descriptor.
///
/// Entries that are absent or non-numeric (or a missing descriptor altogether)
/// fall back to 800, -200 and 500 respectively.
fn descriptor_metrics(descriptor: Option<&Dictionary>) -> (f64, f64, f64) {
    /// Numeric value stored under `key`, or `fallback` when unavailable.
    fn number_or(descriptor: Option<&Dictionary>, key: &[u8], fallback: f64) -> f64 {
        descriptor
            .and_then(|dict| dict.get(key).ok())
            .and_then(object_to_f64)
            .unwrap_or(fallback)
    }

    (
        number_or(descriptor, b"Ascent", 800.0),
        number_or(descriptor, b"Descent", -200.0),
        number_or(descriptor, b"MissingWidth", 500.0),
    )
}
/// Expands a CIDFont `W` array into per-CID widths.
///
/// Two entry forms are understood:
/// * `start [w1 w2 ...]` — consecutive widths beginning at `start`;
/// * `start end w` — one width for every CID in `start..=end`.
///
/// Entries that do not fit either form are skipped one element at a time.
/// CIDs above `0xFFFF` are ignored: the fonts handled here use two-byte codes,
/// and without the cap a malformed range such as `0 4294967295 500` would try
/// to insert billions of entries.
fn parse_cid_widths(entries: &[Object], widths: &mut HashMap<u32, f64>) {
    const MAX_CID: u32 = 0xFFFF;

    let mut idx = 0;
    while idx < entries.len() {
        let start_code = match object_to_u32(&entries[idx]) {
            Some(val) => val,
            None => {
                idx += 1;
                continue;
            }
        };
        let next = match entries.get(idx + 1) {
            Some(obj) => obj,
            None => break,
        };
        match next {
            Object::Array(values) => {
                for (offset, value) in values.iter().enumerate() {
                    // 64-bit arithmetic: `start_code + offset` cannot overflow.
                    let code = u64::from(start_code) + offset as u64;
                    if code > u64::from(MAX_CID) {
                        break;
                    }
                    if let Some(width) = object_to_f64(value) {
                        widths.insert(code as u32, width);
                    }
                }
                idx += 2;
            }
            Object::Integer(_) | Object::Real(_) => {
                let width_obj = match entries.get(idx + 2) {
                    Some(obj) => obj,
                    None => break,
                };
                let end_code = match object_to_u32(next) {
                    Some(val) => val,
                    None => {
                        idx += 1;
                        continue;
                    }
                };
                if let Some(width) = object_to_f64(width_obj) {
                    for code in start_code..=end_code.min(MAX_CID) {
                        widths.insert(code, width);
                    }
                }
                idx += 3;
            }
            _ => {
                idx += 1;
            }
        }
    }
}
/// Converts a numeric PDF object (integer or real) to `f64`; any other object
/// kind yields `None`.
fn object_to_f64(obj: &Object) -> Option<f64> {
    let number = match obj {
        Object::Integer(int) => *int as f64,
        Object::Real(real) => *real as f64,
        _ => return None,
    };
    Some(number)
}
/// Converts a numeric PDF object to `u32`.
///
/// Returns `None` for non-numeric objects and for values outside
/// `0..=u32::MAX`, so oversized numbers are rejected rather than silently
/// truncated or saturated by an `as` cast. Reals lose their fractional part.
fn object_to_u32(obj: &Object) -> Option<u32> {
    match obj {
        Object::Integer(val) => {
            if *val >= 0 && *val <= i64::from(u32::MAX) {
                Some(*val as u32)
            } else {
                None
            }
        }
        Object::Real(val) => {
            let value = f64::from(*val);
            // NaN fails both comparisons and is rejected as well.
            if value >= 0.0 && value <= f64::from(u32::MAX) {
                Some(value as u32)
            } else {
                None
            }
        }
        _ => None,
    }
}
/// A run of content-stream operators that is kept or dropped as one unit when
/// filtering against a crop box.
///
/// A `bbox` of `None` means the extent could not be determined.
#[derive(Debug)]
enum ContentComponent {
    /// Path construction operators followed by the operator that ends the path.
    Path {
        operators: Vec<Operation>,
        bbox: Option<BoundingBox>,
    },
    /// A `Do` invocation of an image XObject.
    ImageXObject {
        operator: Operation,
        bbox: Option<BoundingBox>,
    },
    /// A `Do` invocation of a form XObject (also used when the XObject kind is
    /// unknown).
    FormXObject {
        operator: Operation,
        bbox: Option<BoundingBox>,
    },
    /// Everything from `BT` through `ET`, inclusive.
    TextBlock { operators: Vec<Operation> },
    /// A text-showing operator encountered outside `BT`/`ET`.
    OrphanText {
        operator: Operation,
        bbox: Option<BoundingBox>,
    },
    /// Operators that change state or are otherwise passed through untouched.
    GraphicsState { operators: Vec<Operation> },
}
/// Moves any buffered operators out of `graphics_ops` into a new
/// `ContentComponent::GraphicsState`, leaving the buffer empty. Does nothing
/// when the buffer is already empty.
fn flush_graphics_ops(components: &mut Vec<ContentComponent>, graphics_ops: &mut Vec<Operation>) {
    if graphics_ops.is_empty() {
        return;
    }
    let operators = std::mem::take(graphics_ops);
    components.push(ContentComponent::GraphicsState { operators });
}
/// Groups the operations of a content stream into `ContentComponent`s.
///
/// The stream is walked once while a `GraphicsState` is maintained so that
/// paths, XObject invocations and text shown outside `BT`/`ET` can be given a
/// bounding box. Everything between `BT` and `ET` becomes one `TextBlock`;
/// operators that only change state are batched into `GraphicsState`
/// components and flushed before the next drawable component so the original
/// operator order is preserved.
///
/// Clipping paths (`W`/`W*`) are always emitted with `bbox: None`, including
/// when the path is ended with `n`, because dropping them would change how
/// later content is clipped.
fn parse_into_components(
    doc: &Document,
    operations: &[Operation],
    resources: Option<&Dictionary>,
) -> Result<Vec<ContentComponent>> {
    #[cfg(target_arch = "wasm32")]
    {
        use wasm_bindgen::JsValue;
        web_sys::console::log_1(&JsValue::from_str(&format!(
            "[DEBUG] parse_into_components: Processing {} operations",
            operations.len()
        )));
        let ops_to_log = if operations.len() <= 20 {
            operations.len()
        } else {
            10
        };
        for (i, op) in operations.iter().take(ops_to_log).enumerate() {
            let operands_str = op
                .operands
                .iter()
                .map(|o| match o {
                    Object::Name(n) => format!("Name({})", String::from_utf8_lossy(n)),
                    Object::Real(r) => format!("Real({})", r),
                    Object::Integer(i) => format!("Int({})", i),
                    Object::String(s, _) => format!("String({})", String::from_utf8_lossy(s)),
                    _ => format!("{:?}", o),
                })
                .collect::<Vec<_>>()
                .join(", ");
            web_sys::console::log_1(&JsValue::from_str(&format!(
                "[DEBUG] Op {}: {} [{}]",
                i, op.operator, operands_str
            )));
        }
    }

    let mut components = Vec::new();
    let mut state = GraphicsState::default();
    let mut state_stack: Vec<GraphicsState> = Vec::new();
    // Operators of the path currently under construction.
    let mut path_buffer: Vec<Operation> = Vec::new();
    // Transformed points of the current path; a superset hull of its geometry.
    let mut path_points: Vec<(f64, f64)> = Vec::new();
    let mut path_start = (0.0, 0.0);
    // Set once `W`/`W*` has been seen for the current path.
    let mut path_has_clip = false;
    let mut in_text_block = false;
    let mut text_block_ops: Vec<Operation> = Vec::new();
    let mut graphics_state_ops: Vec<Operation> = Vec::new();
    let mut font_cache = FontCache::new();

    for (_op_idx, op) in operations.iter().enumerate() {
        let operator = op.operator.as_str();
        #[cfg(debug_assertions)]
        if operations.len() <= 5 {
            eprintln!(
                "[DEBUG] Operation {}: '{}' ({} bytes) with {} operands",
                _op_idx,
                operator,
                op.operator.len(),
                op.operands.len()
            );
            if operator == "Do" || operator.contains("o") {
                eprintln!("[DEBUG] Operator bytes: {:?}", op.operator.as_bytes());
                if let Some(first_operand) = op.operands.first() {
                    eprintln!("[DEBUG] First operand: {:?}", first_operand);
                }
            }
        }
        match operator {
            "BT" => {
                #[cfg(target_arch = "wasm32")]
                {
                    use wasm_bindgen::JsValue;
                    web_sys::console::log_1(&JsValue::from_str("[DEBUG] Found BT (Begin Text)"));
                }
                flush_graphics_ops(&mut components, &mut graphics_state_ops);
                in_text_block = true;
                text_block_ops.clear();
                text_block_ops.push(op.clone());
                state.reset_text_state();
            }
            "ET" => {
                text_block_ops.push(op.clone());
                #[cfg(target_arch = "wasm32")]
                {
                    use wasm_bindgen::JsValue;
                    web_sys::console::log_1(&JsValue::from_str(&format!(
                        "[DEBUG] Found ET (End Text) - creating TextBlock with {} operators",
                        text_block_ops.len()
                    )));
                }
                components.push(ContentComponent::TextBlock {
                    operators: std::mem::take(&mut text_block_ops),
                });
                in_text_block = false;
            }
            // Inside a text object every operator is kept verbatim; the state
            // is still tracked so later orphan text sees up-to-date values.
            _ if in_text_block => {
                text_block_ops.push(op.clone());
                match operator {
                    "Tf" => {
                        if let Some(Object::Name(font_name)) = op.operands.first() {
                            state.font_name = Some(font_name.clone());
                        }
                        if let Some(size) = extract_number(&op.operands, 1) {
                            state.font_size = size;
                        }
                    }
                    "Tm" => {
                        if let Some(matrix) = extract_matrix(&op.operands) {
                            state.set_text_matrix(matrix);
                        }
                    }
                    "Td" | "TD" => {
                        if let (Some(tx), Some(ty)) = (
                            extract_number(&op.operands, 0),
                            extract_number(&op.operands, 1),
                        ) {
                            state.translate_text_line_matrix(tx, ty);
                            if operator == "TD" {
                                state.leading = -ty;
                            }
                        }
                    }
                    "T*" => {
                        state.move_to_next_line();
                    }
                    "Ts" => {
                        if let Some(rise) = extract_number(&op.operands, 0) {
                            state.text_rise = rise;
                        }
                    }
                    "Tw" => {
                        if let Some(space) = extract_number(&op.operands, 0) {
                            state.word_spacing = space;
                        }
                    }
                    "Tc" => {
                        if let Some(space) = extract_number(&op.operands, 0) {
                            state.char_spacing = space;
                        }
                    }
                    "Tz" => {
                        if let Some(scale) = extract_number(&op.operands, 0) {
                            state.horiz_scaling = scale / 100.0;
                        }
                    }
                    "TL" => {
                        if let Some(leading) = extract_number(&op.operands, 0) {
                            state.leading = leading;
                        }
                    }
                    _ => {}
                }
            }
            "m" | "l" | "c" | "v" | "y" | "re" | "h" => {
                flush_graphics_ops(&mut components, &mut graphics_state_ops);
                path_buffer.push(op.clone());
                match operator {
                    "m" => {
                        if let (Some(x), Some(y)) = (
                            extract_number(&op.operands, 0),
                            extract_number(&op.operands, 1),
                        ) {
                            // A new subpath must not discard the points of
                            // earlier subpaths of the same path.
                            let pos = state.transform_point(x, y);
                            path_points.push(pos);
                            path_start = pos;
                        }
                    }
                    "l" => {
                        if let (Some(x), Some(y)) = (
                            extract_number(&op.operands, 0),
                            extract_number(&op.operands, 1),
                        ) {
                            path_points.push(state.transform_point(x, y));
                        }
                    }
                    "c" | "v" | "y" => {
                        let count = op.operands.len();
                        if count >= 2 {
                            let end = count - 2;
                            // Control points: the curve lies inside their hull,
                            // so including them keeps the box conservative.
                            let mut i = 0;
                            while i + 1 < end {
                                if let (Some(x), Some(y)) = (
                                    extract_number(&op.operands, i),
                                    extract_number(&op.operands, i + 1),
                                ) {
                                    path_points.push(state.transform_point(x, y));
                                }
                                i += 2;
                            }
                            // End point of the segment.
                            if let (Some(x), Some(y)) = (
                                extract_number(&op.operands, end),
                                extract_number(&op.operands, end + 1),
                            ) {
                                path_points.push(state.transform_point(x, y));
                            }
                        }
                    }
                    "re" => {
                        if let (Some(x), Some(y), Some(w), Some(h)) = (
                            extract_number(&op.operands, 0),
                            extract_number(&op.operands, 1),
                            extract_number(&op.operands, 2),
                            extract_number(&op.operands, 3),
                        ) {
                            // `re` adds a closed subpath; earlier points stay.
                            let origin = state.transform_point(x, y);
                            path_points.push(origin);
                            path_points.push(state.transform_point(x + w, y));
                            path_points.push(state.transform_point(x + w, y + h));
                            path_points.push(state.transform_point(x, y + h));
                            path_start = origin;
                        }
                    }
                    "h" => {
                        if !path_points.is_empty() {
                            path_points.push(path_start);
                        }
                    }
                    _ => {}
                }
            }
            "S" | "s" | "f" | "F" | "f*" | "B" | "B*" | "b" | "b*" => {
                // Emit pending state changes first so they still precede the path.
                flush_graphics_ops(&mut components, &mut graphics_state_ops);
                path_buffer.push(op.clone());
                // A path that also sets the clip must never be filtered out.
                let bbox = if path_has_clip || path_points.is_empty() {
                    None
                } else {
                    calculate_path_bbox(&path_points)
                };
                components.push(ContentComponent::Path {
                    operators: std::mem::take(&mut path_buffer),
                    bbox,
                });
                path_points.clear();
                path_has_clip = false;
            }
            "W" | "W*" => {
                path_buffer.push(op.clone());
                path_has_clip = true;
            }
            "n" => {
                flush_graphics_ops(&mut components, &mut graphics_state_ops);
                if path_has_clip {
                    // `... W n` establishes a clipping path; keep it intact.
                    path_buffer.push(op.clone());
                    components.push(ContentComponent::Path {
                        operators: std::mem::take(&mut path_buffer),
                        bbox: None,
                    });
                } else {
                    // A path ended without painting or clipping has no effect.
                    path_buffer.clear();
                }
                path_points.clear();
                path_has_clip = false;
            }
            "Do" => {
                flush_graphics_ops(&mut components, &mut graphics_state_ops);
                if let Some(Object::Name(xobj_name)) = op.operands.first() {
                    if let Some(resources_dict) = resources {
                        match get_xobject_type(doc, resources_dict, xobj_name) {
                            XObjectType::Image => {
                                let bbox = calculate_image_bbox(&state.ctm);
                                components.push(ContentComponent::ImageXObject {
                                    operator: op.clone(),
                                    bbox,
                                });
                            }
                            XObjectType::Form => {
                                #[cfg(debug_assertions)]
                                eprintln!(
                                    "[DEBUG] Processing Form XObject: {}",
                                    String::from_utf8_lossy(xobj_name)
                                );
                                let bbox = if let Ok(xobj_ref) =
                                    get_xobject_object_id(doc, resources_dict, xobj_name)
                                {
                                    #[cfg(debug_assertions)]
                                    eprintln!("[DEBUG] Got XObject reference: {:?}", xobj_ref);
                                    let result =
                                        calculate_form_xobject_bbox(doc, xobj_ref, &state.ctm);
                                    #[cfg(debug_assertions)]
                                    eprintln!(
                                        "[DEBUG] Form XObject bbox calculation result: {:?}",
                                        result
                                    );
                                    result
                                } else {
                                    #[cfg(debug_assertions)]
                                    eprintln!("[DEBUG] Failed to get XObject reference");
                                    None
                                };
                                #[cfg(target_arch = "wasm32")]
                                if let Some(ref b) = bbox {
                                    use wasm_bindgen::JsValue;
                                    web_sys::console::log_1(&JsValue::from_str(&format!(
                                        "[DEBUG] Form XObject bbox: ({:.2}, {:.2}, {:.2}, {:.2})",
                                        b.left, b.bottom, b.right, b.top
                                    )));
                                }
                                components.push(ContentComponent::FormXObject {
                                    operator: op.clone(),
                                    bbox,
                                });
                            }
                            XObjectType::Unknown => {
                                components.push(ContentComponent::FormXObject {
                                    operator: op.clone(),
                                    bbox: None,
                                });
                            }
                        }
                    } else {
                        components.push(ContentComponent::FormXObject {
                            operator: op.clone(),
                            bbox: None,
                        });
                    }
                } else {
                    graphics_state_ops.push(op.clone());
                }
            }
            "q" => {
                state_stack.push(state.clone());
                graphics_state_ops.push(op.clone());
            }
            "Q" => {
                if let Some(saved_state) = state_stack.pop() {
                    state = saved_state;
                }
                graphics_state_ops.push(op.clone());
            }
            "cm" => {
                if let Some(matrix) = extract_matrix(&op.operands) {
                    state.apply_transform(&matrix);
                }
                graphics_state_ops.push(op.clone());
            }
            "CS" | "cs" | "SC" | "SCN" | "sc" | "scn" | "G" | "g" | "RG" | "rg" | "K" | "k"
            | "w" | "J" | "j" | "M" | "d" | "ri" | "i" | "gs" => {
                graphics_state_ops.push(op.clone());
            }
            "BMC" | "BDC" | "EMC" | "MP" | "DP" => {
                graphics_state_ops.push(op.clone());
            }
            // Text-showing operators outside BT/ET ("orphan" text).
            "Tj" | "TJ" | "'" | "\"" => {
                flush_graphics_ops(&mut components, &mut graphics_state_ops);
                if let Some(component) =
                    handle_orphan_text_operation(doc, resources, op, &mut state, &mut font_cache)
                {
                    components.push(component);
                } else {
                    // Could not be measured: pass the operator through untouched.
                    components.push(ContentComponent::GraphicsState {
                        operators: vec![op.clone()],
                    });
                }
            }
            "Tf" => {
                if let Some(Object::Name(font_name)) = op.operands.first() {
                    state.font_name = Some(font_name.clone());
                }
                if let Some(size) = extract_number(&op.operands, 1) {
                    state.font_size = size;
                    #[cfg(not(target_arch = "wasm32"))]
                    eprintln!("[DEBUG] Tf outside BT/ET: font size = {:.1}", size);
                }
                graphics_state_ops.push(op.clone());
            }
            "Ts" | "Tz" | "TL" | "Tw" | "Tc" | "Tr" => {
                match operator {
                    "Ts" => {
                        if let Some(rise) = extract_number(&op.operands, 0) {
                            state.text_rise = rise;
                        }
                    }
                    "Tz" => {
                        if let Some(scale) = extract_number(&op.operands, 0) {
                            state.horiz_scaling = scale / 100.0;
                        }
                    }
                    "TL" => {
                        if let Some(leading) = extract_number(&op.operands, 0) {
                            state.leading = leading;
                        }
                    }
                    "Tw" => {
                        if let Some(space) = extract_number(&op.operands, 0) {
                            state.word_spacing = space;
                        }
                    }
                    "Tc" => {
                        if let Some(space) = extract_number(&op.operands, 0) {
                            state.char_spacing = space;
                        }
                    }
                    _ => {}
                }
                graphics_state_ops.push(op.clone());
            }
            "Tm" => {
                if let Some(matrix) = extract_matrix(&op.operands) {
                    state.set_text_matrix(matrix);
                    #[cfg(not(target_arch = "wasm32"))]
                    eprintln!(
                        "[DEBUG] Tm outside BT/ET: pos = ({:.1}, {:.1})",
                        state.text_pos.0, state.text_pos.1
                    );
                }
                graphics_state_ops.push(op.clone());
            }
            "Td" | "TD" => {
                if let (Some(tx), Some(ty)) = (
                    extract_number(&op.operands, 0),
                    extract_number(&op.operands, 1),
                ) {
                    state.translate_text_line_matrix(tx, ty);
                    if operator == "TD" {
                        state.leading = -ty;
                    }
                    #[cfg(not(target_arch = "wasm32"))]
                    eprintln!(
                        "[DEBUG] {} outside BT/ET: pos = ({:.1}, {:.1})",
                        operator, state.text_pos.0, state.text_pos.1
                    );
                }
                graphics_state_ops.push(op.clone());
            }
            "T*" => {
                state.move_to_next_line();
                graphics_state_ops.push(op.clone());
            }
            _ => {
                graphics_state_ops.push(op.clone());
            }
        }
    }

    flush_graphics_ops(&mut components, &mut graphics_state_ops);

    // A `BT` without a matching `ET`: keep what was collected.
    if in_text_block && !text_block_ops.is_empty() {
        #[cfg(target_arch = "wasm32")]
        {
            use wasm_bindgen::JsValue;
            web_sys::console::log_1(&JsValue::from_str(&format!(
                "[WARNING] Unmatched BT - creating TextBlock with {} operators",
                text_block_ops.len()
            )));
        }
        components.push(ContentComponent::TextBlock {
            operators: text_block_ops,
        });
    }
    Ok(components)
}
/// Computes the axis-aligned bounding box of `points`.
///
/// Returns `None` for an empty slice or when `BoundingBox::new` rejects the
/// resulting extents.
fn calculate_path_bbox(points: &[(f64, f64)]) -> Option<BoundingBox> {
    if points.is_empty() {
        return None;
    }
    let mut min_x = f64::INFINITY;
    let mut min_y = f64::INFINITY;
    let mut max_x = f64::NEG_INFINITY;
    let mut max_y = f64::NEG_INFINITY;
    for &(x, y) in points {
        min_x = min_x.min(x);
        max_x = max_x.max(x);
        min_y = min_y.min(y);
        max_y = max_y.max(y);
    }
    BoundingBox::new(min_x, min_y, max_x, max_y).ok()
}
fn handle_orphan_text_operation(
doc: &Document,
resources: Option<&Dictionary>,
op: &Operation,
state: &mut GraphicsState,
font_cache: &mut FontCache,
) -> Option<ContentComponent> {
let operator = op.operator.as_str();
let font_name = match state.font_name.clone() {
Some(name) => name,
None => {
#[cfg(not(target_arch = "wasm32"))]
eprintln!(
"[WARNING] Orphaned '{}' encountered without active font - keeping in stream",
operator
);
return None;
}
};
let metrics = match font_cache.get(doc, resources, &font_name) {
Some(metrics) => metrics,
None => {
#[cfg(not(target_arch = "wasm32"))]
eprintln!(
"[WARNING] Could not load metrics for font '{}' - keeping orphaned text",
String::from_utf8_lossy(&font_name)
);
return None;
}
};
let advance = match operator {
"Tj" => measure_text_from_string(op.operands.first()?, &metrics, state)?,
"TJ" => {
let array = op.operands.first()?.as_array().ok()?;
measure_text_from_array(array, &metrics, state)?
}
"'" => {
state.move_to_next_line();
measure_text_from_string(op.operands.first()?, &metrics, state)?
}
"\"" => {
if let Some(space) = extract_number(&op.operands, 0) {
state.word_spacing = space;
}
if let Some(space) = extract_number(&op.operands, 1) {
state.char_spacing = space;
}
state.move_to_next_line();
measure_text_from_string(op.operands.get(2)?, &metrics, state)?
}
_ => return None,
};
let bbox = calculate_text_bbox_from_state(state, advance, &metrics);
if metrics.writing_mode == WritingMode::Vertical {
state.translate_text_matrix(0.0, -advance);
} else {
state.translate_text_matrix(advance, 0.0);
}
Some(ContentComponent::OrphanText {
operator: op.clone(),
bbox,
})
}
/// Returns the advance of a string operand, or `None` when `operand` is not a
/// string object.
fn measure_text_from_string(
    operand: &Object,
    metrics: &FontMetrics,
    state: &GraphicsState,
) -> Option<f64> {
    extract_string_bytes(operand).map(|bytes| measure_text_displacement(&bytes, metrics, state))
}
/// Returns the total advance of a `TJ` array.
///
/// Strings add their displacement; numbers subtract an adjustment given in
/// thousandths of the font size, scaled by the horizontal scaling. Other
/// element kinds are ignored.
fn measure_text_from_array(
    array: &[Object],
    metrics: &FontMetrics,
    state: &GraphicsState,
) -> Option<f64> {
    let adjustment = |amount: f64| (amount / 1000.0) * state.font_size * state.horiz_scaling;
    array.iter().try_fold(0.0, |width, item| match item {
        Object::String(bytes, _) => Some(width + measure_text_displacement(bytes, metrics, state)),
        Object::Integer(val) => Some(width - adjustment(*val as f64)),
        Object::Real(val) => Some(width - adjustment(*val as f64)),
        _ => Some(width),
    })
}
/// Sums the advances of all character codes in `bytes`.
///
/// Each glyph contributes its width (thousandths of the font size) plus the
/// character spacing, plus the word spacing for code 32 in non-CID fonts; the
/// per-glyph total is multiplied by the horizontal scaling.
fn measure_text_displacement(bytes: &[u8], metrics: &FontMetrics, state: &GraphicsState) -> f64 {
    let scale = state.horiz_scaling;
    decode_text_codes(bytes, metrics)
        .into_iter()
        .fold(0.0, |total, code| {
            let mut advance = (metrics.glyph_width(code) / 1000.0) * state.font_size;
            advance += state.char_spacing;
            if !metrics.is_cid && code == 32 {
                advance += state.word_spacing;
            }
            total + advance * scale
        })
}
/// Splits a shown string into character codes.
///
/// Non-CID fonts use one code per byte. CID fonts combine every
/// `bytes_per_char` bytes big-endian into one code; a trailing incomplete
/// group is dropped.
fn decode_text_codes(bytes: &[u8], metrics: &FontMetrics) -> Vec<u32> {
    if !metrics.is_cid {
        return bytes.iter().map(|&byte| u32::from(byte)).collect();
    }
    bytes
        .chunks_exact(metrics.bytes_per_char)
        .map(|group| {
            group
                .iter()
                .fold(0u32, |code, &byte| (code << 8) | u32::from(byte))
        })
        .collect()
}
/// Returns a copy of the raw bytes of a string object, or `None` for any other
/// object kind.
fn extract_string_bytes(obj: &Object) -> Option<Vec<u8>> {
    if let Object::String(bytes, _) = obj {
        Some(bytes.clone())
    } else {
        None
    }
}
/// Estimates the bounding box of a run of text that starts at the current text
/// position and advances by `advance`.
///
/// Horizontal text spans descent..ascent (at least 0.1 high) over the advance.
/// Vertical text is centred on the origin, as wide as the ascent/descent
/// extent (at least half the font size), and extends downwards by the advance.
/// Returns `None` for a (near-)zero advance.
fn calculate_text_bbox_from_state(
    state: &GraphicsState,
    advance: f64,
    metrics: &FontMetrics,
) -> Option<BoundingBox> {
    if advance.abs() < f64::EPSILON {
        return None;
    }
    let ascent = (metrics.ascent / 1000.0) * state.font_size + state.text_rise;
    let descent = (metrics.descent / 1000.0) * state.font_size + state.text_rise;
    let combined = state.combined_text_matrix();
    let place = |x: f64, y: f64| transform_point_with_matrix(&combined, x, y);

    let corners = match metrics.writing_mode {
        WritingMode::Vertical => {
            let glyph_width = (ascent - descent).abs().max(state.font_size * 0.5);
            let half_w = glyph_width / 2.0;
            [
                place(-half_w, 0.0),
                place(half_w, 0.0),
                place(half_w, -advance),
                place(-half_w, -advance),
            ]
        }
        WritingMode::Horizontal => {
            let top = ascent.max(descent + 0.1);
            [
                place(0.0, descent),
                place(advance, descent),
                place(advance, top),
                place(0.0, top),
            ]
        }
    };
    calculate_path_bbox(&corners)
}
/// Applies the affine matrix `[a, b, c, d, e, f]` to the point `(x, y)`:
/// `x' = a*x + c*y + e`, `y' = b*x + d*y + f`.
fn transform_point_with_matrix(matrix: &[f64; 6], x: f64, y: f64) -> (f64, f64) {
    let new_x = matrix[0] * x + matrix[2] * y + matrix[4];
    let new_y = matrix[1] * x + matrix[3] * y + matrix[5];
    (new_x, new_y)
}
fn calculate_image_bbox(ctm: &[f64; 6]) -> Option<BoundingBox> {
let corners = [(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0)];
let [a, b, c, d, e, f] = ctm;
let transformed: Vec<(f64, f64)> = corners
.iter()
.map(|(x, y)| (a * x + c * y + e, b * x + d * y + f))
.collect();
calculate_path_bbox(&transformed)
}
fn calculate_form_xobject_bbox(
doc: &Document,
xobj_ref: ObjectId,
page_ctm: &[f64; 6],
) -> Option<BoundingBox> {
let xobj = doc.get_object(xobj_ref).ok()?;
let stream = xobj.as_stream().ok()?;
let dict = &stream.dict;
let bbox_array = dict.get(b"BBox").ok()?.as_array().ok()?;
if bbox_array.len() != 4 {
return None;
}
let x1 = bbox_array[0].as_f32().unwrap_or(0.0) as f64;
let y1 = bbox_array[1].as_f32().unwrap_or(0.0) as f64;
let x2 = bbox_array[2].as_f32().unwrap_or(0.0) as f64;
let y2 = bbox_array[3].as_f32().unwrap_or(0.0) as f64;
let matrix = if let Ok(matrix_obj) = dict.get(b"Matrix") {
if let Ok(matrix_array) = matrix_obj.as_array() {
if matrix_array.len() == 6 {
let mut m = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
for (i, val) in matrix_array.iter().enumerate() {
m[i] = val
.as_f32()
.unwrap_or(if i == 0 || i == 3 { 1.0 } else { 0.0 })
as f64;
}
m
} else {
[1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
}
} else {
[1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
}
} else {
[1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
};
let combined_ctm = multiply_matrices(page_ctm, &matrix);
let corners = [(x1, y1), (x2, y1), (x2, y2), (x1, y2)];
let [a, b, c, d, e, f] = combined_ctm;
let transformed: Vec<(f64, f64)> = corners
.iter()
.map(|(x, y)| (a * x + c * y + e, b * x + d * y + f))
.collect();
calculate_path_bbox(&transformed)
}
/// Returns the product `m1 × m2` of two affine matrices in `[a, b, c, d, e, f]`
/// form. With row-vector points (`p' = p × M`) the result applies `m1` first
/// and `m2` second.
fn multiply_matrices(m1: &[f64; 6], m2: &[f64; 6]) -> [f64; 6] {
    // Multiplies the row `(x, y)` by the linear 2x2 part of `m2`.
    let through_m2 = |x: f64, y: f64| (x * m2[0] + y * m2[2], x * m2[1] + y * m2[3]);
    let (a, b) = through_m2(m1[0], m1[1]);
    let (c, d) = through_m2(m1[2], m1[3]);
    let (e, f) = through_m2(m1[4], m1[5]);
    [a, b, c, d, e + m2[4], f + m2[5]]
}
/// Kind of external object referenced by a `Do` operator.
#[derive(Debug, Clone, Copy)]
enum XObjectType {
    /// Stream whose `Subtype` is `Image`.
    Image,
    /// Stream whose `Subtype` is `Form`.
    Form,
    /// Anything that could not be identified as an image or a form.
    Unknown,
}
fn get_xobject_type(doc: &Document, resources: &Dictionary, xobj_name: &[u8]) -> XObjectType {
let xobject_ref = resources.get(b"XObject");
if xobject_ref.is_err() {
return XObjectType::Unknown;
}
let xobject_dict = xobject_ref.unwrap().as_dict();
if xobject_dict.is_err() {
return XObjectType::Unknown;
}
let xobj_ref = xobject_dict
.unwrap()
.get(xobj_name)
.ok()
.and_then(|obj| obj.as_reference().ok());
if xobj_ref.is_none() {
return XObjectType::Unknown;
}
let xobj_stream = doc.get_object(xobj_ref.unwrap());
if xobj_stream.is_err() {
return XObjectType::Unknown;
}
let xobj_stream = xobj_stream.unwrap().as_stream();
if xobj_stream.is_err() {
return XObjectType::Unknown;
}
let subtype = xobj_stream
.unwrap()
.dict
.get(b"Subtype")
.ok()
.and_then(|obj| obj.as_name().ok());
match subtype {
Some(b"Image") => XObjectType::Image,
Some(b"Form") => XObjectType::Form,
_ => XObjectType::Unknown,
}
}
/// Flattens `components` back into an operator list, dropping components whose
/// bounding box does not overlap `crop_box` (see `has_overlap`).
///
/// `GraphicsState` and `TextBlock` components are always kept, as is any
/// component without a bounding box.
fn filter_components(components: Vec<ContentComponent>, crop_box: &BoundingBox) -> Vec<Operation> {
    const SAFETY_MARGIN: f64 = 15.0;
    #[cfg(target_arch = "wasm32")]
    {
        use wasm_bindgen::JsValue;
        web_sys::console::log_1(&JsValue::from_str(&format!(
            "[DEBUG] Filtering {} components with crop box: ({:.2}, {:.2}, {:.2}, {:.2})",
            components.len(),
            crop_box.left,
            crop_box.bottom,
            crop_box.right,
            crop_box.top
        )));
    }
    #[cfg(debug_assertions)]
    eprintln!(
        "[DEBUG] filter_components: {} components, crop: ({:.1}, {:.1}, {:.1}, {:.1})",
        components.len(),
        crop_box.left,
        crop_box.bottom,
        crop_box.right,
        crop_box.top
    );

    let mut stats = ComponentStats::default();
    let mut output = Vec::new();

    for component in components {
        match component {
            ContentComponent::GraphicsState { operators } => {
                stats.graphics_state += 1;
                output.extend(operators);
            }
            ContentComponent::TextBlock { operators } => {
                stats.text_blocks += 1;
                #[cfg(target_arch = "wasm32")]
                {
                    use wasm_bindgen::JsValue;
                    web_sys::console::log_1(&JsValue::from_str(&format!(
                        "[DEBUG] Keeping TextBlock with {} operators",
                        operators.len()
                    )));
                }
                output.extend(operators);
            }
            ContentComponent::FormXObject { operator, bbox } => {
                stats.form_xobjects += 1;
                // Some(true) = overlaps, Some(false) = outside, None = no box.
                let verdict = bbox
                    .as_ref()
                    .map(|b| has_overlap(b, crop_box, SAFETY_MARGIN));
                match verdict {
                    Some(true) => {
                        #[cfg(target_arch = "wasm32")]
                        if let Some(form_bbox) = &bbox {
                            use wasm_bindgen::JsValue;
                            web_sys::console::log_1(&JsValue::from_str(&format!(
                                "[DEBUG] Keeping Form XObject with bbox: ({:.2}, {:.2}, {:.2}, {:.2})",
                                form_bbox.left, form_bbox.bottom, form_bbox.right, form_bbox.top
                            )));
                        }
                        output.push(operator);
                    }
                    Some(false) => {
                        #[cfg(target_arch = "wasm32")]
                        if let Some(form_bbox) = &bbox {
                            use wasm_bindgen::JsValue;
                            web_sys::console::log_1(&JsValue::from_str(&format!(
                                "[DEBUG] Removing Form XObject outside bbox: ({:.2}, {:.2}, {:.2}, {:.2})",
                                form_bbox.left, form_bbox.bottom, form_bbox.right, form_bbox.top
                            )));
                        }
                    }
                    None => {
                        #[cfg(target_arch = "wasm32")]
                        {
                            use wasm_bindgen::JsValue;
                            web_sys::console::log_1(&JsValue::from_str(
                                "[DEBUG] Keeping Form XObject (no bbox calculated)",
                            ));
                        }
                        output.push(operator);
                    }
                }
            }
            ContentComponent::Path { operators, bbox } => {
                stats.paths_total += 1;
                let verdict = bbox
                    .as_ref()
                    .map(|b| has_overlap(b, crop_box, SAFETY_MARGIN));
                if verdict == Some(false) {
                    #[cfg(target_arch = "wasm32")]
                    if let Some(path_bbox) = &bbox {
                        use wasm_bindgen::JsValue;
                        web_sys::console::log_1(&JsValue::from_str(&format!(
                            "[DEBUG] Removing path outside bbox: ({:.2}, {:.2}, {:.2}, {:.2})",
                            path_bbox.left, path_bbox.bottom, path_bbox.right, path_bbox.top
                        )));
                    }
                } else {
                    stats.paths_kept += 1;
                    output.extend(operators);
                }
            }
            ContentComponent::ImageXObject { operator, bbox } => {
                stats.images_total += 1;
                let verdict = bbox
                    .as_ref()
                    .map(|b| has_overlap(b, crop_box, SAFETY_MARGIN));
                if verdict == Some(false) {
                    #[cfg(target_arch = "wasm32")]
                    if let Some(image_bbox) = &bbox {
                        use wasm_bindgen::JsValue;
                        web_sys::console::log_1(&JsValue::from_str(&format!(
                            "[DEBUG] Removing image outside bbox: ({:.2}, {:.2}, {:.2}, {:.2})",
                            image_bbox.left,
                            image_bbox.bottom,
                            image_bbox.right,
                            image_bbox.top
                        )));
                    }
                } else {
                    stats.images_kept += 1;
                    output.push(operator);
                }
            }
            ContentComponent::OrphanText { operator, bbox } => {
                stats.orphan_text_total += 1;
                let verdict = bbox
                    .as_ref()
                    .map(|b| has_overlap(b, crop_box, SAFETY_MARGIN));
                if verdict == Some(false) {
                    #[cfg(target_arch = "wasm32")]
                    if let Some(text_bbox) = &bbox {
                        use wasm_bindgen::JsValue;
                        web_sys::console::log_1(&JsValue::from_str(&format!(
                            "[DEBUG] Removing orphan text outside bbox: ({:.2}, {:.2}, {:.2}, {:.2})",
                            text_bbox.left,
                            text_bbox.bottom,
                            text_bbox.right,
                            text_bbox.top
                        )));
                    }
                } else {
                    stats.orphan_text_kept += 1;
                    output.push(operator);
                }
            }
        }
    }

    #[cfg(target_arch = "wasm32")]
    {
        use wasm_bindgen::JsValue;
        web_sys::console::log_1(&JsValue::from_str(&format!(
            "[DEBUG] Component stats: {} text blocks, {} graphics state, {} form XObjects, {}/{} paths kept, {}/{} images kept, {}/{} orphan text kept",
            stats.text_blocks, stats.graphics_state, stats.form_xobjects,
            stats.paths_kept, stats.paths_total,
            stats.images_kept, stats.images_total,
            stats.orphan_text_kept, stats.orphan_text_total
        )));
        web_sys::console::log_1(&JsValue::from_str(&format!(
            "[DEBUG] Final output: {} operators",
            output.len()
        )));
    }
    output
}
/// Counters gathered while deciding which content components to keep; used for the
/// debug summary that is logged once filtering finishes.
///
/// Every counter starts at zero via the derived `Default`.
#[derive(Default)]
struct ComponentStats {
// Reported as "text blocks" in the summary log.
// NOTE(review): the increment site is outside this excerpt — presumably one per text block.
text_blocks: usize,
// Reported as "graphics state" in the summary log (increment site not visible here).
graphics_state: usize,
// Reported as "form XObjects" in the summary log (increment site not visible here).
form_xobjects: usize,
// Number of path components examined.
paths_total: usize,
// Number of path components kept (overlapping the crop box, or lacking a bbox).
paths_kept: usize,
// Number of image XObject components examined.
images_total: usize,
// Number of image XObject components kept (overlapping the crop box, or lacking a bbox).
images_kept: usize,
// Number of orphan text operators examined.
orphan_text_total: usize,
// Number of orphan text operators kept (overlapping the crop box, or lacking a bbox).
orphan_text_kept: usize,
}
/// Reports whether `component_bbox` touches `crop_box` after the crop box has been grown
/// by a safety band on every side.
///
/// The band is `margin` raised to a floor of 200 units, so a caller passing a smaller
/// margin still gets the 200-unit band. Boxes that merely touch the grown crop box count
/// as overlapping.
fn has_overlap(component_bbox: &BoundingBox, crop_box: &BoundingBox, margin: f64) -> bool {
    // NOTE(review): the hard-coded floor makes any `margin` below 200 ineffective —
    // confirm this is intended rather than a leftover from debugging.
    let pad = margin.max(200.0);

    // Expressed as "misses" (strict comparisons) and negated at the end so that
    // comparisons involving NaN keep reporting an overlap.
    let misses_horizontally = component_bbox.right < crop_box.left - pad
        || component_bbox.left > crop_box.right + pad;
    let misses_vertically = component_bbox.top < crop_box.bottom - pad
        || component_bbox.bottom > crop_box.top + pad;

    !(misses_horizontally || misses_vertically)
}
pub fn filter_content_stream(
doc: &Document,
stream: &Stream,
resources: Option<&Dictionary>,
crop_box: &BoundingBox,
) -> Result<(Vec<u8>, Vec<(ObjectId, Option<Dictionary>)>)> {
let decoded_bytes = stream
.decompressed_content()
.map_err(|e| Error::PdfParse(format!("Failed to decompress content stream: {}", e)))?;
#[cfg(debug_assertions)]
{
let preview = if decoded_bytes.len() > 50 {
&decoded_bytes[..50]
} else {
&decoded_bytes
};
eprintln!("[DEBUG] Raw content bytes (first 50): {:?}", preview);
let ascii_preview = String::from_utf8_lossy(preview);
eprintln!("[DEBUG] As ASCII: {}", ascii_preview);
}
let content = match Content::decode(&decoded_bytes) {
Ok(c) => {
if c.operations.len() == 1 {
let op = &c.operations[0];
if op.operator == "x" || op.operator == "H" {
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[WARNING] Invalid operator '{}' detected - keeping original content",
op.operator
)));
}
return Ok((decoded_bytes, vec![]));
}
}
c
}
Err(e) => {
#[cfg(debug_assertions)]
eprintln!("[DEBUG] Content::decode failed: {:?}", e);
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[WARNING] Content::decode failed - keeping original content: {:?}",
e
)));
}
return Ok((decoded_bytes, vec![]));
}
};
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Content stream has {} operations",
content.operations.len()
)));
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Decoded {} bytes from content stream",
decoded_bytes.len()
)));
if content.operations.len() == 1 {
let op = &content.operations[0];
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Single operation: '{}' with {} operands",
op.operator,
op.operands.len()
)));
if decoded_bytes.len() < 100 {
let preview = String::from_utf8_lossy(&decoded_bytes);
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Raw content: {}",
preview
)));
}
}
}
#[cfg(not(target_arch = "wasm32"))]
{
eprintln!(
"[DEBUG] Content stream has {} operations",
content.operations.len()
);
if content.operations.len() == 1 {
let op = &content.operations[0];
eprintln!(
"[DEBUG] Single op: '{}', operands: {:?}",
op.operator, op.operands
);
if let Some(Object::Name(name)) = op.operands.first() {
eprintln!(
"[DEBUG] First operand is Name: {}",
String::from_utf8_lossy(name)
);
}
} else if !content.operations.is_empty() {
let first_op = &content.operations[0];
if matches!(first_op.operator.as_str(), "Tj" | "TJ" | "'" | "\"") {
eprintln!(
"[WARNING] Stream starts with text operator '{}' without BT!",
first_op.operator
);
eprintln!(
"[WARNING] This is invalid PDF - text operators should be inside BT/ET blocks"
);
}
let bt_count = content
.operations
.iter()
.filter(|op| op.operator == "BT")
.count();
let et_count = content
.operations
.iter()
.filter(|op| op.operator == "ET")
.count();
let text_ops_count = content
.operations
.iter()
.filter(|op| matches!(op.operator.as_str(), "Tj" | "TJ" | "'" | "\""))
.count();
eprintln!(
"[DEBUG] BT: {}, ET: {}, Text ops: {}",
bt_count, et_count, text_ops_count
);
if content.operations.len() <= 10 {
for (i, op) in content.operations.iter().take(5).enumerate() {
eprintln!("[DEBUG] Op[{}]: '{}'", i, op.operator);
}
}
}
}
let components = parse_into_components(doc, &content.operations, resources)?;
#[cfg(not(target_arch = "wasm32"))]
{
eprintln!("[DEBUG] Parsed into {} components", components.len());
for (i, comp) in components.iter().enumerate() {
match comp {
ContentComponent::Path { bbox, .. } => {
if let Some(b) = bbox {
eprintln!(
"[DEBUG] Component {} (Path): bbox=({:.1},{:.1},{:.1},{:.1})",
i, b.left, b.bottom, b.right, b.top
);
}
}
ContentComponent::FormXObject { bbox, .. } => {
if let Some(b) = bbox {
eprintln!(
"[DEBUG] Component {} (FormXObject): bbox=({:.1},{:.1},{:.1},{:.1})",
i, b.left, b.bottom, b.right, b.top
);
} else {
eprintln!("[DEBUG] Component {} (FormXObject): no bbox", i);
}
}
ContentComponent::TextBlock { operators } => {
eprintln!(
"[DEBUG] Component {} (TextBlock): {} ops",
i,
operators.len()
);
}
ContentComponent::GraphicsState { operators } => {
eprintln!(
"[DEBUG] Component {} (GraphicsState): {} ops",
i,
operators.len()
);
}
_ => {}
}
}
}
let filtered_ops = filter_components(components, crop_box);
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
let original_count = content.operations.len();
let filtered_count = filtered_ops.len();
let removed_count = original_count.saturating_sub(filtered_count);
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Filtered to {} operations ({} removed)",
filtered_count, removed_count
)));
if removed_count == 0 {
web_sys::console::log_1(&JsValue::from_str(
"[DEBUG] No operations removed - keeping original content stream",
));
return Ok((decoded_bytes, vec![]));
}
}
#[cfg(not(target_arch = "wasm32"))]
{
let original_count = content.operations.len();
let filtered_count = filtered_ops.len();
let removed_count = original_count.saturating_sub(filtered_count);
eprintln!(
"[DEBUG] Filtered to {} operations ({} removed)",
filtered_count, removed_count
);
if removed_count == 0 {
eprintln!("[DEBUG] No operations removed - keeping original content stream");
return Ok((decoded_bytes, vec![]));
}
}
let filtered_content = Content {
operations: filtered_ops,
};
let encoded = filtered_content
.encode()
.map_err(|e| Error::PdfParse(format!("Failed to encode content stream: {}", e)))?;
Ok((encoded, vec![]))
}
fn get_xobject_object_id(
_doc: &Document,
resources: &Dictionary,
xobj_name: &[u8],
) -> Result<ObjectId> {
let xobject_ref = resources.get(b"XObject")?;
let xobject_dict = xobject_ref
.as_dict()
.map_err(|_| Error::PdfParse("XObject is not a dictionary".to_string()))?;
let xobj_ref = xobject_dict
.get(xobj_name)
.ok()
.and_then(|obj| obj.as_reference().ok())
.ok_or_else(|| {
Error::PdfParse(format!(
"XObject {} not found in Resources",
String::from_utf8_lossy(xobj_name)
))
})?;
Ok(xobj_ref)
}
#[allow(dead_code)]
fn get_form_xobject_ref(
doc: &Document,
resources: &Dictionary,
xobj_name: &[u8],
) -> Result<(ObjectId, Option<Dictionary>)> {
let xobject_ref = resources.get(b"XObject")?;
let xobject_dict = xobject_ref
.as_dict()
.map_err(|_| Error::PdfParse("XObject is not a dictionary".to_string()))?;
let xobj_ref = xobject_dict
.get(xobj_name)
.ok()
.and_then(|obj| obj.as_reference().ok())
.ok_or_else(|| {
Error::PdfParse(format!(
"XObject {} not found in Resources",
String::from_utf8_lossy(xobj_name)
))
})?;
let xobj_stream = doc
.get_object(xobj_ref)
.map_err(|e| Error::PdfParse(format!("Failed to get XObject: {}", e)))?
.as_stream()
.map_err(|e| Error::PdfParse(format!("XObject is not a stream: {}", e)))?;
let is_form = xobj_stream
.dict
.get(b"Subtype")
.ok()
.and_then(|obj| obj.as_name().ok())
.map(|name| name == b"Form")
.unwrap_or(false);
if !is_form {
return Err(Error::PdfParse("Not a Form XObject".to_string()));
}
let form_resources = xobj_stream
.dict
.get(b"Resources")
.ok()
.and_then(|obj| obj.as_dict().ok()).cloned();
Ok((xobj_ref, form_resources))
}
pub fn filter_form_xobject(
doc: &mut Document,
xobj_id: ObjectId,
xobj_resources: Option<Dictionary>,
crop_box: &BoundingBox,
) -> Result<Vec<(ObjectId, Option<Dictionary>)>> {
let xobj_stream = doc
.get_object(xobj_id)
.map_err(|e| Error::PdfParse(format!("Failed to get XObject: {}", e)))?
.as_stream()
.map_err(|e| Error::PdfParse(format!("XObject is not a stream: {}", e)))?;
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Filtering Form XObject: {:?}",
xobj_id
)));
}
let (filtered_content, nested_form_xobjects) =
filter_content_stream(doc, xobj_stream, xobj_resources.as_ref(), crop_box)?;
let xobj_stream_mut = doc
.get_object_mut(xobj_id)
.map_err(|e| Error::PdfParse(format!("Failed to get XObject mut: {}", e)))?
.as_stream_mut()
.map_err(|e| Error::PdfParse(format!("XObject is not a stream (mut): {}", e)))?;
xobj_stream_mut.set_plain_content(filtered_content);
Ok(nested_form_xobjects)
}
/// Single-pass filter over a flat operator list (currently unused).
///
/// Walks `operations` while tracking the transformation matrix, buffers each path's
/// construction operators until the operator that ends the path arrives, and emits the
/// path only when `path_intersects_box` accepts its transformed vertices for `crop_box`.
/// Every non-path operator is kept. Form XObjects invoked through `Do` are collected
/// (object id plus their own resources) and returned next to the filtered operators.
///
/// Path handling details:
///
/// * All subpaths of a path contribute vertices: `m` and `re` start a new *subpath* and
///   must not discard what was collected for earlier ones.
/// * A path carrying `W` / `W*` changes the clipping region for everything drawn after
///   it, so it is always kept — including the common `re W n` form — regardless of where
///   it lies relative to `crop_box`.
/// * A path ended by `n` without a clip operator paints nothing and is dropped.
#[allow(dead_code)]
fn filter_operations(
    doc: &Document,
    operations: &[Operation],
    resources: Option<&Dictionary>,
    crop_box: &BoundingBox,
) -> Result<(Vec<Operation>, Vec<(ObjectId, Option<Dictionary>)>)> {
    let mut filtered = vec![];
    let mut form_xobjects: Vec<(ObjectId, Option<Dictionary>)> = vec![];
    let mut state = GraphicsState::default();
    let mut state_stack: Vec<GraphicsState> = vec![];
    // Transformed vertices of every subpath of the path under construction.
    let mut current_path: Vec<(f64, f64)> = vec![];
    // First point of the current subpath; `h` closes back to it.
    let mut path_start = (0.0, 0.0);
    // Construction operators held back until the path-ending operator decides their fate.
    let mut path_ops_buffer: Vec<Operation> = vec![];
    // Set once the buffered path contains a `W` / `W*` clipping operator.
    let mut path_sets_clip = false;

    for op in operations {
        let operator = op.operator.as_str();
        // `true`: emit the operator right away. `false`: the arm already dealt with it
        // (buffered, emitted together with its path, or dropped).
        let should_keep = match operator {
            "q" => {
                state_stack.push(state.clone());
                true
            }
            "Q" => {
                // An unbalanced `Q` leaves the current state untouched.
                if let Some(saved_state) = state_stack.pop() {
                    state = saved_state;
                }
                true
            }
            "cm" => {
                if let Some(matrix) = extract_matrix(&op.operands) {
                    state.apply_transform(&matrix);
                }
                true
            }
            "Tf" => {
                if let Some(size) = extract_number(&op.operands, 1) {
                    state.font_size = size;
                }
                true
            }
            "Tc" | "Tw" | "Tz" | "TL" | "Tr" | "Ts" => true,
            "Td" | "TD" => {
                if let (Some(tx), Some(ty)) = (
                    extract_number(&op.operands, 0),
                    extract_number(&op.operands, 1),
                ) {
                    state.text_pos = (state.text_pos.0 + tx, state.text_pos.1 + ty);
                }
                true
            }
            "Tm" => {
                if let Some(matrix) = extract_matrix(&op.operands) {
                    state.text_matrix = matrix;
                    state.text_pos = (matrix[4], matrix[5]);
                }
                true
            }
            "T*" => true,
            "BT" => {
                state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
                state.text_pos = (0.0, 0.0);
                true
            }
            "ET" => true,
            "Tj" | "TJ" | "'" | "\"" => true,
            "m" => {
                if let (Some(x), Some(y)) = (
                    extract_number(&op.operands, 0),
                    extract_number(&op.operands, 1),
                ) {
                    let pos = state.transform_point(x, y);
                    // Starts a new subpath; vertices of earlier subpaths stay.
                    current_path.push(pos);
                    path_start = pos;
                }
                path_ops_buffer.push(op.clone());
                false
            }
            "l" => {
                if let (Some(x), Some(y)) = (
                    extract_number(&op.operands, 0),
                    extract_number(&op.operands, 1),
                ) {
                    current_path.push(state.transform_point(x, y));
                }
                path_ops_buffer.push(op.clone());
                false
            }
            "c" | "v" | "y" => {
                // Only the curve's end point (the last two operands) is recorded;
                // control points are ignored.
                if op.operands.len() >= 2 {
                    if let (Some(x), Some(y)) = (
                        extract_number(&op.operands, op.operands.len() - 2),
                        extract_number(&op.operands, op.operands.len() - 1),
                    ) {
                        current_path.push(state.transform_point(x, y));
                    }
                }
                path_ops_buffer.push(op.clone());
                false
            }
            "re" => {
                if let (Some(x), Some(y), Some(w), Some(h)) = (
                    extract_number(&op.operands, 0),
                    extract_number(&op.operands, 1),
                    extract_number(&op.operands, 2),
                    extract_number(&op.operands, 3),
                ) {
                    // A rectangle is one complete subpath appended to the path.
                    let corner = state.transform_point(x, y);
                    current_path.push(corner);
                    current_path.push(state.transform_point(x + w, y));
                    current_path.push(state.transform_point(x + w, y + h));
                    current_path.push(state.transform_point(x, y + h));
                    path_start = corner;
                }
                path_ops_buffer.push(op.clone());
                false
            }
            "h" => {
                if !current_path.is_empty() {
                    current_path.push(path_start);
                }
                path_ops_buffer.push(op.clone());
                false
            }
            "S" | "s" | "f" | "F" | "f*" | "B" | "B*" | "b" | "b*" => {
                // A path that also sets the clip must survive even when its painted
                // area is outside the crop box.
                let keep = path_sets_clip || path_intersects_box(&current_path, crop_box);
                if keep {
                    filtered.append(&mut path_ops_buffer);
                    filtered.push(op.clone());
                } else {
                    path_ops_buffer.clear();
                }
                current_path.clear();
                path_sets_clip = false;
                false
            }
            "W" | "W*" => {
                path_sets_clip = true;
                path_ops_buffer.push(op.clone());
                false
            }
            "n" => {
                // `n` paints nothing; the path only matters if it changed the clip.
                if path_sets_clip {
                    filtered.append(&mut path_ops_buffer);
                    filtered.push(op.clone());
                } else {
                    path_ops_buffer.clear();
                }
                current_path.clear();
                path_sets_clip = false;
                false
            }
            "CS" | "cs" | "SC" | "SCN" | "sc" | "scn" | "G" | "g" | "RG" | "rg" | "K" | "k" => true,
            "Do" => {
                // Remember Form XObjects for the caller; lookups that fail (including
                // non-form XObjects) are ignored and the operator is kept either way.
                if let Some(Object::Name(xobj_name)) = op.operands.first() {
                    if let Some(resources_dict) = resources {
                        if let Ok((xobj_id, xobj_resources)) =
                            get_form_xobject_ref(doc, resources_dict, xobj_name)
                        {
                            form_xobjects.push((xobj_id, xobj_resources));
                        }
                    }
                }
                true
            }
            "w" | "J" | "j" | "M" | "d" | "ri" | "i" | "gs" => true,
            "BMC" | "BDC" | "EMC" | "MP" | "DP" => true,
            _ => true,
        };
        if should_keep {
            filtered.push(op.clone());
        } else {
            // NOTE: this fires for every path operator, including ones that end up in
            // the output through the path buffer.
            #[cfg(target_arch = "wasm32")]
            {
                use wasm_bindgen::JsValue;
                web_sys::console::log_1(&JsValue::from_str(&format!(
                    "[DEBUG] Filtered out: {}",
                    operator
                )));
            }
            #[cfg(not(target_arch = "wasm32"))]
            {
                eprintln!(
                    "[DEBUG] Filtered out: {} (operands: {})",
                    operator,
                    op.operands.len()
                );
            }
        }
    }
    Ok((filtered, form_xobjects))
}
/// Builds the operator sequence that restricts subsequent drawing to `bbox`.
///
/// The sequence is `q`, `re` (the box as x, y, width, height), `W`, `n`. The graphics
/// state saved by `q` is not restored here; no matching `Q` is emitted.
#[allow(dead_code)]
fn create_clipping_path_operations(bbox: &BoundingBox) -> Vec<lopdf::content::Operation> {
    use lopdf::content::Operation;

    // Rectangle operands, narrowed to the precision `Object::Real` stores.
    let rect_operands = vec![
        Object::Real(bbox.left as f32),
        Object::Real(bbox.bottom as f32),
        Object::Real(bbox.width() as f32),
        Object::Real(bbox.height() as f32),
    ];

    vec![
        ("q", vec![]),
        ("re", rect_operands),
        ("W", vec![]),
        ("n", vec![]),
    ]
    .into_iter()
    .map(|(operator, operands)| Operation::new(operator, operands))
    .collect()
}
/// Reads the first six operands as a matrix `[a, b, c, d, e, f]`.
///
/// Returns `None` when fewer than six operands are present or when any of the first six
/// is not numeric. Operands beyond the sixth are ignored.
fn extract_matrix(operands: &[Object]) -> Option<[f64; 6]> {
    if operands.len() < 6 {
        return None;
    }
    let mut matrix = [0.0_f64; 6];
    for (index, slot) in matrix.iter_mut().enumerate() {
        *slot = extract_number(operands, index)?;
    }
    Some(matrix)
}
/// Returns the operand at `index` as an `f64`.
///
/// Integer and real operands are converted; any other operand kind, or an index past the
/// end of `operands`, yields `None`.
fn extract_number(operands: &[Object], index: usize) -> Option<f64> {
    match operands.get(index)? {
        Object::Integer(value) => Some(*value as f64),
        Object::Real(value) => Some(*value as f64),
        _ => None,
    }
}
/// Tests whether `point` lies inside `bbox` after the box has been grown by `margin` on
/// every side. Points exactly on the grown boundary count as inside.
#[allow(dead_code)]
fn is_point_near_box(point: (f64, f64), bbox: &BoundingBox, margin: f64) -> bool {
    let horizontal = (bbox.left - margin)..=(bbox.right + margin);
    let vertical = (bbox.bottom - margin)..=(bbox.top + margin);
    horizontal.contains(&point.0) && vertical.contains(&point.1)
}
/// Decides whether a path, given as its transformed vertices, may touch `bbox`.
///
/// The answer is `true` when
/// * the path has no vertices (nothing is known, so it is kept),
/// * any vertex lies within 10 units of `bbox`, or
/// * the axis-aligned extent of all vertices overlaps `bbox` (this covers paths that
///   surround the box without having a vertex near it).
#[allow(dead_code)]
fn path_intersects_box(path: &[(f64, f64)], bbox: &BoundingBox) -> bool {
    if path.is_empty() {
        return true;
    }
    if path
        .iter()
        .any(|&vertex| is_point_near_box(vertex, bbox, 10.0))
    {
        return true;
    }

    // Extent of the whole path, gathered in a single pass.
    let (mut min_x, mut max_x) = (f64::INFINITY, f64::NEG_INFINITY);
    let (mut min_y, mut max_y) = (f64::INFINITY, f64::NEG_INFINITY);
    for &(x, y) in path {
        min_x = min_x.min(x);
        max_x = max_x.max(x);
        min_y = min_y.min(y);
        max_y = max_y.max(y);
    }

    let disjoint =
        max_x < bbox.left || min_x > bbox.right || max_y < bbox.bottom || min_y > bbox.top;
    !disjoint
}
/// Unit tests for the geometry and operand helpers of this module.
#[cfg(test)]
mod tests {
    use super::*;

    /// Box shared by the geometry tests.
    // NOTE(review): the arguments are assumed to be (left, bottom, right, top); the
    // point-proximity assertions below only hold under that order.
    fn sample_box() -> BoundingBox {
        BoundingBox::new(100.0, 100.0, 500.0, 700.0).unwrap()
    }

    #[test]
    fn test_create_clipping_path() {
        let ops = create_clipping_path_operations(&sample_box());
        let operators: Vec<&str> = ops.iter().map(|op| op.operator.as_str()).collect();
        assert_eq!(operators, ["q", "re", "W", "n"]);
    }

    #[test]
    fn test_is_point_near_box() {
        let bbox = sample_box();
        assert!(is_point_near_box((300.0, 400.0), &bbox, 0.0));
        assert!(!is_point_near_box((50.0, 50.0), &bbox, 0.0));
        assert!(is_point_near_box((95.0, 100.0), &bbox, 10.0));
    }

    #[test]
    fn test_extract_number() {
        // 2.5 is exactly representable as a float, so the result can be compared
        // exactly (and, unlike 3.14, it does not trip clippy's `approx_constant`).
        let operands = vec![Object::Integer(42), Object::Real(2.5)];
        assert_eq!(extract_number(&operands, 0), Some(42.0));
        assert_eq!(extract_number(&operands, 1), Some(2.5));
        assert_eq!(extract_number(&operands, 2), None);
    }

    #[test]
    fn test_extract_matrix() {
        let operands = vec![
            Object::Integer(1),
            Object::Integer(0),
            Object::Integer(0),
            Object::Integer(1),
            Object::Real(10.5),
            Object::Integer(20),
        ];
        assert_eq!(
            extract_matrix(&operands),
            Some([1.0, 0.0, 0.0, 1.0, 10.5, 20.0])
        );
        // Too few operands.
        assert_eq!(extract_matrix(&operands[..5]), None);
        // A non-numeric operand among the first six.
        let mut with_name = operands[..5].to_vec();
        with_name.push(Object::Name(b"F1".to_vec()));
        assert_eq!(extract_matrix(&with_name), None);
    }

    #[test]
    fn test_path_intersects_box() {
        let bbox = sample_box();
        // An empty path is kept.
        assert!(path_intersects_box(&[], &bbox));
        // A vertex within the 10-unit tolerance.
        assert!(path_intersects_box(&[(505.0, 400.0)], &bbox));
        // Surrounds the box without any vertex near it.
        let around = [(0.0, 0.0), (600.0, 0.0), (600.0, 800.0), (0.0, 800.0)];
        assert!(path_intersects_box(&around, &bbox));
        // Entirely elsewhere.
        assert!(!path_intersects_box(
            &[(1000.0, 1000.0), (1100.0, 1100.0)],
            &bbox
        ));
    }

    #[test]
    fn test_has_overlap() {
        let crop = sample_box();
        let inside = BoundingBox::new(200.0, 200.0, 300.0, 300.0).unwrap();
        let far_away = BoundingBox::new(1000.0, 1000.0, 1100.0, 1100.0).unwrap();
        assert!(has_overlap(&inside, &crop, 0.0));
        assert!(!has_overlap(&far_away, &crop, 0.0));
    }
}