use crate::bbox::{detect_bbox, BoundingBox};
use crate::error::{Error, Result};
use crate::pdf_ops::{apply_cropbox, get_page_count, get_page_dimensions};
use crate::CropOptions;
use lopdf::Document;
pub fn crop_pdf(pdf_data: &[u8], options: CropOptions) -> Result<Vec<u8>> {
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str("[DEBUG] crop_pdf called"));
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] page_range: {:?}",
options.page_range
)));
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] shrink_to_content: {}",
options.shrink_to_content
)));
}
let mut doc = Document::load_mem(pdf_data)?;
let page_count = get_page_count(&doc);
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Total pages in document: {}",
page_count
)));
}
let pages_to_process: Vec<usize> = if let Some(ref range) = options.page_range {
range.to_page_list(page_count)
} else {
(0..page_count).collect()
};
if options.verbose {
if pages_to_process.len() == page_count {
eprintln!("Processing all {} pages", page_count);
} else {
eprintln!(
"Processing {} of {} pages",
pages_to_process.len(),
page_count
);
}
}
#[cfg(feature = "parallel")]
let bbox_results: Vec<_> = {
use rayon::prelude::*;
pages_to_process
.par_iter() .map(|&page_num| {
(
page_num,
bbox_detection_task(pdf_data, &doc, page_num, &options),
)
})
.collect::<Vec<_>>()
};
#[cfg(not(feature = "parallel"))]
let bbox_results: Vec<_> = pages_to_process
.iter()
.map(|&page_num| {
(
page_num,
bbox_detection_task(pdf_data, &doc, page_num, &options),
)
})
.collect::<Vec<_>>();
for (page_num, bbox_result) in bbox_results.iter() {
let (final_bbox, is_manual) = bbox_result.as_ref().map_err(|e| {
Error::PdfParse(format!(
"Failed to detect bbox for page {}: {}",
page_num + 1,
e
))
})?;
let should_clip = options.clip_content && *is_manual;
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
if options.clip_content {
if *is_manual {
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Page {}: Applying content clipping (manual bbox)",
page_num + 1
)));
} else {
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Page {}: Skipping clipping (auto-detected bbox - already tight)",
page_num + 1
)));
}
}
}
if options.verbose && options.clip_content && !is_manual {
eprintln!(" Skipping clipping (auto-detected bbox - fast track)");
}
apply_cropbox(&mut doc, *page_num, final_bbox, should_clip)?;
}
if let Some(ref _range) = options.page_range {
let all_pages: Vec<usize> = (0..page_count).collect();
let pages_to_remove: Vec<u32> = all_pages
.iter()
.filter(|&&p| !pages_to_process.contains(&p))
.map(|&p| (p + 1) as u32) .collect();
if !pages_to_remove.is_empty() {
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Removing {} pages not in range: {:?}",
pages_to_remove.len(),
pages_to_remove
)));
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Pages to keep (0-indexed): {:?}",
pages_to_process
)));
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Page count before deletion: {}",
page_count
)));
}
#[cfg(not(target_arch = "wasm32"))]
{
eprintln!(
"[DEBUG] Removing {} pages not in range: {:?}",
pages_to_remove.len(),
pages_to_remove
);
eprintln!("[DEBUG] Pages to keep (0-indexed): {:?}", pages_to_process);
eprintln!("[DEBUG] Page count before deletion: {}", page_count);
}
doc.delete_pages(&pages_to_remove);
let new_count = get_page_count(&doc);
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Page count after deletion: {}",
new_count
)));
web_sys::console::log_1(&JsValue::from_str(
"[DEBUG] Cleaning up unused objects...",
));
}
#[cfg(not(target_arch = "wasm32"))]
{
eprintln!("[DEBUG] Page count after deletion: {}", new_count);
eprintln!("[DEBUG] Cleaning up unused objects...");
}
doc.delete_zero_length_streams(); doc.prune_objects(); doc.renumber_objects(); doc.compress();
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str("[DEBUG] Cleanup complete"));
}
#[cfg(not(target_arch = "wasm32"))]
{
eprintln!("[DEBUG] Cleanup complete");
}
}
}
let mut output = Vec::new();
doc.save_to(&mut output)?;
Ok(output)
}
fn bbox_detection_task(
pdf_data: &[u8],
doc: &Document,
page_num: usize,
options: &CropOptions,
) -> Result<(BoundingBox, bool)> {
let page_count = get_page_count(doc);
if options.verbose {
eprintln!("Processing page {}/{}", page_num + 1, page_count);
}
let (bbox, is_manual) = determine_bbox_with_source_parallel(
pdf_data, doc, page_num, options,
)?;
if options.verbose {
eprintln!(
" Detected bbox: ({:.2}, {:.2}, {:.2}, {:.2})",
bbox.left, bbox.bottom, bbox.right, bbox.top
);
eprintln!(" Size: {:.2} x {:.2} pts", bbox.width(), bbox.height());
}
let bbox_with_margins = bbox.with_margins(&options.margins);
if options.verbose {
eprintln!(
" With margins: ({:.2}, {:.2}, {:.2}, {:.2})",
bbox_with_margins.left,
bbox_with_margins.bottom,
bbox_with_margins.right,
bbox_with_margins.top
);
}
let (page_width, page_height) = get_page_dimensions(doc, page_num)?;
let final_bbox = bbox_with_margins.clamp_to_page(page_width, page_height);
if options.verbose {
eprintln!(
" Final bbox: ({:.2}, {:.2}, {:.2}, {:.2})",
final_bbox.left, final_bbox.bottom, final_bbox.right, final_bbox.top
);
}
Ok((final_bbox, is_manual))
}
fn determine_bbox_with_source_parallel(
pdf_data: &[u8],
doc: &Document, page_num: usize,
options: &CropOptions,
) -> Result<(BoundingBox, bool)> {
if let Some(ref page_bboxes) = options.page_bboxes {
if let Some(&bbox) = page_bboxes.get(&page_num) {
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Per-page bbox for page {}: ({:.2}, {:.2}, {:.2}, {:.2}), shrink_to_content: {}",
page_num + 1, bbox.left, bbox.bottom, bbox.right, bbox.top, options.shrink_to_content
)));
}
if options.verbose {
eprintln!(" Using per-page bbox for page {}", page_num + 1);
}
if options.shrink_to_content {
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(
"[DEBUG] Shrinking per-page bbox to actual content...",
));
}
match detect_bbox_within_region(pdf_data, page_num, &bbox, options.verbose) {
Ok(detected_bbox) => {
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Shrunk to: ({:.2}, {:.2}, {:.2}, {:.2})",
detected_bbox.left,
detected_bbox.bottom,
detected_bbox.right,
detected_bbox.top
)));
}
return Ok((detected_bbox, true));
}
Err(_e) => {
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Could not shrink: {}, using manual bbox",
_e
)));
}
return Ok((bbox, true));
}
}
}
return Ok((bbox, true));
}
}
let page_number = page_num + 1;
let manual_bbox = if page_number % 2 == 1 {
options.bbox_odd
} else {
options.bbox_even
}
.or(options.bbox_override);
if let Some(bbox) = manual_bbox {
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Manual bbox for page {}: ({:.2}, {:.2}, {:.2}, {:.2}), shrink_to_content: {}",
page_num + 1, bbox.left, bbox.bottom, bbox.right, bbox.top, options.shrink_to_content
)));
}
if options.shrink_to_content {
if options.verbose {
eprintln!(
" Manual bbox: ({:.2}, {:.2}, {:.2}, {:.2})",
bbox.left, bbox.bottom, bbox.right, bbox.top
);
eprintln!(" Detecting actual content within manual bbox...");
}
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(
"[DEBUG] Shrinking manual bbox to actual content...",
));
}
match detect_bbox_within_region(pdf_data, page_num, &bbox, options.verbose) {
Ok(detected_bbox) => {
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Shrunk to: ({:.2}, {:.2}, {:.2}, {:.2})",
detected_bbox.left,
detected_bbox.bottom,
detected_bbox.right,
detected_bbox.top
)));
}
if options.verbose {
eprintln!(
" Shrunk to actual content: ({:.2}, {:.2}, {:.2}, {:.2})",
detected_bbox.left,
detected_bbox.bottom,
detected_bbox.right,
detected_bbox.top
);
}
return Ok((detected_bbox, true));
}
Err(e) => {
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] Could not shrink bbox: {}, using manual bbox",
e
)));
}
if options.verbose {
eprintln!(" Warning: Could not detect content within bbox ({}), using manual bbox", e);
}
return Ok((bbox, true));
}
}
} else {
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(
"[DEBUG] Using manual bbox without shrinking",
));
}
return Ok((bbox, true));
}
}
#[cfg(target_arch = "wasm32")]
{
use wasm_bindgen::JsValue;
web_sys::console::log_1(&JsValue::from_str(&format!(
"[DEBUG] No manual bbox for page {}, auto-detecting content",
page_num + 1
)));
}
let bbox = detect_bbox_with_method_parallel(
pdf_data,
doc,
page_num,
options.bbox_method,
options.verbose,
)?;
Ok((bbox, false))
}
#[allow(dead_code)]
fn determine_bbox_with_source(
pdf_data: &[u8],
doc: &mut Document,
page_num: usize,
options: &CropOptions,
) -> Result<(BoundingBox, bool)> {
if let Some(ref page_bboxes) = options.page_bboxes {
if let Some(&bbox) = page_bboxes.get(&page_num) {
if options.verbose {
eprintln!(" Using per-page bbox for page {}", page_num + 1);
}
return Ok((bbox, true));
}
}
let page_number = page_num + 1;
let manual_bbox = if page_number % 2 == 1 {
options.bbox_odd
} else {
options.bbox_even
}
.or(options.bbox_override);
if let Some(bbox) = manual_bbox {
if options.shrink_to_content {
if options.verbose {
eprintln!(
" Manual bbox: ({:.2}, {:.2}, {:.2}, {:.2})",
bbox.left, bbox.bottom, bbox.right, bbox.top
);
eprintln!(" Detecting actual content within manual bbox...");
}
match detect_bbox_within_region(pdf_data, page_num, &bbox, options.verbose) {
Ok(detected_bbox) => {
if options.verbose {
eprintln!(
" Shrunk to actual content: ({:.2}, {:.2}, {:.2}, {:.2})",
detected_bbox.left,
detected_bbox.bottom,
detected_bbox.right,
detected_bbox.top
);
}
return Ok((detected_bbox, false));
}
Err(e) => {
if options.verbose {
eprintln!(" Warning: Could not detect content within bbox ({}), using manual bbox", e);
}
return Ok((bbox, true));
}
}
} else {
return Ok((bbox, true));
}
}
let bbox = detect_bbox_with_method(
pdf_data,
doc,
page_num,
options.bbox_method,
options.verbose,
)?;
Ok((bbox, false))
}
fn detect_bbox_within_region(
pdf_data: &[u8],
page_num: usize,
region: &BoundingBox,
_verbose: bool,
) -> Result<BoundingBox> {
use crate::error::Error;
use hayro::{render, InterpreterSettings, Pdf as HayroPdf, RenderSettings};
use std::sync::Arc;
let scale = 1.0f64;
let data = Arc::new(pdf_data.to_vec());
let pdf = HayroPdf::new(data)
.map_err(|e| Error::PdfParse(format!("hayro failed to load PDF: {:?}", e)))?;
let page = pdf
.pages()
.get(page_num)
.ok_or_else(|| Error::InvalidPage(format!("page {} not found", page_num)))?;
let interpreter_settings = InterpreterSettings::default();
let render_settings = RenderSettings {
x_scale: scale as f32,
y_scale: scale as f32,
..Default::default()
};
let pixmap = render(page, &interpreter_settings, &render_settings);
let width = pixmap.width() as usize;
let height = pixmap.height() as usize;
let pixels = pixmap.data_as_u8_slice();
let region_left_px = (region.left * scale).max(0.0) as usize;
let region_right_px = ((region.right * scale).ceil() as usize).min(width);
let pdf_height = (height as f64) / scale;
let region_bottom_px = ((pdf_height - region.top) * scale).max(0.0) as usize;
let region_top_px = (((pdf_height - region.bottom) * scale).ceil() as usize).min(height);
let mut min_x = region_right_px;
let mut max_x = region_left_px;
let mut min_y = region_top_px;
let mut max_y = region_bottom_px;
const WHITE_THRESHOLD: u8 = 250;
for y in region_bottom_px..region_top_px {
for x in region_left_px..region_right_px {
let idx = (y * width + x) * 4;
if idx + 2 < pixels.len() {
let r = pixels[idx];
let g = pixels[idx + 1];
let b = pixels[idx + 2];
if r < WHITE_THRESHOLD || g < WHITE_THRESHOLD || b < WHITE_THRESHOLD {
min_x = min_x.min(x);
max_x = max_x.max(x);
min_y = min_y.min(y);
max_y = max_y.max(y);
}
}
}
}
if min_x > max_x || min_y > max_y {
return Ok(*region);
}
let left = (min_x as f64) / scale;
let right = (max_x as f64 + 1.0) / scale;
let bottom = pdf_height - ((max_y as f64 + 1.0) / scale);
let top = pdf_height - ((min_y as f64) / scale);
BoundingBox::new(left, bottom, right, top)
}
fn detect_bbox_with_method_parallel(
pdf_data: &[u8],
_doc: &Document, page_num: usize,
method: crate::BBoxMethod,
verbose: bool,
) -> Result<BoundingBox> {
use crate::bbox::detect_bbox_by_rendering;
use crate::BBoxMethod;
match method {
#[cfg(not(target_arch = "wasm32"))]
BBoxMethod::Ghostscript => crate::ghostscript::detect_bbox_gs(pdf_data, page_num),
#[cfg(target_arch = "wasm32")]
BBoxMethod::Ghostscript => {
if verbose {
eprintln!(" Ghostscript not available in WASM, using rendering-based detection");
}
detect_bbox_by_rendering(pdf_data, page_num, Some(72.0))
}
BBoxMethod::ContentStream => {
detect_bbox_by_rendering(pdf_data, page_num, Some(72.0))
}
#[cfg(not(target_arch = "wasm32"))]
BBoxMethod::Auto => {
match crate::ghostscript::detect_bbox_gs(pdf_data, page_num) {
Ok(bbox) => {
if verbose {
eprintln!(" BBox method: Ghostscript");
}
Ok(bbox)
}
Err(e) => {
if verbose {
eprintln!(
" Ghostscript unavailable ({}), using rendering-based detection",
e
);
}
detect_bbox_by_rendering(pdf_data, page_num, Some(72.0))
}
}
}
#[cfg(target_arch = "wasm32")]
BBoxMethod::Auto => {
detect_bbox_by_rendering(pdf_data, page_num, Some(72.0))
}
}
}
#[allow(dead_code)]
#[allow(unused_variables)] fn detect_bbox_with_method(
pdf_data: &[u8],
doc: &mut Document,
page_num: usize,
method: crate::BBoxMethod,
verbose: bool,
) -> Result<BoundingBox> {
use crate::BBoxMethod;
match method {
#[cfg(not(target_arch = "wasm32"))]
BBoxMethod::Ghostscript => crate::ghostscript::detect_bbox_gs(pdf_data, page_num),
#[cfg(target_arch = "wasm32")]
BBoxMethod::Ghostscript => {
if verbose {
eprintln!(" Ghostscript not available in WASM, using content stream parsing");
}
detect_bbox(doc, page_num)
}
BBoxMethod::ContentStream => detect_bbox(doc, page_num),
#[cfg(not(target_arch = "wasm32"))]
BBoxMethod::Auto => {
match crate::ghostscript::detect_bbox_gs(pdf_data, page_num) {
Ok(bbox) => {
if verbose {
eprintln!(" BBox method: Ghostscript");
}
Ok(bbox)
}
Err(e) => {
if verbose {
eprintln!(
" Ghostscript unavailable ({}), using content stream parsing",
e
);
}
detect_bbox(doc, page_num)
}
}
}
#[cfg(target_arch = "wasm32")]
BBoxMethod::Auto => {
detect_bbox(doc, page_num)
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::margins::Margins;
#[test]
fn test_crop_options_default() {
let options = CropOptions::default();
assert_eq!(options.margins, Margins::none());
assert!(options.bbox_override.is_none());
assert!(!options.verbose);
}
#[test]
fn test_crop_options_with_margins() {
let options = CropOptions {
margins: Margins::uniform(10.0),
..Default::default()
};
assert_eq!(options.margins.left, 10.0);
}
}