1use crate::backend::PdfBackend;
7use crate::error::BackendError;
8use crate::handler::ContentHandler;
9use pdfplumber_core::{BBox, ExtractOptions};
10
/// A PDF document loaded through `lopdf`, together with the object ids of its pages.
pub struct LopdfDocument {
    /// The parsed `lopdf` document.
    inner: lopdf::Document,
    /// Page object ids in the order used for 0-based page lookup.
    page_ids: Vec<lopdf::ObjectId>,
}
18
impl LopdfDocument {
    /// Returns a shared reference to the wrapped `lopdf::Document`.
    pub fn inner(&self) -> &lopdf::Document {
        &self.inner
    }
}
25
26impl std::fmt::Debug for LopdfDocument {
27 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
28 f.debug_struct("LopdfDocument")
29 .field("page_count", &self.page_ids.len())
30 .finish_non_exhaustive()
31 }
32}
33
/// Lightweight, copyable handle to a single page of a [`LopdfDocument`].
#[derive(Debug, Clone, Copy)]
pub struct LopdfPage {
    /// Object id of the page dictionary inside the document.
    pub object_id: lopdf::ObjectId,
    /// 0-based position of the page within the document.
    pub index: usize,
}
42
/// Stateless marker type that implements `PdfBackend` on top of `lopdf`.
pub struct LopdfBackend;
59
60fn extract_bbox_from_array(array: &[lopdf::Object]) -> Result<BBox, BackendError> {
62 if array.len() != 4 {
63 return Err(BackendError::Parse(format!(
64 "expected 4-element array for box, got {}",
65 array.len()
66 )));
67 }
68 let x0 = object_to_f64(&array[0])?;
69 let y0 = object_to_f64(&array[1])?;
70 let x1 = object_to_f64(&array[2])?;
71 let y1 = object_to_f64(&array[3])?;
72 Ok(BBox::new(x0, y0, x1, y1))
73}
74
75pub(crate) fn object_to_f64(obj: &lopdf::Object) -> Result<f64, BackendError> {
77 match obj {
78 lopdf::Object::Integer(i) => Ok(*i as f64),
79 lopdf::Object::Real(f) => Ok(*f as f64),
80 _ => Err(BackendError::Parse(format!("expected number, got {obj:?}"))),
81 }
82}
83
84fn resolve_inherited<'a>(
89 doc: &'a lopdf::Document,
90 page_id: lopdf::ObjectId,
91 key: &[u8],
92) -> Result<Option<&'a lopdf::Object>, BackendError> {
93 let mut current_id = page_id;
94 loop {
95 let dict = doc
96 .get_object(current_id)
97 .and_then(|o| o.as_dict())
98 .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
99
100 if let Ok(value) = dict.get(key) {
101 return Ok(Some(value));
102 }
103
104 match dict.get(b"Parent") {
106 Ok(parent_obj) => {
107 current_id = parent_obj
108 .as_reference()
109 .map_err(|e| BackendError::Parse(format!("invalid /Parent reference: {e}")))?;
110 }
111 Err(_) => return Ok(None),
112 }
113 }
114}
115
116impl PdfBackend for LopdfBackend {
117 type Document = LopdfDocument;
118 type Page = LopdfPage;
119 type Error = BackendError;
120
121 fn open(bytes: &[u8]) -> Result<Self::Document, Self::Error> {
122 let inner = lopdf::Document::load_mem(bytes)
123 .map_err(|e| BackendError::Parse(format!("failed to parse PDF: {e}")))?;
124
125 let pages_map = inner.get_pages();
127 let page_ids: Vec<lopdf::ObjectId> = pages_map.values().copied().collect();
128
129 Ok(LopdfDocument { inner, page_ids })
130 }
131
132 fn page_count(doc: &Self::Document) -> usize {
133 doc.page_ids.len()
134 }
135
136 fn get_page(doc: &Self::Document, index: usize) -> Result<Self::Page, Self::Error> {
137 if index >= doc.page_ids.len() {
138 return Err(BackendError::Parse(format!(
139 "page index {index} out of range (0..{})",
140 doc.page_ids.len()
141 )));
142 }
143 Ok(LopdfPage {
144 object_id: doc.page_ids[index],
145 index,
146 })
147 }
148
149 fn page_media_box(doc: &Self::Document, page: &Self::Page) -> Result<BBox, Self::Error> {
150 let obj = resolve_inherited(&doc.inner, page.object_id, b"MediaBox")?
151 .ok_or_else(|| BackendError::Parse("MediaBox not found on page or ancestors".into()))?;
152 let array = obj
153 .as_array()
154 .map_err(|e| BackendError::Parse(format!("MediaBox is not an array: {e}")))?;
155 extract_bbox_from_array(array)
156 }
157
158 fn page_crop_box(doc: &Self::Document, page: &Self::Page) -> Result<Option<BBox>, Self::Error> {
159 let dict = doc
161 .inner
162 .get_object(page.object_id)
163 .and_then(|o| o.as_dict())
164 .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
165
166 match dict.get(b"CropBox") {
167 Ok(obj) => {
168 let array = obj
169 .as_array()
170 .map_err(|e| BackendError::Parse(format!("CropBox is not an array: {e}")))?;
171 Ok(Some(extract_bbox_from_array(array)?))
172 }
173 Err(_) => Ok(None),
174 }
175 }
176
177 fn page_rotate(doc: &Self::Document, page: &Self::Page) -> Result<i32, Self::Error> {
178 match resolve_inherited(&doc.inner, page.object_id, b"Rotate")? {
179 Some(obj) => {
180 let rotation = obj
181 .as_i64()
182 .map_err(|e| BackendError::Parse(format!("Rotate is not an integer: {e}")))?;
183 Ok(rotation as i32)
184 }
185 None => Ok(0), }
187 }
188
189 fn interpret_page(
190 doc: &Self::Document,
191 page: &Self::Page,
192 handler: &mut dyn ContentHandler,
193 options: &ExtractOptions,
194 ) -> Result<(), Self::Error> {
195 let inner = &doc.inner;
196
197 let page_dict = inner
199 .get_object(page.object_id)
200 .and_then(|o| o.as_dict())
201 .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
202
203 let content_bytes = get_page_content_bytes(inner, page_dict)?;
205
206 let resources = get_page_resources(inner, page.object_id)?;
208
209 let mut gstate = crate::interpreter_state::InterpreterState::new();
211 let mut tstate = crate::text_state::TextState::new();
212
213 crate::interpreter::interpret_content_stream(
215 inner,
216 &content_bytes,
217 resources,
218 handler,
219 options,
220 0, &mut gstate,
222 &mut tstate,
223 )
224 }
225}
226
227fn get_page_content_bytes(
231 doc: &lopdf::Document,
232 page_dict: &lopdf::Dictionary,
233) -> Result<Vec<u8>, BackendError> {
234 let contents_obj = match page_dict.get(b"Contents") {
235 Ok(obj) => obj,
236 Err(_) => return Ok(Vec::new()), };
238
239 match contents_obj {
240 lopdf::Object::Reference(id) => {
241 let obj = doc
242 .get_object(*id)
243 .map_err(|e| BackendError::Parse(format!("failed to resolve /Contents: {e}")))?;
244 let stream = obj
245 .as_stream()
246 .map_err(|e| BackendError::Parse(format!("/Contents is not a stream: {e}")))?;
247 decode_content_stream(stream)
248 }
249 lopdf::Object::Array(arr) => {
250 let mut content = Vec::new();
251 for item in arr {
252 let id = item.as_reference().map_err(|e| {
253 BackendError::Parse(format!("/Contents array item is not a reference: {e}"))
254 })?;
255 let obj = doc.get_object(id).map_err(|e| {
256 BackendError::Parse(format!("failed to resolve /Contents stream: {e}"))
257 })?;
258 let stream = obj.as_stream().map_err(|e| {
259 BackendError::Parse(format!("/Contents array item is not a stream: {e}"))
260 })?;
261 let bytes = decode_content_stream(stream)?;
262 if !content.is_empty() {
263 content.push(b' ');
264 }
265 content.extend_from_slice(&bytes);
266 }
267 Ok(content)
268 }
269 _ => Err(BackendError::Parse(
270 "/Contents is not a reference or array".to_string(),
271 )),
272 }
273}
274
275fn decode_content_stream(stream: &lopdf::Stream) -> Result<Vec<u8>, BackendError> {
277 if stream.dict.get(b"Filter").is_ok() {
278 stream
279 .decompressed_content()
280 .map_err(|e| BackendError::Parse(format!("failed to decompress content stream: {e}")))
281 } else {
282 Ok(stream.content.clone())
283 }
284}
285
286fn get_page_resources(
288 doc: &lopdf::Document,
289 page_id: lopdf::ObjectId,
290) -> Result<&lopdf::Dictionary, BackendError> {
291 match resolve_inherited(doc, page_id, b"Resources")? {
292 Some(obj) => {
293 let obj = match obj {
295 lopdf::Object::Reference(id) => doc.get_object(*id).map_err(|e| {
296 BackendError::Parse(format!("failed to resolve /Resources reference: {e}"))
297 })?,
298 other => other,
299 };
300 obj.as_dict()
301 .map_err(|_| BackendError::Parse("/Resources is not a dictionary".to_string()))
302 }
303 None => {
304 static EMPTY_DICT: std::sync::LazyLock<lopdf::Dictionary> =
307 std::sync::LazyLock::new(lopdf::Dictionary::new);
308 Ok(&EMPTY_DICT)
309 }
310 }
311}
312
/// Builds an in-memory PDF with `page_count` pages, each carrying its own
/// `/MediaBox` of `[0 0 612 792]` and no content.
#[cfg(test)]
fn create_test_pdf(page_count: usize) -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    // Reserve the page-tree id first so every page can name it as /Parent.
    let tree_id: ObjectId = pdf.new_object_id();

    let kids: Vec<Object> = (0..page_count)
        .map(|_| {
            let leaf_id = pdf.add_object(dictionary! {
                "Type" => "Page",
                "Parent" => tree_id,
                "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            });
            Object::from(leaf_id)
        })
        .collect();

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => kids,
        "Count" => page_count as i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
353
/// Builds a one-page PDF whose page has no `/MediaBox`; the box
/// `[0 0 595 842]` is defined on the parent `/Pages` node instead.
#[cfg(test)]
fn create_test_pdf_inherited_media_box() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    // The page deliberately omits /MediaBox.
    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
        "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
388
/// Builds a one-page PDF whose page has a `/MediaBox` of `[0 0 612 792]` and
/// a `/CropBox` of `[36.0 36.0 576.0 756.0]` given as real numbers.
#[cfg(test)]
fn create_test_pdf_with_crop_box() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let crop_box = vec![
        Object::Real(36.0),
        Object::Real(36.0),
        Object::Real(576.0),
        Object::Real(756.0),
    ];
    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "CropBox" => crop_box,
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
428
/// Builds a one-page PDF whose page dictionary carries `/Rotate` set to
/// `rotation`.
#[cfg(test)]
fn create_test_pdf_with_rotate(rotation: i64) -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Rotate" => rotation,
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
463
/// Builds a one-page PDF whose page has no `/Rotate`; the value `rotation`
/// is defined on the parent `/Pages` node instead.
#[cfg(test)]
fn create_test_pdf_inherited_rotate(rotation: i64) -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    // The page deliberately omits /Rotate.
    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
        "Rotate" => rotation,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
499
/// Builds a one-page PDF whose page content (`q /FM1 Do Q`) invokes a Form
/// XObject `FM1`; the form's own stream shows the text `Hello` with font `F1`
/// (Helvetica) at size 12.
#[cfg(test)]
fn create_test_pdf_with_form_xobject() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let font_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });
    // Font sub-dictionary shared by the form and the page resources.
    let f1_fonts = || Object::Dictionary(dictionary! { "F1" => font_id });

    let form_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Resources" => Object::Dictionary(dictionary! {
            "Font" => f1_fonts(),
        }),
    };
    let form = Stream::new(form_dict, b"BT /F1 12 Tf 72 700 Td (Hello) Tj ET".to_vec());
    let form_id = pdf.add_object(Object::Stream(form));

    let page_body = Stream::new(lopdf::Dictionary::new(), b"q /FM1 Do Q".to_vec());
    let content_id = pdf.add_object(Object::Stream(page_body));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => content_id,
        "Resources" => Object::Dictionary(dictionary! {
            "Font" => f1_fonts(),
            "XObject" => Object::Dictionary(dictionary! {
                "FM1" => form_id,
            }),
        }),
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
574
/// Builds a one-page PDF with two nested Form XObjects: the page invokes
/// `FM1`, `FM1` invokes `FM2`, and `FM2` shows the text `Deep` with font `F1`
/// (Helvetica) at size 10.
#[cfg(test)]
fn create_test_pdf_with_nested_form_xobjects() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let font_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });
    // Font sub-dictionary shared by both forms and the page resources.
    let f1_fonts = || Object::Dictionary(dictionary! { "F1" => font_id });

    // Innermost form: the only stream that actually shows text.
    let inner_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Resources" => Object::Dictionary(dictionary! {
            "Font" => f1_fonts(),
        }),
    };
    let inner_form = Stream::new(inner_dict, b"BT /F1 10 Tf (Deep) Tj ET".to_vec());
    let fm2_id = pdf.add_object(Object::Stream(inner_form));

    // Outer form: merely invokes the inner one.
    let outer_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Resources" => Object::Dictionary(dictionary! {
            "XObject" => Object::Dictionary(dictionary! {
                "FM2" => fm2_id,
            }),
            "Font" => f1_fonts(),
        }),
    };
    let outer_form = Stream::new(outer_dict, b"q /FM2 Do Q".to_vec());
    let fm1_id = pdf.add_object(Object::Stream(outer_form));

    let page_body = Stream::new(lopdf::Dictionary::new(), b"q /FM1 Do Q".to_vec());
    let content_id = pdf.add_object(Object::Stream(page_body));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => content_id,
        "Resources" => Object::Dictionary(dictionary! {
            "XObject" => Object::Dictionary(dictionary! {
                "FM1" => fm1_id,
            }),
            "Font" => f1_fonts(),
        }),
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
669
/// Builds a one-page PDF whose Form XObject `FM1` declares a `/Matrix` of
/// `[2 0 0 2 10 20]` and shows the single character `A` with font `F1`
/// (Helvetica) at size 12.
#[cfg(test)]
fn create_test_pdf_form_xobject_with_matrix() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let font_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });
    // Font sub-dictionary shared by the form and the page resources.
    let f1_fonts = || Object::Dictionary(dictionary! { "F1" => font_id });

    let form_matrix = vec![
        Object::Real(2.0),
        Object::Real(0.0),
        Object::Real(0.0),
        Object::Real(2.0),
        Object::Real(10.0),
        Object::Real(20.0),
    ];
    let form_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Matrix" => form_matrix,
        "Resources" => Object::Dictionary(dictionary! {
            "Font" => f1_fonts(),
        }),
    };
    let form = Stream::new(form_dict, b"BT /F1 12 Tf (A) Tj ET".to_vec());
    let form_id = pdf.add_object(Object::Stream(form));

    let page_body = Stream::new(lopdf::Dictionary::new(), b"q /FM1 Do Q".to_vec());
    let content_id = pdf.add_object(Object::Stream(page_body));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => content_id,
        "Resources" => Object::Dictionary(dictionary! {
            "XObject" => Object::Dictionary(dictionary! {
                "FM1" => form_id,
            }),
            "Font" => f1_fonts(),
        }),
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
745
/// Builds a one-page PDF that paints a 2x2, 8-bit `DeviceRGB` Image XObject
/// named `Im0` after applying the matrix `200 0 0 150 100 300 cm`.
#[cfg(test)]
fn create_test_pdf_with_image_xobject() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    // Twelve raw sample bytes: 2 x 2 pixels, 3 components each.
    let samples = vec![255u8, 0, 0, 0, 255, 0, 0, 0, 255, 255, 255, 0];
    let image_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Image",
        "Width" => 2i64,
        "Height" => 2i64,
        "ColorSpace" => "DeviceRGB",
        "BitsPerComponent" => 8i64,
    };
    let image_id = pdf.add_object(Object::Stream(Stream::new(image_dict, samples)));

    let page_body = Stream::new(
        lopdf::Dictionary::new(),
        b"q 200 0 0 150 100 300 cm /Im0 Do Q".to_vec(),
    );
    let content_id = pdf.add_object(Object::Stream(page_body));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => content_id,
        "Resources" => Object::Dictionary(dictionary! {
            "XObject" => Object::Dictionary(dictionary! {
                "Im0" => image_id,
            }),
        }),
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
805
/// Builds a one-page PDF whose content stream shows the text `Hi` directly
/// on the page with font `F1` (Helvetica) at size 12.
#[cfg(test)]
fn create_test_pdf_with_text_content() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let font_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });

    let page_body = Stream::new(
        lopdf::Dictionary::new(),
        b"BT /F1 12 Tf 72 700 Td (Hi) Tj ET".to_vec(),
    );
    let content_id = pdf.add_object(Object::Stream(page_body));

    let page_resources = dictionary! {
        "Font" => Object::Dictionary(dictionary! {
            "F1" => font_id,
        }),
    };
    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => content_id,
        "Resources" => Object::Dictionary(page_resources),
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
855
/// Unit tests for the lopdf backend, driven by the in-memory fixtures above.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::handler::{CharEvent, ContentHandler, ImageEvent};
    use pdfplumber_core::PdfError;

    /// Handler that records every char and image event it receives.
    struct CollectingHandler {
        chars: Vec<CharEvent>,
        images: Vec<ImageEvent>,
    }

    impl CollectingHandler {
        fn new() -> Self {
            Self {
                chars: Vec::new(),
                images: Vec::new(),
            }
        }
    }

    impl ContentHandler for CollectingHandler {
        fn on_char(&mut self, event: CharEvent) {
            self.chars.push(event);
        }
        fn on_image(&mut self, event: ImageEvent) {
            self.images.push(event);
        }
    }

    // ---- shared helpers -------------------------------------------------

    /// Opens `bytes`, panicking if the backend rejects them.
    fn load(bytes: &[u8]) -> LopdfDocument {
        LopdfBackend::open(bytes).unwrap()
    }

    /// Opens `bytes` and fetches page 0, panicking on any failure.
    fn load_first_page(bytes: &[u8]) -> (LopdfDocument, LopdfPage) {
        let doc = load(bytes);
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        (doc, page)
    }

    /// Interprets page 0 of `bytes` into a fresh handler and returns both the
    /// handler and the interpreter's result.
    fn interpret_first_page(
        bytes: &[u8],
        options: &ExtractOptions,
    ) -> (CollectingHandler, Result<(), BackendError>) {
        let (doc, page) = load_first_page(bytes);
        let mut handler = CollectingHandler::new();
        let outcome = LopdfBackend::interpret_page(&doc, &page, &mut handler, options);
        (handler, outcome)
    }

    /// Asserts that the collected chars carry exactly the codes in `expected`.
    fn assert_char_codes(handler: &CollectingHandler, expected: &[u8]) {
        assert_eq!(handler.chars.len(), expected.len());
        for (event, byte) in handler.chars.iter().zip(expected) {
            assert_eq!(event.char_code, *byte as u32);
        }
    }

    /// Returns the rotation the backend reports for page 0 of `bytes`.
    fn first_page_rotation(bytes: &[u8]) -> i32 {
        let (doc, page) = load_first_page(bytes);
        LopdfBackend::page_rotate(&doc, &page).unwrap()
    }

    // ---- open -----------------------------------------------------------

    #[test]
    fn open_valid_single_page_pdf() {
        let doc = load(&create_test_pdf(1));
        assert_eq!(LopdfBackend::page_count(&doc), 1);
    }

    #[test]
    fn open_valid_multi_page_pdf() {
        let doc = load(&create_test_pdf(5));
        assert_eq!(LopdfBackend::page_count(&doc), 5);
    }

    #[test]
    fn open_invalid_bytes_returns_error() {
        assert!(LopdfBackend::open(b"not a pdf").is_err());
    }

    #[test]
    fn open_empty_bytes_returns_error() {
        assert!(LopdfBackend::open(&[]).is_err());
    }

    #[test]
    fn open_error_converts_to_pdf_error() {
        let backend_err = LopdfBackend::open(b"garbage").unwrap_err();
        let pdf_err: PdfError = backend_err.into();
        assert!(matches!(pdf_err, PdfError::ParseError(_)));
    }

    // ---- page_count -----------------------------------------------------

    #[test]
    fn page_count_zero_pages() {
        let doc = load(&create_test_pdf(0));
        assert_eq!(LopdfBackend::page_count(&doc), 0);
    }

    #[test]
    fn page_count_three_pages() {
        let doc = load(&create_test_pdf(3));
        assert_eq!(LopdfBackend::page_count(&doc), 3);
    }

    // ---- get_page -------------------------------------------------------

    #[test]
    fn get_page_first_page() {
        let doc = load(&create_test_pdf(3));
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        assert_eq!(page.index, 0);
    }

    #[test]
    fn get_page_last_page() {
        let doc = load(&create_test_pdf(3));
        let page = LopdfBackend::get_page(&doc, 2).unwrap();
        assert_eq!(page.index, 2);
    }

    #[test]
    fn get_page_out_of_bounds() {
        let doc = load(&create_test_pdf(2));
        assert!(LopdfBackend::get_page(&doc, 2).is_err());
    }

    #[test]
    fn get_page_out_of_bounds_error_converts_to_pdf_error() {
        let doc = load(&create_test_pdf(1));
        let backend_err = LopdfBackend::get_page(&doc, 5).unwrap_err();
        let pdf_err: PdfError = backend_err.into();
        assert!(matches!(pdf_err, PdfError::ParseError(_)));
        assert!(pdf_err.to_string().contains("out of range"));
    }

    #[test]
    fn get_page_on_empty_document() {
        let doc = load(&create_test_pdf(0));
        assert!(LopdfBackend::get_page(&doc, 0).is_err());
    }

    #[test]
    fn pages_have_distinct_object_ids() {
        let doc = load(&create_test_pdf(3));
        let first = LopdfBackend::get_page(&doc, 0).unwrap();
        let second = LopdfBackend::get_page(&doc, 1).unwrap();
        let third = LopdfBackend::get_page(&doc, 2).unwrap();
        assert_ne!(first.object_id, second.object_id);
        assert_ne!(second.object_id, third.object_id);
        assert_ne!(first.object_id, third.object_id);
    }

    #[test]
    fn round_trip_open_count_access() {
        let doc = load(&create_test_pdf(4));
        let total = LopdfBackend::page_count(&doc);
        assert_eq!(total, 4);

        for position in 0..total {
            let page = LopdfBackend::get_page(&doc, position).unwrap();
            assert_eq!(page.index, position);
        }

        // One past the last page must be rejected.
        assert!(LopdfBackend::get_page(&doc, total).is_err());
    }

    // ---- MediaBox -------------------------------------------------------

    #[test]
    fn media_box_explicit_us_letter() {
        let (doc, page) = load_first_page(&create_test_pdf(1));
        let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
        assert_eq!(media_box, BBox::new(0.0, 0.0, 612.0, 792.0));
    }

    #[test]
    fn media_box_inherited_from_parent() {
        let (doc, page) = load_first_page(&create_test_pdf_inherited_media_box());
        let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
        assert_eq!(media_box, BBox::new(0.0, 0.0, 595.0, 842.0));
    }

    #[test]
    fn media_box_width_height() {
        let (doc, page) = load_first_page(&create_test_pdf(1));
        let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
        assert_eq!(media_box.width(), 612.0);
        assert_eq!(media_box.height(), 792.0);
    }

    // ---- CropBox --------------------------------------------------------

    #[test]
    fn crop_box_present() {
        let (doc, page) = load_first_page(&create_test_pdf_with_crop_box());
        let crop_box = LopdfBackend::page_crop_box(&doc, &page).unwrap();
        assert_eq!(crop_box, Some(BBox::new(36.0, 36.0, 576.0, 756.0)));
    }

    #[test]
    fn crop_box_absent() {
        let (doc, page) = load_first_page(&create_test_pdf(1));
        let crop_box = LopdfBackend::page_crop_box(&doc, &page).unwrap();
        assert_eq!(crop_box, None);
    }

    // ---- Rotate ---------------------------------------------------------

    #[test]
    fn rotate_default_zero() {
        assert_eq!(first_page_rotation(&create_test_pdf(1)), 0);
    }

    #[test]
    fn rotate_90() {
        assert_eq!(first_page_rotation(&create_test_pdf_with_rotate(90)), 90);
    }

    #[test]
    fn rotate_180() {
        assert_eq!(first_page_rotation(&create_test_pdf_with_rotate(180)), 180);
    }

    #[test]
    fn rotate_270() {
        assert_eq!(first_page_rotation(&create_test_pdf_with_rotate(270)), 270);
    }

    #[test]
    fn rotate_inherited_from_parent() {
        assert_eq!(
            first_page_rotation(&create_test_pdf_inherited_rotate(90)),
            90
        );
    }

    #[test]
    fn page_properties_round_trip() {
        let (doc, page) = load_first_page(&create_test_pdf_with_crop_box());

        let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
        let crop_box = LopdfBackend::page_crop_box(&doc, &page).unwrap();
        let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();

        assert_eq!(media_box, BBox::new(0.0, 0.0, 612.0, 792.0));
        assert!(crop_box.is_some());
        assert_eq!(rotation, 0);
    }

    // ---- interpret_page -------------------------------------------------

    #[test]
    fn interpret_page_simple_text() {
        let options = ExtractOptions::default();
        let (handler, outcome) =
            interpret_first_page(&create_test_pdf_with_text_content(), &options);
        outcome.unwrap();

        assert_char_codes(&handler, b"Hi");
        assert_eq!(handler.chars[0].font_size, 12.0);
        assert_eq!(handler.chars[0].font_name, "Helvetica");
    }

    #[test]
    fn interpret_page_no_content() {
        let options = ExtractOptions::default();
        let (handler, outcome) = interpret_first_page(&create_test_pdf(1), &options);
        outcome.unwrap();
        assert_eq!(handler.chars.len(), 0);
    }

    #[test]
    fn interpret_page_form_xobject_text() {
        let options = ExtractOptions::default();
        let (handler, outcome) =
            interpret_first_page(&create_test_pdf_with_form_xobject(), &options);
        outcome.unwrap();

        assert_char_codes(&handler, b"Hello");
        assert_eq!(handler.chars[0].font_name, "Helvetica");
        assert_eq!(handler.chars[0].font_size, 12.0);
    }

    #[test]
    fn interpret_page_nested_form_xobjects() {
        let options = ExtractOptions::default();
        let (handler, outcome) =
            interpret_first_page(&create_test_pdf_with_nested_form_xobjects(), &options);
        outcome.unwrap();

        assert_char_codes(&handler, b"Deep");
    }

    #[test]
    fn interpret_page_form_xobject_matrix_applied() {
        let options = ExtractOptions::default();
        let (handler, outcome) =
            interpret_first_page(&create_test_pdf_form_xobject_with_matrix(), &options);
        outcome.unwrap();

        assert_char_codes(&handler, b"A");
        // The reported CTM must match the form's /Matrix [2 0 0 2 10 20].
        let ctm = handler.chars[0].ctm;
        assert!((ctm[0] - 2.0).abs() < 0.01);
        assert!((ctm[3] - 2.0).abs() < 0.01);
        assert!((ctm[4] - 10.0).abs() < 0.01);
        assert!((ctm[5] - 20.0).abs() < 0.01);
    }

    #[test]
    fn interpret_page_form_xobject_state_restored() {
        let options = ExtractOptions::default();
        let (_handler, outcome) =
            interpret_first_page(&create_test_pdf_with_form_xobject(), &options);
        assert!(outcome.is_ok());
    }

    #[test]
    fn interpret_page_image_xobject() {
        let options = ExtractOptions::default();
        let (handler, outcome) =
            interpret_first_page(&create_test_pdf_with_image_xobject(), &options);
        outcome.unwrap();

        assert_eq!(handler.chars.len(), 0);
        assert_eq!(handler.images.len(), 1);

        let image = &handler.images[0];
        assert_eq!(image.name, "Im0");
        assert_eq!(image.width, 2);
        assert_eq!(image.height, 2);
        assert_eq!(image.colorspace.as_deref(), Some("DeviceRGB"));
        assert_eq!(image.bits_per_component, Some(8));

        // The reported CTM must match `200 0 0 150 100 300 cm`.
        let ctm = image.ctm;
        assert!((ctm[0] - 200.0).abs() < 0.01);
        assert!((ctm[3] - 150.0).abs() < 0.01);
        assert!((ctm[4] - 100.0).abs() < 0.01);
        assert!((ctm[5] - 300.0).abs() < 0.01);
    }

    #[test]
    fn interpret_page_recursion_limit() {
        let mut options = ExtractOptions::default();
        // With a limit of zero, invoking the form XObject must be refused.
        options.max_recursion_depth = 0;

        let (_handler, outcome) =
            interpret_first_page(&create_test_pdf_with_form_xobject(), &options);
        assert!(outcome.is_err());
        let err_msg = outcome.unwrap_err().to_string();
        assert!(err_msg.contains("recursion depth"));
    }
}