1use pdfplumber_core::{
7 Annotation, BBox, Bookmark, DocumentMetadata, ExtractOptions, FormField, Hyperlink,
8 ImageContent, PdfError, RepairOptions, RepairResult, SignatureInfo, StructElement,
9 ValidationIssue,
10};
11
12use crate::handler::ContentHandler;
13
14pub trait PdfBackend {
36 type Document;
38
39 type Page;
41
42 type Error: std::error::Error + Into<PdfError>;
44
45 fn open(bytes: &[u8]) -> Result<Self::Document, Self::Error>;
52
53 fn open_with_password(bytes: &[u8], password: &[u8]) -> Result<Self::Document, Self::Error>;
63
64 fn page_count(doc: &Self::Document) -> usize;
66
67 fn get_page(doc: &Self::Document, index: usize) -> Result<Self::Page, Self::Error>;
73
74 fn page_media_box(doc: &Self::Document, page: &Self::Page) -> Result<BBox, Self::Error>;
85
86 fn page_crop_box(doc: &Self::Document, page: &Self::Page) -> Result<Option<BBox>, Self::Error>;
95
96 fn page_trim_box(doc: &Self::Document, page: &Self::Page) -> Result<Option<BBox>, Self::Error>;
106
107 fn page_bleed_box(doc: &Self::Document, page: &Self::Page)
117 -> Result<Option<BBox>, Self::Error>;
118
119 fn page_art_box(doc: &Self::Document, page: &Self::Page) -> Result<Option<BBox>, Self::Error>;
129
130 fn page_rotate(doc: &Self::Document, page: &Self::Page) -> Result<i32, Self::Error>;
138
139 fn document_metadata(doc: &Self::Document) -> Result<DocumentMetadata, Self::Error>;
149
150 fn document_bookmarks(doc: &Self::Document) -> Result<Vec<Bookmark>, Self::Error>;
160
161 fn page_annotations(
170 doc: &Self::Document,
171 page: &Self::Page,
172 ) -> Result<Vec<Annotation>, Self::Error>;
173
174 fn page_hyperlinks(
184 doc: &Self::Document,
185 page: &Self::Page,
186 ) -> Result<Vec<Hyperlink>, Self::Error>;
187
188 fn interpret_page(
200 doc: &Self::Document,
201 page: &Self::Page,
202 handler: &mut dyn ContentHandler,
203 options: &ExtractOptions,
204 ) -> Result<(), Self::Error>;
205
206 fn document_form_fields(doc: &Self::Document) -> Result<Vec<FormField>, Self::Error>;
217
218 fn document_structure_tree(doc: &Self::Document) -> Result<Vec<StructElement>, Self::Error>;
229
230 fn extract_image_content(
242 doc: &Self::Document,
243 page: &Self::Page,
244 image_name: &str,
245 ) -> Result<ImageContent, Self::Error>;
246
247 fn validate(doc: &Self::Document) -> Result<Vec<ValidationIssue>, Self::Error> {
261 let _ = doc;
262 Ok(Vec::new())
263 }
264
265 fn document_signatures(doc: &Self::Document) -> Result<Vec<SignatureInfo>, Self::Error> {
277 let _ = doc;
278 Ok(Vec::new())
279 }
280
281 fn repair(
291 bytes: &[u8],
292 options: &RepairOptions,
293 ) -> Result<(Vec<u8>, RepairResult), Self::Error> {
294 let _ = (bytes, options);
295 Ok((bytes.to_vec(), RepairResult::new()))
296 }
297}
298
299#[cfg(test)]
300mod tests {
301 use super::*;
302 use crate::handler::{CharEvent, ImageEvent, PaintOp, PathEvent};
303 use pdfplumber_core::{Color, ImageFormat, PathSegment, Point};
304
305 #[derive(Debug)]
308 struct MockDocument {
309 pages: Vec<MockPageData>,
310 }
311
312 #[derive(Debug)]
313 struct MockPageData {
314 media_box: BBox,
315 crop_box: Option<BBox>,
316 trim_box: Option<BBox>,
317 bleed_box: Option<BBox>,
318 art_box: Option<BBox>,
319 rotate: i32,
320 }
321
/// Page handle handed out by the mock backend.
#[derive(Debug)]
struct MockPage {
    /// Position of this page within `MockDocument::pages`.
    index: usize,
}
326
327 struct CollectingHandler {
330 chars: Vec<CharEvent>,
331 paths: Vec<PathEvent>,
332 images: Vec<ImageEvent>,
333 }
334
335 impl CollectingHandler {
336 fn new() -> Self {
337 Self {
338 chars: Vec::new(),
339 paths: Vec::new(),
340 images: Vec::new(),
341 }
342 }
343 }
344
345 impl ContentHandler for CollectingHandler {
346 fn on_char(&mut self, event: CharEvent) {
347 self.chars.push(event);
348 }
349
350 fn on_path_painted(&mut self, event: PathEvent) {
351 self.paths.push(event);
352 }
353
354 fn on_image(&mut self, event: ImageEvent) {
355 self.images.push(event);
356 }
357 }
358
/// Stateless marker type that implements `PdfBackend` for the tests below.
struct MockBackend;
362
363 impl PdfBackend for MockBackend {
364 type Document = MockDocument;
365 type Page = MockPage;
366 type Error = PdfError;
367
368 fn open(bytes: &[u8]) -> Result<Self::Document, Self::Error> {
369 if bytes.is_empty() {
370 return Err(PdfError::ParseError("empty input".to_string()));
371 }
372 let page_count = bytes[0] as usize;
374 let mut pages = Vec::new();
375 for _ in 0..page_count {
376 pages.push(MockPageData {
377 media_box: BBox::new(0.0, 0.0, 612.0, 792.0), crop_box: None,
379 trim_box: None,
380 bleed_box: None,
381 art_box: None,
382 rotate: 0,
383 });
384 }
385 Ok(MockDocument { pages })
386 }
387
388 fn open_with_password(
389 bytes: &[u8],
390 _password: &[u8],
391 ) -> Result<Self::Document, Self::Error> {
392 Self::open(bytes)
394 }
395
396 fn page_count(doc: &Self::Document) -> usize {
397 doc.pages.len()
398 }
399
400 fn get_page(doc: &Self::Document, index: usize) -> Result<Self::Page, Self::Error> {
401 if index >= doc.pages.len() {
402 return Err(PdfError::ParseError(format!(
403 "page index {index} out of range (0..{})",
404 doc.pages.len()
405 )));
406 }
407 Ok(MockPage { index })
408 }
409
410 fn page_media_box(doc: &Self::Document, page: &Self::Page) -> Result<BBox, Self::Error> {
411 Ok(doc.pages[page.index].media_box)
412 }
413
414 fn page_crop_box(
415 doc: &Self::Document,
416 page: &Self::Page,
417 ) -> Result<Option<BBox>, Self::Error> {
418 Ok(doc.pages[page.index].crop_box)
419 }
420
421 fn page_trim_box(
422 doc: &Self::Document,
423 page: &Self::Page,
424 ) -> Result<Option<BBox>, Self::Error> {
425 Ok(doc.pages[page.index].trim_box)
426 }
427
428 fn page_bleed_box(
429 doc: &Self::Document,
430 page: &Self::Page,
431 ) -> Result<Option<BBox>, Self::Error> {
432 Ok(doc.pages[page.index].bleed_box)
433 }
434
435 fn page_art_box(
436 doc: &Self::Document,
437 page: &Self::Page,
438 ) -> Result<Option<BBox>, Self::Error> {
439 Ok(doc.pages[page.index].art_box)
440 }
441
442 fn page_rotate(doc: &Self::Document, page: &Self::Page) -> Result<i32, Self::Error> {
443 Ok(doc.pages[page.index].rotate)
444 }
445
446 fn document_metadata(_doc: &Self::Document) -> Result<DocumentMetadata, Self::Error> {
447 Ok(DocumentMetadata::default())
448 }
449
450 fn document_bookmarks(_doc: &Self::Document) -> Result<Vec<Bookmark>, Self::Error> {
451 Ok(Vec::new())
452 }
453
454 fn document_form_fields(_doc: &Self::Document) -> Result<Vec<FormField>, Self::Error> {
455 Ok(Vec::new())
456 }
457
458 fn document_signatures(_doc: &Self::Document) -> Result<Vec<SignatureInfo>, Self::Error> {
459 Ok(Vec::new())
460 }
461
462 fn document_structure_tree(
463 _doc: &Self::Document,
464 ) -> Result<Vec<StructElement>, Self::Error> {
465 Ok(Vec::new())
466 }
467
468 fn page_annotations(
469 _doc: &Self::Document,
470 _page: &Self::Page,
471 ) -> Result<Vec<Annotation>, Self::Error> {
472 Ok(Vec::new())
473 }
474
475 fn page_hyperlinks(
476 _doc: &Self::Document,
477 _page: &Self::Page,
478 ) -> Result<Vec<Hyperlink>, Self::Error> {
479 Ok(Vec::new())
480 }
481
482 fn interpret_page(
483 _doc: &Self::Document,
484 _page: &Self::Page,
485 handler: &mut dyn ContentHandler,
486 _options: &ExtractOptions,
487 ) -> Result<(), Self::Error> {
488 handler.on_char(CharEvent {
490 char_code: 72, unicode: Some("H".to_string()),
492 font_name: "Times-Roman".to_string(),
493 font_size: 14.0,
494 text_matrix: [1.0, 0.0, 0.0, 1.0, 72.0, 720.0],
495 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
496 displacement: 722.0,
497 char_spacing: 0.0,
498 word_spacing: 0.0,
499 h_scaling: 1.0,
500 rise: 0.0,
501 });
502
503 handler.on_path_painted(PathEvent {
505 segments: vec![
506 PathSegment::MoveTo(Point::new(72.0, 700.0)),
507 PathSegment::LineTo(Point::new(540.0, 700.0)),
508 ],
509 paint_op: PaintOp::Stroke,
510 line_width: 0.5,
511 stroking_color: Some(Color::black()),
512 non_stroking_color: None,
513 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
514 dash_pattern: None,
515 fill_rule: None,
516 });
517
518 handler.on_image(ImageEvent {
520 name: "Im1".to_string(),
521 ctm: [100.0, 0.0, 0.0, 75.0, 72.0, 600.0],
522 width: 400,
523 height: 300,
524 colorspace: Some("DeviceRGB".to_string()),
525 bits_per_component: Some(8),
526 });
527
528 Ok(())
529 }
530
531 fn extract_image_content(
532 _doc: &Self::Document,
533 _page: &Self::Page,
534 image_name: &str,
535 ) -> Result<ImageContent, Self::Error> {
536 if image_name == "Im1" {
537 Ok(ImageContent {
538 data: vec![255, 0, 0, 0, 255, 0, 0, 0, 255, 255, 255, 0],
539 format: ImageFormat::Raw,
540 width: 2,
541 height: 2,
542 })
543 } else {
544 Err(PdfError::ParseError(format!(
545 "image XObject /{image_name} not found"
546 )))
547 }
548 }
549 }
550
/// Opening input whose first byte is 3 yields a three-page document.
#[test]
fn mock_backend_open_valid_document() {
    let opened = MockBackend::open(&[3]).expect("non-empty input should open");
    let reported = MockBackend::page_count(&opened);
    assert_eq!(reported, 3);
}
558
/// Opening input whose first byte is 1 yields a one-page document.
#[test]
fn mock_backend_open_single_page() {
    let opened = MockBackend::open(&[1]).expect("non-empty input should open");
    let reported = MockBackend::page_count(&opened);
    assert_eq!(reported, 1);
}
564
/// Opening empty input is an error.
#[test]
fn mock_backend_open_empty_bytes_fails() {
    let outcome = MockBackend::open(&[]);
    assert!(outcome.is_err(), "empty input must be rejected");
}
570
/// Page handles for in-range indices carry the index that was requested.
#[test]
fn mock_backend_get_page_valid_index() {
    let doc = MockBackend::open(&[3]).expect("non-empty input should open");

    // First and last page of the three-page document.
    for wanted in [0usize, 2] {
        let handle = MockBackend::get_page(&doc, wanted).expect("index is in range");
        assert_eq!(handle.index, wanted);
    }
}
582
/// Requesting a page past the end of the document is an error.
#[test]
fn mock_backend_get_page_out_of_bounds() {
    let two_pages = MockBackend::open(&[2]).expect("non-empty input should open");
    let outcome = MockBackend::get_page(&two_pages, 5);
    assert!(outcome.is_err(), "index 5 is outside a two-page document");
}
589
/// Pages created by `open` report the mock's fixed media box.
#[test]
fn mock_backend_page_media_box() {
    let doc = MockBackend::open(&[1]).expect("non-empty input should open");
    let first = MockBackend::get_page(&doc, 0).expect("page 0 exists");

    let expected = BBox::new(0.0, 0.0, 612.0, 792.0);
    let actual = MockBackend::page_media_box(&doc, &first).expect("media box is available");
    assert_eq!(actual, expected);
}
599
/// Pages created by `open` have no crop box.
#[test]
fn mock_backend_page_crop_box_none() {
    let doc = MockBackend::open(&[1]).expect("non-empty input should open");
    let first = MockBackend::get_page(&doc, 0).expect("page 0 exists");

    let crop = MockBackend::page_crop_box(&doc, &first).expect("crop box lookup succeeds");
    assert!(crop.is_none(), "expected no crop box, got {crop:?}");
}
609
/// Pages created by `open` report a rotation of 0.
#[test]
fn mock_backend_page_rotate_default() {
    let doc = MockBackend::open(&[1]).expect("non-empty input should open");
    let first = MockBackend::get_page(&doc, 0).expect("page 0 exists");

    let rotation = MockBackend::page_rotate(&doc, &first).expect("rotation lookup succeeds");
    assert_eq!(rotation, 0);
}
619
/// Interpreting a page delivers exactly one character event with the
/// values the mock hard-codes.
#[test]
fn mock_backend_interpret_page_emits_char() {
    let doc = MockBackend::open(&[1]).expect("non-empty input should open");
    let first = MockBackend::get_page(&doc, 0).expect("page 0 exists");
    let options = ExtractOptions::default();
    let mut sink = CollectingHandler::new();

    MockBackend::interpret_page(&doc, &first, &mut sink, &options)
        .expect("mock interpretation succeeds");

    assert_eq!(sink.chars.len(), 1);
    let glyph = &sink.chars[0];
    assert_eq!(glyph.char_code, 72);
    assert_eq!(glyph.unicode.as_deref(), Some("H"));
    assert_eq!(glyph.font_name, "Times-Roman");
    assert_eq!(glyph.font_size, 14.0);
}
637
/// Interpreting a page delivers exactly one path event: a two-segment
/// stroke with line width 0.5.
#[test]
fn mock_backend_interpret_page_emits_path() {
    let doc = MockBackend::open(&[1]).expect("non-empty input should open");
    let first = MockBackend::get_page(&doc, 0).expect("page 0 exists");
    let options = ExtractOptions::default();
    let mut sink = CollectingHandler::new();

    MockBackend::interpret_page(&doc, &first, &mut sink, &options)
        .expect("mock interpretation succeeds");

    assert_eq!(sink.paths.len(), 1);
    let stroke = &sink.paths[0];
    assert_eq!(stroke.paint_op, PaintOp::Stroke);
    assert_eq!(stroke.segments.len(), 2);
    assert_eq!(stroke.line_width, 0.5);
}
652
/// Interpreting a page delivers exactly one image event named "Im1" with
/// dimensions 400 x 300.
#[test]
fn mock_backend_interpret_page_emits_image() {
    let doc = MockBackend::open(&[1]).expect("non-empty input should open");
    let first = MockBackend::get_page(&doc, 0).expect("page 0 exists");
    let options = ExtractOptions::default();
    let mut sink = CollectingHandler::new();

    MockBackend::interpret_page(&doc, &first, &mut sink, &options)
        .expect("mock interpretation succeeds");

    assert_eq!(sink.images.len(), 1);
    let picture = &sink.images[0];
    assert_eq!(picture.name, "Im1");
    assert_eq!(picture.width, 400);
    assert_eq!(picture.height, 300);
}
667
/// `interpret_page` accepts an explicit `&mut dyn ContentHandler` and still
/// delivers one event of each kind to the underlying handler.
#[test]
fn mock_backend_interpret_page_uses_trait_object() {
    let doc = MockBackend::open(&[1]).expect("non-empty input should open");
    let first = MockBackend::get_page(&doc, 0).expect("page 0 exists");
    let options = ExtractOptions::default();
    let mut sink = CollectingHandler::new();

    {
        // Scope the trait-object borrow so `sink` can be inspected afterwards.
        let as_dyn: &mut dyn ContentHandler = &mut sink;
        MockBackend::interpret_page(&doc, &first, as_dyn, &options)
            .expect("mock interpretation succeeds");
    }

    let counts = (sink.chars.len(), sink.paths.len(), sink.images.len());
    assert_eq!(counts, (1, 1, 1));
}
683
/// The backend's error type converts into `PdfError` through `Into`.
#[test]
fn mock_backend_error_converts_to_pdf_error() {
    let failure = MockBackend::open(&[]).unwrap_err();
    let converted = Into::<PdfError>::into(failure);
    assert!(matches!(converted, PdfError::ParseError(_)));
}
694
/// The backend's error can be boxed as `dyn std::error::Error`, and its
/// display text includes the original message.
#[test]
fn mock_backend_error_is_std_error() {
    let failure = MockBackend::open(&[]).unwrap_err();
    let boxed: Box<dyn std::error::Error> = Box::new(failure);
    let rendered = boxed.to_string();
    assert!(rendered.contains("empty input"));
}
702
/// A hand-built document reports each page's own crop box and rotation,
/// and the first page's own media box.
#[test]
fn mock_backend_custom_page_properties() {
    // Page 0: has a crop box and a rotation of 90.
    let first_data = MockPageData {
        media_box: BBox::new(0.0, 0.0, 595.0, 842.0),
        crop_box: Some(BBox::new(10.0, 10.0, 585.0, 832.0)),
        trim_box: None,
        bleed_box: None,
        art_box: None,
        rotate: 90,
    };
    // Page 1: no crop box and no rotation.
    let second_data = MockPageData {
        media_box: BBox::new(0.0, 0.0, 842.0, 595.0),
        crop_box: None,
        trim_box: None,
        bleed_box: None,
        art_box: None,
        rotate: 0,
    };
    let doc = MockDocument {
        pages: vec![first_data, second_data],
    };

    let first = MockBackend::get_page(&doc, 0).expect("page 0 exists");
    assert_eq!(
        MockBackend::page_media_box(&doc, &first).expect("media box is available"),
        BBox::new(0.0, 0.0, 595.0, 842.0)
    );
    assert_eq!(
        MockBackend::page_crop_box(&doc, &first).expect("crop box lookup succeeds"),
        Some(BBox::new(10.0, 10.0, 585.0, 832.0))
    );
    assert_eq!(
        MockBackend::page_rotate(&doc, &first).expect("rotation lookup succeeds"),
        90
    );

    let second = MockBackend::get_page(&doc, 1).expect("page 1 exists");
    assert_eq!(
        MockBackend::page_crop_box(&doc, &second).expect("crop box lookup succeeds"),
        None
    );
    assert_eq!(
        MockBackend::page_rotate(&doc, &second).expect("rotation lookup succeeds"),
        0
    );
}
747}