1use crate::backend::PdfBackend;
7use crate::error::BackendError;
8use crate::handler::ContentHandler;
9use pdfplumber_core::{
10 Annotation, AnnotationType, BBox, Bookmark, DocumentMetadata, ExtractOptions, FieldType,
11 FormField, Hyperlink, ImageContent, RepairOptions, RepairResult, SignatureInfo, StructElement,
12 ValidationIssue,
13};
14
/// A PDF document loaded through `lopdf`, together with the object ids of its pages.
pub struct LopdfDocument {
    /// The parsed `lopdf` document.
    inner: lopdf::Document,
    /// Page object ids, collected from `lopdf::Document::get_pages` when the
    /// document is opened; indexed by zero-based page position.
    page_ids: Vec<lopdf::ObjectId>,
}
22
impl LopdfDocument {
    /// Returns a shared reference to the underlying `lopdf::Document`.
    pub fn inner(&self) -> &lopdf::Document {
        &self.inner
    }
}
29
30impl std::fmt::Debug for LopdfDocument {
31 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
32 f.debug_struct("LopdfDocument")
33 .field("page_count", &self.page_ids.len())
34 .finish_non_exhaustive()
35 }
36}
37
/// Lightweight, copyable handle identifying one page of a [`LopdfDocument`].
#[derive(Debug, Clone, Copy)]
pub struct LopdfPage {
    /// Object id of the page inside the document.
    pub object_id: lopdf::ObjectId,
    /// Zero-based position of the page in the document's page list.
    pub index: usize,
}
46
/// Marker type that implements [`PdfBackend`] on top of the `lopdf` crate.
pub struct LopdfBackend;
63
64fn extract_bbox_from_array(array: &[lopdf::Object]) -> Result<BBox, BackendError> {
66 if array.len() != 4 {
67 return Err(BackendError::Parse(format!(
68 "expected 4-element array for box, got {}",
69 array.len()
70 )));
71 }
72 let x0 = object_to_f64(&array[0])?;
73 let y0 = object_to_f64(&array[1])?;
74 let x1 = object_to_f64(&array[2])?;
75 let y1 = object_to_f64(&array[3])?;
76 Ok(BBox::new(x0, y0, x1, y1))
77}
78
79pub(crate) fn object_to_f64(obj: &lopdf::Object) -> Result<f64, BackendError> {
81 match obj {
82 lopdf::Object::Integer(i) => Ok(*i as f64),
83 lopdf::Object::Real(f) => Ok(*f as f64),
84 _ => Err(BackendError::Parse(format!("expected number, got {obj:?}"))),
85 }
86}
87
88fn resolve_inherited<'a>(
93 doc: &'a lopdf::Document,
94 page_id: lopdf::ObjectId,
95 key: &[u8],
96) -> Result<Option<&'a lopdf::Object>, BackendError> {
97 let mut current_id = page_id;
98 loop {
99 let dict = doc
100 .get_object(current_id)
101 .and_then(|o| o.as_dict())
102 .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
103
104 if let Ok(value) = dict.get(key) {
105 return Ok(Some(value));
106 }
107
108 match dict.get(b"Parent") {
110 Ok(parent_obj) => {
111 current_id = parent_obj
112 .as_reference()
113 .map_err(|e| BackendError::Parse(format!("invalid /Parent reference: {e}")))?;
114 }
115 Err(_) => return Ok(None),
116 }
117 }
118}
119
120impl PdfBackend for LopdfBackend {
121 type Document = LopdfDocument;
122 type Page = LopdfPage;
123 type Error = BackendError;
124
125 fn open(bytes: &[u8]) -> Result<Self::Document, Self::Error> {
126 let inner = lopdf::Document::load_mem(bytes)
127 .map_err(|e| BackendError::Parse(format!("failed to parse PDF: {e}")))?;
128
129 if inner.is_encrypted() {
131 return Err(BackendError::Core(
132 pdfplumber_core::PdfError::PasswordRequired,
133 ));
134 }
135
136 let pages_map = inner.get_pages();
138 let page_ids: Vec<lopdf::ObjectId> = pages_map.values().copied().collect();
139
140 Ok(LopdfDocument { inner, page_ids })
141 }
142
143 fn open_with_password(bytes: &[u8], password: &[u8]) -> Result<Self::Document, Self::Error> {
144 let mut inner = lopdf::Document::load_mem(bytes)
145 .map_err(|e| BackendError::Parse(format!("failed to parse PDF: {e}")))?;
146
147 if inner.is_encrypted() {
149 inner.decrypt(password).map_err(|e| {
150 let msg = e.to_string();
151 if msg.contains("incorrect") || msg.contains("password") {
152 BackendError::Core(pdfplumber_core::PdfError::InvalidPassword)
153 } else {
154 BackendError::Parse(format!("decryption failed: {e}"))
155 }
156 })?;
157 }
158
159 let pages_map = inner.get_pages();
161 let page_ids: Vec<lopdf::ObjectId> = pages_map.values().copied().collect();
162
163 Ok(LopdfDocument { inner, page_ids })
164 }
165
    /// Returns the number of page ids recorded when the document was opened.
    fn page_count(doc: &Self::Document) -> usize {
        doc.page_ids.len()
    }
169
170 fn get_page(doc: &Self::Document, index: usize) -> Result<Self::Page, Self::Error> {
171 if index >= doc.page_ids.len() {
172 return Err(BackendError::Parse(format!(
173 "page index {index} out of range (0..{})",
174 doc.page_ids.len()
175 )));
176 }
177 Ok(LopdfPage {
178 object_id: doc.page_ids[index],
179 index,
180 })
181 }
182
183 fn page_media_box(doc: &Self::Document, page: &Self::Page) -> Result<BBox, Self::Error> {
184 let obj = resolve_inherited(&doc.inner, page.object_id, b"MediaBox")?
185 .ok_or_else(|| BackendError::Parse("MediaBox not found on page or ancestors".into()))?;
186 let array = obj
187 .as_array()
188 .map_err(|e| BackendError::Parse(format!("MediaBox is not an array: {e}")))?;
189 extract_bbox_from_array(array)
190 }
191
192 fn page_crop_box(doc: &Self::Document, page: &Self::Page) -> Result<Option<BBox>, Self::Error> {
193 let dict = doc
195 .inner
196 .get_object(page.object_id)
197 .and_then(|o| o.as_dict())
198 .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
199
200 match dict.get(b"CropBox") {
201 Ok(obj) => {
202 let array = obj
203 .as_array()
204 .map_err(|e| BackendError::Parse(format!("CropBox is not an array: {e}")))?;
205 Ok(Some(extract_bbox_from_array(array)?))
206 }
207 Err(_) => Ok(None),
208 }
209 }
210
211 fn page_trim_box(doc: &Self::Document, page: &Self::Page) -> Result<Option<BBox>, Self::Error> {
212 match resolve_inherited(&doc.inner, page.object_id, b"TrimBox")? {
213 Some(obj) => {
214 let array = obj
215 .as_array()
216 .map_err(|e| BackendError::Parse(format!("TrimBox is not an array: {e}")))?;
217 Ok(Some(extract_bbox_from_array(array)?))
218 }
219 None => Ok(None),
220 }
221 }
222
223 fn page_bleed_box(
224 doc: &Self::Document,
225 page: &Self::Page,
226 ) -> Result<Option<BBox>, Self::Error> {
227 match resolve_inherited(&doc.inner, page.object_id, b"BleedBox")? {
228 Some(obj) => {
229 let array = obj
230 .as_array()
231 .map_err(|e| BackendError::Parse(format!("BleedBox is not an array: {e}")))?;
232 Ok(Some(extract_bbox_from_array(array)?))
233 }
234 None => Ok(None),
235 }
236 }
237
238 fn page_art_box(doc: &Self::Document, page: &Self::Page) -> Result<Option<BBox>, Self::Error> {
239 match resolve_inherited(&doc.inner, page.object_id, b"ArtBox")? {
240 Some(obj) => {
241 let array = obj
242 .as_array()
243 .map_err(|e| BackendError::Parse(format!("ArtBox is not an array: {e}")))?;
244 Ok(Some(extract_bbox_from_array(array)?))
245 }
246 None => Ok(None),
247 }
248 }
249
250 fn page_rotate(doc: &Self::Document, page: &Self::Page) -> Result<i32, Self::Error> {
251 match resolve_inherited(&doc.inner, page.object_id, b"Rotate")? {
252 Some(obj) => {
253 let rotation = obj
254 .as_i64()
255 .map_err(|e| BackendError::Parse(format!("Rotate is not an integer: {e}")))?;
256 Ok(rotation as i32)
257 }
258 None => Ok(0), }
260 }
261
    /// Returns document metadata by delegating to `extract_document_metadata`
    /// on the underlying `lopdf` document.
    fn document_metadata(doc: &Self::Document) -> Result<DocumentMetadata, Self::Error> {
        extract_document_metadata(&doc.inner)
    }
265
    /// Returns the document's bookmarks by delegating to
    /// `extract_document_bookmarks` on the underlying `lopdf` document.
    fn document_bookmarks(doc: &Self::Document) -> Result<Vec<Bookmark>, Self::Error> {
        extract_document_bookmarks(&doc.inner)
    }
269
    /// Returns the document's form fields by delegating to
    /// `extract_document_form_fields` on the underlying `lopdf` document.
    fn document_form_fields(doc: &Self::Document) -> Result<Vec<FormField>, Self::Error> {
        extract_document_form_fields(&doc.inner)
    }
273
    /// Returns signature information by delegating to
    /// `extract_document_signatures` on the underlying `lopdf` document.
    fn document_signatures(doc: &Self::Document) -> Result<Vec<SignatureInfo>, Self::Error> {
        extract_document_signatures(&doc.inner)
    }
277
    /// Returns the document's structure elements by delegating to
    /// `extract_document_structure_tree` on the underlying `lopdf` document.
    fn document_structure_tree(doc: &Self::Document) -> Result<Vec<StructElement>, Self::Error> {
        extract_document_structure_tree(&doc.inner)
    }
281
    /// Returns the annotations of `page` by delegating to
    /// `extract_page_annotations` with the page's object id.
    fn page_annotations(
        doc: &Self::Document,
        page: &Self::Page,
    ) -> Result<Vec<Annotation>, Self::Error> {
        extract_page_annotations(&doc.inner, page.object_id)
    }
288
    /// Returns the hyperlinks of `page` by delegating to
    /// `extract_page_hyperlinks` with the page's object id.
    fn page_hyperlinks(
        doc: &Self::Document,
        page: &Self::Page,
    ) -> Result<Vec<Hyperlink>, Self::Error> {
        extract_page_hyperlinks(&doc.inner, page.object_id)
    }
295
296 fn interpret_page(
297 doc: &Self::Document,
298 page: &Self::Page,
299 handler: &mut dyn ContentHandler,
300 options: &ExtractOptions,
301 ) -> Result<(), Self::Error> {
302 let inner = &doc.inner;
303
304 let page_dict = inner
306 .get_object(page.object_id)
307 .and_then(|o| o.as_dict())
308 .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
309
310 let content_bytes = get_page_content_bytes(inner, page_dict)?;
312
313 let resources = get_page_resources(inner, page.object_id)?;
315
316 let mut gstate = crate::interpreter_state::InterpreterState::new();
318 let mut tstate = crate::text_state::TextState::new();
319
320 crate::interpreter::interpret_content_stream(
322 inner,
323 &content_bytes,
324 resources,
325 handler,
326 options,
327 0, &mut gstate,
329 &mut tstate,
330 )
331 }
332
333 fn extract_image_content(
334 doc: &Self::Document,
335 page: &Self::Page,
336 image_name: &str,
337 ) -> Result<ImageContent, Self::Error> {
338 use pdfplumber_core::ImageFormat;
339
340 let inner = &doc.inner;
341
342 let resources = get_page_resources(inner, page.object_id)?;
344
345 let xobj_dict = resources.get(b"XObject").map_err(|_| {
347 BackendError::Parse(format!(
348 "no /XObject dictionary in page resources for image /{image_name}"
349 ))
350 })?;
351 let xobj_dict = resolve_ref(inner, xobj_dict);
352 let xobj_dict = xobj_dict.as_dict().map_err(|_| {
353 BackendError::Parse("/XObject resource is not a dictionary".to_string())
354 })?;
355
356 let xobj_entry = xobj_dict.get(image_name.as_bytes()).map_err(|_| {
357 BackendError::Parse(format!(
358 "image XObject /{image_name} not found in resources"
359 ))
360 })?;
361
362 let xobj_id = xobj_entry.as_reference().map_err(|_| {
363 BackendError::Parse(format!(
364 "image XObject /{image_name} is not an indirect reference"
365 ))
366 })?;
367
368 let xobj = inner.get_object(xobj_id).map_err(|e| {
369 BackendError::Parse(format!(
370 "failed to resolve image XObject /{image_name}: {e}"
371 ))
372 })?;
373
374 let stream = xobj.as_stream().map_err(|e| {
375 BackendError::Parse(format!("image XObject /{image_name} is not a stream: {e}"))
376 })?;
377
378 let subtype = stream
380 .dict
381 .get(b"Subtype")
382 .ok()
383 .and_then(|o| o.as_name_str().ok())
384 .unwrap_or("");
385 if subtype != "Image" {
386 return Err(BackendError::Parse(format!(
387 "XObject /{image_name} is not an Image (subtype: {subtype})"
388 )));
389 }
390
391 let width = stream
392 .dict
393 .get(b"Width")
394 .ok()
395 .and_then(|o| o.as_i64().ok())
396 .unwrap_or(0) as u32;
397
398 let height = stream
399 .dict
400 .get(b"Height")
401 .ok()
402 .and_then(|o| o.as_i64().ok())
403 .unwrap_or(0) as u32;
404
405 let filter = stream
407 .dict
408 .get(b"Filter")
409 .ok()
410 .and_then(|o| {
411 if let Ok(name) = o.as_name_str() {
413 Some(vec![name.to_string()])
414 } else if let Ok(arr) = o.as_array() {
415 Some(
416 arr.iter()
417 .filter_map(|item| {
418 let resolved = resolve_ref(inner, item);
419 resolved.as_name_str().ok().map(|s| s.to_string())
420 })
421 .collect(),
422 )
423 } else {
424 None
425 }
426 })
427 .unwrap_or_default();
428
429 let format = if filter.is_empty() {
431 ImageFormat::Raw
432 } else {
433 match filter.last().map(|s| s.as_str()) {
434 Some("DCTDecode") => ImageFormat::Jpeg,
435 Some("JBIG2Decode") => ImageFormat::Jbig2,
436 Some("CCITTFaxDecode") => ImageFormat::CcittFax,
437 _ => ImageFormat::Raw,
438 }
439 };
440
441 let data = match format {
443 ImageFormat::Jpeg => {
444 if filter.len() == 1 {
447 stream.content.clone()
449 } else {
450 stream.decompressed_content().map_err(|e| {
452 BackendError::Parse(format!(
453 "failed to decompress image /{image_name}: {e}"
454 ))
455 })?
456 }
457 }
458 ImageFormat::Jbig2 | ImageFormat::CcittFax => {
459 stream.content.clone()
461 }
462 ImageFormat::Raw | ImageFormat::Png => {
463 if filter.is_empty() {
465 stream.content.clone()
466 } else {
467 stream.decompressed_content().map_err(|e| {
468 BackendError::Parse(format!(
469 "failed to decompress image /{image_name}: {e}"
470 ))
471 })?
472 }
473 }
474 };
475
476 Ok(ImageContent {
477 data,
478 format,
479 width,
480 height,
481 })
482 }
483
    /// Validates the document by delegating to `validate_document`.
    fn validate(doc: &Self::Document) -> Result<Vec<ValidationIssue>, Self::Error> {
        validate_document(doc)
    }
487
    /// Repairs the PDF in `bytes` by delegating to `repair_document`, returning
    /// the rewritten bytes together with the repair result.
    fn repair(
        bytes: &[u8],
        options: &RepairOptions,
    ) -> Result<(Vec<u8>, RepairResult), Self::Error> {
        repair_document(bytes, options)
    }
494}
495
496fn validate_document(doc: &LopdfDocument) -> Result<Vec<ValidationIssue>, BackendError> {
498 use pdfplumber_core::{Severity, ValidationIssue};
499
500 let inner = &doc.inner;
501 let mut issues = Vec::new();
502
503 let catalog_location = get_catalog_location(inner);
505 let catalog_dict = get_catalog_dict(inner);
506
507 if let Some(dict) = catalog_dict {
508 match dict.get(b"Type") {
509 Ok(type_obj) => {
510 if let Ok(name) = type_obj.as_name_str() {
511 if name != "Catalog" {
512 issues.push(ValidationIssue::with_location(
513 Severity::Warning,
514 "WRONG_CATALOG_TYPE",
515 format!("catalog /Type is '{name}' instead of 'Catalog'"),
516 &catalog_location,
517 ));
518 }
519 }
520 }
521 Err(_) => {
522 issues.push(ValidationIssue::with_location(
523 Severity::Warning,
524 "MISSING_TYPE",
525 "catalog dictionary missing /Type key",
526 &catalog_location,
527 ));
528 }
529 }
530
531 if dict.get(b"Pages").is_err() {
533 issues.push(ValidationIssue::with_location(
534 Severity::Error,
535 "MISSING_PAGES",
536 "catalog dictionary missing /Pages key",
537 &catalog_location,
538 ));
539 }
540 }
541
542 for (page_idx, &page_id) in doc.page_ids.iter().enumerate() {
544 let page_num = page_idx + 1;
545 let location = format!("page {page_num} (object {} {})", page_id.0, page_id.1);
546
547 match inner.get_object(page_id) {
548 Ok(obj) => {
549 if let Ok(dict) = obj.as_dict() {
550 match dict.get(b"Type") {
552 Ok(type_obj) => {
553 if let Ok(name) = type_obj.as_name_str() {
554 if name != "Page" {
555 issues.push(ValidationIssue::with_location(
556 Severity::Warning,
557 "WRONG_PAGE_TYPE",
558 format!("page /Type is '{name}' instead of 'Page'"),
559 &location,
560 ));
561 }
562 }
563 }
564 Err(_) => {
565 issues.push(ValidationIssue::with_location(
566 Severity::Warning,
567 "MISSING_TYPE",
568 "page dictionary missing /Type key",
569 &location,
570 ));
571 }
572 }
573
574 if resolve_inherited(inner, page_id, b"MediaBox")
576 .ok()
577 .flatten()
578 .is_none()
579 {
580 issues.push(ValidationIssue::with_location(
581 Severity::Error,
582 "MISSING_MEDIABOX",
583 "page has no /MediaBox (not on page or ancestors)",
584 &location,
585 ));
586 }
587
588 check_page_fonts(inner, page_id, dict, &location, &mut issues);
590 } else {
591 issues.push(ValidationIssue::with_location(
592 Severity::Error,
593 "INVALID_PAGE",
594 "page object is not a dictionary",
595 &location,
596 ));
597 }
598 }
599 Err(_) => {
600 issues.push(ValidationIssue::with_location(
601 Severity::Error,
602 "BROKEN_REF",
603 format!("page object {} {} not found", page_id.0, page_id.1),
604 &location,
605 ));
606 }
607 }
608 }
609
610 check_broken_references(inner, &mut issues);
612
613 Ok(issues)
614}
615
616fn get_catalog_dict(doc: &lopdf::Document) -> Option<&lopdf::Dictionary> {
618 let root_obj = doc.trailer.get(b"Root").ok()?;
619 match root_obj {
620 lopdf::Object::Reference(id) => {
621 let obj = doc.get_object(*id).ok()?;
622 obj.as_dict().ok()
623 }
624 lopdf::Object::Dictionary(dict) => Some(dict),
625 _ => None,
626 }
627}
628
629fn get_catalog_location(doc: &lopdf::Document) -> String {
631 if let Ok(lopdf::Object::Reference(id)) = doc.trailer.get(b"Root") {
632 return format!("object {} {}", id.0, id.1);
633 }
634 "catalog".to_string()
635}
636
637fn check_page_fonts(
639 doc: &lopdf::Document,
640 page_id: lopdf::ObjectId,
641 page_dict: &lopdf::Dictionary,
642 location: &str,
643 issues: &mut Vec<pdfplumber_core::ValidationIssue>,
644) {
645 use pdfplumber_core::{Severity, ValidationIssue};
646
647 let font_names = get_resource_font_names(doc, page_id, page_dict);
649
650 let content_fonts = get_content_stream_font_refs(doc, page_dict);
652
653 for font_ref in &content_fonts {
655 if !font_names.contains(font_ref) {
656 issues.push(ValidationIssue::with_location(
657 Severity::Warning,
658 "MISSING_FONT",
659 format!("font /{font_ref} referenced in content stream but not in resources"),
660 location,
661 ));
662 }
663 }
664}
665
666fn get_resource_font_names(
668 doc: &lopdf::Document,
669 page_id: lopdf::ObjectId,
670 page_dict: &lopdf::Dictionary,
671) -> Vec<String> {
672 let mut names = Vec::new();
673
674 let resources = if let Ok(res_obj) = page_dict.get(b"Resources") {
676 let resolved = resolve_ref(doc, res_obj);
677 resolved.as_dict().ok()
678 } else {
679 resolve_inherited(doc, page_id, b"Resources")
681 .ok()
682 .flatten()
683 .and_then(|obj| obj.as_dict().ok())
684 };
685
686 if let Some(resources_dict) = resources {
687 if let Ok(font_obj) = resources_dict.get(b"Font") {
688 let font_obj = resolve_ref(doc, font_obj);
689 if let Ok(font_dict) = font_obj.as_dict() {
690 for (key, _) in font_dict.iter() {
691 if let Ok(name) = std::str::from_utf8(key) {
692 names.push(name.to_string());
693 }
694 }
695 }
696 }
697 }
698
699 names
700}
701
702fn get_content_stream_font_refs(
704 doc: &lopdf::Document,
705 page_dict: &lopdf::Dictionary,
706) -> Vec<String> {
707 let mut font_refs = Vec::new();
708
709 let content_bytes = match get_content_stream_bytes(doc, page_dict) {
710 Some(bytes) => bytes,
711 None => return font_refs,
712 };
713
714 let content = String::from_utf8_lossy(&content_bytes);
716 let tokens: Vec<&str> = content.split_whitespace().collect();
717
718 for (i, token) in tokens.iter().enumerate() {
719 if *token == "Tf" && i >= 2 {
720 let font_name_token = tokens[i - 2];
721 if let Some(name) = font_name_token.strip_prefix('/') {
722 if !font_refs.contains(&name.to_string()) {
723 font_refs.push(name.to_string());
724 }
725 }
726 }
727 }
728
729 font_refs
730}
731
732fn stream_bytes(stream: &lopdf::Stream) -> Option<Vec<u8>> {
734 stream
735 .decompressed_content()
736 .ok()
737 .or_else(|| Some(stream.content.clone()))
738 .filter(|b| !b.is_empty())
739}
740
741fn get_content_stream_bytes(
743 doc: &lopdf::Document,
744 page_dict: &lopdf::Dictionary,
745) -> Option<Vec<u8>> {
746 let contents_obj = page_dict.get(b"Contents").ok()?;
747
748 match contents_obj {
749 lopdf::Object::Reference(id) => {
750 let obj = doc.get_object(*id).ok()?;
751 if let Ok(stream) = obj.as_stream() {
752 stream_bytes(stream)
753 } else {
754 None
755 }
756 }
757 lopdf::Object::Array(arr) => {
758 let mut all_bytes = Vec::new();
759 for item in arr {
760 let resolved = resolve_ref(doc, item);
761 if let Ok(stream) = resolved.as_stream() {
762 if let Some(bytes) = stream_bytes(stream) {
763 all_bytes.extend_from_slice(&bytes);
764 all_bytes.push(b' ');
765 }
766 }
767 }
768 if all_bytes.is_empty() {
769 None
770 } else {
771 Some(all_bytes)
772 }
773 }
774 _ => None,
775 }
776}
777
778fn check_broken_references(
780 doc: &lopdf::Document,
781 issues: &mut Vec<pdfplumber_core::ValidationIssue>,
782) {
783 use pdfplumber_core::{Severity, ValidationIssue};
784
785 for (&obj_id, obj) in &doc.objects {
787 check_references_in_object(doc, obj, obj_id, issues);
788 }
789
790 fn check_references_in_object(
791 doc: &lopdf::Document,
792 obj: &lopdf::Object,
793 source_id: lopdf::ObjectId,
794 issues: &mut Vec<ValidationIssue>,
795 ) {
796 match obj {
797 lopdf::Object::Reference(ref_id) => {
798 if doc.get_object(*ref_id).is_err() {
799 issues.push(ValidationIssue::with_location(
800 Severity::Warning,
801 "BROKEN_REF",
802 format!(
803 "reference to object {} {} which does not exist",
804 ref_id.0, ref_id.1
805 ),
806 format!("object {} {}", source_id.0, source_id.1),
807 ));
808 }
809 }
810 lopdf::Object::Array(arr) => {
811 for item in arr {
812 check_references_in_object(doc, item, source_id, issues);
813 }
814 }
815 lopdf::Object::Dictionary(dict) => {
816 for (_, value) in dict.iter() {
817 check_references_in_object(doc, value, source_id, issues);
818 }
819 }
820 lopdf::Object::Stream(stream) => {
821 for (_, value) in stream.dict.iter() {
822 check_references_in_object(doc, value, source_id, issues);
823 }
824 }
825 _ => {}
826 }
827 }
828}
829
830fn resolve_ref<'a>(doc: &'a lopdf::Document, obj: &'a lopdf::Object) -> &'a lopdf::Object {
835 match obj {
836 lopdf::Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
837 _ => obj,
838 }
839}
840
841fn repair_document(
843 bytes: &[u8],
844 options: &RepairOptions,
845) -> Result<(Vec<u8>, RepairResult), BackendError> {
846 let mut doc = lopdf::Document::load_mem(bytes)
847 .map_err(|e| BackendError::Parse(format!("failed to parse PDF for repair: {e}")))?;
848
849 let mut result = RepairResult::new();
850
851 if options.fix_stream_lengths {
852 repair_stream_lengths(&mut doc, &mut result);
853 }
854
855 if options.remove_broken_objects {
856 repair_broken_references(&mut doc, &mut result);
857 }
858
859 if options.rebuild_xref {
862 }
865
866 let mut buf = Vec::new();
867 doc.save_to(&mut buf)
868 .map_err(|e| BackendError::Parse(format!("failed to save repaired PDF: {e}")))?;
869
870 Ok((buf, result))
871}
872
873fn repair_stream_lengths(doc: &mut lopdf::Document, result: &mut RepairResult) {
875 let obj_ids: Vec<lopdf::ObjectId> = doc.objects.keys().copied().collect();
876
877 for obj_id in obj_ids {
878 let needs_fix = if let Some(lopdf::Object::Stream(stream)) = doc.objects.get(&obj_id) {
879 let actual_len = stream.content.len() as i64;
880 match stream.dict.get(b"Length") {
881 Ok(lopdf::Object::Integer(stored_len)) => *stored_len != actual_len,
882 Ok(lopdf::Object::Reference(_)) => {
883 false
885 }
886 _ => true, }
888 } else {
889 false
890 };
891
892 if needs_fix {
893 if let Some(lopdf::Object::Stream(stream)) = doc.objects.get_mut(&obj_id) {
894 let actual_len = stream.content.len() as i64;
895 let old_len = stream.dict.get(b"Length").ok().and_then(|o| {
896 if let lopdf::Object::Integer(v) = o {
897 Some(*v)
898 } else {
899 None
900 }
901 });
902 stream
903 .dict
904 .set("Length", lopdf::Object::Integer(actual_len));
905 match old_len {
906 Some(old) => {
907 result.log.push(format!(
908 "fixed stream length for object {} {}: {} -> {}",
909 obj_id.0, obj_id.1, old, actual_len
910 ));
911 }
912 None => {
913 result.log.push(format!(
914 "added missing stream length for object {} {}: {}",
915 obj_id.0, obj_id.1, actual_len
916 ));
917 }
918 }
919 }
920 }
921 }
922}
923
924fn repair_broken_references(doc: &mut lopdf::Document, result: &mut RepairResult) {
926 let obj_ids: Vec<lopdf::ObjectId> = doc.objects.keys().copied().collect();
927 let existing_ids: std::collections::BTreeSet<lopdf::ObjectId> =
928 doc.objects.keys().copied().collect();
929
930 for obj_id in obj_ids {
931 if let Some(obj) = doc.objects.remove(&obj_id) {
932 let fixed = fix_references_in_object(obj, &existing_ids, obj_id, result);
933 doc.objects.insert(obj_id, fixed);
934 }
935 }
936}
937
938fn fix_references_in_object(
940 obj: lopdf::Object,
941 existing_ids: &std::collections::BTreeSet<lopdf::ObjectId>,
942 source_id: lopdf::ObjectId,
943 result: &mut RepairResult,
944) -> lopdf::Object {
945 match obj {
946 lopdf::Object::Reference(ref_id) => {
947 if existing_ids.contains(&ref_id) {
948 obj
949 } else {
950 result.log.push(format!(
951 "removed broken reference to object {} {} (in object {} {})",
952 ref_id.0, ref_id.1, source_id.0, source_id.1
953 ));
954 lopdf::Object::Null
955 }
956 }
957 lopdf::Object::Array(arr) => {
958 let fixed: Vec<lopdf::Object> = arr
959 .into_iter()
960 .map(|item| fix_references_in_object(item, existing_ids, source_id, result))
961 .collect();
962 lopdf::Object::Array(fixed)
963 }
964 lopdf::Object::Dictionary(dict) => {
965 let mut new_dict = lopdf::Dictionary::new();
966 for (key, value) in dict.into_iter() {
967 let fixed = fix_references_in_object(value, existing_ids, source_id, result);
968 new_dict.set(key, fixed);
969 }
970 lopdf::Object::Dictionary(new_dict)
971 }
972 lopdf::Object::Stream(mut stream) => {
973 let mut new_dict = lopdf::Dictionary::new();
974 for (key, value) in stream.dict.into_iter() {
975 let fixed = fix_references_in_object(value, existing_ids, source_id, result);
976 new_dict.set(key, fixed);
977 }
978 stream.dict = new_dict;
979 lopdf::Object::Stream(stream)
980 }
981 other => other,
982 }
983}
984
985fn get_page_content_bytes(
989 doc: &lopdf::Document,
990 page_dict: &lopdf::Dictionary,
991) -> Result<Vec<u8>, BackendError> {
992 let contents_obj = match page_dict.get(b"Contents") {
993 Ok(obj) => obj,
994 Err(_) => return Ok(Vec::new()), };
996
997 match contents_obj {
998 lopdf::Object::Reference(id) => {
999 let obj = doc
1000 .get_object(*id)
1001 .map_err(|e| BackendError::Parse(format!("failed to resolve /Contents: {e}")))?;
1002 let stream = obj
1003 .as_stream()
1004 .map_err(|e| BackendError::Parse(format!("/Contents is not a stream: {e}")))?;
1005 decode_content_stream(stream)
1006 }
1007 lopdf::Object::Array(arr) => {
1008 let mut content = Vec::new();
1009 for item in arr {
1010 let id = item.as_reference().map_err(|e| {
1011 BackendError::Parse(format!("/Contents array item is not a reference: {e}"))
1012 })?;
1013 let obj = doc.get_object(id).map_err(|e| {
1014 BackendError::Parse(format!("failed to resolve /Contents stream: {e}"))
1015 })?;
1016 let stream = obj.as_stream().map_err(|e| {
1017 BackendError::Parse(format!("/Contents array item is not a stream: {e}"))
1018 })?;
1019 let bytes = decode_content_stream(stream)?;
1020 if !content.is_empty() {
1021 content.push(b' ');
1022 }
1023 content.extend_from_slice(&bytes);
1024 }
1025 Ok(content)
1026 }
1027 _ => Err(BackendError::Parse(
1028 "/Contents is not a reference or array".to_string(),
1029 )),
1030 }
1031}
1032
1033fn decode_content_stream(stream: &lopdf::Stream) -> Result<Vec<u8>, BackendError> {
1035 if stream.dict.get(b"Filter").is_ok() {
1036 stream
1037 .decompressed_content()
1038 .map_err(|e| BackendError::Parse(format!("failed to decompress content stream: {e}")))
1039 } else {
1040 Ok(stream.content.clone())
1041 }
1042}
1043
1044fn get_page_resources(
1046 doc: &lopdf::Document,
1047 page_id: lopdf::ObjectId,
1048) -> Result<&lopdf::Dictionary, BackendError> {
1049 match resolve_inherited(doc, page_id, b"Resources")? {
1050 Some(obj) => {
1051 let obj = match obj {
1053 lopdf::Object::Reference(id) => doc.get_object(*id).map_err(|e| {
1054 BackendError::Parse(format!("failed to resolve /Resources reference: {e}"))
1055 })?,
1056 other => other,
1057 };
1058 obj.as_dict()
1059 .map_err(|_| BackendError::Parse("/Resources is not a dictionary".to_string()))
1060 }
1061 None => {
1062 static EMPTY_DICT: std::sync::LazyLock<lopdf::Dictionary> =
1065 std::sync::LazyLock::new(lopdf::Dictionary::new);
1066 Ok(&EMPTY_DICT)
1067 }
1068 }
1069}
1070
1071fn extract_string_from_dict(
1073 doc: &lopdf::Document,
1074 dict: &lopdf::Dictionary,
1075 key: &[u8],
1076) -> Option<String> {
1077 let obj = dict.get(key).ok()?;
1078 let obj = match obj {
1080 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1081 other => other,
1082 };
1083 match obj {
1084 lopdf::Object::String(bytes, _) => {
1085 if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1087 let chars: Vec<u16> = bytes[2..]
1088 .chunks(2)
1089 .filter_map(|c| {
1090 if c.len() == 2 {
1091 Some(u16::from_be_bytes([c[0], c[1]]))
1092 } else {
1093 None
1094 }
1095 })
1096 .collect();
1097 String::from_utf16(&chars).ok()
1098 } else {
1099 match std::str::from_utf8(bytes) {
1101 Ok(s) => Some(s.to_string()),
1102 Err(_) => Some(bytes.iter().map(|&b| b as char).collect()),
1103 }
1104 }
1105 }
1106 lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
1107 _ => None,
1108 }
1109}
1110
1111fn extract_document_metadata(doc: &lopdf::Document) -> Result<DocumentMetadata, BackendError> {
1113 let info_ref = match doc.trailer.get(b"Info") {
1115 Ok(obj) => obj,
1116 Err(_) => return Ok(DocumentMetadata::default()),
1117 };
1118
1119 let info_dict = match info_ref {
1120 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1121 Ok(obj) => match obj.as_dict() {
1122 Ok(dict) => dict,
1123 Err(_) => return Ok(DocumentMetadata::default()),
1124 },
1125 Err(_) => return Ok(DocumentMetadata::default()),
1126 },
1127 lopdf::Object::Dictionary(dict) => dict,
1128 _ => return Ok(DocumentMetadata::default()),
1129 };
1130
1131 Ok(DocumentMetadata {
1132 title: extract_string_from_dict(doc, info_dict, b"Title"),
1133 author: extract_string_from_dict(doc, info_dict, b"Author"),
1134 subject: extract_string_from_dict(doc, info_dict, b"Subject"),
1135 keywords: extract_string_from_dict(doc, info_dict, b"Keywords"),
1136 creator: extract_string_from_dict(doc, info_dict, b"Creator"),
1137 producer: extract_string_from_dict(doc, info_dict, b"Producer"),
1138 creation_date: extract_string_from_dict(doc, info_dict, b"CreationDate"),
1139 mod_date: extract_string_from_dict(doc, info_dict, b"ModDate"),
1140 })
1141}
1142
1143fn extract_document_bookmarks(doc: &lopdf::Document) -> Result<Vec<Bookmark>, BackendError> {
1148 let catalog_ref = match doc.trailer.get(b"Root") {
1150 Ok(obj) => obj,
1151 Err(_) => return Ok(Vec::new()),
1152 };
1153
1154 let catalog = match catalog_ref {
1155 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1156 Ok(obj) => match obj.as_dict() {
1157 Ok(dict) => dict,
1158 Err(_) => return Ok(Vec::new()),
1159 },
1160 Err(_) => return Ok(Vec::new()),
1161 },
1162 lopdf::Object::Dictionary(dict) => dict,
1163 _ => return Ok(Vec::new()),
1164 };
1165
1166 let outlines_obj = match catalog.get(b"Outlines") {
1168 Ok(obj) => obj,
1169 Err(_) => return Ok(Vec::new()),
1170 };
1171
1172 let outlines_obj = match outlines_obj {
1173 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1174 Ok(obj) => obj,
1175 Err(_) => return Ok(Vec::new()),
1176 },
1177 other => other,
1178 };
1179
1180 let outlines_dict = match outlines_obj.as_dict() {
1181 Ok(dict) => dict,
1182 Err(_) => return Ok(Vec::new()),
1183 };
1184
1185 let first_ref = match outlines_dict.get(b"First") {
1187 Ok(lopdf::Object::Reference(id)) => *id,
1188 _ => return Ok(Vec::new()),
1189 };
1190
1191 let pages_map = doc.get_pages();
1193
1194 let mut bookmarks = Vec::new();
1195 let max_depth = 64; walk_outline_tree(doc, first_ref, 0, max_depth, &pages_map, &mut bookmarks);
1197
1198 Ok(bookmarks)
1199}
1200
1201fn walk_outline_tree(
1203 doc: &lopdf::Document,
1204 item_id: lopdf::ObjectId,
1205 level: usize,
1206 max_depth: usize,
1207 pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1208 bookmarks: &mut Vec<Bookmark>,
1209) {
1210 if level >= max_depth {
1211 return;
1212 }
1213
1214 let mut current_id = Some(item_id);
1215 let mut visited = std::collections::HashSet::new();
1216 let max_siblings = 10_000; let mut sibling_count = 0;
1218
1219 while let Some(node_id) = current_id {
1220 if !visited.insert(node_id) || sibling_count >= max_siblings {
1222 break;
1223 }
1224 sibling_count += 1;
1225
1226 let node_obj = match doc.get_object(node_id) {
1227 Ok(obj) => obj,
1228 Err(_) => break,
1229 };
1230
1231 let node_dict = match node_obj.as_dict() {
1232 Ok(dict) => dict,
1233 Err(_) => break,
1234 };
1235
1236 let title = extract_string_from_dict(doc, node_dict, b"Title").unwrap_or_default();
1238
1239 let (page_number, dest_top) = resolve_bookmark_dest(doc, node_dict, pages_map);
1241
1242 bookmarks.push(Bookmark {
1243 title,
1244 level,
1245 page_number,
1246 dest_top,
1247 });
1248
1249 if let Ok(lopdf::Object::Reference(child_id)) = node_dict.get(b"First") {
1251 walk_outline_tree(doc, *child_id, level + 1, max_depth, pages_map, bookmarks);
1252 }
1253
1254 current_id = match node_dict.get(b"Next") {
1256 Ok(lopdf::Object::Reference(next_id)) => Some(*next_id),
1257 _ => None,
1258 };
1259 }
1260}
1261
1262fn resolve_bookmark_dest(
1266 doc: &lopdf::Document,
1267 node_dict: &lopdf::Dictionary,
1268 pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1269) -> (Option<usize>, Option<f64>) {
1270 if let Ok(dest_obj) = node_dict.get(b"Dest") {
1272 if let Some(result) = resolve_dest_to_page(doc, dest_obj, pages_map) {
1273 return result;
1274 }
1275 }
1276
1277 if let Ok(action_obj) = node_dict.get(b"A") {
1279 let action_obj = match action_obj {
1280 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1281 Ok(obj) => obj,
1282 Err(_) => return (None, None),
1283 },
1284 other => other,
1285 };
1286 if let Ok(action_dict) = action_obj.as_dict() {
1287 if let Ok(lopdf::Object::Name(action_type)) = action_dict.get(b"S") {
1288 if String::from_utf8_lossy(action_type) == "GoTo" {
1289 if let Ok(dest_obj) = action_dict.get(b"D") {
1290 if let Some(result) = resolve_dest_to_page(doc, dest_obj, pages_map) {
1291 return result;
1292 }
1293 }
1294 }
1295 }
1296 }
1297 }
1298
1299 (None, None)
1300}
1301
1302fn resolve_dest_to_page(
1306 doc: &lopdf::Document,
1307 dest_obj: &lopdf::Object,
1308 pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1309) -> Option<(Option<usize>, Option<f64>)> {
1310 let dest_obj = match dest_obj {
1311 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1312 other => other,
1313 };
1314
1315 match dest_obj {
1316 lopdf::Object::Array(arr) => {
1318 if arr.is_empty() {
1319 return None;
1320 }
1321 if let lopdf::Object::Reference(page_ref) = &arr[0] {
1323 let page_number = pages_map.iter().find_map(|(&page_num, &page_id)| {
1325 if page_id == *page_ref {
1326 Some((page_num - 1) as usize) } else {
1328 None
1329 }
1330 });
1331
1332 let dest_top = extract_dest_top(arr);
1334
1335 return Some((page_number, dest_top));
1336 }
1337 None
1338 }
1339 lopdf::Object::String(bytes, _) => {
1341 let name = if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1342 let chars: Vec<u16> = bytes[2..]
1343 .chunks(2)
1344 .filter_map(|c| {
1345 if c.len() == 2 {
1346 Some(u16::from_be_bytes([c[0], c[1]]))
1347 } else {
1348 None
1349 }
1350 })
1351 .collect();
1352 String::from_utf16(&chars).ok()?
1353 } else {
1354 match std::str::from_utf8(bytes) {
1355 Ok(s) => s.to_string(),
1356 Err(_) => bytes.iter().map(|&b| b as char).collect(),
1357 }
1358 };
1359 resolve_named_dest(doc, &name, pages_map)
1360 }
1361 lopdf::Object::Name(name) => {
1363 let name_str = String::from_utf8_lossy(name);
1364 resolve_named_dest(doc, &name_str, pages_map)
1365 }
1366 _ => None,
1367 }
1368}
1369
1370fn extract_dest_top(arr: &[lopdf::Object]) -> Option<f64> {
1374 if arr.len() < 2 {
1375 return None;
1376 }
1377 if let lopdf::Object::Name(dest_type) = &arr[1] {
1379 let type_str = String::from_utf8_lossy(dest_type);
1380 match type_str.as_ref() {
1381 "XYZ" => {
1382 if arr.len() >= 4 {
1384 return obj_to_f64(&arr[3]);
1385 }
1386 }
1387 "FitH" | "FitBH" => {
1388 if arr.len() >= 3 {
1390 return obj_to_f64(&arr[2]);
1391 }
1392 }
1393 _ => {} }
1395 }
1396 None
1397}
1398
1399fn obj_to_f64(obj: &lopdf::Object) -> Option<f64> {
1401 match obj {
1402 lopdf::Object::Integer(i) => Some(*i as f64),
1403 lopdf::Object::Real(f) => Some((*f).into()),
1404 lopdf::Object::Null => None, _ => None,
1406 }
1407}
1408
1409fn resolve_named_dest(
1414 doc: &lopdf::Document,
1415 name: &str,
1416 pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1417) -> Option<(Option<usize>, Option<f64>)> {
1418 let catalog_ref = doc.trailer.get(b"Root").ok()?;
1420 let catalog = match catalog_ref {
1421 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?.as_dict().ok()?,
1422 lopdf::Object::Dictionary(dict) => dict,
1423 _ => return None,
1424 };
1425
1426 if let Ok(names_obj) = catalog.get(b"Names") {
1428 let names_obj = match names_obj {
1429 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1430 other => other,
1431 };
1432 if let Ok(names_dict) = names_obj.as_dict() {
1433 if let Ok(dests_obj) = names_dict.get(b"Dests") {
1434 let dests_obj = match dests_obj {
1435 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1436 other => other,
1437 };
1438 if let Ok(dests_dict) = dests_obj.as_dict() {
1439 if let Some(result) = lookup_name_tree(doc, dests_dict, name, pages_map) {
1440 return Some(result);
1441 }
1442 }
1443 }
1444 }
1445 }
1446
1447 if let Ok(dests_obj) = catalog.get(b"Dests") {
1449 let dests_obj = match dests_obj {
1450 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1451 other => other,
1452 };
1453 if let Ok(dests_dict) = dests_obj.as_dict() {
1454 if let Ok(dest_obj) = dests_dict.get(name.as_bytes()) {
1455 let dest_obj = match dest_obj {
1456 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1457 other => other,
1458 };
1459 match dest_obj {
1461 lopdf::Object::Array(arr) => {
1462 if let Some(result) =
1463 resolve_dest_to_page(doc, &lopdf::Object::Array(arr.clone()), pages_map)
1464 {
1465 return Some(result);
1466 }
1467 }
1468 lopdf::Object::Dictionary(d) => {
1469 if let Ok(d_dest) = d.get(b"D") {
1470 if let Some(result) = resolve_dest_to_page(doc, d_dest, pages_map) {
1471 return Some(result);
1472 }
1473 }
1474 }
1475 _ => {}
1476 }
1477 }
1478 }
1479 }
1480
1481 None
1482}
1483
1484fn lookup_name_tree(
1486 doc: &lopdf::Document,
1487 tree_dict: &lopdf::Dictionary,
1488 name: &str,
1489 pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1490) -> Option<(Option<usize>, Option<f64>)> {
1491 if let Ok(names_arr_obj) = tree_dict.get(b"Names") {
1493 let names_arr_obj = match names_arr_obj {
1494 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1495 other => other,
1496 };
1497 if let Ok(names_arr) = names_arr_obj.as_array() {
1498 let mut i = 0;
1500 while i + 1 < names_arr.len() {
1501 let key_obj = match &names_arr[i] {
1502 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1503 Ok(obj) => obj.clone(),
1504 Err(_) => {
1505 i += 2;
1506 continue;
1507 }
1508 },
1509 other => other.clone(),
1510 };
1511 if let lopdf::Object::String(key_bytes, _) = &key_obj {
1512 let key_str = String::from_utf8_lossy(key_bytes);
1513 if key_str == name {
1514 let value = &names_arr[i + 1];
1515 let value = match value {
1516 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1517 other => other,
1518 };
1519 match value {
1521 lopdf::Object::Array(arr) => {
1522 return resolve_dest_to_page(
1523 doc,
1524 &lopdf::Object::Array(arr.clone()),
1525 pages_map,
1526 );
1527 }
1528 lopdf::Object::Dictionary(d) => {
1529 if let Ok(d_dest) = d.get(b"D") {
1530 return resolve_dest_to_page(doc, d_dest, pages_map);
1531 }
1532 }
1533 _ => {}
1534 }
1535 }
1536 }
1537 i += 2;
1538 }
1539 }
1540 }
1541
1542 if let Ok(kids_obj) = tree_dict.get(b"Kids") {
1544 let kids_obj = match kids_obj {
1545 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1546 other => other,
1547 };
1548 if let Ok(kids_arr) = kids_obj.as_array() {
1549 for kid in kids_arr {
1550 let kid_obj = match kid {
1551 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1552 Ok(obj) => obj,
1553 Err(_) => continue,
1554 },
1555 other => other,
1556 };
1557 if let Ok(kid_dict) = kid_obj.as_dict() {
1558 if let Some(result) = lookup_name_tree(doc, kid_dict, name, pages_map) {
1559 return Some(result);
1560 }
1561 }
1562 }
1563 }
1564 }
1565
1566 None
1567}
1568
1569fn extract_document_form_fields(doc: &lopdf::Document) -> Result<Vec<FormField>, BackendError> {
1575 let catalog_ref = match doc.trailer.get(b"Root") {
1577 Ok(obj) => obj,
1578 Err(_) => return Ok(Vec::new()),
1579 };
1580
1581 let catalog = match catalog_ref {
1582 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1583 Ok(obj) => match obj.as_dict() {
1584 Ok(dict) => dict,
1585 Err(_) => return Ok(Vec::new()),
1586 },
1587 Err(_) => return Ok(Vec::new()),
1588 },
1589 lopdf::Object::Dictionary(dict) => dict,
1590 _ => return Ok(Vec::new()),
1591 };
1592
1593 let acroform_obj = match catalog.get(b"AcroForm") {
1595 Ok(obj) => obj,
1596 Err(_) => return Ok(Vec::new()), };
1598
1599 let acroform_obj = match acroform_obj {
1600 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1601 Ok(obj) => obj,
1602 Err(_) => return Ok(Vec::new()),
1603 },
1604 other => other,
1605 };
1606
1607 let acroform_dict = match acroform_obj.as_dict() {
1608 Ok(dict) => dict,
1609 Err(_) => return Ok(Vec::new()),
1610 };
1611
1612 let fields_obj = match acroform_dict.get(b"Fields") {
1614 Ok(obj) => obj,
1615 Err(_) => return Ok(Vec::new()),
1616 };
1617
1618 let fields_obj = match fields_obj {
1619 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1620 Ok(obj) => obj,
1621 Err(_) => return Ok(Vec::new()),
1622 },
1623 other => other,
1624 };
1625
1626 let fields_array = match fields_obj.as_array() {
1627 Ok(arr) => arr,
1628 Err(_) => return Ok(Vec::new()),
1629 };
1630
1631 let pages_map = doc.get_pages();
1633
1634 let mut form_fields = Vec::new();
1635 let max_depth = 64; for field_entry in fields_array {
1638 let field_ref = match field_entry {
1639 lopdf::Object::Reference(id) => *id,
1640 _ => continue,
1641 };
1642 walk_field_tree(
1643 doc,
1644 field_ref,
1645 None, None, 0,
1648 max_depth,
1649 &pages_map,
1650 &mut form_fields,
1651 );
1652 }
1653
1654 Ok(form_fields)
1655}
1656
1657#[allow(clippy::too_many_arguments)]
1662fn walk_field_tree(
1663 doc: &lopdf::Document,
1664 field_id: lopdf::ObjectId,
1665 parent_name: Option<&str>,
1666 inherited_ft: Option<&FieldType>,
1667 depth: usize,
1668 max_depth: usize,
1669 pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1670 fields: &mut Vec<FormField>,
1671) {
1672 if depth >= max_depth {
1673 return;
1674 }
1675
1676 let field_obj = match doc.get_object(field_id) {
1677 Ok(obj) => obj,
1678 Err(_) => return,
1679 };
1680
1681 let field_dict = match field_obj.as_dict() {
1682 Ok(dict) => dict,
1683 Err(_) => return,
1684 };
1685
1686 let partial_name = extract_string_from_dict(doc, field_dict, b"T");
1688
1689 let full_name = match (&parent_name, &partial_name) {
1691 (Some(parent), Some(name)) => format!("{parent}.{name}"),
1692 (Some(parent), None) => parent.to_string(),
1693 (None, Some(name)) => name.clone(),
1694 (None, None) => String::new(),
1695 };
1696
1697 let field_type = match field_dict.get(b"FT") {
1699 Ok(lopdf::Object::Name(name)) => FieldType::from_pdf_name(&String::from_utf8_lossy(name)),
1700 _ => inherited_ft.cloned(),
1701 };
1702
1703 if let Ok(kids_obj) = field_dict.get(b"Kids") {
1705 let kids_obj = match kids_obj {
1706 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1707 Ok(obj) => obj,
1708 Err(_) => return,
1709 },
1710 other => other,
1711 };
1712
1713 if let Ok(kids_array) = kids_obj.as_array() {
1714 let has_child_fields = kids_array.iter().any(|kid| {
1717 let kid_obj = match kid {
1718 lopdf::Object::Reference(id) => doc.get_object(*id).ok(),
1719 _ => Some(kid),
1720 };
1721 kid_obj
1722 .and_then(|o| o.as_dict().ok())
1723 .is_some_and(|d| d.get(b"T").is_ok())
1724 });
1725
1726 if has_child_fields {
1727 for kid in kids_array {
1729 if let lopdf::Object::Reference(kid_id) = kid {
1730 walk_field_tree(
1731 doc,
1732 *kid_id,
1733 Some(&full_name),
1734 field_type.as_ref(),
1735 depth + 1,
1736 max_depth,
1737 pages_map,
1738 fields,
1739 );
1740 }
1741 }
1742 return;
1743 }
1744 }
1746 }
1747
1748 let Some(field_type) = field_type else {
1750 return; };
1752
1753 let value = extract_field_value(doc, field_dict, b"V");
1755
1756 let default_value = extract_field_value(doc, field_dict, b"DV");
1758
1759 let bbox = extract_field_bbox(doc, field_dict).unwrap_or(BBox::new(0.0, 0.0, 0.0, 0.0));
1761
1762 let options = extract_field_options(doc, field_dict);
1764
1765 let flags = match field_dict.get(b"Ff") {
1767 Ok(lopdf::Object::Integer(n)) => *n as u32,
1768 _ => 0,
1769 };
1770
1771 let page_index = resolve_field_page(doc, field_dict, pages_map);
1773
1774 fields.push(FormField {
1775 name: full_name,
1776 field_type,
1777 value,
1778 default_value,
1779 bbox,
1780 options,
1781 flags,
1782 page_index,
1783 });
1784}
1785
1786fn extract_field_value(
1790 doc: &lopdf::Document,
1791 dict: &lopdf::Dictionary,
1792 key: &[u8],
1793) -> Option<String> {
1794 let obj = dict.get(key).ok()?;
1795 let obj = match obj {
1796 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1797 other => other,
1798 };
1799 match obj {
1800 lopdf::Object::String(bytes, _) => Some(decode_pdf_string(bytes)),
1801 lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
1802 lopdf::Object::Array(arr) => {
1803 let vals: Vec<String> = arr
1805 .iter()
1806 .filter_map(|item| match item {
1807 lopdf::Object::String(bytes, _) => Some(decode_pdf_string(bytes)),
1808 lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
1809 _ => None,
1810 })
1811 .collect();
1812 if vals.is_empty() {
1813 None
1814 } else {
1815 Some(vals.join(", "))
1816 }
1817 }
1818 _ => None,
1819 }
1820}
1821
/// Decodes the raw bytes of a PDF text string into a Rust `String`.
///
/// * Bytes starting with `FE FF` are decoded as UTF-16BE; invalid code units become
///   U+FFFD and a trailing odd byte is ignored.
/// * Bytes starting with `EF BB BF` (the UTF-8 byte order mark allowed for text strings
///   since PDF 2.0) are decoded as UTF-8 with the marker removed, so it no longer
///   leaks into the result as a leading U+FEFF character.
/// * Everything else is decoded as UTF-8, with invalid sequences replaced by U+FFFD.
fn decode_pdf_string(bytes: &[u8]) -> String {
    match bytes {
        [0xFE, 0xFF, payload @ ..] => {
            let units: Vec<u16> = payload
                .chunks_exact(2)
                .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
                .collect();
            String::from_utf16_lossy(&units)
        }
        [0xEF, 0xBB, 0xBF, text @ ..] => String::from_utf8_lossy(text).into_owned(),
        _ => String::from_utf8_lossy(bytes).into_owned(),
    }
}
1841
1842fn extract_field_bbox(doc: &lopdf::Document, dict: &lopdf::Dictionary) -> Option<BBox> {
1844 let rect_obj = dict.get(b"Rect").ok()?;
1845 let rect_obj = match rect_obj {
1846 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1847 other => other,
1848 };
1849 let arr = rect_obj.as_array().ok()?;
1850 extract_bbox_from_array(arr).ok()
1851}
1852
1853fn extract_field_options(doc: &lopdf::Document, dict: &lopdf::Dictionary) -> Vec<String> {
1855 let opt_obj = match dict.get(b"Opt") {
1856 Ok(obj) => obj,
1857 Err(_) => return Vec::new(),
1858 };
1859 let opt_obj = match opt_obj {
1860 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1861 Ok(obj) => obj,
1862 Err(_) => return Vec::new(),
1863 },
1864 other => other,
1865 };
1866 let opt_array = match opt_obj.as_array() {
1867 Ok(arr) => arr,
1868 Err(_) => return Vec::new(),
1869 };
1870
1871 opt_array
1872 .iter()
1873 .filter_map(|item| {
1874 let item = match item {
1875 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1876 other => other,
1877 };
1878 match item {
1879 lopdf::Object::String(bytes, _) => Some(decode_pdf_string(bytes)),
1880 lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
1881 lopdf::Object::Array(pair) => {
1883 if pair.len() >= 2 {
1884 match &pair[1] {
1886 lopdf::Object::String(bytes, _) => Some(decode_pdf_string(bytes)),
1887 lopdf::Object::Name(name) => {
1888 Some(String::from_utf8_lossy(name).into_owned())
1889 }
1890 _ => None,
1891 }
1892 } else {
1893 None
1894 }
1895 }
1896 _ => None,
1897 }
1898 })
1899 .collect()
1900}
1901
1902fn resolve_field_page(
1904 _doc: &lopdf::Document,
1905 dict: &lopdf::Dictionary,
1906 pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1907) -> Option<usize> {
1908 let page_ref = match dict.get(b"P") {
1910 Ok(lopdf::Object::Reference(id)) => *id,
1911 _ => return None,
1912 };
1913
1914 pages_map.iter().find_map(|(&page_num, &page_id)| {
1916 if page_id == page_ref {
1917 Some((page_num - 1) as usize) } else {
1919 None
1920 }
1921 })
1922}
1923
1924fn extract_document_signatures(doc: &lopdf::Document) -> Result<Vec<SignatureInfo>, BackendError> {
1930 let catalog_ref = match doc.trailer.get(b"Root") {
1932 Ok(obj) => obj,
1933 Err(_) => return Ok(Vec::new()),
1934 };
1935
1936 let catalog = match catalog_ref {
1937 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1938 Ok(obj) => match obj.as_dict() {
1939 Ok(dict) => dict,
1940 Err(_) => return Ok(Vec::new()),
1941 },
1942 Err(_) => return Ok(Vec::new()),
1943 },
1944 lopdf::Object::Dictionary(dict) => dict,
1945 _ => return Ok(Vec::new()),
1946 };
1947
1948 let acroform_obj = match catalog.get(b"AcroForm") {
1950 Ok(obj) => obj,
1951 Err(_) => return Ok(Vec::new()),
1952 };
1953
1954 let acroform_obj = match acroform_obj {
1955 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1956 Ok(obj) => obj,
1957 Err(_) => return Ok(Vec::new()),
1958 },
1959 other => other,
1960 };
1961
1962 let acroform_dict = match acroform_obj.as_dict() {
1963 Ok(dict) => dict,
1964 Err(_) => return Ok(Vec::new()),
1965 };
1966
1967 let fields_obj = match acroform_dict.get(b"Fields") {
1969 Ok(obj) => obj,
1970 Err(_) => return Ok(Vec::new()),
1971 };
1972
1973 let fields_obj = match fields_obj {
1974 lopdf::Object::Reference(id) => match doc.get_object(*id) {
1975 Ok(obj) => obj,
1976 Err(_) => return Ok(Vec::new()),
1977 },
1978 other => other,
1979 };
1980
1981 let fields_array = match fields_obj.as_array() {
1982 Ok(arr) => arr,
1983 Err(_) => return Ok(Vec::new()),
1984 };
1985
1986 let mut signatures = Vec::new();
1987 let max_depth = 64;
1988
1989 for field_entry in fields_array {
1990 let field_ref = match field_entry {
1991 lopdf::Object::Reference(id) => *id,
1992 _ => continue,
1993 };
1994 walk_signature_tree(doc, field_ref, None, 0, max_depth, &mut signatures);
1995 }
1996
1997 Ok(signatures)
1998}
1999
2000fn walk_signature_tree(
2005 doc: &lopdf::Document,
2006 field_id: lopdf::ObjectId,
2007 inherited_ft: Option<&[u8]>,
2008 depth: usize,
2009 max_depth: usize,
2010 signatures: &mut Vec<SignatureInfo>,
2011) {
2012 if depth >= max_depth {
2013 return;
2014 }
2015
2016 let field_obj = match doc.get_object(field_id) {
2017 Ok(obj) => obj,
2018 Err(_) => return,
2019 };
2020
2021 let field_dict = match field_obj.as_dict() {
2022 Ok(dict) => dict,
2023 Err(_) => return,
2024 };
2025
2026 let field_type = match field_dict.get(b"FT") {
2028 Ok(lopdf::Object::Name(name)) => Some(name.as_slice()),
2029 _ => inherited_ft,
2030 };
2031
2032 if let Ok(kids_obj) = field_dict.get(b"Kids") {
2034 let kids_obj = match kids_obj {
2035 lopdf::Object::Reference(id) => match doc.get_object(*id) {
2036 Ok(obj) => obj,
2037 Err(_) => return,
2038 },
2039 other => other,
2040 };
2041
2042 if let Ok(kids_array) = kids_obj.as_array() {
2043 let has_child_fields = kids_array.iter().any(|kid| {
2045 let kid_obj = match kid {
2046 lopdf::Object::Reference(id) => doc.get_object(*id).ok(),
2047 _ => Some(kid),
2048 };
2049 kid_obj
2050 .and_then(|o| o.as_dict().ok())
2051 .is_some_and(|d| d.get(b"T").is_ok())
2052 });
2053
2054 if has_child_fields {
2055 for kid in kids_array {
2056 if let lopdf::Object::Reference(kid_id) = kid {
2057 walk_signature_tree(
2058 doc,
2059 *kid_id,
2060 field_type,
2061 depth + 1,
2062 max_depth,
2063 signatures,
2064 );
2065 }
2066 }
2067 return;
2068 }
2069 }
2070 }
2071
2072 let is_sig = field_type.is_some_and(|ft| ft == b"Sig");
2074 if !is_sig {
2075 return;
2076 }
2077
2078 let sig_dict = field_dict
2080 .get(b"V")
2081 .ok()
2082 .and_then(|obj| match obj {
2083 lopdf::Object::Reference(id) => doc.get_object(*id).ok(),
2084 other => Some(other),
2085 })
2086 .and_then(|obj| obj.as_dict().ok());
2087
2088 let info = match sig_dict {
2089 Some(v_dict) => SignatureInfo {
2090 signer_name: extract_string_from_dict(doc, v_dict, b"Name"),
2091 sign_date: extract_string_from_dict(doc, v_dict, b"M"),
2092 reason: extract_string_from_dict(doc, v_dict, b"Reason"),
2093 location: extract_string_from_dict(doc, v_dict, b"Location"),
2094 contact_info: extract_string_from_dict(doc, v_dict, b"ContactInfo"),
2095 is_signed: true,
2096 },
2097 None => SignatureInfo {
2098 signer_name: None,
2099 sign_date: None,
2100 reason: None,
2101 location: None,
2102 contact_info: None,
2103 is_signed: false,
2104 },
2105 };
2106
2107 signatures.push(info);
2108}
2109
2110fn extract_document_structure_tree(
2116 doc: &lopdf::Document,
2117) -> Result<Vec<StructElement>, BackendError> {
2118 let catalog_ref = match doc.trailer.get(b"Root") {
2120 Ok(obj) => obj,
2121 Err(_) => return Ok(Vec::new()),
2122 };
2123
2124 let catalog = match catalog_ref {
2125 lopdf::Object::Reference(id) => match doc.get_object(*id) {
2126 Ok(obj) => match obj.as_dict() {
2127 Ok(dict) => dict,
2128 Err(_) => return Ok(Vec::new()),
2129 },
2130 Err(_) => return Ok(Vec::new()),
2131 },
2132 lopdf::Object::Dictionary(dict) => dict,
2133 _ => return Ok(Vec::new()),
2134 };
2135
2136 let struct_tree_obj = match catalog.get(b"StructTreeRoot") {
2138 Ok(obj) => obj,
2139 Err(_) => return Ok(Vec::new()), };
2141
2142 let struct_tree_obj = resolve_object(doc, struct_tree_obj);
2143 let struct_tree_dict = match struct_tree_obj.as_dict() {
2144 Ok(dict) => dict,
2145 Err(_) => return Ok(Vec::new()),
2146 };
2147
2148 let pages_map = doc.get_pages();
2150
2151 let kids_obj = match struct_tree_dict.get(b"K") {
2153 Ok(obj) => obj,
2154 Err(_) => return Ok(Vec::new()), };
2156
2157 let max_depth = 64; let elements = parse_struct_kids(doc, kids_obj, 0, max_depth, &pages_map);
2159 Ok(elements)
2160}
2161
2162fn parse_struct_kids(
2168 doc: &lopdf::Document,
2169 kids_obj: &lopdf::Object,
2170 depth: usize,
2171 max_depth: usize,
2172 pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
2173) -> Vec<StructElement> {
2174 if depth >= max_depth {
2175 return Vec::new();
2176 }
2177
2178 let kids_obj = resolve_object(doc, kids_obj);
2179
2180 match kids_obj {
2181 lopdf::Object::Array(arr) => {
2182 let mut elements = Vec::new();
2183 for item in arr {
2184 let item = resolve_object(doc, item);
2185 match item {
2186 lopdf::Object::Dictionary(dict) => {
2187 if let Some(elem) =
2188 parse_struct_element(doc, dict, depth + 1, max_depth, pages_map)
2189 {
2190 elements.push(elem);
2191 }
2192 }
2193 lopdf::Object::Reference(id) => {
2194 if let Ok(obj) = doc.get_object(*id) {
2195 if let Ok(dict) = obj.as_dict() {
2196 if let Some(elem) =
2197 parse_struct_element(doc, dict, depth + 1, max_depth, pages_map)
2198 {
2199 elements.push(elem);
2200 }
2201 }
2202 }
2203 }
2204 lopdf::Object::Integer(_) => {
2206 }
2209 _ => {}
2210 }
2211 }
2212 elements
2213 }
2214 lopdf::Object::Dictionary(dict) => {
2215 if let Some(elem) = parse_struct_element(doc, dict, depth + 1, max_depth, pages_map) {
2216 vec![elem]
2217 } else {
2218 Vec::new()
2219 }
2220 }
2221 lopdf::Object::Reference(id) => {
2222 if let Ok(obj) = doc.get_object(*id) {
2223 if let Ok(dict) = obj.as_dict() {
2224 if let Some(elem) =
2225 parse_struct_element(doc, dict, depth + 1, max_depth, pages_map)
2226 {
2227 return vec![elem];
2228 }
2229 }
2230 }
2231 Vec::new()
2232 }
2233 _ => Vec::new(),
2234 }
2235}
2236
2237fn parse_struct_element(
2242 doc: &lopdf::Document,
2243 dict: &lopdf::Dictionary,
2244 depth: usize,
2245 max_depth: usize,
2246 pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
2247) -> Option<StructElement> {
2248 if dict.get(b"MCID").is_ok() && dict.get(b"S").is_err() {
2251 return None; }
2253
2254 let element_type = match dict.get(b"S") {
2256 Ok(obj) => {
2257 let obj = resolve_object(doc, obj);
2258 match obj {
2259 lopdf::Object::Name(name) => String::from_utf8_lossy(name).into_owned(),
2260 _ => return None,
2261 }
2262 }
2263 Err(_) => return None, };
2265
2266 let mut mcids = Vec::new();
2268 let mut children = Vec::new();
2269
2270 if let Ok(k_obj) = dict.get(b"K") {
2271 collect_mcids_and_children(
2272 doc,
2273 k_obj,
2274 &mut mcids,
2275 &mut children,
2276 depth,
2277 max_depth,
2278 pages_map,
2279 );
2280 }
2281
2282 let alt_text = extract_string_entry(doc, dict, b"Alt");
2284
2285 let actual_text = extract_string_entry(doc, dict, b"ActualText");
2287
2288 let lang = extract_string_entry(doc, dict, b"Lang");
2290
2291 let page_index = resolve_struct_page(doc, dict, pages_map);
2293
2294 Some(StructElement {
2295 element_type,
2296 mcids,
2297 alt_text,
2298 actual_text,
2299 lang,
2300 bbox: None, children,
2302 page_index,
2303 })
2304}
2305
2306fn collect_mcids_and_children(
2314 doc: &lopdf::Document,
2315 k_obj: &lopdf::Object,
2316 mcids: &mut Vec<u32>,
2317 children: &mut Vec<StructElement>,
2318 depth: usize,
2319 max_depth: usize,
2320 pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
2321) {
2322 if depth >= max_depth {
2323 return;
2324 }
2325
2326 let k_obj = resolve_object(doc, k_obj);
2327
2328 match k_obj {
2329 lopdf::Object::Integer(n) => {
2330 if *n >= 0 {
2332 mcids.push(*n as u32);
2333 }
2334 }
2335 lopdf::Object::Dictionary(dict) => {
2336 process_k_dict(doc, dict, mcids, children, depth, max_depth, pages_map);
2337 }
2338 lopdf::Object::Reference(id) => {
2339 if let Ok(obj) = doc.get_object(*id) {
2340 match obj {
2341 lopdf::Object::Dictionary(dict) => {
2342 process_k_dict(doc, dict, mcids, children, depth, max_depth, pages_map);
2343 }
2344 lopdf::Object::Integer(n) => {
2345 if *n >= 0 {
2346 mcids.push(*n as u32);
2347 }
2348 }
2349 _ => {}
2350 }
2351 }
2352 }
2353 lopdf::Object::Array(arr) => {
2354 for item in arr {
2355 collect_mcids_and_children(doc, item, mcids, children, depth, max_depth, pages_map);
2356 }
2357 }
2358 _ => {}
2359 }
2360}
2361
2362fn process_k_dict(
2364 doc: &lopdf::Document,
2365 dict: &lopdf::Dictionary,
2366 mcids: &mut Vec<u32>,
2367 children: &mut Vec<StructElement>,
2368 depth: usize,
2369 max_depth: usize,
2370 pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
2371) {
2372 if let Ok(mcid_obj) = dict.get(b"MCID") {
2374 let mcid_obj = resolve_object(doc, mcid_obj);
2375 if let lopdf::Object::Integer(n) = mcid_obj {
2376 if *n >= 0 {
2377 mcids.push(*n as u32);
2378 }
2379 }
2380 return;
2381 }
2382
2383 if let Some(elem) = parse_struct_element(doc, dict, depth + 1, max_depth, pages_map) {
2385 children.push(elem);
2386 }
2387}
2388
2389fn resolve_struct_page(
2391 _doc: &lopdf::Document,
2392 dict: &lopdf::Dictionary,
2393 pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
2394) -> Option<usize> {
2395 let page_ref = match dict.get(b"Pg") {
2396 Ok(lopdf::Object::Reference(id)) => *id,
2397 _ => return None,
2398 };
2399
2400 for (page_num, page_id) in pages_map {
2402 if *page_id == page_ref {
2403 return Some((*page_num - 1) as usize); }
2405 }
2406
2407 None
2408}
2409
2410fn extract_string_entry(
2412 doc: &lopdf::Document,
2413 dict: &lopdf::Dictionary,
2414 key: &[u8],
2415) -> Option<String> {
2416 let obj = dict.get(key).ok()?;
2417 let obj = resolve_object(doc, obj);
2418 match obj {
2419 lopdf::Object::String(bytes, _) => Some(decode_pdf_string(bytes)),
2420 lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
2421 _ => None,
2422 }
2423}
2424
2425fn resolve_object<'a>(doc: &'a lopdf::Document, obj: &'a lopdf::Object) -> &'a lopdf::Object {
2427 match obj {
2428 lopdf::Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
2429 _ => obj,
2430 }
2431}
2432
2433fn extract_page_annotations(
2435 doc: &lopdf::Document,
2436 page_id: lopdf::ObjectId,
2437) -> Result<Vec<Annotation>, BackendError> {
2438 let page_dict = doc
2439 .get_object(page_id)
2440 .and_then(|o| o.as_dict())
2441 .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
2442
2443 let annots_obj = match page_dict.get(b"Annots") {
2445 Ok(obj) => obj,
2446 Err(_) => return Ok(Vec::new()), };
2448
2449 let annots_obj = match annots_obj {
2451 lopdf::Object::Reference(id) => doc
2452 .get_object(*id)
2453 .map_err(|e| BackendError::Parse(format!("failed to resolve /Annots ref: {e}")))?,
2454 other => other,
2455 };
2456
2457 let annots_array = annots_obj
2458 .as_array()
2459 .map_err(|e| BackendError::Parse(format!("/Annots is not an array: {e}")))?;
2460
2461 let mut annotations = Vec::new();
2462
2463 for annot_entry in annots_array {
2464 let annot_obj = match annot_entry {
2466 lopdf::Object::Reference(id) => match doc.get_object(*id) {
2467 Ok(obj) => obj,
2468 Err(_) => continue, },
2470 other => other,
2471 };
2472
2473 let annot_dict = match annot_obj.as_dict() {
2474 Ok(dict) => dict,
2475 Err(_) => continue, };
2477
2478 let raw_subtype = match annot_dict.get(b"Subtype") {
2480 Ok(obj) => match obj {
2481 lopdf::Object::Name(name) => String::from_utf8_lossy(name).into_owned(),
2482 _ => continue, },
2484 Err(_) => continue, };
2486
2487 let annot_type = AnnotationType::from_subtype(&raw_subtype);
2488
2489 let bbox = match annot_dict.get(b"Rect") {
2491 Ok(obj) => {
2492 let obj = match obj {
2493 lopdf::Object::Reference(id) => match doc.get_object(*id) {
2494 Ok(resolved) => resolved,
2495 Err(_) => continue,
2496 },
2497 other => other,
2498 };
2499 match obj.as_array() {
2500 Ok(arr) => match extract_bbox_from_array(arr) {
2501 Ok(b) => b,
2502 Err(_) => continue,
2503 },
2504 Err(_) => continue,
2505 }
2506 }
2507 Err(_) => continue, };
2509
2510 let contents = extract_string_from_dict(doc, annot_dict, b"Contents");
2512 let author = extract_string_from_dict(doc, annot_dict, b"T");
2513 let date = extract_string_from_dict(doc, annot_dict, b"M");
2514
2515 annotations.push(Annotation {
2516 annot_type,
2517 bbox,
2518 contents,
2519 author,
2520 date,
2521 raw_subtype,
2522 });
2523 }
2524
2525 Ok(annotations)
2526}
2527
2528fn extract_page_hyperlinks(
2533 doc: &lopdf::Document,
2534 page_id: lopdf::ObjectId,
2535) -> Result<Vec<Hyperlink>, BackendError> {
2536 let page_dict = doc
2537 .get_object(page_id)
2538 .and_then(|o| o.as_dict())
2539 .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
2540
2541 let annots_obj = match page_dict.get(b"Annots") {
2543 Ok(obj) => obj,
2544 Err(_) => return Ok(Vec::new()),
2545 };
2546
2547 let annots_obj = match annots_obj {
2549 lopdf::Object::Reference(id) => doc
2550 .get_object(*id)
2551 .map_err(|e| BackendError::Parse(format!("failed to resolve /Annots ref: {e}")))?,
2552 other => other,
2553 };
2554
2555 let annots_array = annots_obj
2556 .as_array()
2557 .map_err(|e| BackendError::Parse(format!("/Annots is not an array: {e}")))?;
2558
2559 let mut hyperlinks = Vec::new();
2560
2561 for annot_entry in annots_array {
2562 let annot_obj = match annot_entry {
2564 lopdf::Object::Reference(id) => match doc.get_object(*id) {
2565 Ok(obj) => obj,
2566 Err(_) => continue,
2567 },
2568 other => other,
2569 };
2570
2571 let annot_dict = match annot_obj.as_dict() {
2572 Ok(dict) => dict,
2573 Err(_) => continue,
2574 };
2575
2576 let subtype = match annot_dict.get(b"Subtype") {
2578 Ok(lopdf::Object::Name(name)) => String::from_utf8_lossy(name).into_owned(),
2579 _ => continue,
2580 };
2581 if subtype != "Link" {
2582 continue;
2583 }
2584
2585 let bbox = match annot_dict.get(b"Rect") {
2587 Ok(obj) => {
2588 let obj = match obj {
2589 lopdf::Object::Reference(id) => match doc.get_object(*id) {
2590 Ok(resolved) => resolved,
2591 Err(_) => continue,
2592 },
2593 other => other,
2594 };
2595 match obj.as_array() {
2596 Ok(arr) => match extract_bbox_from_array(arr) {
2597 Ok(b) => b,
2598 Err(_) => continue,
2599 },
2600 Err(_) => continue,
2601 }
2602 }
2603 Err(_) => continue,
2604 };
2605
2606 let uri = resolve_link_uri(doc, annot_dict);
2608
2609 if let Some(uri) = uri {
2611 if !uri.is_empty() {
2612 hyperlinks.push(Hyperlink { bbox, uri });
2613 }
2614 }
2615 }
2616
2617 Ok(hyperlinks)
2618}
2619
2620fn resolve_link_uri(doc: &lopdf::Document, annot_dict: &lopdf::Dictionary) -> Option<String> {
2624 if let Ok(action_obj) = annot_dict.get(b"A") {
2626 let action_obj = match action_obj {
2627 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
2628 other => other,
2629 };
2630 if let Ok(action_dict) = action_obj.as_dict() {
2631 if let Ok(lopdf::Object::Name(action_type)) = action_dict.get(b"S") {
2633 let action_type_str = String::from_utf8_lossy(action_type);
2634 match action_type_str.as_ref() {
2635 "URI" => {
2636 return extract_string_from_dict(doc, action_dict, b"URI");
2638 }
2639 "GoTo" => {
2640 return resolve_goto_dest(doc, action_dict);
2642 }
2643 "GoToR" => {
2644 let file = extract_string_from_dict(doc, action_dict, b"F");
2646 if let Some(f) = file {
2647 return Some(f);
2648 }
2649 }
2650 _ => {}
2651 }
2652 }
2653 }
2654 }
2655
2656 if let Ok(dest_obj) = annot_dict.get(b"Dest") {
2658 return resolve_dest_object(doc, dest_obj);
2659 }
2660
2661 None
2662}
2663
2664fn resolve_goto_dest(doc: &lopdf::Document, action_dict: &lopdf::Dictionary) -> Option<String> {
2666 let dest_obj = action_dict.get(b"D").ok()?;
2667 resolve_dest_object(doc, dest_obj)
2668}
2669
2670fn resolve_dest_object(doc: &lopdf::Document, dest_obj: &lopdf::Object) -> Option<String> {
2676 let dest_obj = match dest_obj {
2677 lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
2678 other => other,
2679 };
2680
2681 match dest_obj {
2682 lopdf::Object::String(bytes, _) => {
2684 if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
2685 let chars: Vec<u16> = bytes[2..]
2686 .chunks(2)
2687 .filter_map(|c| {
2688 if c.len() == 2 {
2689 Some(u16::from_be_bytes([c[0], c[1]]))
2690 } else {
2691 None
2692 }
2693 })
2694 .collect();
2695 String::from_utf16(&chars).ok()
2696 } else {
2697 match std::str::from_utf8(bytes) {
2698 Ok(s) => Some(s.to_string()),
2699 Err(_) => Some(bytes.iter().map(|&b| b as char).collect()),
2700 }
2701 }
2702 }
2703 lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
2705 lopdf::Object::Array(arr) => {
2707 if arr.is_empty() {
2708 return None;
2709 }
2710 if let lopdf::Object::Reference(page_ref) = &arr[0] {
2712 let pages_map = doc.get_pages();
2714 for (&page_num, &page_id) in &pages_map {
2715 if page_id == *page_ref {
2716 return Some(format!("#page={page_num}"));
2717 }
2718 }
2719 return Some(format!("#ref={},{}", page_ref.0, page_ref.1));
2721 }
2722 None
2723 }
2724 _ => None,
2725 }
2726}
2727
/// Builds an in-memory PDF (version 1.5) containing `page_count` pages.
///
/// Every page carries its own `MediaBox` of `[0 0 612 792]` and no content.
/// The serialized document bytes are returned.
#[cfg(test)]
fn create_test_pdf(page_count: usize) -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    // Reserve the page-tree id up front so each page can name it as its parent.
    let tree_id: ObjectId = pdf.new_object_id();

    let kids: Vec<Object> = (0..page_count)
        .map(|_| {
            let leaf_id = pdf.add_object(dictionary! {
                "Type" => "Page",
                "Parent" => tree_id,
                "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            });
            leaf_id.into()
        })
        .collect();

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => kids,
        "Count" => page_count as i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
2768
/// Builds a single-page PDF whose page dictionary has no `MediaBox`; the box
/// (`[0 0 595 842]`) is declared on the parent `Pages` node instead.
#[cfg(test)]
fn create_test_pdf_inherited_media_box() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    // The leaf intentionally omits MediaBox.
    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
        "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
2803
/// Builds a single-page PDF with a `MediaBox` of `[0 0 612 792]` and a
/// `CropBox` of `[36.0 36.0 576.0 756.0]` (written as real numbers).
#[cfg(test)]
fn create_test_pdf_with_crop_box() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "CropBox" => vec![
            Object::Real(36.0),
            Object::Real(36.0),
            Object::Real(576.0),
            Object::Real(756.0),
        ],
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
2843
/// Builds a single-page PDF whose page dictionary carries a `Rotate` entry
/// set to `rotation`.
#[cfg(test)]
fn create_test_pdf_with_rotate(rotation: i64) -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Rotate" => rotation,
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
2878
/// Builds a single-page PDF where `Rotate` (set to `rotation`) lives on the
/// parent `Pages` node and the page dictionary itself has no `Rotate` entry.
#[cfg(test)]
fn create_test_pdf_inherited_rotate(rotation: i64) -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    // The leaf intentionally omits Rotate.
    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
        "Rotate" => rotation,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
2914
/// Builds a single-page PDF whose page content only invokes a form XObject
/// (`/FM1 Do`); the form's own stream shows the text "Hello" in font `F1`
/// (Helvetica) at size 12.
#[cfg(test)]
fn create_test_pdf_with_form_xobject() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let helvetica_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });

    // The form carries its own resources so its text operators can find F1.
    let form_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Resources" => Object::Dictionary(dictionary! {
            "Font" => Object::Dictionary(dictionary! {
                "F1" => helvetica_id,
            }),
        }),
    };
    let form = Stream::new(form_dict, b"BT /F1 12 Tf 72 700 Td (Hello) Tj ET".to_vec());
    let form_ref = pdf.add_object(Object::Stream(form));

    // Page content: save state, draw the form, restore state.
    let contents = Stream::new(lopdf::Dictionary::new(), b"q /FM1 Do Q".to_vec());
    let contents_ref = pdf.add_object(Object::Stream(contents));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => contents_ref,
        "Resources" => Object::Dictionary(dictionary! {
            "Font" => Object::Dictionary(dictionary! {
                "F1" => helvetica_id,
            }),
            "XObject" => Object::Dictionary(dictionary! {
                "FM1" => form_ref,
            }),
        }),
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
2989
/// Builds a single-page PDF with two levels of form XObjects: the page draws
/// `FM1`, `FM1` draws `FM2`, and `FM2` shows the text "Deep" in font `F1`
/// (Helvetica) at size 10.
#[cfg(test)]
fn create_test_pdf_with_nested_form_xobjects() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let helvetica_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });

    // Innermost form: the only stream that actually shows text.
    let inner_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Resources" => Object::Dictionary(dictionary! {
            "Font" => Object::Dictionary(dictionary! {
                "F1" => helvetica_id,
            }),
        }),
    };
    let inner_form = Stream::new(inner_dict, b"BT /F1 10 Tf (Deep) Tj ET".to_vec());
    let inner_ref = pdf.add_object(Object::Stream(inner_form));

    // Outer form: does nothing but invoke the inner form.
    let outer_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Resources" => Object::Dictionary(dictionary! {
            "XObject" => Object::Dictionary(dictionary! {
                "FM2" => inner_ref,
            }),
            "Font" => Object::Dictionary(dictionary! {
                "F1" => helvetica_id,
            }),
        }),
    };
    let outer_form = Stream::new(outer_dict, b"q /FM2 Do Q".to_vec());
    let outer_ref = pdf.add_object(Object::Stream(outer_form));

    let contents = Stream::new(lopdf::Dictionary::new(), b"q /FM1 Do Q".to_vec());
    let contents_ref = pdf.add_object(Object::Stream(contents));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => contents_ref,
        "Resources" => Object::Dictionary(dictionary! {
            "XObject" => Object::Dictionary(dictionary! {
                "FM1" => outer_ref,
            }),
            "Font" => Object::Dictionary(dictionary! {
                "F1" => helvetica_id,
            }),
        }),
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
3084
/// Builds a single-page PDF whose page draws a form XObject (`FM1`) that has
/// a `Matrix` of `[2 0 0 2 10 20]` and shows the single character "A" in
/// font `F1` (Helvetica) at size 12.
#[cfg(test)]
fn create_test_pdf_form_xobject_with_matrix() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let helvetica_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });

    let form_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        // Scale by 2 on both axes and translate by (10, 20).
        "Matrix" => vec![
            Object::Real(2.0), Object::Real(0.0),
            Object::Real(0.0), Object::Real(2.0),
            Object::Real(10.0), Object::Real(20.0),
        ],
        "Resources" => Object::Dictionary(dictionary! {
            "Font" => Object::Dictionary(dictionary! {
                "F1" => helvetica_id,
            }),
        }),
    };
    let form = Stream::new(form_dict, b"BT /F1 12 Tf (A) Tj ET".to_vec());
    let form_ref = pdf.add_object(Object::Stream(form));

    let contents = Stream::new(lopdf::Dictionary::new(), b"q /FM1 Do Q".to_vec());
    let contents_ref = pdf.add_object(Object::Stream(contents));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => contents_ref,
        "Resources" => Object::Dictionary(dictionary! {
            "XObject" => Object::Dictionary(dictionary! {
                "FM1" => form_ref,
            }),
            "Font" => Object::Dictionary(dictionary! {
                "F1" => helvetica_id,
            }),
        }),
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
3160
/// Builds a single-page PDF that draws an unfiltered 2x2 `DeviceRGB` image
/// XObject (`Im0`, 8 bits per component) under the transform
/// `200 0 0 150 100 300 cm`.
#[cfg(test)]
fn create_test_pdf_with_image_xobject() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    // Four RGB triplets, one per pixel.
    let pixels = vec![255u8, 0, 0, 0, 255, 0, 0, 0, 255, 255, 255, 0];
    let image_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Image",
        "Width" => 2i64,
        "Height" => 2i64,
        "ColorSpace" => "DeviceRGB",
        "BitsPerComponent" => 8i64,
    };
    let image_ref = pdf.add_object(Object::Stream(Stream::new(image_dict, pixels)));

    let contents = Stream::new(
        lopdf::Dictionary::new(),
        b"q 200 0 0 150 100 300 cm /Im0 Do Q".to_vec(),
    );
    let contents_ref = pdf.add_object(Object::Stream(contents));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => contents_ref,
        "Resources" => Object::Dictionary(dictionary! {
            "XObject" => Object::Dictionary(dictionary! {
                "Im0" => image_ref,
            }),
        }),
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
3220
/// Builds a single-page PDF that draws an image XObject (`Im0`) whose stream
/// is tagged with the `DCTDecode` filter and holds a tiny hand-written byte
/// sequence that begins with `FF D8` and ends with `FF D9`.
#[cfg(test)]
fn create_test_pdf_with_jpeg_image() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let jpeg_bytes: Vec<u8> = vec![
        0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x00, 0x00,
        0x01, 0x00, 0x01, 0x00, 0x00, 0xFF, 0xD9,
    ];

    let image_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Image",
        "Width" => 2i64,
        "Height" => 2i64,
        "ColorSpace" => "DeviceRGB",
        "BitsPerComponent" => 8i64,
        "Filter" => "DCTDecode",
    };
    let image_ref = pdf.add_object(Object::Stream(Stream::new(image_dict, jpeg_bytes)));

    let contents = Stream::new(
        lopdf::Dictionary::new(),
        b"q 200 0 0 150 100 300 cm /Im0 Do Q".to_vec(),
    );
    let contents_ref = pdf.add_object(Object::Stream(contents));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => contents_ref,
        "Resources" => Object::Dictionary(dictionary! {
            "XObject" => Object::Dictionary(dictionary! {
                "Im0" => image_ref,
            }),
        }),
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
3291
/// Builds a single-page PDF whose content stream shows the text "Hi" in font
/// `F1` (Helvetica) at size 12, positioned with `72 700 Td`.
#[cfg(test)]
fn create_test_pdf_with_text_content() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let helvetica_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });

    let contents = Stream::new(
        lopdf::Dictionary::new(),
        b"BT /F1 12 Tf 72 700 Td (Hi) Tj ET".to_vec(),
    );
    let contents_ref = pdf.add_object(Object::Stream(contents));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => contents_ref,
        "Resources" => Object::Dictionary(dictionary! {
            "Font" => Object::Dictionary(dictionary! {
                "F1" => helvetica_id,
            }),
        }),
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
3341
/// Builds a single-page PDF with an `Info` dictionary referenced from the
/// trailer.
///
/// Each argument that is `Some` is written as a literal string under the
/// matching key (`Title`, `Author`, `Subject`, `Keywords`, `Creator`,
/// `Producer`, `CreationDate`, `ModDate`); `None` arguments are left out.
/// The `Info` dictionary is always added, even when it ends up empty.
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
fn create_test_pdf_with_metadata(
    title: Option<&str>,
    author: Option<&str>,
    subject: Option<&str>,
    keywords: Option<&str>,
    creator: Option<&str>,
    producer: Option<&str>,
    creation_date: Option<&str>,
    mod_date: Option<&str>,
) -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id: ObjectId = pdf.new_object_id();

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    // Key/value table, in the order the entries are inserted.
    let fields = [
        ("Title", title),
        ("Author", author),
        ("Subject", subject),
        ("Keywords", keywords),
        ("Creator", creator),
        ("Producer", producer),
        ("CreationDate", creation_date),
        ("ModDate", mod_date),
    ];
    let mut info = lopdf::Dictionary::new();
    for (key, value) in fields {
        if let Some(text) = value {
            info.set(key, Object::string_literal(text));
        }
    }

    let info_ref = pdf.add_object(Object::Dictionary(info));
    pdf.trailer.set("Info", Object::Reference(info_ref));

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
3415
3416#[cfg(test)]
3417mod tests {
3418 use super::*;
3419 use crate::handler::{CharEvent, ContentHandler, ImageEvent};
3420 use pdfplumber_core::PdfError;
3421
3422 struct CollectingHandler {
3425 chars: Vec<CharEvent>,
3426 images: Vec<ImageEvent>,
3427 }
3428
3429 impl CollectingHandler {
3430 fn new() -> Self {
3431 Self {
3432 chars: Vec::new(),
3433 images: Vec::new(),
3434 }
3435 }
3436 }
3437
3438 impl ContentHandler for CollectingHandler {
3439 fn on_char(&mut self, event: CharEvent) {
3440 self.chars.push(event);
3441 }
3442 fn on_image(&mut self, event: ImageEvent) {
3443 self.images.push(event);
3444 }
3445 }
3446
3447 #[test]
3450 fn open_valid_single_page_pdf() {
3451 let pdf_bytes = create_test_pdf(1);
3452 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3453 assert_eq!(LopdfBackend::page_count(&doc), 1);
3454 }
3455
3456 #[test]
3457 fn open_valid_multi_page_pdf() {
3458 let pdf_bytes = create_test_pdf(5);
3459 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3460 assert_eq!(LopdfBackend::page_count(&doc), 5);
3461 }
3462
3463 #[test]
3464 fn open_invalid_bytes_returns_error() {
3465 let result = LopdfBackend::open(b"not a pdf");
3466 assert!(result.is_err());
3467 }
3468
3469 #[test]
3470 fn open_empty_bytes_returns_error() {
3471 let result = LopdfBackend::open(&[]);
3472 assert!(result.is_err());
3473 }
3474
3475 #[test]
3476 fn open_error_converts_to_pdf_error() {
3477 let err = LopdfBackend::open(b"garbage").unwrap_err();
3478 let pdf_err: PdfError = err.into();
3479 assert!(matches!(pdf_err, PdfError::ParseError(_)));
3480 }
3481
3482 #[test]
3485 fn page_count_zero_pages() {
3486 let pdf_bytes = create_test_pdf(0);
3487 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3488 assert_eq!(LopdfBackend::page_count(&doc), 0);
3489 }
3490
3491 #[test]
3492 fn page_count_three_pages() {
3493 let pdf_bytes = create_test_pdf(3);
3494 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3495 assert_eq!(LopdfBackend::page_count(&doc), 3);
3496 }
3497
3498 #[test]
3501 fn get_page_first_page() {
3502 let pdf_bytes = create_test_pdf(3);
3503 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3504 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3505 assert_eq!(page.index, 0);
3506 }
3507
3508 #[test]
3509 fn get_page_last_page() {
3510 let pdf_bytes = create_test_pdf(3);
3511 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3512 let page = LopdfBackend::get_page(&doc, 2).unwrap();
3513 assert_eq!(page.index, 2);
3514 }
3515
3516 #[test]
3517 fn get_page_out_of_bounds() {
3518 let pdf_bytes = create_test_pdf(2);
3519 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3520 let result = LopdfBackend::get_page(&doc, 2);
3521 assert!(result.is_err());
3522 }
3523
3524 #[test]
3525 fn get_page_out_of_bounds_error_converts_to_pdf_error() {
3526 let pdf_bytes = create_test_pdf(1);
3527 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3528 let err = LopdfBackend::get_page(&doc, 5).unwrap_err();
3529 let pdf_err: PdfError = err.into();
3530 assert!(matches!(pdf_err, PdfError::ParseError(_)));
3531 assert!(pdf_err.to_string().contains("out of range"));
3532 }
3533
3534 #[test]
3535 fn get_page_on_empty_document() {
3536 let pdf_bytes = create_test_pdf(0);
3537 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3538 let result = LopdfBackend::get_page(&doc, 0);
3539 assert!(result.is_err());
3540 }
3541
3542 #[test]
3545 fn pages_have_distinct_object_ids() {
3546 let pdf_bytes = create_test_pdf(3);
3547 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3548 let page0 = LopdfBackend::get_page(&doc, 0).unwrap();
3549 let page1 = LopdfBackend::get_page(&doc, 1).unwrap();
3550 let page2 = LopdfBackend::get_page(&doc, 2).unwrap();
3551 assert_ne!(page0.object_id, page1.object_id);
3552 assert_ne!(page1.object_id, page2.object_id);
3553 assert_ne!(page0.object_id, page2.object_id);
3554 }
3555
3556 #[test]
3559 fn round_trip_open_count_access() {
3560 let pdf_bytes = create_test_pdf(4);
3561 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3562 let count = LopdfBackend::page_count(&doc);
3563 assert_eq!(count, 4);
3564
3565 for i in 0..count {
3566 let page = LopdfBackend::get_page(&doc, i).unwrap();
3567 assert_eq!(page.index, i);
3568 }
3569
3570 assert!(LopdfBackend::get_page(&doc, count).is_err());
3572 }
3573
3574 #[test]
3577 fn media_box_explicit_us_letter() {
3578 let pdf_bytes = create_test_pdf(1);
3579 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3580 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3581 let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
3582 assert_eq!(media_box, BBox::new(0.0, 0.0, 612.0, 792.0));
3583 }
3584
3585 #[test]
3586 fn media_box_inherited_from_parent() {
3587 let pdf_bytes = create_test_pdf_inherited_media_box();
3588 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3589 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3590 let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
3591 assert_eq!(media_box, BBox::new(0.0, 0.0, 595.0, 842.0));
3593 }
3594
3595 #[test]
3596 fn media_box_width_height() {
3597 let pdf_bytes = create_test_pdf(1);
3598 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3599 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3600 let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
3601 assert_eq!(media_box.width(), 612.0);
3602 assert_eq!(media_box.height(), 792.0);
3603 }
3604
3605 #[test]
3608 fn crop_box_present() {
3609 let pdf_bytes = create_test_pdf_with_crop_box();
3610 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3611 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3612 let crop_box = LopdfBackend::page_crop_box(&doc, &page).unwrap();
3613 assert_eq!(crop_box, Some(BBox::new(36.0, 36.0, 576.0, 756.0)));
3614 }
3615
3616 #[test]
3617 fn crop_box_absent() {
3618 let pdf_bytes = create_test_pdf(1);
3619 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3620 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3621 let crop_box = LopdfBackend::page_crop_box(&doc, &page).unwrap();
3622 assert_eq!(crop_box, None);
3623 }
3624
3625 #[test]
3628 fn rotate_default_zero() {
3629 let pdf_bytes = create_test_pdf(1);
3630 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3631 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3632 let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
3633 assert_eq!(rotation, 0);
3634 }
3635
3636 #[test]
3637 fn rotate_90() {
3638 let pdf_bytes = create_test_pdf_with_rotate(90);
3639 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3640 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3641 let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
3642 assert_eq!(rotation, 90);
3643 }
3644
3645 #[test]
3646 fn rotate_180() {
3647 let pdf_bytes = create_test_pdf_with_rotate(180);
3648 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3649 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3650 let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
3651 assert_eq!(rotation, 180);
3652 }
3653
3654 #[test]
3655 fn rotate_270() {
3656 let pdf_bytes = create_test_pdf_with_rotate(270);
3657 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3658 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3659 let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
3660 assert_eq!(rotation, 270);
3661 }
3662
3663 #[test]
3664 fn rotate_inherited_from_parent() {
3665 let pdf_bytes = create_test_pdf_inherited_rotate(90);
3666 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3667 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3668 let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
3669 assert_eq!(rotation, 90);
3670 }
3671
3672 #[test]
3675 fn page_properties_round_trip() {
3676 let pdf_bytes = create_test_pdf_with_crop_box();
3677 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3678 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3679
3680 let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
3681 let crop_box = LopdfBackend::page_crop_box(&doc, &page).unwrap();
3682 let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
3683
3684 assert_eq!(media_box, BBox::new(0.0, 0.0, 612.0, 792.0));
3685 assert!(crop_box.is_some());
3686 assert_eq!(rotation, 0);
3687 }
3688
3689 #[test]
3692 fn interpret_page_simple_text() {
3693 let pdf_bytes = create_test_pdf_with_text_content();
3694 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3695 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3696 let options = ExtractOptions::default();
3697 let mut handler = CollectingHandler::new();
3698
3699 LopdfBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap();
3700
3701 assert_eq!(handler.chars.len(), 2);
3703 assert_eq!(handler.chars[0].char_code, b'H' as u32);
3704 assert_eq!(handler.chars[1].char_code, b'i' as u32);
3705 assert_eq!(handler.chars[0].font_size, 12.0);
3706 assert_eq!(handler.chars[0].font_name, "Helvetica");
3707 }
3708
3709 #[test]
3710 fn interpret_page_no_content() {
3711 let pdf_bytes = create_test_pdf(1);
3712 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3713 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3714 let options = ExtractOptions::default();
3715 let mut handler = CollectingHandler::new();
3716
3717 LopdfBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap();
3719 assert_eq!(handler.chars.len(), 0);
3720 }
3721
3722 #[test]
3725 fn interpret_page_form_xobject_text() {
3726 let pdf_bytes = create_test_pdf_with_form_xobject();
3727 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3728 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3729 let options = ExtractOptions::default();
3730 let mut handler = CollectingHandler::new();
3731
3732 LopdfBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap();
3733
3734 assert_eq!(handler.chars.len(), 5);
3736 assert_eq!(handler.chars[0].char_code, b'H' as u32);
3737 assert_eq!(handler.chars[1].char_code, b'e' as u32);
3738 assert_eq!(handler.chars[2].char_code, b'l' as u32);
3739 assert_eq!(handler.chars[3].char_code, b'l' as u32);
3740 assert_eq!(handler.chars[4].char_code, b'o' as u32);
3741 assert_eq!(handler.chars[0].font_name, "Helvetica");
3742 assert_eq!(handler.chars[0].font_size, 12.0);
3743 }
3744
3745 #[test]
3746 fn interpret_page_nested_form_xobjects() {
3747 let pdf_bytes = create_test_pdf_with_nested_form_xobjects();
3748 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3749 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3750 let options = ExtractOptions::default();
3751 let mut handler = CollectingHandler::new();
3752
3753 LopdfBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap();
3754
3755 assert_eq!(handler.chars.len(), 4);
3757 assert_eq!(handler.chars[0].char_code, b'D' as u32);
3758 assert_eq!(handler.chars[1].char_code, b'e' as u32);
3759 assert_eq!(handler.chars[2].char_code, b'e' as u32);
3760 assert_eq!(handler.chars[3].char_code, b'p' as u32);
3761 }
3762
3763 #[test]
3764 fn interpret_page_form_xobject_matrix_applied() {
3765 let pdf_bytes = create_test_pdf_form_xobject_with_matrix();
3766 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3767 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3768 let options = ExtractOptions::default();
3769 let mut handler = CollectingHandler::new();
3770
3771 LopdfBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap();
3772
3773 assert_eq!(handler.chars.len(), 1);
3775 assert_eq!(handler.chars[0].char_code, b'A' as u32);
3776 let ctm = handler.chars[0].ctm;
3778 assert!((ctm[0] - 2.0).abs() < 0.01);
3780 assert!((ctm[3] - 2.0).abs() < 0.01);
3781 assert!((ctm[4] - 10.0).abs() < 0.01);
3782 assert!((ctm[5] - 20.0).abs() < 0.01);
3783 }
3784
3785 #[test]
3786 fn interpret_page_form_xobject_state_restored() {
3787 let pdf_bytes = create_test_pdf_with_form_xobject();
3791 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3792 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3793 let options = ExtractOptions::default();
3794 let mut handler = CollectingHandler::new();
3795
3796 let result = LopdfBackend::interpret_page(&doc, &page, &mut handler, &options);
3798 assert!(result.is_ok());
3799 }
3800
3801 #[test]
3802 fn interpret_page_image_xobject() {
3803 let pdf_bytes = create_test_pdf_with_image_xobject();
3804 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3805 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3806 let options = ExtractOptions::default();
3807 let mut handler = CollectingHandler::new();
3808
3809 LopdfBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap();
3810
3811 assert_eq!(handler.chars.len(), 0);
3813 assert_eq!(handler.images.len(), 1);
3814 assert_eq!(handler.images[0].name, "Im0");
3815 assert_eq!(handler.images[0].width, 2);
3816 assert_eq!(handler.images[0].height, 2);
3817 assert_eq!(handler.images[0].colorspace.as_deref(), Some("DeviceRGB"));
3818 assert_eq!(handler.images[0].bits_per_component, Some(8));
3819 let ctm = handler.images[0].ctm;
3821 assert!((ctm[0] - 200.0).abs() < 0.01);
3822 assert!((ctm[3] - 150.0).abs() < 0.01);
3823 assert!((ctm[4] - 100.0).abs() < 0.01);
3824 assert!((ctm[5] - 300.0).abs() < 0.01);
3825 }
3826
3827 #[test]
3828 fn interpret_page_recursion_limit() {
3829 let pdf_bytes = create_test_pdf_with_form_xobject();
3831 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3832 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3833 let mut options = ExtractOptions::default();
3834 options.max_recursion_depth = 0; let mut handler = CollectingHandler::new();
3836
3837 let result = LopdfBackend::interpret_page(&doc, &page, &mut handler, &options);
3838 assert!(result.is_err());
3839 let err_msg = result.unwrap_err().to_string();
3840 assert!(err_msg.contains("recursion depth"));
3841 }
3842
3843 #[test]
3846 fn metadata_full_info_dictionary() {
3847 let pdf_bytes = create_test_pdf_with_metadata(
3848 Some("Test Document"),
3849 Some("John Doe"),
3850 Some("Testing metadata"),
3851 Some("test, pdf, rust"),
3852 Some("LibreOffice"),
3853 Some("pdfplumber-rs"),
3854 Some("D:20240101120000+00'00'"),
3855 Some("D:20240615153000+00'00'"),
3856 );
3857 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3858 let meta = LopdfBackend::document_metadata(&doc).unwrap();
3859
3860 assert_eq!(meta.title.as_deref(), Some("Test Document"));
3861 assert_eq!(meta.author.as_deref(), Some("John Doe"));
3862 assert_eq!(meta.subject.as_deref(), Some("Testing metadata"));
3863 assert_eq!(meta.keywords.as_deref(), Some("test, pdf, rust"));
3864 assert_eq!(meta.creator.as_deref(), Some("LibreOffice"));
3865 assert_eq!(meta.producer.as_deref(), Some("pdfplumber-rs"));
3866 assert_eq!(
3867 meta.creation_date.as_deref(),
3868 Some("D:20240101120000+00'00'")
3869 );
3870 assert_eq!(meta.mod_date.as_deref(), Some("D:20240615153000+00'00'"));
3871 assert!(!meta.is_empty());
3872 }
3873
3874 #[test]
3875 fn metadata_partial_info_dictionary() {
3876 let pdf_bytes = create_test_pdf_with_metadata(
3877 Some("Only Title"),
3878 None,
3879 None,
3880 None,
3881 None,
3882 Some("A Producer"),
3883 None,
3884 None,
3885 );
3886 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3887 let meta = LopdfBackend::document_metadata(&doc).unwrap();
3888
3889 assert_eq!(meta.title.as_deref(), Some("Only Title"));
3890 assert_eq!(meta.author, None);
3891 assert_eq!(meta.subject, None);
3892 assert_eq!(meta.keywords, None);
3893 assert_eq!(meta.creator, None);
3894 assert_eq!(meta.producer.as_deref(), Some("A Producer"));
3895 assert_eq!(meta.creation_date, None);
3896 assert_eq!(meta.mod_date, None);
3897 assert!(!meta.is_empty());
3898 }
3899
3900 #[test]
3901 fn metadata_no_info_dictionary() {
3902 let pdf_bytes = create_test_pdf(1);
3904 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3905 let meta = LopdfBackend::document_metadata(&doc).unwrap();
3906
3907 assert!(meta.is_empty());
3908 assert_eq!(meta.title, None);
3909 assert_eq!(meta.author, None);
3910 }
3911
3912 #[test]
3915 fn extract_image_content_raw_data() {
3916 let pdf_bytes = create_test_pdf_with_image_xobject();
3917 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3918 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3919
3920 let content = LopdfBackend::extract_image_content(&doc, &page, "Im0").unwrap();
3921
3922 assert_eq!(content.format, pdfplumber_core::ImageFormat::Raw);
3923 assert_eq!(content.width, 2);
3924 assert_eq!(content.height, 2);
3925 assert_eq!(content.data.len(), 12);
3927 assert_eq!(
3928 content.data,
3929 vec![255, 0, 0, 0, 255, 0, 0, 0, 255, 255, 255, 0]
3930 );
3931 }
3932
3933 #[test]
3934 fn extract_image_content_not_found() {
3935 let pdf_bytes = create_test_pdf_with_image_xobject();
3936 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3937 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3938
3939 let result = LopdfBackend::extract_image_content(&doc, &page, "NonExistent");
3940 assert!(result.is_err());
3941 let err_msg = result.unwrap_err().to_string();
3942 assert!(err_msg.contains("not found"));
3943 }
3944
3945 #[test]
3946 fn extract_image_content_jpeg() {
3947 let pdf_bytes = create_test_pdf_with_jpeg_image();
3949 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3950 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3951
3952 let content = LopdfBackend::extract_image_content(&doc, &page, "Im0").unwrap();
3953
3954 assert_eq!(content.format, pdfplumber_core::ImageFormat::Jpeg);
3955 assert_eq!(content.width, 2);
3956 assert_eq!(content.height, 2);
3957 assert!(content.data.starts_with(&[0xFF, 0xD8]));
3959 }
3960
3961 #[test]
3962 fn extract_image_content_no_xobject_resources() {
3963 let pdf_bytes = create_test_pdf_with_text_content();
3965 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3966 let page = LopdfBackend::get_page(&doc, 0).unwrap();
3967
3968 let result = LopdfBackend::extract_image_content(&doc, &page, "Im0");
3969 assert!(result.is_err());
3970 }
3971
3972 const PAD_BYTES: [u8; 32] = [
3976 0x28, 0xBF, 0x4E, 0x5E, 0x4E, 0x75, 0x8A, 0x41, 0x64, 0x00, 0x4E, 0x56, 0xFF, 0xFA, 0x01,
3977 0x08, 0x2E, 0x2E, 0x00, 0xB6, 0xD0, 0x68, 0x3E, 0x80, 0x2F, 0x0C, 0xA9, 0xFE, 0x64, 0x53,
3978 0x69, 0x7A,
3979 ];
3980
3981 fn rc4_transform(key: &[u8], data: &[u8]) -> Vec<u8> {
3983 let mut s: Vec<u8> = (0..=255).collect();
3985 let mut j: usize = 0;
3986 for i in 0..256 {
3987 j = (j + s[i] as usize + key[i % key.len()] as usize) & 0xFF;
3988 s.swap(i, j);
3989 }
3990 let mut out = Vec::with_capacity(data.len());
3992 let mut i: usize = 0;
3993 j = 0;
3994 for &byte in data {
3995 i = (i + 1) & 0xFF;
3996 j = (j + s[i] as usize) & 0xFF;
3997 s.swap(i, j);
3998 let k = s[(s[i] as usize + s[j] as usize) & 0xFF];
3999 out.push(byte ^ k);
4000 }
4001 out
4002 }
4003
    /// Builds a one-page PDF ("Hello World" in Helvetica) and encrypts it by hand
    /// with a 40-bit RC4 key, returning the serialized bytes.
    ///
    /// The trailer gets an `Encrypt` dictionary with `Filter = Standard`,
    /// `V = 1`, `R = 2`, `Length = 40`, plus a fixed two-element `ID` array.
    /// The key derivation appears to follow the PDF standard security handler
    /// (revision 2) algorithms — see the PDF specification to confirm details.
    ///
    /// `user_password` is truncated to 32 bytes and padded with [`PAD_BYTES`].
    /// The same padded password is used to derive the `O` entry, i.e. no
    /// separate owner password is involved.
    ///
    /// # Panics
    ///
    /// Panics if the document cannot be serialized.
    fn create_encrypted_test_pdf(user_password: &[u8]) -> Vec<u8> {
        use lopdf::{Document, Object, ObjectId, Stream, StringFormat, dictionary};

        // Fixed inputs to the key derivation: the file identifier and the /P value.
        let file_id = b"testfileid123456";
        let permissions: i32 = -4;

        // Pad (or truncate) the password to exactly 32 bytes.
        let mut padded_pw = Vec::with_capacity(32);
        let pw_len = user_password.len().min(32);
        padded_pw.extend_from_slice(&user_password[..pw_len]);
        padded_pw.extend_from_slice(&PAD_BYTES[..32 - pw_len]);

        // O entry: RC4-encrypt the padded password with the first 5 bytes of its
        // own MD5 digest.
        let o_key_digest = md5::compute(&padded_pw);
        let o_key = &o_key_digest[..5];
        let o_value = rc4_transform(o_key, &padded_pw);

        // File encryption key: first 5 bytes of
        // MD5(padded password || O || P as little-endian u32 || file id).
        let mut key_input = Vec::with_capacity(128);
        key_input.extend_from_slice(&padded_pw);
        key_input.extend_from_slice(&o_value);
        key_input.extend_from_slice(&(permissions as u32).to_le_bytes());
        key_input.extend_from_slice(file_id);
        let key_digest = md5::compute(&key_input);
        let enc_key = key_digest[..5].to_vec();

        // U entry: the padding string encrypted with the file encryption key.
        let u_value = rc4_transform(&enc_key, &PAD_BYTES);

        // Build the plaintext document first.
        let mut doc = Document::with_version("1.5");
        // Reserve the Pages id up front so the page can name it as its parent.
        let pages_id: ObjectId = doc.new_object_id();

        let content_bytes = b"BT /F1 12 Tf 72 720 Td (Hello World) Tj ET";
        let stream = Stream::new(dictionary! {}, content_bytes.to_vec());
        let content_id = doc.add_object(Object::Stream(stream));

        let font_id = doc.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        });

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            "Contents" => Object::Reference(content_id),
            "Resources" => dictionary! {
                "Font" => dictionary! {
                    "F1" => Object::Reference(font_id),
                },
            },
        });

        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => vec![Object::Reference(page_id)],
                "Count" => 1_i64,
            }),
        );

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        // Encrypt every object added so far. Only top-level streams and strings
        // are transformed; strings nested inside dictionaries are not visited.
        for (&obj_id, obj) in doc.objects.iter_mut() {
            // Per-object key: MD5(file key || low 3 bytes of the object number
            // || low 2 bytes of the generation number), truncated to
            // min(key length + 5, 16) bytes.
            let mut obj_key_input = Vec::with_capacity(10);
            obj_key_input.extend_from_slice(&enc_key);
            obj_key_input.extend_from_slice(&obj_id.0.to_le_bytes()[..3]);
            obj_key_input.extend_from_slice(&obj_id.1.to_le_bytes()[..2]);
            let obj_key_digest = md5::compute(&obj_key_input);
            let obj_key_len = (enc_key.len() + 5).min(16);
            let obj_key = &obj_key_digest[..obj_key_len];

            match obj {
                Object::Stream(stream) => {
                    let encrypted = rc4_transform(obj_key, &stream.content);
                    stream.set_content(encrypted);
                }
                Object::String(content, _) => {
                    let encrypted = rc4_transform(obj_key, content);
                    *content = encrypted;
                }
                _ => {}
            }
        }

        // The Encrypt dictionary is added after the encryption pass above.
        let encrypt_id = doc.add_object(dictionary! {
            "Filter" => "Standard",
            "V" => 1_i64,
            "R" => 2_i64,
            "Length" => 40_i64,
            "O" => Object::String(o_value, StringFormat::Literal),
            "U" => Object::String(u_value, StringFormat::Literal),
            "P" => permissions as i64,
        });
        doc.trailer.set("Encrypt", Object::Reference(encrypt_id));

        // The file id fed into the key derivation must also appear in the trailer.
        doc.trailer.set(
            "ID",
            Object::Array(vec![
                Object::String(file_id.to_vec(), StringFormat::Literal),
                Object::String(file_id.to_vec(), StringFormat::Literal),
            ]),
        );

        let mut buf = Vec::new();
        doc.save_to(&mut buf)
            .expect("failed to save encrypted test PDF");
        buf
    }
4127
4128 #[test]
4131 fn open_encrypted_pdf_without_password_returns_password_required() {
4132 let pdf_bytes = create_encrypted_test_pdf(b"secret123");
4133 let result = LopdfBackend::open(&pdf_bytes);
4134 assert!(result.is_err());
4135 let err: pdfplumber_core::PdfError = result.unwrap_err().into();
4136 assert_eq!(err, pdfplumber_core::PdfError::PasswordRequired);
4137 }
4138
4139 #[test]
4140 fn open_encrypted_pdf_with_correct_password() {
4141 let password = b"secret123";
4142 let pdf_bytes = create_encrypted_test_pdf(password);
4143 let result = LopdfBackend::open_with_password(&pdf_bytes, password);
4144 assert!(result.is_ok());
4145 let doc = result.unwrap();
4146 assert_eq!(LopdfBackend::page_count(&doc), 1);
4147 }
4148
4149 #[test]
4150 fn open_encrypted_pdf_with_wrong_password_returns_invalid_password() {
4151 let pdf_bytes = create_encrypted_test_pdf(b"secret123");
4152 let result = LopdfBackend::open_with_password(&pdf_bytes, b"wrongpassword");
4153 assert!(result.is_err());
4154 let err: pdfplumber_core::PdfError = result.unwrap_err().into();
4155 assert_eq!(err, pdfplumber_core::PdfError::InvalidPassword);
4156 }
4157
4158 #[test]
4159 fn open_unencrypted_pdf_with_password_succeeds() {
4160 let pdf_bytes = create_test_pdf(1);
4162 let result = LopdfBackend::open_with_password(&pdf_bytes, b"anypassword");
4163 assert!(result.is_ok());
4164 let doc = result.unwrap();
4165 assert_eq!(LopdfBackend::page_count(&doc), 1);
4166 }
4167
4168 #[test]
4169 fn open_encrypted_pdf_with_empty_password() {
4170 let pdf_bytes = create_encrypted_test_pdf(b"");
4172 let result = LopdfBackend::open_with_password(&pdf_bytes, b"");
4173 assert!(result.is_ok());
4174 let doc = result.unwrap();
4175 assert_eq!(LopdfBackend::page_count(&doc), 1);
4176 }
4177
4178 fn create_test_pdf_with_form_fields() -> Vec<u8> {
4182 use lopdf::{Document, Object, ObjectId, dictionary};
4183
4184 let mut doc = Document::with_version("1.7");
4185 let pages_id: ObjectId = doc.new_object_id();
4186
4187 let page_id = doc.add_object(dictionary! {
4189 "Type" => "Page",
4190 "Parent" => pages_id,
4191 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
4192 });
4193
4194 doc.objects.insert(
4195 pages_id,
4196 Object::Dictionary(dictionary! {
4197 "Type" => "Pages",
4198 "Kids" => vec![Object::Reference(page_id)],
4199 "Count" => Object::Integer(1),
4200 }),
4201 );
4202
4203 let text_field_id = doc.add_object(dictionary! {
4205 "Type" => "Annot",
4206 "Subtype" => "Widget",
4207 "T" => Object::string_literal("name"),
4208 "FT" => "Tx",
4209 "V" => Object::string_literal("John Doe"),
4210 "DV" => Object::string_literal(""),
4211 "Rect" => vec![50.into(), 700.into(), 200.into(), 720.into()],
4212 "Ff" => Object::Integer(0),
4213 "P" => Object::Reference(page_id),
4214 });
4215
4216 let checkbox_field_id = doc.add_object(dictionary! {
4218 "Type" => "Annot",
4219 "Subtype" => "Widget",
4220 "T" => Object::string_literal("agree"),
4221 "FT" => "Btn",
4222 "V" => "Yes",
4223 "DV" => "Off",
4224 "Rect" => vec![50.into(), 650.into(), 70.into(), 670.into()],
4225 "Ff" => Object::Integer(0),
4226 "P" => Object::Reference(page_id),
4227 });
4228
4229 let radio_field_id = doc.add_object(dictionary! {
4231 "Type" => "Annot",
4232 "Subtype" => "Widget",
4233 "T" => Object::string_literal("gender"),
4234 "FT" => "Btn",
4235 "V" => "Male",
4236 "Rect" => vec![50.into(), 600.into(), 70.into(), 620.into()],
4237 "Ff" => Object::Integer(49152), "P" => Object::Reference(page_id),
4239 });
4240
4241 let dropdown_field_id = doc.add_object(dictionary! {
4243 "Type" => "Annot",
4244 "Subtype" => "Widget",
4245 "T" => Object::string_literal("country"),
4246 "FT" => "Ch",
4247 "V" => Object::string_literal("US"),
4248 "Rect" => vec![50.into(), 550.into(), 200.into(), 570.into()],
4249 "Opt" => vec![
4250 Object::string_literal("US"),
4251 Object::string_literal("UK"),
4252 Object::string_literal("FR"),
4253 ],
4254 "Ff" => Object::Integer(0),
4255 "P" => Object::Reference(page_id),
4256 });
4257
4258 let empty_field_id = doc.add_object(dictionary! {
4260 "Type" => "Annot",
4261 "Subtype" => "Widget",
4262 "T" => Object::string_literal("email"),
4263 "FT" => "Tx",
4264 "Rect" => vec![50.into(), 500.into(), 200.into(), 520.into()],
4265 "Ff" => Object::Integer(0),
4266 "P" => Object::Reference(page_id),
4267 });
4268
4269 let acroform_id = doc.add_object(dictionary! {
4271 "Fields" => vec![
4272 Object::Reference(text_field_id),
4273 Object::Reference(checkbox_field_id),
4274 Object::Reference(radio_field_id),
4275 Object::Reference(dropdown_field_id),
4276 Object::Reference(empty_field_id),
4277 ],
4278 });
4279
4280 let catalog_id = doc.add_object(dictionary! {
4282 "Type" => "Catalog",
4283 "Pages" => pages_id,
4284 "AcroForm" => Object::Reference(acroform_id),
4285 });
4286 doc.trailer.set("Root", catalog_id);
4287
4288 let mut buf = Vec::new();
4289 doc.save_to(&mut buf).expect("failed to save test PDF");
4290 buf
4291 }
4292
4293 #[test]
4294 fn form_fields_text_field() {
4295 let pdf_bytes = create_test_pdf_with_form_fields();
4296 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4297 let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4298
4299 let text_field = fields.iter().find(|f| f.name == "name").unwrap();
4300 assert_eq!(text_field.field_type, FieldType::Text);
4301 assert_eq!(text_field.value.as_deref(), Some("John Doe"));
4302 assert_eq!(text_field.default_value.as_deref(), Some(""));
4303 }
4304
4305 #[test]
4306 fn form_fields_checkbox() {
4307 let pdf_bytes = create_test_pdf_with_form_fields();
4308 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4309 let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4310
4311 let checkbox = fields.iter().find(|f| f.name == "agree").unwrap();
4312 assert_eq!(checkbox.field_type, FieldType::Button);
4313 assert_eq!(checkbox.value.as_deref(), Some("Yes"));
4314 assert_eq!(checkbox.default_value.as_deref(), Some("Off"));
4315 }
4316
4317 #[test]
4318 fn form_fields_radio_button() {
4319 let pdf_bytes = create_test_pdf_with_form_fields();
4320 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4321 let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4322
4323 let radio = fields.iter().find(|f| f.name == "gender").unwrap();
4324 assert_eq!(radio.field_type, FieldType::Button);
4325 assert_eq!(radio.value.as_deref(), Some("Male"));
4326 assert_eq!(radio.flags, 49152); }
4328
4329 #[test]
4330 fn form_fields_dropdown_with_options() {
4331 let pdf_bytes = create_test_pdf_with_form_fields();
4332 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4333 let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4334
4335 let dropdown = fields.iter().find(|f| f.name == "country").unwrap();
4336 assert_eq!(dropdown.field_type, FieldType::Choice);
4337 assert_eq!(dropdown.value.as_deref(), Some("US"));
4338 assert_eq!(dropdown.options, vec!["US", "UK", "FR"]);
4339 }
4340
4341 #[test]
4342 fn form_fields_no_value() {
4343 let pdf_bytes = create_test_pdf_with_form_fields();
4344 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4345 let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4346
4347 let empty = fields.iter().find(|f| f.name == "email").unwrap();
4348 assert_eq!(empty.field_type, FieldType::Text);
4349 assert!(empty.value.is_none());
4350 assert!(empty.default_value.is_none());
4351 }
4352
4353 #[test]
4354 fn form_fields_count() {
4355 let pdf_bytes = create_test_pdf_with_form_fields();
4356 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4357 let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4358 assert_eq!(fields.len(), 5);
4359 }
4360
4361 #[test]
4362 fn form_fields_no_acroform_returns_empty() {
4363 let pdf_bytes = create_test_pdf(1);
4364 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4365 let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4366 assert!(fields.is_empty());
4367 }
4368
4369 #[test]
4370 fn form_fields_have_bbox() {
4371 let pdf_bytes = create_test_pdf_with_form_fields();
4372 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4373 let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4374
4375 let text_field = fields.iter().find(|f| f.name == "name").unwrap();
4376 assert!((text_field.bbox.x0 - 50.0).abs() < 0.1);
4377 assert!((text_field.bbox.x1 - 200.0).abs() < 0.1);
4378 }
4379
4380 #[test]
4381 fn form_fields_have_page_index() {
4382 let pdf_bytes = create_test_pdf_with_form_fields();
4383 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4384 let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4385
4386 for field in &fields {
4388 assert_eq!(field.page_index, Some(0));
4389 }
4390 }
4391
4392 fn create_test_pdf_with_structure_tree() -> Vec<u8> {
4398 use lopdf::{Document, Object, ObjectId, Stream, dictionary};
4399
4400 let mut doc = Document::with_version("1.7");
4401 let pages_id: ObjectId = doc.new_object_id();
4402
4403 let content = b"BT /F1 24 Tf /H1 <</MCID 0>> BDC 72 700 Td (Chapter 1) Tj EMC /P <</MCID 1>> BDC /F1 12 Tf 72 670 Td (This is paragraph text.) Tj EMC ET";
4405 let stream = Stream::new(dictionary! {}, content.to_vec());
4406 let content_id = doc.add_object(Object::Stream(stream));
4407
4408 let font_id = doc.add_object(dictionary! {
4409 "Type" => "Font",
4410 "Subtype" => "Type1",
4411 "BaseFont" => "Helvetica",
4412 });
4413
4414 let page_id = doc.add_object(dictionary! {
4415 "Type" => "Page",
4416 "Parent" => pages_id,
4417 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
4418 "Contents" => Object::Reference(content_id),
4419 "Resources" => dictionary! {
4420 "Font" => dictionary! {
4421 "F1" => Object::Reference(font_id),
4422 },
4423 },
4424 });
4425
4426 doc.objects.insert(
4427 pages_id,
4428 Object::Dictionary(dictionary! {
4429 "Type" => "Pages",
4430 "Kids" => vec![Object::Reference(page_id)],
4431 "Count" => Object::Integer(1),
4432 }),
4433 );
4434
4435 let h1_elem_id = doc.add_object(dictionary! {
4438 "Type" => "StructElem",
4439 "S" => "H1",
4440 "K" => Object::Integer(0),
4441 "Pg" => Object::Reference(page_id),
4442 });
4443
4444 let p_elem_id = doc.add_object(dictionary! {
4446 "Type" => "StructElem",
4447 "S" => "P",
4448 "K" => Object::Integer(1),
4449 "Pg" => Object::Reference(page_id),
4450 "Lang" => Object::string_literal("en-US"),
4451 });
4452
4453 let doc_elem_id = doc.add_object(dictionary! {
4455 "Type" => "StructElem",
4456 "S" => "Document",
4457 "K" => vec![
4458 Object::Reference(h1_elem_id),
4459 Object::Reference(p_elem_id),
4460 ],
4461 });
4462
4463 let struct_tree_id = doc.add_object(dictionary! {
4465 "Type" => "StructTreeRoot",
4466 "K" => Object::Reference(doc_elem_id),
4467 });
4468
4469 let mark_info_id = doc.add_object(dictionary! {
4471 "Marked" => Object::Boolean(true),
4472 });
4473
4474 let catalog_id = doc.add_object(dictionary! {
4476 "Type" => "Catalog",
4477 "Pages" => pages_id,
4478 "StructTreeRoot" => Object::Reference(struct_tree_id),
4479 "MarkInfo" => Object::Reference(mark_info_id),
4480 });
4481 doc.trailer.set("Root", catalog_id);
4482
4483 let mut buf = Vec::new();
4484 doc.save_to(&mut buf)
4485 .expect("failed to save tagged test PDF");
4486 buf
4487 }
4488
4489 fn create_test_pdf_with_table_structure() -> Vec<u8> {
4491 use lopdf::{Document, Object, ObjectId, Stream, dictionary};
4492
4493 let mut doc = Document::with_version("1.7");
4494 let pages_id: ObjectId = doc.new_object_id();
4495
4496 let content = b"BT /F1 12 Tf 72 700 Td (Cell 1) Tj ET";
4497 let stream = Stream::new(dictionary! {}, content.to_vec());
4498 let content_id = doc.add_object(Object::Stream(stream));
4499
4500 let font_id = doc.add_object(dictionary! {
4501 "Type" => "Font",
4502 "Subtype" => "Type1",
4503 "BaseFont" => "Helvetica",
4504 });
4505
4506 let page_id = doc.add_object(dictionary! {
4507 "Type" => "Page",
4508 "Parent" => pages_id,
4509 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
4510 "Contents" => Object::Reference(content_id),
4511 "Resources" => dictionary! {
4512 "Font" => dictionary! {
4513 "F1" => Object::Reference(font_id),
4514 },
4515 },
4516 });
4517
4518 doc.objects.insert(
4519 pages_id,
4520 Object::Dictionary(dictionary! {
4521 "Type" => "Pages",
4522 "Kids" => vec![Object::Reference(page_id)],
4523 "Count" => Object::Integer(1),
4524 }),
4525 );
4526
4527 let td1_id = doc.add_object(dictionary! {
4529 "Type" => "StructElem",
4530 "S" => "TD",
4531 "K" => Object::Integer(0),
4532 "Pg" => Object::Reference(page_id),
4533 });
4534
4535 let td2_id = doc.add_object(dictionary! {
4536 "Type" => "StructElem",
4537 "S" => "TD",
4538 "K" => Object::Integer(1),
4539 "Pg" => Object::Reference(page_id),
4540 });
4541
4542 let tr_id = doc.add_object(dictionary! {
4543 "Type" => "StructElem",
4544 "S" => "TR",
4545 "K" => vec![Object::Reference(td1_id), Object::Reference(td2_id)],
4546 });
4547
4548 let table_id = doc.add_object(dictionary! {
4549 "Type" => "StructElem",
4550 "S" => "Table",
4551 "K" => Object::Reference(tr_id),
4552 "Pg" => Object::Reference(page_id),
4553 });
4554
4555 let struct_tree_id = doc.add_object(dictionary! {
4556 "Type" => "StructTreeRoot",
4557 "K" => Object::Reference(table_id),
4558 });
4559
4560 let catalog_id = doc.add_object(dictionary! {
4561 "Type" => "Catalog",
4562 "Pages" => pages_id,
4563 "StructTreeRoot" => Object::Reference(struct_tree_id),
4564 });
4565 doc.trailer.set("Root", catalog_id);
4566
4567 let mut buf = Vec::new();
4568 doc.save_to(&mut buf).expect("failed to save test PDF");
4569 buf
4570 }
4571
4572 #[test]
4573 fn structure_tree_tagged_pdf_has_elements() {
4574 let pdf_bytes = create_test_pdf_with_structure_tree();
4575 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4576 let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4577
4578 assert!(!elements.is_empty());
4579 }
4580
4581 #[test]
4582 fn structure_tree_document_root_element() {
4583 let pdf_bytes = create_test_pdf_with_structure_tree();
4584 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4585 let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4586
4587 assert_eq!(elements.len(), 1);
4589 assert_eq!(elements[0].element_type, "Document");
4590 assert_eq!(elements[0].children.len(), 2);
4591 }
4592
4593 #[test]
4594 fn structure_tree_heading_element() {
4595 let pdf_bytes = create_test_pdf_with_structure_tree();
4596 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4597 let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4598
4599 let doc_elem = &elements[0];
4600 let h1 = &doc_elem.children[0];
4601 assert_eq!(h1.element_type, "H1");
4602 assert_eq!(h1.mcids, vec![0]);
4603 assert_eq!(h1.page_index, Some(0));
4604 }
4605
4606 #[test]
4607 fn structure_tree_paragraph_element() {
4608 let pdf_bytes = create_test_pdf_with_structure_tree();
4609 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4610 let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4611
4612 let doc_elem = &elements[0];
4613 let p = &doc_elem.children[1];
4614 assert_eq!(p.element_type, "P");
4615 assert_eq!(p.mcids, vec![1]);
4616 assert_eq!(p.page_index, Some(0));
4617 assert_eq!(p.lang.as_deref(), Some("en-US"));
4618 }
4619
4620 #[test]
4621 fn structure_tree_untagged_pdf_returns_empty() {
4622 let pdf_bytes = create_test_pdf_with_text_content();
4624 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4625 let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4626
4627 assert!(elements.is_empty());
4628 }
4629
4630 #[test]
4631 fn structure_tree_table_nested_structure() {
4632 let pdf_bytes = create_test_pdf_with_table_structure();
4633 let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4634 let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4635
4636 assert_eq!(elements.len(), 1);
4638 let table = &elements[0];
4639 assert_eq!(table.element_type, "Table");
4640
4641 assert_eq!(table.children.len(), 1);
4643 let tr = &table.children[0];
4644 assert_eq!(tr.element_type, "TR");
4645
4646 assert_eq!(tr.children.len(), 2);
4648 assert_eq!(tr.children[0].element_type, "TD");
4649 assert_eq!(tr.children[0].mcids, vec![0]);
4650 assert_eq!(tr.children[1].element_type, "TD");
4651 assert_eq!(tr.children[1].mcids, vec![1]);
4652 }
4653
4654 #[test]
4655 fn structure_tree_mcr_dictionary_handling() {
4656 use lopdf::{Document, Object, ObjectId, Stream, dictionary};
4658
4659 let mut doc = Document::with_version("1.7");
4660 let pages_id: ObjectId = doc.new_object_id();
4661
4662 let content = b"BT /F1 12 Tf 72 700 Td (text) Tj ET";
4663 let stream = Stream::new(dictionary! {}, content.to_vec());
4664 let content_id = doc.add_object(Object::Stream(stream));
4665
4666 let font_id = doc.add_object(dictionary! {
4667 "Type" => "Font",
4668 "Subtype" => "Type1",
4669 "BaseFont" => "Helvetica",
4670 });
4671
4672 let page_id = doc.add_object(dictionary! {
4673 "Type" => "Page",
4674 "Parent" => pages_id,
4675 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
4676 "Contents" => Object::Reference(content_id),
4677 "Resources" => dictionary! {
4678 "Font" => dictionary! {
4679 "F1" => Object::Reference(font_id),
4680 },
4681 },
4682 });
4683
4684 doc.objects.insert(
4685 pages_id,
4686 Object::Dictionary(dictionary! {
4687 "Type" => "Pages",
4688 "Kids" => vec![Object::Reference(page_id)],
4689 "Count" => Object::Integer(1),
4690 }),
4691 );
4692
4693 let p_elem_id = doc.add_object(dictionary! {
4695 "Type" => "StructElem",
4696 "S" => "P",
4697 "K" => dictionary! {
4698 "Type" => "MCR",
4699 "MCID" => Object::Integer(5),
4700 "Pg" => Object::Reference(page_id),
4701 },
4702 "Pg" => Object::Reference(page_id),
4703 });
4704
4705 let struct_tree_id = doc.add_object(dictionary! {
4706 "Type" => "StructTreeRoot",
4707 "K" => Object::Reference(p_elem_id),
4708 });
4709
4710 let catalog_id = doc.add_object(dictionary! {
4711 "Type" => "Catalog",
4712 "Pages" => pages_id,
4713 "StructTreeRoot" => Object::Reference(struct_tree_id),
4714 });
4715 doc.trailer.set("Root", catalog_id);
4716
4717 let mut buf = Vec::new();
4718 doc.save_to(&mut buf).expect("failed to save test PDF");
4719
4720 let doc = LopdfBackend::open(&buf).unwrap();
4721 let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4722
4723 assert_eq!(elements.len(), 1);
4724 let p = &elements[0];
4725 assert_eq!(p.element_type, "P");
4726 assert_eq!(p.mcids, vec![5]); }
4728
4729 #[test]
4730 fn structure_tree_alt_text() {
4731 use lopdf::{Document, Object, ObjectId, Stream, dictionary};
4732
4733 let mut doc = Document::with_version("1.7");
4734 let pages_id: ObjectId = doc.new_object_id();
4735
4736 let content = b"BT /F1 12 Tf 72 700 Td (image) Tj ET";
4737 let stream = Stream::new(dictionary! {}, content.to_vec());
4738 let content_id = doc.add_object(Object::Stream(stream));
4739
4740 let font_id = doc.add_object(dictionary! {
4741 "Type" => "Font",
4742 "Subtype" => "Type1",
4743 "BaseFont" => "Helvetica",
4744 });
4745
4746 let page_id = doc.add_object(dictionary! {
4747 "Type" => "Page",
4748 "Parent" => pages_id,
4749 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
4750 "Contents" => Object::Reference(content_id),
4751 "Resources" => dictionary! {
4752 "Font" => dictionary! {
4753 "F1" => Object::Reference(font_id),
4754 },
4755 },
4756 });
4757
4758 doc.objects.insert(
4759 pages_id,
4760 Object::Dictionary(dictionary! {
4761 "Type" => "Pages",
4762 "Kids" => vec![Object::Reference(page_id)],
4763 "Count" => Object::Integer(1),
4764 }),
4765 );
4766
4767 let fig_elem_id = doc.add_object(dictionary! {
4769 "Type" => "StructElem",
4770 "S" => "Figure",
4771 "K" => Object::Integer(0),
4772 "Pg" => Object::Reference(page_id),
4773 "Alt" => Object::string_literal("A photo of a sunset"),
4774 "ActualText" => Object::string_literal("Sunset photo"),
4775 });
4776
4777 let struct_tree_id = doc.add_object(dictionary! {
4778 "Type" => "StructTreeRoot",
4779 "K" => Object::Reference(fig_elem_id),
4780 });
4781
4782 let catalog_id = doc.add_object(dictionary! {
4783 "Type" => "Catalog",
4784 "Pages" => pages_id,
4785 "StructTreeRoot" => Object::Reference(struct_tree_id),
4786 });
4787 doc.trailer.set("Root", catalog_id);
4788
4789 let mut buf = Vec::new();
4790 doc.save_to(&mut buf).expect("failed to save test PDF");
4791
4792 let doc = LopdfBackend::open(&buf).unwrap();
4793 let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4794
4795 assert_eq!(elements.len(), 1);
4796 let fig = &elements[0];
4797 assert_eq!(fig.element_type, "Figure");
4798 assert_eq!(fig.alt_text.as_deref(), Some("A photo of a sunset"));
4799 assert_eq!(fig.actual_text.as_deref(), Some("Sunset photo"));
4800 }
4801}