Skip to main content

pdfplumber_parse/
lopdf_backend.rs

1//! lopdf-based PDF parsing backend.
2//!
3//! Implements [`PdfBackend`] using the [lopdf](https://crates.io/crates/lopdf)
4//! crate for PDF document parsing. This is the default backend for pdfplumber-rs.
5
6use crate::backend::PdfBackend;
7use crate::error::BackendError;
8use crate::handler::ContentHandler;
9use pdfplumber_core::{
10    Annotation, AnnotationType, BBox, Bookmark, DocumentMetadata, ExtractOptions, FieldType,
11    FormField, Hyperlink, ImageContent, RepairOptions, RepairResult, SignatureInfo, StructElement,
12    ValidationIssue,
13};
14
/// A parsed PDF document backed by lopdf.
///
/// Pairs the parsed [`lopdf::Document`] with the list of page object IDs
/// collected when the document was opened, so that a page can be looked up
/// by its 0-based index directly from the cached list.
pub struct LopdfDocument {
    /// The underlying lopdf document.
    inner: lopdf::Document,
    /// Cached ordered list of page ObjectIds (indexed by 0-based page number).
    page_ids: Vec<lopdf::ObjectId>,
}
22
impl LopdfDocument {
    /// Access the underlying lopdf document.
    ///
    /// Returns a shared reference; the wrapper keeps ownership of the parsed
    /// document.
    pub fn inner(&self) -> &lopdf::Document {
        &self.inner
    }
}
29
30impl std::fmt::Debug for LopdfDocument {
31    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
32        f.debug_struct("LopdfDocument")
33            .field("page_count", &self.page_ids.len())
34            .finish_non_exhaustive()
35    }
36}
37
/// A reference to a single page within a [`LopdfDocument`].
///
/// This is a small `Copy` handle: it stores only the page's object ID and
/// index, not the page contents. Pass it together with the owning document
/// to the [`PdfBackend`] functions that operate on a page.
#[derive(Debug, Clone, Copy)]
pub struct LopdfPage {
    /// The lopdf object ID for this page.
    pub object_id: lopdf::ObjectId,
    /// The 0-based page index.
    pub index: usize,
}
46
/// The lopdf-based PDF backend.
///
/// Provides PDF parsing via [`lopdf::Document`]. This is the default
/// backend used by pdfplumber-rs.
///
/// `LopdfBackend` is a unit struct with no state of its own: every operation
/// is an associated function of its [`PdfBackend`] implementation and takes
/// the document (and, where relevant, the page) as an argument.
///
/// # Example
///
/// ```ignore
/// use pdfplumber_parse::lopdf_backend::LopdfBackend;
/// use pdfplumber_parse::PdfBackend;
///
/// let doc = LopdfBackend::open(pdf_bytes)?;
/// let count = LopdfBackend::page_count(&doc);
/// let page = LopdfBackend::get_page(&doc, 0)?;
/// ```
pub struct LopdfBackend;
63
64/// Extract a [`BBox`] from a lopdf array of 4 numbers `[x0, y0, x1, y1]`.
65fn extract_bbox_from_array(array: &[lopdf::Object]) -> Result<BBox, BackendError> {
66    if array.len() != 4 {
67        return Err(BackendError::Parse(format!(
68            "expected 4-element array for box, got {}",
69            array.len()
70        )));
71    }
72    let x0 = object_to_f64(&array[0])?;
73    let y0 = object_to_f64(&array[1])?;
74    let x1 = object_to_f64(&array[2])?;
75    let y1 = object_to_f64(&array[3])?;
76    Ok(BBox::new(x0, y0, x1, y1))
77}
78
79/// Convert a lopdf numeric object (Integer or Real) to f64.
80pub(crate) fn object_to_f64(obj: &lopdf::Object) -> Result<f64, BackendError> {
81    match obj {
82        lopdf::Object::Integer(i) => Ok(*i as f64),
83        lopdf::Object::Real(f) => Ok(*f as f64),
84        _ => Err(BackendError::Parse(format!("expected number, got {obj:?}"))),
85    }
86}
87
88/// Look up a key in the page dictionary, walking up the page tree
89/// (via /Parent) if the key is not found on the page itself.
90///
91/// Returns `None` if the key is not found anywhere in the tree.
92fn resolve_inherited<'a>(
93    doc: &'a lopdf::Document,
94    page_id: lopdf::ObjectId,
95    key: &[u8],
96) -> Result<Option<&'a lopdf::Object>, BackendError> {
97    let mut current_id = page_id;
98    loop {
99        let dict = doc
100            .get_object(current_id)
101            .and_then(|o| o.as_dict())
102            .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
103
104        if let Ok(value) = dict.get(key) {
105            return Ok(Some(value));
106        }
107
108        // Try to follow /Parent link
109        match dict.get(b"Parent") {
110            Ok(parent_obj) => {
111                current_id = parent_obj
112                    .as_reference()
113                    .map_err(|e| BackendError::Parse(format!("invalid /Parent reference: {e}")))?;
114            }
115            Err(_) => return Ok(None),
116        }
117    }
118}
119
120impl PdfBackend for LopdfBackend {
121    type Document = LopdfDocument;
122    type Page = LopdfPage;
123    type Error = BackendError;
124
125    fn open(bytes: &[u8]) -> Result<Self::Document, Self::Error> {
126        let inner = lopdf::Document::load_mem(bytes)
127            .map_err(|e| BackendError::Parse(format!("failed to parse PDF: {e}")))?;
128
129        // Reject encrypted PDFs when no password is provided
130        if inner.is_encrypted() {
131            return Err(BackendError::Core(
132                pdfplumber_core::PdfError::PasswordRequired,
133            ));
134        }
135
136        // Cache page IDs in order (get_pages returns BTreeMap<u32, ObjectId> with 1-based keys)
137        let pages_map = inner.get_pages();
138        let page_ids: Vec<lopdf::ObjectId> = pages_map.values().copied().collect();
139
140        Ok(LopdfDocument { inner, page_ids })
141    }
142
143    fn open_with_password(bytes: &[u8], password: &[u8]) -> Result<Self::Document, Self::Error> {
144        let mut inner = lopdf::Document::load_mem(bytes)
145            .map_err(|e| BackendError::Parse(format!("failed to parse PDF: {e}")))?;
146
147        // Decrypt if encrypted; ignore password if not encrypted
148        if inner.is_encrypted() {
149            inner.decrypt(password).map_err(|e| {
150                let msg = e.to_string();
151                if msg.contains("incorrect") || msg.contains("password") {
152                    BackendError::Core(pdfplumber_core::PdfError::InvalidPassword)
153                } else {
154                    BackendError::Parse(format!("decryption failed: {e}"))
155                }
156            })?;
157        }
158
159        // Cache page IDs in order
160        let pages_map = inner.get_pages();
161        let page_ids: Vec<lopdf::ObjectId> = pages_map.values().copied().collect();
162
163        Ok(LopdfDocument { inner, page_ids })
164    }
165
166    fn page_count(doc: &Self::Document) -> usize {
167        doc.page_ids.len()
168    }
169
170    fn get_page(doc: &Self::Document, index: usize) -> Result<Self::Page, Self::Error> {
171        if index >= doc.page_ids.len() {
172            return Err(BackendError::Parse(format!(
173                "page index {index} out of range (0..{})",
174                doc.page_ids.len()
175            )));
176        }
177        Ok(LopdfPage {
178            object_id: doc.page_ids[index],
179            index,
180        })
181    }
182
183    fn page_media_box(doc: &Self::Document, page: &Self::Page) -> Result<BBox, Self::Error> {
184        let obj = resolve_inherited(&doc.inner, page.object_id, b"MediaBox")?
185            .ok_or_else(|| BackendError::Parse("MediaBox not found on page or ancestors".into()))?;
186        let array = obj
187            .as_array()
188            .map_err(|e| BackendError::Parse(format!("MediaBox is not an array: {e}")))?;
189        extract_bbox_from_array(array)
190    }
191
192    fn page_crop_box(doc: &Self::Document, page: &Self::Page) -> Result<Option<BBox>, Self::Error> {
193        // CropBox is optional — only look at the page itself, not inherited
194        let dict = doc
195            .inner
196            .get_object(page.object_id)
197            .and_then(|o| o.as_dict())
198            .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
199
200        match dict.get(b"CropBox") {
201            Ok(obj) => {
202                let array = obj
203                    .as_array()
204                    .map_err(|e| BackendError::Parse(format!("CropBox is not an array: {e}")))?;
205                Ok(Some(extract_bbox_from_array(array)?))
206            }
207            Err(_) => Ok(None),
208        }
209    }
210
211    fn page_trim_box(doc: &Self::Document, page: &Self::Page) -> Result<Option<BBox>, Self::Error> {
212        match resolve_inherited(&doc.inner, page.object_id, b"TrimBox")? {
213            Some(obj) => {
214                let array = obj
215                    .as_array()
216                    .map_err(|e| BackendError::Parse(format!("TrimBox is not an array: {e}")))?;
217                Ok(Some(extract_bbox_from_array(array)?))
218            }
219            None => Ok(None),
220        }
221    }
222
223    fn page_bleed_box(
224        doc: &Self::Document,
225        page: &Self::Page,
226    ) -> Result<Option<BBox>, Self::Error> {
227        match resolve_inherited(&doc.inner, page.object_id, b"BleedBox")? {
228            Some(obj) => {
229                let array = obj
230                    .as_array()
231                    .map_err(|e| BackendError::Parse(format!("BleedBox is not an array: {e}")))?;
232                Ok(Some(extract_bbox_from_array(array)?))
233            }
234            None => Ok(None),
235        }
236    }
237
238    fn page_art_box(doc: &Self::Document, page: &Self::Page) -> Result<Option<BBox>, Self::Error> {
239        match resolve_inherited(&doc.inner, page.object_id, b"ArtBox")? {
240            Some(obj) => {
241                let array = obj
242                    .as_array()
243                    .map_err(|e| BackendError::Parse(format!("ArtBox is not an array: {e}")))?;
244                Ok(Some(extract_bbox_from_array(array)?))
245            }
246            None => Ok(None),
247        }
248    }
249
250    fn page_rotate(doc: &Self::Document, page: &Self::Page) -> Result<i32, Self::Error> {
251        match resolve_inherited(&doc.inner, page.object_id, b"Rotate")? {
252            Some(obj) => {
253                let rotation = obj
254                    .as_i64()
255                    .map_err(|e| BackendError::Parse(format!("Rotate is not an integer: {e}")))?;
256                Ok(rotation as i32)
257            }
258            None => Ok(0), // Default rotation is 0
259        }
260    }
261
262    fn document_metadata(doc: &Self::Document) -> Result<DocumentMetadata, Self::Error> {
263        extract_document_metadata(&doc.inner)
264    }
265
266    fn document_bookmarks(doc: &Self::Document) -> Result<Vec<Bookmark>, Self::Error> {
267        extract_document_bookmarks(&doc.inner)
268    }
269
270    fn document_form_fields(doc: &Self::Document) -> Result<Vec<FormField>, Self::Error> {
271        extract_document_form_fields(&doc.inner)
272    }
273
274    fn document_signatures(doc: &Self::Document) -> Result<Vec<SignatureInfo>, Self::Error> {
275        extract_document_signatures(&doc.inner)
276    }
277
278    fn document_structure_tree(doc: &Self::Document) -> Result<Vec<StructElement>, Self::Error> {
279        extract_document_structure_tree(&doc.inner)
280    }
281
282    fn page_annotations(
283        doc: &Self::Document,
284        page: &Self::Page,
285    ) -> Result<Vec<Annotation>, Self::Error> {
286        extract_page_annotations(&doc.inner, page.object_id)
287    }
288
289    fn page_hyperlinks(
290        doc: &Self::Document,
291        page: &Self::Page,
292    ) -> Result<Vec<Hyperlink>, Self::Error> {
293        extract_page_hyperlinks(&doc.inner, page.object_id)
294    }
295
296    fn interpret_page(
297        doc: &Self::Document,
298        page: &Self::Page,
299        handler: &mut dyn ContentHandler,
300        options: &ExtractOptions,
301    ) -> Result<(), Self::Error> {
302        let inner = &doc.inner;
303
304        // Get the page dictionary
305        let page_dict = inner
306            .get_object(page.object_id)
307            .and_then(|o| o.as_dict())
308            .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
309
310        // Get page content stream bytes
311        let content_bytes = get_page_content_bytes(inner, page_dict)?;
312
313        // Get page resources (may be inherited)
314        let resources = get_page_resources(inner, page.object_id)?;
315
316        // Initialize state machines
317        let mut gstate = crate::interpreter_state::InterpreterState::new();
318        let mut tstate = crate::text_state::TextState::new();
319
320        // Interpret the content stream
321        crate::interpreter::interpret_content_stream(
322            inner,
323            &content_bytes,
324            resources,
325            handler,
326            options,
327            0, // page-level depth
328            &mut gstate,
329            &mut tstate,
330        )
331    }
332
333    fn extract_image_content(
334        doc: &Self::Document,
335        page: &Self::Page,
336        image_name: &str,
337    ) -> Result<ImageContent, Self::Error> {
338        use pdfplumber_core::ImageFormat;
339
340        let inner = &doc.inner;
341
342        // Get page resources
343        let resources = get_page_resources(inner, page.object_id)?;
344
345        // Look up /Resources/XObject/<image_name>
346        let xobj_dict = resources.get(b"XObject").map_err(|_| {
347            BackendError::Parse(format!(
348                "no /XObject dictionary in page resources for image /{image_name}"
349            ))
350        })?;
351        let xobj_dict = resolve_ref(inner, xobj_dict);
352        let xobj_dict = xobj_dict.as_dict().map_err(|_| {
353            BackendError::Parse("/XObject resource is not a dictionary".to_string())
354        })?;
355
356        let xobj_entry = xobj_dict.get(image_name.as_bytes()).map_err(|_| {
357            BackendError::Parse(format!(
358                "image XObject /{image_name} not found in resources"
359            ))
360        })?;
361
362        let xobj_id = xobj_entry.as_reference().map_err(|_| {
363            BackendError::Parse(format!(
364                "image XObject /{image_name} is not an indirect reference"
365            ))
366        })?;
367
368        let xobj = inner.get_object(xobj_id).map_err(|e| {
369            BackendError::Parse(format!(
370                "failed to resolve image XObject /{image_name}: {e}"
371            ))
372        })?;
373
374        let stream = xobj.as_stream().map_err(|e| {
375            BackendError::Parse(format!("image XObject /{image_name} is not a stream: {e}"))
376        })?;
377
378        // Verify it's an Image subtype
379        let subtype = stream
380            .dict
381            .get(b"Subtype")
382            .ok()
383            .and_then(|o| o.as_name_str().ok())
384            .unwrap_or("");
385        if subtype != "Image" {
386            return Err(BackendError::Parse(format!(
387                "XObject /{image_name} is not an Image (subtype: {subtype})"
388            )));
389        }
390
391        let width = stream
392            .dict
393            .get(b"Width")
394            .ok()
395            .and_then(|o| o.as_i64().ok())
396            .unwrap_or(0) as u32;
397
398        let height = stream
399            .dict
400            .get(b"Height")
401            .ok()
402            .and_then(|o| o.as_i64().ok())
403            .unwrap_or(0) as u32;
404
405        // Determine the filter to decide image format
406        let filter = stream
407            .dict
408            .get(b"Filter")
409            .ok()
410            .and_then(|o| {
411                // Filter can be a single name or an array of names
412                if let Ok(name) = o.as_name_str() {
413                    Some(vec![name.to_string()])
414                } else if let Ok(arr) = o.as_array() {
415                    Some(
416                        arr.iter()
417                            .filter_map(|item| {
418                                let resolved = resolve_ref(inner, item);
419                                resolved.as_name_str().ok().map(|s| s.to_string())
420                            })
421                            .collect(),
422                    )
423                } else {
424                    None
425                }
426            })
427            .unwrap_or_default();
428
429        // Determine format from the last filter in the chain
430        let format = if filter.is_empty() {
431            ImageFormat::Raw
432        } else {
433            match filter.last().map(|s| s.as_str()) {
434                Some("DCTDecode") => ImageFormat::Jpeg,
435                Some("JBIG2Decode") => ImageFormat::Jbig2,
436                Some("CCITTFaxDecode") => ImageFormat::CcittFax,
437                _ => ImageFormat::Raw,
438            }
439        };
440
441        // Extract the image data
442        let data = match format {
443            ImageFormat::Jpeg => {
444                // For JPEG, return the raw stream content (the JPEG bytes)
445                // If there are filters before DCTDecode, we need partial decompression
446                if filter.len() == 1 {
447                    // Only DCTDecode — raw content is the JPEG
448                    stream.content.clone()
449                } else {
450                    // Chained filters: decompress everything (lopdf handles this)
451                    stream.decompressed_content().map_err(|e| {
452                        BackendError::Parse(format!(
453                            "failed to decompress image /{image_name}: {e}"
454                        ))
455                    })?
456                }
457            }
458            ImageFormat::Jbig2 | ImageFormat::CcittFax => {
459                // Return raw stream content for these specialized formats
460                stream.content.clone()
461            }
462            ImageFormat::Raw | ImageFormat::Png => {
463                // Decompress if filters present, otherwise return raw
464                if filter.is_empty() {
465                    stream.content.clone()
466                } else {
467                    stream.decompressed_content().map_err(|e| {
468                        BackendError::Parse(format!(
469                            "failed to decompress image /{image_name}: {e}"
470                        ))
471                    })?
472                }
473            }
474        };
475
476        Ok(ImageContent {
477            data,
478            format,
479            width,
480            height,
481        })
482    }
483
484    fn validate(doc: &Self::Document) -> Result<Vec<ValidationIssue>, Self::Error> {
485        validate_document(doc)
486    }
487
488    fn repair(
489        bytes: &[u8],
490        options: &RepairOptions,
491    ) -> Result<(Vec<u8>, RepairResult), Self::Error> {
492        repair_document(bytes, options)
493    }
494}
495
496/// Validate a PDF document for specification violations.
497fn validate_document(doc: &LopdfDocument) -> Result<Vec<ValidationIssue>, BackendError> {
498    use pdfplumber_core::{Severity, ValidationIssue};
499
500    let inner = &doc.inner;
501    let mut issues = Vec::new();
502
503    // 1. Check catalog for required /Type key
504    let catalog_location = get_catalog_location(inner);
505    let catalog_dict = get_catalog_dict(inner);
506
507    if let Some(dict) = catalog_dict {
508        match dict.get(b"Type") {
509            Ok(type_obj) => {
510                if let Ok(name) = type_obj.as_name_str() {
511                    if name != "Catalog" {
512                        issues.push(ValidationIssue::with_location(
513                            Severity::Warning,
514                            "WRONG_CATALOG_TYPE",
515                            format!("catalog /Type is '{name}' instead of 'Catalog'"),
516                            &catalog_location,
517                        ));
518                    }
519                }
520            }
521            Err(_) => {
522                issues.push(ValidationIssue::with_location(
523                    Severity::Warning,
524                    "MISSING_TYPE",
525                    "catalog dictionary missing /Type key",
526                    &catalog_location,
527                ));
528            }
529        }
530
531        // Check /Pages exists
532        if dict.get(b"Pages").is_err() {
533            issues.push(ValidationIssue::with_location(
534                Severity::Error,
535                "MISSING_PAGES",
536                "catalog dictionary missing /Pages key",
537                &catalog_location,
538            ));
539        }
540    }
541
542    // 2. Check page tree structure
543    for (page_idx, &page_id) in doc.page_ids.iter().enumerate() {
544        let page_num = page_idx + 1;
545        let location = format!("page {page_num} (object {} {})", page_id.0, page_id.1);
546
547        match inner.get_object(page_id) {
548            Ok(obj) => {
549                if let Ok(dict) = obj.as_dict() {
550                    // Check page /Type key
551                    match dict.get(b"Type") {
552                        Ok(type_obj) => {
553                            if let Ok(name) = type_obj.as_name_str() {
554                                if name != "Page" {
555                                    issues.push(ValidationIssue::with_location(
556                                        Severity::Warning,
557                                        "WRONG_PAGE_TYPE",
558                                        format!("page /Type is '{name}' instead of 'Page'"),
559                                        &location,
560                                    ));
561                                }
562                            }
563                        }
564                        Err(_) => {
565                            issues.push(ValidationIssue::with_location(
566                                Severity::Warning,
567                                "MISSING_TYPE",
568                                "page dictionary missing /Type key",
569                                &location,
570                            ));
571                        }
572                    }
573
574                    // Check MediaBox (required, can be inherited)
575                    if resolve_inherited(inner, page_id, b"MediaBox")
576                        .ok()
577                        .flatten()
578                        .is_none()
579                    {
580                        issues.push(ValidationIssue::with_location(
581                            Severity::Error,
582                            "MISSING_MEDIABOX",
583                            "page has no /MediaBox (not on page or ancestors)",
584                            &location,
585                        ));
586                    }
587
588                    // Check for missing fonts referenced in content streams
589                    check_page_fonts(inner, page_id, dict, &location, &mut issues);
590                } else {
591                    issues.push(ValidationIssue::with_location(
592                        Severity::Error,
593                        "INVALID_PAGE",
594                        "page object is not a dictionary",
595                        &location,
596                    ));
597                }
598            }
599            Err(_) => {
600                issues.push(ValidationIssue::with_location(
601                    Severity::Error,
602                    "BROKEN_REF",
603                    format!("page object {} {} not found", page_id.0, page_id.1),
604                    &location,
605                ));
606            }
607        }
608    }
609
610    // 3. Check for broken object references in the xref table
611    check_broken_references(inner, &mut issues);
612
613    Ok(issues)
614}
615
616/// Get the catalog dictionary from the document.
617fn get_catalog_dict(doc: &lopdf::Document) -> Option<&lopdf::Dictionary> {
618    let root_obj = doc.trailer.get(b"Root").ok()?;
619    match root_obj {
620        lopdf::Object::Reference(id) => {
621            let obj = doc.get_object(*id).ok()?;
622            obj.as_dict().ok()
623        }
624        lopdf::Object::Dictionary(dict) => Some(dict),
625        _ => None,
626    }
627}
628
629/// Get a human-readable location string for the catalog object.
630fn get_catalog_location(doc: &lopdf::Document) -> String {
631    if let Ok(lopdf::Object::Reference(id)) = doc.trailer.get(b"Root") {
632        return format!("object {} {}", id.0, id.1);
633    }
634    "catalog".to_string()
635}
636
637/// Check that fonts referenced in content streams are defined in page resources.
638fn check_page_fonts(
639    doc: &lopdf::Document,
640    page_id: lopdf::ObjectId,
641    page_dict: &lopdf::Dictionary,
642    location: &str,
643    issues: &mut Vec<pdfplumber_core::ValidationIssue>,
644) {
645    use pdfplumber_core::{Severity, ValidationIssue};
646
647    // Get fonts from resources
648    let font_names = get_resource_font_names(doc, page_id, page_dict);
649
650    // Get content stream to find font references
651    let content_fonts = get_content_stream_font_refs(doc, page_dict);
652
653    // Check each font referenced in the content stream
654    for font_ref in &content_fonts {
655        if !font_names.contains(font_ref) {
656            issues.push(ValidationIssue::with_location(
657                Severity::Warning,
658                "MISSING_FONT",
659                format!("font /{font_ref} referenced in content stream but not in resources"),
660                location,
661            ));
662        }
663    }
664}
665
666/// Get the names of fonts defined in the page's resources.
667fn get_resource_font_names(
668    doc: &lopdf::Document,
669    page_id: lopdf::ObjectId,
670    page_dict: &lopdf::Dictionary,
671) -> Vec<String> {
672    let mut names = Vec::new();
673
674    // Try to get Resources from the page or inherited
675    let resources = if let Ok(res_obj) = page_dict.get(b"Resources") {
676        let resolved = resolve_ref(doc, res_obj);
677        resolved.as_dict().ok()
678    } else {
679        // Try inherited resources
680        resolve_inherited(doc, page_id, b"Resources")
681            .ok()
682            .flatten()
683            .and_then(|obj| obj.as_dict().ok())
684    };
685
686    if let Some(resources_dict) = resources {
687        if let Ok(font_obj) = resources_dict.get(b"Font") {
688            let font_obj = resolve_ref(doc, font_obj);
689            if let Ok(font_dict) = font_obj.as_dict() {
690                for (key, _) in font_dict.iter() {
691                    if let Ok(name) = std::str::from_utf8(key) {
692                        names.push(name.to_string());
693                    }
694                }
695            }
696        }
697    }
698
699    names
700}
701
702/// Parse content stream operators to find font name references (Tf operator).
703fn get_content_stream_font_refs(
704    doc: &lopdf::Document,
705    page_dict: &lopdf::Dictionary,
706) -> Vec<String> {
707    let mut font_refs = Vec::new();
708
709    let content_bytes = match get_content_stream_bytes(doc, page_dict) {
710        Some(bytes) => bytes,
711        None => return font_refs,
712    };
713
714    // Simple parser: look for "/FontName <number> Tf" patterns
715    let content = String::from_utf8_lossy(&content_bytes);
716    let tokens: Vec<&str> = content.split_whitespace().collect();
717
718    for (i, token) in tokens.iter().enumerate() {
719        if *token == "Tf" && i >= 2 {
720            let font_name_token = tokens[i - 2];
721            if let Some(name) = font_name_token.strip_prefix('/') {
722                if !font_refs.contains(&name.to_string()) {
723                    font_refs.push(name.to_string());
724                }
725            }
726        }
727    }
728
729    font_refs
730}
731
732/// Try to get decompressed content from a stream, falling back to raw content.
733fn stream_bytes(stream: &lopdf::Stream) -> Option<Vec<u8>> {
734    stream
735        .decompressed_content()
736        .ok()
737        .or_else(|| Some(stream.content.clone()))
738        .filter(|b| !b.is_empty())
739}
740
741/// Get the raw bytes of a page's content stream(s).
742fn get_content_stream_bytes(
743    doc: &lopdf::Document,
744    page_dict: &lopdf::Dictionary,
745) -> Option<Vec<u8>> {
746    let contents_obj = page_dict.get(b"Contents").ok()?;
747
748    match contents_obj {
749        lopdf::Object::Reference(id) => {
750            let obj = doc.get_object(*id).ok()?;
751            if let Ok(stream) = obj.as_stream() {
752                stream_bytes(stream)
753            } else {
754                None
755            }
756        }
757        lopdf::Object::Array(arr) => {
758            let mut all_bytes = Vec::new();
759            for item in arr {
760                let resolved = resolve_ref(doc, item);
761                if let Ok(stream) = resolved.as_stream() {
762                    if let Some(bytes) = stream_bytes(stream) {
763                        all_bytes.extend_from_slice(&bytes);
764                        all_bytes.push(b' ');
765                    }
766                }
767            }
768            if all_bytes.is_empty() {
769                None
770            } else {
771                Some(all_bytes)
772            }
773        }
774        _ => None,
775    }
776}
777
778/// Check for broken object references across the document.
779fn check_broken_references(
780    doc: &lopdf::Document,
781    issues: &mut Vec<pdfplumber_core::ValidationIssue>,
782) {
783    use pdfplumber_core::{Severity, ValidationIssue};
784
785    // Iterate through all objects and check references
786    for (&obj_id, obj) in &doc.objects {
787        check_references_in_object(doc, obj, obj_id, issues);
788    }
789
790    fn check_references_in_object(
791        doc: &lopdf::Document,
792        obj: &lopdf::Object,
793        source_id: lopdf::ObjectId,
794        issues: &mut Vec<ValidationIssue>,
795    ) {
796        match obj {
797            lopdf::Object::Reference(ref_id) => {
798                if doc.get_object(*ref_id).is_err() {
799                    issues.push(ValidationIssue::with_location(
800                        Severity::Warning,
801                        "BROKEN_REF",
802                        format!(
803                            "reference to object {} {} which does not exist",
804                            ref_id.0, ref_id.1
805                        ),
806                        format!("object {} {}", source_id.0, source_id.1),
807                    ));
808                }
809            }
810            lopdf::Object::Array(arr) => {
811                for item in arr {
812                    check_references_in_object(doc, item, source_id, issues);
813                }
814            }
815            lopdf::Object::Dictionary(dict) => {
816                for (_, value) in dict.iter() {
817                    check_references_in_object(doc, value, source_id, issues);
818                }
819            }
820            lopdf::Object::Stream(stream) => {
821                for (_, value) in stream.dict.iter() {
822                    check_references_in_object(doc, value, source_id, issues);
823                }
824            }
825            _ => {}
826        }
827    }
828}
829
830/// Resolve an indirect reference, returning the referenced object.
831///
832/// If the object is a `Reference`, resolves it via the document.
833/// Otherwise, returns the object as-is.
834fn resolve_ref<'a>(doc: &'a lopdf::Document, obj: &'a lopdf::Object) -> &'a lopdf::Object {
835    match obj {
836        lopdf::Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
837        _ => obj,
838    }
839}
840
841/// Attempt best-effort repair of common PDF issues.
842fn repair_document(
843    bytes: &[u8],
844    options: &RepairOptions,
845) -> Result<(Vec<u8>, RepairResult), BackendError> {
846    let mut doc = lopdf::Document::load_mem(bytes)
847        .map_err(|e| BackendError::Parse(format!("failed to parse PDF for repair: {e}")))?;
848
849    let mut result = RepairResult::new();
850
851    if options.fix_stream_lengths {
852        repair_stream_lengths(&mut doc, &mut result);
853    }
854
855    if options.remove_broken_objects {
856        repair_broken_references(&mut doc, &mut result);
857    }
858
859    // rebuild_xref: lopdf rebuilds xref automatically when saving,
860    // so just saving the document effectively rebuilds the xref table.
861    if options.rebuild_xref {
862        // Force xref rebuild by saving (lopdf always writes a fresh xref on save).
863        // Only log if we explicitly opted in and haven't already logged anything.
864    }
865
866    let mut buf = Vec::new();
867    doc.save_to(&mut buf)
868        .map_err(|e| BackendError::Parse(format!("failed to save repaired PDF: {e}")))?;
869
870    Ok((buf, result))
871}
872
873/// Fix stream `/Length` entries to match actual stream content size.
874fn repair_stream_lengths(doc: &mut lopdf::Document, result: &mut RepairResult) {
875    let obj_ids: Vec<lopdf::ObjectId> = doc.objects.keys().copied().collect();
876
877    for obj_id in obj_ids {
878        let needs_fix = if let Some(lopdf::Object::Stream(stream)) = doc.objects.get(&obj_id) {
879            let actual_len = stream.content.len() as i64;
880            match stream.dict.get(b"Length") {
881                Ok(lopdf::Object::Integer(stored_len)) => *stored_len != actual_len,
882                Ok(lopdf::Object::Reference(_)) => {
883                    // Length stored as indirect reference — skip, too complex to fix
884                    false
885                }
886                _ => true, // Missing Length key
887            }
888        } else {
889            false
890        };
891
892        if needs_fix {
893            if let Some(lopdf::Object::Stream(stream)) = doc.objects.get_mut(&obj_id) {
894                let actual_len = stream.content.len() as i64;
895                let old_len = stream.dict.get(b"Length").ok().and_then(|o| {
896                    if let lopdf::Object::Integer(v) = o {
897                        Some(*v)
898                    } else {
899                        None
900                    }
901                });
902                stream
903                    .dict
904                    .set("Length", lopdf::Object::Integer(actual_len));
905                match old_len {
906                    Some(old) => {
907                        result.log.push(format!(
908                            "fixed stream length for object {} {}: {} -> {}",
909                            obj_id.0, obj_id.1, old, actual_len
910                        ));
911                    }
912                    None => {
913                        result.log.push(format!(
914                            "added missing stream length for object {} {}: {}",
915                            obj_id.0, obj_id.1, actual_len
916                        ));
917                    }
918                }
919            }
920        }
921    }
922}
923
924/// Remove broken object references, replacing them with Null.
925fn repair_broken_references(doc: &mut lopdf::Document, result: &mut RepairResult) {
926    let obj_ids: Vec<lopdf::ObjectId> = doc.objects.keys().copied().collect();
927    let existing_ids: std::collections::BTreeSet<lopdf::ObjectId> =
928        doc.objects.keys().copied().collect();
929
930    for obj_id in obj_ids {
931        if let Some(obj) = doc.objects.remove(&obj_id) {
932            let fixed = fix_references_in_object(obj, &existing_ids, obj_id, result);
933            doc.objects.insert(obj_id, fixed);
934        }
935    }
936}
937
938/// Recursively replace broken references with Null in an object tree.
939fn fix_references_in_object(
940    obj: lopdf::Object,
941    existing_ids: &std::collections::BTreeSet<lopdf::ObjectId>,
942    source_id: lopdf::ObjectId,
943    result: &mut RepairResult,
944) -> lopdf::Object {
945    match obj {
946        lopdf::Object::Reference(ref_id) => {
947            if existing_ids.contains(&ref_id) {
948                obj
949            } else {
950                result.log.push(format!(
951                    "removed broken reference to object {} {} (in object {} {})",
952                    ref_id.0, ref_id.1, source_id.0, source_id.1
953                ));
954                lopdf::Object::Null
955            }
956        }
957        lopdf::Object::Array(arr) => {
958            let fixed: Vec<lopdf::Object> = arr
959                .into_iter()
960                .map(|item| fix_references_in_object(item, existing_ids, source_id, result))
961                .collect();
962            lopdf::Object::Array(fixed)
963        }
964        lopdf::Object::Dictionary(dict) => {
965            let mut new_dict = lopdf::Dictionary::new();
966            for (key, value) in dict.into_iter() {
967                let fixed = fix_references_in_object(value, existing_ids, source_id, result);
968                new_dict.set(key, fixed);
969            }
970            lopdf::Object::Dictionary(new_dict)
971        }
972        lopdf::Object::Stream(mut stream) => {
973            let mut new_dict = lopdf::Dictionary::new();
974            for (key, value) in stream.dict.into_iter() {
975                let fixed = fix_references_in_object(value, existing_ids, source_id, result);
976                new_dict.set(key, fixed);
977            }
978            stream.dict = new_dict;
979            lopdf::Object::Stream(stream)
980        }
981        other => other,
982    }
983}
984
985/// Get the content stream bytes from a page dictionary.
986///
987/// Handles both single stream references and arrays of stream references.
988fn get_page_content_bytes(
989    doc: &lopdf::Document,
990    page_dict: &lopdf::Dictionary,
991) -> Result<Vec<u8>, BackendError> {
992    let contents_obj = match page_dict.get(b"Contents") {
993        Ok(obj) => obj,
994        Err(_) => return Ok(Vec::new()), // Page with no content
995    };
996
997    match contents_obj {
998        lopdf::Object::Reference(id) => {
999            let obj = doc
1000                .get_object(*id)
1001                .map_err(|e| BackendError::Parse(format!("failed to resolve /Contents: {e}")))?;
1002            let stream = obj
1003                .as_stream()
1004                .map_err(|e| BackendError::Parse(format!("/Contents is not a stream: {e}")))?;
1005            decode_content_stream(stream)
1006        }
1007        lopdf::Object::Array(arr) => {
1008            let mut content = Vec::new();
1009            for item in arr {
1010                let id = item.as_reference().map_err(|e| {
1011                    BackendError::Parse(format!("/Contents array item is not a reference: {e}"))
1012                })?;
1013                let obj = doc.get_object(id).map_err(|e| {
1014                    BackendError::Parse(format!("failed to resolve /Contents stream: {e}"))
1015                })?;
1016                let stream = obj.as_stream().map_err(|e| {
1017                    BackendError::Parse(format!("/Contents array item is not a stream: {e}"))
1018                })?;
1019                let bytes = decode_content_stream(stream)?;
1020                if !content.is_empty() {
1021                    content.push(b' ');
1022                }
1023                content.extend_from_slice(&bytes);
1024            }
1025            Ok(content)
1026        }
1027        _ => Err(BackendError::Parse(
1028            "/Contents is not a reference or array".to_string(),
1029        )),
1030    }
1031}
1032
1033/// Decode a content stream, decompressing if needed.
1034fn decode_content_stream(stream: &lopdf::Stream) -> Result<Vec<u8>, BackendError> {
1035    if stream.dict.get(b"Filter").is_ok() {
1036        stream
1037            .decompressed_content()
1038            .map_err(|e| BackendError::Parse(format!("failed to decompress content stream: {e}")))
1039    } else {
1040        Ok(stream.content.clone())
1041    }
1042}
1043
1044/// Get the resources dictionary for a page, handling inheritance.
1045fn get_page_resources(
1046    doc: &lopdf::Document,
1047    page_id: lopdf::ObjectId,
1048) -> Result<&lopdf::Dictionary, BackendError> {
1049    match resolve_inherited(doc, page_id, b"Resources")? {
1050        Some(obj) => {
1051            // Resolve indirect reference if needed
1052            let obj = match obj {
1053                lopdf::Object::Reference(id) => doc.get_object(*id).map_err(|e| {
1054                    BackendError::Parse(format!("failed to resolve /Resources reference: {e}"))
1055                })?,
1056                other => other,
1057            };
1058            obj.as_dict()
1059                .map_err(|_| BackendError::Parse("/Resources is not a dictionary".to_string()))
1060        }
1061        None => {
1062            // No resources at all — use empty dictionary
1063            // This is unusual but we handle it gracefully
1064            static EMPTY_DICT: std::sync::LazyLock<lopdf::Dictionary> =
1065                std::sync::LazyLock::new(lopdf::Dictionary::new);
1066            Ok(&EMPTY_DICT)
1067        }
1068    }
1069}
1070
1071/// Extract a string value from a lopdf dictionary, handling both String and Name types.
1072fn extract_string_from_dict(
1073    doc: &lopdf::Document,
1074    dict: &lopdf::Dictionary,
1075    key: &[u8],
1076) -> Option<String> {
1077    let obj = dict.get(key).ok()?;
1078    // Resolve indirect reference if needed
1079    let obj = match obj {
1080        lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1081        other => other,
1082    };
1083    match obj {
1084        lopdf::Object::String(bytes, _) => {
1085            // Try UTF-16 BE (BOM: 0xFE 0xFF) first, then Latin-1/UTF-8
1086            if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1087                let chars: Vec<u16> = bytes[2..]
1088                    .chunks(2)
1089                    .filter_map(|c| {
1090                        if c.len() == 2 {
1091                            Some(u16::from_be_bytes([c[0], c[1]]))
1092                        } else {
1093                            None
1094                        }
1095                    })
1096                    .collect();
1097                String::from_utf16(&chars).ok()
1098            } else {
1099                // Try UTF-8 first, fall back to Latin-1
1100                match std::str::from_utf8(bytes) {
1101                    Ok(s) => Some(s.to_string()),
1102                    Err(_) => Some(bytes.iter().map(|&b| b as char).collect()),
1103                }
1104            }
1105        }
1106        lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
1107        _ => None,
1108    }
1109}
1110
1111/// Extract document-level metadata from the PDF /Info dictionary.
1112fn extract_document_metadata(doc: &lopdf::Document) -> Result<DocumentMetadata, BackendError> {
1113    // The /Info dictionary is referenced from the trailer
1114    let info_ref = match doc.trailer.get(b"Info") {
1115        Ok(obj) => obj,
1116        Err(_) => return Ok(DocumentMetadata::default()),
1117    };
1118
1119    let info_dict = match info_ref {
1120        lopdf::Object::Reference(id) => match doc.get_object(*id) {
1121            Ok(obj) => match obj.as_dict() {
1122                Ok(dict) => dict,
1123                Err(_) => return Ok(DocumentMetadata::default()),
1124            },
1125            Err(_) => return Ok(DocumentMetadata::default()),
1126        },
1127        lopdf::Object::Dictionary(dict) => dict,
1128        _ => return Ok(DocumentMetadata::default()),
1129    };
1130
1131    Ok(DocumentMetadata {
1132        title: extract_string_from_dict(doc, info_dict, b"Title"),
1133        author: extract_string_from_dict(doc, info_dict, b"Author"),
1134        subject: extract_string_from_dict(doc, info_dict, b"Subject"),
1135        keywords: extract_string_from_dict(doc, info_dict, b"Keywords"),
1136        creator: extract_string_from_dict(doc, info_dict, b"Creator"),
1137        producer: extract_string_from_dict(doc, info_dict, b"Producer"),
1138        creation_date: extract_string_from_dict(doc, info_dict, b"CreationDate"),
1139        mod_date: extract_string_from_dict(doc, info_dict, b"ModDate"),
1140    })
1141}
1142
1143/// Extract the document outline (bookmarks / table of contents) from the PDF catalog.
1144///
1145/// Walks the `/Outlines` tree using `/First`, `/Next` sibling links,
1146/// resolving destinations to page numbers and y-coordinates.
1147fn extract_document_bookmarks(doc: &lopdf::Document) -> Result<Vec<Bookmark>, BackendError> {
1148    // Get the catalog dictionary
1149    let catalog_ref = match doc.trailer.get(b"Root") {
1150        Ok(obj) => obj,
1151        Err(_) => return Ok(Vec::new()),
1152    };
1153
1154    let catalog = match catalog_ref {
1155        lopdf::Object::Reference(id) => match doc.get_object(*id) {
1156            Ok(obj) => match obj.as_dict() {
1157                Ok(dict) => dict,
1158                Err(_) => return Ok(Vec::new()),
1159            },
1160            Err(_) => return Ok(Vec::new()),
1161        },
1162        lopdf::Object::Dictionary(dict) => dict,
1163        _ => return Ok(Vec::new()),
1164    };
1165
1166    // Get /Outlines dictionary
1167    let outlines_obj = match catalog.get(b"Outlines") {
1168        Ok(obj) => obj,
1169        Err(_) => return Ok(Vec::new()),
1170    };
1171
1172    let outlines_obj = match outlines_obj {
1173        lopdf::Object::Reference(id) => match doc.get_object(*id) {
1174            Ok(obj) => obj,
1175            Err(_) => return Ok(Vec::new()),
1176        },
1177        other => other,
1178    };
1179
1180    let outlines_dict = match outlines_obj.as_dict() {
1181        Ok(dict) => dict,
1182        Err(_) => return Ok(Vec::new()),
1183    };
1184
1185    // Get /First child of the outlines root
1186    let first_ref = match outlines_dict.get(b"First") {
1187        Ok(lopdf::Object::Reference(id)) => *id,
1188        _ => return Ok(Vec::new()),
1189    };
1190
1191    // Build page map for resolving destinations
1192    let pages_map = doc.get_pages();
1193
1194    let mut bookmarks = Vec::new();
1195    let max_depth = 64; // Prevent circular references
1196    walk_outline_tree(doc, first_ref, 0, max_depth, &pages_map, &mut bookmarks);
1197
1198    Ok(bookmarks)
1199}
1200
1201/// Recursively walk the outline tree, collecting bookmarks.
1202fn walk_outline_tree(
1203    doc: &lopdf::Document,
1204    item_id: lopdf::ObjectId,
1205    level: usize,
1206    max_depth: usize,
1207    pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1208    bookmarks: &mut Vec<Bookmark>,
1209) {
1210    if level >= max_depth {
1211        return;
1212    }
1213
1214    let mut current_id = Some(item_id);
1215    let mut visited = std::collections::HashSet::new();
1216    let max_siblings = 10_000; // Safety limit on siblings at one level
1217    let mut sibling_count = 0;
1218
1219    while let Some(node_id) = current_id {
1220        // Circular reference protection
1221        if !visited.insert(node_id) || sibling_count >= max_siblings {
1222            break;
1223        }
1224        sibling_count += 1;
1225
1226        let node_obj = match doc.get_object(node_id) {
1227            Ok(obj) => obj,
1228            Err(_) => break,
1229        };
1230
1231        let node_dict = match node_obj.as_dict() {
1232            Ok(dict) => dict,
1233            Err(_) => break,
1234        };
1235
1236        // Extract /Title
1237        let title = extract_string_from_dict(doc, node_dict, b"Title").unwrap_or_default();
1238
1239        // Resolve destination (page number and y-coordinate)
1240        let (page_number, dest_top) = resolve_bookmark_dest(doc, node_dict, pages_map);
1241
1242        bookmarks.push(Bookmark {
1243            title,
1244            level,
1245            page_number,
1246            dest_top,
1247        });
1248
1249        // Recurse into children (/First)
1250        if let Ok(lopdf::Object::Reference(child_id)) = node_dict.get(b"First") {
1251            walk_outline_tree(doc, *child_id, level + 1, max_depth, pages_map, bookmarks);
1252        }
1253
1254        // Move to next sibling (/Next)
1255        current_id = match node_dict.get(b"Next") {
1256            Ok(lopdf::Object::Reference(next_id)) => Some(*next_id),
1257            _ => None,
1258        };
1259    }
1260}
1261
1262/// Resolve a bookmark's destination to (page_number, dest_top).
1263///
1264/// Checks /Dest first, then /A (GoTo action).
1265fn resolve_bookmark_dest(
1266    doc: &lopdf::Document,
1267    node_dict: &lopdf::Dictionary,
1268    pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1269) -> (Option<usize>, Option<f64>) {
1270    // Try /Dest first
1271    if let Ok(dest_obj) = node_dict.get(b"Dest") {
1272        if let Some(result) = resolve_dest_to_page(doc, dest_obj, pages_map) {
1273            return result;
1274        }
1275    }
1276
1277    // Try /A (Action) dictionary — only GoTo actions
1278    if let Ok(action_obj) = node_dict.get(b"A") {
1279        let action_obj = match action_obj {
1280            lopdf::Object::Reference(id) => match doc.get_object(*id) {
1281                Ok(obj) => obj,
1282                Err(_) => return (None, None),
1283            },
1284            other => other,
1285        };
1286        if let Ok(action_dict) = action_obj.as_dict() {
1287            if let Ok(lopdf::Object::Name(action_type)) = action_dict.get(b"S") {
1288                if String::from_utf8_lossy(action_type) == "GoTo" {
1289                    if let Ok(dest_obj) = action_dict.get(b"D") {
1290                        if let Some(result) = resolve_dest_to_page(doc, dest_obj, pages_map) {
1291                            return result;
1292                        }
1293                    }
1294                }
1295            }
1296        }
1297    }
1298
1299    (None, None)
1300}
1301
1302/// Resolve a destination object to (page_number, dest_top).
1303///
1304/// Handles explicit destination arrays `[page_ref, /type, ...]` and named destinations.
1305fn resolve_dest_to_page(
1306    doc: &lopdf::Document,
1307    dest_obj: &lopdf::Object,
1308    pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1309) -> Option<(Option<usize>, Option<f64>)> {
1310    let dest_obj = match dest_obj {
1311        lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1312        other => other,
1313    };
1314
1315    match dest_obj {
1316        // Explicit destination array: [page_ref, /type, ...]
1317        lopdf::Object::Array(arr) => {
1318            if arr.is_empty() {
1319                return None;
1320            }
1321            // First element is a page reference
1322            if let lopdf::Object::Reference(page_ref) = &arr[0] {
1323                // Resolve to 0-indexed page number
1324                let page_number = pages_map.iter().find_map(|(&page_num, &page_id)| {
1325                    if page_id == *page_ref {
1326                        Some((page_num - 1) as usize) // lopdf pages are 1-indexed
1327                    } else {
1328                        None
1329                    }
1330                });
1331
1332                // Try to extract dest_top from /XYZ or /FitH or /FitBH destination types
1333                let dest_top = extract_dest_top(arr);
1334
1335                return Some((page_number, dest_top));
1336            }
1337            None
1338        }
1339        // Named destination (string) — look up in /Names or /Dests
1340        lopdf::Object::String(bytes, _) => {
1341            let name = if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1342                let chars: Vec<u16> = bytes[2..]
1343                    .chunks(2)
1344                    .filter_map(|c| {
1345                        if c.len() == 2 {
1346                            Some(u16::from_be_bytes([c[0], c[1]]))
1347                        } else {
1348                            None
1349                        }
1350                    })
1351                    .collect();
1352                String::from_utf16(&chars).ok()?
1353            } else {
1354                match std::str::from_utf8(bytes) {
1355                    Ok(s) => s.to_string(),
1356                    Err(_) => bytes.iter().map(|&b| b as char).collect(),
1357                }
1358            };
1359            resolve_named_dest(doc, &name, pages_map)
1360        }
1361        // Named destination (name)
1362        lopdf::Object::Name(name) => {
1363            let name_str = String::from_utf8_lossy(name);
1364            resolve_named_dest(doc, &name_str, pages_map)
1365        }
1366        _ => None,
1367    }
1368}
1369
1370/// Extract the dest_top (y-coordinate) from a destination array.
1371///
1372/// Supports /XYZ (index 3), /FitH (index 2), /FitBH (index 2).
1373fn extract_dest_top(arr: &[lopdf::Object]) -> Option<f64> {
1374    if arr.len() < 2 {
1375        return None;
1376    }
1377    // Second element is the destination type
1378    if let lopdf::Object::Name(dest_type) = &arr[1] {
1379        let type_str = String::from_utf8_lossy(dest_type);
1380        match type_str.as_ref() {
1381            "XYZ" => {
1382                // [page, /XYZ, left, top, zoom]
1383                if arr.len() >= 4 {
1384                    return obj_to_f64(&arr[3]);
1385                }
1386            }
1387            "FitH" | "FitBH" => {
1388                // [page, /FitH, top] or [page, /FitBH, top]
1389                if arr.len() >= 3 {
1390                    return obj_to_f64(&arr[2]);
1391                }
1392            }
1393            _ => {} // /Fit, /FitV, /FitR, /FitB — no meaningful top
1394        }
1395    }
1396    None
1397}
1398
1399/// Convert a lopdf Object to f64 (handles Integer, Real, and Null).
1400fn obj_to_f64(obj: &lopdf::Object) -> Option<f64> {
1401    match obj {
1402        lopdf::Object::Integer(i) => Some(*i as f64),
1403        lopdf::Object::Real(f) => Some((*f).into()),
1404        lopdf::Object::Null => None, // null means "unchanged" in PDF spec
1405        _ => None,
1406    }
1407}
1408
1409/// Resolve a named destination to (page_number, dest_top).
1410///
1411/// Looks up the name in the catalog's /Names → /Dests name tree,
1412/// or in the catalog's /Dests dictionary.
1413fn resolve_named_dest(
1414    doc: &lopdf::Document,
1415    name: &str,
1416    pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1417) -> Option<(Option<usize>, Option<f64>)> {
1418    // Get catalog
1419    let catalog_ref = doc.trailer.get(b"Root").ok()?;
1420    let catalog = match catalog_ref {
1421        lopdf::Object::Reference(id) => doc.get_object(*id).ok()?.as_dict().ok()?,
1422        lopdf::Object::Dictionary(dict) => dict,
1423        _ => return None,
1424    };
1425
1426    // Try /Names → /Dests name tree first
1427    if let Ok(names_obj) = catalog.get(b"Names") {
1428        let names_obj = match names_obj {
1429            lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1430            other => other,
1431        };
1432        if let Ok(names_dict) = names_obj.as_dict() {
1433            if let Ok(dests_obj) = names_dict.get(b"Dests") {
1434                let dests_obj = match dests_obj {
1435                    lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1436                    other => other,
1437                };
1438                if let Ok(dests_dict) = dests_obj.as_dict() {
1439                    if let Some(result) = lookup_name_tree(doc, dests_dict, name, pages_map) {
1440                        return Some(result);
1441                    }
1442                }
1443            }
1444        }
1445    }
1446
1447    // Try /Dests dictionary (older PDF spec)
1448    if let Ok(dests_obj) = catalog.get(b"Dests") {
1449        let dests_obj = match dests_obj {
1450            lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1451            other => other,
1452        };
1453        if let Ok(dests_dict) = dests_obj.as_dict() {
1454            if let Ok(dest_obj) = dests_dict.get(name.as_bytes()) {
1455                let dest_obj = match dest_obj {
1456                    lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1457                    other => other,
1458                };
1459                // Could be an array directly or a dict with /D key
1460                match dest_obj {
1461                    lopdf::Object::Array(arr) => {
1462                        if let Some(result) =
1463                            resolve_dest_to_page(doc, &lopdf::Object::Array(arr.clone()), pages_map)
1464                        {
1465                            return Some(result);
1466                        }
1467                    }
1468                    lopdf::Object::Dictionary(d) => {
1469                        if let Ok(d_dest) = d.get(b"D") {
1470                            if let Some(result) = resolve_dest_to_page(doc, d_dest, pages_map) {
1471                                return Some(result);
1472                            }
1473                        }
1474                    }
1475                    _ => {}
1476                }
1477            }
1478        }
1479    }
1480
1481    None
1482}
1483
1484/// Look up a name in a PDF name tree (/Names array with key-value pairs).
1485fn lookup_name_tree(
1486    doc: &lopdf::Document,
1487    tree_dict: &lopdf::Dictionary,
1488    name: &str,
1489    pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1490) -> Option<(Option<usize>, Option<f64>)> {
1491    // Check /Names array (leaf node)
1492    if let Ok(names_arr_obj) = tree_dict.get(b"Names") {
1493        let names_arr_obj = match names_arr_obj {
1494            lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1495            other => other,
1496        };
1497        if let Ok(names_arr) = names_arr_obj.as_array() {
1498            // Names array is [key1, value1, key2, value2, ...]
1499            let mut i = 0;
1500            while i + 1 < names_arr.len() {
1501                let key_obj = match &names_arr[i] {
1502                    lopdf::Object::Reference(id) => match doc.get_object(*id) {
1503                        Ok(obj) => obj.clone(),
1504                        Err(_) => {
1505                            i += 2;
1506                            continue;
1507                        }
1508                    },
1509                    other => other.clone(),
1510                };
1511                if let lopdf::Object::String(key_bytes, _) = &key_obj {
1512                    let key_str = String::from_utf8_lossy(key_bytes);
1513                    if key_str == name {
1514                        let value = &names_arr[i + 1];
1515                        let value = match value {
1516                            lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1517                            other => other,
1518                        };
1519                        // Value can be an array (destination) or dict with /D
1520                        match value {
1521                            lopdf::Object::Array(arr) => {
1522                                return resolve_dest_to_page(
1523                                    doc,
1524                                    &lopdf::Object::Array(arr.clone()),
1525                                    pages_map,
1526                                );
1527                            }
1528                            lopdf::Object::Dictionary(d) => {
1529                                if let Ok(d_dest) = d.get(b"D") {
1530                                    return resolve_dest_to_page(doc, d_dest, pages_map);
1531                                }
1532                            }
1533                            _ => {}
1534                        }
1535                    }
1536                }
1537                i += 2;
1538            }
1539        }
1540    }
1541
1542    // Check /Kids array (intermediate nodes)
1543    if let Ok(kids_obj) = tree_dict.get(b"Kids") {
1544        let kids_obj = match kids_obj {
1545            lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1546            other => other,
1547        };
1548        if let Ok(kids_arr) = kids_obj.as_array() {
1549            for kid in kids_arr {
1550                let kid_obj = match kid {
1551                    lopdf::Object::Reference(id) => match doc.get_object(*id) {
1552                        Ok(obj) => obj,
1553                        Err(_) => continue,
1554                    },
1555                    other => other,
1556                };
1557                if let Ok(kid_dict) = kid_obj.as_dict() {
1558                    if let Some(result) = lookup_name_tree(doc, kid_dict, name, pages_map) {
1559                        return Some(result);
1560                    }
1561                }
1562            }
1563        }
1564    }
1565
1566    None
1567}
1568
1569/// Extract form fields from the document catalog's /AcroForm dictionary.
1570///
1571/// Walks the `/Fields` array recursively (handling `/Kids` for hierarchical
1572/// fields) and extracts field name, type, value, default value, options,
1573/// rect, and flags for each terminal field.
1574fn extract_document_form_fields(doc: &lopdf::Document) -> Result<Vec<FormField>, BackendError> {
1575    // Get the catalog dictionary
1576    let catalog_ref = match doc.trailer.get(b"Root") {
1577        Ok(obj) => obj,
1578        Err(_) => return Ok(Vec::new()),
1579    };
1580
1581    let catalog = match catalog_ref {
1582        lopdf::Object::Reference(id) => match doc.get_object(*id) {
1583            Ok(obj) => match obj.as_dict() {
1584                Ok(dict) => dict,
1585                Err(_) => return Ok(Vec::new()),
1586            },
1587            Err(_) => return Ok(Vec::new()),
1588        },
1589        lopdf::Object::Dictionary(dict) => dict,
1590        _ => return Ok(Vec::new()),
1591    };
1592
1593    // Get /AcroForm dictionary
1594    let acroform_obj = match catalog.get(b"AcroForm") {
1595        Ok(obj) => obj,
1596        Err(_) => return Ok(Vec::new()), // No AcroForm in this document
1597    };
1598
1599    let acroform_obj = match acroform_obj {
1600        lopdf::Object::Reference(id) => match doc.get_object(*id) {
1601            Ok(obj) => obj,
1602            Err(_) => return Ok(Vec::new()),
1603        },
1604        other => other,
1605    };
1606
1607    let acroform_dict = match acroform_obj.as_dict() {
1608        Ok(dict) => dict,
1609        Err(_) => return Ok(Vec::new()),
1610    };
1611
1612    // Get /Fields array
1613    let fields_obj = match acroform_dict.get(b"Fields") {
1614        Ok(obj) => obj,
1615        Err(_) => return Ok(Vec::new()),
1616    };
1617
1618    let fields_obj = match fields_obj {
1619        lopdf::Object::Reference(id) => match doc.get_object(*id) {
1620            Ok(obj) => obj,
1621            Err(_) => return Ok(Vec::new()),
1622        },
1623        other => other,
1624    };
1625
1626    let fields_array = match fields_obj.as_array() {
1627        Ok(arr) => arr,
1628        Err(_) => return Ok(Vec::new()),
1629    };
1630
1631    // Build page map for resolving page references
1632    let pages_map = doc.get_pages();
1633
1634    let mut form_fields = Vec::new();
1635    let max_depth = 64; // Prevent circular references
1636
1637    for field_entry in fields_array {
1638        let field_ref = match field_entry {
1639            lopdf::Object::Reference(id) => *id,
1640            _ => continue,
1641        };
1642        walk_field_tree(
1643            doc,
1644            field_ref,
1645            None, // No parent name prefix
1646            None, // No inherited field type
1647            0,
1648            max_depth,
1649            &pages_map,
1650            &mut form_fields,
1651        );
1652    }
1653
1654    Ok(form_fields)
1655}
1656
1657/// Recursively walk the form field tree, collecting terminal form fields.
1658///
1659/// Handles hierarchical fields where intermediate nodes carry partial
1660/// names (joined with `.`) and field type may be inherited from parents.
1661#[allow(clippy::too_many_arguments)]
1662fn walk_field_tree(
1663    doc: &lopdf::Document,
1664    field_id: lopdf::ObjectId,
1665    parent_name: Option<&str>,
1666    inherited_ft: Option<&FieldType>,
1667    depth: usize,
1668    max_depth: usize,
1669    pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1670    fields: &mut Vec<FormField>,
1671) {
1672    if depth >= max_depth {
1673        return;
1674    }
1675
1676    let field_obj = match doc.get_object(field_id) {
1677        Ok(obj) => obj,
1678        Err(_) => return,
1679    };
1680
1681    let field_dict = match field_obj.as_dict() {
1682        Ok(dict) => dict,
1683        Err(_) => return,
1684    };
1685
1686    // Extract partial name /T
1687    let partial_name = extract_string_from_dict(doc, field_dict, b"T");
1688
1689    // Build full qualified name
1690    let full_name = match (&parent_name, &partial_name) {
1691        (Some(parent), Some(name)) => format!("{parent}.{name}"),
1692        (Some(parent), None) => parent.to_string(),
1693        (None, Some(name)) => name.clone(),
1694        (None, None) => String::new(),
1695    };
1696
1697    // Extract /FT (field type) — may be inherited from parent
1698    let field_type = match field_dict.get(b"FT") {
1699        Ok(lopdf::Object::Name(name)) => FieldType::from_pdf_name(&String::from_utf8_lossy(name)),
1700        _ => inherited_ft.cloned(),
1701    };
1702
1703    // Check for /Kids — if present, this is an intermediate node
1704    if let Ok(kids_obj) = field_dict.get(b"Kids") {
1705        let kids_obj = match kids_obj {
1706            lopdf::Object::Reference(id) => match doc.get_object(*id) {
1707                Ok(obj) => obj,
1708                Err(_) => return,
1709            },
1710            other => other,
1711        };
1712
1713        if let Ok(kids_array) = kids_obj.as_array() {
1714            // Check if /Kids contains widget annotations or child fields.
1715            // If a kid has /T, it's a child field; otherwise it's a widget annotation.
1716            let has_child_fields = kids_array.iter().any(|kid| {
1717                let kid_obj = match kid {
1718                    lopdf::Object::Reference(id) => doc.get_object(*id).ok(),
1719                    _ => Some(kid),
1720                };
1721                kid_obj
1722                    .and_then(|o| o.as_dict().ok())
1723                    .is_some_and(|d| d.get(b"T").is_ok())
1724            });
1725
1726            if has_child_fields {
1727                // Recurse into child fields
1728                for kid in kids_array {
1729                    if let lopdf::Object::Reference(kid_id) = kid {
1730                        walk_field_tree(
1731                            doc,
1732                            *kid_id,
1733                            Some(&full_name),
1734                            field_type.as_ref(),
1735                            depth + 1,
1736                            max_depth,
1737                            pages_map,
1738                            fields,
1739                        );
1740                    }
1741                }
1742                return;
1743            }
1744            // If kids are only widgets (no /T), fall through to extract this as a terminal field.
1745        }
1746    }
1747
1748    // Terminal field — extract all properties
1749    let Some(field_type) = field_type else {
1750        return; // Skip fields without a type
1751    };
1752
1753    // Extract /V (value)
1754    let value = extract_field_value(doc, field_dict, b"V");
1755
1756    // Extract /DV (default value)
1757    let default_value = extract_field_value(doc, field_dict, b"DV");
1758
1759    // Extract /Rect (bounding box)
1760    let bbox = extract_field_bbox(doc, field_dict).unwrap_or(BBox::new(0.0, 0.0, 0.0, 0.0));
1761
1762    // Extract /Opt (options for choice fields)
1763    let options = extract_field_options(doc, field_dict);
1764
1765    // Extract /Ff (field flags)
1766    let flags = match field_dict.get(b"Ff") {
1767        Ok(lopdf::Object::Integer(n)) => *n as u32,
1768        _ => 0,
1769    };
1770
1771    // Try to determine page index from /P reference or widget annotations
1772    let page_index = resolve_field_page(doc, field_dict, pages_map);
1773
1774    fields.push(FormField {
1775        name: full_name,
1776        field_type,
1777        value,
1778        default_value,
1779        bbox,
1780        options,
1781        flags,
1782        page_index,
1783    });
1784}
1785
1786/// Extract a field value from /V or /DV entry.
1787///
1788/// Handles strings, names, and arrays of strings.
1789fn extract_field_value(
1790    doc: &lopdf::Document,
1791    dict: &lopdf::Dictionary,
1792    key: &[u8],
1793) -> Option<String> {
1794    let obj = dict.get(key).ok()?;
1795    let obj = match obj {
1796        lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1797        other => other,
1798    };
1799    match obj {
1800        lopdf::Object::String(bytes, _) => Some(decode_pdf_string(bytes)),
1801        lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
1802        lopdf::Object::Array(arr) => {
1803            // Multi-select: join values
1804            let vals: Vec<String> = arr
1805                .iter()
1806                .filter_map(|item| match item {
1807                    lopdf::Object::String(bytes, _) => Some(decode_pdf_string(bytes)),
1808                    lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
1809                    _ => None,
1810                })
1811                .collect();
1812            if vals.is_empty() {
1813                None
1814            } else {
1815                Some(vals.join(", "))
1816            }
1817        }
1818        _ => None,
1819    }
1820}
1821
/// Decode the raw bytes of a PDF text string into a `String`.
///
/// The encoding is selected from the leading bytes:
///
/// - `FE FF`: UTF-16BE. The BOM is removed, a trailing odd byte is dropped
///   and unpaired surrogates become U+FFFD.
/// - `EF BB BF`: UTF-8 with a byte order mark (PDF 2.0 text strings). The BOM
///   is removed and invalid sequences become U+FFFD.
/// - anything else: the bytes are returned unchanged when they form valid
///   UTF-8; otherwise every byte is mapped to the code point of the same
///   value (Latin-1), so accented characters written by legacy producers are
///   preserved instead of being replaced with U+FFFD.
fn decode_pdf_string(bytes: &[u8]) -> String {
    if let Some(utf16) = bytes.strip_prefix(&[0xFE_u8, 0xFF]) {
        // `chunks_exact` silently ignores an incomplete final code unit.
        let units: Vec<u16> = utf16
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units);
    }

    if let Some(utf8) = bytes.strip_prefix(&[0xEF_u8, 0xBB, 0xBF]) {
        return String::from_utf8_lossy(utf8).into_owned();
    }

    match std::str::from_utf8(bytes) {
        Ok(text) => text.to_owned(),
        // Not UTF-8: fall back to a byte-for-byte Latin-1 decode.
        Err(_) => bytes.iter().map(|&byte| char::from(byte)).collect(),
    }
}
1841
1842/// Extract bounding box from a field's /Rect entry.
1843fn extract_field_bbox(doc: &lopdf::Document, dict: &lopdf::Dictionary) -> Option<BBox> {
1844    let rect_obj = dict.get(b"Rect").ok()?;
1845    let rect_obj = match rect_obj {
1846        lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1847        other => other,
1848    };
1849    let arr = rect_obj.as_array().ok()?;
1850    extract_bbox_from_array(arr).ok()
1851}
1852
1853/// Extract options from a choice field's /Opt entry.
1854fn extract_field_options(doc: &lopdf::Document, dict: &lopdf::Dictionary) -> Vec<String> {
1855    let opt_obj = match dict.get(b"Opt") {
1856        Ok(obj) => obj,
1857        Err(_) => return Vec::new(),
1858    };
1859    let opt_obj = match opt_obj {
1860        lopdf::Object::Reference(id) => match doc.get_object(*id) {
1861            Ok(obj) => obj,
1862            Err(_) => return Vec::new(),
1863        },
1864        other => other,
1865    };
1866    let opt_array = match opt_obj.as_array() {
1867        Ok(arr) => arr,
1868        Err(_) => return Vec::new(),
1869    };
1870
1871    opt_array
1872        .iter()
1873        .filter_map(|item| {
1874            let item = match item {
1875                lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
1876                other => other,
1877            };
1878            match item {
1879                lopdf::Object::String(bytes, _) => Some(decode_pdf_string(bytes)),
1880                lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
1881                // Option can be [export_value, display_value] pair
1882                lopdf::Object::Array(pair) => {
1883                    if pair.len() >= 2 {
1884                        // Use display value (second element)
1885                        match &pair[1] {
1886                            lopdf::Object::String(bytes, _) => Some(decode_pdf_string(bytes)),
1887                            lopdf::Object::Name(name) => {
1888                                Some(String::from_utf8_lossy(name).into_owned())
1889                            }
1890                            _ => None,
1891                        }
1892                    } else {
1893                        None
1894                    }
1895                }
1896                _ => None,
1897            }
1898        })
1899        .collect()
1900}
1901
1902/// Resolve a form field's page index from /P reference.
1903fn resolve_field_page(
1904    _doc: &lopdf::Document,
1905    dict: &lopdf::Dictionary,
1906    pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
1907) -> Option<usize> {
1908    // Try /P (page reference)
1909    let page_ref = match dict.get(b"P") {
1910        Ok(lopdf::Object::Reference(id)) => *id,
1911        _ => return None,
1912    };
1913
1914    // Resolve page reference to 0-based index
1915    pages_map.iter().find_map(|(&page_num, &page_id)| {
1916        if page_id == page_ref {
1917            Some((page_num - 1) as usize) // lopdf pages are 1-indexed
1918        } else {
1919            None
1920        }
1921    })
1922}
1923
1924/// Extract digital signature information from the document's `/AcroForm`.
1925///
1926/// Walks the field tree and collects signature fields (`/FT /Sig`).
1927/// For signed fields (those with `/V`), extracts signer name, date,
1928/// reason, location, and contact info from the signature value dictionary.
1929fn extract_document_signatures(doc: &lopdf::Document) -> Result<Vec<SignatureInfo>, BackendError> {
1930    // Get the catalog dictionary
1931    let catalog_ref = match doc.trailer.get(b"Root") {
1932        Ok(obj) => obj,
1933        Err(_) => return Ok(Vec::new()),
1934    };
1935
1936    let catalog = match catalog_ref {
1937        lopdf::Object::Reference(id) => match doc.get_object(*id) {
1938            Ok(obj) => match obj.as_dict() {
1939                Ok(dict) => dict,
1940                Err(_) => return Ok(Vec::new()),
1941            },
1942            Err(_) => return Ok(Vec::new()),
1943        },
1944        lopdf::Object::Dictionary(dict) => dict,
1945        _ => return Ok(Vec::new()),
1946    };
1947
1948    // Get /AcroForm dictionary
1949    let acroform_obj = match catalog.get(b"AcroForm") {
1950        Ok(obj) => obj,
1951        Err(_) => return Ok(Vec::new()),
1952    };
1953
1954    let acroform_obj = match acroform_obj {
1955        lopdf::Object::Reference(id) => match doc.get_object(*id) {
1956            Ok(obj) => obj,
1957            Err(_) => return Ok(Vec::new()),
1958        },
1959        other => other,
1960    };
1961
1962    let acroform_dict = match acroform_obj.as_dict() {
1963        Ok(dict) => dict,
1964        Err(_) => return Ok(Vec::new()),
1965    };
1966
1967    // Get /Fields array
1968    let fields_obj = match acroform_dict.get(b"Fields") {
1969        Ok(obj) => obj,
1970        Err(_) => return Ok(Vec::new()),
1971    };
1972
1973    let fields_obj = match fields_obj {
1974        lopdf::Object::Reference(id) => match doc.get_object(*id) {
1975            Ok(obj) => obj,
1976            Err(_) => return Ok(Vec::new()),
1977        },
1978        other => other,
1979    };
1980
1981    let fields_array = match fields_obj.as_array() {
1982        Ok(arr) => arr,
1983        Err(_) => return Ok(Vec::new()),
1984    };
1985
1986    let mut signatures = Vec::new();
1987    let max_depth = 64;
1988
1989    for field_entry in fields_array {
1990        let field_ref = match field_entry {
1991            lopdf::Object::Reference(id) => *id,
1992            _ => continue,
1993        };
1994        walk_signature_tree(doc, field_ref, None, 0, max_depth, &mut signatures);
1995    }
1996
1997    Ok(signatures)
1998}
1999
2000/// Recursively walk the form field tree, collecting signature fields.
2001///
2002/// Similar to `walk_field_tree` but only collects `/FT /Sig` fields
2003/// and extracts signature-specific metadata from `/V`.
2004fn walk_signature_tree(
2005    doc: &lopdf::Document,
2006    field_id: lopdf::ObjectId,
2007    inherited_ft: Option<&[u8]>,
2008    depth: usize,
2009    max_depth: usize,
2010    signatures: &mut Vec<SignatureInfo>,
2011) {
2012    if depth >= max_depth {
2013        return;
2014    }
2015
2016    let field_obj = match doc.get_object(field_id) {
2017        Ok(obj) => obj,
2018        Err(_) => return,
2019    };
2020
2021    let field_dict = match field_obj.as_dict() {
2022        Ok(dict) => dict,
2023        Err(_) => return,
2024    };
2025
2026    // Extract /FT — may be inherited from parent
2027    let field_type = match field_dict.get(b"FT") {
2028        Ok(lopdf::Object::Name(name)) => Some(name.as_slice()),
2029        _ => inherited_ft,
2030    };
2031
2032    // Check for /Kids — if present, this may be an intermediate node
2033    if let Ok(kids_obj) = field_dict.get(b"Kids") {
2034        let kids_obj = match kids_obj {
2035            lopdf::Object::Reference(id) => match doc.get_object(*id) {
2036                Ok(obj) => obj,
2037                Err(_) => return,
2038            },
2039            other => other,
2040        };
2041
2042        if let Ok(kids_array) = kids_obj.as_array() {
2043            // Check if /Kids contains child fields (with /T) or widget annotations
2044            let has_child_fields = kids_array.iter().any(|kid| {
2045                let kid_obj = match kid {
2046                    lopdf::Object::Reference(id) => doc.get_object(*id).ok(),
2047                    _ => Some(kid),
2048                };
2049                kid_obj
2050                    .and_then(|o| o.as_dict().ok())
2051                    .is_some_and(|d| d.get(b"T").is_ok())
2052            });
2053
2054            if has_child_fields {
2055                for kid in kids_array {
2056                    if let lopdf::Object::Reference(kid_id) = kid {
2057                        walk_signature_tree(
2058                            doc,
2059                            *kid_id,
2060                            field_type,
2061                            depth + 1,
2062                            max_depth,
2063                            signatures,
2064                        );
2065                    }
2066                }
2067                return;
2068            }
2069        }
2070    }
2071
2072    // Terminal field — check if it's a signature field
2073    let is_sig = field_type.is_some_and(|ft| ft == b"Sig");
2074    if !is_sig {
2075        return;
2076    }
2077
2078    // Check for /V (signature value dictionary)
2079    let sig_dict = field_dict
2080        .get(b"V")
2081        .ok()
2082        .and_then(|obj| match obj {
2083            lopdf::Object::Reference(id) => doc.get_object(*id).ok(),
2084            other => Some(other),
2085        })
2086        .and_then(|obj| obj.as_dict().ok());
2087
2088    let info = match sig_dict {
2089        Some(v_dict) => SignatureInfo {
2090            signer_name: extract_string_from_dict(doc, v_dict, b"Name"),
2091            sign_date: extract_string_from_dict(doc, v_dict, b"M"),
2092            reason: extract_string_from_dict(doc, v_dict, b"Reason"),
2093            location: extract_string_from_dict(doc, v_dict, b"Location"),
2094            contact_info: extract_string_from_dict(doc, v_dict, b"ContactInfo"),
2095            is_signed: true,
2096        },
2097        None => SignatureInfo {
2098            signer_name: None,
2099            sign_date: None,
2100            reason: None,
2101            location: None,
2102            contact_info: None,
2103            is_signed: false,
2104        },
2105    };
2106
2107    signatures.push(info);
2108}
2109
2110/// Extract the document structure tree from `/StructTreeRoot`.
2111///
2112/// Walks the structure tree recursively, extracting element types, MCIDs,
2113/// alt text, actual text, language, and child elements. Returns an empty
2114/// Vec for untagged PDFs (no `/StructTreeRoot`).
2115fn extract_document_structure_tree(
2116    doc: &lopdf::Document,
2117) -> Result<Vec<StructElement>, BackendError> {
2118    // Get the catalog dictionary
2119    let catalog_ref = match doc.trailer.get(b"Root") {
2120        Ok(obj) => obj,
2121        Err(_) => return Ok(Vec::new()),
2122    };
2123
2124    let catalog = match catalog_ref {
2125        lopdf::Object::Reference(id) => match doc.get_object(*id) {
2126            Ok(obj) => match obj.as_dict() {
2127                Ok(dict) => dict,
2128                Err(_) => return Ok(Vec::new()),
2129            },
2130            Err(_) => return Ok(Vec::new()),
2131        },
2132        lopdf::Object::Dictionary(dict) => dict,
2133        _ => return Ok(Vec::new()),
2134    };
2135
2136    // Get /StructTreeRoot dictionary
2137    let struct_tree_obj = match catalog.get(b"StructTreeRoot") {
2138        Ok(obj) => obj,
2139        Err(_) => return Ok(Vec::new()), // Not a tagged PDF
2140    };
2141
2142    let struct_tree_obj = resolve_object(doc, struct_tree_obj);
2143    let struct_tree_dict = match struct_tree_obj.as_dict() {
2144        Ok(dict) => dict,
2145        Err(_) => return Ok(Vec::new()),
2146    };
2147
2148    // Build page map for resolving page references
2149    let pages_map = doc.get_pages();
2150
2151    // Get /K (kids) — the children of the root structure element
2152    let kids_obj = match struct_tree_dict.get(b"K") {
2153        Ok(obj) => obj,
2154        Err(_) => return Ok(Vec::new()), // Empty structure tree
2155    };
2156
2157    let max_depth = 64; // Prevent circular references
2158    let elements = parse_struct_kids(doc, kids_obj, 0, max_depth, &pages_map);
2159    Ok(elements)
2160}
2161
2162/// Parse the /K (kids) entry of a structure element, which can be:
2163/// - An integer MCID
2164/// - A reference to a structure element dictionary
2165/// - A dictionary (MCR or structure element)
2166/// - An array of the above
2167fn parse_struct_kids(
2168    doc: &lopdf::Document,
2169    kids_obj: &lopdf::Object,
2170    depth: usize,
2171    max_depth: usize,
2172    pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
2173) -> Vec<StructElement> {
2174    if depth >= max_depth {
2175        return Vec::new();
2176    }
2177
2178    let kids_obj = resolve_object(doc, kids_obj);
2179
2180    match kids_obj {
2181        lopdf::Object::Array(arr) => {
2182            let mut elements = Vec::new();
2183            for item in arr {
2184                let item = resolve_object(doc, item);
2185                match item {
2186                    lopdf::Object::Dictionary(dict) => {
2187                        if let Some(elem) =
2188                            parse_struct_element(doc, dict, depth + 1, max_depth, pages_map)
2189                        {
2190                            elements.push(elem);
2191                        }
2192                    }
2193                    lopdf::Object::Reference(id) => {
2194                        if let Ok(obj) = doc.get_object(*id) {
2195                            if let Ok(dict) = obj.as_dict() {
2196                                if let Some(elem) =
2197                                    parse_struct_element(doc, dict, depth + 1, max_depth, pages_map)
2198                                {
2199                                    elements.push(elem);
2200                                }
2201                            }
2202                        }
2203                    }
2204                    // Integer MCID at root level — create a minimal element
2205                    lopdf::Object::Integer(_) => {
2206                        // MCIDs at root level without a structure element are unusual;
2207                        // typically they appear inside a structure element's /K
2208                    }
2209                    _ => {}
2210                }
2211            }
2212            elements
2213        }
2214        lopdf::Object::Dictionary(dict) => {
2215            if let Some(elem) = parse_struct_element(doc, dict, depth + 1, max_depth, pages_map) {
2216                vec![elem]
2217            } else {
2218                Vec::new()
2219            }
2220        }
2221        lopdf::Object::Reference(id) => {
2222            if let Ok(obj) = doc.get_object(*id) {
2223                if let Ok(dict) = obj.as_dict() {
2224                    if let Some(elem) =
2225                        parse_struct_element(doc, dict, depth + 1, max_depth, pages_map)
2226                    {
2227                        return vec![elem];
2228                    }
2229                }
2230            }
2231            Vec::new()
2232        }
2233        _ => Vec::new(),
2234    }
2235}
2236
2237/// Parse a single structure element dictionary.
2238///
2239/// Extracts /S (type), /K (kids/MCIDs), /Alt, /ActualText, /Lang,
2240/// and recurses into children.
2241fn parse_struct_element(
2242    doc: &lopdf::Document,
2243    dict: &lopdf::Dictionary,
2244    depth: usize,
2245    max_depth: usize,
2246    pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
2247) -> Option<StructElement> {
2248    // Check if this is a marked-content reference (MCR) dictionary
2249    // MCR dicts have /Type /MCR and /MCID, but no /S
2250    if dict.get(b"MCID").is_ok() && dict.get(b"S").is_err() {
2251        return None; // MCR, not a structure element
2252    }
2253
2254    // Get /S (structure type) — required for structure elements
2255    let element_type = match dict.get(b"S") {
2256        Ok(obj) => {
2257            let obj = resolve_object(doc, obj);
2258            match obj {
2259                lopdf::Object::Name(name) => String::from_utf8_lossy(name).into_owned(),
2260                _ => return None,
2261            }
2262        }
2263        Err(_) => return None, // Not a structure element without /S
2264    };
2265
2266    // Extract MCIDs and children from /K
2267    let mut mcids = Vec::new();
2268    let mut children = Vec::new();
2269
2270    if let Ok(k_obj) = dict.get(b"K") {
2271        collect_mcids_and_children(
2272            doc,
2273            k_obj,
2274            &mut mcids,
2275            &mut children,
2276            depth,
2277            max_depth,
2278            pages_map,
2279        );
2280    }
2281
2282    // Extract /Alt (alternative text)
2283    let alt_text = extract_string_entry(doc, dict, b"Alt");
2284
2285    // Extract /ActualText
2286    let actual_text = extract_string_entry(doc, dict, b"ActualText");
2287
2288    // Extract /Lang
2289    let lang = extract_string_entry(doc, dict, b"Lang");
2290
2291    // Extract page index from /Pg (page reference for this element)
2292    let page_index = resolve_struct_page(doc, dict, pages_map);
2293
2294    Some(StructElement {
2295        element_type,
2296        mcids,
2297        alt_text,
2298        actual_text,
2299        lang,
2300        bbox: None, // PDF structure elements don't always have explicit bbox
2301        children,
2302        page_index,
2303    })
2304}
2305
2306/// Collect MCIDs and child structure elements from a /K entry.
2307///
2308/// /K can be:
2309/// - An integer (MCID)
2310/// - A dictionary (MCR with /MCID, or a child structure element)
2311/// - A reference to a dictionary
2312/// - An array of the above
2313fn collect_mcids_and_children(
2314    doc: &lopdf::Document,
2315    k_obj: &lopdf::Object,
2316    mcids: &mut Vec<u32>,
2317    children: &mut Vec<StructElement>,
2318    depth: usize,
2319    max_depth: usize,
2320    pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
2321) {
2322    if depth >= max_depth {
2323        return;
2324    }
2325
2326    let k_obj = resolve_object(doc, k_obj);
2327
2328    match k_obj {
2329        lopdf::Object::Integer(n) => {
2330            // Direct MCID
2331            if *n >= 0 {
2332                mcids.push(*n as u32);
2333            }
2334        }
2335        lopdf::Object::Dictionary(dict) => {
2336            process_k_dict(doc, dict, mcids, children, depth, max_depth, pages_map);
2337        }
2338        lopdf::Object::Reference(id) => {
2339            if let Ok(obj) = doc.get_object(*id) {
2340                match obj {
2341                    lopdf::Object::Dictionary(dict) => {
2342                        process_k_dict(doc, dict, mcids, children, depth, max_depth, pages_map);
2343                    }
2344                    lopdf::Object::Integer(n) => {
2345                        if *n >= 0 {
2346                            mcids.push(*n as u32);
2347                        }
2348                    }
2349                    _ => {}
2350                }
2351            }
2352        }
2353        lopdf::Object::Array(arr) => {
2354            for item in arr {
2355                collect_mcids_and_children(doc, item, mcids, children, depth, max_depth, pages_map);
2356            }
2357        }
2358        _ => {}
2359    }
2360}
2361
2362/// Process a dictionary found in /K — it can be an MCR (with /MCID) or a child struct element.
2363fn process_k_dict(
2364    doc: &lopdf::Document,
2365    dict: &lopdf::Dictionary,
2366    mcids: &mut Vec<u32>,
2367    children: &mut Vec<StructElement>,
2368    depth: usize,
2369    max_depth: usize,
2370    pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
2371) {
2372    // Check if this is a marked-content reference (MCR)
2373    if let Ok(mcid_obj) = dict.get(b"MCID") {
2374        let mcid_obj = resolve_object(doc, mcid_obj);
2375        if let lopdf::Object::Integer(n) = mcid_obj {
2376            if *n >= 0 {
2377                mcids.push(*n as u32);
2378            }
2379        }
2380        return;
2381    }
2382
2383    // Otherwise, treat as a child structure element
2384    if let Some(elem) = parse_struct_element(doc, dict, depth + 1, max_depth, pages_map) {
2385        children.push(elem);
2386    }
2387}
2388
2389/// Resolve a structure element's page index from /Pg reference.
2390fn resolve_struct_page(
2391    _doc: &lopdf::Document,
2392    dict: &lopdf::Dictionary,
2393    pages_map: &std::collections::BTreeMap<u32, lopdf::ObjectId>,
2394) -> Option<usize> {
2395    let page_ref = match dict.get(b"Pg") {
2396        Ok(lopdf::Object::Reference(id)) => *id,
2397        _ => return None,
2398    };
2399
2400    // Find which page index this reference corresponds to
2401    for (page_num, page_id) in pages_map {
2402        if *page_id == page_ref {
2403            return Some((*page_num - 1) as usize); // pages_map uses 1-based
2404        }
2405    }
2406
2407    None
2408}
2409
2410/// Extract a string entry from a dictionary (handles both String and Name objects).
2411fn extract_string_entry(
2412    doc: &lopdf::Document,
2413    dict: &lopdf::Dictionary,
2414    key: &[u8],
2415) -> Option<String> {
2416    let obj = dict.get(key).ok()?;
2417    let obj = resolve_object(doc, obj);
2418    match obj {
2419        lopdf::Object::String(bytes, _) => Some(decode_pdf_string(bytes)),
2420        lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
2421        _ => None,
2422    }
2423}
2424
2425/// Resolve a potentially indirect object reference.
2426fn resolve_object<'a>(doc: &'a lopdf::Document, obj: &'a lopdf::Object) -> &'a lopdf::Object {
2427    match obj {
2428        lopdf::Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
2429        _ => obj,
2430    }
2431}
2432
2433/// Extract annotations from a page's /Annots array.
2434fn extract_page_annotations(
2435    doc: &lopdf::Document,
2436    page_id: lopdf::ObjectId,
2437) -> Result<Vec<Annotation>, BackendError> {
2438    let page_dict = doc
2439        .get_object(page_id)
2440        .and_then(|o| o.as_dict())
2441        .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
2442
2443    // Get /Annots array (may be a direct array or indirect reference)
2444    let annots_obj = match page_dict.get(b"Annots") {
2445        Ok(obj) => obj,
2446        Err(_) => return Ok(Vec::new()), // No annotations on this page
2447    };
2448
2449    // Resolve indirect reference to the array
2450    let annots_obj = match annots_obj {
2451        lopdf::Object::Reference(id) => doc
2452            .get_object(*id)
2453            .map_err(|e| BackendError::Parse(format!("failed to resolve /Annots ref: {e}")))?,
2454        other => other,
2455    };
2456
2457    let annots_array = annots_obj
2458        .as_array()
2459        .map_err(|e| BackendError::Parse(format!("/Annots is not an array: {e}")))?;
2460
2461    let mut annotations = Vec::new();
2462
2463    for annot_entry in annots_array {
2464        // Each entry may be a direct dictionary or an indirect reference
2465        let annot_obj = match annot_entry {
2466            lopdf::Object::Reference(id) => match doc.get_object(*id) {
2467                Ok(obj) => obj,
2468                Err(_) => continue, // Skip unresolvable references
2469            },
2470            other => other,
2471        };
2472
2473        let annot_dict = match annot_obj.as_dict() {
2474            Ok(dict) => dict,
2475            Err(_) => continue, // Skip non-dictionary entries
2476        };
2477
2478        // Extract /Subtype (required for annotations)
2479        let raw_subtype = match annot_dict.get(b"Subtype") {
2480            Ok(obj) => match obj {
2481                lopdf::Object::Name(name) => String::from_utf8_lossy(name).into_owned(),
2482                _ => continue, // Skip if /Subtype is not a name
2483            },
2484            Err(_) => continue, // Skip annotations without /Subtype
2485        };
2486
2487        let annot_type = AnnotationType::from_subtype(&raw_subtype);
2488
2489        // Extract /Rect (bounding box)
2490        let bbox = match annot_dict.get(b"Rect") {
2491            Ok(obj) => {
2492                let obj = match obj {
2493                    lopdf::Object::Reference(id) => match doc.get_object(*id) {
2494                        Ok(resolved) => resolved,
2495                        Err(_) => continue,
2496                    },
2497                    other => other,
2498                };
2499                match obj.as_array() {
2500                    Ok(arr) => match extract_bbox_from_array(arr) {
2501                        Ok(b) => b,
2502                        Err(_) => continue,
2503                    },
2504                    Err(_) => continue,
2505                }
2506            }
2507            Err(_) => continue, // Skip annotations without /Rect
2508        };
2509
2510        // Extract optional fields
2511        let contents = extract_string_from_dict(doc, annot_dict, b"Contents");
2512        let author = extract_string_from_dict(doc, annot_dict, b"T");
2513        let date = extract_string_from_dict(doc, annot_dict, b"M");
2514
2515        annotations.push(Annotation {
2516            annot_type,
2517            bbox,
2518            contents,
2519            author,
2520            date,
2521            raw_subtype,
2522        });
2523    }
2524
2525    Ok(annotations)
2526}
2527
2528/// Extract hyperlinks from a page's Link annotations.
2529///
2530/// Filters annotations for `/Subtype /Link` and resolves URI targets from
2531/// `/A` (action) or `/Dest` entries.
2532fn extract_page_hyperlinks(
2533    doc: &lopdf::Document,
2534    page_id: lopdf::ObjectId,
2535) -> Result<Vec<Hyperlink>, BackendError> {
2536    let page_dict = doc
2537        .get_object(page_id)
2538        .and_then(|o| o.as_dict())
2539        .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
2540
2541    // Get /Annots array
2542    let annots_obj = match page_dict.get(b"Annots") {
2543        Ok(obj) => obj,
2544        Err(_) => return Ok(Vec::new()),
2545    };
2546
2547    // Resolve indirect reference to the array
2548    let annots_obj = match annots_obj {
2549        lopdf::Object::Reference(id) => doc
2550            .get_object(*id)
2551            .map_err(|e| BackendError::Parse(format!("failed to resolve /Annots ref: {e}")))?,
2552        other => other,
2553    };
2554
2555    let annots_array = annots_obj
2556        .as_array()
2557        .map_err(|e| BackendError::Parse(format!("/Annots is not an array: {e}")))?;
2558
2559    let mut hyperlinks = Vec::new();
2560
2561    for annot_entry in annots_array {
2562        // Each entry may be a direct dictionary or an indirect reference
2563        let annot_obj = match annot_entry {
2564            lopdf::Object::Reference(id) => match doc.get_object(*id) {
2565                Ok(obj) => obj,
2566                Err(_) => continue,
2567            },
2568            other => other,
2569        };
2570
2571        let annot_dict = match annot_obj.as_dict() {
2572            Ok(dict) => dict,
2573            Err(_) => continue,
2574        };
2575
2576        // Only process Link annotations
2577        let subtype = match annot_dict.get(b"Subtype") {
2578            Ok(lopdf::Object::Name(name)) => String::from_utf8_lossy(name).into_owned(),
2579            _ => continue,
2580        };
2581        if subtype != "Link" {
2582            continue;
2583        }
2584
2585        // Extract /Rect (bounding box)
2586        let bbox = match annot_dict.get(b"Rect") {
2587            Ok(obj) => {
2588                let obj = match obj {
2589                    lopdf::Object::Reference(id) => match doc.get_object(*id) {
2590                        Ok(resolved) => resolved,
2591                        Err(_) => continue,
2592                    },
2593                    other => other,
2594                };
2595                match obj.as_array() {
2596                    Ok(arr) => match extract_bbox_from_array(arr) {
2597                        Ok(b) => b,
2598                        Err(_) => continue,
2599                    },
2600                    Err(_) => continue,
2601                }
2602            }
2603            Err(_) => continue,
2604        };
2605
2606        // Try to resolve URI from /A (action) dictionary
2607        let uri = resolve_link_uri(doc, annot_dict);
2608
2609        // Skip links without a resolvable URI
2610        if let Some(uri) = uri {
2611            if !uri.is_empty() {
2612                hyperlinks.push(Hyperlink { bbox, uri });
2613            }
2614        }
2615    }
2616
2617    Ok(hyperlinks)
2618}
2619
2620/// Resolve the URI target of a Link annotation.
2621///
2622/// Checks the /A (action) dictionary first, then /Dest.
2623fn resolve_link_uri(doc: &lopdf::Document, annot_dict: &lopdf::Dictionary) -> Option<String> {
2624    // Try /A (Action) dictionary
2625    if let Ok(action_obj) = annot_dict.get(b"A") {
2626        let action_obj = match action_obj {
2627            lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
2628            other => other,
2629        };
2630        if let Ok(action_dict) = action_obj.as_dict() {
2631            // Get action type /S
2632            if let Ok(lopdf::Object::Name(action_type)) = action_dict.get(b"S") {
2633                let action_type_str = String::from_utf8_lossy(action_type);
2634                match action_type_str.as_ref() {
2635                    "URI" => {
2636                        // Extract /URI string
2637                        return extract_string_from_dict(doc, action_dict, b"URI");
2638                    }
2639                    "GoTo" => {
2640                        // Extract /D destination
2641                        return resolve_goto_dest(doc, action_dict);
2642                    }
2643                    "GoToR" => {
2644                        // Remote GoTo — extract /F (file) and /D (dest)
2645                        let file = extract_string_from_dict(doc, action_dict, b"F");
2646                        if let Some(f) = file {
2647                            return Some(f);
2648                        }
2649                    }
2650                    _ => {}
2651                }
2652            }
2653        }
2654    }
2655
2656    // Try /Dest (direct destination, no action)
2657    if let Ok(dest_obj) = annot_dict.get(b"Dest") {
2658        return resolve_dest_object(doc, dest_obj);
2659    }
2660
2661    None
2662}
2663
2664/// Resolve a GoTo action's /D destination to a string.
2665fn resolve_goto_dest(doc: &lopdf::Document, action_dict: &lopdf::Dictionary) -> Option<String> {
2666    let dest_obj = action_dict.get(b"D").ok()?;
2667    resolve_dest_object(doc, dest_obj)
2668}
2669
2670/// Resolve a destination object to a string representation.
2671///
2672/// Destinations can be:
2673/// - A name string (named destination)
2674/// - An array [page_ref, /type, ...] (explicit destination)
2675fn resolve_dest_object(doc: &lopdf::Document, dest_obj: &lopdf::Object) -> Option<String> {
2676    let dest_obj = match dest_obj {
2677        lopdf::Object::Reference(id) => doc.get_object(*id).ok()?,
2678        other => other,
2679    };
2680
2681    match dest_obj {
2682        // Named destination (string)
2683        lopdf::Object::String(bytes, _) => {
2684            if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
2685                let chars: Vec<u16> = bytes[2..]
2686                    .chunks(2)
2687                    .filter_map(|c| {
2688                        if c.len() == 2 {
2689                            Some(u16::from_be_bytes([c[0], c[1]]))
2690                        } else {
2691                            None
2692                        }
2693                    })
2694                    .collect();
2695                String::from_utf16(&chars).ok()
2696            } else {
2697                match std::str::from_utf8(bytes) {
2698                    Ok(s) => Some(s.to_string()),
2699                    Err(_) => Some(bytes.iter().map(|&b| b as char).collect()),
2700                }
2701            }
2702        }
2703        // Named destination (name)
2704        lopdf::Object::Name(name) => Some(String::from_utf8_lossy(name).into_owned()),
2705        // Explicit destination array [page_ref, /type, ...]
2706        lopdf::Object::Array(arr) => {
2707            if arr.is_empty() {
2708                return None;
2709            }
2710            // First element is a page reference — try to resolve page number
2711            if let lopdf::Object::Reference(page_ref) = &arr[0] {
2712                // Find the page number by matching against document pages
2713                let pages_map = doc.get_pages();
2714                for (&page_num, &page_id) in &pages_map {
2715                    if page_id == *page_ref {
2716                        return Some(format!("#page={page_num}"));
2717                    }
2718                }
2719                // Couldn't resolve page number, use reference
2720                return Some(format!("#ref={},{}", page_ref.0, page_ref.1));
2721            }
2722            None
2723        }
2724        _ => None,
2725    }
2726}
2727
/// Build a minimal PDF containing `page_count` empty pages.
///
/// Every page carries a 612 x 792 point MediaBox and no content stream.
/// Test-only fixture.
#[cfg(test)]
fn create_test_pdf(page_count: usize) -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    // Reserve the page-tree id first so each page can name it as /Parent.
    let tree_id: ObjectId = pdf.new_object_id();

    let kids: Vec<Object> = (0..page_count)
        .map(|_| {
            let leaf_id = pdf.add_object(dictionary! {
                "Type" => "Page",
                "Parent" => tree_id,
                "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            });
            Object::Reference(leaf_id)
        })
        .collect();

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => kids,
        "Count" => page_count as i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
2768
/// Build a one-page PDF whose MediaBox is declared only on the `/Pages` node.
///
/// The page dictionary itself has no `/MediaBox`, so readers must pick up
/// the 595 x 842 box from the parent. Test-only fixture.
#[cfg(test)]
fn create_test_pdf_inherited_media_box() -> Vec<u8> {
    use lopdf::{Document, Object, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id = pdf.new_object_id();

    // Leaf page intentionally left without its own /MediaBox.
    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::Reference(leaf_id)],
        "Count" => 1i64,
        "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
2803
/// Build a one-page PDF whose page declares an explicit `/CropBox`.
///
/// The MediaBox is 612 x 792 and the CropBox is `[36 36 576 756]`, written
/// as real numbers. Test-only fixture.
#[cfg(test)]
fn create_test_pdf_with_crop_box() -> Vec<u8> {
    use lopdf::{Document, Object, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id = pdf.new_object_id();

    let crop_box: Vec<Object> = [36.0, 36.0, 576.0, 756.0]
        .into_iter()
        .map(Object::Real)
        .collect();

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "CropBox" => crop_box,
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::Reference(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
2843
/// Build a one-page PDF whose page dictionary carries `/Rotate`.
///
/// `rotation` is written verbatim as the page's `/Rotate` value.
/// Test-only fixture.
#[cfg(test)]
fn create_test_pdf_with_rotate(rotation: i64) -> Vec<u8> {
    use lopdf::{Document, Object, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id = pdf.new_object_id();

    let leaf = dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Rotate" => rotation,
    };
    let leaf_id = pdf.add_object(leaf);

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::Reference(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
2878
/// Build a one-page PDF where `/Rotate` is set only on the `/Pages` node.
///
/// The page dictionary has no `/Rotate` of its own, so `rotation` has to be
/// inherited from the parent. Test-only fixture.
#[cfg(test)]
fn create_test_pdf_inherited_rotate(rotation: i64) -> Vec<u8> {
    use lopdf::{Document, Object, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id = pdf.new_object_id();

    // Leaf page intentionally left without /Rotate.
    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::Reference(leaf_id)],
        "Count" => 1i64,
        "Rotate" => rotation,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
2914
/// Build a one-page PDF whose only text lives inside a Form XObject.
///
/// - Page content stream: `q /FM1 Do Q`
/// - Form XObject `FM1` content stream: `BT /F1 12 Tf 72 700 Td (Hello) Tj ET`
///
/// Test-only fixture.
#[cfg(test)]
fn create_test_pdf_with_form_xobject() -> Vec<u8> {
    use lopdf::{Document, Object, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id = pdf.new_object_id();

    // Bare-bones Type1 font entry shared by the form and the page.
    let helvetica_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });

    // The form carries the text-showing operators plus its own resources.
    let form_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Resources" => dictionary! {
            "Font" => dictionary! {
                "F1" => helvetica_id,
            },
        },
    };
    let form_id = pdf.add_object(Stream::new(
        form_dict,
        b"BT /F1 12 Tf 72 700 Td (Hello) Tj ET".to_vec(),
    ));

    // The page itself only paints the form.
    let content_id = pdf.add_object(Stream::new(
        lopdf::Dictionary::new(),
        b"q /FM1 Do Q".to_vec(),
    ));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => content_id,
        "Resources" => dictionary! {
            "Font" => dictionary! {
                "F1" => helvetica_id,
            },
            "XObject" => dictionary! {
                "FM1" => form_id,
            },
        },
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::Reference(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
2989
/// Build a one-page PDF with Form XObjects nested two levels deep.
///
/// - Page content stream: `q /FM1 Do Q`
/// - `FM1` content stream: `q /FM2 Do Q` (paints `FM2`)
/// - `FM2` content stream: `BT /F1 10 Tf (Deep) Tj ET` (the actual text)
///
/// Test-only fixture.
#[cfg(test)]
fn create_test_pdf_with_nested_form_xobjects() -> Vec<u8> {
    use lopdf::{Document, Object, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id = pdf.new_object_id();

    let helvetica_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });

    // Innermost form (FM2): the one that actually shows text.
    let inner_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Resources" => dictionary! {
            "Font" => dictionary! {
                "F1" => helvetica_id,
            },
        },
    };
    let inner_id = pdf.add_object(Stream::new(
        inner_dict,
        b"BT /F1 10 Tf (Deep) Tj ET".to_vec(),
    ));

    // Outer form (FM1): does nothing but paint FM2.
    let outer_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Resources" => dictionary! {
            "XObject" => dictionary! {
                "FM2" => inner_id,
            },
            "Font" => dictionary! {
                "F1" => helvetica_id,
            },
        },
    };
    let outer_id = pdf.add_object(Stream::new(outer_dict, b"q /FM2 Do Q".to_vec()));

    // The page paints FM1.
    let content_id = pdf.add_object(Stream::new(
        lopdf::Dictionary::new(),
        b"q /FM1 Do Q".to_vec(),
    ));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => content_id,
        "Resources" => dictionary! {
            "XObject" => dictionary! {
                "FM1" => outer_id,
            },
            "Font" => dictionary! {
                "F1" => helvetica_id,
            },
        },
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::Reference(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
3084
/// Build a one-page PDF whose Form XObject declares a `/Matrix`.
///
/// The form's matrix is `[2 0 0 2 10 20]` (scale by 2, then translate by
/// 10, 20), its content stream is `BT /F1 12 Tf (A) Tj ET`, and the page
/// paints it with `q /FM1 Do Q`. Test-only fixture.
#[cfg(test)]
fn create_test_pdf_form_xobject_with_matrix() -> Vec<u8> {
    use lopdf::{Document, Object, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id = pdf.new_object_id();

    let helvetica_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });

    // Six matrix coefficients, all stored as real numbers.
    let matrix: Vec<Object> = [2.0, 0.0, 0.0, 2.0, 10.0, 20.0]
        .into_iter()
        .map(Object::Real)
        .collect();

    let form_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Matrix" => matrix,
        "Resources" => dictionary! {
            "Font" => dictionary! {
                "F1" => helvetica_id,
            },
        },
    };
    let form_id = pdf.add_object(Stream::new(
        form_dict,
        b"BT /F1 12 Tf (A) Tj ET".to_vec(),
    ));

    let content_id = pdf.add_object(Stream::new(
        lopdf::Dictionary::new(),
        b"q /FM1 Do Q".to_vec(),
    ));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => content_id,
        "Resources" => dictionary! {
            "XObject" => dictionary! {
                "FM1" => form_id,
            },
            "Font" => dictionary! {
                "F1" => helvetica_id,
            },
        },
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::Reference(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
3160
/// Build a one-page PDF that paints an Image XObject (not a Form).
///
/// The image is declared as 2 x 2, DeviceRGB, 8 bits per component, with 12
/// bytes of raw sample data. The page content stream is
/// `q 200 0 0 150 100 300 cm /Im0 Do Q`. Test-only fixture.
#[cfg(test)]
fn create_test_pdf_with_image_xobject() -> Vec<u8> {
    use lopdf::{Document, Object, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id = pdf.new_object_id();

    // Four RGB triples, one per line.
    let samples: Vec<u8> = vec![
        255, 0, 0, //
        0, 255, 0, //
        0, 0, 255, //
        255, 255, 0, //
    ];
    let image_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Image",
        "Width" => 2i64,
        "Height" => 2i64,
        "ColorSpace" => "DeviceRGB",
        "BitsPerComponent" => 8i64,
    };
    let image_id = pdf.add_object(Stream::new(image_dict, samples));

    // Set the CTM, then paint the image.
    let content_id = pdf.add_object(Stream::new(
        lopdf::Dictionary::new(),
        b"q 200 0 0 150 100 300 cm /Im0 Do Q".to_vec(),
    ));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => content_id,
        "Resources" => dictionary! {
            "XObject" => dictionary! {
                "Im0" => image_id,
            },
        },
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::Reference(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
3220
/// Build a one-page PDF that paints a `DCTDecode` (JPEG) Image XObject.
///
/// The stream payload is a tiny hand-written byte sequence: SOI, a JFIF
/// APP0 segment, then EOI. The page content stream is
/// `q 200 0 0 150 100 300 cm /Im0 Do Q`. Test-only fixture.
#[cfg(test)]
fn create_test_pdf_with_jpeg_image() -> Vec<u8> {
    use lopdf::{Document, Object, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id = pdf.new_object_id();

    // Marker-only JPEG skeleton: begins with FF D8 and ends with FF D9.
    let jpeg_bytes: Vec<u8> = vec![
        0xFF, 0xD8, // SOI
        0xFF, 0xE0, // APP0 marker
        0x00, 0x10, // APP0 length
        0x4A, 0x46, 0x49, 0x46, 0x00, // "JFIF\0"
        0x01, 0x01, // version
        0x00, // units
        0x00, 0x01, 0x00, 0x01, // X/Y density
        0x00, 0x00, // no thumbnail
        0xFF, 0xD9, // EOI
    ];

    let image_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Image",
        "Width" => 2i64,
        "Height" => 2i64,
        "ColorSpace" => "DeviceRGB",
        "BitsPerComponent" => 8i64,
        "Filter" => "DCTDecode",
    };
    let image_id = pdf.add_object(Stream::new(image_dict, jpeg_bytes));

    let content_id = pdf.add_object(Stream::new(
        lopdf::Dictionary::new(),
        b"q 200 0 0 150 100 300 cm /Im0 Do Q".to_vec(),
    ));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => content_id,
        "Resources" => dictionary! {
            "XObject" => dictionary! {
                "Im0" => image_id,
            },
        },
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::Reference(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
3291
/// Build a one-page PDF whose text sits directly in the page content stream.
///
/// No XObjects are involved; the content stream is
/// `BT /F1 12 Tf 72 700 Td (Hi) Tj ET`. Test-only fixture.
#[cfg(test)]
fn create_test_pdf_with_text_content() -> Vec<u8> {
    use lopdf::{Document, Object, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id = pdf.new_object_id();

    let helvetica_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });

    let content_id = pdf.add_object(Stream::new(
        lopdf::Dictionary::new(),
        b"BT /F1 12 Tf 72 700 Td (Hi) Tj ET".to_vec(),
    ));

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => content_id,
        "Resources" => dictionary! {
            "Font" => dictionary! {
                "F1" => helvetica_id,
            },
        },
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::Reference(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
3341
/// Build a one-page PDF carrying an `/Info` dictionary.
///
/// Each argument that is `Some` is written as a literal string under the
/// matching `/Info` key (`Title`, `Author`, `Subject`, `Keywords`,
/// `Creator`, `Producer`, `CreationDate`, `ModDate`); `None` arguments are
/// left out. The `/Info` dictionary is always added and referenced from the
/// trailer, even when it ends up empty. Test-only fixture.
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
fn create_test_pdf_with_metadata(
    title: Option<&str>,
    author: Option<&str>,
    subject: Option<&str>,
    keywords: Option<&str>,
    creator: Option<&str>,
    producer: Option<&str>,
    creation_date: Option<&str>,
    mod_date: Option<&str>,
) -> Vec<u8> {
    use lopdf::{Document, Object, dictionary};

    let mut pdf = Document::with_version("1.5");
    let tree_id = pdf.new_object_id();

    let leaf_id = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => tree_id,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
    });

    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::Reference(leaf_id)],
        "Count" => 1i64,
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    // Key/value table in the order the entries are written to /Info.
    let entries = [
        ("Title", title),
        ("Author", author),
        ("Subject", subject),
        ("Keywords", keywords),
        ("Creator", creator),
        ("Producer", producer),
        ("CreationDate", creation_date),
        ("ModDate", mod_date),
    ];

    let mut info = lopdf::Dictionary::new();
    for (key, value) in entries {
        if let Some(text) = value {
            info.set(key, Object::string_literal(text));
        }
    }

    let info_id = pdf.add_object(info);
    pdf.trailer.set("Info", Object::Reference(info_id));

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
3415
3416#[cfg(test)]
3417mod tests {
3418    use super::*;
3419    use crate::handler::{CharEvent, ContentHandler, ImageEvent};
3420    use pdfplumber_core::PdfError;
3421
3422    // --- CollectingHandler for interpret_page tests ---
3423
3424    struct CollectingHandler {
3425        chars: Vec<CharEvent>,
3426        images: Vec<ImageEvent>,
3427    }
3428
3429    impl CollectingHandler {
3430        fn new() -> Self {
3431            Self {
3432                chars: Vec::new(),
3433                images: Vec::new(),
3434            }
3435        }
3436    }
3437
3438    impl ContentHandler for CollectingHandler {
3439        fn on_char(&mut self, event: CharEvent) {
3440            self.chars.push(event);
3441        }
3442        fn on_image(&mut self, event: ImageEvent) {
3443            self.images.push(event);
3444        }
3445    }
3446
3447    // --- open() tests ---
3448
3449    #[test]
3450    fn open_valid_single_page_pdf() {
3451        let pdf_bytes = create_test_pdf(1);
3452        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3453        assert_eq!(LopdfBackend::page_count(&doc), 1);
3454    }
3455
3456    #[test]
3457    fn open_valid_multi_page_pdf() {
3458        let pdf_bytes = create_test_pdf(5);
3459        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3460        assert_eq!(LopdfBackend::page_count(&doc), 5);
3461    }
3462
3463    #[test]
3464    fn open_invalid_bytes_returns_error() {
3465        let result = LopdfBackend::open(b"not a pdf");
3466        assert!(result.is_err());
3467    }
3468
3469    #[test]
3470    fn open_empty_bytes_returns_error() {
3471        let result = LopdfBackend::open(&[]);
3472        assert!(result.is_err());
3473    }
3474
3475    #[test]
3476    fn open_error_converts_to_pdf_error() {
3477        let err = LopdfBackend::open(b"garbage").unwrap_err();
3478        let pdf_err: PdfError = err.into();
3479        assert!(matches!(pdf_err, PdfError::ParseError(_)));
3480    }
3481
3482    // --- page_count() tests ---
3483
3484    #[test]
3485    fn page_count_zero_pages() {
3486        let pdf_bytes = create_test_pdf(0);
3487        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3488        assert_eq!(LopdfBackend::page_count(&doc), 0);
3489    }
3490
3491    #[test]
3492    fn page_count_three_pages() {
3493        let pdf_bytes = create_test_pdf(3);
3494        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3495        assert_eq!(LopdfBackend::page_count(&doc), 3);
3496    }
3497
3498    // --- get_page() tests ---
3499
3500    #[test]
3501    fn get_page_first_page() {
3502        let pdf_bytes = create_test_pdf(3);
3503        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3504        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3505        assert_eq!(page.index, 0);
3506    }
3507
3508    #[test]
3509    fn get_page_last_page() {
3510        let pdf_bytes = create_test_pdf(3);
3511        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3512        let page = LopdfBackend::get_page(&doc, 2).unwrap();
3513        assert_eq!(page.index, 2);
3514    }
3515
3516    #[test]
3517    fn get_page_out_of_bounds() {
3518        let pdf_bytes = create_test_pdf(2);
3519        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3520        let result = LopdfBackend::get_page(&doc, 2);
3521        assert!(result.is_err());
3522    }
3523
3524    #[test]
3525    fn get_page_out_of_bounds_error_converts_to_pdf_error() {
3526        let pdf_bytes = create_test_pdf(1);
3527        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3528        let err = LopdfBackend::get_page(&doc, 5).unwrap_err();
3529        let pdf_err: PdfError = err.into();
3530        assert!(matches!(pdf_err, PdfError::ParseError(_)));
3531        assert!(pdf_err.to_string().contains("out of range"));
3532    }
3533
3534    #[test]
3535    fn get_page_on_empty_document() {
3536        let pdf_bytes = create_test_pdf(0);
3537        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3538        let result = LopdfBackend::get_page(&doc, 0);
3539        assert!(result.is_err());
3540    }
3541
3542    // --- Page object IDs are distinct ---
3543
3544    #[test]
3545    fn pages_have_distinct_object_ids() {
3546        let pdf_bytes = create_test_pdf(3);
3547        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3548        let page0 = LopdfBackend::get_page(&doc, 0).unwrap();
3549        let page1 = LopdfBackend::get_page(&doc, 1).unwrap();
3550        let page2 = LopdfBackend::get_page(&doc, 2).unwrap();
3551        assert_ne!(page0.object_id, page1.object_id);
3552        assert_ne!(page1.object_id, page2.object_id);
3553        assert_ne!(page0.object_id, page2.object_id);
3554    }
3555
3556    // --- Integration: open + page_count + get_page round-trip ---
3557
3558    #[test]
3559    fn round_trip_open_count_access() {
3560        let pdf_bytes = create_test_pdf(4);
3561        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3562        let count = LopdfBackend::page_count(&doc);
3563        assert_eq!(count, 4);
3564
3565        for i in 0..count {
3566            let page = LopdfBackend::get_page(&doc, i).unwrap();
3567            assert_eq!(page.index, i);
3568        }
3569
3570        // One past the end should fail
3571        assert!(LopdfBackend::get_page(&doc, count).is_err());
3572    }
3573
3574    // --- page_media_box() tests ---
3575
3576    #[test]
3577    fn media_box_explicit_us_letter() {
3578        let pdf_bytes = create_test_pdf(1);
3579        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3580        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3581        let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
3582        assert_eq!(media_box, BBox::new(0.0, 0.0, 612.0, 792.0));
3583    }
3584
3585    #[test]
3586    fn media_box_inherited_from_parent() {
3587        let pdf_bytes = create_test_pdf_inherited_media_box();
3588        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3589        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3590        let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
3591        // Inherited A4 size from parent Pages node
3592        assert_eq!(media_box, BBox::new(0.0, 0.0, 595.0, 842.0));
3593    }
3594
3595    #[test]
3596    fn media_box_width_height() {
3597        let pdf_bytes = create_test_pdf(1);
3598        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3599        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3600        let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
3601        assert_eq!(media_box.width(), 612.0);
3602        assert_eq!(media_box.height(), 792.0);
3603    }
3604
3605    // --- page_crop_box() tests ---
3606
3607    #[test]
3608    fn crop_box_present() {
3609        let pdf_bytes = create_test_pdf_with_crop_box();
3610        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3611        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3612        let crop_box = LopdfBackend::page_crop_box(&doc, &page).unwrap();
3613        assert_eq!(crop_box, Some(BBox::new(36.0, 36.0, 576.0, 756.0)));
3614    }
3615
3616    #[test]
3617    fn crop_box_absent() {
3618        let pdf_bytes = create_test_pdf(1);
3619        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3620        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3621        let crop_box = LopdfBackend::page_crop_box(&doc, &page).unwrap();
3622        assert_eq!(crop_box, None);
3623    }
3624
3625    // --- page_rotate() tests ---
3626
3627    #[test]
3628    fn rotate_default_zero() {
3629        let pdf_bytes = create_test_pdf(1);
3630        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3631        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3632        let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
3633        assert_eq!(rotation, 0);
3634    }
3635
3636    #[test]
3637    fn rotate_90() {
3638        let pdf_bytes = create_test_pdf_with_rotate(90);
3639        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3640        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3641        let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
3642        assert_eq!(rotation, 90);
3643    }
3644
3645    #[test]
3646    fn rotate_180() {
3647        let pdf_bytes = create_test_pdf_with_rotate(180);
3648        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3649        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3650        let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
3651        assert_eq!(rotation, 180);
3652    }
3653
3654    #[test]
3655    fn rotate_270() {
3656        let pdf_bytes = create_test_pdf_with_rotate(270);
3657        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3658        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3659        let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
3660        assert_eq!(rotation, 270);
3661    }
3662
3663    #[test]
3664    fn rotate_inherited_from_parent() {
3665        let pdf_bytes = create_test_pdf_inherited_rotate(90);
3666        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3667        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3668        let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
3669        assert_eq!(rotation, 90);
3670    }
3671
3672    // --- Integration: all page properties together ---
3673
3674    #[test]
3675    fn page_properties_round_trip() {
3676        let pdf_bytes = create_test_pdf_with_crop_box();
3677        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3678        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3679
3680        let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
3681        let crop_box = LopdfBackend::page_crop_box(&doc, &page).unwrap();
3682        let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
3683
3684        assert_eq!(media_box, BBox::new(0.0, 0.0, 612.0, 792.0));
3685        assert!(crop_box.is_some());
3686        assert_eq!(rotation, 0);
3687    }
3688
3689    // --- interpret_page: basic text extraction ---
3690
3691    #[test]
3692    fn interpret_page_simple_text() {
3693        let pdf_bytes = create_test_pdf_with_text_content();
3694        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3695        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3696        let options = ExtractOptions::default();
3697        let mut handler = CollectingHandler::new();
3698
3699        LopdfBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap();
3700
3701        // "Hi" = 2 characters
3702        assert_eq!(handler.chars.len(), 2);
3703        assert_eq!(handler.chars[0].char_code, b'H' as u32);
3704        assert_eq!(handler.chars[1].char_code, b'i' as u32);
3705        assert_eq!(handler.chars[0].font_size, 12.0);
3706        assert_eq!(handler.chars[0].font_name, "Helvetica");
3707    }
3708
3709    #[test]
3710    fn interpret_page_no_content() {
3711        let pdf_bytes = create_test_pdf(1);
3712        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3713        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3714        let options = ExtractOptions::default();
3715        let mut handler = CollectingHandler::new();
3716
3717        // Page with no /Contents should not fail
3718        LopdfBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap();
3719        assert_eq!(handler.chars.len(), 0);
3720    }
3721
3722    // --- interpret_page: Form XObject tests (US-016) ---
3723
3724    #[test]
3725    fn interpret_page_form_xobject_text() {
3726        let pdf_bytes = create_test_pdf_with_form_xobject();
3727        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3728        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3729        let options = ExtractOptions::default();
3730        let mut handler = CollectingHandler::new();
3731
3732        LopdfBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap();
3733
3734        // Form XObject contains "Hello" = 5 chars
3735        assert_eq!(handler.chars.len(), 5);
3736        assert_eq!(handler.chars[0].char_code, b'H' as u32);
3737        assert_eq!(handler.chars[1].char_code, b'e' as u32);
3738        assert_eq!(handler.chars[2].char_code, b'l' as u32);
3739        assert_eq!(handler.chars[3].char_code, b'l' as u32);
3740        assert_eq!(handler.chars[4].char_code, b'o' as u32);
3741        assert_eq!(handler.chars[0].font_name, "Helvetica");
3742        assert_eq!(handler.chars[0].font_size, 12.0);
3743    }
3744
3745    #[test]
3746    fn interpret_page_nested_form_xobjects() {
3747        let pdf_bytes = create_test_pdf_with_nested_form_xobjects();
3748        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3749        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3750        let options = ExtractOptions::default();
3751        let mut handler = CollectingHandler::new();
3752
3753        LopdfBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap();
3754
3755        // Nested form XObject FM1→FM2 contains "Deep" = 4 chars
3756        assert_eq!(handler.chars.len(), 4);
3757        assert_eq!(handler.chars[0].char_code, b'D' as u32);
3758        assert_eq!(handler.chars[1].char_code, b'e' as u32);
3759        assert_eq!(handler.chars[2].char_code, b'e' as u32);
3760        assert_eq!(handler.chars[3].char_code, b'p' as u32);
3761    }
3762
3763    #[test]
3764    fn interpret_page_form_xobject_matrix_applied() {
3765        let pdf_bytes = create_test_pdf_form_xobject_with_matrix();
3766        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3767        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3768        let options = ExtractOptions::default();
3769        let mut handler = CollectingHandler::new();
3770
3771        LopdfBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap();
3772
3773        // Form XObject has /Matrix [2 0 0 2 10 20], character "A"
3774        assert_eq!(handler.chars.len(), 1);
3775        assert_eq!(handler.chars[0].char_code, b'A' as u32);
3776        // CTM should include the form's matrix transform
3777        let ctm = handler.chars[0].ctm;
3778        // Form matrix [2 0 0 2 10 20] applied on top of identity
3779        assert!((ctm[0] - 2.0).abs() < 0.01);
3780        assert!((ctm[3] - 2.0).abs() < 0.01);
3781        assert!((ctm[4] - 10.0).abs() < 0.01);
3782        assert!((ctm[5] - 20.0).abs() < 0.01);
3783    }
3784
3785    #[test]
3786    fn interpret_page_form_xobject_state_restored() {
3787        // After processing a Form XObject, the graphics state should be restored.
3788        // The Form XObject is wrapped in q/Q on the page, and the interpreter
3789        // also saves/restores state around the Form XObject.
3790        let pdf_bytes = create_test_pdf_with_form_xobject();
3791        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3792        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3793        let options = ExtractOptions::default();
3794        let mut handler = CollectingHandler::new();
3795
3796        // This should complete without errors (state properly saved/restored)
3797        let result = LopdfBackend::interpret_page(&doc, &page, &mut handler, &options);
3798        assert!(result.is_ok());
3799    }
3800
3801    #[test]
3802    fn interpret_page_image_xobject() {
3803        let pdf_bytes = create_test_pdf_with_image_xobject();
3804        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3805        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3806        let options = ExtractOptions::default();
3807        let mut handler = CollectingHandler::new();
3808
3809        LopdfBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap();
3810
3811        // Should have 1 image event, no chars
3812        assert_eq!(handler.chars.len(), 0);
3813        assert_eq!(handler.images.len(), 1);
3814        assert_eq!(handler.images[0].name, "Im0");
3815        assert_eq!(handler.images[0].width, 2);
3816        assert_eq!(handler.images[0].height, 2);
3817        assert_eq!(handler.images[0].colorspace.as_deref(), Some("DeviceRGB"));
3818        assert_eq!(handler.images[0].bits_per_component, Some(8));
3819        // CTM should be [200 0 0 150 100 300] from the cm operator
3820        let ctm = handler.images[0].ctm;
3821        assert!((ctm[0] - 200.0).abs() < 0.01);
3822        assert!((ctm[3] - 150.0).abs() < 0.01);
3823        assert!((ctm[4] - 100.0).abs() < 0.01);
3824        assert!((ctm[5] - 300.0).abs() < 0.01);
3825    }
3826
3827    #[test]
3828    fn interpret_page_recursion_limit() {
3829        // Use the nested form XObject PDF but with max_recursion_depth = 0
3830        let pdf_bytes = create_test_pdf_with_form_xobject();
3831        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3832        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3833        let mut options = ExtractOptions::default();
3834        options.max_recursion_depth = 0; // Page level = 0, Form XObject = 1 > limit
3835        let mut handler = CollectingHandler::new();
3836
3837        let result = LopdfBackend::interpret_page(&doc, &page, &mut handler, &options);
3838        assert!(result.is_err());
3839        let err_msg = result.unwrap_err().to_string();
3840        assert!(err_msg.contains("recursion depth"));
3841    }
3842
3843    // --- document_metadata() tests ---
3844
3845    #[test]
3846    fn metadata_full_info_dictionary() {
3847        let pdf_bytes = create_test_pdf_with_metadata(
3848            Some("Test Document"),
3849            Some("John Doe"),
3850            Some("Testing metadata"),
3851            Some("test, pdf, rust"),
3852            Some("LibreOffice"),
3853            Some("pdfplumber-rs"),
3854            Some("D:20240101120000+00'00'"),
3855            Some("D:20240615153000+00'00'"),
3856        );
3857        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3858        let meta = LopdfBackend::document_metadata(&doc).unwrap();
3859
3860        assert_eq!(meta.title.as_deref(), Some("Test Document"));
3861        assert_eq!(meta.author.as_deref(), Some("John Doe"));
3862        assert_eq!(meta.subject.as_deref(), Some("Testing metadata"));
3863        assert_eq!(meta.keywords.as_deref(), Some("test, pdf, rust"));
3864        assert_eq!(meta.creator.as_deref(), Some("LibreOffice"));
3865        assert_eq!(meta.producer.as_deref(), Some("pdfplumber-rs"));
3866        assert_eq!(
3867            meta.creation_date.as_deref(),
3868            Some("D:20240101120000+00'00'")
3869        );
3870        assert_eq!(meta.mod_date.as_deref(), Some("D:20240615153000+00'00'"));
3871        assert!(!meta.is_empty());
3872    }
3873
3874    #[test]
3875    fn metadata_partial_info_dictionary() {
3876        let pdf_bytes = create_test_pdf_with_metadata(
3877            Some("Only Title"),
3878            None,
3879            None,
3880            None,
3881            None,
3882            Some("A Producer"),
3883            None,
3884            None,
3885        );
3886        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3887        let meta = LopdfBackend::document_metadata(&doc).unwrap();
3888
3889        assert_eq!(meta.title.as_deref(), Some("Only Title"));
3890        assert_eq!(meta.author, None);
3891        assert_eq!(meta.subject, None);
3892        assert_eq!(meta.keywords, None);
3893        assert_eq!(meta.creator, None);
3894        assert_eq!(meta.producer.as_deref(), Some("A Producer"));
3895        assert_eq!(meta.creation_date, None);
3896        assert_eq!(meta.mod_date, None);
3897        assert!(!meta.is_empty());
3898    }
3899
3900    #[test]
3901    fn metadata_no_info_dictionary() {
3902        // create_test_pdf doesn't add an /Info dictionary
3903        let pdf_bytes = create_test_pdf(1);
3904        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3905        let meta = LopdfBackend::document_metadata(&doc).unwrap();
3906
3907        assert!(meta.is_empty());
3908        assert_eq!(meta.title, None);
3909        assert_eq!(meta.author, None);
3910    }
3911
3912    // --- extract_image_content() tests ---
3913
3914    #[test]
3915    fn extract_image_content_raw_data() {
3916        let pdf_bytes = create_test_pdf_with_image_xobject();
3917        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3918        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3919
3920        let content = LopdfBackend::extract_image_content(&doc, &page, "Im0").unwrap();
3921
3922        assert_eq!(content.format, pdfplumber_core::ImageFormat::Raw);
3923        assert_eq!(content.width, 2);
3924        assert_eq!(content.height, 2);
3925        // 2x2 RGB image = 12 bytes
3926        assert_eq!(content.data.len(), 12);
3927        assert_eq!(
3928            content.data,
3929            vec![255, 0, 0, 0, 255, 0, 0, 0, 255, 255, 255, 0]
3930        );
3931    }
3932
3933    #[test]
3934    fn extract_image_content_not_found() {
3935        let pdf_bytes = create_test_pdf_with_image_xobject();
3936        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3937        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3938
3939        let result = LopdfBackend::extract_image_content(&doc, &page, "NonExistent");
3940        assert!(result.is_err());
3941        let err_msg = result.unwrap_err().to_string();
3942        assert!(err_msg.contains("not found"));
3943    }
3944
3945    #[test]
3946    fn extract_image_content_jpeg() {
3947        // Create a PDF with a JPEG (DCTDecode) image
3948        let pdf_bytes = create_test_pdf_with_jpeg_image();
3949        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3950        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3951
3952        let content = LopdfBackend::extract_image_content(&doc, &page, "Im0").unwrap();
3953
3954        assert_eq!(content.format, pdfplumber_core::ImageFormat::Jpeg);
3955        assert_eq!(content.width, 2);
3956        assert_eq!(content.height, 2);
3957        // JPEG data should be returned as-is
3958        assert!(content.data.starts_with(&[0xFF, 0xD8]));
3959    }
3960
3961    #[test]
3962    fn extract_image_content_no_xobject_resources() {
3963        // A page without XObject resources
3964        let pdf_bytes = create_test_pdf_with_text_content();
3965        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
3966        let page = LopdfBackend::get_page(&doc, 0).unwrap();
3967
3968        let result = LopdfBackend::extract_image_content(&doc, &page, "Im0");
3969        assert!(result.is_err());
3970    }
3971
3972    // --- Encrypted PDF test helpers ---
3973
3974    /// PDF standard padding bytes used in encryption key derivation.
3975    const PAD_BYTES: [u8; 32] = [
3976        0x28, 0xBF, 0x4E, 0x5E, 0x4E, 0x75, 0x8A, 0x41, 0x64, 0x00, 0x4E, 0x56, 0xFF, 0xFA, 0x01,
3977        0x08, 0x2E, 0x2E, 0x00, 0xB6, 0xD0, 0x68, 0x3E, 0x80, 0x2F, 0x0C, 0xA9, 0xFE, 0x64, 0x53,
3978        0x69, 0x7A,
3979    ];
3980
3981    /// Simple RC4 implementation for test encryption.
3982    fn rc4_transform(key: &[u8], data: &[u8]) -> Vec<u8> {
3983        // RC4 KSA
3984        let mut s: Vec<u8> = (0..=255).collect();
3985        let mut j: usize = 0;
3986        for i in 0..256 {
3987            j = (j + s[i] as usize + key[i % key.len()] as usize) & 0xFF;
3988            s.swap(i, j);
3989        }
3990        // RC4 PRGA
3991        let mut out = Vec::with_capacity(data.len());
3992        let mut i: usize = 0;
3993        j = 0;
3994        for &byte in data {
3995            i = (i + 1) & 0xFF;
3996            j = (j + s[i] as usize) & 0xFF;
3997            s.swap(i, j);
3998            let k = s[(s[i] as usize + s[j] as usize) & 0xFF];
3999            out.push(byte ^ k);
4000        }
4001        out
4002    }
4003
4004    /// Create an encrypted PDF with the given user password (RC4, 40-bit, V=1, R=2).
4005    fn create_encrypted_test_pdf(user_password: &[u8]) -> Vec<u8> {
4006        use lopdf::{Document, Object, ObjectId, Stream, StringFormat, dictionary};
4007
4008        let file_id = b"testfileid123456"; // 16 bytes
4009        let permissions: i32 = -4; // all permissions
4010
4011        // Pad password to 32 bytes
4012        let mut padded_pw = Vec::with_capacity(32);
4013        let pw_len = user_password.len().min(32);
4014        padded_pw.extend_from_slice(&user_password[..pw_len]);
4015        padded_pw.extend_from_slice(&PAD_BYTES[..32 - pw_len]);
4016
4017        // Algorithm 3.3: Compute /O value (owner password hash)
4018        // Using same password for owner and user (simplification for tests)
4019        let o_key_digest = md5::compute(&padded_pw);
4020        let o_key = &o_key_digest[..5]; // 40-bit key = 5 bytes
4021        let o_value = rc4_transform(o_key, &padded_pw);
4022
4023        // Algorithm 3.2: Compute encryption key
4024        let mut key_input = Vec::with_capacity(128);
4025        key_input.extend_from_slice(&padded_pw);
4026        key_input.extend_from_slice(&o_value);
4027        key_input.extend_from_slice(&(permissions as u32).to_le_bytes());
4028        key_input.extend_from_slice(file_id);
4029        let key_digest = md5::compute(&key_input);
4030        let enc_key = key_digest[..5].to_vec(); // 40-bit key
4031
4032        // Algorithm 3.4: Compute /U value (R=2)
4033        let u_value = rc4_transform(&enc_key, &PAD_BYTES);
4034
4035        // Build the PDF document
4036        let mut doc = Document::with_version("1.5");
4037        let pages_id: ObjectId = doc.new_object_id();
4038
4039        // Create page with text content (will be encrypted)
4040        let content_bytes = b"BT /F1 12 Tf 72 720 Td (Hello World) Tj ET";
4041        let stream = Stream::new(dictionary! {}, content_bytes.to_vec());
4042        let content_id = doc.add_object(Object::Stream(stream));
4043
4044        let font_id = doc.add_object(dictionary! {
4045            "Type" => "Font",
4046            "Subtype" => "Type1",
4047            "BaseFont" => "Helvetica",
4048        });
4049
4050        let page_id = doc.add_object(dictionary! {
4051            "Type" => "Page",
4052            "Parent" => pages_id,
4053            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
4054            "Contents" => Object::Reference(content_id),
4055            "Resources" => dictionary! {
4056                "Font" => dictionary! {
4057                    "F1" => Object::Reference(font_id),
4058                },
4059            },
4060        });
4061
4062        doc.objects.insert(
4063            pages_id,
4064            Object::Dictionary(dictionary! {
4065                "Type" => "Pages",
4066                "Kids" => vec![Object::Reference(page_id)],
4067                "Count" => 1_i64,
4068            }),
4069        );
4070
4071        let catalog_id = doc.add_object(dictionary! {
4072            "Type" => "Catalog",
4073            "Pages" => pages_id,
4074        });
4075        doc.trailer.set("Root", catalog_id);
4076
4077        // Now encrypt all string/stream objects
4078        for (&obj_id, obj) in doc.objects.iter_mut() {
4079            // Compute per-object key: MD5(enc_key + obj_num_le + gen_num_le)[:key_len+5]
4080            let mut obj_key_input = Vec::with_capacity(10);
4081            obj_key_input.extend_from_slice(&enc_key);
4082            obj_key_input.extend_from_slice(&obj_id.0.to_le_bytes()[..3]);
4083            obj_key_input.extend_from_slice(&obj_id.1.to_le_bytes()[..2]);
4084            let obj_key_digest = md5::compute(&obj_key_input);
4085            let obj_key_len = (enc_key.len() + 5).min(16);
4086            let obj_key = &obj_key_digest[..obj_key_len];
4087
4088            match obj {
4089                Object::Stream(stream) => {
4090                    let encrypted = rc4_transform(obj_key, &stream.content);
4091                    stream.set_content(encrypted);
4092                }
4093                Object::String(content, _) => {
4094                    let encrypted = rc4_transform(obj_key, content);
4095                    *content = encrypted;
4096                }
4097                _ => {}
4098            }
4099        }
4100
4101        // Add /Encrypt dictionary
4102        let encrypt_id = doc.add_object(dictionary! {
4103            "Filter" => "Standard",
4104            "V" => 1_i64,
4105            "R" => 2_i64,
4106            "Length" => 40_i64,
4107            "O" => Object::String(o_value, StringFormat::Literal),
4108            "U" => Object::String(u_value, StringFormat::Literal),
4109            "P" => permissions as i64,
4110        });
4111        doc.trailer.set("Encrypt", Object::Reference(encrypt_id));
4112
4113        // Add /ID array
4114        doc.trailer.set(
4115            "ID",
4116            Object::Array(vec![
4117                Object::String(file_id.to_vec(), StringFormat::Literal),
4118                Object::String(file_id.to_vec(), StringFormat::Literal),
4119            ]),
4120        );
4121
4122        let mut buf = Vec::new();
4123        doc.save_to(&mut buf)
4124            .expect("failed to save encrypted test PDF");
4125        buf
4126    }
4127
4128    // --- Encrypted PDF tests ---
4129
4130    #[test]
4131    fn open_encrypted_pdf_without_password_returns_password_required() {
4132        let pdf_bytes = create_encrypted_test_pdf(b"secret123");
4133        let result = LopdfBackend::open(&pdf_bytes);
4134        assert!(result.is_err());
4135        let err: pdfplumber_core::PdfError = result.unwrap_err().into();
4136        assert_eq!(err, pdfplumber_core::PdfError::PasswordRequired);
4137    }
4138
4139    #[test]
4140    fn open_encrypted_pdf_with_correct_password() {
4141        let password = b"secret123";
4142        let pdf_bytes = create_encrypted_test_pdf(password);
4143        let result = LopdfBackend::open_with_password(&pdf_bytes, password);
4144        assert!(result.is_ok());
4145        let doc = result.unwrap();
4146        assert_eq!(LopdfBackend::page_count(&doc), 1);
4147    }
4148
4149    #[test]
4150    fn open_encrypted_pdf_with_wrong_password_returns_invalid_password() {
4151        let pdf_bytes = create_encrypted_test_pdf(b"secret123");
4152        let result = LopdfBackend::open_with_password(&pdf_bytes, b"wrongpassword");
4153        assert!(result.is_err());
4154        let err: pdfplumber_core::PdfError = result.unwrap_err().into();
4155        assert_eq!(err, pdfplumber_core::PdfError::InvalidPassword);
4156    }
4157
4158    #[test]
4159    fn open_unencrypted_pdf_with_password_succeeds() {
4160        // Password is ignored for unencrypted PDFs
4161        let pdf_bytes = create_test_pdf(1);
4162        let result = LopdfBackend::open_with_password(&pdf_bytes, b"anypassword");
4163        assert!(result.is_ok());
4164        let doc = result.unwrap();
4165        assert_eq!(LopdfBackend::page_count(&doc), 1);
4166    }
4167
4168    #[test]
4169    fn open_encrypted_pdf_with_empty_password() {
4170        // Encrypted with empty password — should be openable with empty password
4171        let pdf_bytes = create_encrypted_test_pdf(b"");
4172        let result = LopdfBackend::open_with_password(&pdf_bytes, b"");
4173        assert!(result.is_ok());
4174        let doc = result.unwrap();
4175        assert_eq!(LopdfBackend::page_count(&doc), 1);
4176    }
4177
4178    // --- Form field extraction tests ---
4179
4180    /// Create a PDF with form fields for testing AcroForm extraction.
4181    fn create_test_pdf_with_form_fields() -> Vec<u8> {
4182        use lopdf::{Document, Object, ObjectId, dictionary};
4183
4184        let mut doc = Document::with_version("1.7");
4185        let pages_id: ObjectId = doc.new_object_id();
4186
4187        // Create a page
4188        let page_id = doc.add_object(dictionary! {
4189            "Type" => "Page",
4190            "Parent" => pages_id,
4191            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
4192        });
4193
4194        doc.objects.insert(
4195            pages_id,
4196            Object::Dictionary(dictionary! {
4197                "Type" => "Pages",
4198                "Kids" => vec![Object::Reference(page_id)],
4199                "Count" => Object::Integer(1),
4200            }),
4201        );
4202
4203        // Text field
4204        let text_field_id = doc.add_object(dictionary! {
4205            "Type" => "Annot",
4206            "Subtype" => "Widget",
4207            "T" => Object::string_literal("name"),
4208            "FT" => "Tx",
4209            "V" => Object::string_literal("John Doe"),
4210            "DV" => Object::string_literal(""),
4211            "Rect" => vec![50.into(), 700.into(), 200.into(), 720.into()],
4212            "Ff" => Object::Integer(0),
4213            "P" => Object::Reference(page_id),
4214        });
4215
4216        // Checkbox field (Button)
4217        let checkbox_field_id = doc.add_object(dictionary! {
4218            "Type" => "Annot",
4219            "Subtype" => "Widget",
4220            "T" => Object::string_literal("agree"),
4221            "FT" => "Btn",
4222            "V" => "Yes",
4223            "DV" => "Off",
4224            "Rect" => vec![50.into(), 650.into(), 70.into(), 670.into()],
4225            "Ff" => Object::Integer(0),
4226            "P" => Object::Reference(page_id),
4227        });
4228
4229        // Radio button field (Button with flags)
4230        let radio_field_id = doc.add_object(dictionary! {
4231            "Type" => "Annot",
4232            "Subtype" => "Widget",
4233            "T" => Object::string_literal("gender"),
4234            "FT" => "Btn",
4235            "V" => "Male",
4236            "Rect" => vec![50.into(), 600.into(), 70.into(), 620.into()],
4237            "Ff" => Object::Integer(49152), // Radio flag (bit 15) + NoToggleToOff (bit 14)
4238            "P" => Object::Reference(page_id),
4239        });
4240
4241        // Dropdown field (Choice)
4242        let dropdown_field_id = doc.add_object(dictionary! {
4243            "Type" => "Annot",
4244            "Subtype" => "Widget",
4245            "T" => Object::string_literal("country"),
4246            "FT" => "Ch",
4247            "V" => Object::string_literal("US"),
4248            "Rect" => vec![50.into(), 550.into(), 200.into(), 570.into()],
4249            "Opt" => vec![
4250                Object::string_literal("US"),
4251                Object::string_literal("UK"),
4252                Object::string_literal("FR"),
4253            ],
4254            "Ff" => Object::Integer(0),
4255            "P" => Object::Reference(page_id),
4256        });
4257
4258        // Field with no value
4259        let empty_field_id = doc.add_object(dictionary! {
4260            "Type" => "Annot",
4261            "Subtype" => "Widget",
4262            "T" => Object::string_literal("email"),
4263            "FT" => "Tx",
4264            "Rect" => vec![50.into(), 500.into(), 200.into(), 520.into()],
4265            "Ff" => Object::Integer(0),
4266            "P" => Object::Reference(page_id),
4267        });
4268
4269        // AcroForm dictionary
4270        let acroform_id = doc.add_object(dictionary! {
4271            "Fields" => vec![
4272                Object::Reference(text_field_id),
4273                Object::Reference(checkbox_field_id),
4274                Object::Reference(radio_field_id),
4275                Object::Reference(dropdown_field_id),
4276                Object::Reference(empty_field_id),
4277            ],
4278        });
4279
4280        // Catalog
4281        let catalog_id = doc.add_object(dictionary! {
4282            "Type" => "Catalog",
4283            "Pages" => pages_id,
4284            "AcroForm" => Object::Reference(acroform_id),
4285        });
4286        doc.trailer.set("Root", catalog_id);
4287
4288        let mut buf = Vec::new();
4289        doc.save_to(&mut buf).expect("failed to save test PDF");
4290        buf
4291    }
4292
4293    #[test]
4294    fn form_fields_text_field() {
4295        let pdf_bytes = create_test_pdf_with_form_fields();
4296        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4297        let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4298
4299        let text_field = fields.iter().find(|f| f.name == "name").unwrap();
4300        assert_eq!(text_field.field_type, FieldType::Text);
4301        assert_eq!(text_field.value.as_deref(), Some("John Doe"));
4302        assert_eq!(text_field.default_value.as_deref(), Some(""));
4303    }
4304
4305    #[test]
4306    fn form_fields_checkbox() {
4307        let pdf_bytes = create_test_pdf_with_form_fields();
4308        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4309        let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4310
4311        let checkbox = fields.iter().find(|f| f.name == "agree").unwrap();
4312        assert_eq!(checkbox.field_type, FieldType::Button);
4313        assert_eq!(checkbox.value.as_deref(), Some("Yes"));
4314        assert_eq!(checkbox.default_value.as_deref(), Some("Off"));
4315    }
4316
4317    #[test]
4318    fn form_fields_radio_button() {
4319        let pdf_bytes = create_test_pdf_with_form_fields();
4320        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4321        let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4322
4323        let radio = fields.iter().find(|f| f.name == "gender").unwrap();
4324        assert_eq!(radio.field_type, FieldType::Button);
4325        assert_eq!(radio.value.as_deref(), Some("Male"));
4326        assert_eq!(radio.flags, 49152); // Radio flags
4327    }
4328
4329    #[test]
4330    fn form_fields_dropdown_with_options() {
4331        let pdf_bytes = create_test_pdf_with_form_fields();
4332        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4333        let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4334
4335        let dropdown = fields.iter().find(|f| f.name == "country").unwrap();
4336        assert_eq!(dropdown.field_type, FieldType::Choice);
4337        assert_eq!(dropdown.value.as_deref(), Some("US"));
4338        assert_eq!(dropdown.options, vec!["US", "UK", "FR"]);
4339    }
4340
4341    #[test]
4342    fn form_fields_no_value() {
4343        let pdf_bytes = create_test_pdf_with_form_fields();
4344        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4345        let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4346
4347        let empty = fields.iter().find(|f| f.name == "email").unwrap();
4348        assert_eq!(empty.field_type, FieldType::Text);
4349        assert!(empty.value.is_none());
4350        assert!(empty.default_value.is_none());
4351    }
4352
4353    #[test]
4354    fn form_fields_count() {
4355        let pdf_bytes = create_test_pdf_with_form_fields();
4356        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4357        let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4358        assert_eq!(fields.len(), 5);
4359    }
4360
4361    #[test]
4362    fn form_fields_no_acroform_returns_empty() {
4363        let pdf_bytes = create_test_pdf(1);
4364        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4365        let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4366        assert!(fields.is_empty());
4367    }
4368
4369    #[test]
4370    fn form_fields_have_bbox() {
4371        let pdf_bytes = create_test_pdf_with_form_fields();
4372        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4373        let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4374
4375        let text_field = fields.iter().find(|f| f.name == "name").unwrap();
4376        assert!((text_field.bbox.x0 - 50.0).abs() < 0.1);
4377        assert!((text_field.bbox.x1 - 200.0).abs() < 0.1);
4378    }
4379
4380    #[test]
4381    fn form_fields_have_page_index() {
4382        let pdf_bytes = create_test_pdf_with_form_fields();
4383        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4384        let fields = LopdfBackend::document_form_fields(&doc).unwrap();
4385
4386        // All fields reference page 0
4387        for field in &fields {
4388            assert_eq!(field.page_index, Some(0));
4389        }
4390    }
4391
4392    // --- Structure tree tests (US-081) ---
4393
4394    /// Create a test PDF with a structure tree (tagged PDF).
4395    ///
4396    /// Structure: Document -> H1 (MCID 0) -> P (MCID 1)
4397    fn create_test_pdf_with_structure_tree() -> Vec<u8> {
4398        use lopdf::{Document, Object, ObjectId, Stream, dictionary};
4399
4400        let mut doc = Document::with_version("1.7");
4401        let pages_id: ObjectId = doc.new_object_id();
4402
4403        // Content stream with marked content
4404        let content = b"BT /F1 24 Tf /H1 <</MCID 0>> BDC 72 700 Td (Chapter 1) Tj EMC /P <</MCID 1>> BDC /F1 12 Tf 72 670 Td (This is paragraph text.) Tj EMC ET";
4405        let stream = Stream::new(dictionary! {}, content.to_vec());
4406        let content_id = doc.add_object(Object::Stream(stream));
4407
4408        let font_id = doc.add_object(dictionary! {
4409            "Type" => "Font",
4410            "Subtype" => "Type1",
4411            "BaseFont" => "Helvetica",
4412        });
4413
4414        let page_id = doc.add_object(dictionary! {
4415            "Type" => "Page",
4416            "Parent" => pages_id,
4417            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
4418            "Contents" => Object::Reference(content_id),
4419            "Resources" => dictionary! {
4420                "Font" => dictionary! {
4421                    "F1" => Object::Reference(font_id),
4422                },
4423            },
4424        });
4425
4426        doc.objects.insert(
4427            pages_id,
4428            Object::Dictionary(dictionary! {
4429                "Type" => "Pages",
4430                "Kids" => vec![Object::Reference(page_id)],
4431                "Count" => Object::Integer(1),
4432            }),
4433        );
4434
4435        // Structure tree elements
4436        // H1 element with MCID 0
4437        let h1_elem_id = doc.add_object(dictionary! {
4438            "Type" => "StructElem",
4439            "S" => "H1",
4440            "K" => Object::Integer(0),
4441            "Pg" => Object::Reference(page_id),
4442        });
4443
4444        // P element with MCID 1
4445        let p_elem_id = doc.add_object(dictionary! {
4446            "Type" => "StructElem",
4447            "S" => "P",
4448            "K" => Object::Integer(1),
4449            "Pg" => Object::Reference(page_id),
4450            "Lang" => Object::string_literal("en-US"),
4451        });
4452
4453        // Document root element
4454        let doc_elem_id = doc.add_object(dictionary! {
4455            "Type" => "StructElem",
4456            "S" => "Document",
4457            "K" => vec![
4458                Object::Reference(h1_elem_id),
4459                Object::Reference(p_elem_id),
4460            ],
4461        });
4462
4463        // StructTreeRoot
4464        let struct_tree_id = doc.add_object(dictionary! {
4465            "Type" => "StructTreeRoot",
4466            "K" => Object::Reference(doc_elem_id),
4467        });
4468
4469        // Mark document as tagged
4470        let mark_info_id = doc.add_object(dictionary! {
4471            "Marked" => Object::Boolean(true),
4472        });
4473
4474        // Catalog
4475        let catalog_id = doc.add_object(dictionary! {
4476            "Type" => "Catalog",
4477            "Pages" => pages_id,
4478            "StructTreeRoot" => Object::Reference(struct_tree_id),
4479            "MarkInfo" => Object::Reference(mark_info_id),
4480        });
4481        doc.trailer.set("Root", catalog_id);
4482
4483        let mut buf = Vec::new();
4484        doc.save_to(&mut buf)
4485            .expect("failed to save tagged test PDF");
4486        buf
4487    }
4488
4489    /// Create a test PDF with a structure tree containing a table.
4490    fn create_test_pdf_with_table_structure() -> Vec<u8> {
4491        use lopdf::{Document, Object, ObjectId, Stream, dictionary};
4492
4493        let mut doc = Document::with_version("1.7");
4494        let pages_id: ObjectId = doc.new_object_id();
4495
4496        let content = b"BT /F1 12 Tf 72 700 Td (Cell 1) Tj ET";
4497        let stream = Stream::new(dictionary! {}, content.to_vec());
4498        let content_id = doc.add_object(Object::Stream(stream));
4499
4500        let font_id = doc.add_object(dictionary! {
4501            "Type" => "Font",
4502            "Subtype" => "Type1",
4503            "BaseFont" => "Helvetica",
4504        });
4505
4506        let page_id = doc.add_object(dictionary! {
4507            "Type" => "Page",
4508            "Parent" => pages_id,
4509            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
4510            "Contents" => Object::Reference(content_id),
4511            "Resources" => dictionary! {
4512                "Font" => dictionary! {
4513                    "F1" => Object::Reference(font_id),
4514                },
4515            },
4516        });
4517
4518        doc.objects.insert(
4519            pages_id,
4520            Object::Dictionary(dictionary! {
4521                "Type" => "Pages",
4522                "Kids" => vec![Object::Reference(page_id)],
4523                "Count" => Object::Integer(1),
4524            }),
4525        );
4526
4527        // Table structure: Table -> TR -> TD (MCID 0), TD (MCID 1)
4528        let td1_id = doc.add_object(dictionary! {
4529            "Type" => "StructElem",
4530            "S" => "TD",
4531            "K" => Object::Integer(0),
4532            "Pg" => Object::Reference(page_id),
4533        });
4534
4535        let td2_id = doc.add_object(dictionary! {
4536            "Type" => "StructElem",
4537            "S" => "TD",
4538            "K" => Object::Integer(1),
4539            "Pg" => Object::Reference(page_id),
4540        });
4541
4542        let tr_id = doc.add_object(dictionary! {
4543            "Type" => "StructElem",
4544            "S" => "TR",
4545            "K" => vec![Object::Reference(td1_id), Object::Reference(td2_id)],
4546        });
4547
4548        let table_id = doc.add_object(dictionary! {
4549            "Type" => "StructElem",
4550            "S" => "Table",
4551            "K" => Object::Reference(tr_id),
4552            "Pg" => Object::Reference(page_id),
4553        });
4554
4555        let struct_tree_id = doc.add_object(dictionary! {
4556            "Type" => "StructTreeRoot",
4557            "K" => Object::Reference(table_id),
4558        });
4559
4560        let catalog_id = doc.add_object(dictionary! {
4561            "Type" => "Catalog",
4562            "Pages" => pages_id,
4563            "StructTreeRoot" => Object::Reference(struct_tree_id),
4564        });
4565        doc.trailer.set("Root", catalog_id);
4566
4567        let mut buf = Vec::new();
4568        doc.save_to(&mut buf).expect("failed to save test PDF");
4569        buf
4570    }
4571
4572    #[test]
4573    fn structure_tree_tagged_pdf_has_elements() {
4574        let pdf_bytes = create_test_pdf_with_structure_tree();
4575        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4576        let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4577
4578        assert!(!elements.is_empty());
4579    }
4580
4581    #[test]
4582    fn structure_tree_document_root_element() {
4583        let pdf_bytes = create_test_pdf_with_structure_tree();
4584        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4585        let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4586
4587        // Root should be "Document" element
4588        assert_eq!(elements.len(), 1);
4589        assert_eq!(elements[0].element_type, "Document");
4590        assert_eq!(elements[0].children.len(), 2);
4591    }
4592
4593    #[test]
4594    fn structure_tree_heading_element() {
4595        let pdf_bytes = create_test_pdf_with_structure_tree();
4596        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4597        let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4598
4599        let doc_elem = &elements[0];
4600        let h1 = &doc_elem.children[0];
4601        assert_eq!(h1.element_type, "H1");
4602        assert_eq!(h1.mcids, vec![0]);
4603        assert_eq!(h1.page_index, Some(0));
4604    }
4605
4606    #[test]
4607    fn structure_tree_paragraph_element() {
4608        let pdf_bytes = create_test_pdf_with_structure_tree();
4609        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4610        let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4611
4612        let doc_elem = &elements[0];
4613        let p = &doc_elem.children[1];
4614        assert_eq!(p.element_type, "P");
4615        assert_eq!(p.mcids, vec![1]);
4616        assert_eq!(p.page_index, Some(0));
4617        assert_eq!(p.lang.as_deref(), Some("en-US"));
4618    }
4619
4620    #[test]
4621    fn structure_tree_untagged_pdf_returns_empty() {
4622        // Use the basic test PDF helper (no structure tree)
4623        let pdf_bytes = create_test_pdf_with_text_content();
4624        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4625        let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4626
4627        assert!(elements.is_empty());
4628    }
4629
4630    #[test]
4631    fn structure_tree_table_nested_structure() {
4632        let pdf_bytes = create_test_pdf_with_table_structure();
4633        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
4634        let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4635
4636        // Root is Table element
4637        assert_eq!(elements.len(), 1);
4638        let table = &elements[0];
4639        assert_eq!(table.element_type, "Table");
4640
4641        // Table -> TR
4642        assert_eq!(table.children.len(), 1);
4643        let tr = &table.children[0];
4644        assert_eq!(tr.element_type, "TR");
4645
4646        // TR -> TD, TD
4647        assert_eq!(tr.children.len(), 2);
4648        assert_eq!(tr.children[0].element_type, "TD");
4649        assert_eq!(tr.children[0].mcids, vec![0]);
4650        assert_eq!(tr.children[1].element_type, "TD");
4651        assert_eq!(tr.children[1].mcids, vec![1]);
4652    }
4653
4654    #[test]
4655    fn structure_tree_mcr_dictionary_handling() {
4656        // Test with MCR (marked content reference) dictionaries instead of integer MCIDs
4657        use lopdf::{Document, Object, ObjectId, Stream, dictionary};
4658
4659        let mut doc = Document::with_version("1.7");
4660        let pages_id: ObjectId = doc.new_object_id();
4661
4662        let content = b"BT /F1 12 Tf 72 700 Td (text) Tj ET";
4663        let stream = Stream::new(dictionary! {}, content.to_vec());
4664        let content_id = doc.add_object(Object::Stream(stream));
4665
4666        let font_id = doc.add_object(dictionary! {
4667            "Type" => "Font",
4668            "Subtype" => "Type1",
4669            "BaseFont" => "Helvetica",
4670        });
4671
4672        let page_id = doc.add_object(dictionary! {
4673            "Type" => "Page",
4674            "Parent" => pages_id,
4675            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
4676            "Contents" => Object::Reference(content_id),
4677            "Resources" => dictionary! {
4678                "Font" => dictionary! {
4679                    "F1" => Object::Reference(font_id),
4680                },
4681            },
4682        });
4683
4684        doc.objects.insert(
4685            pages_id,
4686            Object::Dictionary(dictionary! {
4687                "Type" => "Pages",
4688                "Kids" => vec![Object::Reference(page_id)],
4689                "Count" => Object::Integer(1),
4690            }),
4691        );
4692
4693        // Structure element with MCR dictionary in /K
4694        let p_elem_id = doc.add_object(dictionary! {
4695            "Type" => "StructElem",
4696            "S" => "P",
4697            "K" => dictionary! {
4698                "Type" => "MCR",
4699                "MCID" => Object::Integer(5),
4700                "Pg" => Object::Reference(page_id),
4701            },
4702            "Pg" => Object::Reference(page_id),
4703        });
4704
4705        let struct_tree_id = doc.add_object(dictionary! {
4706            "Type" => "StructTreeRoot",
4707            "K" => Object::Reference(p_elem_id),
4708        });
4709
4710        let catalog_id = doc.add_object(dictionary! {
4711            "Type" => "Catalog",
4712            "Pages" => pages_id,
4713            "StructTreeRoot" => Object::Reference(struct_tree_id),
4714        });
4715        doc.trailer.set("Root", catalog_id);
4716
4717        let mut buf = Vec::new();
4718        doc.save_to(&mut buf).expect("failed to save test PDF");
4719
4720        let doc = LopdfBackend::open(&buf).unwrap();
4721        let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4722
4723        assert_eq!(elements.len(), 1);
4724        let p = &elements[0];
4725        assert_eq!(p.element_type, "P");
4726        assert_eq!(p.mcids, vec![5]); // MCID from MCR dictionary
4727    }
4728
4729    #[test]
4730    fn structure_tree_alt_text() {
4731        use lopdf::{Document, Object, ObjectId, Stream, dictionary};
4732
4733        let mut doc = Document::with_version("1.7");
4734        let pages_id: ObjectId = doc.new_object_id();
4735
4736        let content = b"BT /F1 12 Tf 72 700 Td (image) Tj ET";
4737        let stream = Stream::new(dictionary! {}, content.to_vec());
4738        let content_id = doc.add_object(Object::Stream(stream));
4739
4740        let font_id = doc.add_object(dictionary! {
4741            "Type" => "Font",
4742            "Subtype" => "Type1",
4743            "BaseFont" => "Helvetica",
4744        });
4745
4746        let page_id = doc.add_object(dictionary! {
4747            "Type" => "Page",
4748            "Parent" => pages_id,
4749            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
4750            "Contents" => Object::Reference(content_id),
4751            "Resources" => dictionary! {
4752                "Font" => dictionary! {
4753                    "F1" => Object::Reference(font_id),
4754                },
4755            },
4756        });
4757
4758        doc.objects.insert(
4759            pages_id,
4760            Object::Dictionary(dictionary! {
4761                "Type" => "Pages",
4762                "Kids" => vec![Object::Reference(page_id)],
4763                "Count" => Object::Integer(1),
4764            }),
4765        );
4766
4767        // Figure element with /Alt and /ActualText
4768        let fig_elem_id = doc.add_object(dictionary! {
4769            "Type" => "StructElem",
4770            "S" => "Figure",
4771            "K" => Object::Integer(0),
4772            "Pg" => Object::Reference(page_id),
4773            "Alt" => Object::string_literal("A photo of a sunset"),
4774            "ActualText" => Object::string_literal("Sunset photo"),
4775        });
4776
4777        let struct_tree_id = doc.add_object(dictionary! {
4778            "Type" => "StructTreeRoot",
4779            "K" => Object::Reference(fig_elem_id),
4780        });
4781
4782        let catalog_id = doc.add_object(dictionary! {
4783            "Type" => "Catalog",
4784            "Pages" => pages_id,
4785            "StructTreeRoot" => Object::Reference(struct_tree_id),
4786        });
4787        doc.trailer.set("Root", catalog_id);
4788
4789        let mut buf = Vec::new();
4790        doc.save_to(&mut buf).expect("failed to save test PDF");
4791
4792        let doc = LopdfBackend::open(&buf).unwrap();
4793        let elements = LopdfBackend::document_structure_tree(&doc).unwrap();
4794
4795        assert_eq!(elements.len(), 1);
4796        let fig = &elements[0];
4797        assert_eq!(fig.element_type, "Figure");
4798        assert_eq!(fig.alt_text.as_deref(), Some("A photo of a sunset"));
4799        assert_eq!(fig.actual_text.as_deref(), Some("Sunset photo"));
4800    }
4801}