Skip to main content

pdfplumber_parse/
lopdf_backend.rs

1//! lopdf-based PDF parsing backend.
2//!
3//! Implements [`PdfBackend`] using the [lopdf](https://crates.io/crates/lopdf)
4//! crate for PDF document parsing. This is the default backend for pdfplumber-rs.
5
6use crate::backend::PdfBackend;
7use crate::error::BackendError;
8use crate::handler::ContentHandler;
9use pdfplumber_core::{BBox, ExtractOptions};
10
/// A parsed PDF document backed by lopdf.
///
/// Produced by `LopdfBackend::open`, which parses the input bytes and
/// records the page object IDs once so pages can later be looked up by
/// their 0-based index without re-walking the page tree.
pub struct LopdfDocument {
    /// The underlying lopdf document.
    inner: lopdf::Document,
    /// Cached ordered list of page ObjectIds (indexed by 0-based page number).
    page_ids: Vec<lopdf::ObjectId>,
}
18
impl LopdfDocument {
    /// Access the underlying lopdf document.
    ///
    /// Returns a shared borrow of the wrapped `lopdf::Document`; nothing is
    /// copied.
    pub fn inner(&self) -> &lopdf::Document {
        &self.inner
    }
}
25
26impl std::fmt::Debug for LopdfDocument {
27    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
28        f.debug_struct("LopdfDocument")
29            .field("page_count", &self.page_ids.len())
30            .finish_non_exhaustive()
31    }
32}
33
/// A reference to a single page within a [`LopdfDocument`].
///
/// This is a small `Copy` handle: it stores the page's object ID and index,
/// not the page contents, so it is only meaningful together with the
/// document it was obtained from.
#[derive(Debug, Clone, Copy)]
pub struct LopdfPage {
    /// The lopdf object ID for this page.
    pub object_id: lopdf::ObjectId,
    /// The 0-based page index.
    pub index: usize,
}
42
/// The lopdf-based PDF backend.
///
/// Provides PDF parsing via [`lopdf::Document`]. This is the default
/// backend used by pdfplumber-rs.
///
/// The type is a field-less marker: all functionality is exposed through
/// associated functions that take the document (and page) explicitly.
///
/// # Example
///
/// ```ignore
/// use pdfplumber_parse::lopdf_backend::LopdfBackend;
/// use pdfplumber_parse::PdfBackend;
///
/// let doc = LopdfBackend::open(pdf_bytes)?;
/// let count = LopdfBackend::page_count(&doc);
/// let page = LopdfBackend::get_page(&doc, 0)?;
/// ```
pub struct LopdfBackend;
59
60/// Extract a [`BBox`] from a lopdf array of 4 numbers `[x0, y0, x1, y1]`.
61fn extract_bbox_from_array(array: &[lopdf::Object]) -> Result<BBox, BackendError> {
62    if array.len() != 4 {
63        return Err(BackendError::Parse(format!(
64            "expected 4-element array for box, got {}",
65            array.len()
66        )));
67    }
68    let x0 = object_to_f64(&array[0])?;
69    let y0 = object_to_f64(&array[1])?;
70    let x1 = object_to_f64(&array[2])?;
71    let y1 = object_to_f64(&array[3])?;
72    Ok(BBox::new(x0, y0, x1, y1))
73}
74
75/// Convert a lopdf numeric object (Integer or Real) to f64.
76pub(crate) fn object_to_f64(obj: &lopdf::Object) -> Result<f64, BackendError> {
77    match obj {
78        lopdf::Object::Integer(i) => Ok(*i as f64),
79        lopdf::Object::Real(f) => Ok(*f as f64),
80        _ => Err(BackendError::Parse(format!("expected number, got {obj:?}"))),
81    }
82}
83
84/// Look up a key in the page dictionary, walking up the page tree
85/// (via /Parent) if the key is not found on the page itself.
86///
87/// Returns `None` if the key is not found anywhere in the tree.
88fn resolve_inherited<'a>(
89    doc: &'a lopdf::Document,
90    page_id: lopdf::ObjectId,
91    key: &[u8],
92) -> Result<Option<&'a lopdf::Object>, BackendError> {
93    let mut current_id = page_id;
94    loop {
95        let dict = doc
96            .get_object(current_id)
97            .and_then(|o| o.as_dict())
98            .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
99
100        if let Ok(value) = dict.get(key) {
101            return Ok(Some(value));
102        }
103
104        // Try to follow /Parent link
105        match dict.get(b"Parent") {
106            Ok(parent_obj) => {
107                current_id = parent_obj
108                    .as_reference()
109                    .map_err(|e| BackendError::Parse(format!("invalid /Parent reference: {e}")))?;
110            }
111            Err(_) => return Ok(None),
112        }
113    }
114}
115
116impl PdfBackend for LopdfBackend {
117    type Document = LopdfDocument;
118    type Page = LopdfPage;
119    type Error = BackendError;
120
121    fn open(bytes: &[u8]) -> Result<Self::Document, Self::Error> {
122        let inner = lopdf::Document::load_mem(bytes)
123            .map_err(|e| BackendError::Parse(format!("failed to parse PDF: {e}")))?;
124
125        // Cache page IDs in order (get_pages returns BTreeMap<u32, ObjectId> with 1-based keys)
126        let pages_map = inner.get_pages();
127        let page_ids: Vec<lopdf::ObjectId> = pages_map.values().copied().collect();
128
129        Ok(LopdfDocument { inner, page_ids })
130    }
131
132    fn page_count(doc: &Self::Document) -> usize {
133        doc.page_ids.len()
134    }
135
136    fn get_page(doc: &Self::Document, index: usize) -> Result<Self::Page, Self::Error> {
137        if index >= doc.page_ids.len() {
138            return Err(BackendError::Parse(format!(
139                "page index {index} out of range (0..{})",
140                doc.page_ids.len()
141            )));
142        }
143        Ok(LopdfPage {
144            object_id: doc.page_ids[index],
145            index,
146        })
147    }
148
149    fn page_media_box(doc: &Self::Document, page: &Self::Page) -> Result<BBox, Self::Error> {
150        let obj = resolve_inherited(&doc.inner, page.object_id, b"MediaBox")?
151            .ok_or_else(|| BackendError::Parse("MediaBox not found on page or ancestors".into()))?;
152        let array = obj
153            .as_array()
154            .map_err(|e| BackendError::Parse(format!("MediaBox is not an array: {e}")))?;
155        extract_bbox_from_array(array)
156    }
157
158    fn page_crop_box(doc: &Self::Document, page: &Self::Page) -> Result<Option<BBox>, Self::Error> {
159        // CropBox is optional — only look at the page itself, not inherited
160        let dict = doc
161            .inner
162            .get_object(page.object_id)
163            .and_then(|o| o.as_dict())
164            .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
165
166        match dict.get(b"CropBox") {
167            Ok(obj) => {
168                let array = obj
169                    .as_array()
170                    .map_err(|e| BackendError::Parse(format!("CropBox is not an array: {e}")))?;
171                Ok(Some(extract_bbox_from_array(array)?))
172            }
173            Err(_) => Ok(None),
174        }
175    }
176
177    fn page_rotate(doc: &Self::Document, page: &Self::Page) -> Result<i32, Self::Error> {
178        match resolve_inherited(&doc.inner, page.object_id, b"Rotate")? {
179            Some(obj) => {
180                let rotation = obj
181                    .as_i64()
182                    .map_err(|e| BackendError::Parse(format!("Rotate is not an integer: {e}")))?;
183                Ok(rotation as i32)
184            }
185            None => Ok(0), // Default rotation is 0
186        }
187    }
188
189    fn interpret_page(
190        doc: &Self::Document,
191        page: &Self::Page,
192        handler: &mut dyn ContentHandler,
193        options: &ExtractOptions,
194    ) -> Result<(), Self::Error> {
195        let inner = &doc.inner;
196
197        // Get the page dictionary
198        let page_dict = inner
199            .get_object(page.object_id)
200            .and_then(|o| o.as_dict())
201            .map_err(|e| BackendError::Parse(format!("failed to get page dictionary: {e}")))?;
202
203        // Get page content stream bytes
204        let content_bytes = get_page_content_bytes(inner, page_dict)?;
205
206        // Get page resources (may be inherited)
207        let resources = get_page_resources(inner, page.object_id)?;
208
209        // Initialize state machines
210        let mut gstate = crate::interpreter_state::InterpreterState::new();
211        let mut tstate = crate::text_state::TextState::new();
212
213        // Interpret the content stream
214        crate::interpreter::interpret_content_stream(
215            inner,
216            &content_bytes,
217            resources,
218            handler,
219            options,
220            0, // page-level depth
221            &mut gstate,
222            &mut tstate,
223        )
224    }
225}
226
227/// Get the content stream bytes from a page dictionary.
228///
229/// Handles both single stream references and arrays of stream references.
230fn get_page_content_bytes(
231    doc: &lopdf::Document,
232    page_dict: &lopdf::Dictionary,
233) -> Result<Vec<u8>, BackendError> {
234    let contents_obj = match page_dict.get(b"Contents") {
235        Ok(obj) => obj,
236        Err(_) => return Ok(Vec::new()), // Page with no content
237    };
238
239    match contents_obj {
240        lopdf::Object::Reference(id) => {
241            let obj = doc
242                .get_object(*id)
243                .map_err(|e| BackendError::Parse(format!("failed to resolve /Contents: {e}")))?;
244            let stream = obj
245                .as_stream()
246                .map_err(|e| BackendError::Parse(format!("/Contents is not a stream: {e}")))?;
247            decode_content_stream(stream)
248        }
249        lopdf::Object::Array(arr) => {
250            let mut content = Vec::new();
251            for item in arr {
252                let id = item.as_reference().map_err(|e| {
253                    BackendError::Parse(format!("/Contents array item is not a reference: {e}"))
254                })?;
255                let obj = doc.get_object(id).map_err(|e| {
256                    BackendError::Parse(format!("failed to resolve /Contents stream: {e}"))
257                })?;
258                let stream = obj.as_stream().map_err(|e| {
259                    BackendError::Parse(format!("/Contents array item is not a stream: {e}"))
260                })?;
261                let bytes = decode_content_stream(stream)?;
262                if !content.is_empty() {
263                    content.push(b' ');
264                }
265                content.extend_from_slice(&bytes);
266            }
267            Ok(content)
268        }
269        _ => Err(BackendError::Parse(
270            "/Contents is not a reference or array".to_string(),
271        )),
272    }
273}
274
275/// Decode a content stream, decompressing if needed.
276fn decode_content_stream(stream: &lopdf::Stream) -> Result<Vec<u8>, BackendError> {
277    if stream.dict.get(b"Filter").is_ok() {
278        stream
279            .decompressed_content()
280            .map_err(|e| BackendError::Parse(format!("failed to decompress content stream: {e}")))
281    } else {
282        Ok(stream.content.clone())
283    }
284}
285
286/// Get the resources dictionary for a page, handling inheritance.
287fn get_page_resources(
288    doc: &lopdf::Document,
289    page_id: lopdf::ObjectId,
290) -> Result<&lopdf::Dictionary, BackendError> {
291    match resolve_inherited(doc, page_id, b"Resources")? {
292        Some(obj) => {
293            // Resolve indirect reference if needed
294            let obj = match obj {
295                lopdf::Object::Reference(id) => doc.get_object(*id).map_err(|e| {
296                    BackendError::Parse(format!("failed to resolve /Resources reference: {e}"))
297                })?,
298                other => other,
299            };
300            obj.as_dict()
301                .map_err(|_| BackendError::Parse("/Resources is not a dictionary".to_string()))
302        }
303        None => {
304            // No resources at all — use empty dictionary
305            // This is unusual but we handle it gracefully
306            static EMPTY_DICT: std::sync::LazyLock<lopdf::Dictionary> =
307                std::sync::LazyLock::new(lopdf::Dictionary::new);
308            Ok(&EMPTY_DICT)
309        }
310    }
311}
312
/// Build an in-memory PDF with `page_count` empty pages, for tests.
///
/// Every page carries its own US Letter `/MediaBox` (612 x 792 points) and
/// has no `/Contents`. The serialized document bytes are returned.
#[cfg(test)]
fn create_test_pdf(page_count: usize) -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    // Reserve the /Pages id up front so each page can name it as /Parent.
    let pages_root: ObjectId = pdf.new_object_id();

    let kids: Vec<Object> = (0..page_count)
        .map(|_| {
            let leaf = pdf.add_object(dictionary! {
                "Type" => "Page",
                "Parent" => pages_root,
                "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            });
            Object::from(leaf)
        })
        .collect();

    let pages_dict = dictionary! {
        "Type" => "Pages",
        "Kids" => kids,
        "Count" => page_count as i64,
    };
    pdf.objects.insert(pages_root, Object::Dictionary(pages_dict));

    let catalog = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_root,
    });
    pdf.trailer.set("Root", catalog);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
353
/// Build a single-page test PDF whose page has no `/MediaBox` of its own.
///
/// The only `/MediaBox` (`[0 0 595 842]`) lives on the `/Pages` node, so a
/// reader has to follow `/Parent` to find it.
#[cfg(test)]
fn create_test_pdf_inherited_media_box() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    let pages_root: ObjectId = pdf.new_object_id();

    // The leaf deliberately omits MediaBox.
    let leaf = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => pages_root,
    });

    let pages_dict = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf)],
        "Count" => 1i64,
        "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
    };
    pdf.objects.insert(pages_root, Object::Dictionary(pages_dict));

    let catalog = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_root,
    });
    pdf.trailer.set("Root", catalog);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
388
/// Build a single-page test PDF whose page defines an explicit `/CropBox`.
///
/// The page has a US Letter `/MediaBox` and a `/CropBox` of
/// `[36.0 36.0 576.0 756.0]` stored as real numbers.
#[cfg(test)]
fn create_test_pdf_with_crop_box() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    let pages_root: ObjectId = pdf.new_object_id();

    // Real-valued corners, in the same order as they appear in the array.
    let crop_corners: Vec<Object> = Vec::from([36.0, 36.0, 576.0, 756.0].map(Object::Real));

    let leaf = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => pages_root,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "CropBox" => crop_corners,
    });

    let pages_dict = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf)],
        "Count" => 1i64,
    };
    pdf.objects.insert(pages_root, Object::Dictionary(pages_dict));

    let catalog = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_root,
    });
    pdf.trailer.set("Root", catalog);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
428
/// Build a single-page test PDF whose page dictionary has `/Rotate` set to
/// `rotation`.
#[cfg(test)]
fn create_test_pdf_with_rotate(rotation: i64) -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    let pages_root: ObjectId = pdf.new_object_id();

    // /Rotate is placed directly on the leaf page.
    let leaf = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => pages_root,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Rotate" => rotation,
    });

    let pages_dict = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf)],
        "Count" => 1i64,
    };
    pdf.objects.insert(pages_root, Object::Dictionary(pages_dict));

    let catalog = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_root,
    });
    pdf.trailer.set("Root", catalog);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
463
/// Build a single-page test PDF where `/Rotate` is set only on the `/Pages`
/// node.
///
/// The page itself has a `/MediaBox` but no `/Rotate`, so `rotation` can
/// only be found by following `/Parent`.
#[cfg(test)]
fn create_test_pdf_inherited_rotate(rotation: i64) -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, dictionary};

    let mut pdf = Document::with_version("1.5");
    let pages_root: ObjectId = pdf.new_object_id();

    // The leaf deliberately omits Rotate.
    let leaf = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => pages_root,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
    });

    let pages_dict = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf)],
        "Count" => 1i64,
        "Rotate" => rotation,
    };
    pdf.objects.insert(pages_root, Object::Dictionary(pages_dict));

    let catalog = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_root,
    });
    pdf.trailer.set("Root", catalog);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
499
/// Build a single-page test PDF whose page draws text through a Form
/// XObject.
///
/// Page content: `q /FM1 Do Q`
/// Form XObject FM1 content: `BT /F1 12 Tf 72 700 Td (Hello) Tj ET`
///
/// Both the page and the form list the same Helvetica font as `F1` in their
/// resources.
#[cfg(test)]
fn create_test_pdf_with_form_xobject() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let pages_root: ObjectId = pdf.new_object_id();

    // Minimal Type1 font dictionary
    let helvetica = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });
    // The same /Font sub-dictionary is needed by the form and by the page.
    let font_map = || {
        Object::Dictionary(dictionary! {
            "F1" => helvetica,
        })
    };

    // Form XObject stream: contains the text-showing operators.
    let form_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Resources" => Object::Dictionary(dictionary! {
            "Font" => font_map(),
        }),
    };
    let form_body = b"BT /F1 12 Tf 72 700 Td (Hello) Tj ET".to_vec();
    let form_ref = pdf.add_object(Object::Stream(Stream::new(form_dict, form_body)));

    // Page content: only invokes the form XObject.
    let page_body = b"q /FM1 Do Q".to_vec();
    let contents_ref = pdf.add_object(Object::Stream(Stream::new(
        lopdf::Dictionary::new(),
        page_body,
    )));

    let leaf = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => pages_root,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => contents_ref,
        "Resources" => Object::Dictionary(dictionary! {
            "Font" => font_map(),
            "XObject" => Object::Dictionary(dictionary! {
                "FM1" => form_ref,
            }),
        }),
    });

    let pages_dict = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf)],
        "Count" => 1i64,
    };
    pdf.objects.insert(pages_root, Object::Dictionary(pages_dict));

    let catalog = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_root,
    });
    pdf.trailer.set("Root", catalog);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
574
/// Build a single-page test PDF with two levels of nested Form XObjects.
///
/// Page content: `q /FM1 Do Q`
/// FM1 content: `q /FM2 Do Q` (references FM2)
/// FM2 content: `BT /F1 10 Tf (Deep) Tj ET` (actual text)
#[cfg(test)]
fn create_test_pdf_with_nested_form_xobjects() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let pages_root: ObjectId = pdf.new_object_id();

    let helvetica = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });
    // /Font sub-dictionary shared by the page and both forms.
    let font_map = || {
        Object::Dictionary(dictionary! {
            "F1" => helvetica,
        })
    };

    // Inner Form XObject (FM2): holds the actual text operators.
    let inner_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Resources" => Object::Dictionary(dictionary! {
            "Font" => font_map(),
        }),
    };
    let inner_body = b"BT /F1 10 Tf (Deep) Tj ET".to_vec();
    let inner_ref = pdf.add_object(Object::Stream(Stream::new(inner_dict, inner_body)));

    // Outer Form XObject (FM1): does nothing but invoke FM2.
    let outer_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Resources" => Object::Dictionary(dictionary! {
            "XObject" => Object::Dictionary(dictionary! {
                "FM2" => inner_ref,
            }),
            "Font" => font_map(),
        }),
    };
    let outer_body = b"q /FM2 Do Q".to_vec();
    let outer_ref = pdf.add_object(Object::Stream(Stream::new(outer_dict, outer_body)));

    // Page content: invoke FM1
    let page_body = b"q /FM1 Do Q".to_vec();
    let contents_ref = pdf.add_object(Object::Stream(Stream::new(
        lopdf::Dictionary::new(),
        page_body,
    )));

    let leaf = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => pages_root,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => contents_ref,
        "Resources" => Object::Dictionary(dictionary! {
            "XObject" => Object::Dictionary(dictionary! {
                "FM1" => outer_ref,
            }),
            "Font" => font_map(),
        }),
    });

    let pages_dict = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf)],
        "Count" => 1i64,
    };
    pdf.objects.insert(pages_root, Object::Dictionary(pages_dict));

    let catalog = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_root,
    });
    pdf.trailer.set("Root", catalog);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
669
/// Build a single-page test PDF whose Form XObject carries a `/Matrix`.
///
/// The form's `/Matrix` is `[2 0 0 2 10 20]` (scale by 2, then translate by
/// (10, 20)) and its content is `BT /F1 12 Tf (A) Tj ET`. The page content
/// is `q /FM1 Do Q`.
#[cfg(test)]
fn create_test_pdf_form_xobject_with_matrix() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let pages_root: ObjectId = pdf.new_object_id();

    let helvetica = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });
    // /Font sub-dictionary shared by the form and the page.
    let font_map = || {
        Object::Dictionary(dictionary! {
            "F1" => helvetica,
        })
    };

    // Six real-valued matrix entries, in array order.
    let matrix: Vec<Object> = Vec::from([2.0, 0.0, 0.0, 2.0, 10.0, 20.0].map(Object::Real));

    let form_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Form",
        "BBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Matrix" => matrix,
        "Resources" => Object::Dictionary(dictionary! {
            "Font" => font_map(),
        }),
    };
    let form_body = b"BT /F1 12 Tf (A) Tj ET".to_vec();
    let form_ref = pdf.add_object(Object::Stream(Stream::new(form_dict, form_body)));

    let page_body = b"q /FM1 Do Q".to_vec();
    let contents_ref = pdf.add_object(Object::Stream(Stream::new(
        lopdf::Dictionary::new(),
        page_body,
    )));

    let leaf = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => pages_root,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => contents_ref,
        "Resources" => Object::Dictionary(dictionary! {
            "XObject" => Object::Dictionary(dictionary! {
                "FM1" => form_ref,
            }),
            "Font" => font_map(),
        }),
    });

    let pages_dict = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf)],
        "Count" => 1i64,
    };
    pdf.objects.insert(pages_root, Object::Dictionary(pages_dict));

    let catalog = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_root,
    });
    pdf.trailer.set("Root", catalog);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
745
/// Build a single-page test PDF that places an Image XObject (not a Form).
///
/// The image is 2x2 pixels, DeviceRGB, 8 bits per component (12 bytes of
/// data). The page content is `q 200 0 0 150 100 300 cm /Im0 Do Q`.
#[cfg(test)]
fn create_test_pdf_with_image_xobject() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let pages_root: ObjectId = pdf.new_object_id();

    // 2x2 RGB image: four pixels, three bytes each.
    let pixels = vec![255u8, 0, 0, 0, 255, 0, 0, 0, 255, 255, 255, 0];
    let image_dict = dictionary! {
        "Type" => "XObject",
        "Subtype" => "Image",
        "Width" => 2i64,
        "Height" => 2i64,
        "ColorSpace" => "DeviceRGB",
        "BitsPerComponent" => 8i64,
    };
    let image_ref = pdf.add_object(Object::Stream(Stream::new(image_dict, pixels)));

    // Page content: set the transformation matrix, then paint the image.
    let page_body = b"q 200 0 0 150 100 300 cm /Im0 Do Q".to_vec();
    let contents_ref = pdf.add_object(Object::Stream(Stream::new(
        lopdf::Dictionary::new(),
        page_body,
    )));

    let leaf = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => pages_root,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => contents_ref,
        "Resources" => Object::Dictionary(dictionary! {
            "XObject" => Object::Dictionary(dictionary! {
                "Im0" => image_ref,
            }),
        }),
    });

    let pages_dict = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf)],
        "Count" => 1i64,
    };
    pdf.objects.insert(pages_root, Object::Dictionary(pages_dict));

    let catalog = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_root,
    });
    pdf.trailer.set("Root", catalog);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
805
/// Build a single-page test PDF whose content stream shows text directly,
/// without any XObjects.
///
/// Page content: `BT /F1 12 Tf 72 700 Td (Hi) Tj ET`, with a Helvetica
/// Type1 font registered as `F1` in the page resources.
#[cfg(test)]
fn create_test_pdf_with_text_content() -> Vec<u8> {
    use lopdf::{Document, Object, ObjectId, Stream, dictionary};

    let mut pdf = Document::with_version("1.5");
    let pages_root: ObjectId = pdf.new_object_id();

    let helvetica = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });

    // The content stream has an empty dictionary, i.e. no /Filter.
    let page_body = b"BT /F1 12 Tf 72 700 Td (Hi) Tj ET".to_vec();
    let contents_ref = pdf.add_object(Object::Stream(Stream::new(
        lopdf::Dictionary::new(),
        page_body,
    )));

    let page_resources = dictionary! {
        "Font" => Object::Dictionary(dictionary! {
            "F1" => helvetica,
        }),
    };
    let leaf = pdf.add_object(dictionary! {
        "Type" => "Page",
        "Parent" => pages_root,
        "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        "Contents" => contents_ref,
        "Resources" => Object::Dictionary(page_resources),
    });

    let pages_dict = dictionary! {
        "Type" => "Pages",
        "Kids" => vec![Object::from(leaf)],
        "Count" => 1i64,
    };
    pdf.objects.insert(pages_root, Object::Dictionary(pages_dict));

    let catalog = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => pages_root,
    });
    pdf.trailer.set("Root", catalog);

    let mut bytes = Vec::new();
    pdf.save_to(&mut bytes).expect("failed to save test PDF");
    bytes
}
855
#[cfg(test)]
mod tests {
    // Unit tests for the lopdf backend. They cover opening documents, page
    // lookup, page geometry (/MediaBox, /CropBox, /Rotate) and content-stream
    // interpretation, including Form and Image XObjects.
    use super::*;
    use crate::handler::{CharEvent, ContentHandler, ImageEvent};
    use pdfplumber_core::PdfError;

    // --- CollectingHandler for interpret_page tests ---

    /// Test-only [`ContentHandler`] that stores every char and image event it
    /// receives, in arrival order, so tests can inspect them afterwards.
    #[derive(Default)]
    struct CollectingHandler {
        chars: Vec<CharEvent>,
        images: Vec<ImageEvent>,
    }

    impl CollectingHandler {
        /// Creates a handler with no recorded events.
        fn new() -> Self {
            Self::default()
        }

        /// Character codes of all recorded char events, in emission order.
        fn char_codes(&self) -> Vec<u32> {
            self.chars.iter().map(|event| event.char_code).collect()
        }
    }

    impl ContentHandler for CollectingHandler {
        fn on_char(&mut self, event: CharEvent) {
            self.chars.push(event);
        }
        fn on_image(&mut self, event: ImageEvent) {
            self.images.push(event);
        }
    }

    // --- Shared helpers ---

    /// Widens each byte of `text` to the `u32` code the interpreter is
    /// expected to report for it.
    fn expected_codes(text: &[u8]) -> Vec<u32> {
        text.iter().copied().map(u32::from).collect()
    }

    /// Opens `pdf_bytes`, interprets its first page with default
    /// [`ExtractOptions`] and returns the handler holding every emitted event.
    ///
    /// Panics if any step fails; `#[track_caller]` makes the panic point at
    /// the calling test instead of this helper.
    #[track_caller]
    fn interpret_first_page(pdf_bytes: &[u8]) -> CollectingHandler {
        let doc = LopdfBackend::open(pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        let mut handler = CollectingHandler::new();
        LopdfBackend::interpret_page(&doc, &page, &mut handler, &ExtractOptions::default())
            .unwrap();
        handler
    }

    // --- open() tests ---

    #[test]
    fn open_valid_single_page_pdf() {
        let pdf_bytes = create_test_pdf(1);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        assert_eq!(LopdfBackend::page_count(&doc), 1);
    }

    #[test]
    fn open_valid_multi_page_pdf() {
        let pdf_bytes = create_test_pdf(5);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        assert_eq!(LopdfBackend::page_count(&doc), 5);
    }

    #[test]
    fn open_invalid_bytes_returns_error() {
        assert!(LopdfBackend::open(b"not a pdf").is_err());
    }

    #[test]
    fn open_empty_bytes_returns_error() {
        assert!(LopdfBackend::open(&[]).is_err());
    }

    #[test]
    fn open_error_converts_to_pdf_error() {
        let pdf_err: PdfError = LopdfBackend::open(b"garbage").unwrap_err().into();
        assert!(matches!(pdf_err, PdfError::ParseError(_)));
    }

    // --- page_count() tests ---

    #[test]
    fn page_count_zero_pages() {
        let pdf_bytes = create_test_pdf(0);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        assert_eq!(LopdfBackend::page_count(&doc), 0);
    }

    #[test]
    fn page_count_three_pages() {
        let pdf_bytes = create_test_pdf(3);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        assert_eq!(LopdfBackend::page_count(&doc), 3);
    }

    // --- get_page() tests ---

    #[test]
    fn get_page_first_page() {
        let pdf_bytes = create_test_pdf(3);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        assert_eq!(page.index, 0);
    }

    #[test]
    fn get_page_last_page() {
        let pdf_bytes = create_test_pdf(3);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 2).unwrap();
        assert_eq!(page.index, 2);
    }

    #[test]
    fn get_page_out_of_bounds() {
        let pdf_bytes = create_test_pdf(2);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        // Valid indices are 0 and 1; index 2 is one past the end.
        assert!(LopdfBackend::get_page(&doc, 2).is_err());
    }

    #[test]
    fn get_page_out_of_bounds_error_converts_to_pdf_error() {
        let pdf_bytes = create_test_pdf(1);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let pdf_err: PdfError = LopdfBackend::get_page(&doc, 5).unwrap_err().into();
        assert!(matches!(pdf_err, PdfError::ParseError(_)));
        assert!(pdf_err.to_string().contains("out of range"));
    }

    #[test]
    fn get_page_on_empty_document() {
        let pdf_bytes = create_test_pdf(0);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        assert!(LopdfBackend::get_page(&doc, 0).is_err());
    }

    // --- Page object IDs are distinct ---

    #[test]
    fn pages_have_distinct_object_ids() {
        let pdf_bytes = create_test_pdf(3);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page0 = LopdfBackend::get_page(&doc, 0).unwrap();
        let page1 = LopdfBackend::get_page(&doc, 1).unwrap();
        let page2 = LopdfBackend::get_page(&doc, 2).unwrap();
        assert_ne!(page0.object_id, page1.object_id);
        assert_ne!(page1.object_id, page2.object_id);
        assert_ne!(page0.object_id, page2.object_id);
    }

    // --- Integration: open + page_count + get_page round-trip ---

    #[test]
    fn round_trip_open_count_access() {
        let pdf_bytes = create_test_pdf(4);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let count = LopdfBackend::page_count(&doc);
        assert_eq!(count, 4);

        for i in 0..count {
            let page = LopdfBackend::get_page(&doc, i).unwrap();
            assert_eq!(page.index, i);
        }

        // One past the end should fail
        assert!(LopdfBackend::get_page(&doc, count).is_err());
    }

    // --- page_media_box() tests ---

    #[test]
    fn media_box_explicit_us_letter() {
        let pdf_bytes = create_test_pdf(1);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
        assert_eq!(media_box, BBox::new(0.0, 0.0, 612.0, 792.0));
    }

    #[test]
    fn media_box_inherited_from_parent() {
        let pdf_bytes = create_test_pdf_inherited_media_box();
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
        // Inherited A4 size from parent Pages node
        assert_eq!(media_box, BBox::new(0.0, 0.0, 595.0, 842.0));
    }

    #[test]
    fn media_box_width_height() {
        let pdf_bytes = create_test_pdf(1);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
        assert_eq!(media_box.width(), 612.0);
        assert_eq!(media_box.height(), 792.0);
    }

    // --- page_crop_box() tests ---

    #[test]
    fn crop_box_present() {
        let pdf_bytes = create_test_pdf_with_crop_box();
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        let crop_box = LopdfBackend::page_crop_box(&doc, &page).unwrap();
        assert_eq!(crop_box, Some(BBox::new(36.0, 36.0, 576.0, 756.0)));
    }

    #[test]
    fn crop_box_absent() {
        let pdf_bytes = create_test_pdf(1);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        let crop_box = LopdfBackend::page_crop_box(&doc, &page).unwrap();
        assert_eq!(crop_box, None);
    }

    // --- page_rotate() tests ---

    #[test]
    fn rotate_default_zero() {
        let pdf_bytes = create_test_pdf(1);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
        assert_eq!(rotation, 0);
    }

    #[test]
    fn rotate_90() {
        let pdf_bytes = create_test_pdf_with_rotate(90);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
        assert_eq!(rotation, 90);
    }

    #[test]
    fn rotate_180() {
        let pdf_bytes = create_test_pdf_with_rotate(180);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
        assert_eq!(rotation, 180);
    }

    #[test]
    fn rotate_270() {
        let pdf_bytes = create_test_pdf_with_rotate(270);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
        assert_eq!(rotation, 270);
    }

    #[test]
    fn rotate_inherited_from_parent() {
        let pdf_bytes = create_test_pdf_inherited_rotate(90);
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();
        assert_eq!(rotation, 90);
    }

    // --- Integration: all page properties together ---

    #[test]
    fn page_properties_round_trip() {
        let pdf_bytes = create_test_pdf_with_crop_box();
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();

        let media_box = LopdfBackend::page_media_box(&doc, &page).unwrap();
        let crop_box = LopdfBackend::page_crop_box(&doc, &page).unwrap();
        let rotation = LopdfBackend::page_rotate(&doc, &page).unwrap();

        assert_eq!(media_box, BBox::new(0.0, 0.0, 612.0, 792.0));
        assert!(crop_box.is_some());
        assert_eq!(rotation, 0);
    }

    // --- interpret_page: basic text extraction ---

    #[test]
    fn interpret_page_simple_text() {
        let handler = interpret_first_page(&create_test_pdf_with_text_content());

        // "Hi" = 2 characters
        assert_eq!(handler.char_codes(), expected_codes(b"Hi"));
        assert_eq!(handler.chars[0].font_size, 12.0);
        assert_eq!(handler.chars[0].font_name, "Helvetica");
    }

    #[test]
    fn interpret_page_no_content() {
        // Page with no /Contents should not fail
        let handler = interpret_first_page(&create_test_pdf(1));
        assert!(handler.chars.is_empty());
    }

    // --- interpret_page: Form XObject tests (US-016) ---

    #[test]
    fn interpret_page_form_xobject_text() {
        let handler = interpret_first_page(&create_test_pdf_with_form_xobject());

        // Form XObject contains "Hello" = 5 chars
        assert_eq!(handler.char_codes(), expected_codes(b"Hello"));
        assert_eq!(handler.chars[0].font_name, "Helvetica");
        assert_eq!(handler.chars[0].font_size, 12.0);
    }

    #[test]
    fn interpret_page_nested_form_xobjects() {
        let handler = interpret_first_page(&create_test_pdf_with_nested_form_xobjects());

        // Nested form XObject FM1→FM2 contains "Deep" = 4 chars
        assert_eq!(handler.char_codes(), expected_codes(b"Deep"));
    }

    #[test]
    fn interpret_page_form_xobject_matrix_applied() {
        let handler = interpret_first_page(&create_test_pdf_form_xobject_with_matrix());

        // Form XObject has /Matrix [2 0 0 2 10 20], character "A"
        assert_eq!(handler.char_codes(), expected_codes(b"A"));

        // CTM should include the form's matrix transform:
        // form matrix [2 0 0 2 10 20] applied on top of identity.
        let ctm = handler.chars[0].ctm;
        for (idx, expected) in [(0_usize, 2.0), (3, 2.0), (4, 10.0), (5, 20.0)] {
            assert!(
                (ctm[idx] - expected).abs() < 0.01,
                "ctm[{}] = {}, expected {}",
                idx,
                ctm[idx],
                expected
            );
        }
    }

    #[test]
    fn interpret_page_form_xobject_state_restored() {
        // After processing a Form XObject, the graphics state should be restored.
        // The Form XObject is wrapped in q/Q on the page, and the interpreter
        // also saves/restores state around the Form XObject.
        let pdf_bytes = create_test_pdf_with_form_xobject();
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        let options = ExtractOptions::default();
        let mut handler = CollectingHandler::new();

        // This should complete without errors (state properly saved/restored).
        // `expect` rather than `assert!(is_ok())` so a failure prints the error.
        LopdfBackend::interpret_page(&doc, &page, &mut handler, &options)
            .expect("interpreting a page with a Form XObject should succeed");
    }

    #[test]
    fn interpret_page_image_xobject() {
        let handler = interpret_first_page(&create_test_pdf_with_image_xobject());

        // Should have 1 image event, no chars
        assert!(handler.chars.is_empty());
        assert_eq!(handler.images.len(), 1);

        let image = &handler.images[0];
        assert_eq!(image.name, "Im0");
        assert_eq!(image.width, 2);
        assert_eq!(image.height, 2);
        assert_eq!(image.colorspace.as_deref(), Some("DeviceRGB"));
        assert_eq!(image.bits_per_component, Some(8));

        // CTM should be [200 0 0 150 100 300] from the cm operator
        let ctm = image.ctm;
        for (idx, expected) in [(0_usize, 200.0), (3, 150.0), (4, 100.0), (5, 300.0)] {
            assert!(
                (ctm[idx] - expected).abs() < 0.01,
                "ctm[{}] = {}, expected {}",
                idx,
                ctm[idx],
                expected
            );
        }
    }

    #[test]
    fn interpret_page_recursion_limit() {
        // Use the single-level Form XObject PDF with max_recursion_depth = 0:
        // even one level of XObject nesting must exceed the limit.
        let pdf_bytes = create_test_pdf_with_form_xobject();
        let doc = LopdfBackend::open(&pdf_bytes).unwrap();
        let page = LopdfBackend::get_page(&doc, 0).unwrap();
        let mut options = ExtractOptions::default();
        options.max_recursion_depth = 0; // Page level = 0, Form XObject = 1 > limit
        let mut handler = CollectingHandler::new();

        // `unwrap_err` fails the test if interpretation unexpectedly succeeds.
        let err = LopdfBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap_err();
        assert!(err.to_string().contains("recursion depth"));
    }
}