Skip to main content

pdfluent_lopdf/
reader.rs

1use log::{error, warn};
2use std::cmp;
3use std::collections::{BTreeMap, HashSet};
4use std::convert::TryInto;
5#[cfg(not(feature = "async"))]
6use std::fs::File;
7#[cfg(not(feature = "async"))]
8use std::io::Read;
9use std::path::Path;
10use std::sync::Mutex;
11
12#[cfg(feature = "rayon")]
13use rayon::prelude::*;
14#[cfg(feature = "async")]
15use tokio::fs::File;
16#[cfg(feature = "async")]
17use tokio::io::{AsyncRead, AsyncReadExt};
18#[cfg(feature = "async")]
19use tokio::pin;
20
21use crate::encryption::{self, EncryptionState};
22use crate::error::{ParseError, XrefError};
23use crate::load_options::LoadOptions;
24use crate::object_stream::ObjectStream;
25use crate::parser::{self, ParserInput};
26use crate::xref::XrefEntry;
27use crate::{Dictionary, Document, Error, IncrementalDocument, Object, ObjectId, Result};
28
/// Signature of the optional per-object callback accepted by
/// `Document::load_filtered` and `Reader::read`.
///
/// The callback receives a `(u32, u16)` pair together with a mutable
/// reference to an `Object`, and returns either a pair/object tuple or `None`.
/// NOTE(review): presumably the pair is the object id (number, generation)
/// and returning `None` drops the object from the loaded document — confirm
/// against the body of `Reader::read`.
type FilterFunc = fn((u32, u16), &mut Object) -> Option<((u32, u16), Object)>;
30
31#[cfg(not(feature = "async"))]
32impl Document {
33    /// Load a PDF document from a specified file path.
34    #[inline]
35    pub fn load<P: AsRef<Path>>(path: P) -> Result<Document> {
36        let file = File::open(path)?;
37        let capacity = Some(file.metadata()?.len() as usize);
38        Self::load_internal(file, capacity, None, None)
39    }
40
41    /// Load a PDF document from a specified file path with a password for encrypted PDFs.
42    #[inline]
43    pub fn load_with_password<P: AsRef<Path>>(path: P, password: &str) -> Result<Document> {
44        let file = File::open(path)?;
45        let capacity = Some(file.metadata()?.len() as usize);
46        Self::load_internal(file, capacity, None, Some(password.to_string()))
47    }
48
49    #[inline]
50    pub fn load_filtered<P: AsRef<Path>>(path: P, filter_func: FilterFunc) -> Result<Document> {
51        let file = File::open(path)?;
52        let capacity = Some(file.metadata()?.len() as usize);
53        Self::load_internal(file, capacity, Some(filter_func), None)
54    }
55
56    /// Load a PDF document from an arbitrary source.
57    #[inline]
58    pub fn load_from<R: Read>(source: R) -> Result<Document> {
59        Self::load_internal(source, None, None, None)
60    }
61
62    /// Load a PDF document from an arbitrary source with a password for encrypted PDFs.
63    #[inline]
64    pub fn load_from_with_password<R: Read>(source: R, password: &str) -> Result<Document> {
65        Self::load_internal(source, None, None, Some(password.to_string()))
66    }
67
68    fn load_internal<R: Read>(
69        mut source: R,
70        capacity: Option<usize>,
71        filter_func: Option<FilterFunc>,
72        password: Option<String>,
73    ) -> Result<Document> {
74        let mut buffer = capacity.map(Vec::with_capacity).unwrap_or_default();
75        source.read_to_end(&mut buffer)?;
76
77        Reader {
78            buffer: &buffer,
79            document: Document::new(),
80            encryption_state: None,
81            raw_objects: BTreeMap::new(),
82            password,
83            options: LoadOptions::default(),
84        }
85        .read(filter_func)
86    }
87
88    /// Load a PDF document from a memory slice.
89    pub fn load_mem(buffer: &[u8]) -> Result<Document> {
90        buffer.try_into()
91    }
92
93    /// Load a PDF document from a memory slice with custom load options.
94    ///
95    /// # Errors
96    ///
97    /// Returns `Err(Error::DocumentTooLarge)` if `opts.max_file_bytes` is set
98    /// and `buffer.len()` exceeds that limit.
99    pub fn load_mem_with_options(buffer: &[u8], opts: &LoadOptions) -> Result<Document> {
100        // Phase 1 (Issue #468): reject inputs that exceed the configured size limit
101        // before allocating the full object graph.
102        if let Some(limit) = opts.max_file_bytes {
103            if buffer.len() > limit {
104                return Err(Error::DocumentTooLarge {
105                    size: buffer.len(),
106                    limit,
107                });
108            }
109        }
110        Reader {
111            buffer,
112            document: Document::new(),
113            encryption_state: None,
114            raw_objects: BTreeMap::new(),
115            password: None,
116            options: opts.clone(),
117        }
118        .read(None)
119    }
120
121    /// Load a PDF document from a file path with custom load options.
122    ///
123    /// # Errors
124    ///
125    /// Returns `Err(Error::DocumentTooLarge)` if `opts.max_file_bytes` is set
126    /// and the file size exceeds that limit.
127    pub fn load_with_options<P: AsRef<Path>>(path: P, opts: &LoadOptions) -> Result<Document> {
128        let file = File::open(path.as_ref())?;
129        let file_size = file.metadata()?.len() as usize;
130        if let Some(limit) = opts.max_file_bytes {
131            if file_size > limit {
132                return Err(Error::DocumentTooLarge {
133                    size: file_size,
134                    limit,
135                });
136            }
137        }
138        let mut buffer = Vec::with_capacity(file_size);
139        let mut f = file;
140        f.read_to_end(&mut buffer)?;
141        Reader {
142            buffer: &buffer,
143            document: Document::new(),
144            encryption_state: None,
145            raw_objects: BTreeMap::new(),
146            password: None,
147            options: opts.clone(),
148        }
149        .read(None)
150    }
151
152    /// Load a PDF document from a memory slice with a password for encrypted PDFs.
153    pub fn load_mem_with_password(buffer: &[u8], password: &str) -> Result<Document> {
154        Reader {
155            buffer,
156            document: Document::new(),
157            encryption_state: None,
158            raw_objects: BTreeMap::new(),
159            password: Some(password.to_string()),
160            options: LoadOptions::default(),
161        }
162        .read(None)
163    }
164
165    /// Load PDF metadata (title and page count) without loading the entire document.
166    /// This is much faster for large PDFs when you only need basic information.
167    #[inline]
168    pub fn load_metadata<P: AsRef<Path>>(path: P) -> Result<PdfMetadata> {
169        let file = File::open(path)?;
170        let capacity = Some(file.metadata()?.len() as usize);
171        Self::load_metadata_internal(file, capacity, None)
172    }
173
174    /// Load PDF metadata from a file path with a password for encrypted PDFs.
175    #[inline]
176    pub fn load_metadata_with_password<P: AsRef<Path>>(
177        path: P,
178        password: &str,
179    ) -> Result<PdfMetadata> {
180        let file = File::open(path)?;
181        let capacity = Some(file.metadata()?.len() as usize);
182        Self::load_metadata_internal(file, capacity, Some(password.to_string()))
183    }
184
185    /// Load PDF metadata from an arbitrary source without loading the entire document.
186    #[inline]
187    pub fn load_metadata_from<R: Read>(source: R) -> Result<PdfMetadata> {
188        Self::load_metadata_internal(source, None, None)
189    }
190
191    /// Load PDF metadata from an arbitrary source with a password for encrypted PDFs.
192    #[inline]
193    pub fn load_metadata_from_with_password<R: Read>(
194        source: R,
195        password: &str,
196    ) -> Result<PdfMetadata> {
197        Self::load_metadata_internal(source, None, Some(password.to_string()))
198    }
199
200    /// Load PDF metadata from a memory slice without loading the entire document.
201    #[inline]
202    pub fn load_metadata_mem(buffer: &[u8]) -> Result<PdfMetadata> {
203        Reader {
204            buffer,
205            document: Document::new(),
206            encryption_state: None,
207            raw_objects: BTreeMap::new(),
208            password: None,
209            options: LoadOptions::default(),
210        }
211        .read_metadata()
212    }
213
214    /// Load PDF metadata from a memory slice with a password for encrypted PDFs.
215    #[inline]
216    pub fn load_metadata_mem_with_password(buffer: &[u8], password: &str) -> Result<PdfMetadata> {
217        Reader {
218            buffer,
219            document: Document::new(),
220            encryption_state: None,
221            raw_objects: BTreeMap::new(),
222            password: Some(password.to_string()),
223            options: LoadOptions::default(),
224        }
225        .read_metadata()
226    }
227
228    fn load_metadata_internal<R: Read>(
229        mut source: R,
230        capacity: Option<usize>,
231        password: Option<String>,
232    ) -> Result<PdfMetadata> {
233        let mut buffer = capacity.map(Vec::with_capacity).unwrap_or_default();
234        source.read_to_end(&mut buffer)?;
235
236        Reader {
237            buffer: &buffer,
238            document: Document::new(),
239            encryption_state: None,
240            raw_objects: BTreeMap::new(),
241            password,
242            options: LoadOptions::default(),
243        }
244        .read_metadata()
245    }
246}
247
#[cfg(feature = "async")]
impl Document {
    /// Asynchronously load a PDF document from a specified file path.
    ///
    /// The file length reported by the file system is used as a capacity hint
    /// for the read buffer.
    pub async fn load<P: AsRef<Path>>(path: P) -> Result<Document> {
        let file = File::open(path).await?;
        let size_hint = file.metadata().await?.len() as usize;
        Self::load_internal(file, Some(size_hint), None, None).await
    }

    /// Asynchronously load a PDF document from a file path, supplying the
    /// password to use for an encrypted PDF.
    pub async fn load_with_password<P: AsRef<Path>>(path: P, password: &str) -> Result<Document> {
        let file = File::open(path).await?;
        let size_hint = file.metadata().await?.len() as usize;
        let password = Some(password.to_string());
        Self::load_internal(file, Some(size_hint), None, password).await
    }

    /// Asynchronously load a PDF document from a file path, handing
    /// `filter_func` to the reader.
    pub async fn load_filtered<P: AsRef<Path>>(
        path: P,
        filter_func: FilterFunc,
    ) -> Result<Document> {
        let file = File::open(path).await?;
        let size_hint = file.metadata().await?.len() as usize;
        Self::load_internal(file, Some(size_hint), Some(filter_func), None).await
    }

    /// Drain `source` into memory, then parse the bytes with default load
    /// options, forwarding `filter_func` and `password` to the reader.
    async fn load_internal<R: AsyncRead>(
        source: R,
        capacity: Option<usize>,
        filter_func: Option<FilterFunc>,
        password: Option<String>,
    ) -> Result<Document> {
        pin!(source);

        let mut bytes = match capacity {
            Some(expected) => Vec::with_capacity(expected),
            None => Vec::new(),
        };
        source.read_to_end(&mut bytes).await?;

        let reader = Reader {
            buffer: &bytes,
            document: Document::new(),
            encryption_state: None,
            raw_objects: BTreeMap::new(),
            password,
            options: LoadOptions::default(),
        };
        reader.read(filter_func)
    }

    /// Parse a PDF document that is already held in memory.
    pub fn load_mem(buffer: &[u8]) -> Result<Document> {
        TryInto::<Document>::try_into(buffer)
    }

    /// Parse an in-memory PDF document, supplying the password to use for an
    /// encrypted PDF.
    ///
    /// This helper is synchronous even in async builds, so callers that
    /// already hold the bytes do not have to branch on the `async` feature.
    pub fn load_mem_with_password(buffer: &[u8], password: &str) -> Result<Document> {
        let reader = Reader {
            buffer,
            document: Document::new(),
            encryption_state: None,
            raw_objects: BTreeMap::new(),
            password: Some(password.to_string()),
            options: LoadOptions::default(),
        };
        reader.read(None)
    }

    /// Asynchronously load PDF metadata from a file path without building the
    /// full document.
    ///
    /// The file contents are still read into memory in full; only parsing is
    /// reduced.
    #[inline]
    pub async fn load_metadata<P: AsRef<Path>>(path: P) -> Result<PdfMetadata> {
        let file = File::open(path).await?;
        let size_hint = file.metadata().await?.len() as usize;
        Self::load_metadata_internal(file, Some(size_hint), None).await
    }

    /// Asynchronously load PDF metadata from a file path, supplying the
    /// password to use for an encrypted PDF.
    #[inline]
    pub async fn load_metadata_with_password<P: AsRef<Path>>(
        path: P,
        password: &str,
    ) -> Result<PdfMetadata> {
        let file = File::open(path).await?;
        let size_hint = file.metadata().await?.len() as usize;
        let password = Some(password.to_string());
        Self::load_metadata_internal(file, Some(size_hint), password).await
    }

    /// Asynchronously load PDF metadata from an arbitrary source.
    #[inline]
    pub async fn load_metadata_from<R: AsyncRead>(source: R) -> Result<PdfMetadata> {
        Self::load_metadata_internal(source, None, None).await
    }

    /// Asynchronously load PDF metadata from an arbitrary source, supplying
    /// the password to use for an encrypted PDF.
    #[inline]
    pub async fn load_metadata_from_with_password<R: AsyncRead>(
        source: R,
        password: &str,
    ) -> Result<PdfMetadata> {
        let password = Some(password.to_string());
        Self::load_metadata_internal(source, None, password).await
    }

    /// Extract PDF metadata from bytes that are already held in memory.
    #[inline]
    pub fn load_metadata_mem(buffer: &[u8]) -> Result<PdfMetadata> {
        let reader = Reader {
            buffer,
            document: Document::new(),
            encryption_state: None,
            raw_objects: BTreeMap::new(),
            password: None,
            options: LoadOptions::default(),
        };
        reader.read_metadata()
    }

    /// Extract PDF metadata from in-memory bytes, supplying the password to
    /// use for an encrypted PDF.
    #[inline]
    pub fn load_metadata_mem_with_password(buffer: &[u8], password: &str) -> Result<PdfMetadata> {
        let reader = Reader {
            buffer,
            document: Document::new(),
            encryption_state: None,
            raw_objects: BTreeMap::new(),
            password: Some(password.to_string()),
            options: LoadOptions::default(),
        };
        reader.read_metadata()
    }

    /// Drain `source` into memory, then extract metadata with default load
    /// options, forwarding `password` to the reader.
    async fn load_metadata_internal<R: AsyncRead>(
        source: R,
        capacity: Option<usize>,
        password: Option<String>,
    ) -> Result<PdfMetadata> {
        pin!(source);

        let mut bytes = match capacity {
            Some(expected) => Vec::with_capacity(expected),
            None => Vec::new(),
        };
        source.read_to_end(&mut bytes).await?;

        let reader = Reader {
            buffer: &bytes,
            document: Document::new(),
            encryption_state: None,
            raw_objects: BTreeMap::new(),
            password,
            options: LoadOptions::default(),
        };
        reader.read_metadata()
    }
}
405
406impl TryInto<Document> for &[u8] {
407    type Error = Error;
408
409    fn try_into(self) -> Result<Document> {
410        Reader {
411            buffer: self,
412            document: Document::new(),
413            encryption_state: None,
414            raw_objects: BTreeMap::new(),
415            password: None,
416            options: LoadOptions::default(),
417        }
418        .read(None)
419    }
420}
421
422#[cfg(not(feature = "async"))]
423impl IncrementalDocument {
424    /// Load a PDF document from a specified file path.
425    #[inline]
426    pub fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
427        let file = File::open(path)?;
428        let capacity = Some(file.metadata()?.len() as usize);
429        Self::load_internal(file, capacity)
430    }
431
432    /// Load a PDF document from an arbitrary source.
433    #[inline]
434    pub fn load_from<R: Read>(source: R) -> Result<Self> {
435        Self::load_internal(source, None)
436    }
437
438    fn load_internal<R: Read>(mut source: R, capacity: Option<usize>) -> Result<Self> {
439        let mut buffer = capacity.map(Vec::with_capacity).unwrap_or_default();
440        source.read_to_end(&mut buffer)?;
441
442        let document = Reader {
443            buffer: &buffer,
444            document: Document::new(),
445            encryption_state: None,
446            raw_objects: BTreeMap::new(),
447            password: None,
448            options: LoadOptions::default(),
449        }
450        .read(None)?;
451
452        Ok(IncrementalDocument::create_from(buffer, document))
453    }
454
455    /// Load a PDF document from a memory slice.
456    pub fn load_mem(buffer: &[u8]) -> Result<Document> {
457        buffer.try_into()
458    }
459}
460
#[cfg(feature = "async")]
impl IncrementalDocument {
    /// Asynchronously load an incremental document from a specified file path.
    ///
    /// The file length reported by the file system is used as a capacity hint
    /// for the read buffer.
    #[inline]
    pub async fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
        let file = File::open(path).await?;
        let size_hint = file.metadata().await?.len() as usize;
        Self::load_internal(file, Some(size_hint)).await
    }

    /// Asynchronously load an incremental document from an arbitrary source.
    #[inline]
    pub async fn load_from<R: AsyncRead>(source: R) -> Result<Self> {
        Self::load_internal(source, None).await
    }

    /// Drain `source` into memory, parse it, and pair the parsed document
    /// with the raw bytes it was parsed from.
    async fn load_internal<R: AsyncRead>(source: R, capacity: Option<usize>) -> Result<Self> {
        pin!(source);

        let mut bytes = match capacity {
            Some(expected) => Vec::with_capacity(expected),
            None => Vec::new(),
        };
        source.read_to_end(&mut bytes).await?;

        let reader = Reader {
            buffer: &bytes,
            document: Document::new(),
            encryption_state: None,
            raw_objects: BTreeMap::new(),
            password: None,
            options: LoadOptions::default(),
        };
        let document = reader.read(None)?;

        Ok(IncrementalDocument::create_from(bytes, document))
    }

    /// Parse a PDF document from a memory slice.
    ///
    /// Note that this returns a plain `Document`, not an
    /// `IncrementalDocument`; use `TryInto<IncrementalDocument>` on the slice
    /// to obtain the latter.
    pub fn load_mem(buffer: &[u8]) -> Result<Document> {
        TryInto::<Document>::try_into(buffer)
    }
}
502
503impl TryInto<IncrementalDocument> for &[u8] {
504    type Error = Error;
505
506    fn try_into(self) -> Result<IncrementalDocument> {
507        let document = Reader {
508            buffer: self,
509            document: Document::new(),
510            encryption_state: None,
511            raw_objects: BTreeMap::new(),
512            password: None,
513            options: LoadOptions::default(),
514        }
515        .read(None)?;
516
517        Ok(IncrementalDocument::create_from(self.to_vec(), document))
518    }
519}
520
/// Parsing state for reading a PDF out of an in-memory byte buffer.
///
/// A `Reader` is consumed by `read` (full document) or `read_metadata`
/// (metadata only).
pub struct Reader<'a> {
    /// Bytes being parsed. `read` and `read_metadata` re-slice this so that it
    /// starts at the first `%PDF-` marker when one is found.
    pub buffer: &'a [u8],
    /// Document being populated while reading.
    pub document: Document,
    /// Encryption state; every constructor in this file starts it as `None`.
    pub encryption_state: Option<EncryptionState>,
    pub raw_objects: BTreeMap<ObjectId, Vec<u8>>, // Store raw bytes for encrypted objects
    pub password: Option<String>,                 // Password for encrypted PDFs
    /// Load options supplied by the caller (`LoadOptions::default()` for the
    /// loaders that take no options).
    pub options: LoadOptions,
}
529
/// Maximum allowed embedding of literal strings.
///
/// NOTE(review): not referenced in the visible part of this file; presumably
/// the parser uses it to cap how deeply parentheses may nest inside a PDF
/// literal string — confirm at the use site.
pub const MAX_BRACKET: usize = 100;
532
/// PDF metadata extracted without loading the entire document.
/// This is useful for quickly getting basic information about large PDFs.
///
/// Produced by `Reader::read_metadata`. The string fields come from the
/// trailer's `Info` dictionary and are `None` when that dictionary, or the
/// individual entry, is missing or is not a string.
#[derive(Debug, Clone)]
pub struct PdfMetadata {
    /// Document title from Info dictionary
    pub title: Option<String>,
    /// Document author from Info dictionary
    pub author: Option<String>,
    /// Document subject from Info dictionary
    pub subject: Option<String>,
    /// Document keywords from Info dictionary
    pub keywords: Option<String>,
    /// Application that created the document
    pub creator: Option<String>,
    /// Application that produced the document
    pub producer: Option<String>,
    /// Document creation date (PDF date format: D:YYYYMMDDHHmmSSOHH'mm')
    pub creation_date: Option<String>,
    /// Document modification date (PDF date format: D:YYYYMMDDHHmmSSOHH'mm')
    pub modification_date: Option<String>,
    /// Number of pages in the document (0 when the page tree cannot be resolved)
    pub page_count: u32,
    /// PDF version, as returned by the header parser
    pub version: String,
}
558
/// String fields pulled from the trailer's `Info` dictionary by
/// `Reader::extract_info_metadata`.
///
/// Every field is `None` when the `Info` dictionary cannot be resolved, and an
/// individual field is `None` when its entry is absent or is not a string.
struct InfoMetadata {
    // `Title` entry
    title: Option<String>,
    // `Author` entry
    author: Option<String>,
    // `Subject` entry
    subject: Option<String>,
    // `Keywords` entry
    keywords: Option<String>,
    // `Creator` entry
    creator: Option<String>,
    // `Producer` entry
    producer: Option<String>,
    // `CreationDate` entry
    creation_date: Option<String>,
    // `ModDate` entry
    modification_date: Option<String>,
}
569
570impl Reader<'_> {
571    /// Read metadata (title and page count) without loading the entire document.
572    /// This is much faster for large PDFs when you only need basic information.
573    ///
574    /// For encrypted PDFs, use `Document::load_metadata_with_password()` instead.
575    pub fn read_metadata(mut self) -> Result<PdfMetadata> {
576        let offset = self
577            .buffer
578            .windows(5)
579            .position(|w| w == b"%PDF-")
580            .unwrap_or(0);
581        self.buffer = &self.buffer[offset..];
582
583        let version = parser::header(ParserInput::new_extra(self.buffer, "header"))
584            .ok_or(ParseError::InvalidFileHeader)?;
585
586        let xref_start = Self::get_xref_start(self.buffer)?;
587        if xref_start > self.buffer.len() {
588            return Err(Error::Xref(XrefError::Start));
589        }
590
591        let (mut xref, mut trailer) = parser::xref_and_trailer(
592            ParserInput::new_extra(&self.buffer[xref_start..], "xref"),
593            &self,
594        )?;
595
596        let mut already_seen = HashSet::new();
597        let mut prev_xref_start = trailer.remove(b"Prev");
598        while let Some(prev) = prev_xref_start.and_then(|offset| offset.as_i64().ok()) {
599            if already_seen.contains(&prev) {
600                break;
601            }
602            already_seen.insert(prev);
603            if prev < 0 || prev as usize > self.buffer.len() {
604                return Err(Error::Xref(XrefError::PrevStart));
605            }
606
607            let (prev_xref, prev_trailer) = parser::xref_and_trailer(
608                ParserInput::new_extra(&self.buffer[prev as usize..], ""),
609                &self,
610            )?;
611            xref.merge(prev_xref);
612
613            let prev_xref_stream_start = trailer.remove(b"XRefStm");
614            if let Some(prev) = prev_xref_stream_start.and_then(|offset| offset.as_i64().ok()) {
615                if prev < 0 || prev as usize > self.buffer.len() {
616                    return Err(Error::Xref(XrefError::StreamStart));
617                }
618
619                let (prev_xref, _) = parser::xref_and_trailer(
620                    ParserInput::new_extra(&self.buffer[prev as usize..], ""),
621                    &self,
622                )?;
623                xref.merge(prev_xref);
624            }
625
626            prev_xref_start = prev_trailer.get(b"Prev").cloned().ok();
627        }
628        let xref_entry_count = xref
629            .max_id()
630            .checked_add(1)
631            .ok_or(ParseError::InvalidXref)?;
632        if xref.size != xref_entry_count {
633            warn!(
634                "Size entry of trailer dictionary is {}, correct value is {}.",
635                xref.size, xref_entry_count
636            );
637            xref.size = xref_entry_count;
638        }
639
640        self.document.reference_table = xref;
641        self.document.trailer = trailer.clone();
642
643        if self.document.trailer.get(b"Encrypt").is_ok() {
644            self.setup_encryption_for_metadata()?;
645        }
646
647        let info_metadata = self.extract_info_metadata()?;
648        let page_count = self.extract_page_count()?;
649
650        Ok(PdfMetadata {
651            title: info_metadata.title,
652            author: info_metadata.author,
653            subject: info_metadata.subject,
654            keywords: info_metadata.keywords,
655            creator: info_metadata.creator,
656            producer: info_metadata.producer,
657            creation_date: info_metadata.creation_date,
658            modification_date: info_metadata.modification_date,
659            page_count,
660            version,
661        })
662    }
663
664    fn extract_info_metadata(&self) -> Result<InfoMetadata> {
665        let info_ref = match self.document.trailer.get(b"Info") {
666            Ok(obj) => obj.as_reference().ok(),
667            Err(_) => {
668                return Ok(InfoMetadata {
669                    title: None,
670                    author: None,
671                    subject: None,
672                    keywords: None,
673                    creator: None,
674                    producer: None,
675                    creation_date: None,
676                    modification_date: None,
677                });
678            }
679        };
680
681        let info_id = match info_ref {
682            Some(id) => id,
683            None => {
684                return Ok(InfoMetadata {
685                    title: None,
686                    author: None,
687                    subject: None,
688                    keywords: None,
689                    creator: None,
690                    producer: None,
691                    creation_date: None,
692                    modification_date: None,
693                });
694            }
695        };
696
697        let mut already_seen = HashSet::new();
698        let info_obj = match self.get_object(info_id, &mut already_seen) {
699            Ok(obj) => obj,
700            Err(_) => {
701                return Ok(InfoMetadata {
702                    title: None,
703                    author: None,
704                    subject: None,
705                    keywords: None,
706                    creator: None,
707                    producer: None,
708                    creation_date: None,
709                    modification_date: None,
710                });
711            }
712        };
713
714        let info_dict = match info_obj.as_dict() {
715            Ok(dict) => dict,
716            Err(_) => {
717                return Ok(InfoMetadata {
718                    title: None,
719                    author: None,
720                    subject: None,
721                    keywords: None,
722                    creator: None,
723                    producer: None,
724                    creation_date: None,
725                    modification_date: None,
726                });
727            }
728        };
729
730        Ok(InfoMetadata {
731            title: Self::extract_string_field(info_dict, b"Title"),
732            author: Self::extract_string_field(info_dict, b"Author"),
733            subject: Self::extract_string_field(info_dict, b"Subject"),
734            keywords: Self::extract_string_field(info_dict, b"Keywords"),
735            creator: Self::extract_string_field(info_dict, b"Creator"),
736            producer: Self::extract_string_field(info_dict, b"Producer"),
737            creation_date: Self::extract_string_field(info_dict, b"CreationDate"),
738            modification_date: Self::extract_string_field(info_dict, b"ModDate"),
739        })
740    }
741
742    fn extract_string_field(dict: &Dictionary, key: &[u8]) -> Option<String> {
743        match dict.get(key) {
744            Ok(obj) => match obj {
745                Object::String(bytes, _) => {
746                    let s = if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
747                        let utf16_bytes: Vec<u16> = bytes[2..]
748                            .chunks_exact(2)
749                            .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]]))
750                            .collect();
751                        String::from_utf16_lossy(&utf16_bytes)
752                    } else {
753                        String::from_utf8_lossy(bytes).to_string()
754                    };
755                    Some(s)
756                }
757                _ => None,
758            },
759            Err(_) => None,
760        }
761    }
762
763    fn extract_page_count(&self) -> Result<u32> {
764        let root_ref = match self
765            .document
766            .trailer
767            .get(b"Root")
768            .and_then(Object::as_reference)
769        {
770            Ok(id) => id,
771            Err(_) => return Ok(0),
772        };
773
774        let mut already_seen = HashSet::new();
775        let catalog_obj = match self.get_object(root_ref, &mut already_seen) {
776            Ok(obj) => obj,
777            Err(_) => return Ok(0),
778        };
779
780        let catalog_dict = match catalog_obj.as_dict() {
781            Ok(dict) => dict,
782            Err(_) => return Ok(0),
783        };
784
785        let pages_ref = match catalog_dict.get(b"Pages").and_then(Object::as_reference) {
786            Ok(id) => id,
787            Err(_) => return Ok(0),
788        };
789
790        self.get_pages_tree_count(pages_ref, &mut HashSet::new())
791            .or(Ok(0))
792    }
793
794    fn get_pages_tree_count(
795        &self,
796        pages_id: ObjectId,
797        seen: &mut HashSet<ObjectId>,
798    ) -> Result<u32> {
799        if seen.contains(&pages_id) {
800            return Err(Error::ReferenceCycle(pages_id));
801        }
802        seen.insert(pages_id);
803
804        let mut already_seen = HashSet::new();
805        let pages_obj = match self.get_object(pages_id, &mut already_seen) {
806            Ok(obj) => obj,
807            Err(_) => return Ok(0),
808        };
809
810        let pages_dict = match pages_obj.as_dict() {
811            Ok(dict) => dict,
812            Err(_) => return Ok(0),
813        };
814
815        match pages_dict.get_type() {
816            Ok(type_name) if type_name == b"Page" => Ok(1),
817            Ok(type_name) if type_name == b"Pages" => {
818                if let Ok(count_obj) = pages_dict.get(b"Count") {
819                    if let Ok(count) = count_obj.as_i64() {
820                        if count >= 0 {
821                            return Ok(count as u32);
822                        }
823                    }
824                }
825
826                let kids = match pages_dict.get(b"Kids").and_then(Object::as_array) {
827                    Ok(arr) => arr,
828                    Err(_) => return Ok(0),
829                };
830
831                let mut total = 0u32;
832                for kid in kids.iter() {
833                    if let Ok(kid_ref) = kid.as_reference() {
834                        if let Ok(count) = self.get_pages_tree_count(kid_ref, seen) {
835                            total += count;
836                        }
837                    }
838                }
839                Ok(total)
840            }
841            _ => Ok(1),
842        }
843    }
844
845    /// Read whole document.
846    pub fn read(mut self, filter_func: Option<FilterFunc>) -> Result<Document> {
847        let offset = self
848            .buffer
849            .windows(5)
850            .position(|w| w == b"%PDF-")
851            .unwrap_or(0);
852        self.buffer = &self.buffer[offset..];
853
854        // The document structure can be expressed in PEG as:
855        //   document <- header indirect_object* xref trailer xref_start
856        let version = parser::header(ParserInput::new_extra(self.buffer, "header"))
857            .ok_or(ParseError::InvalidFileHeader)?;
858
859        //The binary_mark is in line 2 after the pdf version. If at other line number, then will be declared as invalid pdf.
860        if let Some(pos) = self.buffer.iter().position(|&byte| byte == b'\n') {
861            if let Some(binary_mark) = parser::binary_mark(ParserInput::new_extra(
862                &self.buffer[pos + 1..],
863                "binary_mark",
864            )) {
865                if binary_mark.iter().all(|&byte| byte >= 128) {
866                    self.document.binary_mark = binary_mark;
867                }
868            }
869        }
870
871        let xref_start = Self::get_xref_start(self.buffer)?;
872        if xref_start > self.buffer.len() {
873            return Err(Error::Xref(XrefError::Start));
874        }
875        self.document.xref_start = xref_start;
876
877        let (mut xref, mut trailer) = parser::xref_and_trailer(
878            ParserInput::new_extra(&self.buffer[xref_start..], "xref"),
879            &self,
880        )?;
881
882        // Read previous Xrefs of linearized or incremental updated document.
883        let mut already_seen = HashSet::new();
884        let mut prev_xref_start = trailer.remove(b"Prev");
885        while let Some(prev) = prev_xref_start.and_then(|offset| offset.as_i64().ok()) {
886            if already_seen.contains(&prev) {
887                break;
888            }
889            already_seen.insert(prev);
890            if prev < 0 || prev as usize > self.buffer.len() {
891                return Err(Error::Xref(XrefError::PrevStart));
892            }
893
894            let (prev_xref, prev_trailer) = parser::xref_and_trailer(
895                ParserInput::new_extra(&self.buffer[prev as usize..], ""),
896                &self,
897            )?;
898            xref.merge(prev_xref);
899
900            // Read xref stream in hybrid-reference file
901            let prev_xref_stream_start = trailer.remove(b"XRefStm");
902            if let Some(prev) = prev_xref_stream_start.and_then(|offset| offset.as_i64().ok()) {
903                if prev < 0 || prev as usize > self.buffer.len() {
904                    return Err(Error::Xref(XrefError::StreamStart));
905                }
906
907                let (prev_xref, _) = parser::xref_and_trailer(
908                    ParserInput::new_extra(&self.buffer[prev as usize..], ""),
909                    &self,
910                )?;
911                xref.merge(prev_xref);
912            }
913
914            prev_xref_start = prev_trailer.get(b"Prev").cloned().ok();
915        }
916        let xref_entry_count = xref
917            .max_id()
918            .checked_add(1)
919            .ok_or(ParseError::InvalidXref)?;
920        if xref.size != xref_entry_count {
921            warn!(
922                "Size entry of trailer dictionary is {}, correct value is {}.",
923                xref.size, xref_entry_count
924            );
925            xref.size = xref_entry_count;
926        }
927
928        self.document.version = version;
929        self.document.max_id = xref.size - 1;
930        self.document.trailer = trailer;
931        self.document.reference_table = xref;
932
933        // Check if encrypted
934        let is_encrypted = self.document.trailer.get(b"Encrypt").is_ok();
935
936        if is_encrypted {
937            // For encrypted PDFs, use a special loading strategy
938            self.load_encrypted_document(filter_func)?;
939        } else {
940            // For non-encrypted PDFs, use the normal loading
941            self.load_objects_raw(filter_func)?;
942        }
943
944        Ok(self.document)
945    }
946
947    fn load_encrypted_document(&mut self, _filter_func: Option<FilterFunc>) -> Result<()> {
948        // First, extract all raw object bytes without parsing
949        let entries: Vec<_> = self
950            .document
951            .reference_table
952            .entries
953            .iter()
954            .map(|(k, v)| (*k, v.clone()))
955            .collect();
956
957        let mut object_streams = Vec::new();
958
959        for (obj_num, entry) in entries {
960            match entry {
961                XrefEntry::Normal { offset, .. } => {
962                    if let Ok((obj_id, raw_bytes)) = self.extract_raw_object(offset as usize) {
963                        self.raw_objects.insert(obj_id, raw_bytes);
964                    }
965                }
966                XrefEntry::Compressed { container, index } => {
967                    // Store compressed object info for later processing
968                    object_streams.push((obj_num, container, index));
969                }
970                XrefEntry::Free | XrefEntry::UnusableFree => {
971                    // Skip free entries
972                }
973            }
974        }
975
976        self.parse_encryption_dictionary()?;
977
978        if self.authenticate_and_setup_encryption(false)?.is_none() {
979            return Ok(());
980        }
981
982        if let Some(ref state) = self.encryption_state {
983            let encrypt_ref = self
984                .document
985                .trailer
986                .get(b"Encrypt")
987                .ok()
988                .and_then(|o| o.as_reference().ok());
989
990            for (obj_id, raw_bytes) in &self.raw_objects {
991                if let Some(enc_ref) = encrypt_ref {
992                    if *obj_id == enc_ref {
993                        continue;
994                    }
995                }
996
997                if let Ok((id, mut obj)) = self.parse_raw_object(raw_bytes) {
998                    let _ = encryption::decrypt_object(state, *obj_id, &mut obj);
999                    self.document.objects.insert(id, obj);
1000                }
1001            }
1002
1003            let mut streams_to_process: std::collections::HashMap<u32, Vec<(u32, u16)>> =
1004                std::collections::HashMap::new();
1005            for (obj_num, container_id, index) in object_streams {
1006                streams_to_process
1007                    .entry(container_id)
1008                    .or_default()
1009                    .push((obj_num, index));
1010            }
1011
1012            for (container_id, objects_in_stream) in streams_to_process {
1013                if let Some(container_obj) = self.document.objects.get_mut(&(container_id, 0)) {
1014                    if let Ok(stream) = container_obj.as_stream_mut() {
1015                        match ObjectStream::new(stream) {
1016                            Ok(object_stream) => {
1017                                for (obj_num, _index) in objects_in_stream {
1018                                    let obj_id = (obj_num, 0);
1019                                    if let Some(obj) = object_stream.objects.get(&obj_id) {
1020                                        self.document.objects.insert(obj_id, obj.clone());
1021                                    }
1022                                }
1023                            }
1024                            Err(_e) => {}
1025                        }
1026                    }
1027                }
1028            }
1029
1030            self.document.encryption_state = Some(state.clone());
1031
1032            if let Some(enc_ref) = encrypt_ref {
1033                self.document.objects.remove(&enc_ref);
1034            }
1035            self.document.trailer.remove(b"Encrypt");
1036        }
1037
1038        Ok(())
1039    }
1040
1041    fn parse_raw_object(&self, raw_bytes: &[u8]) -> Result<(ObjectId, Object)> {
1042        // Parse the raw bytes as an indirect object
1043        parser::indirect_object(
1044            ParserInput::new_extra(raw_bytes, "indirect object"),
1045            0,
1046            None,
1047            self,
1048            &mut HashSet::new(),
1049        )
1050    }
1051
1052    fn load_objects_raw(&mut self, filter_func: Option<FilterFunc>) -> Result<()> {
1053        let is_encrypted = self.document.trailer.get(b"Encrypt").is_ok();
1054        let zero_length_streams = Mutex::new(vec![]);
1055        let object_streams = Mutex::new(vec![]);
1056        // Phase 2 (Issue #468): track ObjStm containers kept for lazy resolution.
1057        let pending_obj_stream_ids: Mutex<Vec<ObjectId>> = Mutex::new(vec![]);
1058        // Copy bool so the closure captures it by value (no borrow of self.options).
1059        let lazy_objstm = self.options.lazy_objstm;
1060
1061        let entries_filter_map = |(_, entry): (&_, &_)| {
1062            if let XrefEntry::Normal { offset, .. } = *entry {
1063                // read_object now handles decryption internally
1064                let result = self.read_object(offset as usize, None, &mut HashSet::new());
1065                let (object_id, mut object) = match result {
1066                    Ok(obj) => obj,
1067                    Err(e) => {
1068                        // Log error but continue
1069                        if is_encrypted {
1070                            // Expected for some encrypted objects - but log which ones
1071                            warn!("Skipping encrypted object at offset {}: {:?}", offset, e);
1072                        } else {
1073                            error!("Object load error at offset {}: {e:?}", offset);
1074                        }
1075                        return None;
1076                    }
1077                };
1078                if let Some(filter_func) = filter_func {
1079                    filter_func(object_id, &mut object)?;
1080                }
1081
1082                if let Ok(ref mut stream) = object.as_stream_mut() {
1083                    if stream.dict.has_type(b"ObjStm") && !is_encrypted {
1084                        if lazy_objstm {
1085                            // Phase 2b (Issue #468): defer decompression.
1086                            // Keep the container in document.objects and record its
1087                            // ID so the caller can call resolve_pending_object_streams.
1088                            pending_obj_stream_ids.lock().unwrap().push(object_id);
1089                            // Fall through to Some((object_id, object)) below.
1090                        } else {
1091                            // Phase 2a (Issue #468): eager extraction, drop container.
1092                            // Extract contained objects now, then return None so the
1093                            // ObjStm container itself is NOT added to document.objects.
1094                            // This eliminates the decompressed-container double-memory
1095                            // problem: the decompressed bytes (stream.content) are freed
1096                            // when `object` is dropped at the end of this arm.
1097                            if let Ok(obj_stream) = ObjectStream::new(stream) {
1098                                let container_id = object_id;
1099                                let owned_objects = obj_stream.objects.into_iter().filter(
1100                                    |(nested_object_id, _)| {
1101                                        self.document.reference_table.compressed_object_belongs_to(
1102                                            *nested_object_id,
1103                                            container_id,
1104                                        )
1105                                    },
1106                                );
1107                                let mut object_streams = object_streams.lock().unwrap();
1108                                if let Some(filter_func) = filter_func {
1109                                    let objects: BTreeMap<(u32, u16), Object> = owned_objects
1110                                        .filter_map(|(object_id, mut object)| {
1111                                            filter_func(object_id, &mut object)
1112                                        })
1113                                        .collect();
1114                                    object_streams.extend(objects);
1115                                } else {
1116                                    object_streams.extend(owned_objects);
1117                                }
1118                            }
1119                            // Return None: container is dropped here, freeing its bytes.
1120                            return None;
1121                        }
1122                    } else if stream.content.is_empty() {
1123                        let mut zero_length_streams = zero_length_streams.lock().unwrap();
1124                        zero_length_streams.push(object_id);
1125                    }
1126                }
1127
1128                Some((object_id, object))
1129            } else {
1130                None
1131            }
1132        };
1133
1134        #[cfg(feature = "rayon")]
1135        {
1136            self.document.objects = self
1137                .document
1138                .reference_table
1139                .entries
1140                .par_iter()
1141                .filter_map(entries_filter_map)
1142                .collect();
1143        }
1144        #[cfg(not(feature = "rayon"))]
1145        {
1146            self.document.objects = self
1147                .document
1148                .reference_table
1149                .entries
1150                .iter()
1151                .filter_map(entries_filter_map)
1152                .collect();
1153        }
1154
1155        // Only add entries, but never replace entries
1156        for (id, entry) in object_streams.into_inner().unwrap() {
1157            self.document.objects.entry(id).or_insert(entry);
1158        }
1159
1160        for object_id in zero_length_streams.into_inner().unwrap() {
1161            let _ = self.read_stream_content(object_id);
1162        }
1163
1164        // Phase 2b (Issue #468): store pending ObjStm container IDs in the document
1165        // so the caller can resolve them later via resolve_pending_object_streams.
1166        self.document.pending_obj_streams = pending_obj_stream_ids.into_inner().unwrap();
1167
1168        Ok(())
1169    }
1170
1171    fn read_stream_content(&mut self, object_id: ObjectId) -> Result<()> {
1172        let length = self.get_stream_length(object_id)?;
1173        let stream = self
1174            .document
1175            .get_object_mut(object_id)
1176            .and_then(Object::as_stream_mut)?;
1177        let start = stream
1178            .start_position
1179            .ok_or(Error::InvalidStream("missing start position".to_string()))?;
1180
1181        if length < 0 {
1182            return Err(Error::InvalidStream("negative stream length.".to_string()));
1183        }
1184
1185        let length = usize::try_from(length).map_err(|e| Error::NumericCast(e.to_string()))?;
1186        let end = start + length;
1187
1188        if end > self.buffer.len() {
1189            return Err(Error::InvalidStream(
1190                "stream extends after document end.".to_string(),
1191            ));
1192        }
1193
1194        stream.set_content(self.buffer[start..end].to_vec());
1195        Ok(())
1196    }
1197
1198    fn get_stream_length(&self, object_id: ObjectId) -> Result<i64> {
1199        let object = self.document.get_object(object_id)?;
1200        let stream = object.as_stream()?;
1201        stream
1202            .dict
1203            .get(b"Length")
1204            .and_then(|value| self.document.dereference(value))
1205            .and_then(|(_id, obj)| obj.as_i64())
1206            .inspect_err(|_err| {
1207                error!(
1208                    "stream dictionary of '{} {} R' is missing the Length entry",
1209                    object_id.0, object_id.1
1210                );
1211            })
1212    }
1213
1214    /// Get object offset by object ID.
1215    fn get_offset(&self, id: ObjectId) -> Result<u32> {
1216        let entry = self
1217            .document
1218            .reference_table
1219            .get(id.0)
1220            .ok_or(Error::MissingXrefEntry)?;
1221        match *entry {
1222            XrefEntry::Normal { offset, generation } if generation == id.1 => Ok(offset),
1223            _ => Err(Error::MissingXrefEntry),
1224        }
1225    }
1226
1227    /// Load a compressed object from an object stream (for lightweight metadata extraction)
1228    fn get_compressed_object(&self, id: ObjectId) -> Result<Object> {
1229        let entry = self
1230            .document
1231            .reference_table
1232            .get(id.0)
1233            .ok_or(Error::MissingXrefEntry)?;
1234
1235        let container_id = match entry {
1236            XrefEntry::Compressed { container, .. } => *container,
1237            _ => return Err(Error::MissingXrefEntry),
1238        };
1239
1240        let container_id = (container_id, 0);
1241        let mut already_seen = HashSet::new();
1242        let container_obj = self.get_object(container_id, &mut already_seen)?;
1243        let mut container_stream = container_obj.as_stream()?.clone();
1244        let object_stream = ObjectStream::new(&mut container_stream)?;
1245        object_stream
1246            .objects
1247            .get(&id)
1248            .cloned()
1249            .ok_or(Error::MissingXrefEntry)
1250    }
1251
1252    pub fn get_object(&self, id: ObjectId, already_seen: &mut HashSet<ObjectId>) -> Result<Object> {
1253        if already_seen.contains(&id) {
1254            warn!(
1255                "reference cycle detected resolving object {} {}",
1256                id.0, id.1
1257            );
1258            return Err(Error::ReferenceCycle(id));
1259        }
1260        already_seen.insert(id);
1261
1262        if let Some(entry) = self.document.reference_table.get(id.0) {
1263            if matches!(entry, XrefEntry::Compressed { .. }) {
1264                return self.get_compressed_object(id);
1265            }
1266        }
1267
1268        let offset = self.get_offset(id)?;
1269        let (_, mut obj) = self.read_object(offset as usize, Some(id), already_seen)?;
1270
1271        if let Some(ref state) = self.encryption_state {
1272            let encrypt_ref = self
1273                .document
1274                .trailer
1275                .get(b"Encrypt")
1276                .ok()
1277                .and_then(|o| o.as_reference().ok());
1278            if let Some(enc_ref) = encrypt_ref {
1279                if id != enc_ref {
1280                    encryption::decrypt_object(state, id, &mut obj).map_err(Error::Decryption)?;
1281                }
1282            }
1283        }
1284
1285        Ok(obj)
1286    }
1287
1288    fn parse_encryption_dictionary(&mut self) -> Result<()> {
1289        if let Ok(encrypt_ref) = self
1290            .document
1291            .trailer
1292            .get(b"Encrypt")
1293            .and_then(|o| o.as_reference())
1294        {
1295            if self.raw_objects.is_empty() {
1296                let offset = self.get_offset(encrypt_ref)?;
1297                let (_, encrypt_obj) =
1298                    self.read_object(offset as usize, Some(encrypt_ref), &mut HashSet::new())?;
1299                self.document.objects.insert(encrypt_ref, encrypt_obj);
1300            } else if let Some(raw_bytes) = self.raw_objects.get(&encrypt_ref) {
1301                if let Ok((_, obj)) = self.parse_raw_object(raw_bytes) {
1302                    self.document.objects.insert(encrypt_ref, obj);
1303                }
1304            }
1305        }
1306        Ok(())
1307    }
1308
1309    fn authenticate_and_setup_encryption(
1310        &mut self,
1311        require_password: bool,
1312    ) -> Result<Option<String>> {
1313        let password_to_use: Option<String> = if self.document.authenticate_password("").is_ok() {
1314            Some(String::new())
1315        } else if let Some(ref pwd) = self.password {
1316            if self.document.authenticate_password(pwd).is_ok() {
1317                Some(pwd.clone())
1318            } else if require_password {
1319                return Err(Error::InvalidPassword);
1320            } else {
1321                warn!("Invalid password provided for encrypted PDF");
1322                return Err(Error::InvalidPassword);
1323            }
1324        } else if require_password {
1325            return Err(Error::Unimplemented(
1326                "PDF is encrypted and requires a password. Use Document::load_metadata_with_password() instead.",
1327            ));
1328        } else {
1329            warn!("PDF is encrypted and requires a password");
1330            return Ok(None);
1331        };
1332
1333        if let Some(ref password) = password_to_use {
1334            let state = EncryptionState::decode(&self.document, password)?;
1335            self.encryption_state = Some(state);
1336        }
1337
1338        Ok(password_to_use)
1339    }
1340
1341    fn setup_encryption_for_metadata(&mut self) -> Result<()> {
1342        self.parse_encryption_dictionary()?;
1343        self.authenticate_and_setup_encryption(true)?;
1344        Ok(())
1345    }
1346
1347    fn extract_raw_object(&mut self, offset: usize) -> Result<(ObjectId, Vec<u8>)> {
1348        if offset > self.buffer.len() {
1349            return Err(Error::InvalidOffset(offset));
1350        }
1351
1352        // Find object header (e.g., "19 0 obj")
1353        let slice = &self.buffer[offset..];
1354
1355        // Parse object ID
1356        let mut pos = 0;
1357        while pos < slice.len() && slice[pos].is_ascii_whitespace() {
1358            pos += 1;
1359        }
1360
1361        // Get object number
1362        let num_start = pos;
1363        while pos < slice.len() && slice[pos].is_ascii_digit() {
1364            pos += 1;
1365        }
1366        let obj_num: u32 = std::str::from_utf8(&slice[num_start..pos])
1367            .ok()
1368            .and_then(|s| s.parse().ok())
1369            .ok_or(Error::Parse(ParseError::InvalidXref))?;
1370
1371        // Skip whitespace
1372        while pos < slice.len() && slice[pos].is_ascii_whitespace() {
1373            pos += 1;
1374        }
1375
1376        // Get generation number
1377        let gen_start = pos;
1378        while pos < slice.len() && slice[pos].is_ascii_digit() {
1379            pos += 1;
1380        }
1381        let obj_gen: u16 = std::str::from_utf8(&slice[gen_start..pos])
1382            .ok()
1383            .and_then(|s| s.parse().ok())
1384            .ok_or(Error::Parse(ParseError::InvalidXref))?;
1385
1386        // Skip to "obj"
1387        while pos < slice.len() && slice[pos].is_ascii_whitespace() {
1388            pos += 1;
1389        }
1390        if pos + 3 > slice.len() || &slice[pos..pos + 3] != b"obj" {
1391            return Err(Error::Parse(ParseError::InvalidXref));
1392        }
1393        pos += 3;
1394
1395        // Find "endobj"
1396        let endobj_pattern = b"endobj";
1397        let mut end_pos = pos;
1398        while end_pos + endobj_pattern.len() <= slice.len() {
1399            if &slice[end_pos..end_pos + endobj_pattern.len()] == endobj_pattern {
1400                end_pos += endobj_pattern.len();
1401                break;
1402            }
1403            end_pos += 1;
1404        }
1405
1406        if end_pos > slice.len() {
1407            return Err(Error::Parse(ParseError::InvalidXref));
1408        }
1409
1410        // Extract raw object bytes (including header and trailer)
1411        let raw_bytes = slice[0..end_pos].to_vec();
1412
1413        Ok(((obj_num, obj_gen), raw_bytes))
1414    }
1415
1416    fn read_object(
1417        &self,
1418        offset: usize,
1419        expected_id: Option<ObjectId>,
1420        already_seen: &mut HashSet<ObjectId>,
1421    ) -> Result<(ObjectId, Object)> {
1422        if offset > self.buffer.len() {
1423            return Err(Error::InvalidOffset(offset));
1424        }
1425
1426        // Just parse without decryption - we'll decrypt later
1427        parser::indirect_object(
1428            ParserInput::new_extra(self.buffer, "indirect object"),
1429            offset,
1430            expected_id,
1431            self,
1432            already_seen,
1433        )
1434    }
1435
1436    fn get_xref_start(buffer: &[u8]) -> Result<usize> {
1437        let seek_pos = buffer.len() - cmp::min(buffer.len(), 512);
1438        Self::search_substring(buffer, b"%%EOF", seek_pos)
1439            .and_then(|eof_pos| if eof_pos > 25 { Some(eof_pos) } else { None })
1440            .and_then(|eof_pos| Self::search_substring(buffer, b"startxref", eof_pos - 25))
1441            .ok_or(Error::Xref(XrefError::Start))
1442            .and_then(|xref_pos| {
1443                if xref_pos <= buffer.len() {
1444                    match parser::xref_start(ParserInput::new_extra(&buffer[xref_pos..], "xref")) {
1445                        Some(startxref) => Ok(startxref as usize),
1446                        None => Err(Error::Xref(XrefError::Start)),
1447                    }
1448                } else {
1449                    Err(Error::Xref(XrefError::Start))
1450                }
1451            })
1452    }
1453
1454    fn search_substring(buffer: &[u8], pattern: &[u8], start_pos: usize) -> Option<usize> {
1455        buffer
1456            .get(start_pos..)?
1457            .windows(pattern.len())
1458            .rposition(|window| window == pattern)
1459            .map(|pos| start_pos + pos)
1460    }
1461}
1462
#[cfg(all(test, not(feature = "async")))]
#[test]
fn load_document() {
    let mut doc = Document::load("assets/example.pdf").unwrap();
    assert_eq!(doc.version, "1.5");

    // Save the loaded document into a scratch directory to exercise the
    // load/save round trip.
    let scratch = tempfile::tempdir().unwrap();
    let target = scratch.path().join("test_2_load.pdf");
    doc.save(target).unwrap();
}
1474
#[cfg(all(test, feature = "async"))]
#[tokio::test]
async fn load_document() {
    let mut doc = Document::load("assets/example.pdf").await.unwrap();
    assert_eq!(doc.version, "1.5");

    // Save the loaded document into a scratch directory to exercise the
    // load/save round trip.
    let scratch = tempfile::tempdir().unwrap();
    let target = scratch.path().join("test_2_load.pdf");
    doc.save(target).unwrap();
}
1486
#[test]
#[should_panic(expected = "Xref(Start)")]
fn load_short_document() {
    // Header and end-of-file marker only: there is no `startxref` to find.
    let truncated: &[u8] = b"%PDF-1.5\n%%EOF\n";
    let _doc = Document::load_mem(truncated).unwrap();
}
1492
#[test]
fn load_document_with_preceding_bytes() {
    // Junk in front of the `%PDF-` marker must not prevent loading.
    let mut content = b"garbage".to_vec();
    content.extend_from_slice(include_bytes!("../assets/example.pdf"));

    let doc = Document::load_mem(&content).unwrap();
    assert_eq!(doc.version, "1.5");
}
1501
#[test]
fn load_many_shallow_brackets() {
    // Ten times `MAX_BRACKET` balanced pairs, none of them nested: the document
    // must still load.
    let content = "()".repeat(MAX_BRACKET * 10);
    // Bytes of the content stream other than `content` itself.
    const STREAM_CRUFT: usize = 33;
    let body = format!(
        "%PDF-1.5
1 0 obj<</Type/Pages/Kids[5 0 R]/Count 1/Resources 3 0 R/MediaBox[0 0 595 842]>>endobj
2 0 obj<</Type/Font/Subtype/Type1/BaseFont/Courier>>endobj
3 0 obj<</Font<</F1 2 0 R>>>>endobj
5 0 obj<</Type/Page/Parent 1 0 R/Contents[4 0 R]>>endobj
6 0 obj<</Type/Catalog/Pages 1 0 R>>endobj
4 0 obj<</Length {}>>stream
BT
/F1 48 Tf
100 600 Td
({}) Tj
ET
endstream endobj\n",
        content.len() + STREAM_CRUFT,
        content
    );
    // The cross-reference table starts right after the objects, so its offset
    // is the length of everything written so far.
    let pdf = format!(
        "{}xref
0 7
0000000000 65535 f 
0000000009 00000 n 
0000000096 00000 n 
0000000155 00000 n 
0000000291 00000 n 
0000000191 00000 n 
0000000248 00000 n 
trailer
<</Root 6 0 R/Size 7>>
startxref
{}
%%EOF",
        body,
        body.len()
    );

    let _doc = Document::load_mem(pdf.as_bytes()).unwrap();
}
1546
#[test]
fn load_too_deep_brackets() {
    // One level deeper than `MAX_BRACKET`: all the opening parentheses first,
    // then the matching closing ones.
    let depth = MAX_BRACKET + 1;
    let content = format!("{}{}", "(".repeat(depth), ")".repeat(depth));
    // Bytes of the content stream other than `content` itself.
    const STREAM_CRUFT: usize = 33;
    let body = format!(
        "%PDF-1.5
1 0 obj<</Type/Pages/Kids[5 0 R]/Count 1/Resources 3 0 R/MediaBox[0 0 595 842]>>endobj
2 0 obj<</Type/Font/Subtype/Type1/BaseFont/Courier>>endobj
3 0 obj<</Font<</F1 2 0 R>>>>endobj
5 0 obj<</Type/Page/Parent 1 0 R/Contents[7 0 R 4 0 R]>>endobj
6 0 obj<</Type/Catalog/Pages 1 0 R>>endobj
7 0 obj<</Length 45>>stream
BT /F1 48 Tf 100 600 Td (Hello World!) Tj ET
endstream
endobj
4 0 obj<</Length {}>>stream
BT
/F1 48 Tf
100 600 Td
({}) Tj
ET
endstream endobj\n",
        content.len() + STREAM_CRUFT,
        content
    );
    // Note: the subsection header and `/Size` say 7 although eight entries
    // (objects 0 through 7) are listed; the fixture is kept exactly as is.
    let pdf = format!(
        "{}xref
0 7
0000000000 65535 f 
0000000009 00000 n 
0000000096 00000 n 
0000000155 00000 n 
0000000387 00000 n 
0000000191 00000 n 
0000000254 00000 n 
0000000297 00000 n 
trailer
<</Root 6 0 R/Size 7>>
startxref
{}
%%EOF",
        body,
        body.len()
    );

    let doc = Document::load_mem(pdf.as_bytes()).unwrap();
    let pages: Vec<_> = doc.get_pages().keys().cloned().collect();
    // Only the text of the well-formed content stream is expected.
    assert_eq!("Hello World!\n", doc.extract_text(&pages).unwrap());
}
1599
#[cfg(all(test, not(feature = "async")))]
#[test]
fn search_substring_finds_last_occurrence() {
    // Absent pattern, then a single occurrence.
    assert_eq!(Reader::search_substring(b"hello world", b"xyz", 0), None);
    assert_eq!(
        Reader::search_substring(b"hello world", b"world", 0),
        Some(6)
    );

    // Two occurrences: the later one wins for every start position that still
    // covers it; starting past it finds nothing.
    let two_markers = b"%%EOF\ntest%%EOF\nend";
    for (start_pos, expected) in [(0, Some(10)), (6, Some(10)), (15, None)] {
        assert_eq!(
            Reader::search_substring(two_markers, b"%%EOF", start_pos),
            expected
        );
    }

    // Pattern equal to the whole buffer.
    assert_eq!(Reader::search_substring(b"%%EOF", b"%%EOF", 0), Some(0));

    // Stray percent signs ahead of the real marker.
    let buffer_with_many_percents = b"%%%PDF-1.3%%%comment%%%more%%EOF";
    assert_eq!(
        Reader::search_substring(buffer_with_many_percents, b"%%EOF", 0),
        Some(27)
    );
}
1621
1622// ── Phase 1 & 2 tests (Issue #468) ───────────────────────────────────────────
1623
/// Bytes of `assets/example.pdf`, embedded at compile time.
/// Shared fixture for the `LoadOptions` tests.
#[cfg(all(test, not(feature = "async")))]
fn minimal_pdf_bytes() -> &'static [u8] {
    const EXAMPLE_PDF: &[u8] = include_bytes!("../assets/example.pdf");
    EXAMPLE_PDF
}
1630
#[cfg(all(test, not(feature = "async")))]
#[test]
fn load_with_options_accepts_normal_document() {
    // The fixture must load with options left at their defaults.
    let default_opts = LoadOptions::new();
    let loaded = Document::load_mem_with_options(minimal_pdf_bytes(), &default_opts);
    let doc = loaded.expect("example.pdf should be accepted by default options");
    assert_eq!(doc.version, "1.5");
}
1641
#[cfg(all(test, not(feature = "async")))]
#[test]
fn load_with_options_rejects_oversized_document() {
    // With a limit of one byte the fixture is necessarily too large.
    let data = minimal_pdf_bytes();
    let tiny_limit = LoadOptions::new().max_file_bytes(1usize);
    let err = Document::load_mem_with_options(data, &tiny_limit)
        .expect_err("document larger than 1 byte must be rejected");

    // The error must report both the configured limit and the actual size.
    let (size, limit) = match err {
        Error::DocumentTooLarge { size, limit } => (size, limit),
        other => panic!("expected DocumentTooLarge, got {other:?}"),
    };
    assert_eq!(limit, 1);
    assert_eq!(size, data.len());
}
1658
#[cfg(all(test, not(feature = "async")))]
#[test]
fn load_with_options_unlimited() {
    // Passing `None` as the size limit must let the fixture through.
    let no_limit = LoadOptions::new().max_file_bytes(None);
    let loaded = Document::load_mem_with_options(minimal_pdf_bytes(), &no_limit);
    let doc = loaded.expect("unlimited options must not reject documents");
    assert_eq!(doc.version, "1.5");
}
1669
/// Lazy ObjStm loading followed by `resolve_pending_object_streams` must yield as
/// many objects as a plain eager load and must leave no pending streams behind.
#[cfg(all(test, not(feature = "async")))]
#[test]
fn load_mem_with_options_lazy_objstm_no_objects_lost() {
    // NOTE(review): the original comment states that example.pdf stores objects in
    // ObjStm (PDF 1.5 cross-reference streams), so this would exercise the lazy path
    // on real data. That cannot be verified from this file; confirm against the asset.
    let pdf = minimal_pdf_bytes();

    // Document under test: ObjStm decoding deferred, size check disabled.
    let lazy_options = LoadOptions::new().lazy_objstm(true).max_file_bytes(None);
    let mut lazy_doc = Document::load_mem_with_options(pdf, &lazy_options)
        .expect("lazy load of example.pdf should succeed");

    // Reference document loaded the ordinary, eager way.
    let eager_doc = Document::load_mem(pdf).expect("eager load of example.pdf should succeed");

    // Until the pending streams are resolved the lazy document may hold fewer
    // objects; once resolved, it has to catch up with the eager one.
    lazy_doc
        .resolve_pending_object_streams()
        .expect("resolve_pending_object_streams should not fail on valid data");

    let lazy_count = lazy_doc.objects.len();
    let eager_count = eager_doc.objects.len();
    assert_eq!(
        lazy_count, eager_count,
        "after resolve, lazy doc must have same object count as eager doc"
    );

    let nothing_pending = lazy_doc.pending_obj_streams.is_empty();
    assert!(
        nothing_pending,
        "pending_obj_streams must be empty after resolve"
    );
}
1702
/// When two pending ObjStm containers both carry object (7, 0) but the reference
/// table assigns it to container 20, resolution must yield the value stored in
/// container 20 and drop both container objects afterwards.
#[cfg(all(test, not(feature = "async")))]
#[test]
fn resolve_pending_object_streams_skips_objects_reassigned_to_newer_container() {
    let mut doc = Document::new();

    // The reference table names container 20 as the current home of object 7.
    let current_home = XrefEntry::Compressed {
        container: 20,
        index: 0,
    };
    doc.reference_table.insert(7, current_home);

    // Builds an ObjStm container object holding only object (7, 0) with the given
    // integer payload.
    let container_holding = |payload: i64, accept_msg: &str| -> Object {
        let mut stream = ObjectStream::builder().compression_level(0).build();
        stream
            .add_object((7, 0), Object::Integer(payload))
            .expect(accept_msg);
        Object::Stream(stream.to_stream_object().unwrap())
    };

    // Container 10 holds value 1; the reference table does not point at it.
    doc.objects.insert(
        (10, 0),
        container_holding(1, "old ObjStm should accept object"),
    );
    // Container 20 holds value 2; this is the one the reference table points at.
    doc.objects.insert(
        (20, 0),
        container_holding(2, "new ObjStm should accept object"),
    );

    doc.pending_obj_streams = vec![(10, 0), (20, 0)];
    doc.resolve_pending_object_streams()
        .expect("lazy ObjStm resolution should succeed");

    // The value from container 20 must be the one that ends up as object (7, 0).
    let resolved_value = doc
        .get_object((7, 0))
        .expect("object should resolve from the current ObjStm")
        .as_i64()
        .expect("resolved object should stay an integer");
    assert_eq!(resolved_value, 2);

    // Neither container object may remain in the object map.
    for (container, message) in [
        (10, "old ObjStm container should be dropped after resolution"),
        (20, "new ObjStm container should be dropped after resolution"),
    ] {
        assert!(!doc.objects.contains_key(&(container, 0)), "{}", message);
    }
}
1755
/// Exercises the `LoadOptions` builder: explicit limit plus lazy flag, explicit
/// "no limit", and the values produced by `Default`.
#[test]
fn load_options_builder() {
    // Explicit 64 MiB limit combined with lazy ObjStm handling.
    let limit = 64 * 1024 * 1024;
    let configured = LoadOptions::new().max_file_bytes(limit).lazy_objstm(true);
    assert_eq!(configured.max_file_bytes, Some(limit));
    assert!(configured.lazy_objstm);

    // Passing `None` clears the limit.
    let unlimited = LoadOptions::new().max_file_bytes(None);
    assert_eq!(unlimited.max_file_bytes, None);

    // Defaults: the crate-wide default limit, lazy ObjStm handling off.
    let defaults = LoadOptions::default();
    let expected_limit = Some(crate::load_options::DEFAULT_MAX_FILE_BYTES);
    assert_eq!(defaults.max_file_bytes, expected_limit);
    assert!(!defaults.lazy_objstm);
}