Skip to main content

justpdf_core/
parser.rs

1use std::collections::{HashMap, HashSet, VecDeque};
2use std::hash::Hash;
3use std::path::Path;
4use std::sync::RwLock;
5
6use crate::crypto;
7use crate::crypto::SecurityState;
8use crate::error::{JustPdfError, Result};
9use crate::object::{self, IndirectRef, PdfDict, PdfObject};
10use crate::stream;
11use crate::tokenizer::Tokenizer;
12use crate::xref::{self, Xref, XrefEntry};
13
14// ---------------------------------------------------------------------------
15// PdfData: backing store abstraction (Task 1)
16// ---------------------------------------------------------------------------
17
/// Storage that owns the raw bytes of a PDF file.
///
/// `Owned` keeps the file contents in a heap buffer; `Mmap` (only compiled
/// with the `mmap` feature) keeps a memory mapping of the file.
enum PdfData {
    Owned(Vec<u8>),
    #[cfg(feature = "mmap")]
    Mmap(memmap2::Mmap),
}

impl std::fmt::Debug for PdfData {
    /// Formats as `Variant(len)`: only the byte count is shown, never the bytes.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (variant, byte_len) = match self {
            Self::Owned(buf) => ("Owned", buf.len()),
            #[cfg(feature = "mmap")]
            Self::Mmap(map) => ("Mmap", map.len()),
        };
        f.debug_tuple(variant).field(&byte_len).finish()
    }
}

impl PdfData {
    /// Borrow the underlying bytes, regardless of how they are stored.
    fn as_bytes(&self) -> &[u8] {
        match self {
            Self::Owned(buf) => buf.as_slice(),
            #[cfg(feature = "mmap")]
            Self::Mmap(map) => &map[..],
        }
    }
}
44
45// ---------------------------------------------------------------------------
46// LruCache: bounded object cache (Task 2)
47// ---------------------------------------------------------------------------
48
/// A small fixed-capacity least-recently-used cache.
///
/// Values are stored in a `HashMap`; recency is tracked by a `VecDeque` of
/// keys whose front is the most recently used key and whose back is the next
/// eviction candidate.
struct LruCache<K: Eq + Hash + Clone, V> {
    map: HashMap<K, V>,
    order: VecDeque<K>,
    capacity: usize,
}

impl<K: Eq + Hash + Clone + std::fmt::Debug, V: std::fmt::Debug> std::fmt::Debug
    for LruCache<K, V>
{
    /// Prints only the entry count and the capacity, never the contents.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("LruCache");
        out.field("len", &self.map.len());
        out.field("capacity", &self.capacity);
        out.finish()
    }
}

impl<K: Eq + Hash + Clone, V> LruCache<K, V> {
    /// Create an empty cache holding at most `capacity` entries.
    ///
    /// # Panics
    /// Panics if `capacity` is zero.
    fn new(capacity: usize) -> Self {
        assert!(capacity > 0, "LruCache capacity must be > 0");
        let map = HashMap::with_capacity(capacity);
        let order = VecDeque::with_capacity(capacity);
        Self {
            map,
            order,
            capacity,
        }
    }

    /// Fetch the value stored under `key`; a hit marks the key as most
    /// recently used.
    fn get(&mut self, key: &K) -> Option<&V> {
        if !self.map.contains_key(key) {
            return None;
        }
        self.touch(key);
        self.map.get(key)
    }

    /// Store `value` under `key`.
    ///
    /// An existing key has its value replaced and is promoted. A new key
    /// first evicts the least recently used entry when the cache is full.
    fn insert(&mut self, key: K, value: V) {
        // Existing key: overwrite in place and refresh its recency.
        if let Some(slot) = self.map.get_mut(&key) {
            *slot = value;
            self.touch(&key);
            return;
        }

        // New key: make room first if the cache is already full.
        let is_full = self.map.len() >= self.capacity;
        if is_full {
            if let Some(victim) = self.order.pop_back() {
                self.map.remove(&victim);
            }
        }
        self.order.push_front(key.clone());
        self.map.insert(key, value);
    }

    /// Whether `key` is present; does not affect recency.
    #[allow(dead_code)]
    fn contains_key(&self, key: &K) -> bool {
        self.map.contains_key(key)
    }

    /// Drop every entry. The capacity is unchanged.
    fn clear(&mut self) {
        self.order.clear();
        self.map.clear();
    }

    /// Number of entries currently stored.
    fn len(&self) -> usize {
        self.map.len()
    }

    /// Change the capacity, evicting least recently used entries until the
    /// cache fits.
    ///
    /// # Panics
    /// Panics if `capacity` is zero.
    fn set_capacity(&mut self, capacity: usize) {
        assert!(capacity > 0, "LruCache capacity must be > 0");
        self.capacity = capacity;
        loop {
            if self.map.len() <= self.capacity {
                break;
            }
            if let Some(victim) = self.order.pop_back() {
                self.map.remove(&victim);
            }
        }
    }

    /// Move `key` to the front (most recently used end) of the recency
    /// queue. This is a linear scan of the queue.
    fn touch(&mut self, key: &K) {
        let existing = self.order.iter().position(|candidate| candidate == key);
        if let Some(index) = existing {
            self.order.remove(index);
        }
        self.order.push_front(key.clone());
    }
}
141
/// Default capacity (in parsed objects) of the LRU object cache that each
/// newly constructed `PdfDocument` starts with.
const DEFAULT_CACHE_CAPACITY: usize = 2048;
144
/// A parsed PDF document.
///
/// The parsed-object cache and the decoded-object-stream cache are wrapped
/// in `RwLock`s so that `resolve` can update them through `&self`.
///
/// NOTE(review): whether the type as a whole is `Sync` also depends on
/// `Xref`, `SecurityState` and the backing store — confirm before relying on
/// it for multi-threaded use.
pub struct PdfDocument {
    /// PDF version, e.g. (1, 7) for PDF 1.7.
    pub version: (u8, u8),
    /// The merged cross-reference table.
    pub xref: Xref,
    /// Raw file data (owned or memory-mapped).
    data: PdfData,
    /// Bounded LRU cache of parsed objects (interior-mutable).
    objects: RwLock<LruCache<IndirectRef, PdfObject>>,
    /// Encryption/security state (None if document is not encrypted).
    security: Option<SecurityState>,
    /// Cache of decoded object stream data, keyed by the object number of
    /// the object stream (interior-mutable). Unlike `objects`, this map has
    /// no size bound.
    decoded_obj_streams: RwLock<HashMap<u32, Vec<u8>>>,
}
164
165impl std::fmt::Debug for PdfDocument {
166    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
167        let obj_cache_len = self
168            .objects
169            .read()
170            .map(|c| c.len())
171            .unwrap_or(0);
172        f.debug_struct("PdfDocument")
173            .field("version", &self.version)
174            .field("xref", &self.xref)
175            .field("data", &self.data)
176            .field("objects_cached", &obj_cache_len)
177            .field("security", &self.security)
178            .finish()
179    }
180}
181
182impl PdfDocument {
    /// Open a PDF file from a path.
    ///
    /// Reads the whole file into memory with `std::fs::read` and hands the
    /// bytes to `from_bytes`.
    ///
    /// # Errors
    /// Propagates the I/O error from reading the file and any error returned
    /// by `from_bytes`.
    pub fn open(path: &Path) -> Result<Self> {
        let data = std::fs::read(path)?;
        Self::from_bytes(data)
    }
188
    /// Parse a PDF from an in-memory byte vector.
    ///
    /// The vector is moved into the document as its owned backing store.
    ///
    /// # Errors
    /// Returns whatever `from_pdf_data` returns for the given bytes.
    pub fn from_bytes(data: Vec<u8>) -> Result<Self> {
        Self::from_pdf_data(PdfData::Owned(data))
    }
193
194    /// Internal constructor shared by all entry points.
195    fn from_pdf_data(data: PdfData) -> Result<Self> {
196        let bytes = data.as_bytes();
197        if bytes.len() < 8 {
198            return Err(JustPdfError::NotPdf);
199        }
200
201        // Parse version from header: %PDF-X.Y
202        let version = parse_version(bytes)?;
203
204        // Load xref
205        let xref = xref::load_xref(bytes)?;
206
207        let mut doc = Self {
208            version,
209            xref,
210            data,
211            objects: RwLock::new(LruCache::new(DEFAULT_CACHE_CAPACITY)),
212            security: None,
213            decoded_obj_streams: RwLock::new(HashMap::new()),
214        };
215
216        // Detect encryption
217        doc.detect_encryption()?;
218
219        Ok(doc)
220    }
221
222    /// Open a PDF file using memory-mapped I/O.
223    ///
224    /// This avoids copying the entire file into memory, which can be
225    /// beneficial for very large documents.
226    #[cfg(feature = "mmap")]
227    pub fn open_mmap(path: &Path) -> Result<Self> {
228        let file = std::fs::File::open(path)?;
229        // SAFETY: We keep the Mmap alive for the lifetime of PdfDocument.
230        // The file must not be modified while mapped.
231        let mmap = unsafe { memmap2::Mmap::map(&file)? };
232        Self::from_pdf_data(PdfData::Mmap(mmap))
233    }
234
235    /// Construct a `PdfDocument` from pre-built parts (used by the
236    /// repair module when the normal xref/trailer is damaged).
237    pub(crate) fn from_raw_parts(data: Vec<u8>, xref: Xref, version: (u8, u8)) -> Self {
238        Self {
239            version,
240            xref,
241            data: PdfData::Owned(data),
242            objects: RwLock::new(LruCache::new(DEFAULT_CACHE_CAPACITY)),
243            security: None,
244            decoded_obj_streams: RwLock::new(HashMap::new()),
245        }
246    }
247
248    /// Detect and initialize encryption from the trailer.
249    fn detect_encryption(&mut self) -> Result<()> {
250        // Check for /Encrypt in trailer
251        let encrypt_ref = match self.xref.trailer.get_ref(b"Encrypt") {
252            Some(r) => r.clone(),
253            None => {
254                // Also check for inline /Encrypt dict
255                if self.xref.trailer.get_dict(b"Encrypt").is_some() {
256                    return self.detect_encryption_inline();
257                }
258                return Ok(());
259            }
260        };
261
262        // Load the encryption dictionary object (without decryption!)
263        let encrypt_obj = self.load_object_raw(&encrypt_ref, &mut HashSet::new())?;
264        let encrypt_dict = match &encrypt_obj {
265            PdfObject::Dict(d) => d,
266            _ => {
267                return Err(JustPdfError::EncryptionError {
268                    detail: "encryption object is not a dictionary".into(),
269                });
270            }
271        };
272
273        let ed = crypto::EncryptionDict::from_dict(encrypt_dict)?;
274
275        // Verify we support this encryption
276        if ed.filter != b"Standard" {
277            return Err(JustPdfError::UnsupportedEncryption {
278                detail: format!(
279                    "unsupported security handler: {}",
280                    String::from_utf8_lossy(&ed.filter)
281                ),
282            });
283        }
284
285        // Extract file ID from trailer
286        let file_id = self.extract_file_id();
287
288        let mut state =
289            SecurityState::new(ed, file_id, Some(encrypt_ref.obj_num));
290
291        // Try empty password (very common for user-password-only PDFs)
292        if let Ok(key) = crypto::auth::authenticate(&state, b"") {
293            state.file_key = Some(key);
294        }
295
296        self.security = Some(state);
297        Ok(())
298    }
299
300    /// Handle inline /Encrypt dict (not an indirect reference).
301    fn detect_encryption_inline(&mut self) -> Result<()> {
302        let encrypt_dict = self.xref.trailer.get_dict(b"Encrypt").unwrap().clone();
303        let ed = crypto::EncryptionDict::from_dict(&encrypt_dict)?;
304
305        if ed.filter != b"Standard" {
306            return Err(JustPdfError::UnsupportedEncryption {
307                detail: format!(
308                    "unsupported security handler: {}",
309                    String::from_utf8_lossy(&ed.filter)
310                ),
311            });
312        }
313
314        let file_id = self.extract_file_id();
315        let mut state = SecurityState::new(ed, file_id, None);
316
317        if let Ok(key) = crypto::auth::authenticate(&state, b"") {
318            state.file_key = Some(key);
319        }
320
321        self.security = Some(state);
322        Ok(())
323    }
324
325    /// Extract the first element of the /ID array from the trailer.
326    fn extract_file_id(&self) -> Vec<u8> {
327        if let Some(PdfObject::Array(arr)) = self.xref.trailer.get(b"ID") {
328            if let Some(PdfObject::String(id)) = arr.first() {
329                return id.clone();
330            }
331        }
332        Vec::new()
333    }
334
    /// Whether the document is encrypted.
    ///
    /// True exactly when a security state was set up for this document; it
    /// says nothing about whether a password has been accepted yet (see
    /// `is_authenticated`).
    pub fn is_encrypted(&self) -> bool {
        self.security.is_some()
    }
339
340    /// Whether the document is encrypted and authentication has succeeded.
341    pub fn is_authenticated(&self) -> bool {
342        match &self.security {
343            Some(s) => s.is_authenticated(),
344            None => true, // Not encrypted = always accessible
345        }
346    }
347
348    /// Authenticate with a password. Required for encrypted documents
349    /// where the empty password doesn't work.
350    pub fn authenticate(&mut self, password: &[u8]) -> Result<()> {
351        let state = match &mut self.security {
352            Some(s) => s,
353            None => return Ok(()), // Not encrypted
354        };
355
356        if state.is_authenticated() {
357            return Ok(()); // Already authenticated
358        }
359
360        let key = crypto::auth::authenticate(state, password)?;
361        state.file_key = Some(key);
362
363        // Clear cached objects — they need to be re-loaded with decryption
364        self.objects.write().unwrap().clear();
365        self.decoded_obj_streams.write().unwrap().clear();
366
367        Ok(())
368    }
369
    /// Get the permission flags (if encrypted).
    ///
    /// Returns `None` for a document without a security state.
    pub fn permissions(&self) -> Option<crypto::Permissions> {
        self.security.as_ref().map(|s| s.permissions())
    }
374
    /// Get the security state (for advanced use).
    ///
    /// Returns `None` for a document without a security state.
    pub fn security_state(&self) -> Option<&SecurityState> {
        self.security.as_ref()
    }
379
    /// Number of objects declared in xref.
    ///
    /// This is the length reported by the merged cross-reference table.
    pub fn object_count(&self) -> usize {
        self.xref.len()
    }
384
    /// The /Root (catalog) reference from the trailer.
    ///
    /// Returns `None` when the trailer lookup for `Root` yields no reference.
    pub fn catalog_ref(&self) -> Option<&IndirectRef> {
        self.xref.trailer.get_ref(b"Root")
    }
389
    /// Get the trailer dictionary stored with the cross-reference table.
    pub fn trailer(&self) -> &PdfDict {
        &self.xref.trailer
    }
394
395    /// Resolve an indirect reference to the actual object.
396    /// Uses internal LRU cache. Detects circular references.
397    /// Automatically decrypts if the document is encrypted and authenticated.
398    ///
399    /// Returns a cloned `PdfObject` (owned). The interior LRU cache is
400    /// protected by a `RwLock`, so this method only requires `&self` and
401    /// can be called from multiple threads simultaneously.
402    pub fn resolve(&self, iref: &IndirectRef) -> Result<PdfObject> {
403        // Fast path: cache hit (read lock only).
404        {
405            let mut cache = self.objects.write().unwrap();
406            if let Some(obj) = cache.get(iref) {
407                return Ok(obj.clone());
408            }
409        }
410
411        // Check if we need authentication
412        if let Some(ref sec) = self.security {
413            if !sec.is_authenticated() {
414                return Err(JustPdfError::EncryptedDocument);
415            }
416        }
417
418        // Load the object (no lock held during I/O)
419        let obj = self.load_object(iref, &mut HashSet::new())?;
420        let result = obj.clone();
421        self.objects.write().unwrap().insert(iref.clone(), obj);
422        Ok(result)
423    }
424
425    /// Load an object, tracking visited refs to detect cycles.
426    /// Applies decryption if the document is encrypted.
427    fn load_object(
428        &self,
429        iref: &IndirectRef,
430        visited: &mut HashSet<IndirectRef>,
431    ) -> Result<PdfObject> {
432        let obj = self.load_object_raw(iref, visited)?;
433
434        // Apply decryption if needed
435        if let Some(ref sec) = self.security {
436            if sec.is_authenticated() {
437                return crypto::decrypt_object(obj, sec, iref.obj_num, iref.gen_num);
438            }
439        }
440
441        Ok(obj)
442    }
443
444    /// Load an object without decryption (used for the encryption dict itself).
445    fn load_object_raw(
446        &self,
447        iref: &IndirectRef,
448        visited: &mut HashSet<IndirectRef>,
449    ) -> Result<PdfObject> {
450        if !visited.insert(iref.clone()) {
451            return Err(JustPdfError::CircularReference {
452                obj_num: iref.obj_num,
453                gen_num: iref.gen_num,
454            });
455        }
456
457        let entry = self
458            .xref
459            .get(iref.obj_num)
460            .ok_or(JustPdfError::ObjectNotFound {
461                obj_num: iref.obj_num,
462                gen_num: iref.gen_num,
463            })?
464            .clone();
465
466        match entry {
467            XrefEntry::InUse { offset, .. } => {
468                let mut tokenizer = Tokenizer::new_at(self.data.as_bytes(), offset as usize);
469                let (_parsed_ref, obj) = object::parse_indirect_object(&mut tokenizer)?;
470                Ok(obj)
471            }
472            XrefEntry::Compressed {
473                obj_stream_num,
474                index_within,
475            } => self.load_compressed_object(obj_stream_num, index_within, visited),
476            XrefEntry::Free { .. } => Ok(PdfObject::Null),
477        }
478    }
479
480    /// Load an object from a compressed object stream.
481    /// Uses the decoded object stream cache to avoid re-decoding.
482    fn load_compressed_object(
483        &self,
484        obj_stream_num: u32,
485        index_within: u16,
486        visited: &mut HashSet<IndirectRef>,
487    ) -> Result<PdfObject> {
488        // Check the decoded object stream cache first (Task 3).
489        {
490            let cache = self.decoded_obj_streams.read().unwrap();
491            if !cache.contains_key(&obj_stream_num) {
492                drop(cache); // release read lock before acquiring write lock
493
494                let stream_ref = IndirectRef {
495                    obj_num: obj_stream_num,
496                    gen_num: 0,
497                };
498
499                // Load the object stream itself (which may need decryption)
500                let stream_obj = {
501                    let raw = self.load_object_raw(&stream_ref, visited)?;
502                    // Decrypt the object stream if needed
503                    if let Some(ref sec) = self.security {
504                        if sec.is_authenticated() {
505                            crypto::decrypt_object(raw, sec, obj_stream_num, 0)?
506                        } else {
507                            raw
508                        }
509                    } else {
510                        raw
511                    }
512                };
513
514                let (dict, raw_data) = match &stream_obj {
515                    PdfObject::Stream { dict, data } => (dict, data),
516                    _ => {
517                        return Err(JustPdfError::InvalidObject {
518                            offset: 0,
519                            detail: format!("object stream {obj_stream_num} is not a stream"),
520                        });
521                    }
522                };
523
524                let decoded = stream::decode_stream(raw_data, dict)?;
525                self.decoded_obj_streams
526                    .write()
527                    .unwrap()
528                    .insert(obj_stream_num, decoded);
529            }
530        }
531
532        let cache = self.decoded_obj_streams.read().unwrap();
533        let decoded = cache.get(&obj_stream_num).unwrap();
534
535        // We need N and First to parse the index. Parse them from the
536        // decoded data header: N pairs of (obj_num, offset) followed by
537        // the object data starting at byte offset `first`.
538        //
539        // We re-parse the index each time (cheap integer parsing) but
540        // avoid the expensive stream decompression.
541        let mut tokenizer = Tokenizer::new(decoded);
542
543        // We don't have the dict readily available here, so we parse all
544        // pairs until we run out and infer N from what we get. The index
545        // pairs are always at the start of the decoded data.
546        let mut obj_offsets = Vec::new();
547        loop {
548            let saved_pos = tokenizer.pos();
549            let obj_num = match tokenizer.next_token()? {
550                Some(crate::tokenizer::token::Token::Integer(v)) => v as u32,
551                _ => {
552                    tokenizer.seek(saved_pos);
553                    break;
554                }
555            };
556            let offset = match tokenizer.next_token()? {
557                Some(crate::tokenizer::token::Token::Integer(v)) => v as usize,
558                _ => break,
559            };
560            obj_offsets.push((obj_num, offset));
561        }
562
563        // `first` is the byte offset where actual object data starts,
564        // which equals the current tokenizer position after reading all pairs.
565        let first = tokenizer.pos();
566
567        let idx = index_within as usize;
568        if idx >= obj_offsets.len() {
569            return Err(JustPdfError::ObjectNotFound {
570                obj_num: 0,
571                gen_num: 0,
572            });
573        }
574
575        let (_obj_num, obj_offset) = obj_offsets[idx];
576        let abs_offset = first + obj_offset;
577
578        let mut tokenizer = Tokenizer::new_at(decoded, abs_offset);
579        object::parse_object(&mut tokenizer)
580    }
581
582    /// Iterate over all in-use object references.
583    pub fn object_refs(&self) -> impl Iterator<Item = IndirectRef> + '_ {
584        self.xref
585            .entries
586            .iter()
587            .filter_map(|(&obj_num, entry)| match entry {
588                XrefEntry::InUse { gen_num, .. } => Some(IndirectRef {
589                    obj_num,
590                    gen_num: *gen_num,
591                }),
592                XrefEntry::Compressed { .. } => Some(IndirectRef {
593                    obj_num,
594                    gen_num: 0,
595                }),
596                XrefEntry::Free { .. } => None,
597            })
598    }
599
    /// Decode a stream object's data.
    ///
    /// Thin wrapper around `stream::decode_stream`; `dict` is the stream's
    /// dictionary and `raw_data` its still-encoded bytes. No document state
    /// is read or modified.
    pub fn decode_stream(&self, dict: &PdfDict, raw_data: &[u8]) -> Result<Vec<u8>> {
        stream::decode_stream(raw_data, dict)
    }
604
    /// Get the raw file data, borrowed from the backing store (owned buffer
    /// or memory mapping).
    pub fn raw_data(&self) -> &[u8] {
        self.data.as_bytes()
    }
609
    /// Set the maximum number of parsed objects to keep in the LRU cache.
    ///
    /// If the cache currently holds more entries than `capacity`, the least
    /// recently used ones are evicted.
    ///
    /// # Panics
    /// Panics if `capacity` is zero (the cache asserts `capacity > 0`) or if
    /// the cache lock is poisoned.
    pub fn set_cache_capacity(&mut self, capacity: usize) {
        self.objects.write().unwrap().set_capacity(capacity);
    }
614
    /// Return the current number of cached objects.
    ///
    /// # Panics
    /// Panics if the cache lock is poisoned.
    pub fn cached_object_count(&self) -> usize {
        self.objects.read().unwrap().len()
    }
619}
620
621/// Parse PDF version from the header line.
622fn parse_version(data: &[u8]) -> Result<(u8, u8)> {
623    // Look for %PDF-X.Y in the first 1024 bytes
624    let search_len = data.len().min(1024);
625    let needle = b"%PDF-";
626
627    for i in 0..search_len.saturating_sub(needle.len() + 3) {
628        if &data[i..i + needle.len()] == needle {
629            let major = data.get(i + 5).copied().unwrap_or(0);
630            let dot = data.get(i + 6).copied().unwrap_or(0);
631            let minor = data.get(i + 7).copied().unwrap_or(0);
632
633            if major.is_ascii_digit() && dot == b'.' && minor.is_ascii_digit() {
634                return Ok((major - b'0', minor - b'0'));
635            }
636        }
637    }
638
639    Err(JustPdfError::NotPdf)
640}
641
642#[cfg(test)]
643mod tests {
644    use super::*;
645
646    #[test]
647    fn test_parse_version() {
648        assert_eq!(parse_version(b"%PDF-1.7\n").unwrap(), (1, 7));
649        assert_eq!(parse_version(b"%PDF-2.0\n").unwrap(), (2, 0));
650        assert_eq!(parse_version(b"%PDF-1.4 stuff").unwrap(), (1, 4));
651    }
652
653    #[test]
654    fn test_parse_version_not_pdf() {
655        assert!(parse_version(b"Hello World").is_err());
656        assert!(parse_version(b"").is_err());
657    }
658
659    #[test]
660    fn test_parse_version_offset() {
661        // Some PDFs have garbage before %PDF-
662        assert_eq!(parse_version(b"\xEF\xBB\xBF%PDF-1.7\n").unwrap(), (1, 7));
663    }
664
665    /// Build a minimal valid PDF in memory for testing.
666    fn build_minimal_pdf() -> Vec<u8> {
667        let mut pdf = Vec::new();
668        // Header
669        pdf.extend_from_slice(b"%PDF-1.4\n");
670
671        // Object 1: Catalog
672        let obj1_offset = pdf.len();
673        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
674
675        // Object 2: Pages
676        let obj2_offset = pdf.len();
677        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n");
678
679        // Object 3: Page
680        let obj3_offset = pdf.len();
681        pdf.extend_from_slice(
682            b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\n",
683        );
684
685        // Xref table
686        let xref_offset = pdf.len();
687        pdf.extend_from_slice(b"xref\n");
688        pdf.extend_from_slice(b"0 4\n");
689        pdf.extend_from_slice(b"0000000000 65535 f \r\n");
690        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj1_offset).as_bytes());
691        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj2_offset).as_bytes());
692        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj3_offset).as_bytes());
693
694        // Trailer
695        pdf.extend_from_slice(b"trailer\n<< /Size 4 /Root 1 0 R >>\n");
696        pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());
697
698        pdf
699    }
700
701    #[test]
702    fn test_open_minimal_pdf() {
703        let data = build_minimal_pdf();
704        let doc = PdfDocument::from_bytes(data).unwrap();
705
706        assert_eq!(doc.version, (1, 4));
707        assert!(doc.object_count() > 0);
708        assert!(!doc.is_encrypted());
709
710        // Resolve catalog
711        let catalog_ref = doc.catalog_ref().unwrap().clone();
712        let catalog = doc.resolve(&catalog_ref).unwrap();
713        match &catalog {
714            PdfObject::Dict(d) => {
715                assert_eq!(d.get_name(b"Type"), Some(b"Catalog".as_slice()));
716            }
717            _ => panic!("expected dict for catalog"),
718        }
719    }
720
721    #[test]
722    fn test_not_pdf() {
723        let result = PdfDocument::from_bytes(b"Hello World, not a PDF".to_vec());
724        assert!(result.is_err());
725    }
726
727    #[test]
728    fn test_empty_file() {
729        let result = PdfDocument::from_bytes(vec![]);
730        assert!(result.is_err());
731    }
732
733    #[test]
734    fn test_truncated_pdf() {
735        let result = PdfDocument::from_bytes(b"%PDF-1.4\n".to_vec());
736        assert!(result.is_err());
737    }
738
739    #[test]
740    fn test_object_not_found() {
741        let data = build_minimal_pdf();
742        let doc = PdfDocument::from_bytes(data).unwrap();
743        let result = doc.resolve(&IndirectRef {
744            obj_num: 999,
745            gen_num: 0,
746        });
747        assert!(result.is_err());
748    }
749
750    #[test]
751    fn test_unencrypted_pdf_is_authenticated() {
752        let data = build_minimal_pdf();
753        let doc = PdfDocument::from_bytes(data).unwrap();
754        assert!(!doc.is_encrypted());
755        assert!(doc.is_authenticated());
756    }
757
758    // -----------------------------------------------------------------------
759    // LRU cache tests
760    // -----------------------------------------------------------------------
761
762    #[test]
763    fn test_lru_cache_insert_and_get() {
764        let mut cache = LruCache::new(3);
765        cache.insert("a", 1);
766        cache.insert("b", 2);
767        cache.insert("c", 3);
768        assert_eq!(cache.len(), 3);
769        assert_eq!(cache.get(&"a"), Some(&1));
770        assert_eq!(cache.get(&"b"), Some(&2));
771        assert_eq!(cache.get(&"c"), Some(&3));
772    }
773
774    #[test]
775    fn test_lru_cache_eviction() {
776        let mut cache = LruCache::new(3);
777        cache.insert("a", 1);
778        cache.insert("b", 2);
779        cache.insert("c", 3);
780        // Cache is full. Inserting a 4th should evict the LRU ("a").
781        cache.insert("d", 4);
782        assert_eq!(cache.len(), 3);
783        assert_eq!(cache.get(&"a"), None); // evicted
784        assert_eq!(cache.get(&"b"), Some(&2));
785        assert_eq!(cache.get(&"c"), Some(&3));
786        assert_eq!(cache.get(&"d"), Some(&4));
787    }
788
789    #[test]
790    fn test_lru_cache_access_promotes() {
791        let mut cache = LruCache::new(3);
792        cache.insert("a", 1);
793        cache.insert("b", 2);
794        cache.insert("c", 3);
795        // Access "a" to promote it — now "b" is the LRU.
796        assert_eq!(cache.get(&"a"), Some(&1));
797        cache.insert("d", 4);
798        assert_eq!(cache.get(&"b"), None); // "b" was evicted, not "a"
799        assert_eq!(cache.get(&"a"), Some(&1));
800    }
801
802    #[test]
803    fn test_lru_cache_update_existing() {
804        let mut cache = LruCache::new(3);
805        cache.insert("a", 1);
806        cache.insert("a", 10);
807        assert_eq!(cache.len(), 1);
808        assert_eq!(cache.get(&"a"), Some(&10));
809    }
810
811    #[test]
812    fn test_lru_cache_clear() {
813        let mut cache = LruCache::new(3);
814        cache.insert("a", 1);
815        cache.insert("b", 2);
816        cache.clear();
817        assert_eq!(cache.len(), 0);
818        assert_eq!(cache.get(&"a"), None);
819    }
820
821    #[test]
822    fn test_lru_cache_set_capacity_shrinks() {
823        let mut cache = LruCache::new(5);
824        for i in 0..5 {
825            cache.insert(i, i * 10);
826        }
827        assert_eq!(cache.len(), 5);
828        // Shrink capacity — should evict the 3 LRU entries (0, 1, 2).
829        cache.set_capacity(2);
830        assert_eq!(cache.len(), 2);
831        assert_eq!(cache.get(&0), None);
832        assert_eq!(cache.get(&1), None);
833        assert_eq!(cache.get(&2), None);
834        // Most recent two should survive.
835        assert!(cache.get(&3).is_some() || cache.get(&4).is_some());
836    }
837
838    // -----------------------------------------------------------------------
839    // PdfDocument cache integration tests
840    // -----------------------------------------------------------------------
841
842    #[test]
843    fn test_set_cache_capacity() {
844        let data = build_minimal_pdf();
845        let mut doc = PdfDocument::from_bytes(data).unwrap();
846
847        // Resolve all 3 objects to fill the cache.
848        for obj_num in 1..=3u32 {
849            let iref = IndirectRef { obj_num, gen_num: 0 };
850            doc.resolve(&iref).unwrap();
851        }
852        assert_eq!(doc.cached_object_count(), 3);
853
854        // Shrink capacity to 1 — should evict 2 entries.
855        doc.set_cache_capacity(1);
856        assert_eq!(doc.cached_object_count(), 1);
857    }
858
859    #[test]
860    fn test_lru_cache_hit_miss_on_document() {
861        let data = build_minimal_pdf();
862        let mut doc = PdfDocument::from_bytes(data).unwrap();
863        doc.set_cache_capacity(2);
864
865        let ref1 = IndirectRef { obj_num: 1, gen_num: 0 };
866        let ref2 = IndirectRef { obj_num: 2, gen_num: 0 };
867        let ref3 = IndirectRef { obj_num: 3, gen_num: 0 };
868
869        // Resolve 1 and 2 — both cached.
870        doc.resolve(&ref1).unwrap();
871        doc.resolve(&ref2).unwrap();
872        assert_eq!(doc.cached_object_count(), 2);
873
874        // Resolving 3 should evict ref1 (LRU).
875        doc.resolve(&ref3).unwrap();
876        assert_eq!(doc.cached_object_count(), 2);
877        assert!(!doc.objects.read().unwrap().contains_key(&ref1));
878        assert!(doc.objects.read().unwrap().contains_key(&ref2));
879        assert!(doc.objects.read().unwrap().contains_key(&ref3));
880
881        // Re-resolving ref1 should work (re-parsed from data).
882        doc.resolve(&ref1).unwrap();
883        assert!(doc.objects.read().unwrap().contains_key(&ref1));
884    }
885
886    #[test]
887    fn test_object_stream_caching() {
888        let data = build_minimal_pdf();
889        let doc = PdfDocument::from_bytes(data).unwrap();
890        // The minimal PDF uses normal (non-compressed) objects, so the
891        // decoded_obj_streams cache should be empty.
892        assert_eq!(doc.decoded_obj_streams.read().unwrap().len(), 0);
893
894        // Verify the cache exists and is functional by inserting directly.
895        doc.decoded_obj_streams.write().unwrap().insert(42, vec![1, 2, 3]);
896        assert!(doc.decoded_obj_streams.read().unwrap().contains_key(&42));
897        assert_eq!(
898            doc.decoded_obj_streams.read().unwrap().get(&42).unwrap(),
899            &[1, 2, 3]
900        );
901
902        // Authentication clear should also clear the stream cache.
903        doc.decoded_obj_streams.write().unwrap().insert(99, vec![4, 5, 6]);
904        // Simulate what authenticate() does:
905        doc.objects.write().unwrap().clear();
906        doc.decoded_obj_streams.write().unwrap().clear();
907        assert_eq!(doc.decoded_obj_streams.read().unwrap().len(), 0);
908    }
909
910    #[cfg(feature = "mmap")]
911    #[test]
912    fn test_mmap_truncated_file() {
913        use std::io::Write;
914        let dir = std::env::temp_dir();
915        let path = dir.join("justpdf_mmap_truncated.pdf");
916        {
917            let mut f = std::fs::File::create(&path).unwrap();
918            // Write just the PDF header, not a complete PDF
919            f.write_all(b"%PDF-1.4\n").unwrap();
920        }
921        let result = PdfDocument::open_mmap(&path);
922        // Should be an error, not a panic
923        assert!(result.is_err());
924        let _ = std::fs::remove_file(&path);
925    }
926
927    #[cfg(feature = "mmap")]
928    #[test]
929    fn test_mmap_empty_file() {
930        let dir = std::env::temp_dir();
931        let path = dir.join("justpdf_mmap_empty.pdf");
932        {
933            std::fs::File::create(&path).unwrap();
934        }
935        let result = PdfDocument::open_mmap(&path);
936        // Should be an error, not a panic
937        assert!(result.is_err());
938        let _ = std::fs::remove_file(&path);
939    }
940
941    #[cfg(feature = "mmap")]
942    #[test]
943    fn test_open_mmap() {
944        use std::io::Write;
945        // Write a minimal PDF to a temp file and open with mmap.
946        let data = build_minimal_pdf();
947        let dir = std::env::temp_dir();
948        let path = dir.join("justpdf_mmap_test.pdf");
949        {
950            let mut f = std::fs::File::create(&path).unwrap();
951            f.write_all(&data).unwrap();
952        }
953        let doc = PdfDocument::open_mmap(&path).unwrap();
954        assert_eq!(doc.version, (1, 4));
955        assert!(!doc.is_encrypted());
956
957        let catalog_ref = doc.catalog_ref().unwrap().clone();
958        let catalog = doc.resolve(&catalog_ref).unwrap();
959        match &catalog {
960            PdfObject::Dict(d) => {
961                assert_eq!(d.get_name(b"Type"), Some(b"Catalog".as_slice()));
962            }
963            _ => panic!("expected dict for catalog"),
964        }
965
966        // Clean up.
967        let _ = std::fs::remove_file(&path);
968    }
969}