// justpdf_core/parser.rs

1use std::collections::{HashMap, HashSet, VecDeque};
2use std::hash::Hash;
3use std::path::Path;
4use std::sync::RwLock;
5
6use crate::crypto;
7use crate::crypto::SecurityState;
8use crate::error::{JustPdfError, Result};
9use crate::object::{self, IndirectRef, PdfDict, PdfObject};
10use crate::stream;
11use crate::tokenizer::Tokenizer;
12use crate::xref::{self, Xref, XrefEntry};
13
14// ---------------------------------------------------------------------------
15// PdfData: backing store abstraction (Task 1)
16// ---------------------------------------------------------------------------
17
/// Storage that owns the raw bytes of a PDF file.
///
/// The bytes live either in a heap-allocated buffer or, when the `mmap`
/// feature is enabled, in a memory-mapped view of the file.
enum PdfData {
    Owned(Vec<u8>),
    #[cfg(feature = "mmap")]
    Mmap(memmap2::Mmap),
}

impl std::fmt::Debug for PdfData {
    /// Prints the variant name and the byte length, never the contents.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (variant, byte_len) = match self {
            Self::Owned(buf) => ("Owned", buf.len()),
            #[cfg(feature = "mmap")]
            Self::Mmap(map) => ("Mmap", map.len()),
        };
        f.debug_tuple(variant).field(&byte_len).finish()
    }
}

impl PdfData {
    /// Borrows the complete file contents as a byte slice.
    fn as_bytes(&self) -> &[u8] {
        match self {
            Self::Owned(buf) => buf.as_slice(),
            #[cfg(feature = "mmap")]
            Self::Mmap(map) => &map[..],
        }
    }
}
44
45// ---------------------------------------------------------------------------
46// LruCache: bounded object cache (Task 2)
47// ---------------------------------------------------------------------------
48
/// Bounded least-recently-used cache.
///
/// Values live in `map`; `order` records recency, with the most recently
/// used key at the front and the next eviction candidate at the back.
struct LruCache<K: Eq + Hash + Clone, V> {
    map: HashMap<K, V>,
    order: VecDeque<K>,
    capacity: usize,
}

impl<K: Eq + Hash + Clone + std::fmt::Debug, V: std::fmt::Debug> std::fmt::Debug
    for LruCache<K, V>
{
    /// Prints only the entry count and the capacity, not the entries.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("LruCache");
        out.field("len", &self.map.len());
        out.field("capacity", &self.capacity);
        out.finish()
    }
}

impl<K: Eq + Hash + Clone, V> LruCache<K, V> {
    /// Creates an empty cache holding at most `capacity` entries.
    ///
    /// # Panics
    ///
    /// Panics if `capacity` is zero.
    fn new(capacity: usize) -> Self {
        assert!(capacity > 0, "LruCache capacity must be > 0");
        let map = HashMap::with_capacity(capacity);
        let order = VecDeque::with_capacity(capacity);
        Self {
            map,
            order,
            capacity,
        }
    }

    /// Returns the value stored under `key`, marking it as most recently
    /// used. Returns `None` (and changes nothing) when the key is absent.
    fn get(&mut self, key: &K) -> Option<&V> {
        if !self.map.contains_key(key) {
            return None;
        }
        self.touch(key);
        self.map.get(key)
    }

    /// Stores `value` under `key` and marks the key as most recently used.
    ///
    /// An existing entry is overwritten in place. A new entry first evicts
    /// the least recently used one when the cache is already full.
    fn insert(&mut self, key: K, value: V) {
        if let Some(slot) = self.map.get_mut(&key) {
            *slot = value;
            self.touch(&key);
            return;
        }
        if self.map.len() >= self.capacity {
            self.evict_lru();
        }
        self.order.push_front(key.clone());
        self.map.insert(key, value);
    }

    /// Reports whether `key` is cached, without affecting recency.
    fn contains_key(&self, key: &K) -> bool {
        self.map.contains_key(key)
    }

    /// Drops every entry; the capacity is unchanged.
    fn clear(&mut self) {
        self.order.clear();
        self.map.clear();
    }

    /// Number of entries currently cached.
    fn len(&self) -> usize {
        self.map.len()
    }

    /// Changes the capacity, evicting least recently used entries until the
    /// cache fits.
    ///
    /// # Panics
    ///
    /// Panics if `capacity` is zero.
    fn set_capacity(&mut self, capacity: usize) {
        assert!(capacity > 0, "LruCache capacity must be > 0");
        self.capacity = capacity;
        while self.map.len() > self.capacity {
            self.evict_lru();
        }
    }

    /// Removes the entry at the back of the recency queue, if there is one.
    fn evict_lru(&mut self) {
        if let Some(victim) = self.order.pop_back() {
            self.map.remove(&victim);
        }
    }

    /// Moves `key` to the front of the recency queue (linear scan).
    fn touch(&mut self, key: &K) {
        let current = self.order.iter().position(|queued| queued == key);
        if let Some(idx) = current {
            self.order.remove(idx);
        }
        self.order.push_front(key.clone());
    }
}
140
/// Default LRU object cache capacity: the number of parsed objects a newly
/// constructed `PdfDocument` keeps cached before it starts evicting. Can be
/// changed per document with `PdfDocument::set_cache_capacity`.
const DEFAULT_CACHE_CAPACITY: usize = 2048;
143
/// A parsed PDF document.
///
/// `PdfDocument` uses interior mutability (`RwLock`) for its object caches so
/// that `resolve` only requires `&self`, which lets callers work through
/// shared references (for example for multi-threaded page parsing and
/// rendering).
///
/// Note that a cache lookup in `resolve` acquires the *write* lock, because
/// an LRU hit has to update the recency order; lookups from several threads
/// are therefore serialised on that lock.
///
/// NOTE(review): whether the type is actually `Sync` also depends on `Xref`
/// and `SecurityState`, which are defined elsewhere — confirm.
pub struct PdfDocument {
    /// PDF version, e.g. (1, 7) for PDF 1.7.
    pub version: (u8, u8),
    /// The merged cross-reference table.
    pub xref: Xref,
    /// Raw file data (owned or memory-mapped).
    data: PdfData,
    /// Bounded LRU cache of parsed objects (interior-mutable).
    objects: RwLock<LruCache<IndirectRef, PdfObject>>,
    /// Encryption/security state (None if document is not encrypted).
    security: Option<SecurityState>,
    /// Cache of decoded object stream data (interior-mutable), keyed by the
    /// object number of the object stream. Unlike `objects` this map is not
    /// bounded.
    decoded_obj_streams: RwLock<HashMap<u32, Vec<u8>>>,
}
163
164impl std::fmt::Debug for PdfDocument {
165    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
166        let obj_cache_len = self
167            .objects
168            .read()
169            .map(|c| c.len())
170            .unwrap_or(0);
171        f.debug_struct("PdfDocument")
172            .field("version", &self.version)
173            .field("xref", &self.xref)
174            .field("data", &self.data)
175            .field("objects_cached", &obj_cache_len)
176            .field("security", &self.security)
177            .finish()
178    }
179}
180
181impl PdfDocument {
182    /// Open a PDF file from a path.
183    pub fn open(path: &Path) -> Result<Self> {
184        let data = std::fs::read(path)?;
185        Self::from_bytes(data)
186    }
187
188    /// Parse a PDF from an in-memory byte vector.
189    pub fn from_bytes(data: Vec<u8>) -> Result<Self> {
190        Self::from_pdf_data(PdfData::Owned(data))
191    }
192
193    /// Internal constructor shared by all entry points.
194    fn from_pdf_data(data: PdfData) -> Result<Self> {
195        let bytes = data.as_bytes();
196        if bytes.len() < 8 {
197            return Err(JustPdfError::NotPdf);
198        }
199
200        // Parse version from header: %PDF-X.Y
201        let version = parse_version(bytes)?;
202
203        // Load xref
204        let xref = xref::load_xref(bytes)?;
205
206        let mut doc = Self {
207            version,
208            xref,
209            data,
210            objects: RwLock::new(LruCache::new(DEFAULT_CACHE_CAPACITY)),
211            security: None,
212            decoded_obj_streams: RwLock::new(HashMap::new()),
213        };
214
215        // Detect encryption
216        doc.detect_encryption()?;
217
218        Ok(doc)
219    }
220
    /// Open a PDF file using memory-mapped I/O.
    ///
    /// This avoids copying the entire file into memory, which can be
    /// beneficial for very large documents.
    ///
    /// The caller is responsible for making sure the file is not modified
    /// or truncated by anything else while the returned document is alive.
    ///
    /// # Errors
    ///
    /// Returns the I/O error if the file cannot be opened or mapped, or any
    /// parse error reported while loading the document.
    #[cfg(feature = "mmap")]
    pub fn open_mmap(path: &Path) -> Result<Self> {
        let file = std::fs::File::open(path)?;
        // SAFETY: We keep the Mmap alive for the lifetime of PdfDocument.
        // The file must not be modified while mapped; this cannot be
        // enforced here and is a requirement on the caller/environment.
        let mmap = unsafe { memmap2::Mmap::map(&file)? };
        Self::from_pdf_data(PdfData::Mmap(mmap))
    }
233
234    /// Construct a `PdfDocument` from pre-built parts (used by the
235    /// repair module when the normal xref/trailer is damaged).
236    pub(crate) fn from_raw_parts(data: Vec<u8>, xref: Xref, version: (u8, u8)) -> Self {
237        Self {
238            version,
239            xref,
240            data: PdfData::Owned(data),
241            objects: RwLock::new(LruCache::new(DEFAULT_CACHE_CAPACITY)),
242            security: None,
243            decoded_obj_streams: RwLock::new(HashMap::new()),
244        }
245    }
246
247    /// Detect and initialize encryption from the trailer.
248    fn detect_encryption(&mut self) -> Result<()> {
249        // Check for /Encrypt in trailer
250        let encrypt_ref = match self.xref.trailer.get_ref(b"Encrypt") {
251            Some(r) => r.clone(),
252            None => {
253                // Also check for inline /Encrypt dict
254                if self.xref.trailer.get_dict(b"Encrypt").is_some() {
255                    return self.detect_encryption_inline();
256                }
257                return Ok(());
258            }
259        };
260
261        // Load the encryption dictionary object (without decryption!)
262        let encrypt_obj = self.load_object_raw(&encrypt_ref, &mut HashSet::new())?;
263        let encrypt_dict = match &encrypt_obj {
264            PdfObject::Dict(d) => d,
265            _ => {
266                return Err(JustPdfError::EncryptionError {
267                    detail: "encryption object is not a dictionary".into(),
268                });
269            }
270        };
271
272        let ed = crypto::EncryptionDict::from_dict(encrypt_dict)?;
273
274        // Verify we support this encryption
275        if ed.filter != b"Standard" {
276            return Err(JustPdfError::UnsupportedEncryption {
277                detail: format!(
278                    "unsupported security handler: {}",
279                    String::from_utf8_lossy(&ed.filter)
280                ),
281            });
282        }
283
284        // Extract file ID from trailer
285        let file_id = self.extract_file_id();
286
287        let mut state =
288            SecurityState::new(ed, file_id, Some(encrypt_ref.obj_num));
289
290        // Try empty password (very common for user-password-only PDFs)
291        if let Ok(key) = crypto::auth::authenticate(&state, b"") {
292            state.file_key = Some(key);
293        }
294
295        self.security = Some(state);
296        Ok(())
297    }
298
299    /// Handle inline /Encrypt dict (not an indirect reference).
300    fn detect_encryption_inline(&mut self) -> Result<()> {
301        let encrypt_dict = self.xref.trailer.get_dict(b"Encrypt").unwrap().clone();
302        let ed = crypto::EncryptionDict::from_dict(&encrypt_dict)?;
303
304        if ed.filter != b"Standard" {
305            return Err(JustPdfError::UnsupportedEncryption {
306                detail: format!(
307                    "unsupported security handler: {}",
308                    String::from_utf8_lossy(&ed.filter)
309                ),
310            });
311        }
312
313        let file_id = self.extract_file_id();
314        let mut state = SecurityState::new(ed, file_id, None);
315
316        if let Ok(key) = crypto::auth::authenticate(&state, b"") {
317            state.file_key = Some(key);
318        }
319
320        self.security = Some(state);
321        Ok(())
322    }
323
324    /// Extract the first element of the /ID array from the trailer.
325    fn extract_file_id(&self) -> Vec<u8> {
326        if let Some(PdfObject::Array(arr)) = self.xref.trailer.get(b"ID") {
327            if let Some(PdfObject::String(id)) = arr.first() {
328                return id.clone();
329            }
330        }
331        Vec::new()
332    }
333
    /// Whether the document is encrypted, i.e. whether a security state was
    /// set up for it. This says nothing about whether a password has been
    /// accepted yet; see `is_authenticated` for that.
    pub fn is_encrypted(&self) -> bool {
        self.security.is_some()
    }
338
339    /// Whether the document is encrypted and authentication has succeeded.
340    pub fn is_authenticated(&self) -> bool {
341        match &self.security {
342            Some(s) => s.is_authenticated(),
343            None => true, // Not encrypted = always accessible
344        }
345    }
346
347    /// Authenticate with a password. Required for encrypted documents
348    /// where the empty password doesn't work.
349    pub fn authenticate(&mut self, password: &[u8]) -> Result<()> {
350        let state = match &mut self.security {
351            Some(s) => s,
352            None => return Ok(()), // Not encrypted
353        };
354
355        if state.is_authenticated() {
356            return Ok(()); // Already authenticated
357        }
358
359        let key = crypto::auth::authenticate(state, password)?;
360        state.file_key = Some(key);
361
362        // Clear cached objects — they need to be re-loaded with decryption
363        self.objects.write().unwrap().clear();
364        self.decoded_obj_streams.write().unwrap().clear();
365
366        Ok(())
367    }
368
    /// Get the permission flags (if encrypted).
    ///
    /// Returns `None` when the document has no security state.
    pub fn permissions(&self) -> Option<crypto::Permissions> {
        self.security.as_ref().map(|s| s.permissions())
    }
373
    /// Get the security state (for advanced use).
    ///
    /// Returns a borrow of the document's `SecurityState`, or `None` when
    /// the document has no security state.
    pub fn security_state(&self) -> Option<&SecurityState> {
        self.security.as_ref()
    }
378
    /// Number of objects declared in xref, as reported by `Xref::len`.
    ///
    /// NOTE(review): presumably this counts free entries too — confirm
    /// against `Xref::len`.
    pub fn object_count(&self) -> usize {
        self.xref.len()
    }
383
    /// The /Root (catalog) reference from the trailer.
    ///
    /// Returns `None` when the trailer lookup for `Root` yields no
    /// reference (presumably when the key is absent or is not an indirect
    /// reference — confirm against `PdfDict::get_ref`).
    pub fn catalog_ref(&self) -> Option<&IndirectRef> {
        self.xref.trailer.get_ref(b"Root")
    }
388
    /// Get the trailer dictionary stored in the document's xref.
    pub fn trailer(&self) -> &PdfDict {
        &self.xref.trailer
    }
393
394    /// Resolve an indirect reference to the actual object.
395    /// Uses internal LRU cache. Detects circular references.
396    /// Automatically decrypts if the document is encrypted and authenticated.
397    ///
398    /// Returns a cloned `PdfObject` (owned). The interior LRU cache is
399    /// protected by a `RwLock`, so this method only requires `&self` and
400    /// can be called from multiple threads simultaneously.
401    pub fn resolve(&self, iref: &IndirectRef) -> Result<PdfObject> {
402        // Fast path: cache hit (read lock only).
403        {
404            let mut cache = self.objects.write().unwrap();
405            if let Some(obj) = cache.get(iref) {
406                return Ok(obj.clone());
407            }
408        }
409
410        // Check if we need authentication
411        if let Some(ref sec) = self.security {
412            if !sec.is_authenticated() {
413                return Err(JustPdfError::EncryptedDocument);
414            }
415        }
416
417        // Load the object (no lock held during I/O)
418        let obj = self.load_object(iref, &mut HashSet::new())?;
419        let result = obj.clone();
420        self.objects.write().unwrap().insert(iref.clone(), obj);
421        Ok(result)
422    }
423
424    /// Load an object, tracking visited refs to detect cycles.
425    /// Applies decryption if the document is encrypted.
426    fn load_object(
427        &self,
428        iref: &IndirectRef,
429        visited: &mut HashSet<IndirectRef>,
430    ) -> Result<PdfObject> {
431        let obj = self.load_object_raw(iref, visited)?;
432
433        // Apply decryption if needed
434        if let Some(ref sec) = self.security {
435            if sec.is_authenticated() {
436                return crypto::decrypt_object(obj, sec, iref.obj_num, iref.gen_num);
437            }
438        }
439
440        Ok(obj)
441    }
442
443    /// Load an object without decryption (used for the encryption dict itself).
444    fn load_object_raw(
445        &self,
446        iref: &IndirectRef,
447        visited: &mut HashSet<IndirectRef>,
448    ) -> Result<PdfObject> {
449        if !visited.insert(iref.clone()) {
450            return Err(JustPdfError::CircularReference {
451                obj_num: iref.obj_num,
452                gen_num: iref.gen_num,
453            });
454        }
455
456        let entry = self
457            .xref
458            .get(iref.obj_num)
459            .ok_or(JustPdfError::ObjectNotFound {
460                obj_num: iref.obj_num,
461                gen_num: iref.gen_num,
462            })?
463            .clone();
464
465        match entry {
466            XrefEntry::InUse { offset, .. } => {
467                let mut tokenizer = Tokenizer::new_at(self.data.as_bytes(), offset as usize);
468                let (_parsed_ref, obj) = object::parse_indirect_object(&mut tokenizer)?;
469                Ok(obj)
470            }
471            XrefEntry::Compressed {
472                obj_stream_num,
473                index_within,
474            } => self.load_compressed_object(obj_stream_num, index_within, visited),
475            XrefEntry::Free { .. } => Ok(PdfObject::Null),
476        }
477    }
478
479    /// Load an object from a compressed object stream.
480    /// Uses the decoded object stream cache to avoid re-decoding.
481    fn load_compressed_object(
482        &self,
483        obj_stream_num: u32,
484        index_within: u16,
485        visited: &mut HashSet<IndirectRef>,
486    ) -> Result<PdfObject> {
487        // Check the decoded object stream cache first (Task 3).
488        {
489            let cache = self.decoded_obj_streams.read().unwrap();
490            if !cache.contains_key(&obj_stream_num) {
491                drop(cache); // release read lock before acquiring write lock
492
493                let stream_ref = IndirectRef {
494                    obj_num: obj_stream_num,
495                    gen_num: 0,
496                };
497
498                // Load the object stream itself (which may need decryption)
499                let stream_obj = {
500                    let raw = self.load_object_raw(&stream_ref, visited)?;
501                    // Decrypt the object stream if needed
502                    if let Some(ref sec) = self.security {
503                        if sec.is_authenticated() {
504                            crypto::decrypt_object(raw, sec, obj_stream_num, 0)?
505                        } else {
506                            raw
507                        }
508                    } else {
509                        raw
510                    }
511                };
512
513                let (dict, raw_data) = match &stream_obj {
514                    PdfObject::Stream { dict, data } => (dict, data),
515                    _ => {
516                        return Err(JustPdfError::InvalidObject {
517                            offset: 0,
518                            detail: format!("object stream {obj_stream_num} is not a stream"),
519                        });
520                    }
521                };
522
523                let decoded = stream::decode_stream(raw_data, dict)?;
524                self.decoded_obj_streams
525                    .write()
526                    .unwrap()
527                    .insert(obj_stream_num, decoded);
528            }
529        }
530
531        let cache = self.decoded_obj_streams.read().unwrap();
532        let decoded = cache.get(&obj_stream_num).unwrap();
533
534        // We need N and First to parse the index. Parse them from the
535        // decoded data header: N pairs of (obj_num, offset) followed by
536        // the object data starting at byte offset `first`.
537        //
538        // We re-parse the index each time (cheap integer parsing) but
539        // avoid the expensive stream decompression.
540        let mut tokenizer = Tokenizer::new(decoded);
541
542        // We don't have the dict readily available here, so we parse all
543        // pairs until we run out and infer N from what we get. The index
544        // pairs are always at the start of the decoded data.
545        let mut obj_offsets = Vec::new();
546        loop {
547            let saved_pos = tokenizer.pos();
548            let obj_num = match tokenizer.next_token()? {
549                Some(crate::tokenizer::token::Token::Integer(v)) => v as u32,
550                _ => {
551                    tokenizer.seek(saved_pos);
552                    break;
553                }
554            };
555            let offset = match tokenizer.next_token()? {
556                Some(crate::tokenizer::token::Token::Integer(v)) => v as usize,
557                _ => break,
558            };
559            obj_offsets.push((obj_num, offset));
560        }
561
562        // `first` is the byte offset where actual object data starts,
563        // which equals the current tokenizer position after reading all pairs.
564        let first = tokenizer.pos();
565
566        let idx = index_within as usize;
567        if idx >= obj_offsets.len() {
568            return Err(JustPdfError::ObjectNotFound {
569                obj_num: 0,
570                gen_num: 0,
571            });
572        }
573
574        let (_obj_num, obj_offset) = obj_offsets[idx];
575        let abs_offset = first + obj_offset;
576
577        let mut tokenizer = Tokenizer::new_at(decoded, abs_offset);
578        object::parse_object(&mut tokenizer)
579    }
580
581    /// Iterate over all in-use object references.
582    pub fn object_refs(&self) -> impl Iterator<Item = IndirectRef> + '_ {
583        self.xref
584            .entries
585            .iter()
586            .filter_map(|(&obj_num, entry)| match entry {
587                XrefEntry::InUse { gen_num, .. } => Some(IndirectRef {
588                    obj_num,
589                    gen_num: *gen_num,
590                }),
591                XrefEntry::Compressed { .. } => Some(IndirectRef {
592                    obj_num,
593                    gen_num: 0,
594                }),
595                XrefEntry::Free { .. } => None,
596            })
597    }
598
    /// Decode a stream object's data.
    ///
    /// Thin wrapper around `stream::decode_stream`; `dict` is the stream's
    /// dictionary and `raw_data` its undecoded bytes. No document state is
    /// consulted, so no decryption happens here.
    pub fn decode_stream(&self, dict: &PdfDict, raw_data: &[u8]) -> Result<Vec<u8>> {
        stream::decode_stream(raw_data, dict)
    }
603
    /// Get the raw file data: the complete, unparsed bytes backing this
    /// document (owned buffer or memory map).
    pub fn raw_data(&self) -> &[u8] {
        self.data.as_bytes()
    }
608
    /// Set the maximum number of parsed objects to keep in the LRU cache.
    ///
    /// If more objects than `capacity` are currently cached, the least
    /// recently used ones are evicted immediately.
    ///
    /// # Panics
    ///
    /// Panics if `capacity` is 0 (the cache asserts a non-zero capacity) or
    /// if the cache lock is poisoned.
    pub fn set_cache_capacity(&mut self, capacity: usize) {
        self.objects.write().unwrap().set_capacity(capacity);
    }
613
    /// Return the current number of cached objects.
    ///
    /// # Panics
    ///
    /// Panics if the cache lock is poisoned.
    pub fn cached_object_count(&self) -> usize {
        self.objects.read().unwrap().len()
    }
618}
619
620/// Parse PDF version from the header line.
621fn parse_version(data: &[u8]) -> Result<(u8, u8)> {
622    // Look for %PDF-X.Y in the first 1024 bytes
623    let search_len = data.len().min(1024);
624    let needle = b"%PDF-";
625
626    for i in 0..search_len.saturating_sub(needle.len() + 3) {
627        if &data[i..i + needle.len()] == needle {
628            let major = data.get(i + 5).copied().unwrap_or(0);
629            let dot = data.get(i + 6).copied().unwrap_or(0);
630            let minor = data.get(i + 7).copied().unwrap_or(0);
631
632            if major.is_ascii_digit() && dot == b'.' && minor.is_ascii_digit() {
633                return Ok((major - b'0', minor - b'0'));
634            }
635        }
636    }
637
638    Err(JustPdfError::NotPdf)
639}
640
/// Unit tests for header parsing, the LRU cache and `PdfDocument` loading.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_version() {
        assert_eq!(parse_version(b"%PDF-1.7\n").unwrap(), (1, 7));
        assert_eq!(parse_version(b"%PDF-2.0\n").unwrap(), (2, 0));
        assert_eq!(parse_version(b"%PDF-1.4 stuff").unwrap(), (1, 4));
    }

    #[test]
    fn test_parse_version_not_pdf() {
        assert!(parse_version(b"Hello World").is_err());
        assert!(parse_version(b"").is_err());
    }

    #[test]
    fn test_parse_version_offset() {
        // Some PDFs have garbage before %PDF-
        assert_eq!(parse_version(b"\xEF\xBB\xBF%PDF-1.7\n").unwrap(), (1, 7));
    }

    /// Build a minimal valid PDF in memory for testing.
    ///
    /// The file has three objects (catalog, page tree, one page), a classic
    /// xref table and a trailer pointing at the catalog.
    fn build_minimal_pdf() -> Vec<u8> {
        let mut pdf = Vec::new();
        // Header
        pdf.extend_from_slice(b"%PDF-1.4\n");

        // Object 1: Catalog
        let obj1_offset = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

        // Object 2: Pages
        let obj2_offset = pdf.len();
        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n");

        // Object 3: Page
        let obj3_offset = pdf.len();
        pdf.extend_from_slice(
            b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\n",
        );

        // Xref table
        let xref_offset = pdf.len();
        pdf.extend_from_slice(b"xref\n");
        pdf.extend_from_slice(b"0 4\n");
        pdf.extend_from_slice(b"0000000000 65535 f \r\n");
        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj1_offset).as_bytes());
        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj2_offset).as_bytes());
        pdf.extend_from_slice(format!("{:010} 00000 n \r\n", obj3_offset).as_bytes());

        // Trailer
        pdf.extend_from_slice(b"trailer\n<< /Size 4 /Root 1 0 R >>\n");
        pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());

        pdf
    }

    #[test]
    fn test_open_minimal_pdf() {
        let data = build_minimal_pdf();
        let doc = PdfDocument::from_bytes(data).unwrap();

        assert_eq!(doc.version, (1, 4));
        assert!(doc.object_count() > 0);
        assert!(!doc.is_encrypted());

        // Resolve catalog
        let catalog_ref = doc.catalog_ref().unwrap().clone();
        let catalog = doc.resolve(&catalog_ref).unwrap();
        match &catalog {
            PdfObject::Dict(d) => {
                assert_eq!(d.get_name(b"Type"), Some(b"Catalog".as_slice()));
            }
            _ => panic!("expected dict for catalog"),
        }
    }

    #[test]
    fn test_not_pdf() {
        let result = PdfDocument::from_bytes(b"Hello World, not a PDF".to_vec());
        assert!(result.is_err());
    }

    #[test]
    fn test_empty_file() {
        let result = PdfDocument::from_bytes(vec![]);
        assert!(result.is_err());
    }

    #[test]
    fn test_truncated_pdf() {
        let result = PdfDocument::from_bytes(b"%PDF-1.4\n".to_vec());
        assert!(result.is_err());
    }

    #[test]
    fn test_object_not_found() {
        let data = build_minimal_pdf();
        let doc = PdfDocument::from_bytes(data).unwrap();
        let result = doc.resolve(&IndirectRef {
            obj_num: 999,
            gen_num: 0,
        });
        assert!(result.is_err());
    }

    #[test]
    fn test_unencrypted_pdf_is_authenticated() {
        let data = build_minimal_pdf();
        let doc = PdfDocument::from_bytes(data).unwrap();
        assert!(!doc.is_encrypted());
        assert!(doc.is_authenticated());
    }

    // -----------------------------------------------------------------------
    // LRU cache tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_lru_cache_insert_and_get() {
        let mut cache = LruCache::new(3);
        cache.insert("a", 1);
        cache.insert("b", 2);
        cache.insert("c", 3);
        assert_eq!(cache.len(), 3);
        assert_eq!(cache.get(&"a"), Some(&1));
        assert_eq!(cache.get(&"b"), Some(&2));
        assert_eq!(cache.get(&"c"), Some(&3));
    }

    #[test]
    fn test_lru_cache_eviction() {
        let mut cache = LruCache::new(3);
        cache.insert("a", 1);
        cache.insert("b", 2);
        cache.insert("c", 3);
        // Cache is full. Inserting a 4th should evict the LRU ("a").
        cache.insert("d", 4);
        assert_eq!(cache.len(), 3);
        assert_eq!(cache.get(&"a"), None); // evicted
        assert_eq!(cache.get(&"b"), Some(&2));
        assert_eq!(cache.get(&"c"), Some(&3));
        assert_eq!(cache.get(&"d"), Some(&4));
    }

    #[test]
    fn test_lru_cache_access_promotes() {
        let mut cache = LruCache::new(3);
        cache.insert("a", 1);
        cache.insert("b", 2);
        cache.insert("c", 3);
        // Access "a" to promote it — now "b" is the LRU.
        assert_eq!(cache.get(&"a"), Some(&1));
        cache.insert("d", 4);
        assert_eq!(cache.get(&"b"), None); // "b" was evicted, not "a"
        assert_eq!(cache.get(&"a"), Some(&1));
    }

    #[test]
    fn test_lru_cache_update_existing() {
        let mut cache = LruCache::new(3);
        cache.insert("a", 1);
        cache.insert("a", 10);
        assert_eq!(cache.len(), 1);
        assert_eq!(cache.get(&"a"), Some(&10));
    }

    #[test]
    fn test_lru_cache_clear() {
        let mut cache = LruCache::new(3);
        cache.insert("a", 1);
        cache.insert("b", 2);
        cache.clear();
        assert_eq!(cache.len(), 0);
        assert_eq!(cache.get(&"a"), None);
    }

    #[test]
    fn test_lru_cache_set_capacity_shrinks() {
        let mut cache = LruCache::new(5);
        for i in 0..5 {
            cache.insert(i, i * 10);
        }
        assert_eq!(cache.len(), 5);
        // Shrink capacity — should evict the 3 LRU entries (0, 1, 2).
        cache.set_capacity(2);
        assert_eq!(cache.len(), 2);
        assert_eq!(cache.get(&0), None);
        assert_eq!(cache.get(&1), None);
        assert_eq!(cache.get(&2), None);
        // Both of the two most recent entries must survive, not just one.
        assert_eq!(cache.get(&3), Some(&30));
        assert_eq!(cache.get(&4), Some(&40));
    }

    // -----------------------------------------------------------------------
    // PdfDocument cache integration tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_set_cache_capacity() {
        let data = build_minimal_pdf();
        let mut doc = PdfDocument::from_bytes(data).unwrap();

        // Resolve all 3 objects to fill the cache.
        for obj_num in 1..=3u32 {
            let iref = IndirectRef { obj_num, gen_num: 0 };
            doc.resolve(&iref).unwrap();
        }
        assert_eq!(doc.cached_object_count(), 3);

        // Shrink capacity to 1 — should evict 2 entries.
        doc.set_cache_capacity(1);
        assert_eq!(doc.cached_object_count(), 1);
    }

    #[test]
    fn test_lru_cache_hit_miss_on_document() {
        let data = build_minimal_pdf();
        let mut doc = PdfDocument::from_bytes(data).unwrap();
        doc.set_cache_capacity(2);

        let ref1 = IndirectRef { obj_num: 1, gen_num: 0 };
        let ref2 = IndirectRef { obj_num: 2, gen_num: 0 };
        let ref3 = IndirectRef { obj_num: 3, gen_num: 0 };

        // Resolve 1 and 2 — both cached.
        doc.resolve(&ref1).unwrap();
        doc.resolve(&ref2).unwrap();
        assert_eq!(doc.cached_object_count(), 2);

        // Resolving 3 should evict ref1 (LRU).
        doc.resolve(&ref3).unwrap();
        assert_eq!(doc.cached_object_count(), 2);
        assert!(!doc.objects.read().unwrap().contains_key(&ref1));
        assert!(doc.objects.read().unwrap().contains_key(&ref2));
        assert!(doc.objects.read().unwrap().contains_key(&ref3));

        // Re-resolving ref1 should work (re-parsed from data).
        doc.resolve(&ref1).unwrap();
        assert!(doc.objects.read().unwrap().contains_key(&ref1));
    }

    #[test]
    fn test_object_stream_caching() {
        let data = build_minimal_pdf();
        let doc = PdfDocument::from_bytes(data).unwrap();
        // The minimal PDF uses normal (non-compressed) objects, so the
        // decoded_obj_streams cache should be empty.
        assert_eq!(doc.decoded_obj_streams.read().unwrap().len(), 0);

        // Verify the cache exists and is functional by inserting directly.
        doc.decoded_obj_streams.write().unwrap().insert(42, vec![1, 2, 3]);
        assert!(doc.decoded_obj_streams.read().unwrap().contains_key(&42));
        assert_eq!(
            doc.decoded_obj_streams.read().unwrap().get(&42).unwrap(),
            &[1, 2, 3]
        );

        // Authentication clear should also clear the stream cache.
        doc.decoded_obj_streams.write().unwrap().insert(99, vec![4, 5, 6]);
        // Simulate what authenticate() does:
        doc.objects.write().unwrap().clear();
        doc.decoded_obj_streams.write().unwrap().clear();
        assert_eq!(doc.decoded_obj_streams.read().unwrap().len(), 0);
    }

    #[cfg(feature = "mmap")]
    #[test]
    fn test_mmap_truncated_file() {
        use std::io::Write;
        let dir = std::env::temp_dir();
        let path = dir.join("justpdf_mmap_truncated.pdf");
        {
            let mut f = std::fs::File::create(&path).unwrap();
            // Write just the PDF header, not a complete PDF
            f.write_all(b"%PDF-1.4\n").unwrap();
        }
        let result = PdfDocument::open_mmap(&path);
        // Should be an error, not a panic
        assert!(result.is_err());
        let _ = std::fs::remove_file(&path);
    }

    #[cfg(feature = "mmap")]
    #[test]
    fn test_mmap_empty_file() {
        let dir = std::env::temp_dir();
        let path = dir.join("justpdf_mmap_empty.pdf");
        {
            std::fs::File::create(&path).unwrap();
        }
        let result = PdfDocument::open_mmap(&path);
        // Should be an error, not a panic
        assert!(result.is_err());
        let _ = std::fs::remove_file(&path);
    }

    #[cfg(feature = "mmap")]
    #[test]
    fn test_open_mmap() {
        use std::io::Write;
        // Write a minimal PDF to a temp file and open with mmap.
        let data = build_minimal_pdf();
        let dir = std::env::temp_dir();
        let path = dir.join("justpdf_mmap_test.pdf");
        {
            let mut f = std::fs::File::create(&path).unwrap();
            f.write_all(&data).unwrap();
        }
        let doc = PdfDocument::open_mmap(&path).unwrap();
        assert_eq!(doc.version, (1, 4));
        assert!(!doc.is_encrypted());

        let catalog_ref = doc.catalog_ref().unwrap().clone();
        let catalog = doc.resolve(&catalog_ref).unwrap();
        match &catalog {
            PdfObject::Dict(d) => {
                assert_eq!(d.get_name(b"Type"), Some(b"Catalog".as_slice()));
            }
            _ => panic!("expected dict for catalog"),
        }

        // Clean up.
        let _ = std::fs::remove_file(&path);
    }
}