Skip to main content

pdf_syntax/
xref.rs

1//! Reading and querying the xref table of a PDF file.
2
3use crate::crypto::{DecryptionError, DecryptionTarget, Decryptor, get};
4use crate::data::Data;
5use crate::metadata::Metadata;
6use crate::object::Name;
7use crate::object::ObjectIdentifier;
8use crate::object::Stream;
9use crate::object::dict::keys::{
10    AUTHOR, CREATION_DATE, CREATOR, ENCRYPT, FIRST, ID, INDEX, INFO, KEYWORDS, MOD_DATE, N,
11    OCPROPERTIES, PAGES, PREV, PRODUCER, ROOT, SIZE, SUBJECT, TITLE, TYPE, VERSION, W, XREF_STM,
12};
13use crate::object::indirect::IndirectObject;
14use crate::object::{Array, MaybeRef};
15use crate::object::{DateTime, Dict};
16use crate::object::{Object, ObjectLike};
17use crate::pdf::PdfVersion;
18use crate::reader::Reader;
19use crate::reader::{Readable, ReaderContext, ReaderExt};
20use crate::sync::{Arc, FxHashMap, RwLock, RwLockExt};
21use crate::{PdfData, object};
22use alloc::vec;
23use alloc::vec::Vec;
24use core::cmp::max;
25use core::iter;
26use core::ops::Deref;
27use log::{error, warn};
28
/// Number of bytes read for a single entry of a classic xref table before the bytes
/// are handed to `XRefEntry::read`.
pub(crate) const XREF_ENTRY_LEN: usize = 20;
30
/// Errors reported while building an [`XRef`].
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefError {
    /// Catch-all used whenever the xref position, the xref data, the trailer or a
    /// required trailer/catalog entry could not be read.
    Unknown,
    /// Failure related to decryption; wraps the underlying [`DecryptionError`].
    Encryption(DecryptionError),
}
36
37/// Parse the "root" xref from the PDF.
38pub(crate) fn root_xref(data: PdfData, password: &[u8]) -> Result<XRef, XRefError> {
39    let mut xref_map = FxHashMap::default();
40    let xref_pos = find_last_xref_pos(data.as_ref()).ok_or(XRefError::Unknown)?;
41    let trailer =
42        populate_xref_impl(data.as_ref(), xref_pos, &mut xref_map).ok_or(XRefError::Unknown)?;
43
44    XRef::new(
45        data.clone(),
46        xref_map,
47        XRefInput::TrailerDictData(trailer),
48        false,
49        password,
50    )
51}
52
53/// Try to manually parse the PDF to build an xref table and trailer dictionary.
54pub(crate) fn fallback(data: PdfData, password: &[u8]) -> Option<XRef> {
55    warn!("xref table was invalid, trying to manually build xref table");
56    let (xref_map, xref_input) = fallback_xref_map(&data, password);
57
58    if let Some(xref_input) = xref_input {
59        warn!("rebuild xref table with {} entries", xref_map.len());
60
61        XRef::new(data.clone(), xref_map, xref_input, true, password).ok()
62    } else {
63        warn!("couldn't find trailer dictionary, failed to rebuild xref table");
64
65        None
66    }
67}
68
/// Rebuild the xref map by scanning the whole file.
///
/// Starts `fallback_xref_map_inner` with a dummy reader context and with recursion
/// enabled, so that a second, decrypting pass can be made if the chosen trailer
/// dictionary has an `Encrypt` entry.
fn fallback_xref_map<'a>(data: &'a PdfData, password: &[u8]) -> (XrefMap, Option<XRefInput<'a>>) {
    fallback_xref_map_inner(data, ReaderContext::dummy(), true, password)
}
72
73fn fallback_xref_map_inner<'a>(
74    data: &'a PdfData,
75    mut dummy_ctx: ReaderContext<'a>,
76    recurse: bool,
77    password: &[u8],
78) -> (XrefMap, Option<XRefInput<'a>>) {
79    let mut xref_map = FxHashMap::default();
80    let mut trailer_dicts = vec![];
81    let mut root_ref = None;
82
83    let mut r = Reader::new(data.as_ref());
84
85    let mut last_obj_num = None;
86
87    loop {
88        let cur_pos = r.offset();
89
90        let mut old_r = r.clone();
91
92        if let Some(obj_id) = r.read::<ObjectIdentifier>(&dummy_ctx) {
93            let mut cloned = r.clone();
94            // Check that the object following it is actually valid before inserting it.
95            cloned.skip_white_spaces_and_comments();
96            if cloned.skip::<Object<'_>>(false).is_some() {
97                xref_map.insert(obj_id, EntryType::Normal(cur_pos));
98                last_obj_num = Some(obj_id);
99                dummy_ctx.set_obj_number(obj_id);
100            }
101        } else if let Some(dict) = r.read::<Dict<'_>>(&dummy_ctx) {
102            if dict.contains_key(ROOT) {
103                trailer_dicts.push(dict.clone());
104            }
105
106            if dict
107                .get::<Name>(TYPE)
108                .is_some_and(|n| n.as_str() == "Catalog")
109            {
110                root_ref = last_obj_num;
111            }
112
113            if let Some(stream) = old_r.read::<Stream<'_>>(&dummy_ctx)
114                && stream.dict().get::<Name>(TYPE).as_deref() == Some(b"ObjStm")
115                && let Some(data) = stream.decoded().ok()
116                && let Some(last_obj_num) = last_obj_num
117                && let Some(obj_stream) = ObjectStream::new(stream, &data, &dummy_ctx)
118            {
119                for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
120                    let id = ObjectIdentifier::new(*obj_num as i32, 0);
121                    // If we already found an entry for that object number that was not
122                    // inside an object stream. Somewhat arbitrary and maybe
123                    // we can do better, but that seems to work for the current
124                    // set of tests.
125                    if xref_map
126                        .get(&id)
127                        .is_none_or(|e| !matches!(e, &EntryType::Normal(_)))
128                    {
129                        xref_map.insert(
130                            id,
131                            EntryType::ObjStream(last_obj_num.obj_number as u32, idx as u32),
132                        );
133                    }
134                }
135            }
136        } else {
137            r.read_byte();
138        }
139
140        if r.at_end() {
141            break;
142        }
143    }
144
145    // Try to choose the right trailer dict by doing basic validation.
146    let mut trailer_dict = None;
147
148    for dict in trailer_dicts {
149        if let Some(root_id) = dict.get_raw::<Dict<'_>>(ROOT) {
150            let check = |dict: &Dict<'_>| -> bool { dict.contains_key(PAGES) };
151
152            match root_id {
153                MaybeRef::Ref(r) => match xref_map.get(&r.into()) {
154                    Some(EntryType::Normal(offset)) => {
155                        let mut reader = Reader::new(&data.as_ref()[*offset..]);
156
157                        if let Some(obj) =
158                            reader.read_with_context::<IndirectObject<Dict<'_>>>(&dummy_ctx)
159                            && check(&obj.clone().get())
160                        {
161                            trailer_dict = Some(dict);
162                        }
163                    }
164                    Some(EntryType::ObjStream(obj_num, idx)) => {
165                        if let Some(EntryType::Normal(offset)) =
166                            xref_map.get(&ObjectIdentifier::new(*obj_num as i32, 0))
167                        {
168                            let mut reader = Reader::new(&data.as_ref()[*offset..]);
169
170                            if let Some(stream) =
171                                reader.read_with_context::<IndirectObject<Stream<'_>>>(&dummy_ctx)
172                                && let Some(data) = stream.clone().get().decoded().ok()
173                                && let Some(object_stream) =
174                                    ObjectStream::new(stream.get(), &data, &dummy_ctx)
175                                && let Some(obj) = object_stream.get::<Dict<'_>>(*idx)
176                                && check(&obj)
177                            {
178                                trailer_dict = Some(dict);
179                            }
180                        }
181                    }
182                    _ => {}
183                },
184                MaybeRef::NotRef(d) => {
185                    if check(&d) {
186                        trailer_dict = Some(dict);
187                    }
188                }
189            }
190        }
191    }
192
193    let has_encryption = trailer_dict
194        .as_ref()
195        .is_some_and(|t| t.contains_key(ENCRYPT));
196
197    if has_encryption && recurse {
198        // The problem is that in this case, we have used a dummy reader context which does not have
199        // a decryptor. Therefore, we were unable to decrypt any of the object streams and missed
200        // all objects that are inside of such a stream. Therefore, we need to redo the process
201        // using a `ReaderContext` that does have the ability to decrypt.
202        if let Some(Ok(xref)) = trailer_dict.as_ref().map(|d| {
203            XRef::new(
204                data.clone(),
205                xref_map.clone(),
206                XRefInput::TrailerDictData(d.data()),
207                true,
208                password,
209            )
210        }) {
211            let ctx = ReaderContext::new(&xref, false);
212            let (patched_map, _) = fallback_xref_map_inner(data, ctx, false, password);
213            xref_map = patched_map;
214        }
215    }
216
217    if let Some(trailer_dict_data) = trailer_dict.map(|d| d.data()) {
218        (
219            xref_map,
220            Some(XRefInput::TrailerDictData(trailer_dict_data)),
221        )
222    } else if let Some(root_ref) = root_ref {
223        (xref_map, Some(XRefInput::RootRef(root_ref)))
224    } else {
225        (xref_map, None)
226    }
227}
228
/// The shared entry-less xref handed out by [`XRef::dummy`].
const DUMMY_XREF: XRef = XRef(Inner::Dummy);
230
/// An xref table.
///
/// Either a dummy without any entries or a reference-counted handle to the parsed
/// table, so cloning an `XRef` does not copy the table itself.
#[derive(Debug, Clone)]
pub struct XRef(Inner);
234
235impl XRef {
236    fn new(
237        data: PdfData,
238        xref_map: XrefMap,
239        input: XRefInput<'_>,
240        repaired: bool,
241        password: &[u8],
242    ) -> Result<Self, XRefError> {
243        // This is a bit hacky, but the problem is we can't read the resolved trailer dictionary
244        // before we actually created the xref struct. So we first create it using dummy data
245        // and then populate the data.
246        let trailer_data = TrailerData::dummy();
247
248        let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
249            data: Arc::new(Data::new(data)),
250            map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
251            decryptor: Arc::new(Decryptor::None),
252            has_ocgs: false,
253            metadata: Arc::new(Metadata::default()),
254            trailer_data,
255            password: password.to_vec(),
256        })));
257
258        // We read the trailer twice, once to determine the encryption used and then a second
259        // time to resolve the catalog dictionary, etc. This allows us to support catalog dictionaries
260        // that are stored in an encrypted object stream.
261
262        let decryptor = {
263            match input {
264                XRefInput::TrailerDictData(trailer_dict_data) => {
265                    let mut r = Reader::new(trailer_dict_data);
266
267                    let trailer_dict = r
268                        .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
269                        .ok_or(XRefError::Unknown)?;
270
271                    get_decryptor(&trailer_dict, password)?
272                }
273                XRefInput::RootRef(_) => Decryptor::None,
274            }
275        };
276
277        match &mut xref.0 {
278            Inner::Dummy => unreachable!(),
279            Inner::Some(r) => {
280                let mutable = Arc::make_mut(r);
281                mutable.decryptor = Arc::new(decryptor.clone());
282            }
283        }
284
285        let (trailer_data, has_ocgs, metadata) = match input {
286            XRefInput::TrailerDictData(trailer_dict_data) => {
287                let mut r = Reader::new(trailer_dict_data);
288
289                let trailer_dict = r
290                    .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
291                    .ok_or(XRefError::Unknown)?;
292
293                let root_ref = trailer_dict.get_ref(ROOT).ok_or(XRefError::Unknown)?;
294                let root = trailer_dict
295                    .get::<Dict<'_>>(ROOT)
296                    .ok_or(XRefError::Unknown)?;
297                let metadata = trailer_dict
298                    .get::<Dict<'_>>(INFO)
299                    .map(|d| parse_metadata(&d))
300                    .unwrap_or_default();
301                let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
302                let has_ocgs = root.get::<Dict<'_>>(OCPROPERTIES).is_some();
303                let version = root
304                    .get::<Name>(VERSION)
305                    .and_then(|v| PdfVersion::from_bytes(v.deref()));
306
307                let td = TrailerData {
308                    pages_ref: pages_ref.into(),
309                    root_ref: root_ref.into(),
310                    version,
311                };
312
313                (td, has_ocgs, metadata)
314            }
315            XRefInput::RootRef(root_ref) => {
316                let root = xref.get::<Dict<'_>>(root_ref).ok_or(XRefError::Unknown)?;
317                let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
318
319                let td = TrailerData {
320                    pages_ref: pages_ref.into(),
321                    root_ref,
322                    version: None,
323                };
324
325                (td, false, Metadata::default())
326            }
327        };
328
329        match &mut xref.0 {
330            Inner::Dummy => unreachable!(),
331            Inner::Some(r) => {
332                let mutable = Arc::make_mut(r);
333                mutable.trailer_data = trailer_data;
334                mutable.decryptor = Arc::new(decryptor);
335                mutable.has_ocgs = has_ocgs;
336                mutable.metadata = Arc::new(metadata);
337            }
338        }
339
340        Ok(xref)
341    }
342
343    fn is_repaired(&self) -> bool {
344        match &self.0 {
345            Inner::Dummy => false,
346            Inner::Some(r) => {
347                let locked = r.map.get();
348                locked.repaired
349            }
350        }
351    }
352
    /// Return a reference to the shared dummy xref, which has no entries.
    pub(crate) fn dummy() -> &'static Self {
        &DUMMY_XREF
    }
356
357    pub(crate) fn len(&self) -> usize {
358        match &self.0 {
359            Inner::Dummy => 0,
360            Inner::Some(r) => r.map.get().xref_map.len(),
361        }
362    }
363
364    pub(crate) fn trailer_data(&self) -> &TrailerData {
365        match &self.0 {
366            Inner::Dummy => unreachable!(),
367            Inner::Some(r) => &r.trailer_data,
368        }
369    }
370
371    pub(crate) fn metadata(&self) -> &Metadata {
372        match &self.0 {
373            Inner::Dummy => unreachable!(),
374            Inner::Some(r) => &r.metadata,
375        }
376    }
377
    /// Return the object ID of the root dictionary.
    ///
    /// # Panics
    ///
    /// Panics when called on the dummy xref, because it reads the trailer data.
    pub fn root_id(&self) -> ObjectIdentifier {
        self.trailer_data().root_ref
    }
382
383    /// Whether the PDF has optional content groups.
384    pub fn has_optional_content_groups(&self) -> bool {
385        match &self.0 {
386            Inner::Dummy => false,
387            Inner::Some(r) => r.has_ocgs,
388        }
389    }
390
391    pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
392        match &self.0 {
393            Inner::Dummy => unimplemented!(),
394            Inner::Some(r) => {
395                let locked = r.map.get();
396                let mut elements = locked
397                    .xref_map
398                    .iter()
399                    .map(|(id, e)| {
400                        let offset = match e {
401                            EntryType::Normal(o) => (*o, 0),
402                            EntryType::ObjStream(id, index) => {
403                                if let Some(EntryType::Normal(offset)) =
404                                    locked.xref_map.get(&ObjectIdentifier::new(*id as i32, 0))
405                                {
406                                    (*offset, *index)
407                                } else {
408                                    (usize::MAX, 0)
409                                }
410                            }
411                        };
412
413                        (*id, offset)
414                    })
415                    .collect::<Vec<_>>();
416
417                // Try to yield in the order the objects appeared in the
418                // PDF.
419                elements.sort_by(|e1, e2| e1.1.cmp(&e2.1));
420
421                let mut iter = elements.into_iter();
422
423                iter::from_fn(move || {
424                    for next in iter.by_ref() {
425                        if let Some(obj) = self.get_with(next.0, &ReaderContext::new(self, false)) {
426                            return Some(obj);
427                        } else {
428                            // Skip invalid objects.
429                            continue;
430                        }
431                    }
432
433                    None
434                })
435            }
436        }
437    }
438
439    pub(crate) fn repair(&self) {
440        let Inner::Some(r) = &self.0 else {
441            unreachable!();
442        };
443
444        let mut locked = r
445            .map
446            .try_put()
447            .expect("xref repair: map lock not contended");
448        assert!(!locked.repaired);
449
450        let (xref_map, _) = fallback_xref_map(r.data.get(), &r.password);
451        locked.xref_map = xref_map;
452        locked.repaired = true;
453    }
454
455    #[inline]
456    pub(crate) fn needs_decryption(&self, ctx: &ReaderContext<'_>) -> bool {
457        match &self.0 {
458            Inner::Dummy => false,
459            Inner::Some(r) => {
460                if matches!(r.decryptor.as_ref(), Decryptor::None) {
461                    false
462                } else {
463                    !ctx.in_content_stream() && !ctx.in_object_stream()
464                }
465            }
466        }
467    }
468
469    #[inline]
470    pub(crate) fn decrypt(
471        &self,
472        id: ObjectIdentifier,
473        data: &[u8],
474        target: DecryptionTarget,
475    ) -> Option<Vec<u8>> {
476        match &self.0 {
477            Inner::Dummy => Some(data.to_vec()),
478            Inner::Some(r) => r.decryptor.decrypt(id, data, target),
479        }
480    }
481
482    /// Return the object with the given identifier.
483    #[allow(private_bounds)]
484    pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
485    where
486        T: ObjectLike<'a>,
487    {
488        let ctx = ReaderContext::new(self, false);
489        self.get_with(id, &ctx)
490    }
491
492    /// Return the object with the given identifier.
493    #[allow(private_bounds)]
494    pub(crate) fn get_with<'a, T>(
495        &'a self,
496        id: ObjectIdentifier,
497        ctx: &ReaderContext<'a>,
498    ) -> Option<T>
499    where
500        T: ObjectLike<'a>,
501    {
502        let Inner::Some(repr) = &self.0 else {
503            return None;
504        };
505
506        let locked = repr.map.try_get()?;
507
508        let mut r = Reader::new(repr.data.get().as_ref());
509
510        let entry = *locked.xref_map.get(&id).or({
511            // An indirect reference to an undefined object shall not be considered an error by a PDF processor; it
512            // shall be treated as a reference to the null object.
513            None
514        })?;
515        drop(locked);
516
517        let mut ctx = ctx.clone();
518        ctx.set_obj_number(id);
519        ctx.set_in_content_stream(false);
520
521        match entry {
522            EntryType::Normal(offset) => {
523                ctx.set_in_object_stream(false);
524                r.jump(offset);
525
526                if let Some(object) = r.read_with_context::<IndirectObject<T>>(&ctx) {
527                    if object.id() == &id {
528                        return Some(object.get());
529                    }
530                } else {
531                    // There is a valid object at the offset, it's just not of the type the caller
532                    // expected, which is fine.
533                    if r.skip_not_in_content_stream::<IndirectObject<Object<'_>>>()
534                        .is_some()
535                    {
536                        return None;
537                    }
538                };
539
540                // The xref table is broken, try to repair if not already repaired.
541                if self.is_repaired() {
542                    error!(
543                        "attempt was made at repairing xref, but object {id:?} still couldn't be read"
544                    );
545
546                    None
547                } else {
548                    warn!("broken xref, attempting to repair");
549
550                    self.repair();
551
552                    // Now try reading again.
553                    self.get_with::<T>(id, &ctx)
554                }
555            }
556            EntryType::ObjStream(obj_stram_gen_num, index) => {
557                // Generation number is implicitly 0.
558                let obj_stream_id = ObjectIdentifier::new(obj_stram_gen_num as i32, 0);
559
560                if obj_stream_id == id {
561                    warn!("cycle detected in object stream");
562
563                    return None;
564                }
565
566                let stream = self.get_with::<Stream<'_>>(obj_stream_id, &ctx)?;
567                let data = repr.data.get_with(obj_stream_id, &ctx)?;
568                let object_stream = ObjectStream::new(stream, data, &ctx)?;
569                object_stream.get(index)
570            }
571        }
572    }
573}
574
/// An input that is passed to the xref constructor so that we can fully resolve
/// the PDF.
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefInput<'a> {
    /// This option is going to be used in 99.999% of cases. It contains the
    /// raw data of the trailer dictionary, which is then going to be processed.
    TrailerDictData(&'a [u8]),
    /// In case the trailer dictionary could not be read (for example because
    /// it is cut off), we just pass the object ID of the root dictionary,
    /// if we have found one, and try our best to build the PDF just
    /// with the information we have there.
    ///
    /// Note that this won't work if the document is encrypted, as we
    /// can't access the crypto dictionary.
    RootRef(ObjectIdentifier),
}
591
592pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
593    let mut finder = Reader::new(data);
594    let mut pos = finder.len().checked_sub(1)?;
595    finder.jump(pos);
596
597    let needle = b"startxref";
598
599    loop {
600        if finder.forward_tag(needle).is_some() {
601            finder.skip_white_spaces_and_comments();
602
603            let offset = finder.read_without_context::<i32>()?.try_into().ok()?;
604
605            return Some(offset);
606        }
607
608        pos = pos.checked_sub(1)?;
609        finder.jump(pos);
610    }
611}
612
/// A type of xref entry, i.e. where the object with a given identifier can be found.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum EntryType {
    /// An indirect object that is at a specific offset in the original data.
    Normal(usize),
    /// An indirect object that is part of an object stream. First number indicates the object
    /// number of the _object stream_ (the generation number is always 0), the second number indicates
    /// the index in the object stream.
    ObjStream(u32, u32),
}
623
/// Maps each known object identifier to the place where that object is stored.
type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
625
/// Representation of a proper xref table.
#[derive(Debug)]
struct MapRepr {
    /// The entries of the table.
    xref_map: XrefMap,
    /// Whether `xref_map` was produced by scanning the file instead of reading its xref data.
    repaired: bool,
}
632
/// Information resolved from the trailer and the catalog while constructing an `XRef`.
#[derive(Debug, Copy, Clone)]
pub(crate) struct TrailerData {
    /// Reference stored under the catalog's `Pages` key.
    pub(crate) pages_ref: ObjectIdentifier,
    /// Reference to the catalog (root) dictionary.
    pub(crate) root_ref: ObjectIdentifier,
    /// Version read from the catalog's `Version` entry, if present and parseable.
    pub(crate) version: Option<PdfVersion>,
}
639
640impl TrailerData {
641    pub(crate) fn dummy() -> Self {
642        Self {
643            pages_ref: ObjectIdentifier::new(0, 0),
644            root_ref: ObjectIdentifier::new(0, 0),
645            version: None,
646        }
647    }
648}
649
/// The state behind a non-dummy `XRef`.
#[derive(Debug, Clone)]
struct SomeRepr {
    /// The underlying PDF data.
    data: Arc<Data>,
    /// The xref map together with its "repaired" flag; replaced in place by a repair.
    map: Arc<RwLock<MapRepr>>,
    /// Document metadata resolved while constructing the xref.
    metadata: Arc<Metadata>,
    /// The decryptor to use for this document (`Decryptor::None` if there is none).
    decryptor: Arc<Decryptor>,
    /// Whether the catalog has an `OCProperties` dictionary.
    has_ocgs: bool,
    /// The password the xref was created with; reused when the map is rebuilt.
    password: Vec<u8>,
    /// Root/pages references and version resolved from the trailer.
    trailer_data: TrailerData,
}
660
/// The two possible states of an `XRef`.
#[derive(Debug, Clone)]
enum Inner {
    /// A dummy xref table that doesn't have any entries.
    Dummy,
    /// A proper xref table.
    Some(Arc<SomeRepr>),
}
668
/// A single parsed entry of a classic xref table.
#[derive(Debug)]
struct XRefEntry {
    /// Value of the first (10-digit) field of the entry.
    offset: usize,
    /// Value of the second (5-digit) field of the entry.
    gen_number: i32,
    /// Whether the entry is marked with `n` (in use).
    used: bool,
}

impl XRefEntry {
    /// Parse an xref table entry from its raw bytes.
    ///
    /// The first ten bytes are read as the decimal offset, bytes 11 to 15 as the decimal
    /// generation number, and byte 17 as the in-use marker. Returns `None` if `data` is
    /// too short, if either number contains a non-digit, or if a number does not fit its
    /// target type.
    pub(crate) fn read(data: &[u8]) -> Option<Self> {
        #[inline(always)]
        fn parse_u32(data: &[u8]) -> Option<u32> {
            data.iter().try_fold(0_u32, |accum, &byte| match byte {
                b'0'..=b'9' => accum.checked_mul(10)?.checked_add(u32::from(byte - b'0')),
                _ => None,
            })
        }

        // Checked slicing: a truncated entry is reported as `None` instead of panicking.
        let offset = parse_u32(data.get(0..10)?)? as usize;
        let gen_number = i32::try_from(parse_u32(data.get(11..16)?)?).ok()?;

        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
706
/// Maximum depth for following xref `Prev`/`XRefStm` chains, to prevent stack
/// overflow on circular or deeply chained xref tables. Traversal is aborted once
/// the depth exceeds this value.
const MAX_XREF_CHAIN_DEPTH: usize = 64;
710
/// Fill `xref_map` from the xref data found at `pos`, starting the chain at depth 0.
///
/// Returns whatever `populate_xref_depth` returns: the raw trailer dictionary data on
/// success, `None` on failure.
fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
    populate_xref_depth(data, pos, xref_map, 0)
}
714
715fn populate_xref_depth<'a>(
716    data: &'a [u8],
717    pos: usize,
718    xref_map: &mut XrefMap,
719    depth: usize,
720) -> Option<&'a [u8]> {
721    if depth > MAX_XREF_CHAIN_DEPTH {
722        log::warn!("Xref chain depth exceeds {MAX_XREF_CHAIN_DEPTH}, stopping traversal");
723        return None;
724    }
725    let mut reader = Reader::new(data);
726    reader.jump(pos);
727    // In case the position points to before the object number of a xref stream.
728    reader.skip_white_spaces_and_comments();
729
730    let mut r2 = reader.clone();
731    if reader
732        .clone()
733        .read_without_context::<ObjectIdentifier>()
734        .is_some()
735    {
736        populate_from_xref_stream(data, &mut r2, xref_map, depth)
737    } else {
738        populate_from_xref_table(data, &mut r2, xref_map, depth)
739    }
740}
741
/// Header of one subsection of a classic xref table.
pub(super) struct SubsectionHeader {
    /// Object number of the first entry in the subsection.
    pub(super) start: u32,
    /// Number of entries in the subsection.
    pub(super) num_entries: u32,
}
746
impl Readable<'_> for SubsectionHeader {
    /// Read two unsigned integers, with white space allowed before, between and after
    /// them. The reader context is not used.
    fn read(r: &mut Reader<'_>, _: &ReaderContext<'_>) -> Option<Self> {
        r.skip_white_spaces();
        let start = r.read_without_context::<u32>()?;
        r.skip_white_spaces();
        let num_entries = r.read_without_context::<u32>()?;
        r.skip_white_spaces();

        Some(Self { start, num_entries })
    }
}
758
759/// Populate the xref table, and return the trailer dict.
760fn populate_from_xref_table<'a>(
761    data: &'a [u8],
762    reader: &mut Reader<'a>,
763    insert_map: &mut XrefMap,
764    depth: usize,
765) -> Option<&'a [u8]> {
766    let trailer = {
767        let mut reader = reader.clone();
768        read_xref_table_trailer(&mut reader, &ReaderContext::dummy())?
769    };
770
771    reader.skip_white_spaces();
772    reader.forward_tag(b"xref")?;
773    reader.skip_white_spaces();
774
775    let mut max_obj = 0;
776
777    if let Some(prev) = trailer.get::<i32>(PREV) {
778        // First insert the entries from any previous xref tables.
779        populate_xref_depth(data, prev as usize, insert_map, depth + 1)?;
780    }
781
782    // In hybrid files, entries in `XRefStm` should have higher priority, therefore we insert them
783    // after looking at `PREV`.
784    if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
785        populate_xref_depth(data, xref_stm as usize, insert_map, depth + 1)?;
786    }
787
788    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
789        reader.skip_white_spaces();
790
791        let start = header.start;
792        let end = start + header.num_entries;
793
794        for obj_number in start..end {
795            max_obj = max(max_obj, obj_number);
796            let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
797            let entry = XRefEntry::read(bytes)?;
798
799            // Specification says we should ignore any object number > SIZE, but probably
800            // not important?
801            if entry.used {
802                insert_map.insert(
803                    ObjectIdentifier::new(obj_number as i32, entry.gen_number),
804                    EntryType::Normal(entry.offset),
805                );
806            }
807        }
808    }
809
810    Some(trailer.data())
811}
812
813fn populate_from_xref_stream<'a>(
814    data: &'a [u8],
815    reader: &mut Reader<'a>,
816    insert_map: &mut XrefMap,
817    depth: usize,
818) -> Option<&'a [u8]> {
819    let stream = reader
820        .read_with_context::<IndirectObject<Stream<'_>>>(&ReaderContext::dummy())?
821        .get();
822
823    if let Some(prev) = stream.dict().get::<i32>(PREV) {
824        // First insert the entries from any previous xref tables.
825        let _ = populate_xref_depth(data, prev as usize, insert_map, depth + 1)?;
826    }
827
828    let size = stream.dict().get::<u32>(SIZE)?;
829
830    let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
831
832    if f2_len > size_of::<u64>() as u8 {
833        error!("xref offset length is larger than the allowed limit");
834
835        return None;
836    }
837
838    // Do such files exist?
839    if f1_len != 1 {
840        warn!("first field in xref stream was longer than 1");
841    }
842
843    let xref_data = stream.decoded().ok()?;
844    let mut xref_reader = Reader::new(xref_data.as_ref());
845
846    if let Some(arr) = stream.dict().get::<Array<'_>>(INDEX) {
847        let iter = arr.iter::<(u32, u32)>();
848
849        for (start, num_elements) in iter {
850            xref_stream_subsection(
851                &mut xref_reader,
852                start,
853                num_elements,
854                f1_len,
855                f2_len,
856                f3_len,
857                insert_map,
858            )?;
859        }
860    } else {
861        xref_stream_subsection(
862            &mut xref_reader,
863            0,
864            size,
865            f1_len,
866            f2_len,
867            f3_len,
868            insert_map,
869        )?;
870    }
871
872    Some(stream.dict().data())
873}
874
875fn xref_stream_num(data: &[u8]) -> Option<u32> {
876    Some(match data.len() {
877        0 => return None,
878        1 => u8::from_be(data[0]) as u32,
879        2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
880        3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
881        4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
882        8 => {
883            if let Ok(num) = u32::try_from(u64::from_be_bytes(data[0..8].try_into().ok()?)) {
884                return Some(num);
885            } else {
886                warn!("xref stream number is too large");
887
888                return None;
889            }
890        }
891        n => {
892            warn!("invalid xref stream number {n}");
893
894            return None;
895        }
896    })
897}
898
899fn xref_stream_subsection<'a>(
900    xref_reader: &mut Reader<'a>,
901    start: u32,
902    num_elements: u32,
903    f1_len: u8,
904    f2_len: u8,
905    f3_len: u8,
906    insert_map: &mut XrefMap,
907) -> Option<()> {
908    for i in 0..num_elements {
909        let f_type = if f1_len == 0 {
910            1
911        } else {
912            // We assume a length of 1.
913            xref_reader.read_bytes(1)?[0]
914        };
915
916        let obj_number = start + i;
917
918        match f_type {
919            // We don't care about free objects.
920            0 => {
921                xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
922            }
923            1 => {
924                let offset = if f2_len > 0 {
925                    let data = xref_reader.read_bytes(f2_len as usize)?;
926                    xref_stream_num(data)?
927                } else {
928                    0
929                };
930
931                let gen_number = if f3_len > 0 {
932                    let data = xref_reader.read_bytes(f3_len as usize)?;
933                    xref_stream_num(data)?
934                } else {
935                    0
936                };
937
938                insert_map.insert(
939                    ObjectIdentifier::new(obj_number as i32, gen_number as i32),
940                    EntryType::Normal(offset as usize),
941                );
942            }
943            2 => {
944                let obj_stream_number = {
945                    let data = xref_reader.read_bytes(f2_len as usize)?;
946                    xref_stream_num(data)?
947                };
948                let gen_number = 0;
949                let index = if f3_len > 0 {
950                    let data = xref_reader.read_bytes(f3_len as usize)?;
951                    xref_stream_num(data)?
952                } else {
953                    0
954                };
955
956                insert_map.insert(
957                    ObjectIdentifier::new(obj_number as i32, gen_number),
958                    EntryType::ObjStream(obj_stream_number, index),
959                );
960            }
961            _ => {
962                warn!("xref has unknown field type {f_type}");
963
964                return None;
965            }
966        }
967    }
968
969    Some(())
970}
971
972fn read_xref_table_trailer<'a>(
973    reader: &mut Reader<'a>,
974    ctx: &ReaderContext<'a>,
975) -> Option<Dict<'a>> {
976    reader.skip_white_spaces();
977    reader.forward_tag(b"xref")?;
978    reader.skip_white_spaces();
979
980    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
981        reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
982    }
983
984    reader.skip_white_spaces();
985    reader.forward_tag(b"trailer")?;
986    reader.skip_white_spaces();
987
988    reader.read_with_context::<Dict<'_>>(ctx)
989}
990
991fn get_decryptor(trailer_dict: &Dict<'_>, password: &[u8]) -> Result<Decryptor, XRefError> {
992    if let Some(encryption_dict) = trailer_dict.get::<Dict<'_>>(ENCRYPT) {
993        let id = if let Some(id) = trailer_dict
994            .get::<Array<'_>>(ID)
995            .and_then(|a| a.flex_iter().next::<object::String>())
996        {
997            id.to_vec()
998        } else {
999            // Assume an empty ID entry.
1000            vec![]
1001        };
1002
1003        get(&encryption_dict, &id, password).map_err(XRefError::Encryption)
1004    } else {
1005        Ok(Decryptor::None)
1006    }
1007}
1008
1009struct ObjectStream<'a> {
1010    data: &'a [u8],
1011    ctx: ReaderContext<'a>,
1012    offsets: Vec<(u32, usize)>,
1013}
1014
1015impl<'a> ObjectStream<'a> {
1016    fn new(inner: Stream<'_>, data: &'a [u8], ctx: &ReaderContext<'a>) -> Option<Self> {
1017        let num_objects = inner.dict().get::<usize>(N)?;
1018        let first_offset = inner.dict().get::<usize>(FIRST)?;
1019
1020        let mut r = Reader::new(data);
1021
1022        let mut offsets = vec![];
1023
1024        for _ in 0..num_objects {
1025            r.skip_white_spaces_and_comments();
1026            // Skip object number
1027            let obj_num = r.read_without_context::<u32>()?;
1028            r.skip_white_spaces_and_comments();
1029            let relative_offset = r.read_without_context::<usize>()?;
1030            offsets.push((obj_num, first_offset + relative_offset));
1031        }
1032
1033        let mut ctx = ctx.clone();
1034        ctx.set_in_object_stream(true);
1035
1036        Some(Self { data, ctx, offsets })
1037    }
1038
1039    fn get<T>(&self, index: u32) -> Option<T>
1040    where
1041        T: ObjectLike<'a>,
1042    {
1043        let offset = self.offsets.get(index as usize)?.1;
1044        let mut r = Reader::new(self.data);
1045        r.jump(offset);
1046        r.skip_white_spaces_and_comments();
1047
1048        r.read_with_context::<T>(&self.ctx)
1049    }
1050}
1051
1052fn parse_metadata(info_dict: &Dict<'_>) -> Metadata {
1053    Metadata {
1054        creation_date: info_dict
1055            .get::<object::String>(CREATION_DATE)
1056            .and_then(|c| DateTime::from_bytes(&c)),
1057        modification_date: info_dict
1058            .get::<object::String>(MOD_DATE)
1059            .and_then(|c| DateTime::from_bytes(&c)),
1060        title: info_dict.get::<object::String>(TITLE).map(|t| t.to_vec()),
1061        author: info_dict.get::<object::String>(AUTHOR).map(|t| t.to_vec()),
1062        subject: info_dict.get::<object::String>(SUBJECT).map(|t| t.to_vec()),
1063        keywords: info_dict
1064            .get::<object::String>(KEYWORDS)
1065            .map(|t| t.to_vec()),
1066        creator: info_dict.get::<object::String>(CREATOR).map(|t| t.to_vec()),
1067        producer: info_dict
1068            .get::<object::String>(PRODUCER)
1069            .map(|t| t.to_vec()),
1070    }
1071}