Skip to main content

pdf_syntax/
xref.rs

1//! Reading and querying the xref table of a PDF file.
2
3use crate::crypto::{DecryptionError, DecryptionTarget, Decryptor, get};
4use crate::data::Data;
5use crate::metadata::Metadata;
6use crate::object::Name;
7use crate::object::ObjectIdentifier;
8use crate::object::Stream;
9use crate::object::dict::keys::{
10    AUTHOR, CREATION_DATE, CREATOR, ENCRYPT, FIRST, ID, INDEX, INFO, KEYWORDS, MOD_DATE, N,
11    OCPROPERTIES, PAGES, PREV, PRODUCER, ROOT, SIZE, SUBJECT, TITLE, TYPE, VERSION, W, XREF_STM,
12};
13use crate::object::indirect::IndirectObject;
14use crate::object::{Array, MaybeRef};
15use crate::object::{DateTime, Dict};
16use crate::object::{Object, ObjectLike};
17use crate::pdf::{PdfLoadLimits, PdfVersion};
18use crate::reader::Reader;
19use crate::reader::{Readable, ReaderContext, ReaderExt};
20use crate::sync::{Arc, FxHashMap, RwLock, RwLockExt};
21use crate::{PdfData, object};
22use alloc::vec;
23use alloc::vec::Vec;
24use core::cmp::max;
25use core::iter;
26use core::ops::Deref;
27use log::{error, warn};
28
/// Number of bytes consumed per entry when reading a subsection of a classic
/// (non-stream) xref table.
pub(crate) const XREF_ENTRY_LEN: usize = 20;
30
/// Reasons why building an [`XRef`] can fail.
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefError {
    /// The xref position, the xref data or the trailer/catalog information could not be
    /// found or parsed.
    Unknown,
    /// Carries the [`DecryptionError`] that caused the failure.
    Encryption(DecryptionError),
}
36
37/// Parse the "root" xref from the PDF.
38pub(crate) fn root_xref(
39    data: PdfData,
40    password: &[u8],
41    limits: PdfLoadLimits,
42) -> Result<XRef, XRefError> {
43    let mut xref_map = FxHashMap::default();
44    let xref_pos = find_last_xref_pos(data.as_ref()).ok_or(XRefError::Unknown)?;
45    let trailer =
46        populate_xref_impl(data.as_ref(), xref_pos, &mut xref_map).ok_or(XRefError::Unknown)?;
47
48    XRef::new(
49        data.clone(),
50        xref_map,
51        XRefInput::TrailerDictData(trailer),
52        false,
53        password,
54        limits,
55    )
56}
57
58/// Try to manually parse the PDF to build an xref table and trailer dictionary.
59pub(crate) fn fallback(data: PdfData, password: &[u8], limits: PdfLoadLimits) -> Option<XRef> {
60    warn!("xref table was invalid, trying to manually build xref table");
61    let (xref_map, xref_input) = fallback_xref_map(&data, password);
62
63    if let Some(xref_input) = xref_input {
64        warn!("rebuild xref table with {} entries", xref_map.len());
65
66        XRef::new(data.clone(), xref_map, xref_input, true, password, limits).ok()
67    } else {
68        warn!("couldn't find trailer dictionary, failed to rebuild xref table");
69
70        None
71    }
72}
73
74fn fallback_xref_map<'a>(data: &'a PdfData, password: &[u8]) -> (XrefMap, Option<XRefInput<'a>>) {
75    fallback_xref_map_inner(data, ReaderContext::dummy(), true, password)
76}
77
/// Walk through the raw file data and rebuild an xref map from whatever objects can be
/// recognized.
///
/// * `dummy_ctx` is the reader context used for all reads; its object number is updated
///   as object headers are found.
/// * `recurse` permits one additional pass with a decrypting context when the chosen
///   trailer dictionary contains an `Encrypt` entry.
/// * `password` is only used for that additional pass.
///
/// Returns the rebuilt map together with the input for [`XRef::new`]: the raw data of the
/// selected trailer dictionary if one validated, otherwise the identifier of the last
/// object seen before a `Catalog` dictionary, otherwise `None`.
fn fallback_xref_map_inner<'a>(
    data: &'a PdfData,
    mut dummy_ctx: ReaderContext<'a>,
    recurse: bool,
    password: &[u8],
) -> (XrefMap, Option<XRefInput<'a>>) {
    let mut xref_map = FxHashMap::default();
    // Every dictionary with a `Root` key is a trailer candidate.
    let mut trailer_dicts = vec![];
    let mut root_ref = None;

    let mut r = Reader::new(data.as_ref());

    // Identifier of the most recently accepted object header.
    let mut last_obj_num = None;

    loop {
        let cur_pos = r.offset();

        // Copy of the reader at the current position, used below to re-read a dictionary
        // as a full stream.
        let mut old_r = r.clone();

        if let Some(obj_id) = r.read::<ObjectIdentifier>(&dummy_ctx) {
            let mut cloned = r.clone();
            // Check that the object following it is actually valid before inserting it.
            cloned.skip_white_spaces_and_comments();
            if cloned.skip::<Object<'_>>(false).is_some() {
                xref_map.insert(obj_id, EntryType::Normal(cur_pos));
                last_obj_num = Some(obj_id);
                dummy_ctx.set_obj_number(obj_id);
            }
        } else if let Some(dict) = r.read::<Dict<'_>>(&dummy_ctx) {
            if dict.contains_key(ROOT) {
                trailer_dicts.push(dict.clone());
            }

            // Remember the object that holds the catalog in case no trailer dictionary
            // can be used.
            if dict
                .get::<Name>(TYPE)
                .is_some_and(|n| n.as_str() == "Catalog")
            {
                root_ref = last_obj_num;
            }

            // If the dictionary belongs to an object stream, register the objects listed
            // in it as entries pointing into that stream.
            if let Some(stream) = old_r.read::<Stream<'_>>(&dummy_ctx)
                && stream.dict().get::<Name>(TYPE).as_deref() == Some(b"ObjStm")
                && let Some(data) = stream.decoded().ok()
                && let Some(last_obj_num) = last_obj_num
                && let Some(obj_stream) = ObjectStream::new(stream, &data, &dummy_ctx)
            {
                for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
                    let id = ObjectIdentifier::new(*obj_num as i32, 0);
                    // If we already found an entry for that object number that was not
                    // inside an object stream. Somewhat arbitrary and maybe
                    // we can do better, but that seems to work for the current
                    // set of tests.
                    if xref_map
                        .get(&id)
                        .is_none_or(|e| !matches!(e, &EntryType::Normal(_)))
                    {
                        xref_map.insert(
                            id,
                            EntryType::ObjStream(last_obj_num.obj_number as u32, idx as u32),
                        );
                    }
                }
            }
        } else {
            // Nothing recognizable here: advance by one byte and try again.
            r.read_byte();
        }

        if r.at_end() {
            break;
        }
    }

    // Try to choose the right trailer dict by doing basic validation.
    let mut trailer_dict = None;

    // Candidates are checked in collection order and a later valid candidate replaces an
    // earlier one.
    for dict in trailer_dicts {
        if let Some(root_id) = dict.get_raw::<Dict<'_>>(ROOT) {
            // A candidate is accepted if its root dictionary has a `Pages` entry.
            let check = |dict: &Dict<'_>| -> bool { dict.contains_key(PAGES) };

            match root_id {
                MaybeRef::Ref(r) => match xref_map.get(&r.into()) {
                    Some(EntryType::Normal(offset)) => {
                        let mut reader = Reader::new(&data.as_ref()[*offset..]);

                        if let Some(obj) =
                            reader.read_with_context::<IndirectObject<Dict<'_>>>(&dummy_ctx)
                            && check(&obj.clone().get())
                        {
                            trailer_dict = Some(dict);
                        }
                    }
                    Some(EntryType::ObjStream(obj_num, idx)) => {
                        // The root lives inside an object stream: load that stream from
                        // its own offset and look the root up by index.
                        if let Some(EntryType::Normal(offset)) =
                            xref_map.get(&ObjectIdentifier::new(*obj_num as i32, 0))
                        {
                            let mut reader = Reader::new(&data.as_ref()[*offset..]);

                            if let Some(stream) =
                                reader.read_with_context::<IndirectObject<Stream<'_>>>(&dummy_ctx)
                                && let Some(data) = stream.clone().get().decoded().ok()
                                && let Some(object_stream) =
                                    ObjectStream::new(stream.get(), &data, &dummy_ctx)
                                && let Some(obj) = object_stream.get::<Dict<'_>>(*idx)
                                && check(&obj)
                            {
                                trailer_dict = Some(dict);
                            }
                        }
                    }
                    _ => {}
                },
                MaybeRef::NotRef(d) => {
                    if check(&d) {
                        trailer_dict = Some(dict);
                    }
                }
            }
        }
    }

    let has_encryption = trailer_dict
        .as_ref()
        .is_some_and(|t| t.contains_key(ENCRYPT));

    if has_encryption && recurse {
        // The problem is that in this case, we have used a dummy reader context which does not have
        // a decryptor. Therefore, we were unable to decrypt any of the object streams and missed
        // all objects that are inside of such a stream. Therefore, we need to redo the process
        // using a `ReaderContext` that does have the ability to decrypt.
        if let Some(Ok(xref)) = trailer_dict.as_ref().map(|d| {
            XRef::new(
                data.clone(),
                xref_map.clone(),
                XRefInput::TrailerDictData(d.data()),
                true,
                password,
                PdfLoadLimits::default(),
            )
        }) {
            let ctx = ReaderContext::new(&xref, false);
            // `recurse` is false here, so the second pass cannot trigger a third one. Only
            // its map is kept; the trailer selected above stays in use.
            let (patched_map, _) = fallback_xref_map_inner(data, ctx, false, password);
            xref_map = patched_map;
        }
    }

    if let Some(trailer_dict_data) = trailer_dict.map(|d| d.data()) {
        (
            xref_map,
            Some(XRefInput::TrailerDictData(trailer_dict_data)),
        )
    } else if let Some(root_ref) = root_ref {
        (xref_map, Some(XRefInput::RootRef(root_ref)))
    } else {
        (xref_map, None)
    }
}
234
/// The entry-less xref that [`XRef::dummy`] hands out.
const DUMMY_XREF: XRef = XRef(Inner::Dummy);
236
/// An xref table.
///
/// Wraps either a dummy table without entries or a fully populated representation.
#[derive(Debug, Clone)]
pub struct XRef(Inner);
240
241impl XRef {
242    fn new(
243        data: PdfData,
244        xref_map: XrefMap,
245        input: XRefInput<'_>,
246        repaired: bool,
247        password: &[u8],
248        load_limits: PdfLoadLimits,
249    ) -> Result<Self, XRefError> {
250        // This is a bit hacky, but the problem is we can't read the resolved trailer dictionary
251        // before we actually created the xref struct. So we first create it using dummy data
252        // and then populate the data.
253        let trailer_data = TrailerData::dummy();
254
255        let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
256            data: Arc::new(Data::new(data)),
257            map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
258            decryptor: Arc::new(Decryptor::None),
259            has_ocgs: false,
260            metadata: Arc::new(Metadata::default()),
261            trailer_data,
262            password: password.to_vec(),
263            load_limits,
264        })));
265
266        // We read the trailer twice, once to determine the encryption used and then a second
267        // time to resolve the catalog dictionary, etc. This allows us to support catalog dictionaries
268        // that are stored in an encrypted object stream.
269
270        let decryptor = {
271            match input {
272                XRefInput::TrailerDictData(trailer_dict_data) => {
273                    let mut r = Reader::new(trailer_dict_data);
274
275                    let trailer_dict = r
276                        .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
277                        .ok_or(XRefError::Unknown)?;
278
279                    get_decryptor(&trailer_dict, password)?
280                }
281                XRefInput::RootRef(_) => Decryptor::None,
282            }
283        };
284
285        match &mut xref.0 {
286            Inner::Dummy => unreachable!(),
287            Inner::Some(r) => {
288                let mutable = Arc::make_mut(r);
289                mutable.decryptor = Arc::new(decryptor.clone());
290            }
291        }
292
293        let (trailer_data, has_ocgs, metadata) = match input {
294            XRefInput::TrailerDictData(trailer_dict_data) => {
295                let mut r = Reader::new(trailer_dict_data);
296
297                let trailer_dict = r
298                    .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
299                    .ok_or(XRefError::Unknown)?;
300
301                let root_ref = trailer_dict.get_ref(ROOT).ok_or(XRefError::Unknown)?;
302                let root = trailer_dict
303                    .get::<Dict<'_>>(ROOT)
304                    .ok_or(XRefError::Unknown)?;
305                let metadata = trailer_dict
306                    .get::<Dict<'_>>(INFO)
307                    .map(|d| parse_metadata(&d))
308                    .unwrap_or_default();
309                let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
310                let has_ocgs = root.get::<Dict<'_>>(OCPROPERTIES).is_some();
311                let version = root
312                    .get::<Name>(VERSION)
313                    .and_then(|v| PdfVersion::from_bytes(v.deref()));
314
315                let td = TrailerData {
316                    pages_ref: pages_ref.into(),
317                    root_ref: root_ref.into(),
318                    version,
319                };
320
321                (td, has_ocgs, metadata)
322            }
323            XRefInput::RootRef(root_ref) => {
324                let root = xref.get::<Dict<'_>>(root_ref).ok_or(XRefError::Unknown)?;
325                let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
326
327                let td = TrailerData {
328                    pages_ref: pages_ref.into(),
329                    root_ref,
330                    version: None,
331                };
332
333                (td, false, Metadata::default())
334            }
335        };
336
337        match &mut xref.0 {
338            Inner::Dummy => unreachable!(),
339            Inner::Some(r) => {
340                let mutable = Arc::make_mut(r);
341                mutable.trailer_data = trailer_data;
342                mutable.decryptor = Arc::new(decryptor);
343                mutable.has_ocgs = has_ocgs;
344                mutable.metadata = Arc::new(metadata);
345            }
346        }
347
348        Ok(xref)
349    }
350
351    fn is_repaired(&self) -> bool {
352        match &self.0 {
353            Inner::Dummy => false,
354            Inner::Some(r) => {
355                let locked = r.map.get();
356                locked.repaired
357            }
358        }
359    }
360
    /// Return a `'static` reference to the dummy xref, which has no entries.
    pub(crate) fn dummy() -> &'static Self {
        &DUMMY_XREF
    }
364
365    pub(crate) fn load_limits(&self) -> PdfLoadLimits {
366        match &self.0 {
367            Inner::Dummy => PdfLoadLimits::default(),
368            Inner::Some(r) => r.load_limits,
369        }
370    }
371
372    pub(crate) fn len(&self) -> usize {
373        match &self.0 {
374            Inner::Dummy => 0,
375            Inner::Some(r) => r.map.get().xref_map.len(),
376        }
377    }
378
379    pub(crate) fn trailer_data(&self) -> &TrailerData {
380        match &self.0 {
381            Inner::Dummy => unreachable!(),
382            Inner::Some(r) => &r.trailer_data,
383        }
384    }
385
386    pub(crate) fn metadata(&self) -> &Metadata {
387        match &self.0 {
388            Inner::Dummy => unreachable!(),
389            Inner::Some(r) => &r.metadata,
390        }
391    }
392
393    /// Return the object ID of the root dictionary.
394    pub fn root_id(&self) -> ObjectIdentifier {
395        self.trailer_data().root_ref
396    }
397
398    /// Whether the PDF has optional content groups.
399    pub fn has_optional_content_groups(&self) -> bool {
400        match &self.0 {
401            Inner::Dummy => false,
402            Inner::Some(r) => r.has_ocgs,
403        }
404    }
405
406    pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
407        match &self.0 {
408            Inner::Dummy => unimplemented!(),
409            Inner::Some(r) => {
410                let locked = r.map.get();
411                let mut elements = locked
412                    .xref_map
413                    .iter()
414                    .map(|(id, e)| {
415                        let offset = match e {
416                            EntryType::Normal(o) => (*o, 0),
417                            EntryType::ObjStream(id, index) => {
418                                if let Some(EntryType::Normal(offset)) =
419                                    locked.xref_map.get(&ObjectIdentifier::new(*id as i32, 0))
420                                {
421                                    (*offset, *index)
422                                } else {
423                                    (usize::MAX, 0)
424                                }
425                            }
426                        };
427
428                        (*id, offset)
429                    })
430                    .collect::<Vec<_>>();
431
432                // Try to yield in the order the objects appeared in the
433                // PDF.
434                elements.sort_by_key(|e1| e1.1);
435
436                let mut iter = elements.into_iter();
437
438                iter::from_fn(move || {
439                    for next in iter.by_ref() {
440                        if let Some(obj) = self.get_with(next.0, &ReaderContext::new(self, false)) {
441                            return Some(obj);
442                        } else {
443                            // Skip invalid objects.
444                            continue;
445                        }
446                    }
447
448                    None
449                })
450            }
451        }
452    }
453
454    pub(crate) fn repair(&self) {
455        let Inner::Some(r) = &self.0 else {
456            unreachable!();
457        };
458
459        let mut locked = r
460            .map
461            .try_put()
462            .expect("xref repair: map lock not contended");
463        assert!(!locked.repaired);
464
465        let (xref_map, _) = fallback_xref_map(r.data.get(), &r.password);
466        locked.xref_map = xref_map;
467        locked.repaired = true;
468    }
469
470    #[inline]
471    pub(crate) fn needs_decryption(&self, ctx: &ReaderContext<'_>) -> bool {
472        match &self.0 {
473            Inner::Dummy => false,
474            Inner::Some(r) => {
475                if matches!(r.decryptor.as_ref(), Decryptor::None) {
476                    false
477                } else {
478                    !ctx.in_content_stream() && !ctx.in_object_stream()
479                }
480            }
481        }
482    }
483
484    #[inline]
485    pub(crate) fn decrypt(
486        &self,
487        id: ObjectIdentifier,
488        data: &[u8],
489        target: DecryptionTarget,
490    ) -> Option<Vec<u8>> {
491        match &self.0 {
492            Inner::Dummy => Some(data.to_vec()),
493            Inner::Some(r) => r.decryptor.decrypt(id, data, target),
494        }
495    }
496
497    /// Return the object with the given identifier.
498    #[allow(private_bounds)]
499    pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
500    where
501        T: ObjectLike<'a>,
502    {
503        let ctx = ReaderContext::new(self, false);
504        self.get_with(id, &ctx)
505    }
506
507    /// Return the object with the given identifier.
508    #[allow(private_bounds)]
509    pub(crate) fn get_with<'a, T>(
510        &'a self,
511        id: ObjectIdentifier,
512        ctx: &ReaderContext<'a>,
513    ) -> Option<T>
514    where
515        T: ObjectLike<'a>,
516    {
517        let Inner::Some(repr) = &self.0 else {
518            return None;
519        };
520
521        let locked = repr.map.try_get()?;
522
523        let mut r = Reader::new(repr.data.get().as_ref());
524
525        let entry = *locked.xref_map.get(&id).or({
526            // An indirect reference to an undefined object shall not be considered an error by a PDF processor; it
527            // shall be treated as a reference to the null object.
528            None
529        })?;
530        drop(locked);
531
532        let mut ctx = ctx.clone();
533        ctx.set_obj_number(id);
534        ctx.set_in_content_stream(false);
535
536        match entry {
537            EntryType::Normal(offset) => {
538                ctx.set_in_object_stream(false);
539                r.jump(offset);
540
541                if let Some(object) = r.read_with_context::<IndirectObject<T>>(&ctx) {
542                    if object.id() == &id {
543                        return Some(object.get());
544                    }
545                } else {
546                    // There is a valid object at the offset, it's just not of the type the caller
547                    // expected, which is fine.
548                    if r.skip_not_in_content_stream::<IndirectObject<Object<'_>>>()
549                        .is_some()
550                    {
551                        return None;
552                    }
553                };
554
555                // The xref table is broken, try to repair if not already repaired.
556                if self.is_repaired() {
557                    error!(
558                        "attempt was made at repairing xref, but object {id:?} still couldn't be read"
559                    );
560
561                    None
562                } else {
563                    warn!("broken xref, attempting to repair");
564
565                    self.repair();
566
567                    // Now try reading again.
568                    self.get_with::<T>(id, &ctx)
569                }
570            }
571            EntryType::ObjStream(obj_stram_gen_num, index) => {
572                // Generation number is implicitly 0.
573                let obj_stream_id = ObjectIdentifier::new(obj_stram_gen_num as i32, 0);
574
575                if obj_stream_id == id {
576                    warn!("cycle detected in object stream");
577
578                    return None;
579                }
580
581                let stream = self.get_with::<Stream<'_>>(obj_stream_id, &ctx)?;
582                let data = repr.data.get_with(obj_stream_id, &ctx)?;
583                let object_stream = ObjectStream::new(stream, data, &ctx)?;
584                object_stream.get(index)
585            }
586        }
587    }
588}
589
/// An input that is passed to the xref constructor so that we can fully resolve
/// the PDF.
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefInput<'a> {
    /// This option is going to be used in 99.999% of the cases. It contains the
    /// raw data of the trailer dictionary which is then going to be processed.
    TrailerDictData(&'a [u8]),
    /// In case the trailer dictionary could not be read (for example because
    /// it is cut-off), we just pass the object ID of the root dictionary
    /// in case we have found one, and try our best to build the PDF just
    /// with the information we have there.
    ///
    /// Note that this won't work if the document is encrypted, as we
    /// can't access the crypto dictionary.
    RootRef(ObjectIdentifier),
}
606
607pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
608    let mut finder = Reader::new(data);
609    let mut pos = finder.len().checked_sub(1)?;
610    finder.jump(pos);
611
612    let needle = b"startxref";
613
614    loop {
615        if finder.forward_tag(needle).is_some() {
616            finder.skip_white_spaces_and_comments();
617
618            let offset = finder.read_without_context::<i32>()?.try_into().ok()?;
619
620            return Some(offset);
621        }
622
623        pos = pos.checked_sub(1)?;
624        finder.jump(pos);
625    }
626}
627
/// A type of xref entry, i.e. where an object can be found.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum EntryType {
    /// An indirect object that is at a specific offset in the original data.
    Normal(usize),
    /// An indirect object that is part of an object stream. First number indicates the object
    /// number of the _object stream_ (the generation number is always 0), the second number indicates
    /// the index in the object stream.
    ObjStream(u32, u32),
}
638
/// Maps object identifiers to the location of the corresponding object.
type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
640
/// Representation of a proper xref table.
#[derive(Debug)]
struct MapRepr {
    /// The entries of the table.
    xref_map: XrefMap,
    /// Whether `xref_map` was built by scanning the file rather than from its xref data.
    repaired: bool,
}
647
648#[derive(Debug, Copy, Clone)]
649pub(crate) struct TrailerData {
650    pub(crate) pages_ref: ObjectIdentifier,
651    pub(crate) root_ref: ObjectIdentifier,
652    pub(crate) version: Option<PdfVersion>,
653}
654
655impl TrailerData {
656    pub(crate) fn dummy() -> Self {
657        Self {
658            pages_ref: ObjectIdentifier::new(0, 0),
659            root_ref: ObjectIdentifier::new(0, 0),
660            version: None,
661        }
662    }
663}
664
/// The state behind a populated [`XRef`].
#[derive(Debug, Clone)]
struct SomeRepr {
    /// The PDF data, wrapped in [`Data`].
    data: Arc<Data>,
    /// The entry map plus its repaired flag, behind a lock so it can be swapped out on
    /// repair.
    map: Arc<RwLock<MapRepr>>,
    /// Metadata parsed from the `Info` dictionary of the trailer, or the default value.
    metadata: Arc<Metadata>,
    /// The decryptor derived from the trailer dictionary.
    decryptor: Arc<Decryptor>,
    /// Whether the catalog has an `OCProperties` dictionary.
    has_ocgs: bool,
    /// Copy of the password passed at construction; reused when the map is repaired.
    password: Vec<u8>,
    /// References and version resolved from the trailer and catalog.
    trailer_data: TrailerData,
    /// The load limits passed at construction.
    load_limits: PdfLoadLimits,
}
676
/// The two possible states of an [`XRef`].
#[derive(Debug, Clone)]
enum Inner {
    /// A dummy xref table that doesn't have any entries.
    Dummy,
    /// A proper xref table.
    Some(Arc<SomeRepr>),
}
684
/// One parsed entry of a classic xref table.
#[derive(Debug)]
struct XRefEntry {
    /// Value of the 10-digit first field.
    offset: usize,
    /// Value of the 5-digit second field.
    gen_number: i32,
    /// `true` if the entry is marked with `n`.
    used: bool,
}

impl XRefEntry {
    /// Parse an entry from its raw bytes.
    ///
    /// The layout read is: bytes `0..10` hold the decimal offset, bytes `11..16` the
    /// decimal generation number and byte `17` the entry marker.
    ///
    /// Returns `None` if `data` is too short to contain these fields, if a numeric field
    /// contains a non-digit byte, or if a number does not fit its target type.
    pub(crate) fn read(data: &[u8]) -> Option<Self> {
        /// Parse a run of ASCII digits, rejecting anything else and any overflow.
        fn parse_u32(digits: &[u8]) -> Option<u32> {
            digits.iter().try_fold(0_u32, |accum, &byte| {
                if !byte.is_ascii_digit() {
                    return None;
                }

                accum.checked_mul(10)?.checked_add(u32::from(byte - b'0'))
            })
        }

        // `get` rather than indexing so that truncated input yields `None` instead of a
        // panic.
        let offset = usize::try_from(parse_u32(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_u32(data.get(11..16)?)?).ok()?;
        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
722
/// Maximum depth for following xref Prev/XRefStm chains to prevent stack
/// overflow on circular or deeply chained xref tables. Traversal is abandoned
/// once the depth exceeds this value.
const MAX_XREF_CHAIN_DEPTH: usize = 64;
726
727fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
728    populate_xref_depth(data, pos, xref_map, 0)
729}
730
731fn populate_xref_depth<'a>(
732    data: &'a [u8],
733    pos: usize,
734    xref_map: &mut XrefMap,
735    depth: usize,
736) -> Option<&'a [u8]> {
737    if depth > MAX_XREF_CHAIN_DEPTH {
738        log::warn!("Xref chain depth exceeds {MAX_XREF_CHAIN_DEPTH}, stopping traversal");
739        return None;
740    }
741    let mut reader = Reader::new(data);
742    reader.jump(pos);
743    // In case the position points to before the object number of a xref stream.
744    reader.skip_white_spaces_and_comments();
745
746    let mut r2 = reader.clone();
747    if reader
748        .clone()
749        .read_without_context::<ObjectIdentifier>()
750        .is_some()
751    {
752        populate_from_xref_stream(data, &mut r2, xref_map, depth)
753    } else {
754        populate_from_xref_table(data, &mut r2, xref_map, depth)
755    }
756}
757
758pub(super) struct SubsectionHeader {
759    pub(super) start: u32,
760    pub(super) num_entries: u32,
761}
762
763impl Readable<'_> for SubsectionHeader {
764    fn read(r: &mut Reader<'_>, _: &ReaderContext<'_>) -> Option<Self> {
765        r.skip_white_spaces();
766        let start = r.read_without_context::<u32>()?;
767        r.skip_white_spaces();
768        let num_entries = r.read_without_context::<u32>()?;
769        r.skip_white_spaces();
770
771        Some(Self { start, num_entries })
772    }
773}
774
775/// Populate the xref table, and return the trailer dict.
776fn populate_from_xref_table<'a>(
777    data: &'a [u8],
778    reader: &mut Reader<'a>,
779    insert_map: &mut XrefMap,
780    depth: usize,
781) -> Option<&'a [u8]> {
782    let trailer = {
783        let mut reader = reader.clone();
784        read_xref_table_trailer(&mut reader, &ReaderContext::dummy())?
785    };
786
787    reader.skip_white_spaces();
788    reader.forward_tag(b"xref")?;
789    reader.skip_white_spaces();
790
791    let mut max_obj = 0;
792
793    if let Some(prev) = trailer.get::<i32>(PREV) {
794        // First insert the entries from any previous xref tables.
795        populate_xref_depth(data, prev as usize, insert_map, depth + 1)?;
796    }
797
798    // In hybrid files, entries in `XRefStm` should have higher priority, therefore we insert them
799    // after looking at `PREV`.
800    if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
801        populate_xref_depth(data, xref_stm as usize, insert_map, depth + 1)?;
802    }
803
804    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
805        reader.skip_white_spaces();
806
807        let start = header.start;
808        let end = start + header.num_entries;
809
810        for obj_number in start..end {
811            max_obj = max(max_obj, obj_number);
812            let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
813            let entry = XRefEntry::read(bytes)?;
814
815            // Specification says we should ignore any object number > SIZE, but probably
816            // not important?
817            if entry.used {
818                insert_map.insert(
819                    ObjectIdentifier::new(obj_number as i32, entry.gen_number),
820                    EntryType::Normal(entry.offset),
821                );
822            }
823        }
824    }
825
826    Some(trailer.data())
827}
828
829fn populate_from_xref_stream<'a>(
830    data: &'a [u8],
831    reader: &mut Reader<'a>,
832    insert_map: &mut XrefMap,
833    depth: usize,
834) -> Option<&'a [u8]> {
835    let stream = reader
836        .read_with_context::<IndirectObject<Stream<'_>>>(&ReaderContext::dummy())?
837        .get();
838
839    if let Some(prev) = stream.dict().get::<i32>(PREV) {
840        // First insert the entries from any previous xref tables.
841        let _ = populate_xref_depth(data, prev as usize, insert_map, depth + 1)?;
842    }
843
844    let size = stream.dict().get::<u32>(SIZE)?;
845
846    let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
847
848    if f2_len > size_of::<u64>() as u8 {
849        error!("xref offset length is larger than the allowed limit");
850
851        return None;
852    }
853
854    // Do such files exist?
855    if f1_len != 1 {
856        warn!("first field in xref stream was longer than 1");
857    }
858
859    let xref_data = stream.decoded().ok()?;
860    let mut xref_reader = Reader::new(xref_data.as_ref());
861
862    if let Some(arr) = stream.dict().get::<Array<'_>>(INDEX) {
863        let iter = arr.iter::<(u32, u32)>();
864
865        for (start, num_elements) in iter {
866            xref_stream_subsection(
867                &mut xref_reader,
868                start,
869                num_elements,
870                f1_len,
871                f2_len,
872                f3_len,
873                insert_map,
874            )?;
875        }
876    } else {
877        xref_stream_subsection(
878            &mut xref_reader,
879            0,
880            size,
881            f1_len,
882            f2_len,
883            f3_len,
884            insert_map,
885        )?;
886    }
887
888    Some(stream.dict().data())
889}
890
891fn xref_stream_num(data: &[u8]) -> Option<u32> {
892    Some(match data.len() {
893        0 => return None,
894        1 => u8::from_be(data[0]) as u32,
895        2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
896        3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
897        4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
898        8 => {
899            if let Ok(num) = u32::try_from(u64::from_be_bytes(data[0..8].try_into().ok()?)) {
900                return Some(num);
901            } else {
902                warn!("xref stream number is too large");
903
904                return None;
905            }
906        }
907        n => {
908            warn!("invalid xref stream number {n}");
909
910            return None;
911        }
912    })
913}
914
915fn xref_stream_subsection<'a>(
916    xref_reader: &mut Reader<'a>,
917    start: u32,
918    num_elements: u32,
919    f1_len: u8,
920    f2_len: u8,
921    f3_len: u8,
922    insert_map: &mut XrefMap,
923) -> Option<()> {
924    for i in 0..num_elements {
925        let f_type = if f1_len == 0 {
926            1
927        } else {
928            // We assume a length of 1.
929            xref_reader.read_bytes(1)?[0]
930        };
931
932        let obj_number = start + i;
933
934        match f_type {
935            // We don't care about free objects.
936            0 => {
937                xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
938            }
939            1 => {
940                let offset = if f2_len > 0 {
941                    let data = xref_reader.read_bytes(f2_len as usize)?;
942                    xref_stream_num(data)?
943                } else {
944                    0
945                };
946
947                let gen_number = if f3_len > 0 {
948                    let data = xref_reader.read_bytes(f3_len as usize)?;
949                    xref_stream_num(data)?
950                } else {
951                    0
952                };
953
954                insert_map.insert(
955                    ObjectIdentifier::new(obj_number as i32, gen_number as i32),
956                    EntryType::Normal(offset as usize),
957                );
958            }
959            2 => {
960                let obj_stream_number = {
961                    let data = xref_reader.read_bytes(f2_len as usize)?;
962                    xref_stream_num(data)?
963                };
964                let gen_number = 0;
965                let index = if f3_len > 0 {
966                    let data = xref_reader.read_bytes(f3_len as usize)?;
967                    xref_stream_num(data)?
968                } else {
969                    0
970                };
971
972                insert_map.insert(
973                    ObjectIdentifier::new(obj_number as i32, gen_number),
974                    EntryType::ObjStream(obj_stream_number, index),
975                );
976            }
977            _ => {
978                warn!("xref has unknown field type {f_type}");
979
980                return None;
981            }
982        }
983    }
984
985    Some(())
986}
987
988fn read_xref_table_trailer<'a>(
989    reader: &mut Reader<'a>,
990    ctx: &ReaderContext<'a>,
991) -> Option<Dict<'a>> {
992    reader.skip_white_spaces();
993    reader.forward_tag(b"xref")?;
994    reader.skip_white_spaces();
995
996    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
997        reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
998    }
999
1000    reader.skip_white_spaces();
1001    reader.forward_tag(b"trailer")?;
1002    reader.skip_white_spaces();
1003
1004    reader.read_with_context::<Dict<'_>>(ctx)
1005}
1006
1007fn get_decryptor(trailer_dict: &Dict<'_>, password: &[u8]) -> Result<Decryptor, XRefError> {
1008    if let Some(encryption_dict) = trailer_dict.get::<Dict<'_>>(ENCRYPT) {
1009        let id = if let Some(id) = trailer_dict
1010            .get::<Array<'_>>(ID)
1011            .and_then(|a| a.flex_iter().next::<object::String>())
1012        {
1013            id.to_vec()
1014        } else {
1015            // Assume an empty ID entry.
1016            vec![]
1017        };
1018
1019        get(&encryption_dict, &id, password).map_err(XRefError::Encryption)
1020    } else {
1021        Ok(Decryptor::None)
1022    }
1023}
1024
1025struct ObjectStream<'a> {
1026    data: &'a [u8],
1027    ctx: ReaderContext<'a>,
1028    offsets: Vec<(u32, usize)>,
1029}
1030
1031impl<'a> ObjectStream<'a> {
1032    fn new(inner: Stream<'_>, data: &'a [u8], ctx: &ReaderContext<'a>) -> Option<Self> {
1033        let num_objects = inner.dict().get::<usize>(N)?;
1034        let first_offset = inner.dict().get::<usize>(FIRST)?;
1035
1036        let mut r = Reader::new(data);
1037
1038        let mut offsets = vec![];
1039
1040        for _ in 0..num_objects {
1041            r.skip_white_spaces_and_comments();
1042            // Skip object number
1043            let obj_num = r.read_without_context::<u32>()?;
1044            r.skip_white_spaces_and_comments();
1045            let relative_offset = r.read_without_context::<usize>()?;
1046            offsets.push((obj_num, first_offset + relative_offset));
1047        }
1048
1049        let mut ctx = ctx.clone();
1050        ctx.set_in_object_stream(true);
1051
1052        Some(Self { data, ctx, offsets })
1053    }
1054
1055    fn get<T>(&self, index: u32) -> Option<T>
1056    where
1057        T: ObjectLike<'a>,
1058    {
1059        let offset = self.offsets.get(index as usize)?.1;
1060        let mut r = Reader::new(self.data);
1061        r.jump(offset);
1062        r.skip_white_spaces_and_comments();
1063
1064        r.read_with_context::<T>(&self.ctx)
1065    }
1066}
1067
1068fn parse_metadata(info_dict: &Dict<'_>) -> Metadata {
1069    Metadata {
1070        creation_date: info_dict
1071            .get::<object::String>(CREATION_DATE)
1072            .and_then(|c| DateTime::from_bytes(&c)),
1073        modification_date: info_dict
1074            .get::<object::String>(MOD_DATE)
1075            .and_then(|c| DateTime::from_bytes(&c)),
1076        title: info_dict.get::<object::String>(TITLE).map(|t| t.to_vec()),
1077        author: info_dict.get::<object::String>(AUTHOR).map(|t| t.to_vec()),
1078        subject: info_dict.get::<object::String>(SUBJECT).map(|t| t.to_vec()),
1079        keywords: info_dict
1080            .get::<object::String>(KEYWORDS)
1081            .map(|t| t.to_vec()),
1082        creator: info_dict.get::<object::String>(CREATOR).map(|t| t.to_vec()),
1083        producer: info_dict
1084            .get::<object::String>(PRODUCER)
1085            .map(|t| t.to_vec()),
1086    }
1087}