Skip to main content

hayro_syntax/
xref.rs

1//! Reading and querying the xref table of a PDF file.
2
3use crate::crypto::{DecryptionError, DecryptionTarget, Decryptor, get};
4use crate::data::Data;
5use crate::metadata::Metadata;
6use crate::object::Name;
7use crate::object::ObjectIdentifier;
8use crate::object::Stream;
9use crate::object::dict::keys::{
10    AUTHOR, CREATION_DATE, CREATOR, ENCRYPT, FIRST, ID, INDEX, INFO, KEYWORDS, MOD_DATE, N,
11    OCPROPERTIES, PAGES, PREV, PRODUCER, ROOT, SIZE, SUBJECT, TITLE, TYPE, VERSION, W, XREF_STM,
12};
13use crate::object::dict::probe_dict;
14use crate::object::indirect::IndirectObject;
15use crate::object::{Array, MaybeRef};
16use crate::object::{DateTime, Dict};
17use crate::object::{Object, ObjectLike};
18use crate::pdf::PdfVersion;
19use crate::reader::Reader;
20use crate::reader::{Readable, ReaderContext, ReaderExt};
21use crate::sync::{Arc, FxHashMap, RwLock, RwLockExt};
22use crate::trivia::is_white_space_character;
23use crate::util::findr_needle;
24use crate::{PdfData, object};
25use alloc::collections::BTreeSet;
26use alloc::vec;
27use alloc::vec::Vec;
28use core::cmp::max;
29use core::iter;
30use core::ops::Deref;
31
/// Size in bytes of one entry of a classic xref table: a 10-digit offset, a
/// separator, a 5-digit generation number, a separator, the one-byte entry
/// type and two trailing bytes.
pub(crate) const XREF_ENTRY_LEN: usize = 10 + 1 + 5 + 1 + 1 + 2;
33
34#[derive(Debug, Copy, Clone)]
35pub(crate) enum XRefError {
36    Unknown,
37    Encryption(DecryptionError),
38}
39
40/// Parse the "root" xref from the PDF.
41pub(crate) fn root_xref(data: PdfData, password: &[u8]) -> Result<XRef, XRefError> {
42    let mut xref_map = FxHashMap::default();
43    let xref_pos = find_last_xref_pos(data.as_ref()).ok_or(XRefError::Unknown)?;
44    let trailer =
45        populate_xref_impl(data.as_ref(), xref_pos, &mut xref_map).ok_or(XRefError::Unknown)?;
46
47    XRef::new(
48        data.clone(),
49        xref_map,
50        XRefInput::TrailerDictData(trailer),
51        false,
52        password,
53    )
54}
55
56/// Try to manually parse the PDF to build an xref table and trailer dictionary.
57pub(crate) fn fallback(data: PdfData, password: &[u8]) -> Option<XRef> {
58    warn!("xref table was invalid, trying to manually build xref table");
59    let (xref_map, xref_input) = fallback_xref_map(&data, password);
60
61    if let Some(xref_input) = xref_input {
62        warn!("rebuild xref table with {} entries", xref_map.len());
63
64        XRef::new(data.clone(), xref_map, xref_input, true, password).ok()
65    } else {
66        warn!("couldn't find trailer dictionary, failed to rebuild xref table");
67
68        None
69    }
70}
71
72fn fallback_xref_map<'a>(data: &'a PdfData, password: &[u8]) -> (XrefMap, Option<XRefInput<'a>>) {
73    fallback_xref_map_inner(data, ReaderContext::dummy(), true, password)
74}
75
76fn fallback_xref_map_inner<'a>(
77    data: &'a PdfData,
78    mut dummy_ctx: ReaderContext<'a>,
79    recurse: bool,
80    password: &[u8],
81) -> (XrefMap, Option<XRefInput<'a>>) {
82    let mut xref_map = FxHashMap::default();
83    let mut trailer_dicts = vec![];
84    let mut root_ref = None;
85
86    let mut r = Reader::new(data.as_ref());
87
88    let mut last_obj_num = None;
89
90    loop {
91        let cur_pos = r.offset();
92
93        let mut old_r = r.clone();
94
95        // First try to check if we have an object identifier.
96        if r.peek_byte().is_some_and(|b: u8| b.is_ascii_digit()) {
97            if let Some(obj_id) = r.read::<ObjectIdentifier>(&dummy_ctx) {
98                let mut cloned = r.clone();
99                // Check that the object following it is actually valid before inserting it.
100                cloned.skip_white_spaces_and_comments();
101                if cloned.skip::<Object<'_>>(false).is_some() {
102                    xref_map.insert(obj_id, EntryType::Normal(cur_pos));
103                    last_obj_num = Some(obj_id);
104                    dummy_ctx.set_obj_number(obj_id);
105                }
106            } else {
107                // There must be a white space before the next object number.
108                r.forward_while(|b| !is_white_space_character(b));
109            }
110        } else {
111            // Then, try to check whether we have a dictionary, in particular a trailer
112            // dictionary.
113            let mut probe_reader = r.clone();
114            if r.peek_bytes(2).is_some_and(|b| b == b"<<")
115                && let Some(probe) =
116                    { probe_dict(&mut probe_reader, &dummy_ctx, Some(b"<<"), b">>") }
117            {
118                r = probe_reader;
119                if probe.has_root || probe.has_type {
120                    let mut dict_reader = Reader::new(probe.data);
121                    if let Some(dict) = dict_reader.read_with_context::<Dict<'_>>(&dummy_ctx) {
122                        if probe.has_root && dict.contains_key(ROOT) {
123                            trailer_dicts.push(dict.clone());
124                        }
125
126                        if dict
127                            .get::<Name<'_>>(TYPE)
128                            .is_some_and(|n| n.as_str() == "Catalog")
129                        {
130                            root_ref = last_obj_num;
131                        }
132
133                        if let Some(stream) = old_r.read::<Stream<'_>>(&dummy_ctx)
134                            && dict.get::<Name<'_>>(TYPE).as_deref() == Some(b"ObjStm")
135                            && let Some(data) = stream.decoded().ok()
136                            && let Some(last_obj_num) = last_obj_num
137                            && let Some(obj_stream) = ObjectStream::new(stream, &data, &dummy_ctx)
138                        {
139                            for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
140                                let id = ObjectIdentifier::new(*obj_num as i32, 0);
141                                // If we already found an entry for that object number that was not
142                                // inside an object stream. Somewhat arbitrary and maybe
143                                // we can do better, but that seems to work for the current
144                                // set of tests.
145                                if xref_map
146                                    .get(&id)
147                                    .is_none_or(|e| !matches!(e, &EntryType::Normal(_)))
148                                {
149                                    xref_map.insert(
150                                        id,
151                                        EntryType::ObjStream(
152                                            last_obj_num.obj_number as u32,
153                                            idx as u32,
154                                        ),
155                                    );
156                                }
157                            }
158                        }
159                    }
160                }
161            } else {
162                // We can skip everything until the next white space character,
163                // as there cannot possibly be any new dictionary/object identifier
164                // until then.
165                let old_pos = r.offset;
166                r.forward_while(|b| !is_white_space_character(b));
167                if r.offset == old_pos {
168                    r.read_byte();
169                }
170            }
171        }
172
173        if r.at_end() {
174            break;
175        }
176    }
177
178    // Try to choose the right trailer dict by doing basic validation.
179    let mut trailer_dict = None;
180
181    for dict in trailer_dicts {
182        if let Some(root_id) = dict.get_raw::<Dict<'_>>(ROOT) {
183            let check = |dict: &Dict<'_>| -> bool { dict.contains_key(PAGES) };
184
185            match root_id {
186                MaybeRef::Ref(r) => match xref_map.get(&r.into()) {
187                    Some(EntryType::Normal(offset)) => {
188                        let mut reader = Reader::new(&data.as_ref()[*offset..]);
189
190                        if let Some(obj) =
191                            reader.read_with_context::<IndirectObject<Dict<'_>>>(&dummy_ctx)
192                            && {
193                                let obj = obj.get();
194                                check(&obj)
195                            }
196                        {
197                            trailer_dict = Some(dict);
198                        }
199                    }
200                    Some(EntryType::ObjStream(obj_num, idx)) => {
201                        if let Some(EntryType::Normal(offset)) =
202                            xref_map.get(&ObjectIdentifier::new(*obj_num as i32, 0))
203                        {
204                            let mut reader = Reader::new(&data.as_ref()[*offset..]);
205
206                            if let Some(stream) =
207                                reader.read_with_context::<IndirectObject<Stream<'_>>>(&dummy_ctx)
208                                && {
209                                    let stream = stream.get();
210                                    if let Some(data) = stream.decoded().ok()
211                                        && let Some(object_stream) =
212                                            ObjectStream::new(stream, &data, &dummy_ctx)
213                                        && let Some(obj) = object_stream.get::<Dict<'_>>(*idx)
214                                    {
215                                        check(&obj)
216                                    } else {
217                                        false
218                                    }
219                                }
220                            {
221                                trailer_dict = Some(dict);
222                            }
223                        }
224                    }
225                    _ => {}
226                },
227                MaybeRef::NotRef(d) => {
228                    if check(&d) {
229                        trailer_dict = Some(dict);
230                    }
231                }
232            }
233        }
234    }
235
236    let has_encryption = trailer_dict
237        .as_ref()
238        .is_some_and(|t| t.contains_key(ENCRYPT));
239
240    if has_encryption && recurse {
241        // The problem is that in this case, we have used a dummy reader context which does not have
242        // a decryptor. Therefore, we were unable to decrypt any of the object streams and missed
243        // all objects that are inside of such a stream. Therefore, we need to redo the process
244        // using a `ReaderContext` that does have the ability to decrypt.
245        if let Ok(xref) = XRef::new(
246            data.clone(),
247            xref_map.clone(),
248            XRefInput::TrailerDictData(trailer_dict.as_ref().map(|d| d.data()).unwrap()),
249            true,
250            password,
251        ) {
252            let ctx = ReaderContext::new(&xref, false);
253            let (patched_map, _) = fallback_xref_map_inner(data, ctx, false, password);
254            xref_map = patched_map;
255        }
256    }
257
258    if let Some(trailer_dict_data) = trailer_dict.map(|d| d.data()) {
259        (
260            xref_map,
261            Some(XRefInput::TrailerDictData(trailer_dict_data)),
262        )
263    } else if let Some(root_ref) = root_ref {
264        (xref_map, Some(XRefInput::RootRef(root_ref)))
265    } else {
266        (xref_map, None)
267    }
268}
269
270const DUMMY_XREF: XRef = XRef(Inner::Dummy);
271
272/// An xref table.
273#[derive(Debug, Clone)]
274pub struct XRef(Inner);
275
276impl XRef {
277    fn new(
278        data: PdfData,
279        xref_map: XrefMap,
280        input: XRefInput<'_>,
281        repaired: bool,
282        password: &[u8],
283    ) -> Result<Self, XRefError> {
284        // This is a bit hacky, but the problem is we can't read the resolved trailer dictionary
285        // before we actually created the xref struct. So we first create it using dummy data
286        // and then populate the data.
287        let trailer_data = TrailerData::dummy();
288
289        let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
290            data: Arc::new(Data::new(data)),
291            map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
292            decryptor: Arc::new(Decryptor::None),
293            has_ocgs: false,
294            metadata: Arc::new(Metadata::default()),
295            trailer_data,
296            password: password.to_vec(),
297        })));
298
299        // We read the trailer twice, once to determine the encryption used and then a second
300        // time to resolve the catalog dictionary, etc. This allows us to support catalog dictionaries
301        // that are stored in an encrypted object stream.
302
303        let decryptor = {
304            match input {
305                XRefInput::TrailerDictData(trailer_dict_data) => {
306                    let mut r = Reader::new(trailer_dict_data);
307
308                    let trailer_dict = r
309                        .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
310                        .ok_or(XRefError::Unknown)?;
311
312                    get_decryptor(&trailer_dict, password)?
313                }
314                XRefInput::RootRef(_) => Decryptor::None,
315            }
316        };
317
318        match &mut xref.0 {
319            Inner::Dummy => unreachable!(),
320            Inner::Some(r) => {
321                let mutable = Arc::make_mut(r);
322                mutable.decryptor = Arc::new(decryptor.clone());
323            }
324        }
325
326        let (trailer_data, has_ocgs, metadata) = match input {
327            XRefInput::TrailerDictData(trailer_dict_data) => {
328                let mut r = Reader::new(trailer_dict_data);
329
330                let trailer_dict = r
331                    .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
332                    .ok_or(XRefError::Unknown)?;
333
334                let root_ref = trailer_dict.get_ref(ROOT).ok_or(XRefError::Unknown)?;
335                let root = trailer_dict
336                    .get::<Dict<'_>>(ROOT)
337                    .ok_or(XRefError::Unknown)?;
338                let metadata = trailer_dict
339                    .get::<Dict<'_>>(INFO)
340                    .map(|d| parse_metadata(&d))
341                    .unwrap_or_default();
342                let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
343                let has_ocgs = root.get::<Dict<'_>>(OCPROPERTIES).is_some();
344                let version = root
345                    .get::<Name<'_>>(VERSION)
346                    .and_then(|v| PdfVersion::from_bytes(v.deref()));
347
348                let td = TrailerData {
349                    pages_ref: pages_ref.into(),
350                    root_ref: root_ref.into(),
351                    version,
352                };
353
354                (td, has_ocgs, metadata)
355            }
356            XRefInput::RootRef(root_ref) => {
357                let root = xref.get::<Dict<'_>>(root_ref).ok_or(XRefError::Unknown)?;
358                let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
359
360                let td = TrailerData {
361                    pages_ref: pages_ref.into(),
362                    root_ref,
363                    version: None,
364                };
365
366                (td, false, Metadata::default())
367            }
368        };
369
370        match &mut xref.0 {
371            Inner::Dummy => unreachable!(),
372            Inner::Some(r) => {
373                let mutable = Arc::make_mut(r);
374                mutable.trailer_data = trailer_data;
375                mutable.decryptor = Arc::new(decryptor);
376                mutable.has_ocgs = has_ocgs;
377                mutable.metadata = Arc::new(metadata);
378            }
379        }
380
381        Ok(xref)
382    }
383
384    fn is_repaired(&self) -> bool {
385        match &self.0 {
386            Inner::Dummy => false,
387            Inner::Some(r) => {
388                let locked = r.map.get();
389                locked.repaired
390            }
391        }
392    }
393
394    pub(crate) fn dummy() -> &'static Self {
395        &DUMMY_XREF
396    }
397
398    pub(crate) fn len(&self) -> usize {
399        match &self.0 {
400            Inner::Dummy => 0,
401            Inner::Some(r) => r.map.get().xref_map.len(),
402        }
403    }
404
405    pub(crate) fn trailer_data(&self) -> &TrailerData {
406        match &self.0 {
407            Inner::Dummy => unreachable!(),
408            Inner::Some(r) => &r.trailer_data,
409        }
410    }
411
412    pub(crate) fn metadata(&self) -> &Metadata {
413        match &self.0 {
414            Inner::Dummy => unreachable!(),
415            Inner::Some(r) => &r.metadata,
416        }
417    }
418
419    /// Return the object ID of the root dictionary.
420    pub fn root_id(&self) -> ObjectIdentifier {
421        self.trailer_data().root_ref
422    }
423
424    /// Whether the PDF has optional content groups.
425    pub fn has_optional_content_groups(&self) -> bool {
426        match &self.0 {
427            Inner::Dummy => false,
428            Inner::Some(r) => r.has_ocgs,
429        }
430    }
431
432    pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
433        match &self.0 {
434            Inner::Dummy => unimplemented!(),
435            Inner::Some(r) => {
436                let locked = r.map.get();
437                let mut elements = locked
438                    .xref_map
439                    .iter()
440                    .map(|(id, e)| {
441                        let offset = match e {
442                            EntryType::Normal(o) => (*o, 0),
443                            EntryType::ObjStream(id, index) => {
444                                if let Some(EntryType::Normal(offset)) =
445                                    locked.xref_map.get(&ObjectIdentifier::new(*id as i32, 0))
446                                {
447                                    (*offset, *index)
448                                } else {
449                                    (usize::MAX, 0)
450                                }
451                            }
452                        };
453
454                        (*id, offset)
455                    })
456                    .collect::<Vec<_>>();
457
458                // Try to yield in the order the objects appeared in the
459                // PDF.
460                elements.sort_by(|e1, e2| e1.1.cmp(&e2.1));
461
462                let mut iter = elements.into_iter();
463
464                iter::from_fn(move || {
465                    for next in iter.by_ref() {
466                        if let Some(obj) = self.get_with(next.0, &ReaderContext::new(self, false)) {
467                            return Some(obj);
468                        } else {
469                            // Skip invalid objects.
470                            continue;
471                        }
472                    }
473
474                    None
475                })
476            }
477        }
478    }
479
480    pub(crate) fn repair(&self) {
481        let Inner::Some(r) = &self.0 else {
482            unreachable!();
483        };
484
485        let mut locked = r.map.try_put().unwrap();
486        assert!(!locked.repaired);
487
488        let (xref_map, _) = fallback_xref_map(r.data.get(), &r.password);
489        locked.xref_map = xref_map;
490        locked.repaired = true;
491    }
492
493    #[inline]
494    pub(crate) fn needs_decryption(&self, ctx: &ReaderContext<'_>) -> bool {
495        match &self.0 {
496            Inner::Dummy => false,
497            Inner::Some(r) => {
498                if matches!(r.decryptor.as_ref(), Decryptor::None) {
499                    false
500                } else {
501                    !ctx.in_content_stream() && !ctx.in_object_stream()
502                }
503            }
504        }
505    }
506
507    #[inline]
508    pub(crate) fn decrypt(
509        &self,
510        id: ObjectIdentifier,
511        data: &[u8],
512        target: DecryptionTarget,
513    ) -> Option<Vec<u8>> {
514        match &self.0 {
515            Inner::Dummy => Some(data.to_vec()),
516            Inner::Some(r) => r.decryptor.decrypt(id, data, target),
517        }
518    }
519
520    /// Return the object with the given identifier.
521    #[allow(private_bounds)]
522    pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
523    where
524        T: ObjectLike<'a>,
525    {
526        let ctx = ReaderContext::new(self, false);
527        self.get_with(id, &ctx)
528    }
529
530    /// Return the object with the given identifier.
531    #[allow(private_bounds)]
532    pub(crate) fn get_with<'a, T>(
533        &'a self,
534        id: ObjectIdentifier,
535        ctx: &ReaderContext<'a>,
536    ) -> Option<T>
537    where
538        T: ObjectLike<'a>,
539    {
540        let Inner::Some(repr) = &self.0 else {
541            return None;
542        };
543
544        let locked = repr.map.try_get().unwrap();
545
546        let mut r = Reader::new(repr.data.get().as_ref());
547
548        let entry = *locked.xref_map.get(&id).or({
549            // An indirect reference to an undefined object shall not be considered an error by a PDF processor; it
550            // shall be treated as a reference to the null object.
551            None
552        })?;
553        drop(locked);
554
555        let mut ctx = ctx.clone();
556        ctx.set_obj_number(id);
557        ctx.set_in_content_stream(false);
558
559        match entry {
560            EntryType::Normal(offset) => {
561                ctx.set_in_object_stream(false);
562                r.jump(offset);
563
564                if let Some(object) = r.read_with_context::<IndirectObject<T>>(&ctx) {
565                    if object.id() == &id {
566                        return Some(object.get());
567                    }
568                } else {
569                    // There is a valid object at the offset, it's just not of the type the caller
570                    // expected, which is fine.
571                    if r.skip::<IndirectObject<Object<'_>>>(false).is_some() {
572                        return None;
573                    }
574                };
575
576                // The xref table is broken, try to repair if not already repaired.
577                if self.is_repaired() {
578                    error!(
579                        "attempt was made at repairing xref, but object {id:?} still couldn't be read"
580                    );
581
582                    None
583                } else {
584                    warn!("broken xref, attempting to repair");
585
586                    self.repair();
587
588                    // Now try reading again.
589                    self.get_with::<T>(id, &ctx)
590                }
591            }
592            EntryType::ObjStream(obj_stram_gen_num, index) => {
593                // Generation number is implicitly 0.
594                let obj_stream_id = ObjectIdentifier::new(obj_stram_gen_num as i32, 0);
595
596                if obj_stream_id == id {
597                    warn!("cycle detected in object stream");
598
599                    return None;
600                }
601
602                let stream = self.get_with::<Stream<'_>>(obj_stream_id, &ctx)?;
603                let data = repr.data.get_with(obj_stream_id, &ctx)?;
604                let object_stream = ObjectStream::new(stream, data, &ctx)?;
605                object_stream.get(index)
606            }
607        }
608    }
609}
610
611/// An input that is passed to the xref constructor so that we can fully resolve
612/// the PDF.
613#[derive(Debug, Copy, Clone)]
614pub(crate) enum XRefInput<'a> {
615    /// This option is going to be uesd in 99.999% of the case. It contains the
616    /// raw data of the trailer dictionary which is then going to be processed.
617    TrailerDictData(&'a [u8]),
618    /// In case the trailer dictionary could not be read (for example because
619    /// it is cut-off), we just pass the object ID of the root dictionary
620    /// in case we have found one, and try our best to build the PDF just
621    /// with the information we have there.
622    ///
623    /// Note that this won't work if the document is encrypted, as we
624    /// can't access the crypto dictionary.
625    RootRef(ObjectIdentifier),
626}
627
628pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
629    let needle = b"startxref";
630    let pos = findr_needle(data, needle)?;
631    let mut finder = Reader::new(data);
632    finder.jump(pos);
633    finder.forward_tag(needle)?;
634    finder.skip_white_spaces_and_comments();
635    finder.read_without_context::<i32>()?.try_into().ok()
636}
637
/// Where the object described by an xref entry can be found.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum EntryType {
    /// The object is stored at the given offset in the original data.
    Normal(usize),
    /// The object is stored inside an object stream. The first value is the
    /// object number of the _object stream_ (its generation number is always
    /// 0), the second value is the index of the object within that stream.
    ObjStream(u32, u32),
}
648
649type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
650
651/// Representation of a proper xref table.
652#[derive(Debug)]
653struct MapRepr {
654    xref_map: XrefMap,
655    repaired: bool,
656}
657
658#[derive(Debug, Copy, Clone)]
659pub(crate) struct TrailerData {
660    pub(crate) pages_ref: ObjectIdentifier,
661    pub(crate) root_ref: ObjectIdentifier,
662    pub(crate) version: Option<PdfVersion>,
663}
664
665impl TrailerData {
666    pub(crate) fn dummy() -> Self {
667        Self {
668            pages_ref: ObjectIdentifier::new(0, 0),
669            root_ref: ObjectIdentifier::new(0, 0),
670            version: None,
671        }
672    }
673}
674
675#[derive(Debug, Clone)]
676struct SomeRepr {
677    data: Arc<Data>,
678    map: Arc<RwLock<MapRepr>>,
679    metadata: Arc<Metadata>,
680    decryptor: Arc<Decryptor>,
681    has_ocgs: bool,
682    password: Vec<u8>,
683    trailer_data: TrailerData,
684}
685
686#[derive(Debug, Clone)]
687enum Inner {
688    /// A dummy xref table that doesn't have any entries.
689    Dummy,
690    /// A proper xref table.
691    Some(Arc<SomeRepr>),
692}
693
/// One parsed entry of a classic xref table.
#[derive(Debug)]
struct XRefEntry {
    /// Value of the 10-digit first field.
    offset: usize,
    /// Value of the 5-digit second field.
    gen_number: i32,
    /// Whether the entry is marked with `n`.
    used: bool,
}

impl XRefEntry {
    /// Parse a single entry from `data`.
    ///
    /// Bytes `0..10` hold the offset, bytes `11..16` the generation number
    /// and byte `17` the entry type; any other bytes are not inspected.
    ///
    /// Returns `None` if `data` is too short, if either number contains a
    /// byte that is not an ASCII digit, or if a number does not fit into its
    /// target type.
    pub(crate) fn read(data: &[u8]) -> Option<Self> {
        /// Parse a run of ASCII digits, rejecting anything else as well as
        /// values that overflow a `u32`.
        #[inline(always)]
        fn parse_u32(digits: &[u8]) -> Option<u32> {
            digits.iter().try_fold(0_u32, |accum, &byte| {
                if !byte.is_ascii_digit() {
                    return None;
                }

                accum.checked_mul(10)?.checked_add(u32::from(byte - b'0'))
            })
        }

        // Checked slicing: truncated input yields `None` instead of a panic.
        let offset = usize::try_from(parse_u32(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_u32(data.get(11..16)?)?).ok()?;
        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
731
732fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
733    let mut visited = BTreeSet::new();
734    populate_xref_impl_inner(data, pos, xref_map, &mut visited)
735}
736
737/// Maximum number of allowed xref `Prev` pointers before we abort.
738const MAX_XREF_CHAIN_DEPTH: usize = 256;
739
740fn populate_xref_impl_inner<'a>(
741    data: &'a [u8],
742    pos: usize,
743    xref_map: &mut XrefMap,
744    visited: &mut BTreeSet<usize>,
745) -> Option<&'a [u8]> {
746    if !visited.insert(pos) {
747        warn!("circular xref PREV chain detected at offset {}", pos);
748
749        return None;
750    }
751
752    if visited.len() > MAX_XREF_CHAIN_DEPTH {
753        warn!(
754            "xref PREV chain exceeds maximum depth of {}",
755            MAX_XREF_CHAIN_DEPTH
756        );
757
758        return None;
759    }
760
761    let mut reader = Reader::new(data);
762    reader.jump(pos);
763    // In case the position points to before the object number of a xref stream.
764    reader.skip_white_spaces_and_comments();
765
766    let mut r2 = reader.clone();
767    if reader
768        .clone()
769        .read_without_context::<ObjectIdentifier>()
770        .is_some()
771    {
772        populate_from_xref_stream(data, &mut r2, xref_map, visited)
773    } else {
774        populate_from_xref_table(data, &mut r2, xref_map, visited)
775    }
776}
777
778pub(super) struct SubsectionHeader {
779    pub(super) start: u32,
780    pub(super) num_entries: u32,
781}
782
783impl Readable<'_> for SubsectionHeader {
784    fn read(r: &mut Reader<'_>, _: &ReaderContext<'_>) -> Option<Self> {
785        r.skip_white_spaces();
786        let start = r.read_without_context::<u32>()?;
787        r.skip_white_spaces();
788        let num_entries = r.read_without_context::<u32>()?;
789        r.skip_white_spaces();
790
791        Some(Self { start, num_entries })
792    }
793}
794
795/// Populate the xref table, and return the trailer dict.
796fn populate_from_xref_table<'a>(
797    data: &'a [u8],
798    reader: &mut Reader<'a>,
799    insert_map: &mut XrefMap,
800    visited: &mut BTreeSet<usize>,
801) -> Option<&'a [u8]> {
802    let trailer = {
803        let mut reader = reader.clone();
804        read_xref_table_trailer(&mut reader, &ReaderContext::dummy())?
805    };
806
807    reader.skip_white_spaces();
808    reader.forward_tag(b"xref")?;
809    reader.skip_white_spaces();
810
811    let mut max_obj = 0;
812
813    if let Some(prev) = trailer.get::<i32>(PREV) {
814        // First insert the entries from any previous xref tables.
815        populate_xref_impl_inner(data, prev as usize, insert_map, visited)?;
816    }
817
818    // In hybrid files, entries in `XRefStm` should have higher priority, therefore we insert them
819    // after looking at `PREV`.
820    if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
821        populate_xref_impl_inner(data, xref_stm as usize, insert_map, visited)?;
822    }
823
824    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
825        reader.skip_white_spaces();
826
827        let start = header.start;
828        let end = start.checked_add(header.num_entries)?;
829
830        for obj_number in start..end {
831            max_obj = max(max_obj, obj_number);
832            let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
833            let entry = XRefEntry::read(bytes)?;
834
835            // Specification says we should ignore any object number > SIZE, but probably
836            // not important?
837            if entry.used {
838                insert_map.insert(
839                    ObjectIdentifier::new(obj_number as i32, entry.gen_number),
840                    EntryType::Normal(entry.offset),
841                );
842            }
843        }
844    }
845
846    Some(trailer.data())
847}
848
849fn populate_from_xref_stream<'a>(
850    data: &'a [u8],
851    reader: &mut Reader<'a>,
852    insert_map: &mut XrefMap,
853    visited: &mut BTreeSet<usize>,
854) -> Option<&'a [u8]> {
855    let stream = reader
856        .read_with_context::<IndirectObject<Stream<'_>>>(&ReaderContext::dummy())?
857        .get();
858
859    if let Some(prev) = stream.dict().get::<i32>(PREV) {
860        // First insert the entries from any previous xref tables.
861        let _ = populate_xref_impl_inner(data, prev as usize, insert_map, visited)?;
862    }
863
864    let size = stream.dict().get::<u32>(SIZE)?;
865
866    let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
867
868    if f2_len > size_of::<u64>() as u8 {
869        error!("xref offset length is larger than the allowed limit");
870
871        return None;
872    }
873
874    // Do such files exist?
875    if f1_len != 1 {
876        warn!("first field in xref stream was longer than 1");
877    }
878
879    let xref_data = stream.decoded().ok()?;
880    let mut xref_reader = Reader::new(xref_data.as_ref());
881
882    if let Some(arr) = stream.dict().get::<Array<'_>>(INDEX) {
883        let iter = arr.iter::<(u32, u32)>();
884
885        for (start, num_elements) in iter {
886            xref_stream_subsection(
887                &mut xref_reader,
888                start,
889                num_elements,
890                f1_len,
891                f2_len,
892                f3_len,
893                insert_map,
894            )?;
895        }
896    } else {
897        xref_stream_subsection(
898            &mut xref_reader,
899            0,
900            size,
901            f1_len,
902            f2_len,
903            f3_len,
904            insert_map,
905        )?;
906    }
907
908    Some(stream.dict().data())
909}
910
911fn xref_stream_num(data: &[u8]) -> Option<u32> {
912    Some(match data.len() {
913        0 => return None,
914        1 => u8::from_be(data[0]) as u32,
915        2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
916        3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
917        4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
918        8 => {
919            if let Ok(num) = u32::try_from(u64::from_be_bytes(data[0..8].try_into().ok()?)) {
920                return Some(num);
921            } else {
922                warn!("xref stream number is too large");
923
924                return None;
925            }
926        }
927        _n => {
928            warn!("invalid xref stream number {_n}");
929
930            return None;
931        }
932    })
933}
934
935fn xref_stream_subsection<'a>(
936    xref_reader: &mut Reader<'a>,
937    start: u32,
938    num_elements: u32,
939    f1_len: u8,
940    f2_len: u8,
941    f3_len: u8,
942    insert_map: &mut XrefMap,
943) -> Option<()> {
944    for i in 0..num_elements {
945        let f_type = if f1_len == 0 {
946            1
947        } else {
948            // We assume a length of 1.
949            xref_reader.read_bytes(1)?[0]
950        };
951
952        let obj_number = start + i;
953
954        match f_type {
955            // We don't care about free objects.
956            0 => {
957                xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
958            }
959            1 => {
960                let offset = if f2_len > 0 {
961                    let data = xref_reader.read_bytes(f2_len as usize)?;
962                    xref_stream_num(data)?
963                } else {
964                    0
965                };
966
967                let gen_number = if f3_len > 0 {
968                    let data = xref_reader.read_bytes(f3_len as usize)?;
969                    xref_stream_num(data)?
970                } else {
971                    0
972                };
973
974                insert_map.insert(
975                    ObjectIdentifier::new(obj_number as i32, gen_number as i32),
976                    EntryType::Normal(offset as usize),
977                );
978            }
979            2 => {
980                let obj_stream_number = {
981                    let data = xref_reader.read_bytes(f2_len as usize)?;
982                    xref_stream_num(data)?
983                };
984                let gen_number = 0;
985                let index = if f3_len > 0 {
986                    let data = xref_reader.read_bytes(f3_len as usize)?;
987                    xref_stream_num(data)?
988                } else {
989                    0
990                };
991
992                insert_map.insert(
993                    ObjectIdentifier::new(obj_number as i32, gen_number),
994                    EntryType::ObjStream(obj_stream_number, index),
995                );
996            }
997            _ => {
998                warn!("xref has unknown field type {f_type}");
999
1000                return None;
1001            }
1002        }
1003    }
1004
1005    Some(())
1006}
1007
1008fn read_xref_table_trailer<'a>(
1009    reader: &mut Reader<'a>,
1010    ctx: &ReaderContext<'a>,
1011) -> Option<Dict<'a>> {
1012    reader.skip_white_spaces();
1013    reader.forward_tag(b"xref")?;
1014    reader.skip_white_spaces();
1015
1016    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
1017        let len = XREF_ENTRY_LEN.checked_mul(header.num_entries as usize)?;
1018        reader.jump(reader.offset().checked_add(len)?);
1019    }
1020
1021    reader.skip_white_spaces();
1022    reader.forward_tag(b"trailer")?;
1023    reader.skip_white_spaces();
1024
1025    reader.read_with_context::<Dict<'_>>(ctx)
1026}
1027
1028fn get_decryptor(trailer_dict: &Dict<'_>, password: &[u8]) -> Result<Decryptor, XRefError> {
1029    if let Some(encryption_dict) = trailer_dict.get::<Dict<'_>>(ENCRYPT) {
1030        let id = if let Some(id) = trailer_dict
1031            .get::<Array<'_>>(ID)
1032            .and_then(|a| a.flex_iter().next::<object::String<'_>>())
1033        {
1034            id.to_vec()
1035        } else {
1036            // Assume an empty ID entry.
1037            vec![]
1038        };
1039
1040        get(&encryption_dict, &id, password).map_err(XRefError::Encryption)
1041    } else {
1042        Ok(Decryptor::None)
1043    }
1044}
1045
1046struct ObjectStream<'a> {
1047    data: &'a [u8],
1048    ctx: ReaderContext<'a>,
1049    offsets: Vec<(u32, usize)>,
1050}
1051
1052impl<'a> ObjectStream<'a> {
1053    fn new(inner: Stream<'_>, data: &'a [u8], ctx: &ReaderContext<'a>) -> Option<Self> {
1054        let num_objects = inner.dict().get::<usize>(N)?;
1055        let first_offset = inner.dict().get::<usize>(FIRST)?;
1056
1057        let mut r = Reader::new(data);
1058
1059        let mut offsets = vec![];
1060
1061        for _ in 0..num_objects {
1062            r.skip_white_spaces_and_comments();
1063            // Skip object number
1064            let obj_num = r.read_without_context::<u32>()?;
1065            r.skip_white_spaces_and_comments();
1066            let relative_offset = r.read_without_context::<usize>()?;
1067            offsets.push((obj_num, first_offset + relative_offset));
1068        }
1069
1070        let mut ctx = ctx.clone();
1071        ctx.set_in_object_stream(true);
1072
1073        Some(Self { data, ctx, offsets })
1074    }
1075
1076    fn get<T>(&self, index: u32) -> Option<T>
1077    where
1078        T: ObjectLike<'a>,
1079    {
1080        let offset = self.offsets.get(index as usize)?.1;
1081        let mut r = Reader::new(self.data);
1082        r.jump(offset);
1083        r.skip_white_spaces_and_comments();
1084
1085        r.read_with_context::<T>(&self.ctx)
1086    }
1087}
1088
1089fn parse_metadata(info_dict: &Dict<'_>) -> Metadata {
1090    Metadata {
1091        creation_date: info_dict
1092            .get::<object::String<'_>>(CREATION_DATE)
1093            .and_then(|c| DateTime::from_bytes(&c)),
1094        modification_date: info_dict
1095            .get::<object::String<'_>>(MOD_DATE)
1096            .and_then(|c| DateTime::from_bytes(&c)),
1097        title: info_dict
1098            .get::<object::String<'_>>(TITLE)
1099            .map(|t| t.to_vec()),
1100        author: info_dict
1101            .get::<object::String<'_>>(AUTHOR)
1102            .map(|t| t.to_vec()),
1103        subject: info_dict
1104            .get::<object::String<'_>>(SUBJECT)
1105            .map(|t| t.to_vec()),
1106        keywords: info_dict
1107            .get::<object::String<'_>>(KEYWORDS)
1108            .map(|t| t.to_vec()),
1109        creator: info_dict
1110            .get::<object::String<'_>>(CREATOR)
1111            .map(|t| t.to_vec()),
1112        producer: info_dict
1113            .get::<object::String<'_>>(PRODUCER)
1114            .map(|t| t.to_vec()),
1115    }
1116}
1117
#[cfg(test)]
mod tests {
    use super::*;

    /// A trailer whose `Prev` entry points back at its own xref table must not make the
    /// population routine loop forever; the test passes as soon as the call returns.
    #[test]
    fn circular_prev_chain() {
        let body = b"%PDF-1.0\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n";
        // The xref table starts right after the body.
        let expected_xref_pos = body.len();
        let xref_section = format!(
            "xref\n\
                 0 1\n\
                 0000000000 65535 f \r\n\
                 trailer\n<< /Size 1 /Root 1 0 R /Prev {expected_xref_pos} >>\n\
                 startxref\n{expected_xref_pos}\n%%EOF"
        );
        let pdf = [&body[..], xref_section.as_bytes()].concat();

        let mut entries = FxHashMap::default();
        let start = find_last_xref_pos(pdf.as_slice()).unwrap();
        let _ = populate_xref_impl(pdf.as_slice(), start, &mut entries);
    }

    /// When several `startxref` markers exist, the position after the last one wins.
    #[test]
    fn find_last_xref_uses_last_startxref() {
        let pdf = b"%PDF-1.0\nstartxref\n5\n%%EOF\nstartxref\n42\n%%EOF";
        let found = find_last_xref_pos(pdf);
        assert_eq!(found, Some(42));
    }

    /// A subsection header with an absurdly large entry count must yield `None` instead of
    /// a trailer.
    #[test]
    fn xref_table_trailer_rejects_overflowing_entry_skip() {
        let data = b"xref\n0 999999999999999999999\ntrailer\n<<>>";
        let mut reader = Reader::new(data);
        let trailer = read_xref_table_trailer(&mut reader, &ReaderContext::dummy());
        assert!(trailer.is_none());
    }
}