hayro_syntax/
xref.rs

1//! Reading and querying the xref table of a PDF file.
2
3use crate::crypto::{DecryptionError, DecryptionTarget, Decryptor, get};
4use crate::data::Data;
5use crate::metadata::Metadata;
6use crate::object::Name;
7use crate::object::ObjectIdentifier;
8use crate::object::Stream;
9use crate::object::dict::keys::{
10    AUTHOR, CREATION_DATE, CREATOR, ENCRYPT, FIRST, ID, INDEX, INFO, KEYWORDS, MOD_DATE, N,
11    OCPROPERTIES, PAGES, PREV, PRODUCER, ROOT, SIZE, SUBJECT, TITLE, TYPE, VERSION, W, XREF_STM,
12};
13use crate::object::indirect::IndirectObject;
14use crate::object::{Array, MaybeRef};
15use crate::object::{DateTime, Dict};
16use crate::object::{Object, ObjectLike};
17use crate::pdf::PdfVersion;
18use crate::reader::Reader;
19use crate::reader::{Readable, ReaderContext, ReaderExt};
20use crate::{PdfData, object};
21use log::{error, warn};
22use rustc_hash::FxHashMap;
23use std::cmp::max;
24use std::iter;
25use std::ops::Deref;
26use std::sync::{Arc, RwLock};
27
/// Number of bytes read for a single entry of a classic xref table.
pub(crate) const XREF_ENTRY_LEN: usize = 20;
29
/// Errors that can be reported while building an [`XRef`].
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefError {
    /// The xref data or the trailer could not be located or parsed.
    Unknown,
    /// An encryption-related failure, carrying the underlying [`DecryptionError`].
    Encryption(DecryptionError),
}
35
36/// Parse the "root" xref from the PDF.
37pub(crate) fn root_xref(data: PdfData, password: &[u8]) -> Result<XRef, XRefError> {
38    let mut xref_map = FxHashMap::default();
39    let xref_pos = find_last_xref_pos(data.as_ref().as_ref()).ok_or(XRefError::Unknown)?;
40    let trailer = populate_xref_impl(data.as_ref().as_ref(), xref_pos, &mut xref_map)
41        .ok_or(XRefError::Unknown)?;
42
43    XRef::new(
44        data.clone(),
45        xref_map,
46        XRefInput::TrailerDictData(trailer),
47        false,
48        password,
49    )
50}
51
52/// Try to manually parse the PDF to build an xref table and trailer dictionary.
53pub(crate) fn fallback(data: PdfData, password: &[u8]) -> Option<XRef> {
54    warn!("xref table was invalid, trying to manually build xref table");
55    let (xref_map, xref_input) = fallback_xref_map(&data, password);
56
57    if let Some(xref_input) = xref_input {
58        warn!("rebuild xref table with {} entries", xref_map.len());
59
60        XRef::new(data.clone(), xref_map, xref_input, true, password).ok()
61    } else {
62        warn!("couldn't find trailer dictionary, failed to rebuild xref table");
63
64        None
65    }
66}
67
/// Scan `data` for objects and a trailer, starting with a dummy reader context and
/// allowing one retry with a decrypting context (see `fallback_xref_map_inner`).
fn fallback_xref_map<'a>(data: &'a PdfData, password: &[u8]) -> (XrefMap, Option<XRefInput<'a>>) {
    fallback_xref_map_inner(data, ReaderContext::dummy(), true, password)
}
71
/// Walk over the whole file and collect every indirect object and trailer-like
/// dictionary that can be found, without relying on the file's own xref data.
///
/// * `dummy_ctx` - reader context used for all reads; its `obj_number` is updated
///   as objects are discovered.
/// * `recurse` - whether a second pass with a decrypting context may be started
///   when the chosen trailer dictionary contains an `Encrypt` entry.
/// * `password` - forwarded to [`XRef::new`] for that second pass.
///
/// Returns the rebuilt map together with the best available [`XRefInput`]: the
/// validated trailer dictionary if one was found, otherwise the id of the last
/// object seen before a `Catalog` dictionary, otherwise `None`.
fn fallback_xref_map_inner<'a>(
    data: &'a PdfData,
    mut dummy_ctx: ReaderContext<'a>,
    recurse: bool,
    password: &[u8],
) -> (XrefMap, Option<XRefInput<'a>>) {
    let mut xref_map = FxHashMap::default();
    let mut trailer_dicts = vec![];
    let mut root_ref = None;

    let mut r = Reader::new(data.as_ref().as_ref());

    let mut last_obj_num = None;

    loop {
        let cur_pos = r.offset();

        // Snapshot of the reader before anything is consumed, so that the same bytes
        // can be re-read as a stream further below.
        let mut old_r = r.clone();

        if let Some(obj_id) = r.read::<ObjectIdentifier>(&dummy_ctx) {
            let mut cloned = r.clone();
            // Check that the object following it is actually valid before inserting it.
            cloned.skip_white_spaces_and_comments();
            if cloned.skip::<Object<'_>>(false).is_some() {
                xref_map.insert(obj_id, EntryType::Normal(cur_pos));
                last_obj_num = Some(obj_id);
                dummy_ctx.obj_number = Some(obj_id);
            }
        } else if let Some(dict) = r.read::<Dict<'_>>(&dummy_ctx) {
            // Any dictionary with a `Root` key is a trailer candidate.
            if dict.contains_key(ROOT) {
                trailer_dicts.push(dict.clone());
            }

            // Remember the object that holds the catalog, as a last resort in case no
            // trailer dictionary can be validated.
            if dict
                .get::<Name<'_>>(TYPE)
                .is_some_and(|n| n.as_str() == "Catalog")
            {
                root_ref = last_obj_num;
            }

            // If the dictionary belongs to an object stream, register the objects that
            // are stored inside of it.
            if let Some(stream) = old_r.read::<Stream<'_>>(&dummy_ctx)
                && stream.dict().get::<Name<'_>>(TYPE).as_deref() == Some(b"ObjStm")
                && let Some(data) = stream.decoded().ok()
                && let Some(last_obj_num) = last_obj_num
                && let Some(obj_stream) = ObjectStream::new(stream, &data, &dummy_ctx)
            {
                for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
                    let id = ObjectIdentifier::new(*obj_num as i32, 0);
                    // If we already found an entry for that object number that was not
                    // inside an object stream. Somewhat arbitrary and maybe
                    // we can do better, but that seems to work for the current
                    // set of tests.
                    if xref_map
                        .get(&id)
                        .is_none_or(|e| !matches!(e, &EntryType::Normal(_)))
                    {
                        xref_map.insert(
                            id,
                            EntryType::ObjStream(last_obj_num.obj_num as u32, idx as u32),
                        );
                    }
                }
            }
        } else {
            // Neither an object header nor a dictionary: advance by one byte and retry.
            r.read_byte();
        }

        if r.at_end() {
            break;
        }
    }

    // Try to choose the right trailer dict by doing basic validation.
    let mut trailer_dict = None;

    // Later candidates overwrite earlier ones, so the last valid candidate wins.
    for dict in trailer_dicts {
        if let Some(root_id) = dict.get_raw::<Dict<'_>>(ROOT) {
            // A root dictionary is considered plausible if it has a `Pages` entry.
            let check = |dict: &Dict<'_>| -> bool { dict.contains_key(PAGES) };

            match root_id {
                MaybeRef::Ref(r) => match xref_map.get(&r.into()) {
                    Some(EntryType::Normal(offset)) => {
                        let mut reader = Reader::new(&data.as_ref().as_ref()[*offset..]);

                        if let Some(obj) =
                            reader.read_with_context::<IndirectObject<Dict<'_>>>(&dummy_ctx)
                            && check(&obj.clone().get())
                        {
                            trailer_dict = Some(dict);
                        }
                    }
                    Some(EntryType::ObjStream(obj_num, idx)) => {
                        // The root lives inside an object stream: load that stream first
                        // and then look the root up by its index.
                        if let Some(EntryType::Normal(offset)) =
                            xref_map.get(&ObjectIdentifier::new(*obj_num as i32, 0))
                        {
                            let mut reader = Reader::new(&data.as_ref().as_ref()[*offset..]);

                            if let Some(stream) =
                                reader.read_with_context::<IndirectObject<Stream<'_>>>(&dummy_ctx)
                                && let Some(data) = stream.clone().get().decoded().ok()
                                && let Some(object_stream) =
                                    ObjectStream::new(stream.get(), &data, &dummy_ctx)
                                && let Some(obj) = object_stream.get::<Dict<'_>>(*idx)
                                && check(&obj)
                            {
                                trailer_dict = Some(dict);
                            }
                        }
                    }
                    _ => {}
                },
                MaybeRef::NotRef(d) => {
                    if check(&d) {
                        trailer_dict = Some(dict);
                    }
                }
            }
        }
    }

    let has_encryption = trailer_dict
        .as_ref()
        .is_some_and(|t| t.contains_key(ENCRYPT));

    if has_encryption && recurse {
        // The problem is that in this case, we have used a dummy reader context which does not have
        // a decryptor. Therefore, we were unable to decrypt any of the object streams and missed
        // all objects that are inside of such a stream. Therefore, we need to redo the process
        // using a `ReaderContext` that does have the ability to decrypt.
        //
        // The `unwrap` below cannot fail: `has_encryption` is only true when
        // `trailer_dict` is `Some`. The second pass is run with `recurse == false`.
        if let Ok(xref) = XRef::new(
            data.clone(),
            xref_map.clone(),
            XRefInput::TrailerDictData(trailer_dict.as_ref().map(|d| d.data()).unwrap()),
            true,
            password,
        ) {
            let ctx = ReaderContext::new(&xref, false);
            let (patched_map, _) = fallback_xref_map_inner(data, ctx, false, password);
            xref_map = patched_map;
        }
    }

    if let Some(trailer_dict_data) = trailer_dict.map(|d| d.data()) {
        (
            xref_map,
            Some(XRefInput::TrailerDictData(trailer_dict_data)),
        )
    } else if let Some(root_ref) = root_ref {
        (xref_map, Some(XRefInput::RootRef(root_ref)))
    } else {
        (xref_map, None)
    }
}
225
/// Shared instance of the dummy xref, handed out by [`XRef::dummy`].
static DUMMY_XREF: &XRef = &XRef(Inner::Dummy);
227
/// An xref table.
///
/// Maps object identifiers to the location of the corresponding object and gives
/// access to the information extracted from the trailer. Cloning only clones the
/// inner representation, which is either a dummy marker or an `Arc`.
#[derive(Debug, Clone)]
pub struct XRef(Inner);
231
232impl XRef {
    /// Create a new xref table from an entry map and the trailer information.
    ///
    /// * `data` - the raw PDF data.
    /// * `xref_map` - the object entries collected so far.
    /// * `input` - either the raw bytes of the trailer dictionary or the id of the
    ///   root dictionary.
    /// * `repaired` - whether `xref_map` was already produced by the fallback scanner.
    /// * `password` - password handed to `get_decryptor`; also stored for later repairs.
    ///
    /// Returns [`XRefError::Unknown`] if the trailer dictionary cannot be read, or if
    /// the root dictionary or its `Pages` reference cannot be resolved; errors from
    /// `get_decryptor` are passed through.
    fn new(
        data: PdfData,
        xref_map: XrefMap,
        input: XRefInput<'_>,
        repaired: bool,
        password: &[u8],
    ) -> Result<Self, XRefError> {
        // This is a bit hacky, but the problem is we can't read the resolved trailer dictionary
        // before we actually created the xref struct. So we first create it using dummy data
        // and then populate the data.
        let trailer_data = TrailerData::dummy();

        let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
            data: Arc::new(Data::new(data)),
            map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
            decryptor: Arc::new(Decryptor::None),
            has_ocgs: false,
            metadata: Arc::new(Metadata::default()),
            trailer_data,
            password: password.to_vec(),
        })));

        // We read the trailer twice, once to determine the encryption used and then a second
        // time to resolve the catalog dictionary, etc. This allows us to support catalog dictionaries
        // that are stored in an encrypted object stream.

        // First pass: only determine the decryptor.
        let decryptor = {
            match input {
                XRefInput::TrailerDictData(trailer_dict_data) => {
                    let mut r = Reader::new(trailer_dict_data);

                    let trailer_dict = r
                        .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
                        .ok_or(XRefError::Unknown)?;

                    get_decryptor(&trailer_dict, password)?
                }
                // Without a trailer dictionary there is no way to set up decryption.
                XRefInput::RootRef(_) => Decryptor::None,
            }
        };

        // Install the decryptor so that the second pass below can already use it.
        match &mut xref.0 {
            Inner::Dummy => unreachable!(),
            Inner::Some(r) => {
                let mutable = Arc::make_mut(r);
                mutable.decryptor = Arc::new(decryptor.clone());
            }
        }

        // Second pass: resolve the root dictionary and everything derived from it.
        let (trailer_data, has_ocgs, metadata) = match input {
            XRefInput::TrailerDictData(trailer_dict_data) => {
                let mut r = Reader::new(trailer_dict_data);

                let trailer_dict = r
                    .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
                    .ok_or(XRefError::Unknown)?;

                let root_ref = trailer_dict.get_ref(ROOT).ok_or(XRefError::Unknown)?;
                let root = trailer_dict
                    .get::<Dict<'_>>(ROOT)
                    .ok_or(XRefError::Unknown)?;
                let metadata = trailer_dict
                    .get::<Dict<'_>>(INFO)
                    .map(|d| parse_metadata(&d))
                    .unwrap_or_default();
                let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
                let has_ocgs = root.get::<Dict<'_>>(OCPROPERTIES).is_some();
                let version = root
                    .get::<Name<'_>>(VERSION)
                    .and_then(|v| PdfVersion::from_bytes(v.deref()));

                let td = TrailerData {
                    pages_ref: pages_ref.into(),
                    root_ref: root_ref.into(),
                    version,
                };

                (td, has_ocgs, metadata)
            }
            XRefInput::RootRef(root_ref) => {
                // Only the root is known: no metadata, no version and no
                // optional-content information are recorded in this case.
                let root = xref.get::<Dict<'_>>(root_ref).ok_or(XRefError::Unknown)?;
                let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;

                let td = TrailerData {
                    pages_ref: pages_ref.into(),
                    root_ref,
                    version: None,
                };

                (td, false, Metadata::default())
            }
        };

        // Replace the placeholder values with the resolved ones.
        match &mut xref.0 {
            Inner::Dummy => unreachable!(),
            Inner::Some(r) => {
                let mutable = Arc::make_mut(r);
                mutable.trailer_data = trailer_data;
                mutable.decryptor = Arc::new(decryptor);
                mutable.has_ocgs = has_ocgs;
                mutable.metadata = Arc::new(metadata);
            }
        }

        Ok(xref)
    }
339
340    fn is_repaired(&self) -> bool {
341        match &self.0 {
342            Inner::Dummy => false,
343            Inner::Some(r) => {
344                let locked = r.map.read().unwrap();
345                locked.repaired
346            }
347        }
348    }
349
    /// Return the shared dummy xref, which does not contain any entries.
    pub(crate) fn dummy() -> &'static Self {
        DUMMY_XREF
    }
353
354    pub(crate) fn len(&self) -> usize {
355        match &self.0 {
356            Inner::Dummy => 0,
357            Inner::Some(r) => r.map.read().unwrap().xref_map.len(),
358        }
359    }
360
361    pub(crate) fn trailer_data(&self) -> &TrailerData {
362        match &self.0 {
363            Inner::Dummy => unreachable!(),
364            Inner::Some(r) => &r.trailer_data,
365        }
366    }
367
368    pub(crate) fn metadata(&self) -> &Metadata {
369        match &self.0 {
370            Inner::Dummy => unreachable!(),
371            Inner::Some(r) => &r.metadata,
372        }
373    }
374
    /// Return the object ID of the root dictionary.
    ///
    /// # Panics
    ///
    /// Panics when called on the dummy xref, because it has no trailer data.
    pub fn root_id(&self) -> ObjectIdentifier {
        self.trailer_data().root_ref
    }
379
380    /// Whether the PDF has optional content groups.
381    pub fn has_optional_content_groups(&self) -> bool {
382        match &self.0 {
383            Inner::Dummy => false,
384            Inner::Some(r) => r.has_ocgs,
385        }
386    }
387
388    pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
389        match &self.0 {
390            Inner::Dummy => unimplemented!(),
391            Inner::Some(r) => {
392                let locked = r.map.read().unwrap();
393                let mut elements = locked
394                    .xref_map
395                    .iter()
396                    .map(|(id, e)| {
397                        let offset = match e {
398                            EntryType::Normal(o) => (*o, 0),
399                            EntryType::ObjStream(id, index) => {
400                                if let Some(EntryType::Normal(offset)) =
401                                    locked.xref_map.get(&ObjectIdentifier::new(*id as i32, 0))
402                                {
403                                    (*offset, *index)
404                                } else {
405                                    (usize::MAX, 0)
406                                }
407                            }
408                        };
409
410                        (*id, offset)
411                    })
412                    .collect::<Vec<_>>();
413
414                // Try to yield in the order the objects appeared in the
415                // PDF.
416                elements.sort_by(|e1, e2| e1.1.cmp(&e2.1));
417
418                let mut iter = elements.into_iter();
419
420                iter::from_fn(move || {
421                    for next in iter.by_ref() {
422                        if let Some(obj) = self.get_with(next.0, &ReaderContext::new(self, false)) {
423                            return Some(obj);
424                        } else {
425                            // Skip invalid objects.
426                            continue;
427                        }
428                    }
429
430                    None
431                })
432            }
433        }
434    }
435
    /// Replace the entry map with one rebuilt by scanning the file, and mark the
    /// xref as repaired.
    ///
    /// # Panics
    ///
    /// Panics when called on the dummy xref, when the write lock on the map cannot be
    /// acquired immediately, or when the xref is already marked as repaired.
    pub(crate) fn repair(&self) {
        let Inner::Some(r) = &self.0 else {
            unreachable!();
        };

        let mut locked = r.map.try_write().unwrap();
        assert!(!locked.repaired);

        // Only the map is taken over; the trailer input found by the scan is discarded.
        let (xref_map, _) = fallback_xref_map(r.data.get(), &r.password);
        locked.xref_map = xref_map;
        locked.repaired = true;
    }
448
449    #[inline]
450    pub(crate) fn needs_decryption(&self, ctx: &ReaderContext<'_>) -> bool {
451        match &self.0 {
452            Inner::Dummy => false,
453            Inner::Some(r) => {
454                if matches!(r.decryptor.as_ref(), Decryptor::None) {
455                    false
456                } else {
457                    !ctx.in_content_stream && !ctx.in_object_stream
458                }
459            }
460        }
461    }
462
463    #[inline]
464    pub(crate) fn decrypt(
465        &self,
466        id: ObjectIdentifier,
467        data: &[u8],
468        target: DecryptionTarget,
469    ) -> Option<Vec<u8>> {
470        match &self.0 {
471            Inner::Dummy => Some(data.to_vec()),
472            Inner::Some(r) => r.decryptor.decrypt(id, data, target),
473        }
474    }
475
476    /// Return the object with the given identifier.
477    #[allow(private_bounds)]
478    pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
479    where
480        T: ObjectLike<'a>,
481    {
482        let ctx = ReaderContext::new(self, false);
483        self.get_with(id, &ctx)
484    }
485
486    /// Return the object with the given identifier.
487    #[allow(private_bounds)]
488    pub(crate) fn get_with<'a, T>(
489        &'a self,
490        id: ObjectIdentifier,
491        ctx: &ReaderContext<'a>,
492    ) -> Option<T>
493    where
494        T: ObjectLike<'a>,
495    {
496        let Inner::Some(repr) = &self.0 else {
497            return None;
498        };
499
500        let locked = repr.map.try_read().unwrap();
501
502        let mut r = Reader::new(repr.data.get().as_ref().as_ref());
503
504        let entry = *locked.xref_map.get(&id).or({
505            // An indirect reference to an undefined object shall not be considered an error by a PDF processor; it
506            // shall be treated as a reference to the null object.
507            None
508        })?;
509        drop(locked);
510
511        let mut ctx = ctx.clone();
512        ctx.obj_number = Some(id);
513        ctx.in_content_stream = false;
514
515        match entry {
516            EntryType::Normal(offset) => {
517                ctx.in_object_stream = false;
518                r.jump(offset);
519
520                if let Some(object) = r.read_with_context::<IndirectObject<T>>(&ctx) {
521                    if object.id() == &id {
522                        return Some(object.get());
523                    }
524                } else {
525                    // There is a valid object at the offset, it's just not of the type the caller
526                    // expected, which is fine.
527                    if r.skip_not_in_content_stream::<IndirectObject<Object<'_>>>()
528                        .is_some()
529                    {
530                        return None;
531                    }
532                };
533
534                // The xref table is broken, try to repair if not already repaired.
535                if self.is_repaired() {
536                    error!(
537                        "attempt was made at repairing xref, but object {id:?} still couldn't be read"
538                    );
539
540                    None
541                } else {
542                    warn!("broken xref, attempting to repair");
543
544                    self.repair();
545
546                    // Now try reading again.
547                    self.get_with::<T>(id, &ctx)
548                }
549            }
550            EntryType::ObjStream(obj_stram_gen_num, index) => {
551                // Generation number is implicitly 0.
552                let obj_stream_id = ObjectIdentifier::new(obj_stram_gen_num as i32, 0);
553
554                if obj_stream_id == id {
555                    warn!("cycle detected in object stream");
556
557                    return None;
558                }
559
560                let stream = self.get_with::<Stream<'_>>(obj_stream_id, &ctx)?;
561                let data = repr.data.get_with(obj_stream_id, &ctx)?;
562                let object_stream = ObjectStream::new(stream, data, &ctx)?;
563                object_stream.get(index)
564            }
565        }
566    }
567}
568
/// An input that is passed to the xref constructor so that we can fully resolve
/// the PDF.
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefInput<'a> {
    /// This option is going to be used in 99.999% of cases. It contains the
    /// raw data of the trailer dictionary which is then going to be processed.
    TrailerDictData(&'a [u8]),
    /// In case the trailer dictionary could not be read (for example because
    /// it is cut off), we just pass the object ID of the root dictionary
    /// in case we have found one, and try our best to build the PDF just
    /// with the information we have there.
    ///
    /// Note that this won't work if the document is encrypted, as we
    /// can't access the crypto dictionary.
    RootRef(ObjectIdentifier),
}
585
586pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
587    let mut finder = Reader::new(data);
588    let mut pos = finder.len().checked_sub(1)?;
589    finder.jump(pos);
590
591    let needle = b"startxref";
592
593    loop {
594        if finder.forward_tag(needle).is_some() {
595            finder.skip_white_spaces_and_comments();
596
597            let offset = finder.read_without_context::<i32>()?.try_into().ok()?;
598
599            return Some(offset);
600        }
601
602        pos = pos.checked_sub(1)?;
603        finder.jump(pos);
604    }
605}
606
/// A type of xref entry, describing where an object can be found.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum EntryType {
    /// An indirect object that is at a specific offset in the original data.
    Normal(usize),
    /// An indirect object that is part of an object stream. First number indicates the object
    /// number of the _object stream_ (the generation number is always 0), the second number indicates
    /// the index in the object stream.
    ObjStream(u32, u32),
}
617
/// Map from an object identifier to the entry describing where that object is stored.
type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
619
/// Representation of a proper xref table.
#[derive(Debug)]
struct MapRepr {
    /// The entries of the table.
    xref_map: XrefMap,
    /// Whether the entries were produced by the fallback scanner.
    repaired: bool,
}
626
627#[derive(Debug, Copy, Clone)]
628pub(crate) struct TrailerData {
629    pub(crate) pages_ref: ObjectIdentifier,
630    pub(crate) root_ref: ObjectIdentifier,
631    pub(crate) version: Option<PdfVersion>,
632}
633
634impl TrailerData {
635    pub(crate) fn dummy() -> Self {
636        Self {
637            pages_ref: ObjectIdentifier::new(0, 0),
638            root_ref: ObjectIdentifier::new(0, 0),
639            version: None,
640        }
641    }
642}
643
/// The state behind a proper (non-dummy) [`XRef`].
#[derive(Debug, Clone)]
struct SomeRepr {
    /// The underlying PDF data.
    data: Arc<Data>,
    /// The entry map; behind a lock because `XRef::repair` replaces it through `&self`.
    map: Arc<RwLock<MapRepr>>,
    /// Document metadata parsed from the trailer's `Info` dictionary, if present.
    metadata: Arc<Metadata>,
    /// The decryptor determined from the trailer dictionary.
    decryptor: Arc<Decryptor>,
    /// Whether the root dictionary has an `OCProperties` dictionary.
    has_ocgs: bool,
    /// The password given at construction; reused when the map is repaired.
    password: Vec<u8>,
    /// Information resolved from the trailer and root dictionary.
    trailer_data: TrailerData,
}
654
/// The two possible representations of an [`XRef`].
#[derive(Debug, Clone)]
enum Inner {
    /// A dummy xref table that doesn't have any entries.
    Dummy,
    /// A proper xref table.
    Some(Arc<SomeRepr>),
}
662
/// A single parsed entry of a classic xref table.
#[derive(Debug)]
struct XRefEntry {
    /// Value of the 10-digit offset field.
    offset: usize,
    /// Value of the 5-digit generation number field.
    gen_number: i32,
    /// Whether the entry is marked with `n` (in use).
    used: bool,
}

impl XRefEntry {
    /// Parse an entry of the form `oooooooooo ggggg n`, followed by an end-of-line marker.
    ///
    /// Bytes 0..10 hold the offset, bytes 11..16 the generation number and byte 17
    /// the in-use marker; the separator bytes in between are not inspected.
    ///
    /// Returns `None` if `data` is too short, if a numeric field contains anything
    /// but ASCII digits, or if a number does not fit into its target type.
    pub(crate) fn read(data: &[u8]) -> Option<Self> {
        #[inline(always)]
        fn parse_u32(digits: &[u8]) -> Option<u32> {
            digits.iter().try_fold(0_u32, |accum, &byte| {
                if !byte.is_ascii_digit() {
                    return None;
                }

                accum
                    .checked_mul(10)?
                    .checked_add(u32::from(byte - b'0'))
            })
        }

        // Use checked accessors so that a truncated entry yields `None` instead of a panic.
        let offset = parse_u32(data.get(0..10)?)? as usize;
        let gen_number = i32::try_from(parse_u32(data.get(11..16)?)?).ok()?;

        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
700
701fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
702    let mut reader = Reader::new(data);
703    reader.jump(pos);
704    // In case the position points to before the object number of a xref stream.
705    reader.skip_white_spaces_and_comments();
706
707    let mut r2 = reader.clone();
708    if reader
709        .clone()
710        .read_without_context::<ObjectIdentifier>()
711        .is_some()
712    {
713        populate_from_xref_stream(data, &mut r2, xref_map)
714    } else {
715        populate_from_xref_table(data, &mut r2, xref_map)
716    }
717}
718
719pub(super) struct SubsectionHeader {
720    pub(super) start: u32,
721    pub(super) num_entries: u32,
722}
723
724impl Readable<'_> for SubsectionHeader {
725    fn read(r: &mut Reader<'_>, _: &ReaderContext<'_>) -> Option<Self> {
726        r.skip_white_spaces();
727        let start = r.read_without_context::<u32>()?;
728        r.skip_white_spaces();
729        let num_entries = r.read_without_context::<u32>()?;
730        r.skip_white_spaces();
731
732        Some(Self { start, num_entries })
733    }
734}
735
736/// Populate the xref table, and return the trailer dict.
737fn populate_from_xref_table<'a>(
738    data: &'a [u8],
739    reader: &mut Reader<'a>,
740    insert_map: &mut XrefMap,
741) -> Option<&'a [u8]> {
742    let trailer = {
743        let mut reader = reader.clone();
744        read_xref_table_trailer(&mut reader, &ReaderContext::dummy())?
745    };
746
747    reader.skip_white_spaces();
748    reader.forward_tag(b"xref")?;
749    reader.skip_white_spaces();
750
751    let mut max_obj = 0;
752
753    if let Some(prev) = trailer.get::<i32>(PREV) {
754        // First insert the entries from any previous xref tables.
755        populate_xref_impl(data, prev as usize, insert_map)?;
756    }
757
758    // In hybrid files, entries in `XRefStm` should have higher priority, therefore we insert them
759    // after looking at `PREV`.
760    if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
761        populate_xref_impl(data, xref_stm as usize, insert_map)?;
762    }
763
764    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
765        reader.skip_white_spaces();
766
767        let start = header.start;
768        let end = start + header.num_entries;
769
770        for obj_number in start..end {
771            max_obj = max(max_obj, obj_number);
772            let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
773            let entry = XRefEntry::read(bytes)?;
774
775            // Specification says we should ignore any object number > SIZE, but probably
776            // not important?
777            if entry.used {
778                insert_map.insert(
779                    ObjectIdentifier::new(obj_number as i32, entry.gen_number),
780                    EntryType::Normal(entry.offset),
781                );
782            }
783        }
784    }
785
786    Some(trailer.data())
787}
788
789fn populate_from_xref_stream<'a>(
790    data: &'a [u8],
791    reader: &mut Reader<'a>,
792    insert_map: &mut XrefMap,
793) -> Option<&'a [u8]> {
794    let stream = reader
795        .read_with_context::<IndirectObject<Stream<'_>>>(&ReaderContext::dummy())?
796        .get();
797
798    if let Some(prev) = stream.dict().get::<i32>(PREV) {
799        // First insert the entries from any previous xref tables.
800        let _ = populate_xref_impl(data, prev as usize, insert_map)?;
801    }
802
803    let size = stream.dict().get::<u32>(SIZE)?;
804
805    let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
806
807    if f2_len > size_of::<u64>() as u8 {
808        error!("xref offset length is larger than the allowed limit");
809
810        return None;
811    }
812
813    // Do such files exist?
814    if f1_len != 1 {
815        warn!("first field in xref stream was longer than 1");
816    }
817
818    let xref_data = stream.decoded().ok()?;
819    let mut xref_reader = Reader::new(xref_data.as_ref());
820
821    if let Some(arr) = stream.dict().get::<Array<'_>>(INDEX) {
822        let iter = arr.iter::<(u32, u32)>();
823
824        for (start, num_elements) in iter {
825            xref_stream_subsection(
826                &mut xref_reader,
827                start,
828                num_elements,
829                f1_len,
830                f2_len,
831                f3_len,
832                insert_map,
833            )?;
834        }
835    } else {
836        xref_stream_subsection(
837            &mut xref_reader,
838            0,
839            size,
840            f1_len,
841            f2_len,
842            f3_len,
843            insert_map,
844        )?;
845    }
846
847    Some(stream.dict().data())
848}
849
850fn xref_stream_num(data: &[u8]) -> Option<u32> {
851    Some(match data.len() {
852        0 => return None,
853        1 => u8::from_be(data[0]) as u32,
854        2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
855        3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
856        4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
857        8 => {
858            if let Ok(num) = u32::try_from(u64::from_be_bytes(data[0..8].try_into().ok()?)) {
859                return Some(num);
860            } else {
861                warn!("xref stream number is too large");
862
863                return None;
864            }
865        }
866        n => {
867            warn!("invalid xref stream number {n}");
868
869            return None;
870        }
871    })
872}
873
874fn xref_stream_subsection<'a>(
875    xref_reader: &mut Reader<'a>,
876    start: u32,
877    num_elements: u32,
878    f1_len: u8,
879    f2_len: u8,
880    f3_len: u8,
881    insert_map: &mut XrefMap,
882) -> Option<()> {
883    for i in 0..num_elements {
884        let f_type = if f1_len == 0 {
885            1
886        } else {
887            // We assume a length of 1.
888            xref_reader.read_bytes(1)?[0]
889        };
890
891        let obj_number = start + i;
892
893        match f_type {
894            // We don't care about free objects.
895            0 => {
896                xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
897            }
898            1 => {
899                let offset = if f2_len > 0 {
900                    let data = xref_reader.read_bytes(f2_len as usize)?;
901                    xref_stream_num(data)?
902                } else {
903                    0
904                };
905
906                let gen_number = if f3_len > 0 {
907                    let data = xref_reader.read_bytes(f3_len as usize)?;
908                    xref_stream_num(data)?
909                } else {
910                    0
911                };
912
913                insert_map.insert(
914                    ObjectIdentifier::new(obj_number as i32, gen_number as i32),
915                    EntryType::Normal(offset as usize),
916                );
917            }
918            2 => {
919                let obj_stream_number = {
920                    let data = xref_reader.read_bytes(f2_len as usize)?;
921                    xref_stream_num(data)?
922                };
923                let gen_number = 0;
924                let index = if f3_len > 0 {
925                    let data = xref_reader.read_bytes(f3_len as usize)?;
926                    xref_stream_num(data)?
927                } else {
928                    0
929                };
930
931                insert_map.insert(
932                    ObjectIdentifier::new(obj_number as i32, gen_number),
933                    EntryType::ObjStream(obj_stream_number, index),
934                );
935            }
936            _ => {
937                warn!("xref has unknown field type {f_type}");
938
939                return None;
940            }
941        }
942    }
943
944    Some(())
945}
946
947fn read_xref_table_trailer<'a>(
948    reader: &mut Reader<'a>,
949    ctx: &ReaderContext<'a>,
950) -> Option<Dict<'a>> {
951    reader.skip_white_spaces();
952    reader.forward_tag(b"xref")?;
953    reader.skip_white_spaces();
954
955    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
956        reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
957    }
958
959    reader.skip_white_spaces();
960    reader.forward_tag(b"trailer")?;
961    reader.skip_white_spaces();
962
963    reader.read_with_context::<Dict<'_>>(ctx)
964}
965
966fn get_decryptor(trailer_dict: &Dict<'_>, password: &[u8]) -> Result<Decryptor, XRefError> {
967    if let Some(encryption_dict) = trailer_dict.get::<Dict<'_>>(ENCRYPT) {
968        let id = if let Some(id) = trailer_dict
969            .get::<Array<'_>>(ID)
970            .and_then(|a| a.flex_iter().next::<object::String<'_>>())
971        {
972            id.get().to_vec()
973        } else {
974            // Assume an empty ID entry.
975            vec![]
976        };
977
978        get(&encryption_dict, &id, password).map_err(XRefError::Encryption)
979    } else {
980        Ok(Decryptor::None)
981    }
982}
983
984struct ObjectStream<'a> {
985    data: &'a [u8],
986    ctx: ReaderContext<'a>,
987    offsets: Vec<(u32, usize)>,
988}
989
990impl<'a> ObjectStream<'a> {
991    fn new(inner: Stream<'_>, data: &'a [u8], ctx: &ReaderContext<'a>) -> Option<Self> {
992        let num_objects = inner.dict().get::<usize>(N)?;
993        let first_offset = inner.dict().get::<usize>(FIRST)?;
994
995        let mut r = Reader::new(data);
996
997        let mut offsets = vec![];
998
999        for _ in 0..num_objects {
1000            r.skip_white_spaces_and_comments();
1001            // Skip object number
1002            let obj_num = r.read_without_context::<u32>()?;
1003            r.skip_white_spaces_and_comments();
1004            let relative_offset = r.read_without_context::<usize>()?;
1005            offsets.push((obj_num, first_offset + relative_offset));
1006        }
1007
1008        let mut ctx = ctx.clone();
1009        ctx.in_object_stream = true;
1010
1011        Some(Self { data, ctx, offsets })
1012    }
1013
1014    fn get<T>(&self, index: u32) -> Option<T>
1015    where
1016        T: ObjectLike<'a>,
1017    {
1018        let offset = self.offsets.get(index as usize)?.1;
1019        let mut r = Reader::new(self.data);
1020        r.jump(offset);
1021        r.skip_white_spaces_and_comments();
1022
1023        r.read_with_context::<T>(&self.ctx)
1024    }
1025}
1026
1027fn parse_metadata(info_dict: &Dict<'_>) -> Metadata {
1028    Metadata {
1029        creation_date: info_dict
1030            .get::<object::String<'_>>(CREATION_DATE)
1031            .and_then(|c| DateTime::from_bytes(c.get().as_ref())),
1032        modification_date: info_dict
1033            .get::<object::String<'_>>(MOD_DATE)
1034            .and_then(|c| DateTime::from_bytes(c.get().as_ref())),
1035        title: info_dict
1036            .get::<object::String<'_>>(TITLE)
1037            .map(|t| t.get().to_vec()),
1038        author: info_dict
1039            .get::<object::String<'_>>(AUTHOR)
1040            .map(|t| t.get().to_vec()),
1041        subject: info_dict
1042            .get::<object::String<'_>>(SUBJECT)
1043            .map(|t| t.get().to_vec()),
1044        keywords: info_dict
1045            .get::<object::String<'_>>(KEYWORDS)
1046            .map(|t| t.get().to_vec()),
1047        creator: info_dict
1048            .get::<object::String<'_>>(CREATOR)
1049            .map(|t| t.get().to_vec()),
1050        producer: info_dict
1051            .get::<object::String<'_>>(PRODUCER)
1052            .map(|t| t.get().to_vec()),
1053    }
1054}