hayro_syntax/
xref.rs

1//! Reading and querying the xref table of a PDF file.
2
3use crate::PdfData;
4use crate::data::Data;
5use crate::object::Array;
6use crate::object::Dict;
7use crate::object::Name;
8use crate::object::ObjectIdentifier;
9use crate::object::Stream;
10use crate::object::dict::keys::{
11    ENCRYPT, FIRST, INDEX, N, PAGES, PREV, ROOT, SIZE, TYPE, VERSION, W, XREF_STM,
12};
13use crate::object::indirect::IndirectObject;
14use crate::object::{Object, ObjectLike};
15use crate::pdf::PdfVersion;
16use crate::reader::{Readable, Reader, ReaderContext};
17use log::{error, warn};
18use rustc_hash::FxHashMap;
19use std::cmp::max;
20use std::iter;
21use std::ops::Deref;
22use std::sync::{Arc, RwLock};
23
/// Number of bytes occupied by one entry of a classic (non-stream) xref table; entries are
/// read and skipped in chunks of exactly this size.
pub(crate) const XREF_ENTRY_LEN: usize = 20;
25
/// Reasons why an [`XRef`] could not be built.
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefError {
    /// The xref data, the trailer dictionary or the entries required from it could not be
    /// located or parsed.
    Unknown,
    /// The trailer dictionary has an `ENCRYPT` entry; encrypted files are not supported.
    Encrypted,
}
31
32/// Parse the "root" xref from the PDF.
33pub(crate) fn root_xref(data: PdfData) -> Result<XRef, XRefError> {
34    let mut xref_map = FxHashMap::default();
35    let xref_pos = find_last_xref_pos(data.as_ref().as_ref()).ok_or(XRefError::Unknown)?;
36    let trailer = populate_xref_impl(data.as_ref().as_ref(), xref_pos, &mut xref_map)
37        .ok_or(XRefError::Unknown)?;
38
39    XRef::new(data.clone(), xref_map, trailer, false)
40}
41
42/// Try to manually parse the PDF to build an xref table and trailer dictionary.
43pub(crate) fn fallback(data: PdfData) -> Option<XRef> {
44    warn!("xref table was invalid, trying to manually build xref table");
45    let (xref_map, trailer_dict) = fallback_xref_map(data.as_ref().as_ref());
46
47    if let Some(trailer_dict_data) = trailer_dict {
48        warn!("rebuild xref table with {} entries", xref_map.len());
49
50        XRef::new(data.clone(), xref_map, trailer_dict_data, true).ok()
51    } else {
52        warn!("couldn't find trailer dictionary, failed to rebuild xref table");
53
54        None
55    }
56}
57
58fn fallback_xref_map(data: &[u8]) -> (XrefMap, Option<&[u8]>) {
59    let mut xref_map = FxHashMap::default();
60    let mut trailer_dict = None;
61
62    let mut r = Reader::new(data);
63
64    let mut dummy_ctx = ReaderContext::dummy();
65    let mut last_obj_num = None;
66
67    loop {
68        let cur_pos = r.offset();
69
70        let mut old_r = r.clone();
71
72        if let Some(obj_id) = r.read::<ObjectIdentifier>(dummy_ctx) {
73            xref_map.insert(obj_id, EntryType::Normal(cur_pos));
74            last_obj_num = Some(obj_id);
75            dummy_ctx.obj_number = Some(obj_id);
76        } else if let Some(dict) = r.read::<Dict>(dummy_ctx) {
77            if dict.contains_key(SIZE) && dict.contains_key(ROOT) {
78                trailer_dict = Some(dict.clone());
79            }
80
81            if let Some(stream) = old_r.read::<Stream>(dummy_ctx) {
82                if stream.dict().get::<Name>(TYPE).as_deref() == Some(b"ObjStm")
83                    && let Some(data) = stream.decoded().ok()
84                    && let Some(last_obj_num) = last_obj_num
85                {
86                    if let Some(obj_stream) = ObjectStream::new(stream, &data, dummy_ctx) {
87                        for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
88                            let id = ObjectIdentifier::new(*obj_num as i32, 0);
89                            xref_map.insert(
90                                id,
91                                EntryType::ObjStream(last_obj_num.obj_num as u32, idx as u32),
92                            );
93                        }
94                    }
95                }
96            }
97        } else {
98            r.read_byte();
99        }
100
101        if r.at_end() {
102            break;
103        }
104    }
105
106    (xref_map, trailer_dict.map(|d| d.data()))
107}
108
/// Shared placeholder instance returned by [`XRef::dummy`]; it wraps `Inner::Dummy`.
static DUMMY_XREF: &XRef = &XRef(Inner::Dummy);
110
/// An xref table.
///
/// The populated variant keeps its state behind an `Arc`, so cloning an `XRef` only clones
/// that handle.
#[derive(Debug, Clone)]
pub struct XRef(Inner);
114
115impl XRef {
116    fn new(
117        data: PdfData,
118        xref_map: XrefMap,
119        trailer_dict_data: &[u8],
120        repaired: bool,
121    ) -> Result<Self, XRefError> {
122        // This is a bit hacky, but the problem is we can't read the resolved trailer dictionary
123        // before we actually created the xref struct. So we first create it using dummy data
124        // and then populate the data.
125        let trailer_data = TrailerData::dummy();
126
127        let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
128            data: Arc::new(Data::new(data)),
129            map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
130            trailer_data,
131        })));
132
133        let mut r = Reader::new(trailer_dict_data);
134        let trailer_dict = r
135            .read_with_context::<Dict>(ReaderContext::new(&xref, false))
136            .ok_or(XRefError::Unknown)?;
137
138        if trailer_dict.get::<Dict>(ENCRYPT).is_some() {
139            warn!("encrypted PDF files are not yet supported");
140
141            return Err(XRefError::Encrypted);
142        }
143
144        let root = trailer_dict.get::<Dict>(ROOT).ok_or(XRefError::Unknown)?;
145        let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
146        let version = root
147            .get::<Name>(VERSION)
148            .and_then(|v| PdfVersion::from_bytes(v.deref()));
149
150        let td = TrailerData {
151            pages_ref: pages_ref.into(),
152            version,
153        };
154
155        match &mut xref.0 {
156            Inner::Dummy => unreachable!(),
157            Inner::Some(r) => {
158                Arc::make_mut(r).trailer_data = td;
159            }
160        }
161
162        Ok(xref)
163    }
164
165    fn is_repaired(&self) -> bool {
166        match &self.0 {
167            Inner::Dummy => false,
168            Inner::Some(r) => {
169                let locked = r.map.read().unwrap();
170                locked.repaired
171            }
172        }
173    }
174
    /// Return the shared dummy xref table, which has no entries.
    pub(crate) fn dummy() -> &'static XRef {
        DUMMY_XREF
    }
178
179    pub(crate) fn len(&self) -> usize {
180        match &self.0 {
181            Inner::Dummy => 0,
182            Inner::Some(r) => r.map.read().unwrap().xref_map.len(),
183        }
184    }
185
186    pub(crate) fn trailer_data(&self) -> &TrailerData {
187        match &self.0 {
188            Inner::Dummy => unreachable!(),
189            Inner::Some(r) => &r.trailer_data,
190        }
191    }
192
193    pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
194        match &self.0 {
195            Inner::Dummy => unimplemented!(),
196            Inner::Some(r) => iter::from_fn(move || {
197                let locked = r.map.read().unwrap();
198                let mut iter = locked.xref_map.keys();
199
200                iter.next().and_then(|k| self.get(*k))
201            }),
202        }
203    }
204
205    pub(crate) fn repair(&self) {
206        let Inner::Some(r) = &self.0 else {
207            unreachable!();
208        };
209
210        let mut locked = r.map.try_write().unwrap();
211        assert!(!locked.repaired);
212
213        let (xref_map, _) = fallback_xref_map(r.data.get());
214        locked.xref_map = xref_map;
215        locked.repaired = true;
216    }
217
218    /// Return the object with the given identifier.
219    #[allow(private_bounds)]
220    pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
221    where
222        T: ObjectLike<'a>,
223    {
224        let Inner::Some(repr) = &self.0 else {
225            return None;
226        };
227
228        let locked = repr.map.try_read().unwrap();
229
230        let mut r = Reader::new(repr.data.get());
231
232        let entry = *locked.xref_map.get(&id).or({
233            // An indirect reference to an undefined object shall not be considered an error by a PDF processor; it
234            // shall be treated as a reference to the null object.
235            None
236        })?;
237        drop(locked);
238
239        match entry {
240            EntryType::Normal(offset) => {
241                r.jump(offset);
242
243                if let Some(object) =
244                    r.read_with_context::<IndirectObject<T>>(ReaderContext::new(self, false))
245                {
246                    if object.id() == &id {
247                        return Some(object.get());
248                    }
249                } else {
250                    // There is a valid object at the offset, it's just not of the type the caller
251                    // expected, which is fine.
252                    if r.skip_not_in_content_stream::<IndirectObject<Object>>()
253                        .is_some()
254                    {
255                        return None;
256                    }
257                };
258
259                // The xref table is broken, try to repair if not already repaired.
260                if self.is_repaired() {
261                    error!(
262                        "attempt was made at repairing xref, but object {id:?} still couldn't be read"
263                    );
264
265                    None
266                } else {
267                    warn!("broken xref, attempting to repair");
268
269                    self.repair();
270
271                    // Now try reading again.
272                    self.get::<T>(id)
273                }
274            }
275            EntryType::ObjStream(obj_stram_gen_num, index) => {
276                // Generation number is implicitly 0.
277                let obj_stream_id = ObjectIdentifier::new(obj_stram_gen_num as i32, 0);
278
279                let stream = self.get::<Stream>(obj_stream_id)?;
280                let data = repr.data.get_with(obj_stream_id, self)?;
281                let object_stream =
282                    ObjectStream::new(stream, data, ReaderContext::new(self, false))?;
283                object_stream.get(index)
284            }
285        }
286    }
287}
288
289pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
290    let mut finder = Reader::new(data);
291    let mut pos = finder.len().checked_sub(1)?;
292    finder.jump(pos);
293
294    let needle = b"startxref";
295
296    loop {
297        if finder.forward_tag(needle).is_some() {
298            finder.skip_white_spaces_and_comments();
299
300            let offset = finder.read_without_context::<i32>()?.try_into().ok()?;
301
302            return Some(offset);
303        }
304
305        pos = pos.checked_sub(1)?;
306        finder.jump(pos);
307    }
308}
309
/// Describes where the indirect object belonging to an xref entry is stored.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum EntryType {
    /// An indirect object that is at a specific byte offset in the original data.
    Normal(usize),
    /// An indirect object that is part of an object stream. The first number is the object
    /// number of the _object stream_ (its generation number is always 0), the second number is
    /// the index of the object within that object stream.
    ObjStream(u32, u32),
}
320
/// Maps object identifiers to the location of the corresponding object.
type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
322
/// Representation of a proper xref table.
#[derive(Debug)]
struct MapRepr {
    /// All known entries.
    xref_map: XrefMap,
    /// `true` if `xref_map` was built by scanning the file (fallback/repair) instead of being
    /// read from the xref data of the file.
    repaired: bool,
}
329
/// Values extracted from the trailer dictionary when the xref table is created.
#[derive(Debug, Copy, Clone)]
pub(crate) struct TrailerData {
    /// The `PAGES` reference found in the `ROOT` dictionary of the trailer.
    pub pages_ref: ObjectIdentifier,
    /// The `VERSION` entry of the `ROOT` dictionary, if present and recognised.
    pub version: Option<PdfVersion>,
}

impl TrailerData {
    /// Placeholder value (object `0 0`, no version) used until the real trailer has been read.
    pub fn dummy() -> Self {
        Self {
            pages_ref: ObjectIdentifier::new(0, 0),
            version: None,
        }
    }
}
344
/// State of a populated xref table.
#[derive(Debug, Clone)]
struct SomeRepr {
    /// The file data the entries point into.
    data: Arc<Data>,
    /// The entry map; behind a lock because `XRef::repair` replaces it through `&self`.
    map: Arc<RwLock<MapRepr>>,
    /// Values read from the trailer dictionary.
    trailer_data: TrailerData,
}
351
/// The two possible representations of an [`XRef`].
#[derive(Debug, Clone)]
enum Inner {
    /// A dummy xref table that doesn't have any entries.
    Dummy,
    /// A proper xref table.
    Some(Arc<SomeRepr>),
}
359
/// A single parsed entry of a classic xref table.
#[derive(Debug)]
struct XRefEntry {
    /// Value of the first, 10-digit field (the byte offset for entries that are in use).
    offset: usize,
    /// Value of the second, 5-digit field.
    gen_number: i32,
    /// `true` if the entry is marked with `n`, `false` for anything else.
    used: bool,
}

impl XRefEntry {
    /// Parse one xref table entry of the form `nnnnnnnnnn ggggg n`.
    ///
    /// Bytes 0..10 hold the first number, bytes 11..16 the second one and byte 17 the marker.
    /// Returns `None` if `data` is too short, a number field contains a non-digit byte, or a
    /// value does not fit into the target integer type.
    pub(crate) fn read(data: &[u8]) -> Option<XRefEntry> {
        /// Parse a field that consists of ASCII digits only.
        ///
        /// The value is accumulated in a `u64` with checked arithmetic: a 10-digit field can
        /// exceed `u32::MAX`, which must neither panic nor wrap around.
        fn parse_digits(field: &[u8]) -> Option<u64> {
            field.iter().try_fold(0u64, |acc, &byte| match byte {
                b'0'..=b'9' => acc.checked_mul(10)?.checked_add(u64::from(byte - b'0')),
                _ => None,
            })
        }

        let offset = usize::try_from(parse_digits(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_digits(data.get(11..16)?)?).ok()?;
        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
397
398fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
399    let mut reader = Reader::new(data);
400    reader.jump(pos);
401
402    let mut r2 = reader.clone();
403    if reader
404        .clone()
405        .read_without_context::<ObjectIdentifier>()
406        .is_some()
407    {
408        populate_from_xref_stream(data, &mut r2, xref_map)
409    } else {
410        populate_from_xref_table(data, &mut r2, xref_map)
411    }
412}
413
414pub(super) struct SubsectionHeader {
415    pub(super) start: u32,
416    pub(super) num_entries: u32,
417}
418
419impl Readable<'_> for SubsectionHeader {
420    fn read(r: &mut Reader<'_>, _: ReaderContext) -> Option<Self> {
421        r.skip_white_spaces();
422        let start = r.read_without_context::<u32>()?;
423        r.skip_white_spaces();
424        let num_entries = r.read_without_context::<u32>()?;
425        r.skip_white_spaces();
426
427        Some(Self { start, num_entries })
428    }
429}
430
431/// Populate the xref table, and return the trailer dict.
432fn populate_from_xref_table<'a>(
433    data: &'a [u8],
434    reader: &mut Reader<'a>,
435    insert_map: &mut XrefMap,
436) -> Option<&'a [u8]> {
437    let trailer = {
438        let mut reader = reader.clone();
439        read_xref_table_trailer(&mut reader, ReaderContext::dummy())?
440    };
441
442    reader.skip_white_spaces();
443    reader.forward_tag(b"xref")?;
444    reader.skip_white_spaces();
445
446    let mut max_obj = 0;
447
448    if let Some(prev) = trailer.get::<i32>(PREV) {
449        // First insert the entries from any previous xref tables.
450        populate_xref_impl(data, prev as usize, insert_map)?;
451    }
452
453    // In hybrid files, entries in `XRefStm` should have higher priority, therefore we insert them
454    // after looking at `PREV`.
455    if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
456        populate_xref_impl(data, xref_stm as usize, insert_map)?;
457    }
458
459    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
460        reader.skip_white_spaces();
461
462        let start = header.start;
463        let end = start + header.num_entries;
464
465        for obj_number in start..end {
466            max_obj = max(max_obj, obj_number);
467            let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
468            let entry = XRefEntry::read(bytes)?;
469
470            // Specification says we should ignore any object number > SIZE, but probably
471            // not important?
472            if entry.used {
473                insert_map.insert(
474                    ObjectIdentifier::new(obj_number as i32, entry.gen_number),
475                    EntryType::Normal(entry.offset),
476                );
477            }
478        }
479    }
480
481    Some(trailer.data())
482}
483
484fn populate_from_xref_stream<'a>(
485    data: &'a [u8],
486    reader: &mut Reader<'a>,
487    insert_map: &mut XrefMap,
488) -> Option<&'a [u8]> {
489    let stream = reader
490        .read_with_context::<IndirectObject<Stream>>(ReaderContext::dummy())?
491        .get();
492
493    if let Some(prev) = stream.dict().get::<i32>(PREV) {
494        // First insert the entries from any previous xref tables.
495        let _ = populate_xref_impl(data, prev as usize, insert_map)?;
496    }
497
498    let size = stream.dict().get::<u32>(SIZE)?;
499
500    let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
501
502    if f2_len > size_of::<u64>() as u8 {
503        error!("xref offset length is larger than the allowed limit");
504
505        return None;
506    }
507
508    // Do such files exist?
509    if f1_len != 1 {
510        warn!("first field in xref stream was longer than 1");
511    }
512
513    let xref_data = stream.decoded().ok()?;
514    let mut xref_reader = Reader::new(xref_data.as_ref());
515
516    if let Some(arr) = stream.dict().get::<Array>(INDEX) {
517        let iter = arr.iter::<(u32, u32)>();
518
519        for (start, num_elements) in iter {
520            xref_stream_subsection(
521                &mut xref_reader,
522                start,
523                num_elements,
524                f1_len,
525                f2_len,
526                f3_len,
527                insert_map,
528            )?;
529        }
530    } else {
531        xref_stream_subsection(
532            &mut xref_reader,
533            0,
534            size,
535            f1_len,
536            f2_len,
537            f3_len,
538            insert_map,
539        )?;
540    }
541
542    Some(stream.dict().data())
543}
544
/// Decode a big-endian unsigned integer field of an xref stream entry.
///
/// Fields of 1 to 8 bytes are accepted as long as the value fits into a `u32`. Returns `None`
/// for an empty field, for a field longer than 8 bytes, and for values above `u32::MAX`.
fn xref_stream_num(data: &[u8]) -> Option<u32> {
    match data.len() {
        0 => None,
        1..=8 => {
            // At most 8 bytes are folded, so the accumulator cannot overflow.
            let value = data
                .iter()
                .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte));

            match u32::try_from(value) {
                Ok(num) => Some(num),
                Err(_) => {
                    warn!("xref stream number is too large");

                    None
                }
            }
        }
        n => {
            warn!("invalid xref stream number {n}");

            None
        }
    }
}
568
569fn xref_stream_subsection<'a>(
570    xref_reader: &mut Reader<'a>,
571    start: u32,
572    num_elements: u32,
573    f1_len: u8,
574    f2_len: u8,
575    f3_len: u8,
576    insert_map: &mut XrefMap,
577) -> Option<()> {
578    for i in 0..num_elements {
579        let f_type = if f1_len == 0 {
580            1
581        } else {
582            // We assume a length of 1.
583            xref_reader.read_bytes(1)?[0]
584        };
585
586        let obj_number = start + i;
587
588        match f_type {
589            // We don't care about free objects.
590            0 => {
591                xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
592            }
593            1 => {
594                let offset = if f2_len > 0 {
595                    let data = xref_reader.read_bytes(f2_len as usize)?;
596                    xref_stream_num(data)?
597                } else {
598                    0
599                };
600
601                let gen_number = if f3_len > 0 {
602                    let data = xref_reader.read_bytes(f3_len as usize)?;
603                    xref_stream_num(data)?
604                } else {
605                    0
606                };
607
608                insert_map.insert(
609                    ObjectIdentifier::new(obj_number as i32, gen_number as i32),
610                    EntryType::Normal(offset as usize),
611                );
612            }
613            2 => {
614                let obj_stream_number = {
615                    let data = xref_reader.read_bytes(f2_len as usize)?;
616                    xref_stream_num(data)?
617                };
618                let gen_number = 0;
619                let index = if f3_len > 0 {
620                    let data = xref_reader.read_bytes(f3_len as usize)?;
621                    xref_stream_num(data)?
622                } else {
623                    0
624                };
625
626                insert_map.insert(
627                    ObjectIdentifier::new(obj_number as i32, gen_number),
628                    EntryType::ObjStream(obj_stream_number, index),
629                );
630            }
631            _ => {
632                warn!("xref has unknown field type {f_type}");
633
634                return None;
635            }
636        }
637    }
638
639    Some(())
640}
641
642fn read_xref_table_trailer<'a>(
643    reader: &mut Reader<'a>,
644    ctx: ReaderContext<'a>,
645) -> Option<Dict<'a>> {
646    reader.skip_white_spaces();
647    reader.forward_tag(b"xref")?;
648    reader.skip_white_spaces();
649
650    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
651        reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
652    }
653
654    reader.skip_white_spaces();
655    reader.forward_tag(b"trailer")?;
656    reader.skip_white_spaces();
657
658    reader.read_with_context::<Dict>(ctx)
659}
660
661struct ObjectStream<'a> {
662    data: &'a [u8],
663    ctx: ReaderContext<'a>,
664    offsets: Vec<(u32, usize)>,
665}
666
667impl<'a> ObjectStream<'a> {
668    fn new(inner: Stream<'a>, data: &'a [u8], ctx: ReaderContext<'a>) -> Option<Self> {
669        let num_objects = inner.dict().get::<usize>(N)?;
670        let first_offset = inner.dict().get::<usize>(FIRST)?;
671
672        let mut r = Reader::new(data);
673
674        let mut offsets = vec![];
675
676        for _ in 0..num_objects {
677            r.skip_white_spaces_and_comments();
678            // Skip object number
679            let obj_num = r.read_without_context::<u32>()?;
680            r.skip_white_spaces_and_comments();
681            let relative_offset = r.read_without_context::<usize>()?;
682            offsets.push((obj_num, first_offset + relative_offset));
683        }
684
685        Some(Self { data, ctx, offsets })
686    }
687
688    fn get<T>(&self, index: u32) -> Option<T>
689    where
690        T: ObjectLike<'a>,
691    {
692        let offset = self.offsets.get(index as usize)?.1;
693        let mut r = Reader::new(self.data);
694        r.jump(offset);
695        r.skip_white_spaces_and_comments();
696
697        r.read_with_context::<T>(self.ctx)
698    }
699}