hayro_syntax/
xref.rs

1//! Reading and querying the xref table of a PDF file.
2
3use crate::PdfData;
4use crate::data::Data;
5use crate::object::ObjectIdentifier;
6use crate::object::array::Array;
7use crate::object::dict::Dict;
8use crate::object::dict::keys::{FIRST, INDEX, N, PAGES, PREV, ROOT, SIZE, W, XREF_STM};
9use crate::object::indirect::IndirectObject;
10use crate::object::stream::Stream;
11use crate::object::{Object, ObjectLike};
12use crate::reader::{Readable, Reader};
13use log::{error, warn};
14use rustc_hash::FxHashMap;
15use std::cmp::max;
16use std::iter;
17use std::sync::{Arc, RwLock};
18
/// Number of bytes occupied by a single entry of a classic (non-stream) xref table.
pub(crate) const XREF_ENTRY_LEN: usize = 20;
20
21/// Parse the "root" xref from the PDF.
22pub(crate) fn root_xref(data: PdfData) -> Option<XRef> {
23    let mut xref_map = FxHashMap::default();
24    let xref_pos = find_last_xref_pos(data.as_ref().as_ref())?;
25    let trailer = populate_xref_impl(data.as_ref().as_ref(), xref_pos, &mut xref_map)?;
26
27    XRef::new(data.clone(), xref_map, &trailer, false)
28}
29
30/// Try to manually parse the PDF to build an xref table and trailer dictionary.
31pub(crate) fn fallback(data: PdfData) -> Option<XRef> {
32    warn!("xref table was invalid, trying to manually build xref table");
33    let (xref_map, trailer_dict) = fallback_xref_map(data.as_ref().as_ref());
34
35    if let Some(trailer_dict_data) = trailer_dict {
36        warn!("rebuild xref table with {} entries", xref_map.len());
37
38        XRef::new(data.clone(), xref_map, trailer_dict_data, true)
39    } else {
40        warn!("couldn't find trailer dictionary, failed to rebuild xref table");
41
42        None
43    }
44}
45
46fn fallback_xref_map(data: &[u8]) -> (XrefMap, Option<&[u8]>) {
47    let mut xref_map = FxHashMap::default();
48    let mut trailer_dict = None;
49
50    let mut r = Reader::new(data);
51
52    loop {
53        let cur_pos = r.offset();
54
55        if let Some(obj) = r.read_without_xref::<ObjectIdentifier>() {
56            xref_map.insert(obj, EntryType::Normal(cur_pos));
57        } else if let Some(dict) = r.read::<false, Dict>(XRef::dummy()) {
58            if dict.contains_key(SIZE) && dict.contains_key(ROOT) {
59                trailer_dict = Some(dict);
60            }
61        } else {
62            r.read_byte();
63        }
64
65        if r.at_end() {
66            break;
67        }
68    }
69
70    (xref_map, trailer_dict.map(|d| d.data()))
71}
72
73static DUMMY_XREF: &'static XRef = &XRef(Inner::Dummy);
74
75/// An xref table.
76#[derive(Debug)]
77pub struct XRef(Inner);
78
79impl XRef {
80    fn new(
81        data: PdfData,
82        xref_map: XrefMap,
83        trailer_dict_data: &[u8],
84        repaired: bool,
85    ) -> Option<Self> {
86        // This is a bit hacky, but the problem is we can't read the resolved trailer dictionary
87        // before we actually created the xref struct. So we first create it using dummy data
88        // and then populate the data.
89        let trailer_data = TrailerData::dummy();
90
91        let mut xref = Self(Inner::Some {
92            data: Data::new(data),
93            map: Arc::new(RwLock::new(SomeRepr { xref_map, repaired })),
94            trailer_data,
95        });
96
97        let mut r = Reader::new(&trailer_dict_data);
98        let trailer_dict = r.read_with_xref::<Dict>(&xref)?;
99        let root = trailer_dict.get::<Dict>(ROOT)?;
100        let pages_ref = root.get_ref(PAGES)?;
101
102        let td = TrailerData {
103            pages_ref: pages_ref.into(),
104        };
105
106        match &mut xref.0 {
107            Inner::Dummy => unreachable!(),
108            Inner::Some { trailer_data, .. } => {
109                *trailer_data = td;
110            }
111        }
112
113        Some(xref)
114    }
115
116    pub(crate) fn dummy() -> &'static XRef {
117        DUMMY_XREF
118    }
119
120    pub(crate) fn len(&self) -> usize {
121        match &self.0 {
122            Inner::Dummy => 0,
123            Inner::Some { map, .. } => map.read().unwrap().xref_map.len(),
124        }
125    }
126
127    pub(crate) fn trailer_data(&self) -> &TrailerData {
128        match &self.0 {
129            Inner::Dummy => unreachable!(),
130            Inner::Some { trailer_data, .. } => trailer_data,
131        }
132    }
133
134    pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
135        match &self.0 {
136            Inner::Dummy => unimplemented!(),
137            Inner::Some { map, .. } => iter::from_fn(move || {
138                let locked = map.read().unwrap();
139                let mut iter = locked.xref_map.keys();
140
141                iter.next().and_then(|k| self.get(*k))
142            }),
143        }
144    }
145
146    pub(crate) fn repair(&self) {
147        let Inner::Some { map, data, .. } = &self.0 else {
148            unreachable!();
149        };
150
151        let mut locked = map.try_write().unwrap();
152        assert!(!locked.repaired);
153
154        let (xref_map, _) = fallback_xref_map(data.get());
155        locked.xref_map = xref_map;
156        locked.repaired = true;
157    }
158
159    pub(crate) fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
160    where
161        T: ObjectLike<'a>,
162    {
163        let Inner::Some { map, data, .. } = &self.0 else {
164            return None;
165        };
166
167        let locked = map.try_read().unwrap();
168        let repaired = locked.repaired;
169
170        let mut r = Reader::new(data.get());
171
172        let entry = *locked.xref_map.get(&id).or_else(|| {
173            // An indirect reference to an undefined object shall not be considered an error by a PDF processor; it
174            // shall be treated as a reference to the null object.
175            None
176        })?;
177        drop(locked);
178
179        match entry {
180            EntryType::Normal(offset) => {
181                r.jump(offset);
182
183                if let Some(object) = r.read_with_xref::<IndirectObject<T>>(self) {
184                    if object.id() == &id {
185                        return Some(object.get());
186                    }
187                } else {
188                    // There is a valid object at the offset, it's just not of the type the caller
189                    // expected, which is fine.
190                    if r.skip_non_plain::<IndirectObject<Object>>().is_some() {
191                        return None;
192                    }
193                };
194
195                // The xref table is broken, try to repair if not already repaired.
196                if repaired {
197                    error!(
198                        "attempt was made at repairing xref, but object {:?} still couldn't be read",
199                        id
200                    );
201
202                    None
203                } else {
204                    warn!("broken xref, attempting to repair");
205
206                    self.repair();
207
208                    // Now try reading again.
209                    self.get::<T>(id)
210                }
211            }
212            EntryType::ObjStream(id, index) => {
213                // Generation number is implicitly 0.
214                let id = ObjectIdentifier::new(id as i32, 0);
215
216                let stream = self.get::<Stream>(id)?;
217                let data = data.get_with(id, self)?;
218                let object_stream = ObjectStream::new(stream, data, self)?;
219                object_stream.get(index)
220            }
221        }
222    }
223}
224
225pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
226    let mut finder = Reader::new(data);
227    let mut pos = finder.len() - 1;
228    finder.jump(pos);
229
230    let needle = b"startxref";
231
232    loop {
233        if finder.forward_tag(needle).is_some() {
234            finder.skip_white_spaces_and_comments();
235
236            let offset = finder.read_without_xref::<i32>()?.try_into().ok()?;
237
238            return Some(offset);
239        }
240
241        pos = pos.checked_sub(1)?;
242        finder.jump(pos);
243    }
244}
245
/// A type of xref entry.
///
/// Describes where the object belonging to an entry can be found.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum EntryType {
    /// An indirect object that is at a specific offset in the original data.
    Normal(usize),
    /// An indirect object that is part of an object stream. First number indicates the object
    /// number of the _object stream_ (the generation number is always 0), the second number indicates
    /// the index in the object stream.
    ObjStream(u32, u32),
}
256
/// Lookup table from an object identifier to the location of that object.
type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
258
/// Representation of a proper xref table.
#[derive(Debug)]
struct SomeRepr {
    /// The entries of the table.
    xref_map: XrefMap,
    /// Whether `xref_map` was produced by the manual fallback scan rather than
    /// read from the file's own xref data.
    repaired: bool,
}
265
/// Information extracted from the trailer dictionary of a PDF.
#[derive(Debug, Copy, Clone)]
pub(crate) struct TrailerData {
    /// Reference taken from the `PAGES` entry of the trailer's `ROOT` dictionary.
    pub pages_ref: ObjectIdentifier,
}

impl TrailerData {
    /// Create placeholder trailer data whose `pages_ref` is the identifier `(0, 0)`.
    pub fn dummy() -> Self {
        Self {
            pages_ref: ObjectIdentifier::new(0, 0),
        }
    }
}
278
/// The two possible states behind an `XRef`.
#[derive(Debug)]
enum Inner {
    /// A dummy xref table that doesn't have any entries.
    Dummy,
    /// A proper xref table.
    Some {
        /// The file data that entries are resolved against.
        data: Data,
        /// The entry map, behind a lock so that it can be replaced during a repair.
        map: Arc<RwLock<SomeRepr>>,
        /// Data read from the trailer dictionary.
        trailer_data: TrailerData,
    },
}
290
/// A single parsed entry of a classic xref table.
#[derive(Debug)]
struct XRefEntry {
    /// Byte offset stored in the first (10-digit) field of the entry.
    offset: usize,
    /// Generation number stored in the second (5-digit) field of the entry.
    gen_number: i32,
    /// Whether the entry is marked with `n` (in use).
    used: bool,
}

impl XRefEntry {
    /// Parse one xref table entry from `data`.
    ///
    /// Bytes `0..10` hold the offset, bytes `11..16` the generation number and
    /// byte `17` the entry kind; anything other than `n` counts as unused.
    ///
    /// Returns `None` if `data` is too short, a numeric field contains a
    /// non-digit, or a value does not fit into its target type.
    pub(crate) fn read(data: &[u8]) -> Option<XRefEntry> {
        /// Parse a run of ASCII digits, rejecting non-digits and overflow.
        #[inline]
        fn parse_decimal(digits: &[u8]) -> Option<u64> {
            digits.iter().try_fold(0u64, |accum, &byte| match byte {
                b'0'..=b'9' => accum
                    .checked_mul(10)?
                    .checked_add(u64::from(byte - b'0')),
                _ => None,
            })
        }

        // A 10-digit offset can exceed `u32::MAX`, so parse wide and narrow afterwards.
        let offset = usize::try_from(parse_decimal(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_decimal(data.get(11..16)?)?).ok()?;

        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
328
329fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
330    let mut reader = Reader::new(data);
331    reader.jump(pos);
332
333    let mut r2 = reader.clone();
334    if reader
335        .clone()
336        .read_without_xref::<ObjectIdentifier>()
337        .is_some()
338    {
339        populate_from_xref_stream(data, &mut r2, xref_map)
340    } else {
341        populate_from_xref_table(data, &mut r2, xref_map)
342    }
343}
344
/// The header of a subsection in a classic xref table: two unsigned integers
/// separated by white space.
pub(super) struct SubsectionHeader {
    /// Object number of the first entry in the subsection.
    pub(super) start: u32,
    /// Number of entries in the subsection.
    pub(super) num_entries: u32,
}

impl Readable<'_> for SubsectionHeader {
    /// Read `start` and `num_entries`, skipping white space before, between and
    /// after the two numbers. Returns `None` if either number cannot be read.
    fn read<const PLAIN: bool>(r: &mut Reader<'_>, _: &XRef) -> Option<Self> {
        r.skip_white_spaces();
        let start = r.read_without_xref::<u32>()?;
        r.skip_white_spaces();
        let num_entries = r.read_without_xref::<u32>()?;
        r.skip_white_spaces();

        Some(Self { start, num_entries })
    }
}
361
362/// Populate the xref table, and return the trailer dict.
363fn populate_from_xref_table<'a>(
364    data: &'a [u8],
365    reader: &mut Reader<'a>,
366    insert_map: &mut XrefMap,
367) -> Option<&'a [u8]> {
368    let trailer = {
369        let mut reader = reader.clone();
370        read_xref_table_trailer(&mut reader, XRef::dummy())?
371    };
372
373    reader.skip_white_spaces();
374    reader.forward_tag(b"xref")?;
375    reader.skip_white_spaces();
376
377    let mut max_obj = 0;
378
379    if let Some(prev) = trailer.get::<i32>(PREV) {
380        // First insert the entries from any previous xref tables.
381        populate_xref_impl(data, prev as usize, insert_map)?;
382    }
383
384    // In hybrid files, entries in `XRefStm` should have higher priority, therefore we insert them
385    // after looking at `PREV`.
386    if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
387        populate_xref_impl(data, xref_stm as usize, insert_map)?;
388    }
389
390    while let Some(header) = reader.read_without_xref::<SubsectionHeader>() {
391        reader.skip_white_spaces();
392
393        let start = header.start;
394        let end = start + header.num_entries;
395
396        for obj_number in start..end {
397            max_obj = max(max_obj, obj_number);
398            let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
399            let entry = XRefEntry::read(bytes)?;
400
401            // Specification says we should ignore any object number > SIZE, but probably
402            // not important?
403            if entry.used {
404                insert_map.insert(
405                    ObjectIdentifier::new(obj_number as i32, entry.gen_number),
406                    EntryType::Normal(entry.offset),
407                );
408            }
409        }
410    }
411
412    Some(trailer.data())
413}
414
415fn populate_from_xref_stream<'a>(
416    data: &'a [u8],
417    reader: &mut Reader<'a>,
418    insert_map: &mut XrefMap,
419) -> Option<&'a [u8]> {
420    let stream = reader
421        .read_with_xref::<IndirectObject<Stream>>(XRef::dummy())?
422        .get();
423
424    if let Some(prev) = stream.dict().get::<i32>(PREV) {
425        // First insert the entries from any previous xref tables.
426        let _ = populate_xref_impl(data, prev as usize, insert_map)?;
427    }
428
429    let size = stream.dict().get::<u32>(SIZE)?;
430
431    let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
432
433    if f2_len > size_of::<u32>() as u8 {
434        error!("xref offset length is larger than the allowed limit");
435
436        return None;
437    }
438
439    // Do such files exist?
440    if f1_len != 1 {
441        warn!("first field in xref stream was longer than 1");
442    }
443
444    let xref_data = stream.decoded()?;
445    let mut xref_reader = Reader::new(xref_data.as_ref());
446
447    if let Some(arr) = stream.dict().get::<Array>(INDEX) {
448        let mut iter = arr.iter::<(u32, u32)>();
449
450        while let Some((start, num_elements)) = iter.next() {
451            xref_stream_subsection(
452                &mut xref_reader,
453                start,
454                num_elements,
455                f1_len,
456                f2_len,
457                f3_len,
458                insert_map,
459            )?;
460        }
461    } else {
462        xref_stream_subsection(
463            &mut xref_reader,
464            0,
465            size,
466            f1_len,
467            f2_len,
468            f3_len,
469            insert_map,
470        )?;
471    }
472
473    Some(stream.dict().data())
474}
475
476fn xref_stream_num<'a>(data: &[u8]) -> Option<u32> {
477    Some(match data.len() {
478        0 => return None,
479        1 => u8::from_be(data[0]) as u32,
480        2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
481        3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
482        4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
483        n => {
484            warn!("invalid xref stream number {}", n);
485
486            return None;
487        }
488    })
489}
490
491fn xref_stream_subsection<'a>(
492    xref_reader: &mut Reader<'a>,
493    start: u32,
494    num_elements: u32,
495    f1_len: u8,
496    f2_len: u8,
497    f3_len: u8,
498    insert_map: &mut XrefMap,
499) -> Option<()> {
500    for i in 0..num_elements {
501        let f_type = if f1_len == 0 {
502            1
503        } else {
504            // We assume a length of 1.
505            xref_reader.read_bytes(1)?[0]
506        };
507
508        let obj_number = start + i;
509
510        match f_type {
511            // We don't care about free objects.
512            0 => {
513                xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
514            }
515            1 => {
516                let offset = if f2_len > 0 {
517                    let data = xref_reader.read_bytes(f2_len as usize)?;
518                    xref_stream_num(data)?
519                } else {
520                    0
521                };
522
523                let gen_number = if f3_len > 0 {
524                    let data = xref_reader.read_bytes(f3_len as usize)?;
525                    xref_stream_num(data)?
526                } else {
527                    0
528                };
529
530                insert_map.insert(
531                    ObjectIdentifier::new(obj_number as i32, gen_number as i32),
532                    EntryType::Normal(offset as usize),
533                );
534            }
535            2 => {
536                let obj_stream_number = {
537                    let data = xref_reader.read_bytes(f2_len as usize)?;
538                    xref_stream_num(data)?
539                };
540                let gen_number = 0;
541                let index = if f3_len > 0 {
542                    let data = xref_reader.read_bytes(f3_len as usize)?;
543                    xref_stream_num(data)?
544                } else {
545                    0
546                };
547
548                insert_map.insert(
549                    ObjectIdentifier::new(obj_number as i32, gen_number),
550                    EntryType::ObjStream(obj_stream_number, index),
551                );
552            }
553            _ => {
554                warn!("xref has unknown field type {}", f_type);
555
556                return None;
557            }
558        }
559    }
560
561    Some(())
562}
563
564fn read_xref_table_trailer<'a>(reader: &mut Reader<'a>, xref: &'a XRef) -> Option<Dict<'a>> {
565    reader.skip_white_spaces();
566    reader.forward_tag(b"xref")?;
567    reader.skip_white_spaces();
568
569    while let Some(header) = reader.read_without_xref::<SubsectionHeader>() {
570        reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
571    }
572
573    reader.skip_white_spaces();
574    reader.forward_tag(b"trailer")?;
575    reader.skip_white_spaces();
576
577    reader.read_with_xref::<Dict>(xref)
578}
579
580struct ObjectStream<'a> {
581    data: &'a [u8],
582    xref: &'a XRef,
583    offsets: Vec<usize>,
584}
585
586impl<'a> ObjectStream<'a> {
587    pub fn new(inner: Stream<'a>, data: &'a [u8], xref: &'a XRef) -> Option<Self> {
588        let num_objects = inner.dict().get::<usize>(N)?;
589        let first_offset = inner.dict().get::<usize>(FIRST)?;
590
591        let mut r = Reader::new(data.as_ref());
592
593        let mut offsets = vec![];
594
595        for _ in 0..num_objects {
596            r.skip_white_spaces_and_comments();
597            // Skip object number
598            let _ = r.read_without_xref::<u32>()?;
599            r.skip_white_spaces_and_comments();
600            let relative_offset = r.read_without_xref::<usize>()?;
601            offsets.push(first_offset + relative_offset);
602        }
603
604        Some(Self {
605            data,
606            xref,
607            offsets,
608        })
609    }
610
611    pub fn get<T>(&self, index: u32) -> Option<T>
612    where
613        T: ObjectLike<'a>,
614    {
615        let offset = *self.offsets.get(index as usize)?;
616        let mut r = Reader::new(&self.data);
617        r.jump(offset);
618
619        r.read_with_xref::<T>(&self.xref)
620    }
621}