hayro_syntax/
xref.rs

1//! Reading and querying the xref table of a PDF file.
2
3use crate::PdfData;
4use crate::data::Data;
5use crate::object::Array;
6use crate::object::Dict;
7use crate::object::Name;
8use crate::object::ObjectIdentifier;
9use crate::object::Stream;
10use crate::object::dict::keys::{
11    ENCRYPT, FIRST, INDEX, N, PAGES, PREV, ROOT, SIZE, TYPE, VERSION, W, XREF_STM,
12};
13use crate::object::indirect::IndirectObject;
14use crate::object::{Object, ObjectLike};
15use crate::pdf::PdfVersion;
16use crate::reader::{Readable, Reader, ReaderContext};
17use log::{error, warn};
18use rustc_hash::FxHashMap;
19use std::cmp::max;
20use std::iter;
21use std::ops::Deref;
22use std::sync::{Arc, RwLock};
23
/// Number of bytes occupied by a single entry of a classic xref table.
pub(crate) const XREF_ENTRY_LEN: usize = 20;
25
/// Reasons why building an [`XRef`] can fail.
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefError {
    /// The xref position, the xref data or the trailer/root dictionary could not be read.
    Unknown,
    /// The trailer dictionary contains an `Encrypt` dictionary.
    Encrypted,
}
31
32/// Parse the "root" xref from the PDF.
33pub(crate) fn root_xref(data: PdfData) -> Result<XRef, XRefError> {
34    let mut xref_map = FxHashMap::default();
35    let xref_pos = find_last_xref_pos(data.as_ref().as_ref()).ok_or(XRefError::Unknown)?;
36    let trailer = populate_xref_impl(data.as_ref().as_ref(), xref_pos, &mut xref_map)
37        .ok_or(XRefError::Unknown)?;
38
39    XRef::new(data.clone(), xref_map, trailer, false)
40}
41
42/// Try to manually parse the PDF to build an xref table and trailer dictionary.
43pub(crate) fn fallback(data: PdfData) -> Option<XRef> {
44    warn!("xref table was invalid, trying to manually build xref table");
45    let (xref_map, trailer_dict) = fallback_xref_map(data.as_ref().as_ref());
46
47    if let Some(trailer_dict_data) = trailer_dict {
48        warn!("rebuild xref table with {} entries", xref_map.len());
49
50        XRef::new(data.clone(), xref_map, trailer_dict_data, true).ok()
51    } else {
52        warn!("couldn't find trailer dictionary, failed to rebuild xref table");
53
54        None
55    }
56}
57
58fn fallback_xref_map(data: &[u8]) -> (XrefMap, Option<&[u8]>) {
59    let mut xref_map = FxHashMap::default();
60    let mut trailer_dict = None;
61
62    let mut r = Reader::new(data);
63
64    let dummy_ctx = ReaderContext::dummy();
65    let mut last_obj_num = None;
66
67    loop {
68        let cur_pos = r.offset();
69
70        let mut old_r = r.clone();
71
72        if let Some(obj_id) = r.read::<ObjectIdentifier>(dummy_ctx) {
73            xref_map.insert(obj_id, EntryType::Normal(cur_pos));
74            last_obj_num = Some(obj_id);
75        } else if let Some(dict) = r.read::<Dict>(dummy_ctx) {
76            if dict.contains_key(SIZE) && dict.contains_key(ROOT) {
77                trailer_dict = Some(dict.clone());
78            }
79
80            if let Some(stream) = old_r.read::<Stream>(dummy_ctx) {
81                if stream.dict().get::<Name>(TYPE).as_deref() == Some(b"ObjStm")
82                    && let Some(data) = stream.decoded().ok()
83                    && let Some(last_obj_num) = last_obj_num
84                {
85                    if let Some(obj_stream) = ObjectStream::new(stream, &data, dummy_ctx) {
86                        for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
87                            let id = ObjectIdentifier::new(*obj_num as i32, 0);
88                            xref_map.insert(
89                                id,
90                                EntryType::ObjStream(last_obj_num.obj_num as u32, idx as u32),
91                            );
92                        }
93                    }
94                }
95            }
96        } else {
97            r.read_byte();
98        }
99
100        if r.at_end() {
101            break;
102        }
103    }
104
105    (xref_map, trailer_dict.map(|d| d.data()))
106}
107
/// Shared entry-less xref table, handed out by [`XRef::dummy`].
static DUMMY_XREF: &XRef = &XRef(Inner::Dummy);
109
/// An xref table.
///
/// Either a dummy table without entries or a table backed by the PDF data, see [`Inner`].
#[derive(Debug, Clone)]
pub struct XRef(Inner);
113
114impl XRef {
115    fn new(
116        data: PdfData,
117        xref_map: XrefMap,
118        trailer_dict_data: &[u8],
119        repaired: bool,
120    ) -> Result<Self, XRefError> {
121        // This is a bit hacky, but the problem is we can't read the resolved trailer dictionary
122        // before we actually created the xref struct. So we first create it using dummy data
123        // and then populate the data.
124        let trailer_data = TrailerData::dummy();
125
126        let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
127            data: Arc::new(Data::new(data)),
128            map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
129            trailer_data,
130        })));
131
132        let mut r = Reader::new(trailer_dict_data);
133        let trailer_dict = r
134            .read_with_context::<Dict>(ReaderContext::new(&xref, false))
135            .ok_or(XRefError::Unknown)?;
136
137        if trailer_dict.get::<Dict>(ENCRYPT).is_some() {
138            warn!("encrypted PDF files are not yet supported");
139
140            return Err(XRefError::Encrypted);
141        }
142
143        let root = trailer_dict.get::<Dict>(ROOT).ok_or(XRefError::Unknown)?;
144        let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
145        let version = root
146            .get::<Name>(VERSION)
147            .and_then(|v| PdfVersion::from_bytes(v.deref()));
148
149        let td = TrailerData {
150            pages_ref: pages_ref.into(),
151            version,
152        };
153
154        match &mut xref.0 {
155            Inner::Dummy => unreachable!(),
156            Inner::Some(r) => {
157                Arc::make_mut(r).trailer_data = td;
158            }
159        }
160
161        Ok(xref)
162    }
163
164    fn is_repaired(&self) -> bool {
165        match &self.0 {
166            Inner::Dummy => false,
167            Inner::Some(r) => {
168                let locked = r.map.read().unwrap();
169                locked.repaired
170            }
171        }
172    }
173
174    pub(crate) fn dummy() -> &'static XRef {
175        DUMMY_XREF
176    }
177
178    pub(crate) fn len(&self) -> usize {
179        match &self.0 {
180            Inner::Dummy => 0,
181            Inner::Some(r) => r.map.read().unwrap().xref_map.len(),
182        }
183    }
184
185    pub(crate) fn trailer_data(&self) -> &TrailerData {
186        match &self.0 {
187            Inner::Dummy => unreachable!(),
188            Inner::Some(r) => &r.trailer_data,
189        }
190    }
191
192    pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
193        match &self.0 {
194            Inner::Dummy => unimplemented!(),
195            Inner::Some(r) => iter::from_fn(move || {
196                let locked = r.map.read().unwrap();
197                let mut iter = locked.xref_map.keys();
198
199                iter.next().and_then(|k| self.get(*k))
200            }),
201        }
202    }
203
204    pub(crate) fn repair(&self) {
205        let Inner::Some(r) = &self.0 else {
206            unreachable!();
207        };
208
209        let mut locked = r.map.try_write().unwrap();
210        assert!(!locked.repaired);
211
212        let (xref_map, _) = fallback_xref_map(r.data.get());
213        locked.xref_map = xref_map;
214        locked.repaired = true;
215    }
216
217    /// Return the object with the given identifier.
218    #[allow(private_bounds)]
219    pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
220    where
221        T: ObjectLike<'a>,
222    {
223        let Inner::Some(repr) = &self.0 else {
224            return None;
225        };
226
227        let locked = repr.map.try_read().unwrap();
228
229        let mut r = Reader::new(repr.data.get());
230
231        let entry = *locked.xref_map.get(&id).or({
232            // An indirect reference to an undefined object shall not be considered an error by a PDF processor; it
233            // shall be treated as a reference to the null object.
234            None
235        })?;
236        drop(locked);
237
238        match entry {
239            EntryType::Normal(offset) => {
240                r.jump(offset);
241
242                if let Some(object) =
243                    r.read_with_context::<IndirectObject<T>>(ReaderContext::new(self, false))
244                {
245                    if object.id() == &id {
246                        return Some(object.get());
247                    }
248                } else {
249                    // There is a valid object at the offset, it's just not of the type the caller
250                    // expected, which is fine.
251                    if r.skip_not_in_content_stream::<IndirectObject<Object>>()
252                        .is_some()
253                    {
254                        return None;
255                    }
256                };
257
258                // The xref table is broken, try to repair if not already repaired.
259                if self.is_repaired() {
260                    error!(
261                        "attempt was made at repairing xref, but object {id:?} still couldn't be read"
262                    );
263
264                    None
265                } else {
266                    warn!("broken xref, attempting to repair");
267
268                    self.repair();
269
270                    // Now try reading again.
271                    self.get::<T>(id)
272                }
273            }
274            EntryType::ObjStream(id, index) => {
275                // Generation number is implicitly 0.
276                let id = ObjectIdentifier::new(id as i32, 0);
277
278                let stream = self.get::<Stream>(id)?;
279                let data = repr.data.get_with(id, self)?;
280                let object_stream =
281                    ObjectStream::new(stream, data, ReaderContext::new(self, false))?;
282                object_stream.get(index)
283            }
284        }
285    }
286}
287
288pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
289    let mut finder = Reader::new(data);
290    let mut pos = finder.len().checked_sub(1)?;
291    finder.jump(pos);
292
293    let needle = b"startxref";
294
295    loop {
296        if finder.forward_tag(needle).is_some() {
297            finder.skip_white_spaces_and_comments();
298
299            let offset = finder.read_without_context::<i32>()?.try_into().ok()?;
300
301            return Some(offset);
302        }
303
304        pos = pos.checked_sub(1)?;
305        finder.jump(pos);
306    }
307}
308
/// A type of xref entry, i.e. where the object behind an identifier can be found.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum EntryType {
    /// An indirect object that is at a specific byte offset in the original data.
    Normal(usize),
    /// An indirect object that is part of an object stream. First number indicates the object
    /// number of the _object stream_ (the generation number is always 0), the second number indicates
    /// the index in the object stream.
    ObjStream(u32, u32),
}
319
/// Maps object identifiers to the location of the corresponding object.
type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
321
/// Representation of a proper xref table.
#[derive(Debug)]
struct MapRepr {
    /// The entries of the table.
    xref_map: XrefMap,
    /// Whether `xref_map` was built by scanning the file instead of reading its xref data.
    repaired: bool,
}
328
/// Information extracted from the trailer dictionary when the xref table is created.
#[derive(Debug, Copy, Clone)]
pub(crate) struct TrailerData {
    /// Reference stored under the `Pages` key of the `Root` dictionary.
    pub pages_ref: ObjectIdentifier,
    /// Version parsed from the `Version` entry of the `Root` dictionary, if present and valid.
    pub version: Option<PdfVersion>,
}
334
335impl TrailerData {
336    pub fn dummy() -> Self {
337        Self {
338            pages_ref: ObjectIdentifier::new(0, 0),
339            version: None,
340        }
341    }
342}
343
/// Backing state of a proper (non-dummy) xref table.
#[derive(Debug, Clone)]
struct SomeRepr {
    /// The PDF data the entries refer to.
    data: Arc<Data>,
    /// The entry map; behind a lock because it is replaced when the table is repaired.
    map: Arc<RwLock<MapRepr>>,
    /// Data extracted from the trailer dictionary.
    trailer_data: TrailerData,
}
350
/// The two possible states of an [`XRef`].
#[derive(Debug, Clone)]
enum Inner {
    /// A dummy xref table that doesn't have any entries.
    Dummy,
    /// A proper xref table.
    Some(Arc<SomeRepr>),
}
358
/// A single parsed entry of a classic xref table.
#[derive(Debug)]
struct XRefEntry {
    /// Value of the 10-digit first field; used as the object's byte offset for in-use entries.
    offset: usize,
    /// Value of the 5-digit second field, the generation number.
    gen_number: i32,
    /// Whether the entry is marked with `n` (in use).
    used: bool,
}

impl XRefEntry {
    /// Parse one xref table entry of the form `oooooooooo ggggg n`.
    ///
    /// Only the first 18 bytes are inspected. Returns `None` if `data` is too short, if either
    /// number field contains a non-digit, or if a value does not fit its target type.
    pub(crate) fn read(data: &[u8]) -> Option<XRefEntry> {
        /// Parse a run of ASCII digits; `None` on any other byte or on overflow.
        fn parse_decimal(digits: &[u8]) -> Option<u64> {
            digits.iter().try_fold(0u64, |acc, &byte| match byte {
                b'0'..=b'9' => acc.checked_mul(10)?.checked_add(u64::from(byte - b'0')),
                _ => None,
            })
        }

        // A 10-digit offset can exceed `u32::MAX`, so accumulate in `u64` and narrow afterwards.
        let offset = usize::try_from(parse_decimal(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_decimal(data.get(11..16)?)?).ok()?;

        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
396
397fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
398    let mut reader = Reader::new(data);
399    reader.jump(pos);
400
401    let mut r2 = reader.clone();
402    if reader
403        .clone()
404        .read_without_context::<ObjectIdentifier>()
405        .is_some()
406    {
407        populate_from_xref_stream(data, &mut r2, xref_map)
408    } else {
409        populate_from_xref_table(data, &mut r2, xref_map)
410    }
411}
412
/// The two numbers that introduce a subsection of a classic xref table.
pub(super) struct SubsectionHeader {
    /// Object number of the first entry in the subsection.
    pub(super) start: u32,
    /// Number of entries in the subsection.
    pub(super) num_entries: u32,
}
417
418impl Readable<'_> for SubsectionHeader {
419    fn read(r: &mut Reader<'_>, _: ReaderContext) -> Option<Self> {
420        r.skip_white_spaces();
421        let start = r.read_without_context::<u32>()?;
422        r.skip_white_spaces();
423        let num_entries = r.read_without_context::<u32>()?;
424        r.skip_white_spaces();
425
426        Some(Self { start, num_entries })
427    }
428}
429
430/// Populate the xref table, and return the trailer dict.
431fn populate_from_xref_table<'a>(
432    data: &'a [u8],
433    reader: &mut Reader<'a>,
434    insert_map: &mut XrefMap,
435) -> Option<&'a [u8]> {
436    let trailer = {
437        let mut reader = reader.clone();
438        read_xref_table_trailer(&mut reader, ReaderContext::dummy())?
439    };
440
441    reader.skip_white_spaces();
442    reader.forward_tag(b"xref")?;
443    reader.skip_white_spaces();
444
445    let mut max_obj = 0;
446
447    if let Some(prev) = trailer.get::<i32>(PREV) {
448        // First insert the entries from any previous xref tables.
449        populate_xref_impl(data, prev as usize, insert_map)?;
450    }
451
452    // In hybrid files, entries in `XRefStm` should have higher priority, therefore we insert them
453    // after looking at `PREV`.
454    if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
455        populate_xref_impl(data, xref_stm as usize, insert_map)?;
456    }
457
458    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
459        reader.skip_white_spaces();
460
461        let start = header.start;
462        let end = start + header.num_entries;
463
464        for obj_number in start..end {
465            max_obj = max(max_obj, obj_number);
466            let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
467            let entry = XRefEntry::read(bytes)?;
468
469            // Specification says we should ignore any object number > SIZE, but probably
470            // not important?
471            if entry.used {
472                insert_map.insert(
473                    ObjectIdentifier::new(obj_number as i32, entry.gen_number),
474                    EntryType::Normal(entry.offset),
475                );
476            }
477        }
478    }
479
480    Some(trailer.data())
481}
482
483fn populate_from_xref_stream<'a>(
484    data: &'a [u8],
485    reader: &mut Reader<'a>,
486    insert_map: &mut XrefMap,
487) -> Option<&'a [u8]> {
488    let stream = reader
489        .read_with_context::<IndirectObject<Stream>>(ReaderContext::dummy())?
490        .get();
491
492    if let Some(prev) = stream.dict().get::<i32>(PREV) {
493        // First insert the entries from any previous xref tables.
494        let _ = populate_xref_impl(data, prev as usize, insert_map)?;
495    }
496
497    let size = stream.dict().get::<u32>(SIZE)?;
498
499    let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
500
501    if f2_len > size_of::<u64>() as u8 {
502        error!("xref offset length is larger than the allowed limit");
503
504        return None;
505    }
506
507    // Do such files exist?
508    if f1_len != 1 {
509        warn!("first field in xref stream was longer than 1");
510    }
511
512    let xref_data = stream.decoded().ok()?;
513    let mut xref_reader = Reader::new(xref_data.as_ref());
514
515    if let Some(arr) = stream.dict().get::<Array>(INDEX) {
516        let iter = arr.iter::<(u32, u32)>();
517
518        for (start, num_elements) in iter {
519            xref_stream_subsection(
520                &mut xref_reader,
521                start,
522                num_elements,
523                f1_len,
524                f2_len,
525                f3_len,
526                insert_map,
527            )?;
528        }
529    } else {
530        xref_stream_subsection(
531            &mut xref_reader,
532            0,
533            size,
534            f1_len,
535            f2_len,
536            f3_len,
537            insert_map,
538        )?;
539    }
540
541    Some(stream.dict().data())
542}
543
544fn xref_stream_num(data: &[u8]) -> Option<u32> {
545    Some(match data.len() {
546        0 => return None,
547        1 => u8::from_be(data[0]) as u32,
548        2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
549        3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
550        4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
551        8 => {
552            if let Ok(num) = u32::try_from(u64::from_be_bytes(data[0..8].try_into().ok()?)) {
553                return Some(num);
554            } else {
555                warn!("xref stream number is too large");
556
557                return None;
558            }
559        }
560        n => {
561            warn!("invalid xref stream number {n}");
562
563            return None;
564        }
565    })
566}
567
568fn xref_stream_subsection<'a>(
569    xref_reader: &mut Reader<'a>,
570    start: u32,
571    num_elements: u32,
572    f1_len: u8,
573    f2_len: u8,
574    f3_len: u8,
575    insert_map: &mut XrefMap,
576) -> Option<()> {
577    for i in 0..num_elements {
578        let f_type = if f1_len == 0 {
579            1
580        } else {
581            // We assume a length of 1.
582            xref_reader.read_bytes(1)?[0]
583        };
584
585        let obj_number = start + i;
586
587        match f_type {
588            // We don't care about free objects.
589            0 => {
590                xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
591            }
592            1 => {
593                let offset = if f2_len > 0 {
594                    let data = xref_reader.read_bytes(f2_len as usize)?;
595                    xref_stream_num(data)?
596                } else {
597                    0
598                };
599
600                let gen_number = if f3_len > 0 {
601                    let data = xref_reader.read_bytes(f3_len as usize)?;
602                    xref_stream_num(data)?
603                } else {
604                    0
605                };
606
607                insert_map.insert(
608                    ObjectIdentifier::new(obj_number as i32, gen_number as i32),
609                    EntryType::Normal(offset as usize),
610                );
611            }
612            2 => {
613                let obj_stream_number = {
614                    let data = xref_reader.read_bytes(f2_len as usize)?;
615                    xref_stream_num(data)?
616                };
617                let gen_number = 0;
618                let index = if f3_len > 0 {
619                    let data = xref_reader.read_bytes(f3_len as usize)?;
620                    xref_stream_num(data)?
621                } else {
622                    0
623                };
624
625                insert_map.insert(
626                    ObjectIdentifier::new(obj_number as i32, gen_number),
627                    EntryType::ObjStream(obj_stream_number, index),
628                );
629            }
630            _ => {
631                warn!("xref has unknown field type {f_type}");
632
633                return None;
634            }
635        }
636    }
637
638    Some(())
639}
640
641fn read_xref_table_trailer<'a>(
642    reader: &mut Reader<'a>,
643    ctx: ReaderContext<'a>,
644) -> Option<Dict<'a>> {
645    reader.skip_white_spaces();
646    reader.forward_tag(b"xref")?;
647    reader.skip_white_spaces();
648
649    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
650        reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
651    }
652
653    reader.skip_white_spaces();
654    reader.forward_tag(b"trailer")?;
655    reader.skip_white_spaces();
656
657    reader.read_with_context::<Dict>(ctx)
658}
659
/// An object stream together with the index of the objects it contains.
struct ObjectStream<'a> {
    /// The bytes the offsets in `offsets` refer to.
    data: &'a [u8],
    /// Context used when reading objects out of the stream.
    ctx: ReaderContext<'a>,
    /// One `(object number, offset into data)` pair per object, in the order they are listed.
    offsets: Vec<(u32, usize)>,
}
665
666impl<'a> ObjectStream<'a> {
667    fn new(inner: Stream<'a>, data: &'a [u8], ctx: ReaderContext<'a>) -> Option<Self> {
668        let num_objects = inner.dict().get::<usize>(N)?;
669        let first_offset = inner.dict().get::<usize>(FIRST)?;
670
671        let mut r = Reader::new(data);
672
673        let mut offsets = vec![];
674
675        for _ in 0..num_objects {
676            r.skip_white_spaces_and_comments();
677            // Skip object number
678            let obj_num = r.read_without_context::<u32>()?;
679            r.skip_white_spaces_and_comments();
680            let relative_offset = r.read_without_context::<usize>()?;
681            offsets.push((obj_num, first_offset + relative_offset));
682        }
683
684        Some(Self { data, ctx, offsets })
685    }
686
687    fn get<T>(&self, index: u32) -> Option<T>
688    where
689        T: ObjectLike<'a>,
690    {
691        let offset = self.offsets.get(index as usize)?.1;
692        let mut r = Reader::new(self.data);
693        r.jump(offset);
694
695        r.read_with_context::<T>(self.ctx)
696    }
697}