hayro_syntax/
xref.rs

1//! Reading and querying the xref table of a PDF file.
2
3use crate::PdfData;
4use crate::data::Data;
5use crate::object::Array;
6use crate::object::Dict;
7use crate::object::Name;
8use crate::object::ObjectIdentifier;
9use crate::object::Stream;
10use crate::object::dict::keys::{
11    ENCRYPT, FIRST, INDEX, N, PAGES, PREV, ROOT, SIZE, TYPE, VERSION, W, XREF_STM,
12};
13use crate::object::indirect::IndirectObject;
14use crate::object::{Object, ObjectLike};
15use crate::pdf::PdfVersion;
16use crate::reader::{Readable, Reader, ReaderContext};
17use log::{error, warn};
18use rustc_hash::FxHashMap;
19use std::cmp::max;
20use std::iter;
21use std::ops::Deref;
22use std::sync::{Arc, RwLock};
23
/// Number of bytes read for each entry of a classic (table-style) xref section.
pub(crate) const XREF_ENTRY_LEN: usize = 20;
25
/// Reasons why building an [`XRef`] can fail.
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefError {
    /// The xref position, the xref data or the trailer could not be found or parsed.
    Unknown,
    /// The trailer dictionary has a dictionary under the `ENCRYPT` key; such files are
    /// rejected.
    Encrypted,
}
31
32/// Parse the "root" xref from the PDF.
33pub(crate) fn root_xref(data: PdfData) -> Result<XRef, XRefError> {
34    let mut xref_map = FxHashMap::default();
35    let xref_pos = find_last_xref_pos(data.as_ref().as_ref()).ok_or(XRefError::Unknown)?;
36    let trailer = populate_xref_impl(data.as_ref().as_ref(), xref_pos, &mut xref_map)
37        .ok_or(XRefError::Unknown)?;
38
39    XRef::new(data.clone(), xref_map, trailer, false)
40}
41
42/// Try to manually parse the PDF to build an xref table and trailer dictionary.
43pub(crate) fn fallback(data: PdfData) -> Option<XRef> {
44    warn!("xref table was invalid, trying to manually build xref table");
45    let (xref_map, trailer_dict) = fallback_xref_map(data.as_ref().as_ref());
46
47    if let Some(trailer_dict_data) = trailer_dict {
48        warn!("rebuild xref table with {} entries", xref_map.len());
49
50        XRef::new(data.clone(), xref_map, trailer_dict_data, true).ok()
51    } else {
52        warn!("couldn't find trailer dictionary, failed to rebuild xref table");
53
54        None
55    }
56}
57
58fn fallback_xref_map(data: &[u8]) -> (XrefMap, Option<&[u8]>) {
59    let mut xref_map = FxHashMap::default();
60    let mut trailer_dict = None;
61
62    let mut r = Reader::new(data);
63
64    let mut dummy_ctx = ReaderContext::dummy();
65    let mut last_obj_num = None;
66
67    loop {
68        let cur_pos = r.offset();
69
70        let mut old_r = r.clone();
71
72        if let Some(obj_id) = r.read::<ObjectIdentifier>(dummy_ctx) {
73            xref_map.insert(obj_id, EntryType::Normal(cur_pos));
74            last_obj_num = Some(obj_id);
75            dummy_ctx.obj_number = Some(obj_id);
76        } else if let Some(dict) = r.read::<Dict>(dummy_ctx) {
77            if dict.contains_key(SIZE) && dict.contains_key(ROOT) {
78                trailer_dict = Some(dict.clone());
79            }
80
81            if let Some(stream) = old_r.read::<Stream>(dummy_ctx)
82                && stream.dict().get::<Name>(TYPE).as_deref() == Some(b"ObjStm")
83                && let Some(data) = stream.decoded().ok()
84                && let Some(last_obj_num) = last_obj_num
85                && let Some(obj_stream) = ObjectStream::new(stream, &data, dummy_ctx)
86            {
87                for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
88                    let id = ObjectIdentifier::new(*obj_num as i32, 0);
89                    xref_map.insert(
90                        id,
91                        EntryType::ObjStream(last_obj_num.obj_num as u32, idx as u32),
92                    );
93                }
94            }
95        } else {
96            r.read_byte();
97        }
98
99        if r.at_end() {
100            break;
101        }
102    }
103
104    (xref_map, trailer_dict.map(|d| d.data()))
105}
106
/// Shared instance handed out by [`XRef::dummy`]; it wraps [`Inner::Dummy`].
static DUMMY_XREF: &XRef = &XRef(Inner::Dummy);
108
/// An xref table.
///
/// Wraps either a dummy table without entries or a reference-counted representation of a
/// table read from PDF data (see `Inner`).
#[derive(Debug, Clone)]
pub struct XRef(Inner);
112
113impl XRef {
114    fn new(
115        data: PdfData,
116        xref_map: XrefMap,
117        trailer_dict_data: &[u8],
118        repaired: bool,
119    ) -> Result<Self, XRefError> {
120        // This is a bit hacky, but the problem is we can't read the resolved trailer dictionary
121        // before we actually created the xref struct. So we first create it using dummy data
122        // and then populate the data.
123        let trailer_data = TrailerData::dummy();
124
125        let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
126            data: Arc::new(Data::new(data)),
127            map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
128            trailer_data,
129        })));
130
131        let mut r = Reader::new(trailer_dict_data);
132        let trailer_dict = r
133            .read_with_context::<Dict>(ReaderContext::new(&xref, false))
134            .ok_or(XRefError::Unknown)?;
135
136        if trailer_dict.get::<Dict>(ENCRYPT).is_some() {
137            warn!("encrypted PDF files are not yet supported");
138
139            return Err(XRefError::Encrypted);
140        }
141
142        let root = trailer_dict.get::<Dict>(ROOT).ok_or(XRefError::Unknown)?;
143        let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
144        let version = root
145            .get::<Name>(VERSION)
146            .and_then(|v| PdfVersion::from_bytes(v.deref()));
147
148        let td = TrailerData {
149            pages_ref: pages_ref.into(),
150            version,
151        };
152
153        match &mut xref.0 {
154            Inner::Dummy => unreachable!(),
155            Inner::Some(r) => {
156                Arc::make_mut(r).trailer_data = td;
157            }
158        }
159
160        Ok(xref)
161    }
162
163    fn is_repaired(&self) -> bool {
164        match &self.0 {
165            Inner::Dummy => false,
166            Inner::Some(r) => {
167                let locked = r.map.read().unwrap();
168                locked.repaired
169            }
170        }
171    }
172
    /// Return the shared dummy xref (`DUMMY_XREF`), which wraps `Inner::Dummy`.
    pub(crate) fn dummy() -> &'static XRef {
        DUMMY_XREF
    }
176
177    pub(crate) fn len(&self) -> usize {
178        match &self.0 {
179            Inner::Dummy => 0,
180            Inner::Some(r) => r.map.read().unwrap().xref_map.len(),
181        }
182    }
183
184    pub(crate) fn trailer_data(&self) -> &TrailerData {
185        match &self.0 {
186            Inner::Dummy => unreachable!(),
187            Inner::Some(r) => &r.trailer_data,
188        }
189    }
190
191    pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
192        match &self.0 {
193            Inner::Dummy => unimplemented!(),
194            Inner::Some(r) => iter::from_fn(move || {
195                let locked = r.map.read().unwrap();
196                let mut iter = locked.xref_map.keys();
197
198                iter.next().and_then(|k| self.get(*k))
199            }),
200        }
201    }
202
203    pub(crate) fn repair(&self) {
204        let Inner::Some(r) = &self.0 else {
205            unreachable!();
206        };
207
208        let mut locked = r.map.try_write().unwrap();
209        assert!(!locked.repaired);
210
211        let (xref_map, _) = fallback_xref_map(r.data.get());
212        locked.xref_map = xref_map;
213        locked.repaired = true;
214    }
215
216    /// Return the object with the given identifier.
217    #[allow(private_bounds)]
218    pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
219    where
220        T: ObjectLike<'a>,
221    {
222        let Inner::Some(repr) = &self.0 else {
223            return None;
224        };
225
226        let locked = repr.map.try_read().unwrap();
227
228        let mut r = Reader::new(repr.data.get());
229
230        let entry = *locked.xref_map.get(&id).or({
231            // An indirect reference to an undefined object shall not be considered an error by a PDF processor; it
232            // shall be treated as a reference to the null object.
233            None
234        })?;
235        drop(locked);
236
237        match entry {
238            EntryType::Normal(offset) => {
239                r.jump(offset);
240
241                if let Some(object) =
242                    r.read_with_context::<IndirectObject<T>>(ReaderContext::new(self, false))
243                {
244                    if object.id() == &id {
245                        return Some(object.get());
246                    }
247                } else {
248                    // There is a valid object at the offset, it's just not of the type the caller
249                    // expected, which is fine.
250                    if r.skip_not_in_content_stream::<IndirectObject<Object>>()
251                        .is_some()
252                    {
253                        return None;
254                    }
255                };
256
257                // The xref table is broken, try to repair if not already repaired.
258                if self.is_repaired() {
259                    error!(
260                        "attempt was made at repairing xref, but object {id:?} still couldn't be read"
261                    );
262
263                    None
264                } else {
265                    warn!("broken xref, attempting to repair");
266
267                    self.repair();
268
269                    // Now try reading again.
270                    self.get::<T>(id)
271                }
272            }
273            EntryType::ObjStream(obj_stram_gen_num, index) => {
274                // Generation number is implicitly 0.
275                let obj_stream_id = ObjectIdentifier::new(obj_stram_gen_num as i32, 0);
276
277                let stream = self.get::<Stream>(obj_stream_id)?;
278                let data = repr.data.get_with(obj_stream_id, self)?;
279                let object_stream =
280                    ObjectStream::new(stream, data, ReaderContext::new(self, false))?;
281                object_stream.get(index)
282            }
283        }
284    }
285}
286
287pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
288    let mut finder = Reader::new(data);
289    let mut pos = finder.len().checked_sub(1)?;
290    finder.jump(pos);
291
292    let needle = b"startxref";
293
294    loop {
295        if finder.forward_tag(needle).is_some() {
296            finder.skip_white_spaces_and_comments();
297
298            let offset = finder.read_without_context::<i32>()?.try_into().ok()?;
299
300            return Some(offset);
301        }
302
303        pos = pos.checked_sub(1)?;
304        finder.jump(pos);
305    }
306}
307
/// A type of xref entry.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum EntryType {
    /// An indirect object that is at a specific byte offset in the original data.
    Normal(usize),
    /// An indirect object that is part of an object stream. First number indicates the object
    /// number of the _object stream_ (the generation number is always 0), the second number
    /// indicates the index in the object stream.
    ObjStream(u32, u32),
}
318
/// Maps object identifiers to the location of the corresponding object.
type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
320
/// Representation of a proper xref table.
#[derive(Debug)]
struct MapRepr {
    /// The entries of the table.
    xref_map: XrefMap,
    /// Whether the entries were produced by scanning the file instead of reading its xref.
    repaired: bool,
}
327
/// Values extracted from the trailer dictionary when an xref is created.
#[derive(Debug, Copy, Clone)]
pub(crate) struct TrailerData {
    /// The reference stored under the `PAGES` key of the root dictionary.
    pub pages_ref: ObjectIdentifier,
    /// The version parsed from the `VERSION` entry of the root dictionary, if there is one
    /// that can be parsed.
    pub version: Option<PdfVersion>,
}

impl TrailerData {
    /// Placeholder value: object identifier `0 0` and no version.
    pub fn dummy() -> Self {
        Self {
            pages_ref: ObjectIdentifier::new(0, 0),
            version: None,
        }
    }
}
342
/// Shared state of an xref that was built from PDF data.
#[derive(Debug, Clone)]
struct SomeRepr {
    /// The PDF data, wrapped in `Data`.
    data: Arc<Data>,
    /// The entries plus the repaired flag. Kept behind a lock because `XRef::repair` replaces
    /// them through a shared reference.
    map: Arc<RwLock<MapRepr>>,
    /// Values read from the trailer dictionary.
    trailer_data: TrailerData,
}
349
/// The two possible states of an [`XRef`].
#[derive(Debug, Clone)]
enum Inner {
    /// A dummy xref table that doesn't have any entries.
    Dummy,
    /// A proper xref table.
    Some(Arc<SomeRepr>),
}
357
/// A single parsed entry of a classic xref table.
#[derive(Debug)]
struct XRefEntry {
    /// Value of the 10-digit first field.
    offset: usize,
    /// Value of the 5-digit second field.
    gen_number: i32,
    /// Whether the entry is marked with `n`.
    used: bool,
}

impl XRefEntry {
    /// Parse one xref table entry from `data`.
    ///
    /// Bytes 0..10 hold the offset, bytes 11..16 the generation number and byte 17 the
    /// marker. Returns `None` if `data` is too short, if a numeric field contains anything
    /// but ASCII digits, or if a value does not fit its target type.
    pub(crate) fn read(data: &[u8]) -> Option<XRefEntry> {
        /// Parse a run of ASCII digits; `None` on any other byte or on overflow.
        fn parse_digits(digits: &[u8]) -> Option<u64> {
            digits.iter().try_fold(0u64, |acc, &byte| match byte {
                b'0'..=b'9' => acc.checked_mul(10)?.checked_add(u64::from(byte - b'0')),
                _ => None,
            })
        }

        // Ten decimal digits can exceed `u32::MAX`, so the fields are accumulated as `u64`
        // and narrowed with a check instead of wrapping.
        let offset = usize::try_from(parse_digits(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_digits(data.get(11..16)?)?).ok()?;

        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
395
396fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
397    let mut reader = Reader::new(data);
398    reader.jump(pos);
399
400    let mut r2 = reader.clone();
401    if reader
402        .clone()
403        .read_without_context::<ObjectIdentifier>()
404        .is_some()
405    {
406        populate_from_xref_stream(data, &mut r2, xref_map)
407    } else {
408        populate_from_xref_table(data, &mut r2, xref_map)
409    }
410}
411
412pub(super) struct SubsectionHeader {
413    pub(super) start: u32,
414    pub(super) num_entries: u32,
415}
416
417impl Readable<'_> for SubsectionHeader {
418    fn read(r: &mut Reader<'_>, _: ReaderContext) -> Option<Self> {
419        r.skip_white_spaces();
420        let start = r.read_without_context::<u32>()?;
421        r.skip_white_spaces();
422        let num_entries = r.read_without_context::<u32>()?;
423        r.skip_white_spaces();
424
425        Some(Self { start, num_entries })
426    }
427}
428
429/// Populate the xref table, and return the trailer dict.
430fn populate_from_xref_table<'a>(
431    data: &'a [u8],
432    reader: &mut Reader<'a>,
433    insert_map: &mut XrefMap,
434) -> Option<&'a [u8]> {
435    let trailer = {
436        let mut reader = reader.clone();
437        read_xref_table_trailer(&mut reader, ReaderContext::dummy())?
438    };
439
440    reader.skip_white_spaces();
441    reader.forward_tag(b"xref")?;
442    reader.skip_white_spaces();
443
444    let mut max_obj = 0;
445
446    if let Some(prev) = trailer.get::<i32>(PREV) {
447        // First insert the entries from any previous xref tables.
448        populate_xref_impl(data, prev as usize, insert_map)?;
449    }
450
451    // In hybrid files, entries in `XRefStm` should have higher priority, therefore we insert them
452    // after looking at `PREV`.
453    if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
454        populate_xref_impl(data, xref_stm as usize, insert_map)?;
455    }
456
457    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
458        reader.skip_white_spaces();
459
460        let start = header.start;
461        let end = start + header.num_entries;
462
463        for obj_number in start..end {
464            max_obj = max(max_obj, obj_number);
465            let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
466            let entry = XRefEntry::read(bytes)?;
467
468            // Specification says we should ignore any object number > SIZE, but probably
469            // not important?
470            if entry.used {
471                insert_map.insert(
472                    ObjectIdentifier::new(obj_number as i32, entry.gen_number),
473                    EntryType::Normal(entry.offset),
474                );
475            }
476        }
477    }
478
479    Some(trailer.data())
480}
481
482fn populate_from_xref_stream<'a>(
483    data: &'a [u8],
484    reader: &mut Reader<'a>,
485    insert_map: &mut XrefMap,
486) -> Option<&'a [u8]> {
487    let stream = reader
488        .read_with_context::<IndirectObject<Stream>>(ReaderContext::dummy())?
489        .get();
490
491    if let Some(prev) = stream.dict().get::<i32>(PREV) {
492        // First insert the entries from any previous xref tables.
493        let _ = populate_xref_impl(data, prev as usize, insert_map)?;
494    }
495
496    let size = stream.dict().get::<u32>(SIZE)?;
497
498    let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
499
500    if f2_len > size_of::<u64>() as u8 {
501        error!("xref offset length is larger than the allowed limit");
502
503        return None;
504    }
505
506    // Do such files exist?
507    if f1_len != 1 {
508        warn!("first field in xref stream was longer than 1");
509    }
510
511    let xref_data = stream.decoded().ok()?;
512    let mut xref_reader = Reader::new(xref_data.as_ref());
513
514    if let Some(arr) = stream.dict().get::<Array>(INDEX) {
515        let iter = arr.iter::<(u32, u32)>();
516
517        for (start, num_elements) in iter {
518            xref_stream_subsection(
519                &mut xref_reader,
520                start,
521                num_elements,
522                f1_len,
523                f2_len,
524                f3_len,
525                insert_map,
526            )?;
527        }
528    } else {
529        xref_stream_subsection(
530            &mut xref_reader,
531            0,
532            size,
533            f1_len,
534            f2_len,
535            f3_len,
536            insert_map,
537        )?;
538    }
539
540    Some(stream.dict().data())
541}
542
543fn xref_stream_num(data: &[u8]) -> Option<u32> {
544    Some(match data.len() {
545        0 => return None,
546        1 => u8::from_be(data[0]) as u32,
547        2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
548        3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
549        4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
550        8 => {
551            if let Ok(num) = u32::try_from(u64::from_be_bytes(data[0..8].try_into().ok()?)) {
552                return Some(num);
553            } else {
554                warn!("xref stream number is too large");
555
556                return None;
557            }
558        }
559        n => {
560            warn!("invalid xref stream number {n}");
561
562            return None;
563        }
564    })
565}
566
567fn xref_stream_subsection<'a>(
568    xref_reader: &mut Reader<'a>,
569    start: u32,
570    num_elements: u32,
571    f1_len: u8,
572    f2_len: u8,
573    f3_len: u8,
574    insert_map: &mut XrefMap,
575) -> Option<()> {
576    for i in 0..num_elements {
577        let f_type = if f1_len == 0 {
578            1
579        } else {
580            // We assume a length of 1.
581            xref_reader.read_bytes(1)?[0]
582        };
583
584        let obj_number = start + i;
585
586        match f_type {
587            // We don't care about free objects.
588            0 => {
589                xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
590            }
591            1 => {
592                let offset = if f2_len > 0 {
593                    let data = xref_reader.read_bytes(f2_len as usize)?;
594                    xref_stream_num(data)?
595                } else {
596                    0
597                };
598
599                let gen_number = if f3_len > 0 {
600                    let data = xref_reader.read_bytes(f3_len as usize)?;
601                    xref_stream_num(data)?
602                } else {
603                    0
604                };
605
606                insert_map.insert(
607                    ObjectIdentifier::new(obj_number as i32, gen_number as i32),
608                    EntryType::Normal(offset as usize),
609                );
610            }
611            2 => {
612                let obj_stream_number = {
613                    let data = xref_reader.read_bytes(f2_len as usize)?;
614                    xref_stream_num(data)?
615                };
616                let gen_number = 0;
617                let index = if f3_len > 0 {
618                    let data = xref_reader.read_bytes(f3_len as usize)?;
619                    xref_stream_num(data)?
620                } else {
621                    0
622                };
623
624                insert_map.insert(
625                    ObjectIdentifier::new(obj_number as i32, gen_number),
626                    EntryType::ObjStream(obj_stream_number, index),
627                );
628            }
629            _ => {
630                warn!("xref has unknown field type {f_type}");
631
632                return None;
633            }
634        }
635    }
636
637    Some(())
638}
639
640fn read_xref_table_trailer<'a>(
641    reader: &mut Reader<'a>,
642    ctx: ReaderContext<'a>,
643) -> Option<Dict<'a>> {
644    reader.skip_white_spaces();
645    reader.forward_tag(b"xref")?;
646    reader.skip_white_spaces();
647
648    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
649        reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
650    }
651
652    reader.skip_white_spaces();
653    reader.forward_tag(b"trailer")?;
654    reader.skip_white_spaces();
655
656    reader.read_with_context::<Dict>(ctx)
657}
658
659struct ObjectStream<'a> {
660    data: &'a [u8],
661    ctx: ReaderContext<'a>,
662    offsets: Vec<(u32, usize)>,
663}
664
665impl<'a> ObjectStream<'a> {
666    fn new(inner: Stream<'a>, data: &'a [u8], ctx: ReaderContext<'a>) -> Option<Self> {
667        let num_objects = inner.dict().get::<usize>(N)?;
668        let first_offset = inner.dict().get::<usize>(FIRST)?;
669
670        let mut r = Reader::new(data);
671
672        let mut offsets = vec![];
673
674        for _ in 0..num_objects {
675            r.skip_white_spaces_and_comments();
676            // Skip object number
677            let obj_num = r.read_without_context::<u32>()?;
678            r.skip_white_spaces_and_comments();
679            let relative_offset = r.read_without_context::<usize>()?;
680            offsets.push((obj_num, first_offset + relative_offset));
681        }
682
683        Some(Self { data, ctx, offsets })
684    }
685
686    fn get<T>(&self, index: u32) -> Option<T>
687    where
688        T: ObjectLike<'a>,
689    {
690        let offset = self.offsets.get(index as usize)?.1;
691        let mut r = Reader::new(self.data);
692        r.jump(offset);
693        r.skip_white_spaces_and_comments();
694
695        r.read_with_context::<T>(self.ctx)
696    }
697}