1use crate::PdfData;
4use crate::data::Data;
5use crate::object::Array;
6use crate::object::Dict;
7use crate::object::Name;
8use crate::object::ObjectIdentifier;
9use crate::object::Stream;
10use crate::object::dict::keys::{
11 ENCRYPT, FIRST, INDEX, N, PAGES, PREV, ROOT, SIZE, TYPE, VERSION, W, XREF_STM,
12};
13use crate::object::indirect::IndirectObject;
14use crate::object::{Object, ObjectLike};
15use crate::pdf::PdfVersion;
16use crate::reader::{Readable, Reader, ReaderContext};
17use log::{error, warn};
18use rustc_hash::FxHashMap;
19use std::cmp::max;
20use std::iter;
21use std::ops::Deref;
22use std::sync::{Arc, RwLock};
23
/// Length in bytes of a single entry in a classic (table-style) cross-reference section.
pub(crate) const XREF_ENTRY_LEN: usize = 20;
25
/// Reasons why building an [`XRef`] can fail.
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefError {
    /// The xref position, the xref data or the trailer could not be located or parsed.
    Unknown,
    /// The trailer dictionary has an `ENCRYPT` entry; encrypted files are not supported.
    Encrypted,
}
31
32pub(crate) fn root_xref(data: PdfData) -> Result<XRef, XRefError> {
34 let mut xref_map = FxHashMap::default();
35 let xref_pos = find_last_xref_pos(data.as_ref().as_ref()).ok_or(XRefError::Unknown)?;
36 let trailer = populate_xref_impl(data.as_ref().as_ref(), xref_pos, &mut xref_map)
37 .ok_or(XRefError::Unknown)?;
38
39 XRef::new(data.clone(), xref_map, trailer, false)
40}
41
42pub(crate) fn fallback(data: PdfData) -> Option<XRef> {
44 warn!("xref table was invalid, trying to manually build xref table");
45 let (xref_map, trailer_dict) = fallback_xref_map(data.as_ref().as_ref());
46
47 if let Some(trailer_dict_data) = trailer_dict {
48 warn!("rebuild xref table with {} entries", xref_map.len());
49
50 XRef::new(data.clone(), xref_map, trailer_dict_data, true).ok()
51 } else {
52 warn!("couldn't find trailer dictionary, failed to rebuild xref table");
53
54 None
55 }
56}
57
/// Reconstructs an xref map by scanning `data` from start to end.
///
/// Returns the rebuilt map together with the raw bytes of the last dictionary
/// that contains both `SIZE` and `ROOT` (used as trailer dictionary), if any.
fn fallback_xref_map(data: &[u8]) -> (XrefMap, Option<&[u8]>) {
    let mut xref_map = FxHashMap::default();
    let mut trailer_dict = None;

    let mut r = Reader::new(data);

    let mut dummy_ctx = ReaderContext::dummy();
    // Most recently seen object identifier; an object stream found afterwards
    // is attributed to it.
    let mut last_obj_num = None;

    loop {
        let cur_pos = r.offset();

        // Snapshot of the reader at `cur_pos`, so that a dictionary can be
        // re-read as a full stream below.
        let mut old_r = r.clone();

        if let Some(obj_id) = r.read::<ObjectIdentifier>(dummy_ctx) {
            // Later occurrences of the same identifier overwrite earlier ones.
            xref_map.insert(obj_id, EntryType::Normal(cur_pos));
            last_obj_num = Some(obj_id);
            dummy_ctx.obj_number = Some(obj_id);
        } else if let Some(dict) = r.read::<Dict>(dummy_ctx) {
            // Any dictionary with both keys is treated as a trailer candidate;
            // the last one found wins.
            if dict.contains_key(SIZE) && dict.contains_key(ROOT) {
                trailer_dict = Some(dict.clone());
            }

            // If the dictionary belongs to an object stream, register every
            // object listed in it as living inside the stream identified by
            // `last_obj_num`.
            if let Some(stream) = old_r.read::<Stream>(dummy_ctx)
                && stream.dict().get::<Name>(TYPE).as_deref() == Some(b"ObjStm")
                && let Some(data) = stream.decoded().ok()
                && let Some(last_obj_num) = last_obj_num
                && let Some(obj_stream) = ObjectStream::new(stream, &data, dummy_ctx)
            {
                for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
                    let id = ObjectIdentifier::new(*obj_num as i32, 0);
                    xref_map.insert(
                        id,
                        EntryType::ObjStream(last_obj_num.obj_num as u32, idx as u32),
                    );
                }
            }
        } else {
            // Nothing recognizable here; advance by one byte and try again.
            r.read_byte();
        }

        if r.at_end() {
            break;
        }
    }

    (xref_map, trailer_dict.map(|d| d.data()))
}
106
/// Shared placeholder xref that contains no objects; handed out by [`XRef::dummy`].
static DUMMY_XREF: &XRef = &XRef(Inner::Dummy);
108
/// Cross-reference data of a PDF file: maps object identifiers to the place
/// where the object is stored and resolves them on request.
#[derive(Debug, Clone)]
pub struct XRef(Inner);
112
113impl XRef {
114 fn new(
115 data: PdfData,
116 xref_map: XrefMap,
117 trailer_dict_data: &[u8],
118 repaired: bool,
119 ) -> Result<Self, XRefError> {
120 let trailer_data = TrailerData::dummy();
124
125 let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
126 data: Arc::new(Data::new(data)),
127 map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
128 trailer_data,
129 })));
130
131 let mut r = Reader::new(trailer_dict_data);
132 let trailer_dict = r
133 .read_with_context::<Dict>(ReaderContext::new(&xref, false))
134 .ok_or(XRefError::Unknown)?;
135
136 if trailer_dict.get::<Dict>(ENCRYPT).is_some() {
137 warn!("encrypted PDF files are not yet supported");
138
139 return Err(XRefError::Encrypted);
140 }
141
142 let root = trailer_dict.get::<Dict>(ROOT).ok_or(XRefError::Unknown)?;
143 let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
144 let version = root
145 .get::<Name>(VERSION)
146 .and_then(|v| PdfVersion::from_bytes(v.deref()));
147
148 let td = TrailerData {
149 pages_ref: pages_ref.into(),
150 version,
151 };
152
153 match &mut xref.0 {
154 Inner::Dummy => unreachable!(),
155 Inner::Some(r) => {
156 Arc::make_mut(r).trailer_data = td;
157 }
158 }
159
160 Ok(xref)
161 }
162
163 fn is_repaired(&self) -> bool {
164 match &self.0 {
165 Inner::Dummy => false,
166 Inner::Some(r) => {
167 let locked = r.map.read().unwrap();
168 locked.repaired
169 }
170 }
171 }
172
173 pub(crate) fn dummy() -> &'static XRef {
174 DUMMY_XREF
175 }
176
177 pub(crate) fn len(&self) -> usize {
178 match &self.0 {
179 Inner::Dummy => 0,
180 Inner::Some(r) => r.map.read().unwrap().xref_map.len(),
181 }
182 }
183
184 pub(crate) fn trailer_data(&self) -> &TrailerData {
185 match &self.0 {
186 Inner::Dummy => unreachable!(),
187 Inner::Some(r) => &r.trailer_data,
188 }
189 }
190
191 pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
192 match &self.0 {
193 Inner::Dummy => unimplemented!(),
194 Inner::Some(r) => iter::from_fn(move || {
195 let locked = r.map.read().unwrap();
196 let mut iter = locked.xref_map.keys();
197
198 iter.next().and_then(|k| self.get(*k))
199 }),
200 }
201 }
202
203 pub(crate) fn repair(&self) {
204 let Inner::Some(r) = &self.0 else {
205 unreachable!();
206 };
207
208 let mut locked = r.map.try_write().unwrap();
209 assert!(!locked.repaired);
210
211 let (xref_map, _) = fallback_xref_map(r.data.get());
212 locked.xref_map = xref_map;
213 locked.repaired = true;
214 }
215
216 #[allow(private_bounds)]
218 pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
219 where
220 T: ObjectLike<'a>,
221 {
222 let Inner::Some(repr) = &self.0 else {
223 return None;
224 };
225
226 let locked = repr.map.try_read().unwrap();
227
228 let mut r = Reader::new(repr.data.get());
229
230 let entry = *locked.xref_map.get(&id).or({
231 None
234 })?;
235 drop(locked);
236
237 match entry {
238 EntryType::Normal(offset) => {
239 r.jump(offset);
240
241 if let Some(object) =
242 r.read_with_context::<IndirectObject<T>>(ReaderContext::new(self, false))
243 {
244 if object.id() == &id {
245 return Some(object.get());
246 }
247 } else {
248 if r.skip_not_in_content_stream::<IndirectObject<Object>>()
251 .is_some()
252 {
253 return None;
254 }
255 };
256
257 if self.is_repaired() {
259 error!(
260 "attempt was made at repairing xref, but object {id:?} still couldn't be read"
261 );
262
263 None
264 } else {
265 warn!("broken xref, attempting to repair");
266
267 self.repair();
268
269 self.get::<T>(id)
271 }
272 }
273 EntryType::ObjStream(obj_stram_gen_num, index) => {
274 let obj_stream_id = ObjectIdentifier::new(obj_stram_gen_num as i32, 0);
276
277 let stream = self.get::<Stream>(obj_stream_id)?;
278 let data = repr.data.get_with(obj_stream_id, self)?;
279 let object_stream =
280 ObjectStream::new(stream, data, ReaderContext::new(self, false))?;
281 object_stream.get(index)
282 }
283 }
284 }
285}
286
287pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
288 let mut finder = Reader::new(data);
289 let mut pos = finder.len().checked_sub(1)?;
290 finder.jump(pos);
291
292 let needle = b"startxref";
293
294 loop {
295 if finder.forward_tag(needle).is_some() {
296 finder.skip_white_spaces_and_comments();
297
298 let offset = finder.read_without_context::<i32>()?.try_into().ok()?;
299
300 return Some(offset);
301 }
302
303 pos = pos.checked_sub(1)?;
304 finder.jump(pos);
305 }
306}
307
/// Where an object listed in the xref map is stored.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum EntryType {
    /// The object is stored directly in the file, at this byte offset.
    Normal(usize),
    /// The object is stored inside an object stream. The first field is the
    /// object number of that stream, the second one the index of the object
    /// within the stream.
    ObjStream(u32, u32),
}
318
/// Maps object identifiers to the location of the corresponding object.
type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
320
/// The mutable part of an xref: the map itself plus its repair state.
#[derive(Debug)]
struct MapRepr {
    /// The current object map.
    xref_map: XrefMap,
    /// Whether `xref_map` was built by scanning the file (fallback/repair).
    repaired: bool,
}
327
/// Values extracted from the trailer dictionary when the xref is created.
#[derive(Debug, Copy, Clone)]
pub(crate) struct TrailerData {
    /// The `PAGES` reference of the trailer's `ROOT` dictionary.
    pub pages_ref: ObjectIdentifier,
    /// The version from the `VERSION` entry of the root dictionary, if present
    /// and recognized.
    pub version: Option<PdfVersion>,
}

impl TrailerData {
    /// Returns placeholder data: object identifier `(0, 0)` and no version.
    pub fn dummy() -> Self {
        Self {
            pages_ref: ObjectIdentifier::new(0, 0),
            version: None,
        }
    }
}
342
/// Backing state of a real (non-dummy) xref.
#[derive(Debug, Clone)]
struct SomeRepr {
    /// The file data the xref entries point into.
    data: Arc<Data>,
    /// The object map; behind a lock so that `repair` can replace it.
    map: Arc<RwLock<MapRepr>>,
    /// Data extracted from the trailer dictionary.
    trailer_data: TrailerData,
}
349
/// Internal representation of an [`XRef`].
#[derive(Debug, Clone)]
enum Inner {
    /// Placeholder without any data; lookups on it yield nothing.
    Dummy,
    /// A real xref backed by file data.
    Some(Arc<SomeRepr>),
}
357
/// One parsed entry of a classic cross-reference table.
#[derive(Debug)]
struct XRefEntry {
    /// Value of the first (10-digit) field of the entry.
    offset: usize,
    /// Value of the second (5-digit) field of the entry.
    gen_number: i32,
    /// Whether the entry is marked with `n` (in use).
    used: bool,
}

impl XRefEntry {
    /// Parses an entry of the form `oooooooooo ggggg n` followed by an
    /// end-of-line sequence.
    ///
    /// Bytes `0..10` are the offset, bytes `11..16` the generation number and
    /// byte `17` the in-use marker; the remaining bytes are not inspected.
    ///
    /// Returns `None` if `data` is too short, if a numeric field contains
    /// anything but ASCII digits, or if a value does not fit its target type.
    pub(crate) fn read(data: &[u8]) -> Option<XRefEntry> {
        /// Parses a run of ASCII digits; an empty run yields 0.
        #[inline]
        fn parse_decimal(digits: &[u8]) -> Option<u64> {
            digits.iter().try_fold(0u64, |acc, &byte| match byte {
                b'0'..=b'9' => acc.checked_mul(10)?.checked_add(u64::from(byte - b'0')),
                _ => None,
            })
        }

        // Ten decimal digits exceed `u32::MAX`, so accumulate in `u64` and
        // narrow explicitly instead of overflowing.
        let offset = usize::try_from(parse_decimal(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_decimal(data.get(11..16)?)?).ok()?;

        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
395
396fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
397 let mut reader = Reader::new(data);
398 reader.jump(pos);
399
400 let mut r2 = reader.clone();
401 if reader
402 .clone()
403 .read_without_context::<ObjectIdentifier>()
404 .is_some()
405 {
406 populate_from_xref_stream(data, &mut r2, xref_map)
407 } else {
408 populate_from_xref_table(data, &mut r2, xref_map)
409 }
410}
411
/// Header of a subsection in a classic cross-reference table.
pub(super) struct SubsectionHeader {
    /// Object number of the first entry in the subsection.
    pub(super) start: u32,
    /// Number of entries in the subsection.
    pub(super) num_entries: u32,
}
416
417impl Readable<'_> for SubsectionHeader {
418 fn read(r: &mut Reader<'_>, _: ReaderContext) -> Option<Self> {
419 r.skip_white_spaces();
420 let start = r.read_without_context::<u32>()?;
421 r.skip_white_spaces();
422 let num_entries = r.read_without_context::<u32>()?;
423 r.skip_white_spaces();
424
425 Some(Self { start, num_entries })
426 }
427}
428
429fn populate_from_xref_table<'a>(
431 data: &'a [u8],
432 reader: &mut Reader<'a>,
433 insert_map: &mut XrefMap,
434) -> Option<&'a [u8]> {
435 let trailer = {
436 let mut reader = reader.clone();
437 read_xref_table_trailer(&mut reader, ReaderContext::dummy())?
438 };
439
440 reader.skip_white_spaces();
441 reader.forward_tag(b"xref")?;
442 reader.skip_white_spaces();
443
444 let mut max_obj = 0;
445
446 if let Some(prev) = trailer.get::<i32>(PREV) {
447 populate_xref_impl(data, prev as usize, insert_map)?;
449 }
450
451 if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
454 populate_xref_impl(data, xref_stm as usize, insert_map)?;
455 }
456
457 while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
458 reader.skip_white_spaces();
459
460 let start = header.start;
461 let end = start + header.num_entries;
462
463 for obj_number in start..end {
464 max_obj = max(max_obj, obj_number);
465 let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
466 let entry = XRefEntry::read(bytes)?;
467
468 if entry.used {
471 insert_map.insert(
472 ObjectIdentifier::new(obj_number as i32, entry.gen_number),
473 EntryType::Normal(entry.offset),
474 );
475 }
476 }
477 }
478
479 Some(trailer.data())
480}
481
482fn populate_from_xref_stream<'a>(
483 data: &'a [u8],
484 reader: &mut Reader<'a>,
485 insert_map: &mut XrefMap,
486) -> Option<&'a [u8]> {
487 let stream = reader
488 .read_with_context::<IndirectObject<Stream>>(ReaderContext::dummy())?
489 .get();
490
491 if let Some(prev) = stream.dict().get::<i32>(PREV) {
492 let _ = populate_xref_impl(data, prev as usize, insert_map)?;
494 }
495
496 let size = stream.dict().get::<u32>(SIZE)?;
497
498 let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
499
500 if f2_len > size_of::<u64>() as u8 {
501 error!("xref offset length is larger than the allowed limit");
502
503 return None;
504 }
505
506 if f1_len != 1 {
508 warn!("first field in xref stream was longer than 1");
509 }
510
511 let xref_data = stream.decoded().ok()?;
512 let mut xref_reader = Reader::new(xref_data.as_ref());
513
514 if let Some(arr) = stream.dict().get::<Array>(INDEX) {
515 let iter = arr.iter::<(u32, u32)>();
516
517 for (start, num_elements) in iter {
518 xref_stream_subsection(
519 &mut xref_reader,
520 start,
521 num_elements,
522 f1_len,
523 f2_len,
524 f3_len,
525 insert_map,
526 )?;
527 }
528 } else {
529 xref_stream_subsection(
530 &mut xref_reader,
531 0,
532 size,
533 f1_len,
534 f2_len,
535 f3_len,
536 insert_map,
537 )?;
538 }
539
540 Some(stream.dict().data())
541}
542
543fn xref_stream_num(data: &[u8]) -> Option<u32> {
544 Some(match data.len() {
545 0 => return None,
546 1 => u8::from_be(data[0]) as u32,
547 2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
548 3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
549 4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
550 8 => {
551 if let Ok(num) = u32::try_from(u64::from_be_bytes(data[0..8].try_into().ok()?)) {
552 return Some(num);
553 } else {
554 warn!("xref stream number is too large");
555
556 return None;
557 }
558 }
559 n => {
560 warn!("invalid xref stream number {n}");
561
562 return None;
563 }
564 })
565}
566
567fn xref_stream_subsection<'a>(
568 xref_reader: &mut Reader<'a>,
569 start: u32,
570 num_elements: u32,
571 f1_len: u8,
572 f2_len: u8,
573 f3_len: u8,
574 insert_map: &mut XrefMap,
575) -> Option<()> {
576 for i in 0..num_elements {
577 let f_type = if f1_len == 0 {
578 1
579 } else {
580 xref_reader.read_bytes(1)?[0]
582 };
583
584 let obj_number = start + i;
585
586 match f_type {
587 0 => {
589 xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
590 }
591 1 => {
592 let offset = if f2_len > 0 {
593 let data = xref_reader.read_bytes(f2_len as usize)?;
594 xref_stream_num(data)?
595 } else {
596 0
597 };
598
599 let gen_number = if f3_len > 0 {
600 let data = xref_reader.read_bytes(f3_len as usize)?;
601 xref_stream_num(data)?
602 } else {
603 0
604 };
605
606 insert_map.insert(
607 ObjectIdentifier::new(obj_number as i32, gen_number as i32),
608 EntryType::Normal(offset as usize),
609 );
610 }
611 2 => {
612 let obj_stream_number = {
613 let data = xref_reader.read_bytes(f2_len as usize)?;
614 xref_stream_num(data)?
615 };
616 let gen_number = 0;
617 let index = if f3_len > 0 {
618 let data = xref_reader.read_bytes(f3_len as usize)?;
619 xref_stream_num(data)?
620 } else {
621 0
622 };
623
624 insert_map.insert(
625 ObjectIdentifier::new(obj_number as i32, gen_number),
626 EntryType::ObjStream(obj_stream_number, index),
627 );
628 }
629 _ => {
630 warn!("xref has unknown field type {f_type}");
631
632 return None;
633 }
634 }
635 }
636
637 Some(())
638}
639
640fn read_xref_table_trailer<'a>(
641 reader: &mut Reader<'a>,
642 ctx: ReaderContext<'a>,
643) -> Option<Dict<'a>> {
644 reader.skip_white_spaces();
645 reader.forward_tag(b"xref")?;
646 reader.skip_white_spaces();
647
648 while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
649 reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
650 }
651
652 reader.skip_white_spaces();
653 reader.forward_tag(b"trailer")?;
654 reader.skip_white_spaces();
655
656 reader.read_with_context::<Dict>(ctx)
657}
658
/// An object stream together with the index of the objects it contains.
struct ObjectStream<'a> {
    /// The stream content the offsets refer to, as supplied by the caller.
    data: &'a [u8],
    /// Context used when reading objects out of the stream.
    ctx: ReaderContext<'a>,
    /// For every contained object: its object number and its offset in `data`.
    offsets: Vec<(u32, usize)>,
}
664
665impl<'a> ObjectStream<'a> {
666 fn new(inner: Stream<'a>, data: &'a [u8], ctx: ReaderContext<'a>) -> Option<Self> {
667 let num_objects = inner.dict().get::<usize>(N)?;
668 let first_offset = inner.dict().get::<usize>(FIRST)?;
669
670 let mut r = Reader::new(data);
671
672 let mut offsets = vec![];
673
674 for _ in 0..num_objects {
675 r.skip_white_spaces_and_comments();
676 let obj_num = r.read_without_context::<u32>()?;
678 r.skip_white_spaces_and_comments();
679 let relative_offset = r.read_without_context::<usize>()?;
680 offsets.push((obj_num, first_offset + relative_offset));
681 }
682
683 Some(Self { data, ctx, offsets })
684 }
685
686 fn get<T>(&self, index: u32) -> Option<T>
687 where
688 T: ObjectLike<'a>,
689 {
690 let offset = self.offsets.get(index as usize)?.1;
691 let mut r = Reader::new(self.data);
692 r.jump(offset);
693 r.skip_white_spaces_and_comments();
694
695 r.read_with_context::<T>(self.ctx)
696 }
697}