1use crate::PdfData;
4use crate::data::Data;
5use crate::object::Array;
6use crate::object::Dict;
7use crate::object::Name;
8use crate::object::ObjectIdentifier;
9use crate::object::Stream;
10use crate::object::dict::keys::{
11 ENCRYPT, FIRST, INDEX, N, PAGES, PREV, ROOT, SIZE, TYPE, VERSION, W, XREF_STM,
12};
13use crate::object::indirect::IndirectObject;
14use crate::object::{Object, ObjectLike};
15use crate::pdf::PdfVersion;
16use crate::reader::{Readable, Reader, ReaderContext};
17use log::{error, warn};
18use rustc_hash::FxHashMap;
19use std::cmp::max;
20use std::iter;
21use std::ops::Deref;
22use std::sync::{Arc, RwLock};
23
/// Length in bytes of a single entry in a classic cross-reference table.
pub(crate) const XREF_ENTRY_LEN: usize = 20;

/// Reasons why building the cross-reference data can fail.
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefError {
    /// The xref position, the xref data or the trailer could not be read.
    Unknown,
    /// The trailer dictionary contains an encryption dictionary.
    Encrypted,
}
31
32pub(crate) fn root_xref(data: PdfData) -> Result<XRef, XRefError> {
34 let mut xref_map = FxHashMap::default();
35 let xref_pos = find_last_xref_pos(data.as_ref().as_ref()).ok_or(XRefError::Unknown)?;
36 let trailer = populate_xref_impl(data.as_ref().as_ref(), xref_pos, &mut xref_map)
37 .ok_or(XRefError::Unknown)?;
38
39 XRef::new(data.clone(), xref_map, trailer, false)
40}
41
42pub(crate) fn fallback(data: PdfData) -> Option<XRef> {
44 warn!("xref table was invalid, trying to manually build xref table");
45 let (xref_map, trailer_dict) = fallback_xref_map(data.as_ref().as_ref());
46
47 if let Some(trailer_dict_data) = trailer_dict {
48 warn!("rebuild xref table with {} entries", xref_map.len());
49
50 XRef::new(data.clone(), xref_map, trailer_dict_data, true).ok()
51 } else {
52 warn!("couldn't find trailer dictionary, failed to rebuild xref table");
53
54 None
55 }
56}
57
58fn fallback_xref_map(data: &[u8]) -> (XrefMap, Option<&[u8]>) {
59 let mut xref_map = FxHashMap::default();
60 let mut trailer_dict = None;
61
62 let mut r = Reader::new(data);
63
64 let mut dummy_ctx = ReaderContext::dummy();
65 let mut last_obj_num = None;
66
67 loop {
68 let cur_pos = r.offset();
69
70 let mut old_r = r.clone();
71
72 if let Some(obj_id) = r.read::<ObjectIdentifier>(dummy_ctx) {
73 xref_map.insert(obj_id, EntryType::Normal(cur_pos));
74 last_obj_num = Some(obj_id);
75 dummy_ctx.obj_number = Some(obj_id);
76 } else if let Some(dict) = r.read::<Dict>(dummy_ctx) {
77 if dict.contains_key(SIZE) && dict.contains_key(ROOT) {
78 trailer_dict = Some(dict.clone());
79 }
80
81 if let Some(stream) = old_r.read::<Stream>(dummy_ctx) {
82 if stream.dict().get::<Name>(TYPE).as_deref() == Some(b"ObjStm")
83 && let Some(data) = stream.decoded().ok()
84 && let Some(last_obj_num) = last_obj_num
85 {
86 if let Some(obj_stream) = ObjectStream::new(stream, &data, dummy_ctx) {
87 for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
88 let id = ObjectIdentifier::new(*obj_num as i32, 0);
89 xref_map.insert(
90 id,
91 EntryType::ObjStream(last_obj_num.obj_num as u32, idx as u32),
92 );
93 }
94 }
95 }
96 }
97 } else {
98 r.read_byte();
99 }
100
101 if r.at_end() {
102 break;
103 }
104 }
105
106 (xref_map, trailer_dict.map(|d| d.data()))
107}
108
109static DUMMY_XREF: &XRef = &XRef(Inner::Dummy);
110
111#[derive(Debug, Clone)]
113pub struct XRef(Inner);
114
115impl XRef {
116 fn new(
117 data: PdfData,
118 xref_map: XrefMap,
119 trailer_dict_data: &[u8],
120 repaired: bool,
121 ) -> Result<Self, XRefError> {
122 let trailer_data = TrailerData::dummy();
126
127 let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
128 data: Arc::new(Data::new(data)),
129 map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
130 trailer_data,
131 })));
132
133 let mut r = Reader::new(trailer_dict_data);
134 let trailer_dict = r
135 .read_with_context::<Dict>(ReaderContext::new(&xref, false))
136 .ok_or(XRefError::Unknown)?;
137
138 if trailer_dict.get::<Dict>(ENCRYPT).is_some() {
139 warn!("encrypted PDF files are not yet supported");
140
141 return Err(XRefError::Encrypted);
142 }
143
144 let root = trailer_dict.get::<Dict>(ROOT).ok_or(XRefError::Unknown)?;
145 let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
146 let version = root
147 .get::<Name>(VERSION)
148 .and_then(|v| PdfVersion::from_bytes(v.deref()));
149
150 let td = TrailerData {
151 pages_ref: pages_ref.into(),
152 version,
153 };
154
155 match &mut xref.0 {
156 Inner::Dummy => unreachable!(),
157 Inner::Some(r) => {
158 Arc::make_mut(r).trailer_data = td;
159 }
160 }
161
162 Ok(xref)
163 }
164
165 fn is_repaired(&self) -> bool {
166 match &self.0 {
167 Inner::Dummy => false,
168 Inner::Some(r) => {
169 let locked = r.map.read().unwrap();
170 locked.repaired
171 }
172 }
173 }
174
175 pub(crate) fn dummy() -> &'static XRef {
176 DUMMY_XREF
177 }
178
179 pub(crate) fn len(&self) -> usize {
180 match &self.0 {
181 Inner::Dummy => 0,
182 Inner::Some(r) => r.map.read().unwrap().xref_map.len(),
183 }
184 }
185
186 pub(crate) fn trailer_data(&self) -> &TrailerData {
187 match &self.0 {
188 Inner::Dummy => unreachable!(),
189 Inner::Some(r) => &r.trailer_data,
190 }
191 }
192
193 pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
194 match &self.0 {
195 Inner::Dummy => unimplemented!(),
196 Inner::Some(r) => iter::from_fn(move || {
197 let locked = r.map.read().unwrap();
198 let mut iter = locked.xref_map.keys();
199
200 iter.next().and_then(|k| self.get(*k))
201 }),
202 }
203 }
204
205 pub(crate) fn repair(&self) {
206 let Inner::Some(r) = &self.0 else {
207 unreachable!();
208 };
209
210 let mut locked = r.map.try_write().unwrap();
211 assert!(!locked.repaired);
212
213 let (xref_map, _) = fallback_xref_map(r.data.get());
214 locked.xref_map = xref_map;
215 locked.repaired = true;
216 }
217
218 #[allow(private_bounds)]
220 pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
221 where
222 T: ObjectLike<'a>,
223 {
224 let Inner::Some(repr) = &self.0 else {
225 return None;
226 };
227
228 let locked = repr.map.try_read().unwrap();
229
230 let mut r = Reader::new(repr.data.get());
231
232 let entry = *locked.xref_map.get(&id).or({
233 None
236 })?;
237 drop(locked);
238
239 match entry {
240 EntryType::Normal(offset) => {
241 r.jump(offset);
242
243 if let Some(object) =
244 r.read_with_context::<IndirectObject<T>>(ReaderContext::new(self, false))
245 {
246 if object.id() == &id {
247 return Some(object.get());
248 }
249 } else {
250 if r.skip_not_in_content_stream::<IndirectObject<Object>>()
253 .is_some()
254 {
255 return None;
256 }
257 };
258
259 if self.is_repaired() {
261 error!(
262 "attempt was made at repairing xref, but object {id:?} still couldn't be read"
263 );
264
265 None
266 } else {
267 warn!("broken xref, attempting to repair");
268
269 self.repair();
270
271 self.get::<T>(id)
273 }
274 }
275 EntryType::ObjStream(obj_stram_gen_num, index) => {
276 let obj_stream_id = ObjectIdentifier::new(obj_stram_gen_num as i32, 0);
278
279 let stream = self.get::<Stream>(obj_stream_id)?;
280 let data = repr.data.get_with(obj_stream_id, self)?;
281 let object_stream =
282 ObjectStream::new(stream, data, ReaderContext::new(self, false))?;
283 object_stream.get(index)
284 }
285 }
286 }
287}
288
289pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
290 let mut finder = Reader::new(data);
291 let mut pos = finder.len().checked_sub(1)?;
292 finder.jump(pos);
293
294 let needle = b"startxref";
295
296 loop {
297 if finder.forward_tag(needle).is_some() {
298 finder.skip_white_spaces_and_comments();
299
300 let offset = finder.read_without_context::<i32>()?.try_into().ok()?;
301
302 return Some(offset);
303 }
304
305 pos = pos.checked_sub(1)?;
306 finder.jump(pos);
307 }
308}
309
/// Describes where the object belonging to an xref entry can be found.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum EntryType {
    /// Offset into the file data at which the indirect object is read.
    Normal(usize),
    /// The object is stored in an object stream: the first value is the object
    /// number of that stream, the second the index of the object within it.
    ObjStream(u32, u32),
}

/// Maps object identifiers to the location of the corresponding object.
type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;

/// The lock-protected part of a populated xref.
#[derive(Debug)]
struct MapRepr {
    /// The current object map.
    xref_map: XrefMap,
    /// Whether `xref_map` was built by scanning the file for objects.
    repaired: bool,
}
329
330#[derive(Debug, Copy, Clone)]
331pub(crate) struct TrailerData {
332 pub pages_ref: ObjectIdentifier,
333 pub version: Option<PdfVersion>,
334}
335
336impl TrailerData {
337 pub fn dummy() -> Self {
338 Self {
339 pages_ref: ObjectIdentifier::new(0, 0),
340 version: None,
341 }
342 }
343}
344
/// Contents of a populated xref.
#[derive(Debug, Clone)]
struct SomeRepr {
    /// The file data that entries of the map point into.
    data: Arc<Data>,
    /// The object map; behind a lock because `XRef::repair` replaces it
    /// through a shared reference.
    map: Arc<RwLock<MapRepr>>,
    /// Values read from the trailer when the xref was constructed.
    trailer_data: TrailerData,
}

/// The two states an `XRef` can be in.
#[derive(Debug, Clone)]
enum Inner {
    /// Placeholder without any entries.
    Dummy,
    /// A populated xref.
    Some(Arc<SomeRepr>),
}
359
/// A single parsed entry of a classic cross-reference table.
#[derive(Debug)]
struct XRefEntry {
    /// The value of the 10-digit first field.
    offset: usize,
    /// The value of the 5-digit second field.
    gen_number: i32,
    /// Whether the entry is marked with the keyword `n`.
    used: bool,
}

impl XRefEntry {
    /// Parses an entry of the form `oooooooooo ggggg n`, where `o` and `g`
    /// are decimal digits, followed by an arbitrary end-of-line sequence.
    ///
    /// Returns `None` if `data` is too short, if one of the numeric fields
    /// contains a byte that is not an ASCII digit, or if a value does not fit
    /// into the target type.
    pub(crate) fn read(data: &[u8]) -> Option<XRefEntry> {
        /// Parses a run of ASCII digits without ever overflowing.
        fn parse_decimal(digits: &[u8]) -> Option<u64> {
            digits.iter().try_fold(0u64, |acc, &byte| match byte {
                b'0'..=b'9' => acc.checked_mul(10)?.checked_add(u64::from(byte - b'0')),
                _ => None,
            })
        }

        // A 10-digit offset can exceed `u32::MAX`, so parse into a wider type
        // and narrow afterwards. `get` keeps truncated input from panicking.
        let offset = usize::try_from(parse_decimal(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_decimal(data.get(11..16)?)?).ok()?;

        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
397
398fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
399 let mut reader = Reader::new(data);
400 reader.jump(pos);
401
402 let mut r2 = reader.clone();
403 if reader
404 .clone()
405 .read_without_context::<ObjectIdentifier>()
406 .is_some()
407 {
408 populate_from_xref_stream(data, &mut r2, xref_map)
409 } else {
410 populate_from_xref_table(data, &mut r2, xref_map)
411 }
412}
413
414pub(super) struct SubsectionHeader {
415 pub(super) start: u32,
416 pub(super) num_entries: u32,
417}
418
419impl Readable<'_> for SubsectionHeader {
420 fn read(r: &mut Reader<'_>, _: ReaderContext) -> Option<Self> {
421 r.skip_white_spaces();
422 let start = r.read_without_context::<u32>()?;
423 r.skip_white_spaces();
424 let num_entries = r.read_without_context::<u32>()?;
425 r.skip_white_spaces();
426
427 Some(Self { start, num_entries })
428 }
429}
430
431fn populate_from_xref_table<'a>(
433 data: &'a [u8],
434 reader: &mut Reader<'a>,
435 insert_map: &mut XrefMap,
436) -> Option<&'a [u8]> {
437 let trailer = {
438 let mut reader = reader.clone();
439 read_xref_table_trailer(&mut reader, ReaderContext::dummy())?
440 };
441
442 reader.skip_white_spaces();
443 reader.forward_tag(b"xref")?;
444 reader.skip_white_spaces();
445
446 let mut max_obj = 0;
447
448 if let Some(prev) = trailer.get::<i32>(PREV) {
449 populate_xref_impl(data, prev as usize, insert_map)?;
451 }
452
453 if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
456 populate_xref_impl(data, xref_stm as usize, insert_map)?;
457 }
458
459 while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
460 reader.skip_white_spaces();
461
462 let start = header.start;
463 let end = start + header.num_entries;
464
465 for obj_number in start..end {
466 max_obj = max(max_obj, obj_number);
467 let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
468 let entry = XRefEntry::read(bytes)?;
469
470 if entry.used {
473 insert_map.insert(
474 ObjectIdentifier::new(obj_number as i32, entry.gen_number),
475 EntryType::Normal(entry.offset),
476 );
477 }
478 }
479 }
480
481 Some(trailer.data())
482}
483
484fn populate_from_xref_stream<'a>(
485 data: &'a [u8],
486 reader: &mut Reader<'a>,
487 insert_map: &mut XrefMap,
488) -> Option<&'a [u8]> {
489 let stream = reader
490 .read_with_context::<IndirectObject<Stream>>(ReaderContext::dummy())?
491 .get();
492
493 if let Some(prev) = stream.dict().get::<i32>(PREV) {
494 let _ = populate_xref_impl(data, prev as usize, insert_map)?;
496 }
497
498 let size = stream.dict().get::<u32>(SIZE)?;
499
500 let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
501
502 if f2_len > size_of::<u64>() as u8 {
503 error!("xref offset length is larger than the allowed limit");
504
505 return None;
506 }
507
508 if f1_len != 1 {
510 warn!("first field in xref stream was longer than 1");
511 }
512
513 let xref_data = stream.decoded().ok()?;
514 let mut xref_reader = Reader::new(xref_data.as_ref());
515
516 if let Some(arr) = stream.dict().get::<Array>(INDEX) {
517 let iter = arr.iter::<(u32, u32)>();
518
519 for (start, num_elements) in iter {
520 xref_stream_subsection(
521 &mut xref_reader,
522 start,
523 num_elements,
524 f1_len,
525 f2_len,
526 f3_len,
527 insert_map,
528 )?;
529 }
530 } else {
531 xref_stream_subsection(
532 &mut xref_reader,
533 0,
534 size,
535 f1_len,
536 f2_len,
537 f3_len,
538 insert_map,
539 )?;
540 }
541
542 Some(stream.dict().data())
543}
544
545fn xref_stream_num(data: &[u8]) -> Option<u32> {
546 Some(match data.len() {
547 0 => return None,
548 1 => u8::from_be(data[0]) as u32,
549 2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
550 3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
551 4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
552 8 => {
553 if let Ok(num) = u32::try_from(u64::from_be_bytes(data[0..8].try_into().ok()?)) {
554 return Some(num);
555 } else {
556 warn!("xref stream number is too large");
557
558 return None;
559 }
560 }
561 n => {
562 warn!("invalid xref stream number {n}");
563
564 return None;
565 }
566 })
567}
568
569fn xref_stream_subsection<'a>(
570 xref_reader: &mut Reader<'a>,
571 start: u32,
572 num_elements: u32,
573 f1_len: u8,
574 f2_len: u8,
575 f3_len: u8,
576 insert_map: &mut XrefMap,
577) -> Option<()> {
578 for i in 0..num_elements {
579 let f_type = if f1_len == 0 {
580 1
581 } else {
582 xref_reader.read_bytes(1)?[0]
584 };
585
586 let obj_number = start + i;
587
588 match f_type {
589 0 => {
591 xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
592 }
593 1 => {
594 let offset = if f2_len > 0 {
595 let data = xref_reader.read_bytes(f2_len as usize)?;
596 xref_stream_num(data)?
597 } else {
598 0
599 };
600
601 let gen_number = if f3_len > 0 {
602 let data = xref_reader.read_bytes(f3_len as usize)?;
603 xref_stream_num(data)?
604 } else {
605 0
606 };
607
608 insert_map.insert(
609 ObjectIdentifier::new(obj_number as i32, gen_number as i32),
610 EntryType::Normal(offset as usize),
611 );
612 }
613 2 => {
614 let obj_stream_number = {
615 let data = xref_reader.read_bytes(f2_len as usize)?;
616 xref_stream_num(data)?
617 };
618 let gen_number = 0;
619 let index = if f3_len > 0 {
620 let data = xref_reader.read_bytes(f3_len as usize)?;
621 xref_stream_num(data)?
622 } else {
623 0
624 };
625
626 insert_map.insert(
627 ObjectIdentifier::new(obj_number as i32, gen_number),
628 EntryType::ObjStream(obj_stream_number, index),
629 );
630 }
631 _ => {
632 warn!("xref has unknown field type {f_type}");
633
634 return None;
635 }
636 }
637 }
638
639 Some(())
640}
641
642fn read_xref_table_trailer<'a>(
643 reader: &mut Reader<'a>,
644 ctx: ReaderContext<'a>,
645) -> Option<Dict<'a>> {
646 reader.skip_white_spaces();
647 reader.forward_tag(b"xref")?;
648 reader.skip_white_spaces();
649
650 while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
651 reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
652 }
653
654 reader.skip_white_spaces();
655 reader.forward_tag(b"trailer")?;
656 reader.skip_white_spaces();
657
658 reader.read_with_context::<Dict>(ctx)
659}
660
661struct ObjectStream<'a> {
662 data: &'a [u8],
663 ctx: ReaderContext<'a>,
664 offsets: Vec<(u32, usize)>,
665}
666
667impl<'a> ObjectStream<'a> {
668 fn new(inner: Stream<'a>, data: &'a [u8], ctx: ReaderContext<'a>) -> Option<Self> {
669 let num_objects = inner.dict().get::<usize>(N)?;
670 let first_offset = inner.dict().get::<usize>(FIRST)?;
671
672 let mut r = Reader::new(data);
673
674 let mut offsets = vec![];
675
676 for _ in 0..num_objects {
677 r.skip_white_spaces_and_comments();
678 let obj_num = r.read_without_context::<u32>()?;
680 r.skip_white_spaces_and_comments();
681 let relative_offset = r.read_without_context::<usize>()?;
682 offsets.push((obj_num, first_offset + relative_offset));
683 }
684
685 Some(Self { data, ctx, offsets })
686 }
687
688 fn get<T>(&self, index: u32) -> Option<T>
689 where
690 T: ObjectLike<'a>,
691 {
692 let offset = self.offsets.get(index as usize)?.1;
693 let mut r = Reader::new(self.data);
694 r.jump(offset);
695 r.skip_white_spaces_and_comments();
696
697 r.read_with_context::<T>(self.ctx)
698 }
699}