1use crate::crypto::{DecryptionError, DecryptionTarget, Decryptor, get};
4use crate::data::Data;
5use crate::object::Dict;
6use crate::object::Name;
7use crate::object::ObjectIdentifier;
8use crate::object::Stream;
9use crate::object::dict::keys::{
10 ENCRYPT, FIRST, ID, INDEX, N, OCPROPERTIES, PAGES, PREV, ROOT, SIZE, TYPE, VERSION, W, XREF_STM,
11};
12use crate::object::indirect::IndirectObject;
13use crate::object::{Array, MaybeRef};
14use crate::object::{Object, ObjectLike};
15use crate::pdf::PdfVersion;
16use crate::reader::{Readable, Reader, ReaderContext};
17use crate::{PdfData, object};
18use log::{error, warn};
19use rustc_hash::FxHashMap;
20use std::cmp::max;
21use std::iter;
22use std::ops::Deref;
23use std::sync::{Arc, RwLock};
24
/// Number of bytes read for each entry of a classic cross-reference table.
pub(crate) const XREF_ENTRY_LEN: usize = 20;
26
/// Errors that can occur while building an [`XRef`].
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefError {
    /// The xref position, the xref data or the trailer could not be found or parsed.
    Unknown,
    /// The trailer references an encryption dictionary, but no decryptor could be set up.
    Encryption(DecryptionError),
}
32
33pub(crate) fn root_xref(data: PdfData) -> Result<XRef, XRefError> {
35 let mut xref_map = FxHashMap::default();
36 let xref_pos = find_last_xref_pos(data.as_ref().as_ref()).ok_or(XRefError::Unknown)?;
37 let trailer = populate_xref_impl(data.as_ref().as_ref(), xref_pos, &mut xref_map)
38 .ok_or(XRefError::Unknown)?;
39
40 XRef::new(data.clone(), xref_map, trailer, false)
41}
42
43pub(crate) fn fallback(data: PdfData) -> Option<XRef> {
45 warn!("xref table was invalid, trying to manually build xref table");
46 let (xref_map, trailer_dict) = fallback_xref_map(data.as_ref().as_ref());
47
48 if let Some(trailer_dict_data) = trailer_dict {
49 warn!("rebuild xref table with {} entries", xref_map.len());
50
51 XRef::new(data.clone(), xref_map, trailer_dict_data, true).ok()
52 } else {
53 warn!("couldn't find trailer dictionary, failed to rebuild xref table");
54
55 None
56 }
57}
58
/// Scans `data` from start to end and reconstructs an xref map from what it finds.
///
/// Returns the rebuilt map together with the raw bytes of the selected trailer
/// dictionary, or `None` in the second slot when no candidate qualified.
fn fallback_xref_map(data: &[u8]) -> (XrefMap, Option<&[u8]>) {
    let mut xref_map = FxHashMap::default();
    // Every dictionary containing a `ROOT` key is collected as a trailer candidate.
    let mut trailer_dicts = vec![];

    let mut r = Reader::new(data);

    let mut dummy_ctx = ReaderContext::dummy();
    // Identifier of the most recently recorded object; used below as the container
    // id for the entries of an object stream.
    let mut last_obj_num = None;

    loop {
        let cur_pos = r.offset();

        // Snapshot of the reader at `cur_pos`, so the same bytes can be re-read as a stream.
        let mut old_r = r.clone();

        if let Some(obj_id) = r.read::<ObjectIdentifier>(&dummy_ctx) {
            // Probe on a clone: the identifier is only recorded if an object can be
            // skipped right after it, and `r` itself stays just past the identifier.
            let mut cloned = r.clone();
            cloned.skip_white_spaces_and_comments();
            if cloned.skip::<Object>(false).is_some() {
                xref_map.insert(obj_id, EntryType::Normal(cur_pos));
                last_obj_num = Some(obj_id);
                dummy_ctx.obj_number = Some(obj_id);
            }
        } else if let Some(dict) = r.read::<Dict>(&dummy_ctx) {
            if dict.contains_key(ROOT) {
                trailer_dicts.push(dict);
            }

            // If the dictionary belongs to an `ObjStm` stream, register every object
            // listed in its header as living inside the last recorded object.
            if let Some(stream) = old_r.read::<Stream>(&dummy_ctx)
                && stream.dict().get::<Name>(TYPE).as_deref() == Some(b"ObjStm")
                && let Some(data) = stream.decoded().ok()
                && let Some(last_obj_num) = last_obj_num
                && let Some(obj_stream) = ObjectStream::new(stream, &data, &dummy_ctx)
            {
                for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
                    let id = ObjectIdentifier::new(*obj_num as i32, 0);
                    xref_map.insert(
                        id,
                        EntryType::ObjStream(last_obj_num.obj_num as u32, idx as u32),
                    );
                }
            }
        } else {
            // Nothing recognizable here; advance one byte and try again.
            r.read_byte();
        }

        if r.at_end() {
            break;
        }
    }

    let mut trailer_dict = None;

    // Keep the last candidate whose root dictionary contains a `PAGES` key; later
    // matches overwrite earlier ones.
    for dict in trailer_dicts {
        if let Some(root_id) = dict.get_raw::<Dict>(ROOT) {
            let check = |dict: &Dict| -> bool { dict.contains_key(PAGES) };

            match root_id {
                MaybeRef::Ref(r) => {
                    // Only roots recorded as plain objects in the rebuilt map are resolved.
                    if let Some(EntryType::Normal(offset)) = xref_map.get(&r.into()) {
                        let mut reader = Reader::new(&data[*offset..]);

                        if let Some(obj) =
                            reader.read_with_context::<IndirectObject<Dict>>(&dummy_ctx)
                            && check(&obj.clone().get())
                        {
                            trailer_dict = Some(dict);
                        }
                    }
                }
                MaybeRef::NotRef(d) => {
                    if check(&d) {
                        trailer_dict = Some(dict);
                    }
                }
            }
        }
    }

    (xref_map, trailer_dict.map(|d| d.data()))
}
141
/// Shared placeholder [`XRef`] in the `Inner::Dummy` state; handed out by [`XRef::dummy`].
static DUMMY_XREF: &XRef = &XRef(Inner::Dummy);
143
144#[derive(Debug, Clone)]
146pub struct XRef(Inner);
147
148impl XRef {
149 fn new(
150 data: PdfData,
151 xref_map: XrefMap,
152 trailer_dict_data: &[u8],
153 repaired: bool,
154 ) -> Result<Self, XRefError> {
155 let trailer_data = TrailerData::dummy();
159
160 let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
161 data: Arc::new(Data::new(data)),
162 map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
163 decryptor: Arc::new(Decryptor::None),
164 has_ocgs: false,
165 trailer_data,
166 })));
167
168 let mut r = Reader::new(trailer_dict_data);
169 let trailer_dict = r
170 .read_with_context::<Dict>(&ReaderContext::new(&xref, false))
171 .ok_or(XRefError::Unknown)?;
172
173 let decryptor = if let Some(encryption_dict) = trailer_dict.get::<Dict>(ENCRYPT) {
174 let Some(id) = trailer_dict
175 .get::<Array>(ID)
176 .and_then(|a| a.flex_iter().next::<object::String>())
177 else {
178 return Err(XRefError::Encryption(DecryptionError::MissingIDEntry));
179 };
180
181 get(&encryption_dict, id.get().as_ref()).map_err(XRefError::Encryption)?
182 } else {
183 Decryptor::None
184 };
185
186 let root_ref = trailer_dict.get_ref(ROOT).ok_or(XRefError::Unknown)?;
187 let root = trailer_dict.get::<Dict>(ROOT).ok_or(XRefError::Unknown)?;
188 let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
189 let has_ocgs = root.get::<Dict>(OCPROPERTIES).is_some();
190 let version = root
191 .get::<Name>(VERSION)
192 .and_then(|v| PdfVersion::from_bytes(v.deref()));
193
194 let td = TrailerData {
195 pages_ref: pages_ref.into(),
196 root_ref: root_ref.into(),
197 version,
198 };
199
200 match &mut xref.0 {
201 Inner::Dummy => unreachable!(),
202 Inner::Some(r) => {
203 let mutable = Arc::make_mut(r);
204 mutable.trailer_data = td;
205 mutable.decryptor = Arc::new(decryptor);
206 mutable.has_ocgs = has_ocgs;
207 }
208 }
209
210 Ok(xref)
211 }
212
213 fn is_repaired(&self) -> bool {
214 match &self.0 {
215 Inner::Dummy => false,
216 Inner::Some(r) => {
217 let locked = r.map.read().unwrap();
218 locked.repaired
219 }
220 }
221 }
222
223 pub(crate) fn dummy() -> &'static XRef {
224 DUMMY_XREF
225 }
226
227 pub(crate) fn len(&self) -> usize {
228 match &self.0 {
229 Inner::Dummy => 0,
230 Inner::Some(r) => r.map.read().unwrap().xref_map.len(),
231 }
232 }
233
234 pub(crate) fn trailer_data(&self) -> &TrailerData {
235 match &self.0 {
236 Inner::Dummy => unreachable!(),
237 Inner::Some(r) => &r.trailer_data,
238 }
239 }
240
241 pub fn root_id(&self) -> ObjectIdentifier {
243 self.trailer_data().root_ref
244 }
245
246 pub fn has_optional_content_groups(&self) -> bool {
248 match &self.0 {
249 Inner::Dummy => false,
250 Inner::Some(r) => r.has_ocgs,
251 }
252 }
253
254 pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
255 match &self.0 {
256 Inner::Dummy => unimplemented!(),
257 Inner::Some(r) => iter::from_fn(move || {
258 let locked = r.map.read().unwrap();
259 let mut iter = locked.xref_map.keys();
260
261 iter.next()
262 .and_then(|k| self.get_with(*k, &ReaderContext::new(self, false)))
263 }),
264 }
265 }
266
267 pub(crate) fn repair(&self) {
268 let Inner::Some(r) = &self.0 else {
269 unreachable!();
270 };
271
272 let mut locked = r.map.try_write().unwrap();
273 assert!(!locked.repaired);
274
275 let (xref_map, _) = fallback_xref_map(r.data.get());
276 locked.xref_map = xref_map;
277 locked.repaired = true;
278 }
279
280 #[inline]
281 pub(crate) fn needs_decryption(&self, ctx: &ReaderContext) -> bool {
282 match &self.0 {
283 Inner::Dummy => false,
284 Inner::Some(r) => {
285 if matches!(r.decryptor.as_ref(), Decryptor::None) {
286 false
287 } else {
288 !ctx.in_content_stream
289 }
290 }
291 }
292 }
293
294 #[inline]
295 pub(crate) fn decrypt(
296 &self,
297 id: ObjectIdentifier,
298 data: &[u8],
299 target: DecryptionTarget,
300 ) -> Option<Vec<u8>> {
301 match &self.0 {
302 Inner::Dummy => Some(data.to_vec()),
303 Inner::Some(r) => r.decryptor.decrypt(id, data, target),
304 }
305 }
306
307 #[allow(private_bounds)]
309 pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
310 where
311 T: ObjectLike<'a>,
312 {
313 let ctx = ReaderContext::new(self, false);
314 self.get_with(id, &ctx)
315 }
316
317 #[allow(private_bounds)]
319 pub(crate) fn get_with<'a, T>(
320 &'a self,
321 id: ObjectIdentifier,
322 ctx: &ReaderContext<'a>,
323 ) -> Option<T>
324 where
325 T: ObjectLike<'a>,
326 {
327 let Inner::Some(repr) = &self.0 else {
328 return None;
329 };
330
331 let locked = repr.map.try_read().unwrap();
332
333 let mut r = Reader::new(repr.data.get());
334
335 let entry = *locked.xref_map.get(&id).or({
336 None
339 })?;
340 drop(locked);
341
342 let mut ctx = ctx.clone();
343 ctx.in_content_stream = false;
344
345 match entry {
346 EntryType::Normal(offset) => {
347 r.jump(offset);
348
349 if let Some(object) = r.read_with_context::<IndirectObject<T>>(&ctx) {
350 if object.id() == &id {
351 return Some(object.get());
352 }
353 } else {
354 if r.skip_not_in_content_stream::<IndirectObject<Object>>()
357 .is_some()
358 {
359 return None;
360 }
361 };
362
363 if self.is_repaired() {
365 error!(
366 "attempt was made at repairing xref, but object {id:?} still couldn't be read"
367 );
368
369 None
370 } else {
371 warn!("broken xref, attempting to repair");
372
373 self.repair();
374
375 self.get_with::<T>(id, &ctx)
377 }
378 }
379 EntryType::ObjStream(obj_stram_gen_num, index) => {
380 let obj_stream_id = ObjectIdentifier::new(obj_stram_gen_num as i32, 0);
382
383 let stream = self.get_with::<Stream>(obj_stream_id, &ctx)?;
384 let data = repr.data.get_with(obj_stream_id, &ctx)?;
385 let object_stream = ObjectStream::new(stream, data, &ctx)?;
386 object_stream.get(index)
387 }
388 }
389 }
390}
391
392pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
393 let mut finder = Reader::new(data);
394 let mut pos = finder.len().checked_sub(1)?;
395 finder.jump(pos);
396
397 let needle = b"startxref";
398
399 loop {
400 if finder.forward_tag(needle).is_some() {
401 finder.skip_white_spaces_and_comments();
402
403 let offset = finder.read_without_context::<i32>()?.try_into().ok()?;
404
405 return Some(offset);
406 }
407
408 pos = pos.checked_sub(1)?;
409 finder.jump(pos);
410 }
411}
412
/// Where an object referenced by the xref map can be found.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum EntryType {
    /// The object is stored directly in the file, at the given byte offset.
    Normal(usize),
    /// The object is stored inside an object stream: the object number of that
    /// stream, followed by the index of the object within the stream.
    ObjStream(u32, u32),
}
423
/// Maps an object identifier to the location of that object.
type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
425
/// The lock-protected, replaceable part of an [`XRef`].
#[derive(Debug)]
struct MapRepr {
    /// The current object lookup table.
    xref_map: XrefMap,
    /// Whether `xref_map` was produced by the fallback scan.
    repaired: bool,
}
432
433#[derive(Debug, Copy, Clone)]
434pub(crate) struct TrailerData {
435 pub pages_ref: ObjectIdentifier,
436 pub root_ref: ObjectIdentifier,
437 pub version: Option<PdfVersion>,
438}
439
440impl TrailerData {
441 pub fn dummy() -> Self {
442 Self {
443 pages_ref: ObjectIdentifier::new(0, 0),
444 root_ref: ObjectIdentifier::new(0, 0),
445 version: None,
446 }
447 }
448}
449
/// The state of a non-dummy [`XRef`].
#[derive(Debug, Clone)]
struct SomeRepr {
    /// The document data the xref entries point into.
    data: Arc<Data>,
    /// The object lookup table; behind a lock because a repair may replace it.
    map: Arc<RwLock<MapRepr>>,
    /// Decryptor built from the trailer (`Decryptor::None` without an encryption dictionary).
    decryptor: Arc<Decryptor>,
    /// Whether the root dictionary has an `OCProperties` dictionary.
    has_ocgs: bool,
    /// Values extracted from the trailer.
    trailer_data: TrailerData,
}
458
/// Internal representation of an [`XRef`].
#[derive(Debug, Clone)]
enum Inner {
    /// Placeholder without any document data; lookups on it return nothing.
    Dummy,
    /// An xref backed by document data.
    Some(Arc<SomeRepr>),
}
466
/// A single parsed entry of a classic cross-reference table.
#[derive(Debug)]
struct XRefEntry {
    /// Value of the first (10-digit) field.
    offset: usize,
    /// Value of the second (5-digit) field.
    gen_number: i32,
    /// Whether the entry is marked with `n`.
    used: bool,
}

impl XRefEntry {
    /// Parses one xref table entry of the form `oooooooooo ggggg k`, where `o` and
    /// `g` are decimal digits and `k` is the entry keyword.
    ///
    /// Returns `None` if `data` is too short, if a numeric field contains a
    /// non-digit, or if a value does not fit its target type. Any keyword other
    /// than `n` yields `used == false`.
    pub(crate) fn read(data: &[u8]) -> Option<XRefEntry> {
        /// Parses a run of ASCII digits, rejecting anything else and any overflow.
        fn parse_decimal(digits: &[u8]) -> Option<u64> {
            digits.iter().try_fold(0u64, |acc, &byte| match byte {
                b'0'..=b'9' => acc.checked_mul(10)?.checked_add(u64::from(byte - b'0')),
                _ => None,
            })
        }

        // A 10-digit offset can exceed `u32::MAX`, so accumulate in `u64` and
        // narrow with a checked conversion afterwards.
        let offset = usize::try_from(parse_decimal(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_decimal(data.get(11..16)?)?).ok()?;

        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
504
505fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
506 let mut reader = Reader::new(data);
507 reader.jump(pos);
508 reader.skip_white_spaces_and_comments();
510
511 let mut r2 = reader.clone();
512 if reader
513 .clone()
514 .read_without_context::<ObjectIdentifier>()
515 .is_some()
516 {
517 populate_from_xref_stream(data, &mut r2, xref_map)
518 } else {
519 populate_from_xref_table(data, &mut r2, xref_map)
520 }
521}
522
523pub(super) struct SubsectionHeader {
524 pub(super) start: u32,
525 pub(super) num_entries: u32,
526}
527
528impl Readable<'_> for SubsectionHeader {
529 fn read(r: &mut Reader<'_>, _: &ReaderContext) -> Option<Self> {
530 r.skip_white_spaces();
531 let start = r.read_without_context::<u32>()?;
532 r.skip_white_spaces();
533 let num_entries = r.read_without_context::<u32>()?;
534 r.skip_white_spaces();
535
536 Some(Self { start, num_entries })
537 }
538}
539
540fn populate_from_xref_table<'a>(
542 data: &'a [u8],
543 reader: &mut Reader<'a>,
544 insert_map: &mut XrefMap,
545) -> Option<&'a [u8]> {
546 let trailer = {
547 let mut reader = reader.clone();
548 read_xref_table_trailer(&mut reader, &ReaderContext::dummy())?
549 };
550
551 reader.skip_white_spaces();
552 reader.forward_tag(b"xref")?;
553 reader.skip_white_spaces();
554
555 let mut max_obj = 0;
556
557 if let Some(prev) = trailer.get::<i32>(PREV) {
558 populate_xref_impl(data, prev as usize, insert_map)?;
560 }
561
562 if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
565 populate_xref_impl(data, xref_stm as usize, insert_map)?;
566 }
567
568 while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
569 reader.skip_white_spaces();
570
571 let start = header.start;
572 let end = start + header.num_entries;
573
574 for obj_number in start..end {
575 max_obj = max(max_obj, obj_number);
576 let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
577 let entry = XRefEntry::read(bytes)?;
578
579 if entry.used {
582 insert_map.insert(
583 ObjectIdentifier::new(obj_number as i32, entry.gen_number),
584 EntryType::Normal(entry.offset),
585 );
586 }
587 }
588 }
589
590 Some(trailer.data())
591}
592
593fn populate_from_xref_stream<'a>(
594 data: &'a [u8],
595 reader: &mut Reader<'a>,
596 insert_map: &mut XrefMap,
597) -> Option<&'a [u8]> {
598 let stream = reader
599 .read_with_context::<IndirectObject<Stream>>(&ReaderContext::dummy())?
600 .get();
601
602 if let Some(prev) = stream.dict().get::<i32>(PREV) {
603 let _ = populate_xref_impl(data, prev as usize, insert_map)?;
605 }
606
607 let size = stream.dict().get::<u32>(SIZE)?;
608
609 let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
610
611 if f2_len > size_of::<u64>() as u8 {
612 error!("xref offset length is larger than the allowed limit");
613
614 return None;
615 }
616
617 if f1_len != 1 {
619 warn!("first field in xref stream was longer than 1");
620 }
621
622 let xref_data = stream.decoded().ok()?;
623 let mut xref_reader = Reader::new(xref_data.as_ref());
624
625 if let Some(arr) = stream.dict().get::<Array>(INDEX) {
626 let iter = arr.iter::<(u32, u32)>();
627
628 for (start, num_elements) in iter {
629 xref_stream_subsection(
630 &mut xref_reader,
631 start,
632 num_elements,
633 f1_len,
634 f2_len,
635 f3_len,
636 insert_map,
637 )?;
638 }
639 } else {
640 xref_stream_subsection(
641 &mut xref_reader,
642 0,
643 size,
644 f1_len,
645 f2_len,
646 f3_len,
647 insert_map,
648 )?;
649 }
650
651 Some(stream.dict().data())
652}
653
654fn xref_stream_num(data: &[u8]) -> Option<u32> {
655 Some(match data.len() {
656 0 => return None,
657 1 => u8::from_be(data[0]) as u32,
658 2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
659 3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
660 4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
661 8 => {
662 if let Ok(num) = u32::try_from(u64::from_be_bytes(data[0..8].try_into().ok()?)) {
663 return Some(num);
664 } else {
665 warn!("xref stream number is too large");
666
667 return None;
668 }
669 }
670 n => {
671 warn!("invalid xref stream number {n}");
672
673 return None;
674 }
675 })
676}
677
678fn xref_stream_subsection<'a>(
679 xref_reader: &mut Reader<'a>,
680 start: u32,
681 num_elements: u32,
682 f1_len: u8,
683 f2_len: u8,
684 f3_len: u8,
685 insert_map: &mut XrefMap,
686) -> Option<()> {
687 for i in 0..num_elements {
688 let f_type = if f1_len == 0 {
689 1
690 } else {
691 xref_reader.read_bytes(1)?[0]
693 };
694
695 let obj_number = start + i;
696
697 match f_type {
698 0 => {
700 xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
701 }
702 1 => {
703 let offset = if f2_len > 0 {
704 let data = xref_reader.read_bytes(f2_len as usize)?;
705 xref_stream_num(data)?
706 } else {
707 0
708 };
709
710 let gen_number = if f3_len > 0 {
711 let data = xref_reader.read_bytes(f3_len as usize)?;
712 xref_stream_num(data)?
713 } else {
714 0
715 };
716
717 insert_map.insert(
718 ObjectIdentifier::new(obj_number as i32, gen_number as i32),
719 EntryType::Normal(offset as usize),
720 );
721 }
722 2 => {
723 let obj_stream_number = {
724 let data = xref_reader.read_bytes(f2_len as usize)?;
725 xref_stream_num(data)?
726 };
727 let gen_number = 0;
728 let index = if f3_len > 0 {
729 let data = xref_reader.read_bytes(f3_len as usize)?;
730 xref_stream_num(data)?
731 } else {
732 0
733 };
734
735 insert_map.insert(
736 ObjectIdentifier::new(obj_number as i32, gen_number),
737 EntryType::ObjStream(obj_stream_number, index),
738 );
739 }
740 _ => {
741 warn!("xref has unknown field type {f_type}");
742
743 return None;
744 }
745 }
746 }
747
748 Some(())
749}
750
751fn read_xref_table_trailer<'a>(
752 reader: &mut Reader<'a>,
753 ctx: &ReaderContext<'a>,
754) -> Option<Dict<'a>> {
755 reader.skip_white_spaces();
756 reader.forward_tag(b"xref")?;
757 reader.skip_white_spaces();
758
759 while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
760 reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
761 }
762
763 reader.skip_white_spaces();
764 reader.forward_tag(b"trailer")?;
765 reader.skip_white_spaces();
766
767 reader.read_with_context::<Dict>(ctx)
768}
769
770struct ObjectStream<'a> {
771 data: &'a [u8],
772 ctx: ReaderContext<'a>,
773 offsets: Vec<(u32, usize)>,
774}
775
776impl<'a> ObjectStream<'a> {
777 fn new(inner: Stream<'a>, data: &'a [u8], ctx: &ReaderContext<'a>) -> Option<Self> {
778 let num_objects = inner.dict().get::<usize>(N)?;
779 let first_offset = inner.dict().get::<usize>(FIRST)?;
780
781 let mut r = Reader::new(data);
782
783 let mut offsets = vec![];
784
785 for _ in 0..num_objects {
786 r.skip_white_spaces_and_comments();
787 let obj_num = r.read_without_context::<u32>()?;
789 r.skip_white_spaces_and_comments();
790 let relative_offset = r.read_without_context::<usize>()?;
791 offsets.push((obj_num, first_offset + relative_offset));
792 }
793
794 Some(Self {
795 data,
796 ctx: ctx.clone(),
797 offsets,
798 })
799 }
800
801 fn get<T>(&self, index: u32) -> Option<T>
802 where
803 T: ObjectLike<'a>,
804 {
805 let offset = self.offsets.get(index as usize)?.1;
806 let mut r = Reader::new(self.data);
807 r.jump(offset);
808 r.skip_white_spaces_and_comments();
809
810 r.read_with_context::<T>(&self.ctx)
811 }
812}