1use crate::crypto::{DecryptionError, DecryptionTarget, Decryptor, get};
4use crate::data::Data;
5use crate::metadata::Metadata;
6use crate::object::Name;
7use crate::object::ObjectIdentifier;
8use crate::object::Stream;
9use crate::object::dict::keys::{
10 AUTHOR, CREATION_DATE, CREATOR, ENCRYPT, FIRST, ID, INDEX, INFO, KEYWORDS, MOD_DATE, N,
11 OCPROPERTIES, PAGES, PREV, PRODUCER, ROOT, SIZE, SUBJECT, TITLE, TYPE, VERSION, W, XREF_STM,
12};
13use crate::object::dict::probe_dict;
14use crate::object::indirect::IndirectObject;
15use crate::object::{Array, MaybeRef};
16use crate::object::{DateTime, Dict};
17use crate::object::{Object, ObjectLike};
18use crate::pdf::PdfVersion;
19use crate::reader::Reader;
20use crate::reader::{Readable, ReaderContext, ReaderExt};
21use crate::sync::{Arc, FxHashMap, RwLock, RwLockExt};
22use crate::trivia::is_white_space_character;
23use crate::util::findr_needle;
24use crate::{PdfData, object};
25use alloc::collections::BTreeSet;
26use alloc::vec;
27use alloc::vec::Vec;
28use core::cmp::max;
29use core::iter;
30use core::ops::Deref;
31
/// Number of bytes read for each entry of a classic (non-stream) cross-reference table.
pub(crate) const XREF_ENTRY_LEN: usize = 20;
33
/// Errors that can occur while building an [`XRef`].
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefError {
    /// The cross-reference data or the trailer could not be located or parsed.
    Unknown,
    /// The trailer declares encryption, but setting up decryption failed.
    Encryption(DecryptionError),
}
39
40pub(crate) fn root_xref(data: PdfData, password: &[u8]) -> Result<XRef, XRefError> {
42 let mut xref_map = FxHashMap::default();
43 let xref_pos = find_last_xref_pos(data.as_ref()).ok_or(XRefError::Unknown)?;
44 let trailer =
45 populate_xref_impl(data.as_ref(), xref_pos, &mut xref_map).ok_or(XRefError::Unknown)?;
46
47 XRef::new(
48 data.clone(),
49 xref_map,
50 XRefInput::TrailerDictData(trailer),
51 false,
52 password,
53 )
54}
55
56pub(crate) fn fallback(data: PdfData, password: &[u8]) -> Option<XRef> {
58 warn!("xref table was invalid, trying to manually build xref table");
59 let (xref_map, xref_input) = fallback_xref_map(&data, password);
60
61 if let Some(xref_input) = xref_input {
62 warn!("rebuild xref table with {} entries", xref_map.len());
63
64 XRef::new(data.clone(), xref_map, xref_input, true, password).ok()
65 } else {
66 warn!("couldn't find trailer dictionary, failed to rebuild xref table");
67
68 None
69 }
70}
71
/// Rebuilds the xref map by scanning `data`.
///
/// Thin wrapper around `fallback_xref_map_inner` that starts from a dummy reader context and
/// permits the single encrypted re-scan.
fn fallback_xref_map<'a>(data: &'a PdfData, password: &[u8]) -> (XrefMap, Option<XRefInput<'a>>) {
    fallback_xref_map_inner(data, ReaderContext::dummy(), true, password)
}
75
/// Rebuilds an xref map by walking `data` from start to end.
///
/// Every position where an object identifier followed by a skippable object is found is
/// recorded as a `Normal` entry. Objects listed in `ObjStm` streams are recorded as
/// `ObjStream` entries unless a `Normal` entry already exists for them.
///
/// The second tuple element describes where trailer information can be taken from:
/// - a dictionary containing `ROOT` whose root resolves to a dictionary with a `PAGES` key
///   (if several qualify, the last one encountered wins), or otherwise
/// - the id of the object recorded most recently before a dictionary of type `Catalog`, or
/// - `None` if neither was found.
///
/// `dummy_ctx` is the reader context used while scanning; its object number is updated as
/// objects are discovered. When `recurse` is true and the chosen trailer contains `ENCRYPT`,
/// an `XRef` is built from the first pass and the scan is repeated once with a context backed
/// by it; the map from that second pass replaces the first one.
fn fallback_xref_map_inner<'a>(
    data: &'a PdfData,
    mut dummy_ctx: ReaderContext<'a>,
    recurse: bool,
    password: &[u8],
) -> (XrefMap, Option<XRefInput<'a>>) {
    let mut xref_map = FxHashMap::default();
    // Dictionaries containing a ROOT key, in the order they appear in the file.
    let mut trailer_dicts = vec![];
    // Fallback for files without a usable trailer: id of the catalog object.
    let mut root_ref = None;

    let mut r = Reader::new(data.as_ref());

    // Identifier of the most recently recorded object.
    let mut last_obj_num = None;

    loop {
        let cur_pos = r.offset();

        // Snapshot of the reader at the start of this iteration, used further down to try
        // reading a full stream from this position.
        let mut old_r = r.clone();

        if r.peek_byte().is_some_and(|b: u8| b.is_ascii_digit()) {
            if let Some(obj_id) = r.read::<ObjectIdentifier>(&dummy_ctx) {
                // Only record the identifier if an object can be skipped right after it.
                // The check runs on a clone, so `r` stays just behind the identifier.
                let mut cloned = r.clone();
                cloned.skip_white_spaces_and_comments();
                if cloned.skip::<Object<'_>>(false).is_some() {
                    xref_map.insert(obj_id, EntryType::Normal(cur_pos));
                    last_obj_num = Some(obj_id);
                    dummy_ctx.set_obj_number(obj_id);
                }
            } else {
                // Not an object identifier: move on to the next white space.
                r.forward_while(|b| !is_white_space_character(b));
            }
        } else {
            let mut probe_reader = r.clone();
            if r.peek_bytes(2).is_some_and(|b| b == b"<<")
                && let Some(probe) =
                    { probe_dict(&mut probe_reader, &dummy_ctx, Some(b"<<"), b">>") }
            {
                // Continue scanning from wherever the probe left its reader.
                r = probe_reader;
                // Only dictionaries the probe flagged as having a root or type entry are
                // parsed in full.
                if probe.has_root || probe.has_type {
                    let mut dict_reader = Reader::new(probe.data);
                    if let Some(dict) = dict_reader.read_with_context::<Dict<'_>>(&dummy_ctx) {
                        if probe.has_root && dict.contains_key(ROOT) {
                            trailer_dicts.push(dict.clone());
                        }

                        if dict
                            .get::<Name<'_>>(TYPE)
                            .is_some_and(|n| n.as_str() == "Catalog")
                        {
                            root_ref = last_obj_num;
                        }

                        // For object streams, register every object they list. The stream
                        // is attributed to the most recently recorded object id.
                        if let Some(stream) = old_r.read::<Stream<'_>>(&dummy_ctx)
                            && dict.get::<Name<'_>>(TYPE).as_deref() == Some(b"ObjStm")
                            && let Some(data) = stream.decoded().ok()
                            && let Some(last_obj_num) = last_obj_num
                            && let Some(obj_stream) = ObjectStream::new(stream, &data, &dummy_ctx)
                        {
                            for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
                                let id = ObjectIdentifier::new(*obj_num as i32, 0);
                                // An existing `Normal` entry is never overwritten by an
                                // object-stream entry.
                                if xref_map
                                    .get(&id)
                                    .is_none_or(|e| !matches!(e, &EntryType::Normal(_)))
                                {
                                    xref_map.insert(
                                        id,
                                        EntryType::ObjStream(
                                            last_obj_num.obj_number as u32,
                                            idx as u32,
                                        ),
                                    );
                                }
                            }
                        }
                    }
                }
            } else {
                let old_pos = r.offset;
                r.forward_while(|b| !is_white_space_character(b));
                // Guarantee progress when the reader already sits on white space.
                if r.offset == old_pos {
                    r.read_byte();
                }
            }
        }

        if r.at_end() {
            break;
        }
    }

    let mut trailer_dict = None;

    // Pick the trailer candidate whose root resolves to a dictionary with a PAGES key.
    // Candidates are checked in file order and a later match replaces an earlier one.
    for dict in trailer_dicts {
        if let Some(root_id) = dict.get_raw::<Dict<'_>>(ROOT) {
            let check = |dict: &Dict<'_>| -> bool { dict.contains_key(PAGES) };

            match root_id {
                MaybeRef::Ref(r) => match xref_map.get(&r.into()) {
                    // Root is a regular object: read it at its recorded offset.
                    Some(EntryType::Normal(offset)) => {
                        let mut reader = Reader::new(&data.as_ref()[*offset..]);

                        if let Some(obj) =
                            reader.read_with_context::<IndirectObject<Dict<'_>>>(&dummy_ctx)
                            && {
                                let obj = obj.get();
                                check(&obj)
                            }
                        {
                            trailer_dict = Some(dict);
                        }
                    }
                    // Root lives in an object stream: decode the stream and read the
                    // dictionary at the recorded index.
                    Some(EntryType::ObjStream(obj_num, idx)) => {
                        if let Some(EntryType::Normal(offset)) =
                            xref_map.get(&ObjectIdentifier::new(*obj_num as i32, 0))
                        {
                            let mut reader = Reader::new(&data.as_ref()[*offset..]);

                            if let Some(stream) =
                                reader.read_with_context::<IndirectObject<Stream<'_>>>(&dummy_ctx)
                                && {
                                    let stream = stream.get();
                                    if let Some(data) = stream.decoded().ok()
                                        && let Some(object_stream) =
                                            ObjectStream::new(stream, &data, &dummy_ctx)
                                        && let Some(obj) = object_stream.get::<Dict<'_>>(*idx)
                                    {
                                        check(&obj)
                                    } else {
                                        false
                                    }
                                }
                            {
                                trailer_dict = Some(dict);
                            }
                        }
                    }
                    _ => {}
                },
                // Root dictionary is stored inline in the candidate.
                MaybeRef::NotRef(d) => {
                    if check(&d) {
                        trailer_dict = Some(dict);
                    }
                }
            }
        }
    }

    let has_encryption = trailer_dict
        .as_ref()
        .is_some_and(|t| t.contains_key(ENCRYPT));

    // For encrypted files, scan a second time with a context backed by a real xref built
    // from the first pass. `recurse` is false in the nested call, so this happens once.
    if has_encryption && recurse {
        if let Ok(xref) = XRef::new(
            data.clone(),
            xref_map.clone(),
            // `has_encryption` is only true when `trailer_dict` is `Some`.
            XRefInput::TrailerDictData(trailer_dict.as_ref().map(|d| d.data()).unwrap()),
            true,
            password,
        ) {
            let ctx = ReaderContext::new(&xref, false);
            let (patched_map, _) = fallback_xref_map_inner(data, ctx, false, password);
            xref_map = patched_map;
        }
    }

    if let Some(trailer_dict_data) = trailer_dict.map(|d| d.data()) {
        (
            xref_map,
            Some(XRefInput::TrailerDictData(trailer_dict_data)),
        )
    } else if let Some(root_ref) = root_ref {
        (xref_map, Some(XRefInput::RootRef(root_ref)))
    } else {
        (xref_map, None)
    }
}
269
/// Shared placeholder xref that contains no entries; handed out by `XRef::dummy`.
const DUMMY_XREF: XRef = XRef(Inner::Dummy);
271
/// The cross-reference data of a PDF document, used to resolve object identifiers to
/// objects.
#[derive(Debug, Clone)]
pub struct XRef(Inner);
275
276impl XRef {
277 fn new(
278 data: PdfData,
279 xref_map: XrefMap,
280 input: XRefInput<'_>,
281 repaired: bool,
282 password: &[u8],
283 ) -> Result<Self, XRefError> {
284 let trailer_data = TrailerData::dummy();
288
289 let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
290 data: Arc::new(Data::new(data)),
291 map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
292 decryptor: Arc::new(Decryptor::None),
293 has_ocgs: false,
294 metadata: Arc::new(Metadata::default()),
295 trailer_data,
296 password: password.to_vec(),
297 })));
298
299 let decryptor = {
304 match input {
305 XRefInput::TrailerDictData(trailer_dict_data) => {
306 let mut r = Reader::new(trailer_dict_data);
307
308 let trailer_dict = r
309 .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
310 .ok_or(XRefError::Unknown)?;
311
312 get_decryptor(&trailer_dict, password)?
313 }
314 XRefInput::RootRef(_) => Decryptor::None,
315 }
316 };
317
318 match &mut xref.0 {
319 Inner::Dummy => unreachable!(),
320 Inner::Some(r) => {
321 let mutable = Arc::make_mut(r);
322 mutable.decryptor = Arc::new(decryptor.clone());
323 }
324 }
325
326 let (trailer_data, has_ocgs, metadata) = match input {
327 XRefInput::TrailerDictData(trailer_dict_data) => {
328 let mut r = Reader::new(trailer_dict_data);
329
330 let trailer_dict = r
331 .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
332 .ok_or(XRefError::Unknown)?;
333
334 let root_ref = trailer_dict.get_ref(ROOT).ok_or(XRefError::Unknown)?;
335 let root = trailer_dict
336 .get::<Dict<'_>>(ROOT)
337 .ok_or(XRefError::Unknown)?;
338 let metadata = trailer_dict
339 .get::<Dict<'_>>(INFO)
340 .map(|d| parse_metadata(&d))
341 .unwrap_or_default();
342 let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
343 let has_ocgs = root.get::<Dict<'_>>(OCPROPERTIES).is_some();
344 let version = root
345 .get::<Name<'_>>(VERSION)
346 .and_then(|v| PdfVersion::from_bytes(v.deref()));
347
348 let td = TrailerData {
349 pages_ref: pages_ref.into(),
350 root_ref: root_ref.into(),
351 version,
352 };
353
354 (td, has_ocgs, metadata)
355 }
356 XRefInput::RootRef(root_ref) => {
357 let root = xref.get::<Dict<'_>>(root_ref).ok_or(XRefError::Unknown)?;
358 let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
359
360 let td = TrailerData {
361 pages_ref: pages_ref.into(),
362 root_ref,
363 version: None,
364 };
365
366 (td, false, Metadata::default())
367 }
368 };
369
370 match &mut xref.0 {
371 Inner::Dummy => unreachable!(),
372 Inner::Some(r) => {
373 let mutable = Arc::make_mut(r);
374 mutable.trailer_data = trailer_data;
375 mutable.decryptor = Arc::new(decryptor);
376 mutable.has_ocgs = has_ocgs;
377 mutable.metadata = Arc::new(metadata);
378 }
379 }
380
381 Ok(xref)
382 }
383
384 fn is_repaired(&self) -> bool {
385 match &self.0 {
386 Inner::Dummy => false,
387 Inner::Some(r) => {
388 let locked = r.map.get();
389 locked.repaired
390 }
391 }
392 }
393
/// Returns a reference to the shared dummy xref, which contains no entries.
pub(crate) fn dummy() -> &'static Self {
    &DUMMY_XREF
}
397
398 pub(crate) fn len(&self) -> usize {
399 match &self.0 {
400 Inner::Dummy => 0,
401 Inner::Some(r) => r.map.get().xref_map.len(),
402 }
403 }
404
405 pub(crate) fn trailer_data(&self) -> &TrailerData {
406 match &self.0 {
407 Inner::Dummy => unreachable!(),
408 Inner::Some(r) => &r.trailer_data,
409 }
410 }
411
412 pub(crate) fn metadata(&self) -> &Metadata {
413 match &self.0 {
414 Inner::Dummy => unreachable!(),
415 Inner::Some(r) => &r.metadata,
416 }
417 }
418
/// Returns the object identifier of the root (catalog) dictionary.
///
/// # Panics
///
/// Panics when called on the dummy xref, because `trailer_data` does.
pub fn root_id(&self) -> ObjectIdentifier {
    self.trailer_data().root_ref
}
423
424 pub fn has_optional_content_groups(&self) -> bool {
426 match &self.0 {
427 Inner::Dummy => false,
428 Inner::Some(r) => r.has_ocgs,
429 }
430 }
431
432 pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
433 match &self.0 {
434 Inner::Dummy => unimplemented!(),
435 Inner::Some(r) => {
436 let locked = r.map.get();
437 let mut elements = locked
438 .xref_map
439 .iter()
440 .map(|(id, e)| {
441 let offset = match e {
442 EntryType::Normal(o) => (*o, 0),
443 EntryType::ObjStream(id, index) => {
444 if let Some(EntryType::Normal(offset)) =
445 locked.xref_map.get(&ObjectIdentifier::new(*id as i32, 0))
446 {
447 (*offset, *index)
448 } else {
449 (usize::MAX, 0)
450 }
451 }
452 };
453
454 (*id, offset)
455 })
456 .collect::<Vec<_>>();
457
458 elements.sort_by(|e1, e2| e1.1.cmp(&e2.1));
461
462 let mut iter = elements.into_iter();
463
464 iter::from_fn(move || {
465 for next in iter.by_ref() {
466 if let Some(obj) = self.get_with(next.0, &ReaderContext::new(self, false)) {
467 return Some(obj);
468 } else {
469 continue;
471 }
472 }
473
474 None
475 })
476 }
477 }
478 }
479
480 pub(crate) fn repair(&self) {
481 let Inner::Some(r) = &self.0 else {
482 unreachable!();
483 };
484
485 let mut locked = r.map.try_put().unwrap();
486 assert!(!locked.repaired);
487
488 let (xref_map, _) = fallback_xref_map(r.data.get(), &r.password);
489 locked.xref_map = xref_map;
490 locked.repaired = true;
491 }
492
493 #[inline]
494 pub(crate) fn needs_decryption(&self, ctx: &ReaderContext<'_>) -> bool {
495 match &self.0 {
496 Inner::Dummy => false,
497 Inner::Some(r) => {
498 if matches!(r.decryptor.as_ref(), Decryptor::None) {
499 false
500 } else {
501 !ctx.in_content_stream() && !ctx.in_object_stream()
502 }
503 }
504 }
505 }
506
507 #[inline]
508 pub(crate) fn decrypt(
509 &self,
510 id: ObjectIdentifier,
511 data: &[u8],
512 target: DecryptionTarget,
513 ) -> Option<Vec<u8>> {
514 match &self.0 {
515 Inner::Dummy => Some(data.to_vec()),
516 Inner::Some(r) => r.decryptor.decrypt(id, data, target),
517 }
518 }
519
520 #[allow(private_bounds)]
522 pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
523 where
524 T: ObjectLike<'a>,
525 {
526 let ctx = ReaderContext::new(self, false);
527 self.get_with(id, &ctx)
528 }
529
530 #[allow(private_bounds)]
532 pub(crate) fn get_with<'a, T>(
533 &'a self,
534 id: ObjectIdentifier,
535 ctx: &ReaderContext<'a>,
536 ) -> Option<T>
537 where
538 T: ObjectLike<'a>,
539 {
540 let Inner::Some(repr) = &self.0 else {
541 return None;
542 };
543
544 let locked = repr.map.try_get().unwrap();
545
546 let mut r = Reader::new(repr.data.get().as_ref());
547
548 let entry = *locked.xref_map.get(&id).or({
549 None
552 })?;
553 drop(locked);
554
555 let mut ctx = ctx.clone();
556 ctx.set_obj_number(id);
557 ctx.set_in_content_stream(false);
558
559 match entry {
560 EntryType::Normal(offset) => {
561 ctx.set_in_object_stream(false);
562 r.jump(offset);
563
564 if let Some(object) = r.read_with_context::<IndirectObject<T>>(&ctx) {
565 if object.id() == &id {
566 return Some(object.get());
567 }
568 } else {
569 if r.skip::<IndirectObject<Object<'_>>>(false).is_some() {
572 return None;
573 }
574 };
575
576 if self.is_repaired() {
578 error!(
579 "attempt was made at repairing xref, but object {id:?} still couldn't be read"
580 );
581
582 None
583 } else {
584 warn!("broken xref, attempting to repair");
585
586 self.repair();
587
588 self.get_with::<T>(id, &ctx)
590 }
591 }
592 EntryType::ObjStream(obj_stram_gen_num, index) => {
593 let obj_stream_id = ObjectIdentifier::new(obj_stram_gen_num as i32, 0);
595
596 if obj_stream_id == id {
597 warn!("cycle detected in object stream");
598
599 return None;
600 }
601
602 let stream = self.get_with::<Stream<'_>>(obj_stream_id, &ctx)?;
603 let data = repr.data.get_with(obj_stream_id, &ctx)?;
604 let object_stream = ObjectStream::new(stream, data, &ctx)?;
605 object_stream.get(index)
606 }
607 }
608 }
609}
610
/// The source from which trailer information is derived when building an [`XRef`].
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefInput<'a> {
    /// Raw bytes of a trailer dictionary; parsed when the xref is constructed.
    TrailerDictData(&'a [u8]),
    /// Identifier of the catalog object, used when no trailer dictionary is available.
    RootRef(ObjectIdentifier),
}
627
628pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
629 let needle = b"startxref";
630 let pos = findr_needle(data, needle)?;
631 let mut finder = Reader::new(data);
632 finder.jump(pos);
633 finder.forward_tag(needle)?;
634 finder.skip_white_spaces_and_comments();
635 finder.read_without_context::<i32>()?.try_into().ok()
636}
637
/// Describes where an object is stored.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum EntryType {
    /// A regular object, located at the given byte offset in the file.
    Normal(usize),
    /// An object inside an object stream: the object number of the containing stream
    /// followed by the object's index within that stream.
    ObjStream(u32, u32),
}
648
/// Maps object identifiers to the location of the corresponding object.
type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
650
/// The mutable part of an xref: the entry map plus its repair state.
#[derive(Debug)]
struct MapRepr {
    /// The current object-location map.
    xref_map: XrefMap,
    /// Whether `xref_map` was produced by the fallback scan.
    repaired: bool,
}
657
658#[derive(Debug, Copy, Clone)]
659pub(crate) struct TrailerData {
660 pub(crate) pages_ref: ObjectIdentifier,
661 pub(crate) root_ref: ObjectIdentifier,
662 pub(crate) version: Option<PdfVersion>,
663}
664
665impl TrailerData {
666 pub(crate) fn dummy() -> Self {
667 Self {
668 pages_ref: ObjectIdentifier::new(0, 0),
669 root_ref: ObjectIdentifier::new(0, 0),
670 version: None,
671 }
672 }
673}
674
/// Backing state of a non-dummy [`XRef`].
#[derive(Debug, Clone)]
struct SomeRepr {
    /// The file data the xref resolves objects from.
    data: Arc<Data>,
    /// The entry map and its repair flag; replaced in place by `XRef::repair`.
    map: Arc<RwLock<MapRepr>>,
    /// Document metadata parsed from the info dictionary.
    metadata: Arc<Metadata>,
    /// Decryptor derived from the trailer (`Decryptor::None` if there is none).
    decryptor: Arc<Decryptor>,
    /// Flag set during construction; reported by `has_optional_content_groups`.
    has_ocgs: bool,
    /// Password the xref was built with; reused when the map is rebuilt.
    password: Vec<u8>,
    /// Trailer information resolved during construction.
    trailer_data: TrailerData,
}
685
/// Internal representation of an [`XRef`].
#[derive(Debug, Clone)]
enum Inner {
    /// Placeholder without any entries; lookups on it return `None`.
    Dummy,
    /// A real xref backed by shared state.
    Some(Arc<SomeRepr>),
}
693
/// A single parsed entry of a classic cross-reference table.
#[derive(Debug)]
struct XRefEntry {
    /// Byte offset stored in the first field of the entry.
    offset: usize,
    /// Generation number stored in the second field of the entry.
    gen_number: i32,
    /// Whether the entry is marked as in use (`n`).
    used: bool,
}

impl XRefEntry {
    /// Parses one cross-reference table entry.
    ///
    /// The expected layout is ten decimal digits, one separator byte, five decimal digits,
    /// one separator byte and the entry kind at index 17. Any bytes after that are ignored.
    ///
    /// Returns `None` if `data` is too short, if either number contains a non-digit byte,
    /// or if a number is out of range for its field. This function never panics.
    pub(crate) fn read(data: &[u8]) -> Option<Self> {
        /// Parses a run of ASCII digits, rejecting any other byte and any overflow.
        fn parse_u32(digits: &[u8]) -> Option<u32> {
            digits.iter().try_fold(0_u32, |accum, &byte| {
                if !byte.is_ascii_digit() {
                    return None;
                }

                accum.checked_mul(10)?.checked_add(u32::from(byte - b'0'))
            })
        }

        // Checked slicing: a short entry yields `None` instead of a panic.
        let offset = usize::try_from(parse_u32(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_u32(data.get(11..16)?)?).ok()?;

        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
731
/// Populates `xref_map` from the cross-reference section at `pos`, following the chain of
/// earlier sections it refers to.
///
/// Returns the raw bytes of the trailer dictionary of the section at `pos`, or `None` if
/// parsing failed.
fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
    // Offsets already visited, used to detect cycles in the chain.
    let mut visited = BTreeSet::new();
    populate_xref_impl_inner(data, pos, xref_map, &mut visited)
}
736
/// Upper bound on the number of cross-reference sections visited while following a chain.
const MAX_XREF_CHAIN_DEPTH: usize = 256;
739
740fn populate_xref_impl_inner<'a>(
741 data: &'a [u8],
742 pos: usize,
743 xref_map: &mut XrefMap,
744 visited: &mut BTreeSet<usize>,
745) -> Option<&'a [u8]> {
746 if !visited.insert(pos) {
747 warn!("circular xref PREV chain detected at offset {}", pos);
748
749 return None;
750 }
751
752 if visited.len() > MAX_XREF_CHAIN_DEPTH {
753 warn!(
754 "xref PREV chain exceeds maximum depth of {}",
755 MAX_XREF_CHAIN_DEPTH
756 );
757
758 return None;
759 }
760
761 let mut reader = Reader::new(data);
762 reader.jump(pos);
763 reader.skip_white_spaces_and_comments();
765
766 let mut r2 = reader.clone();
767 if reader
768 .clone()
769 .read_without_context::<ObjectIdentifier>()
770 .is_some()
771 {
772 populate_from_xref_stream(data, &mut r2, xref_map, visited)
773 } else {
774 populate_from_xref_table(data, &mut r2, xref_map, visited)
775 }
776}
777
778pub(super) struct SubsectionHeader {
779 pub(super) start: u32,
780 pub(super) num_entries: u32,
781}
782
783impl Readable<'_> for SubsectionHeader {
784 fn read(r: &mut Reader<'_>, _: &ReaderContext<'_>) -> Option<Self> {
785 r.skip_white_spaces();
786 let start = r.read_without_context::<u32>()?;
787 r.skip_white_spaces();
788 let num_entries = r.read_without_context::<u32>()?;
789 r.skip_white_spaces();
790
791 Some(Self { start, num_entries })
792 }
793}
794
795fn populate_from_xref_table<'a>(
797 data: &'a [u8],
798 reader: &mut Reader<'a>,
799 insert_map: &mut XrefMap,
800 visited: &mut BTreeSet<usize>,
801) -> Option<&'a [u8]> {
802 let trailer = {
803 let mut reader = reader.clone();
804 read_xref_table_trailer(&mut reader, &ReaderContext::dummy())?
805 };
806
807 reader.skip_white_spaces();
808 reader.forward_tag(b"xref")?;
809 reader.skip_white_spaces();
810
811 let mut max_obj = 0;
812
813 if let Some(prev) = trailer.get::<i32>(PREV) {
814 populate_xref_impl_inner(data, prev as usize, insert_map, visited)?;
816 }
817
818 if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
821 populate_xref_impl_inner(data, xref_stm as usize, insert_map, visited)?;
822 }
823
824 while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
825 reader.skip_white_spaces();
826
827 let start = header.start;
828 let end = start.checked_add(header.num_entries)?;
829
830 for obj_number in start..end {
831 max_obj = max(max_obj, obj_number);
832 let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
833 let entry = XRefEntry::read(bytes)?;
834
835 if entry.used {
838 insert_map.insert(
839 ObjectIdentifier::new(obj_number as i32, entry.gen_number),
840 EntryType::Normal(entry.offset),
841 );
842 }
843 }
844 }
845
846 Some(trailer.data())
847}
848
849fn populate_from_xref_stream<'a>(
850 data: &'a [u8],
851 reader: &mut Reader<'a>,
852 insert_map: &mut XrefMap,
853 visited: &mut BTreeSet<usize>,
854) -> Option<&'a [u8]> {
855 let stream = reader
856 .read_with_context::<IndirectObject<Stream<'_>>>(&ReaderContext::dummy())?
857 .get();
858
859 if let Some(prev) = stream.dict().get::<i32>(PREV) {
860 let _ = populate_xref_impl_inner(data, prev as usize, insert_map, visited)?;
862 }
863
864 let size = stream.dict().get::<u32>(SIZE)?;
865
866 let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
867
868 if f2_len > size_of::<u64>() as u8 {
869 error!("xref offset length is larger than the allowed limit");
870
871 return None;
872 }
873
874 if f1_len != 1 {
876 warn!("first field in xref stream was longer than 1");
877 }
878
879 let xref_data = stream.decoded().ok()?;
880 let mut xref_reader = Reader::new(xref_data.as_ref());
881
882 if let Some(arr) = stream.dict().get::<Array<'_>>(INDEX) {
883 let iter = arr.iter::<(u32, u32)>();
884
885 for (start, num_elements) in iter {
886 xref_stream_subsection(
887 &mut xref_reader,
888 start,
889 num_elements,
890 f1_len,
891 f2_len,
892 f3_len,
893 insert_map,
894 )?;
895 }
896 } else {
897 xref_stream_subsection(
898 &mut xref_reader,
899 0,
900 size,
901 f1_len,
902 f2_len,
903 f3_len,
904 insert_map,
905 )?;
906 }
907
908 Some(stream.dict().data())
909}
910
911fn xref_stream_num(data: &[u8]) -> Option<u32> {
912 Some(match data.len() {
913 0 => return None,
914 1 => u8::from_be(data[0]) as u32,
915 2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
916 3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
917 4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
918 8 => {
919 if let Ok(num) = u32::try_from(u64::from_be_bytes(data[0..8].try_into().ok()?)) {
920 return Some(num);
921 } else {
922 warn!("xref stream number is too large");
923
924 return None;
925 }
926 }
927 _n => {
928 warn!("invalid xref stream number {_n}");
929
930 return None;
931 }
932 })
933}
934
935fn xref_stream_subsection<'a>(
936 xref_reader: &mut Reader<'a>,
937 start: u32,
938 num_elements: u32,
939 f1_len: u8,
940 f2_len: u8,
941 f3_len: u8,
942 insert_map: &mut XrefMap,
943) -> Option<()> {
944 for i in 0..num_elements {
945 let f_type = if f1_len == 0 {
946 1
947 } else {
948 xref_reader.read_bytes(1)?[0]
950 };
951
952 let obj_number = start + i;
953
954 match f_type {
955 0 => {
957 xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
958 }
959 1 => {
960 let offset = if f2_len > 0 {
961 let data = xref_reader.read_bytes(f2_len as usize)?;
962 xref_stream_num(data)?
963 } else {
964 0
965 };
966
967 let gen_number = if f3_len > 0 {
968 let data = xref_reader.read_bytes(f3_len as usize)?;
969 xref_stream_num(data)?
970 } else {
971 0
972 };
973
974 insert_map.insert(
975 ObjectIdentifier::new(obj_number as i32, gen_number as i32),
976 EntryType::Normal(offset as usize),
977 );
978 }
979 2 => {
980 let obj_stream_number = {
981 let data = xref_reader.read_bytes(f2_len as usize)?;
982 xref_stream_num(data)?
983 };
984 let gen_number = 0;
985 let index = if f3_len > 0 {
986 let data = xref_reader.read_bytes(f3_len as usize)?;
987 xref_stream_num(data)?
988 } else {
989 0
990 };
991
992 insert_map.insert(
993 ObjectIdentifier::new(obj_number as i32, gen_number),
994 EntryType::ObjStream(obj_stream_number, index),
995 );
996 }
997 _ => {
998 warn!("xref has unknown field type {f_type}");
999
1000 return None;
1001 }
1002 }
1003 }
1004
1005 Some(())
1006}
1007
1008fn read_xref_table_trailer<'a>(
1009 reader: &mut Reader<'a>,
1010 ctx: &ReaderContext<'a>,
1011) -> Option<Dict<'a>> {
1012 reader.skip_white_spaces();
1013 reader.forward_tag(b"xref")?;
1014 reader.skip_white_spaces();
1015
1016 while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
1017 let len = XREF_ENTRY_LEN.checked_mul(header.num_entries as usize)?;
1018 reader.jump(reader.offset().checked_add(len)?);
1019 }
1020
1021 reader.skip_white_spaces();
1022 reader.forward_tag(b"trailer")?;
1023 reader.skip_white_spaces();
1024
1025 reader.read_with_context::<Dict<'_>>(ctx)
1026}
1027
1028fn get_decryptor(trailer_dict: &Dict<'_>, password: &[u8]) -> Result<Decryptor, XRefError> {
1029 if let Some(encryption_dict) = trailer_dict.get::<Dict<'_>>(ENCRYPT) {
1030 let id = if let Some(id) = trailer_dict
1031 .get::<Array<'_>>(ID)
1032 .and_then(|a| a.flex_iter().next::<object::String<'_>>())
1033 {
1034 id.to_vec()
1035 } else {
1036 vec![]
1038 };
1039
1040 get(&encryption_dict, &id, password).map_err(XRefError::Encryption)
1041 } else {
1042 Ok(Decryptor::None)
1043 }
1044}
1045
1046struct ObjectStream<'a> {
1047 data: &'a [u8],
1048 ctx: ReaderContext<'a>,
1049 offsets: Vec<(u32, usize)>,
1050}
1051
1052impl<'a> ObjectStream<'a> {
1053 fn new(inner: Stream<'_>, data: &'a [u8], ctx: &ReaderContext<'a>) -> Option<Self> {
1054 let num_objects = inner.dict().get::<usize>(N)?;
1055 let first_offset = inner.dict().get::<usize>(FIRST)?;
1056
1057 let mut r = Reader::new(data);
1058
1059 let mut offsets = vec![];
1060
1061 for _ in 0..num_objects {
1062 r.skip_white_spaces_and_comments();
1063 let obj_num = r.read_without_context::<u32>()?;
1065 r.skip_white_spaces_and_comments();
1066 let relative_offset = r.read_without_context::<usize>()?;
1067 offsets.push((obj_num, first_offset + relative_offset));
1068 }
1069
1070 let mut ctx = ctx.clone();
1071 ctx.set_in_object_stream(true);
1072
1073 Some(Self { data, ctx, offsets })
1074 }
1075
1076 fn get<T>(&self, index: u32) -> Option<T>
1077 where
1078 T: ObjectLike<'a>,
1079 {
1080 let offset = self.offsets.get(index as usize)?.1;
1081 let mut r = Reader::new(self.data);
1082 r.jump(offset);
1083 r.skip_white_spaces_and_comments();
1084
1085 r.read_with_context::<T>(&self.ctx)
1086 }
1087}
1088
1089fn parse_metadata(info_dict: &Dict<'_>) -> Metadata {
1090 Metadata {
1091 creation_date: info_dict
1092 .get::<object::String<'_>>(CREATION_DATE)
1093 .and_then(|c| DateTime::from_bytes(&c)),
1094 modification_date: info_dict
1095 .get::<object::String<'_>>(MOD_DATE)
1096 .and_then(|c| DateTime::from_bytes(&c)),
1097 title: info_dict
1098 .get::<object::String<'_>>(TITLE)
1099 .map(|t| t.to_vec()),
1100 author: info_dict
1101 .get::<object::String<'_>>(AUTHOR)
1102 .map(|t| t.to_vec()),
1103 subject: info_dict
1104 .get::<object::String<'_>>(SUBJECT)
1105 .map(|t| t.to_vec()),
1106 keywords: info_dict
1107 .get::<object::String<'_>>(KEYWORDS)
1108 .map(|t| t.to_vec()),
1109 creator: info_dict
1110 .get::<object::String<'_>>(CREATOR)
1111 .map(|t| t.to_vec()),
1112 producer: info_dict
1113 .get::<object::String<'_>>(PRODUCER)
1114 .map(|t| t.to_vec()),
1115 }
1116}
1117
#[cfg(test)]
mod tests {
    use super::*;

    /// A trailer whose `/Prev` points back at its own table must be rejected rather than
    /// followed forever.
    #[test]
    fn circular_prev_chain() {
        let mut pdf = b"%PDF-1.0\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n".to_vec();
        let expected_xref_pos = pdf.len();
        pdf.extend_from_slice(
            format!(
                "xref\n\
                 0 1\n\
                 0000000000 65535 f \r\n\
                 trailer\n<< /Size 1 /Root 1 0 R /Prev {expected_xref_pos} >>\n\
                 startxref\n{expected_xref_pos}\n%%EOF"
            )
            .as_bytes(),
        );

        let mut xref_map = FxHashMap::default();
        let xref_pos = find_last_xref_pos(pdf.as_ref()).unwrap();
        assert_eq!(xref_pos, expected_xref_pos);

        // Parsing has to terminate and report failure for the self-referencing chain.
        let result = populate_xref_impl(pdf.as_ref(), xref_pos, &mut xref_map);
        assert!(result.is_none());
    }

    /// With several `startxref` keywords, the offset after the last one is used.
    #[test]
    fn find_last_xref_uses_last_startxref() {
        let pdf = b"%PDF-1.0\nstartxref\n5\n%%EOF\nstartxref\n42\n%%EOF";
        assert_eq!(find_last_xref_pos(pdf), Some(42));
    }

    /// An entry count too large to skip over must make trailer parsing fail cleanly.
    #[test]
    fn xref_table_trailer_rejects_overflowing_entry_skip() {
        let data = b"xref\n0 999999999999999999999\ntrailer\n<<>>";
        let mut reader = Reader::new(data);
        assert!(read_xref_table_trailer(&mut reader, &ReaderContext::dummy()).is_none());
    }
}