1use crate::crypto::{DecryptionError, DecryptionTarget, Decryptor, get};
4use crate::data::Data;
5use crate::metadata::Metadata;
6use crate::object::Name;
7use crate::object::ObjectIdentifier;
8use crate::object::Stream;
9use crate::object::dict::keys::{
10 AUTHOR, CREATION_DATE, CREATOR, ENCRYPT, FIRST, ID, INDEX, INFO, KEYWORDS, MOD_DATE, N,
11 OCPROPERTIES, PAGES, PREV, PRODUCER, ROOT, SIZE, SUBJECT, TITLE, TYPE, VERSION, W, XREF_STM,
12};
13use crate::object::indirect::IndirectObject;
14use crate::object::{Array, MaybeRef};
15use crate::object::{DateTime, Dict};
16use crate::object::{Object, ObjectLike};
17use crate::pdf::{PdfLoadLimits, PdfVersion};
18use crate::reader::Reader;
19use crate::reader::{Readable, ReaderContext, ReaderExt};
20use crate::sync::{Arc, FxHashMap, RwLock, RwLockExt};
21use crate::{PdfData, object};
22use alloc::vec;
23use alloc::vec::Vec;
24use core::cmp::max;
25use core::iter;
26use core::ops::Deref;
27use log::{error, warn};
28
/// Number of bytes occupied by a single entry of a classic cross-reference
/// table. The table parser reads exactly this many bytes per entry, and the
/// trailer lookup skips `XREF_ENTRY_LEN * num_entries` bytes per subsection.
pub(crate) const XREF_ENTRY_LEN: usize = 20;
30
/// Reasons why building an [`XRef`] can fail.
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefError {
    /// Any structural failure: no `startxref`, an unreadable cross-reference
    /// section or trailer, or a missing `Root`/`Pages` reference.
    Unknown,
    /// The trailer declares encryption and setting up the decryptor failed.
    Encryption(DecryptionError),
}
36
37pub(crate) fn root_xref(
39 data: PdfData,
40 password: &[u8],
41 limits: PdfLoadLimits,
42) -> Result<XRef, XRefError> {
43 let mut xref_map = FxHashMap::default();
44 let xref_pos = find_last_xref_pos(data.as_ref()).ok_or(XRefError::Unknown)?;
45 let trailer =
46 populate_xref_impl(data.as_ref(), xref_pos, &mut xref_map).ok_or(XRefError::Unknown)?;
47
48 XRef::new(
49 data.clone(),
50 xref_map,
51 XRefInput::TrailerDictData(trailer),
52 false,
53 password,
54 limits,
55 )
56}
57
58pub(crate) fn fallback(data: PdfData, password: &[u8], limits: PdfLoadLimits) -> Option<XRef> {
60 warn!("xref table was invalid, trying to manually build xref table");
61 let (xref_map, xref_input) = fallback_xref_map(&data, password);
62
63 if let Some(xref_input) = xref_input {
64 warn!("rebuild xref table with {} entries", xref_map.len());
65
66 XRef::new(data.clone(), xref_map, xref_input, true, password, limits).ok()
67 } else {
68 warn!("couldn't find trailer dictionary, failed to rebuild xref table");
69
70 None
71 }
72}
73
74fn fallback_xref_map<'a>(data: &'a PdfData, password: &[u8]) -> (XrefMap, Option<XRefInput<'a>>) {
75 fallback_xref_map_inner(data, ReaderContext::dummy(), true, password)
76}
77
78fn fallback_xref_map_inner<'a>(
79 data: &'a PdfData,
80 mut dummy_ctx: ReaderContext<'a>,
81 recurse: bool,
82 password: &[u8],
83) -> (XrefMap, Option<XRefInput<'a>>) {
84 let mut xref_map = FxHashMap::default();
85 let mut trailer_dicts = vec![];
86 let mut root_ref = None;
87
88 let mut r = Reader::new(data.as_ref());
89
90 let mut last_obj_num = None;
91
92 loop {
93 let cur_pos = r.offset();
94
95 let mut old_r = r.clone();
96
97 if let Some(obj_id) = r.read::<ObjectIdentifier>(&dummy_ctx) {
98 let mut cloned = r.clone();
99 cloned.skip_white_spaces_and_comments();
101 if cloned.skip::<Object<'_>>(false).is_some() {
102 xref_map.insert(obj_id, EntryType::Normal(cur_pos));
103 last_obj_num = Some(obj_id);
104 dummy_ctx.set_obj_number(obj_id);
105 }
106 } else if let Some(dict) = r.read::<Dict<'_>>(&dummy_ctx) {
107 if dict.contains_key(ROOT) {
108 trailer_dicts.push(dict.clone());
109 }
110
111 if dict
112 .get::<Name>(TYPE)
113 .is_some_and(|n| n.as_str() == "Catalog")
114 {
115 root_ref = last_obj_num;
116 }
117
118 if let Some(stream) = old_r.read::<Stream<'_>>(&dummy_ctx)
119 && stream.dict().get::<Name>(TYPE).as_deref() == Some(b"ObjStm")
120 && let Some(data) = stream.decoded().ok()
121 && let Some(last_obj_num) = last_obj_num
122 && let Some(obj_stream) = ObjectStream::new(stream, &data, &dummy_ctx)
123 {
124 for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
125 let id = ObjectIdentifier::new(*obj_num as i32, 0);
126 if xref_map
131 .get(&id)
132 .is_none_or(|e| !matches!(e, &EntryType::Normal(_)))
133 {
134 xref_map.insert(
135 id,
136 EntryType::ObjStream(last_obj_num.obj_number as u32, idx as u32),
137 );
138 }
139 }
140 }
141 } else {
142 r.read_byte();
143 }
144
145 if r.at_end() {
146 break;
147 }
148 }
149
150 let mut trailer_dict = None;
152
153 for dict in trailer_dicts {
154 if let Some(root_id) = dict.get_raw::<Dict<'_>>(ROOT) {
155 let check = |dict: &Dict<'_>| -> bool { dict.contains_key(PAGES) };
156
157 match root_id {
158 MaybeRef::Ref(r) => match xref_map.get(&r.into()) {
159 Some(EntryType::Normal(offset)) => {
160 let mut reader = Reader::new(&data.as_ref()[*offset..]);
161
162 if let Some(obj) =
163 reader.read_with_context::<IndirectObject<Dict<'_>>>(&dummy_ctx)
164 && check(&obj.clone().get())
165 {
166 trailer_dict = Some(dict);
167 }
168 }
169 Some(EntryType::ObjStream(obj_num, idx)) => {
170 if let Some(EntryType::Normal(offset)) =
171 xref_map.get(&ObjectIdentifier::new(*obj_num as i32, 0))
172 {
173 let mut reader = Reader::new(&data.as_ref()[*offset..]);
174
175 if let Some(stream) =
176 reader.read_with_context::<IndirectObject<Stream<'_>>>(&dummy_ctx)
177 && let Some(data) = stream.clone().get().decoded().ok()
178 && let Some(object_stream) =
179 ObjectStream::new(stream.get(), &data, &dummy_ctx)
180 && let Some(obj) = object_stream.get::<Dict<'_>>(*idx)
181 && check(&obj)
182 {
183 trailer_dict = Some(dict);
184 }
185 }
186 }
187 _ => {}
188 },
189 MaybeRef::NotRef(d) => {
190 if check(&d) {
191 trailer_dict = Some(dict);
192 }
193 }
194 }
195 }
196 }
197
198 let has_encryption = trailer_dict
199 .as_ref()
200 .is_some_and(|t| t.contains_key(ENCRYPT));
201
202 if has_encryption && recurse {
203 if let Some(Ok(xref)) = trailer_dict.as_ref().map(|d| {
208 XRef::new(
209 data.clone(),
210 xref_map.clone(),
211 XRefInput::TrailerDictData(d.data()),
212 true,
213 password,
214 PdfLoadLimits::default(),
215 )
216 }) {
217 let ctx = ReaderContext::new(&xref, false);
218 let (patched_map, _) = fallback_xref_map_inner(data, ctx, false, password);
219 xref_map = patched_map;
220 }
221 }
222
223 if let Some(trailer_dict_data) = trailer_dict.map(|d| d.data()) {
224 (
225 xref_map,
226 Some(XRefInput::TrailerDictData(trailer_dict_data)),
227 )
228 } else if let Some(root_ref) = root_ref {
229 (xref_map, Some(XRefInput::RootRef(root_ref)))
230 } else {
231 (xref_map, None)
232 }
233}
234
/// Placeholder cross-reference table without any backing document; this is
/// the value handed out by [`XRef::dummy`].
const DUMMY_XREF: XRef = XRef(Inner::Dummy);
236
/// The cross-reference table of a PDF document.
///
/// Maps object identifiers to where the object can be found (directly in the
/// file or inside an object stream) and resolves objects on request.
#[derive(Debug, Clone)]
pub struct XRef(Inner);
240
241impl XRef {
242 fn new(
243 data: PdfData,
244 xref_map: XrefMap,
245 input: XRefInput<'_>,
246 repaired: bool,
247 password: &[u8],
248 load_limits: PdfLoadLimits,
249 ) -> Result<Self, XRefError> {
250 let trailer_data = TrailerData::dummy();
254
255 let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
256 data: Arc::new(Data::new(data)),
257 map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
258 decryptor: Arc::new(Decryptor::None),
259 has_ocgs: false,
260 metadata: Arc::new(Metadata::default()),
261 trailer_data,
262 password: password.to_vec(),
263 load_limits,
264 })));
265
266 let decryptor = {
271 match input {
272 XRefInput::TrailerDictData(trailer_dict_data) => {
273 let mut r = Reader::new(trailer_dict_data);
274
275 let trailer_dict = r
276 .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
277 .ok_or(XRefError::Unknown)?;
278
279 get_decryptor(&trailer_dict, password)?
280 }
281 XRefInput::RootRef(_) => Decryptor::None,
282 }
283 };
284
285 match &mut xref.0 {
286 Inner::Dummy => unreachable!(),
287 Inner::Some(r) => {
288 let mutable = Arc::make_mut(r);
289 mutable.decryptor = Arc::new(decryptor.clone());
290 }
291 }
292
293 let (trailer_data, has_ocgs, metadata) = match input {
294 XRefInput::TrailerDictData(trailer_dict_data) => {
295 let mut r = Reader::new(trailer_dict_data);
296
297 let trailer_dict = r
298 .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
299 .ok_or(XRefError::Unknown)?;
300
301 let root_ref = trailer_dict.get_ref(ROOT).ok_or(XRefError::Unknown)?;
302 let root = trailer_dict
303 .get::<Dict<'_>>(ROOT)
304 .ok_or(XRefError::Unknown)?;
305 let metadata = trailer_dict
306 .get::<Dict<'_>>(INFO)
307 .map(|d| parse_metadata(&d))
308 .unwrap_or_default();
309 let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
310 let has_ocgs = root.get::<Dict<'_>>(OCPROPERTIES).is_some();
311 let version = root
312 .get::<Name>(VERSION)
313 .and_then(|v| PdfVersion::from_bytes(v.deref()));
314
315 let td = TrailerData {
316 pages_ref: pages_ref.into(),
317 root_ref: root_ref.into(),
318 version,
319 };
320
321 (td, has_ocgs, metadata)
322 }
323 XRefInput::RootRef(root_ref) => {
324 let root = xref.get::<Dict<'_>>(root_ref).ok_or(XRefError::Unknown)?;
325 let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
326
327 let td = TrailerData {
328 pages_ref: pages_ref.into(),
329 root_ref,
330 version: None,
331 };
332
333 (td, false, Metadata::default())
334 }
335 };
336
337 match &mut xref.0 {
338 Inner::Dummy => unreachable!(),
339 Inner::Some(r) => {
340 let mutable = Arc::make_mut(r);
341 mutable.trailer_data = trailer_data;
342 mutable.decryptor = Arc::new(decryptor);
343 mutable.has_ocgs = has_ocgs;
344 mutable.metadata = Arc::new(metadata);
345 }
346 }
347
348 Ok(xref)
349 }
350
351 fn is_repaired(&self) -> bool {
352 match &self.0 {
353 Inner::Dummy => false,
354 Inner::Some(r) => {
355 let locked = r.map.get();
356 locked.repaired
357 }
358 }
359 }
360
361 pub(crate) fn dummy() -> &'static Self {
362 &DUMMY_XREF
363 }
364
365 pub(crate) fn load_limits(&self) -> PdfLoadLimits {
366 match &self.0 {
367 Inner::Dummy => PdfLoadLimits::default(),
368 Inner::Some(r) => r.load_limits,
369 }
370 }
371
372 pub(crate) fn len(&self) -> usize {
373 match &self.0 {
374 Inner::Dummy => 0,
375 Inner::Some(r) => r.map.get().xref_map.len(),
376 }
377 }
378
379 pub(crate) fn trailer_data(&self) -> &TrailerData {
380 match &self.0 {
381 Inner::Dummy => unreachable!(),
382 Inner::Some(r) => &r.trailer_data,
383 }
384 }
385
386 #[cfg(test)]
389 pub(crate) fn object_stream_offsets_cache_len(&self) -> usize {
390 match &self.0 {
391 Inner::Dummy => 0,
392 Inner::Some(r) => r.data.object_stream_offsets_cache_len(),
393 }
394 }
395
396 pub(crate) fn metadata(&self) -> &Metadata {
397 match &self.0 {
398 Inner::Dummy => unreachable!(),
399 Inner::Some(r) => &r.metadata,
400 }
401 }
402
403 pub fn root_id(&self) -> ObjectIdentifier {
405 self.trailer_data().root_ref
406 }
407
408 pub fn has_optional_content_groups(&self) -> bool {
410 match &self.0 {
411 Inner::Dummy => false,
412 Inner::Some(r) => r.has_ocgs,
413 }
414 }
415
416 pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
417 match &self.0 {
418 Inner::Dummy => unimplemented!(),
419 Inner::Some(r) => {
420 let locked = r.map.get();
421 let mut elements = locked
422 .xref_map
423 .iter()
424 .map(|(id, e)| {
425 let offset = match e {
426 EntryType::Normal(o) => (*o, 0),
427 EntryType::ObjStream(id, index) => {
428 if let Some(EntryType::Normal(offset)) =
429 locked.xref_map.get(&ObjectIdentifier::new(*id as i32, 0))
430 {
431 (*offset, *index)
432 } else {
433 (usize::MAX, 0)
434 }
435 }
436 };
437
438 (*id, offset)
439 })
440 .collect::<Vec<_>>();
441
442 elements.sort_by_key(|e1| e1.1);
445
446 let mut iter = elements.into_iter();
447
448 iter::from_fn(move || {
449 for next in iter.by_ref() {
450 if let Some(obj) = self.get_with(next.0, &ReaderContext::new(self, false)) {
451 return Some(obj);
452 } else {
453 continue;
455 }
456 }
457
458 None
459 })
460 }
461 }
462 }
463
464 pub(crate) fn repair(&self) {
465 let Inner::Some(r) = &self.0 else {
466 unreachable!();
467 };
468
469 let mut locked = r
470 .map
471 .try_put()
472 .expect("xref repair: map lock not contended");
473 assert!(!locked.repaired);
474
475 let (xref_map, _) = fallback_xref_map(r.data.get(), &r.password);
476 locked.xref_map = xref_map;
477 locked.repaired = true;
478 }
479
480 #[inline]
481 pub(crate) fn needs_decryption(&self, ctx: &ReaderContext<'_>) -> bool {
482 match &self.0 {
483 Inner::Dummy => false,
484 Inner::Some(r) => {
485 if matches!(r.decryptor.as_ref(), Decryptor::None) {
486 false
487 } else {
488 !ctx.in_content_stream() && !ctx.in_object_stream()
489 }
490 }
491 }
492 }
493
494 #[inline]
495 pub(crate) fn decrypt(
496 &self,
497 id: ObjectIdentifier,
498 data: &[u8],
499 target: DecryptionTarget,
500 ) -> Option<Vec<u8>> {
501 match &self.0 {
502 Inner::Dummy => Some(data.to_vec()),
503 Inner::Some(r) => r.decryptor.decrypt(id, data, target),
504 }
505 }
506
507 #[allow(private_bounds)]
509 pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
510 where
511 T: ObjectLike<'a>,
512 {
513 let ctx = ReaderContext::new(self, false);
514 self.get_with(id, &ctx)
515 }
516
517 #[allow(private_bounds)]
519 pub(crate) fn get_with<'a, T>(
520 &'a self,
521 id: ObjectIdentifier,
522 ctx: &ReaderContext<'a>,
523 ) -> Option<T>
524 where
525 T: ObjectLike<'a>,
526 {
527 let Inner::Some(repr) = &self.0 else {
528 return None;
529 };
530
531 let locked = repr.map.try_get()?;
532
533 let mut r = Reader::new(repr.data.get().as_ref());
534
535 let entry = *locked.xref_map.get(&id).or({
536 None
539 })?;
540 drop(locked);
541
542 let mut ctx = ctx.clone();
543 ctx.set_obj_number(id);
544 ctx.set_in_content_stream(false);
545
546 match entry {
547 EntryType::Normal(offset) => {
548 ctx.set_in_object_stream(false);
549 r.jump(offset);
550
551 if let Some(object) = r.read_with_context::<IndirectObject<T>>(&ctx) {
552 if object.id() == &id {
553 return Some(object.get());
554 }
555 } else {
556 if r.skip_not_in_content_stream::<IndirectObject<Object<'_>>>()
559 .is_some()
560 {
561 return None;
562 }
563 };
564
565 if self.is_repaired() {
567 error!(
568 "attempt was made at repairing xref, but object {id:?} still couldn't be read"
569 );
570
571 None
572 } else {
573 warn!("broken xref, attempting to repair");
574
575 self.repair();
576
577 self.get_with::<T>(id, &ctx)
579 }
580 }
581 EntryType::ObjStream(obj_stram_gen_num, index) => {
582 let obj_stream_id = ObjectIdentifier::new(obj_stram_gen_num as i32, 0);
584
585 if obj_stream_id == id {
586 warn!("cycle detected in object stream");
587
588 return None;
589 }
590
591 let stream = self.get_with::<Stream<'_>>(obj_stream_id, &ctx)?;
592 let data = repr.data.get_with(obj_stream_id, &ctx)?;
593 let offsets = repr
598 .data
599 .get_object_stream_offsets_or_init(obj_stream_id, || {
600 parse_object_stream_offsets(&stream, data)
601 })?;
602 let object_stream = ObjectStream::from_cached_offsets(data, &ctx, offsets);
603 object_stream.get(index)
604 }
605 }
606 }
607}
608
/// What [`XRef::new`] uses to locate the document root.
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefInput<'a> {
    /// The raw bytes of a trailer dictionary; `XRef::new` parses them to find
    /// the root, the info dictionary and the encryption settings.
    TrailerDictData(&'a [u8]),
    /// The identifier of the root object itself, used by the fallback scan
    /// when no trailer dictionary was accepted. No decryptor, metadata or
    /// version is derived in this case.
    RootRef(ObjectIdentifier),
}
625
626pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
627 let mut finder = Reader::new(data);
628 let mut pos = finder.len().checked_sub(1)?;
629 finder.jump(pos);
630
631 let needle = b"startxref";
632
633 loop {
634 if finder.forward_tag(needle).is_some() {
635 finder.skip_white_spaces_and_comments();
636
637 let offset = finder.read_without_context::<i32>()?.try_into().ok()?;
638
639 return Some(offset);
640 }
641
642 pos = pos.checked_sub(1)?;
643 finder.jump(pos);
644 }
645}
646
/// Where an object recorded in the cross-reference map is stored.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum EntryType {
    /// Stored directly in the file; the value is the byte offset the reader
    /// jumps to before parsing the indirect object.
    Normal(usize),
    /// Stored inside an object stream: the object number of the containing
    /// stream (looked up with generation 0), followed by the index of the
    /// object within that stream.
    ObjStream(u32, u32),
}
657
/// Lookup table from object identifier to the location of that object.
type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
659
/// The mutable part of an [`XRef`], kept behind a lock so that a repair can
/// swap in a new map.
#[derive(Debug)]
struct MapRepr {
    /// The current object lookup table.
    xref_map: XrefMap,
    /// Set when the map was built by the fallback scan; a repaired map is
    /// not repaired a second time.
    repaired: bool,
}
666
/// References and settings extracted from the trailer and the root dictionary
/// while constructing an [`XRef`].
#[derive(Debug, Copy, Clone)]
pub(crate) struct TrailerData {
    /// Reference stored under `Pages` in the root dictionary.
    pub(crate) pages_ref: ObjectIdentifier,
    /// Reference to the root dictionary.
    pub(crate) root_ref: ObjectIdentifier,
    /// Version parsed from the root dictionary's `Version` entry, if present
    /// and recognized.
    pub(crate) version: Option<PdfVersion>,
}

impl TrailerData {
    /// Placeholder value (both references are object 0, generation 0, and no
    /// version) used until the real trailer data has been resolved.
    pub(crate) fn dummy() -> Self {
        Self {
            pages_ref: ObjectIdentifier::new(0, 0),
            root_ref: ObjectIdentifier::new(0, 0),
            version: None,
        }
    }
}
683
/// Backing state of a real (non-dummy) [`XRef`].
#[derive(Debug, Clone)]
struct SomeRepr {
    /// The document data; also provides the cache of object-stream offset
    /// tables used when resolving compressed objects.
    data: Arc<Data>,
    /// The object lookup table, lockable so it can be replaced on repair.
    map: Arc<RwLock<MapRepr>>,
    /// Metadata parsed from the trailer's info dictionary.
    metadata: Arc<Metadata>,
    /// Decryptor derived from the trailer (`Decryptor::None` if unencrypted).
    decryptor: Arc<Decryptor>,
    /// Whether the root dictionary has an optional-content properties entry.
    has_ocgs: bool,
    /// The password supplied at load time; needed again when repairing.
    password: Vec<u8>,
    /// Root/pages references and version extracted at construction.
    trailer_data: TrailerData,
    /// Load limits supplied at construction.
    load_limits: PdfLoadLimits,
}
695
/// The two states an [`XRef`] can be in.
#[derive(Debug, Clone)]
enum Inner {
    /// A placeholder without any document; lookups return `None`.
    Dummy,
    /// A real cross-reference table with shared backing state.
    Some(Arc<SomeRepr>),
}
703
/// One parsed entry of a classic cross-reference table.
#[derive(Debug)]
struct XRefEntry {
    /// Value of the 10-digit first field.
    offset: usize,
    /// Value of the 5-digit generation field.
    gen_number: i32,
    /// `true` when the entry's marker byte is `n` (in use).
    used: bool,
}

impl XRefEntry {
    /// Parses a table entry laid out as `oooooooooo ggggg m`: ten decimal
    /// digits at bytes 0..10, five decimal digits at bytes 11..16 and a
    /// marker byte at index 17.
    ///
    /// Returns `None` when `data` is too short to contain these fields, when
    /// a numeric field contains a non-digit, or when a value does not fit its
    /// target type. The separator bytes are not inspected.
    pub(crate) fn read(data: &[u8]) -> Option<Self> {
        /// Parses a run of ASCII digits, rejecting anything else and
        /// any value that overflows `u32`.
        fn parse_u32(digits: &[u8]) -> Option<u32> {
            digits.iter().try_fold(0_u32, |accum, &byte| {
                if !byte.is_ascii_digit() {
                    return None;
                }

                accum.checked_mul(10)?.checked_add(u32::from(byte - b'0'))
            })
        }

        // Checked slicing: a truncated entry yields `None` instead of a panic.
        let offset = usize::try_from(parse_u32(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_u32(data.get(11..16)?)?).ok()?;
        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
741
/// Upper bound on how deeply `Prev` / `XRefStm` links are followed while
/// populating the cross-reference map; going beyond it aborts the traversal.
const MAX_XREF_CHAIN_DEPTH: usize = 64;
745
746fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
747 populate_xref_depth(data, pos, xref_map, 0)
748}
749
750fn populate_xref_depth<'a>(
751 data: &'a [u8],
752 pos: usize,
753 xref_map: &mut XrefMap,
754 depth: usize,
755) -> Option<&'a [u8]> {
756 if depth > MAX_XREF_CHAIN_DEPTH {
757 log::warn!("Xref chain depth exceeds {MAX_XREF_CHAIN_DEPTH}, stopping traversal");
758 return None;
759 }
760 let mut reader = Reader::new(data);
761 reader.jump(pos);
762 reader.skip_white_spaces_and_comments();
764
765 let mut r2 = reader.clone();
766 if reader
767 .clone()
768 .read_without_context::<ObjectIdentifier>()
769 .is_some()
770 {
771 populate_from_xref_stream(data, &mut r2, xref_map, depth)
772 } else {
773 populate_from_xref_table(data, &mut r2, xref_map, depth)
774 }
775}
776
777pub(super) struct SubsectionHeader {
778 pub(super) start: u32,
779 pub(super) num_entries: u32,
780}
781
782impl Readable<'_> for SubsectionHeader {
783 fn read(r: &mut Reader<'_>, _: &ReaderContext<'_>) -> Option<Self> {
784 r.skip_white_spaces();
785 let start = r.read_without_context::<u32>()?;
786 r.skip_white_spaces();
787 let num_entries = r.read_without_context::<u32>()?;
788 r.skip_white_spaces();
789
790 Some(Self { start, num_entries })
791 }
792}
793
794fn populate_from_xref_table<'a>(
796 data: &'a [u8],
797 reader: &mut Reader<'a>,
798 insert_map: &mut XrefMap,
799 depth: usize,
800) -> Option<&'a [u8]> {
801 let trailer = {
802 let mut reader = reader.clone();
803 read_xref_table_trailer(&mut reader, &ReaderContext::dummy())?
804 };
805
806 reader.skip_white_spaces();
807 reader.forward_tag(b"xref")?;
808 reader.skip_white_spaces();
809
810 let mut max_obj = 0;
811
812 if let Some(prev) = trailer.get::<i32>(PREV) {
813 populate_xref_depth(data, prev as usize, insert_map, depth + 1)?;
815 }
816
817 if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
820 populate_xref_depth(data, xref_stm as usize, insert_map, depth + 1)?;
821 }
822
823 while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
824 reader.skip_white_spaces();
825
826 let start = header.start;
827 let end = start + header.num_entries;
828
829 for obj_number in start..end {
830 max_obj = max(max_obj, obj_number);
831 let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
832 let entry = XRefEntry::read(bytes)?;
833
834 if entry.used {
837 insert_map.insert(
838 ObjectIdentifier::new(obj_number as i32, entry.gen_number),
839 EntryType::Normal(entry.offset),
840 );
841 }
842 }
843 }
844
845 Some(trailer.data())
846}
847
848fn populate_from_xref_stream<'a>(
849 data: &'a [u8],
850 reader: &mut Reader<'a>,
851 insert_map: &mut XrefMap,
852 depth: usize,
853) -> Option<&'a [u8]> {
854 let stream = reader
855 .read_with_context::<IndirectObject<Stream<'_>>>(&ReaderContext::dummy())?
856 .get();
857
858 if let Some(prev) = stream.dict().get::<i32>(PREV) {
859 let _ = populate_xref_depth(data, prev as usize, insert_map, depth + 1)?;
861 }
862
863 let size = stream.dict().get::<u32>(SIZE)?;
864
865 let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
866
867 if f2_len > size_of::<u64>() as u8 {
868 error!("xref offset length is larger than the allowed limit");
869
870 return None;
871 }
872
873 if f1_len != 1 {
875 warn!("first field in xref stream was longer than 1");
876 }
877
878 let xref_data = stream.decoded().ok()?;
879 let mut xref_reader = Reader::new(xref_data.as_ref());
880
881 if let Some(arr) = stream.dict().get::<Array<'_>>(INDEX) {
882 let iter = arr.iter::<(u32, u32)>();
883
884 for (start, num_elements) in iter {
885 xref_stream_subsection(
886 &mut xref_reader,
887 start,
888 num_elements,
889 f1_len,
890 f2_len,
891 f3_len,
892 insert_map,
893 )?;
894 }
895 } else {
896 xref_stream_subsection(
897 &mut xref_reader,
898 0,
899 size,
900 f1_len,
901 f2_len,
902 f3_len,
903 insert_map,
904 )?;
905 }
906
907 Some(stream.dict().data())
908}
909
910fn xref_stream_num(data: &[u8]) -> Option<u32> {
911 Some(match data.len() {
912 0 => return None,
913 1 => u8::from_be(data[0]) as u32,
914 2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
915 3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
916 4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
917 8 => {
918 if let Ok(num) = u32::try_from(u64::from_be_bytes(data[0..8].try_into().ok()?)) {
919 return Some(num);
920 } else {
921 warn!("xref stream number is too large");
922
923 return None;
924 }
925 }
926 n => {
927 warn!("invalid xref stream number {n}");
928
929 return None;
930 }
931 })
932}
933
934fn xref_stream_subsection<'a>(
935 xref_reader: &mut Reader<'a>,
936 start: u32,
937 num_elements: u32,
938 f1_len: u8,
939 f2_len: u8,
940 f3_len: u8,
941 insert_map: &mut XrefMap,
942) -> Option<()> {
943 for i in 0..num_elements {
944 let f_type = if f1_len == 0 {
945 1
946 } else {
947 xref_reader.read_bytes(1)?[0]
949 };
950
951 let obj_number = start + i;
952
953 match f_type {
954 0 => {
956 xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
957 }
958 1 => {
959 let offset = if f2_len > 0 {
960 let data = xref_reader.read_bytes(f2_len as usize)?;
961 xref_stream_num(data)?
962 } else {
963 0
964 };
965
966 let gen_number = if f3_len > 0 {
967 let data = xref_reader.read_bytes(f3_len as usize)?;
968 xref_stream_num(data)?
969 } else {
970 0
971 };
972
973 insert_map.insert(
974 ObjectIdentifier::new(obj_number as i32, gen_number as i32),
975 EntryType::Normal(offset as usize),
976 );
977 }
978 2 => {
979 let obj_stream_number = {
980 let data = xref_reader.read_bytes(f2_len as usize)?;
981 xref_stream_num(data)?
982 };
983 let gen_number = 0;
984 let index = if f3_len > 0 {
985 let data = xref_reader.read_bytes(f3_len as usize)?;
986 xref_stream_num(data)?
987 } else {
988 0
989 };
990
991 insert_map.insert(
992 ObjectIdentifier::new(obj_number as i32, gen_number),
993 EntryType::ObjStream(obj_stream_number, index),
994 );
995 }
996 _ => {
997 warn!("xref has unknown field type {f_type}");
998
999 return None;
1000 }
1001 }
1002 }
1003
1004 Some(())
1005}
1006
1007fn read_xref_table_trailer<'a>(
1008 reader: &mut Reader<'a>,
1009 ctx: &ReaderContext<'a>,
1010) -> Option<Dict<'a>> {
1011 reader.skip_white_spaces();
1012 reader.forward_tag(b"xref")?;
1013 reader.skip_white_spaces();
1014
1015 while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
1016 reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
1017 }
1018
1019 reader.skip_white_spaces();
1020 reader.forward_tag(b"trailer")?;
1021 reader.skip_white_spaces();
1022
1023 reader.read_with_context::<Dict<'_>>(ctx)
1024}
1025
1026fn get_decryptor(trailer_dict: &Dict<'_>, password: &[u8]) -> Result<Decryptor, XRefError> {
1027 if let Some(encryption_dict) = trailer_dict.get::<Dict<'_>>(ENCRYPT) {
1028 let id = if let Some(id) = trailer_dict
1029 .get::<Array<'_>>(ID)
1030 .and_then(|a| a.flex_iter().next::<object::String>())
1031 {
1032 id.to_vec()
1033 } else {
1034 vec![]
1036 };
1037
1038 get(&encryption_dict, &id, password).map_err(XRefError::Encryption)
1039 } else {
1040 Ok(Decryptor::None)
1041 }
1042}
1043
1044fn parse_object_stream_offsets(
1053 inner: &Stream<'_>,
1054 data: &[u8],
1055) -> Option<crate::data::ObjectStreamOffsets> {
1056 let num_objects = inner.dict().get::<usize>(N)?;
1057 let first_offset = inner.dict().get::<usize>(FIRST)?;
1058
1059 let mut r = Reader::new(data);
1060 let mut offsets = Vec::with_capacity(num_objects);
1061
1062 for _ in 0..num_objects {
1063 r.skip_white_spaces_and_comments();
1064 let obj_num = r.read_without_context::<u32>()?;
1066 r.skip_white_spaces_and_comments();
1067 let relative_offset = r.read_without_context::<usize>()?;
1068 offsets.push((obj_num, first_offset + relative_offset));
1069 }
1070
1071 Some(offsets)
1072}
1073
1074struct ObjectStream<'a> {
1083 data: &'a [u8],
1084 ctx: ReaderContext<'a>,
1085 offsets: Arc<crate::data::ObjectStreamOffsets>,
1086}
1087
1088impl<'a> ObjectStream<'a> {
1089 fn new(inner: Stream<'_>, data: &'a [u8], ctx: &ReaderContext<'a>) -> Option<Self> {
1093 let offsets = Arc::new(parse_object_stream_offsets(&inner, data)?);
1094
1095 let mut ctx = ctx.clone();
1096 ctx.set_in_object_stream(true);
1097
1098 Some(Self { data, ctx, offsets })
1099 }
1100
1101 fn from_cached_offsets(
1105 data: &'a [u8],
1106 ctx: &ReaderContext<'a>,
1107 offsets: Arc<crate::data::ObjectStreamOffsets>,
1108 ) -> Self {
1109 let mut ctx = ctx.clone();
1110 ctx.set_in_object_stream(true);
1111
1112 Self { data, ctx, offsets }
1113 }
1114
1115 fn get<T>(&self, index: u32) -> Option<T>
1116 where
1117 T: ObjectLike<'a>,
1118 {
1119 let offset = self.offsets.get(index as usize)?.1;
1120 let mut r = Reader::new(self.data);
1121 r.jump(offset);
1122 r.skip_white_spaces_and_comments();
1123
1124 r.read_with_context::<T>(&self.ctx)
1125 }
1126}
1127
1128fn parse_metadata(info_dict: &Dict<'_>) -> Metadata {
1129 Metadata {
1130 creation_date: info_dict
1131 .get::<object::String>(CREATION_DATE)
1132 .and_then(|c| DateTime::from_bytes(&c)),
1133 modification_date: info_dict
1134 .get::<object::String>(MOD_DATE)
1135 .and_then(|c| DateTime::from_bytes(&c)),
1136 title: info_dict.get::<object::String>(TITLE).map(|t| t.to_vec()),
1137 author: info_dict.get::<object::String>(AUTHOR).map(|t| t.to_vec()),
1138 subject: info_dict.get::<object::String>(SUBJECT).map(|t| t.to_vec()),
1139 keywords: info_dict
1140 .get::<object::String>(KEYWORDS)
1141 .map(|t| t.to_vec()),
1142 creator: info_dict.get::<object::String>(CREATOR).map(|t| t.to_vec()),
1143 producer: info_dict
1144 .get::<object::String>(PRODUCER)
1145 .map(|t| t.to_vec()),
1146 }
1147}
1148
/// Tests for the per-document cache of object-stream offset tables.
///
/// The fixture-based tests return early (and therefore pass) when the
/// fixture file cannot be read or loaded.
#[cfg(test)]
mod qf2b_objectstream_cache_tests {
    use crate::pdf::Pdf;
    use crate::xref::parse_object_stream_offsets;

    // Path of the fixture PDF, relative to the directory the tests run in.
    const FIXTURE: &str = "../xfa-golden-tests/golden/13a7b224_xfa_issue14315.pdf";

    /// Loads the fixture, or returns `None` if it is unavailable or invalid.
    fn load_fixture() -> Option<Pdf> {
        let bytes = std::fs::read(FIXTURE).ok()?;
        Pdf::new(bytes).ok()
    }

    /// The cache is populated during construction and does not grow when the
    /// same objects are resolved again.
    #[test]
    fn qf2b_objstm_cache_populates_and_is_stable_on_repeat() {
        let Some(pdf) = load_fixture() else {
            return;
        };

        let xref = pdf.xref();

        // Sampled before any explicit lookup in this test.
        let after_construction = xref.object_stream_offsets_cache_len();
        assert!(
            after_construction >= 1,
            "fixture is a PDF 1.5+ doc with /ObjStm; at least one offsets table should already be cached after construction; got {after_construction}"
        );

        // Resolving the root again must not add a cache entry.
        let _: Option<crate::object::Dict<'_>> = xref.get(xref.root_id());
        assert_eq!(
            xref.object_stream_offsets_cache_len(),
            after_construction,
            "repeated resolution of the same indirect object must reuse the cached offsets table"
        );

        // First scan may warm the cache further ...
        for raw in 1..=20i32 {
            let id = crate::object::ObjectIdentifier::new(raw, 0);
            let _: Option<crate::object::Dict<'_>> = xref.get(id);
        }
        let after_scan = xref.object_stream_offsets_cache_len();

        // ... but an identical second scan must leave its size unchanged.
        for raw in 1..=20i32 {
            let id = crate::object::ObjectIdentifier::new(raw, 0);
            let _: Option<crate::object::Dict<'_>> = xref.get(id);
        }
        assert_eq!(
            xref.object_stream_offsets_cache_len(),
            after_scan,
            "repeated full scans must be cache-stable (no re-parse)"
        );
    }

    /// Resolving objects through one loaded document does not change the
    /// cache size of another document loaded from the same file.
    #[test]
    fn qf2b_two_pdfs_have_independent_caches() {
        let Some(pdf_a) = load_fixture() else {
            return;
        };
        let Some(pdf_b) = load_fixture() else {
            return;
        };

        let base_a = pdf_a.xref().object_stream_offsets_cache_len();
        let base_b = pdf_b.xref().object_stream_offsets_cache_len();
        assert_eq!(base_a, base_b);

        // Warm only `pdf_a`.
        for raw in 1..=50i32 {
            let id = crate::object::ObjectIdentifier::new(raw, 0);
            let _: Option<crate::object::Dict<'_>> = pdf_a.xref().get(id);
        }
        let warm_a = pdf_a.xref().object_stream_offsets_cache_len();

        assert_eq!(
            pdf_b.xref().object_stream_offsets_cache_len(),
            base_b,
            "pdf_b cache must be independent of pdf_a's warming (base_a={base_a}, warm_a={warm_a}, base_b={base_b})"
        );
    }

    /// A header that declares three objects but contains only one pair must
    /// make the parser return `None`.
    #[test]
    fn qf2b_parse_helper_returns_none_on_truncated_header() {
        use crate::object::Stream;
        use crate::reader::{Reader, ReaderContext, ReaderExt};
        use crate::xref::DUMMY_XREF;

        // Synthetic stream object whose dictionary claims `/N 3`.
        let raw: &[u8] = b"1 0 obj <</N 3 /First 6 /Length 4>>\nstream\n1 0 \nendstream\nendobj\n";
        let mut r = Reader::new(raw);
        let ctx = ReaderContext::new(&DUMMY_XREF, false);
        let stream: Stream<'_> = r
            .read_with_context::<crate::object::indirect::IndirectObject<Stream<'_>>>(&ctx)
            .expect("synthetic stream should parse")
            .get();

        // Only one (object number, offset) pair is present.
        let body: &[u8] = b"1 0 ";
        assert!(
            parse_object_stream_offsets(&stream, body).is_none(),
            "truncated headers must not produce a partial offsets table"
        );
    }

    /// Timing comparison between re-parsing offset tables and fetching them
    /// from the cache. Ignored by default; skips itself when its fixture is
    /// missing.
    #[test]
    #[ignore = "perf measurement; run with `cargo test --release -- --ignored qf2b_bench`"]
    fn qf2b_bench_offsets_parse_vs_cached() {
        use std::time::Instant;

        let path = "../../corpus/f3800.pdf";
        let Ok(bytes) = std::fs::read(path) else {
            eprintln!("[qf2b_bench] fixture {path} unavailable; skipping");
            return;
        };
        let pdf = Pdf::new(bytes).expect("load f3800.pdf");
        let xref = pdf.xref();

        // Warm-up: resolve up to 3000 objects so the cache gets populated.
        let max_id = (xref.len() as i32).min(3000);
        for n in 1..=max_id {
            let id = crate::object::ObjectIdentifier::new(n, 0);
            let _: Option<crate::object::Object<'_>> = xref.get(id);
        }
        let cached_objstms = xref.object_stream_offsets_cache_len();
        assert!(
            cached_objstms >= 5,
            "fixture must trigger several /ObjStms (got {cached_objstms})"
        );

        // Collect the ids of `/ObjStm` streams, at most as many as are cached.
        let mut objstm_ids: Vec<crate::object::ObjectIdentifier> = Vec::new();
        for n in 1..=max_id {
            let id = crate::object::ObjectIdentifier::new(n, 0);
            if let Some(stream) = xref.get::<crate::object::Stream<'_>>(id)
                && stream
                    .dict()
                    .get::<crate::object::Name>(crate::object::dict::keys::TYPE)
                    .as_deref()
                    == Some(b"ObjStm")
            {
                objstm_ids.push(id);
            }
            if objstm_ids.len() >= cached_objstms {
                break;
            }
        }
        let containers = objstm_ids.len();
        assert!(containers > 0);

        const REPEATS: u32 = 200;
        // Measurement 1: resolve, decode and parse on every iteration.
        // The sinks accumulate table lengths so both paths can be compared.
        let mut sink_parse = 0usize;
        let t_parse = Instant::now();
        for _ in 0..REPEATS {
            for id in &objstm_ids {
                let stream = xref
                    .get::<crate::object::Stream<'_>>(*id)
                    .expect("stream resolves");
                let Ok(decoded) = stream.decoded() else {
                    continue;
                };
                if let Some(offs) = parse_object_stream_offsets(&stream, &decoded) {
                    sink_parse = sink_parse.wrapping_add(offs.len());
                }
            }
        }
        let parse_elapsed = t_parse.elapsed();

        // Measurement 2: go through the cache; the init closure is only the
        // fallback for a missing entry.
        let inner = match &xref.0 {
            crate::xref::Inner::Some(r) => r.clone(),
            _ => unreachable!(),
        };
        let mut sink_cache = 0usize;
        let t_cache = Instant::now();
        for _ in 0..REPEATS {
            for id in &objstm_ids {
                let offs = inner
                    .data
                    .get_object_stream_offsets_or_init(*id, || {
                        let stream = xref
                            .get::<crate::object::Stream<'_>>(*id)
                            .expect("stream resolves");
                        let decoded = stream.decoded().ok()?;
                        parse_object_stream_offsets(&stream, &decoded)
                    })
                    .expect("cached entry must exist after warm-up");
                sink_cache = sink_cache.wrapping_add(offs.len());
            }
        }
        let cache_elapsed = t_cache.elapsed();

        assert_eq!(
            sink_parse, sink_cache,
            "parsed and cached results must agree on offset-count totals"
        );

        // The divisor of `speedup` is clamped away from zero; `reduction`
        // divides by the unclamped parse time.
        let speedup = parse_elapsed.as_secs_f64() / cache_elapsed.as_secs_f64().max(1e-9);
        let reduction = (1.0 - cache_elapsed.as_secs_f64() / parse_elapsed.as_secs_f64()) * 100.0;

        eprintln!("[qf2b_bench] fixture: f3800.pdf");
        eprintln!("[qf2b_bench] /ObjStm containers measured: {containers}");
        eprintln!("[qf2b_bench] iterations per container: {REPEATS}");
        eprintln!("[qf2b_bench] direct re-parse total: {parse_elapsed:?}");
        eprintln!("[qf2b_bench] cached lookup total: {cache_elapsed:?}");
        eprintln!("[qf2b_bench] speedup: {speedup:.1}x");
        eprintln!("[qf2b_bench] parse-time reduction: {reduction:.1}%");

        assert!(
            reduction >= 10.0,
            "QF2-B acceptance: ≥ 10 % parse-time reduction required; got {reduction:.2} %"
        );
    }
}