1use crate::PdfData;
4use crate::data::Data;
5use crate::object::Array;
6use crate::object::Dict;
7use crate::object::Name;
8use crate::object::ObjectIdentifier;
9use crate::object::Stream;
10use crate::object::dict::keys::{
11 ENCRYPT, FIRST, INDEX, N, PAGES, PREV, ROOT, SIZE, TYPE, VERSION, W, XREF_STM,
12};
13use crate::object::indirect::IndirectObject;
14use crate::object::{Object, ObjectLike};
15use crate::pdf::PdfVersion;
16use crate::reader::{Readable, Reader, ReaderContext};
17use log::{error, warn};
18use rustc_hash::FxHashMap;
19use std::cmp::max;
20use std::iter;
21use std::ops::Deref;
22use std::sync::{Arc, RwLock};
23
/// Number of bytes occupied by one entry of a classic cross-reference table.
///
/// The table reader consumes exactly this many bytes per entry, and the
/// trailer scanner uses it to skip over whole subsections.
pub(crate) const XREF_ENTRY_LEN: usize = 20;
25
/// Reasons why building an [`XRef`] can fail.
#[derive(Clone, Copy, Debug)]
pub(crate) enum XRefError {
    /// The cross-reference data or the trailer could not be located or parsed.
    Unknown,
    /// The trailer dictionary contains an `Encrypt` entry.
    Encrypted,
}
31
32pub(crate) fn root_xref(data: PdfData) -> Result<XRef, XRefError> {
34 let mut xref_map = FxHashMap::default();
35 let xref_pos = find_last_xref_pos(data.as_ref().as_ref()).ok_or(XRefError::Unknown)?;
36 let trailer = populate_xref_impl(data.as_ref().as_ref(), xref_pos, &mut xref_map)
37 .ok_or(XRefError::Unknown)?;
38
39 XRef::new(data.clone(), xref_map, trailer, false)
40}
41
42pub(crate) fn fallback(data: PdfData) -> Option<XRef> {
44 warn!("xref table was invalid, trying to manually build xref table");
45 let (xref_map, trailer_dict) = fallback_xref_map(data.as_ref().as_ref());
46
47 if let Some(trailer_dict_data) = trailer_dict {
48 warn!("rebuild xref table with {} entries", xref_map.len());
49
50 XRef::new(data.clone(), xref_map, trailer_dict_data, true).ok()
51 } else {
52 warn!("couldn't find trailer dictionary, failed to rebuild xref table");
53
54 None
55 }
56}
57
58fn fallback_xref_map(data: &[u8]) -> (XrefMap, Option<&[u8]>) {
59 let mut xref_map = FxHashMap::default();
60 let mut trailer_dict = None;
61
62 let mut r = Reader::new(data);
63
64 let dummy_ctx = ReaderContext::dummy();
65 let mut last_obj_num = None;
66
67 loop {
68 let cur_pos = r.offset();
69
70 let mut old_r = r.clone();
71
72 if let Some(obj_id) = r.read::<ObjectIdentifier>(dummy_ctx) {
73 xref_map.insert(obj_id, EntryType::Normal(cur_pos));
74 last_obj_num = Some(obj_id);
75 } else if let Some(dict) = r.read::<Dict>(dummy_ctx) {
76 if dict.contains_key(SIZE) && dict.contains_key(ROOT) {
77 trailer_dict = Some(dict.clone());
78 }
79
80 if let Some(stream) = old_r.read::<Stream>(dummy_ctx) {
81 if stream.dict().get::<Name>(TYPE).as_deref() == Some(b"ObjStm")
82 && let Some(data) = stream.decoded().ok()
83 && let Some(last_obj_num) = last_obj_num
84 {
85 if let Some(obj_stream) = ObjectStream::new(stream, &data, dummy_ctx) {
86 for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
87 let id = ObjectIdentifier::new(*obj_num as i32, 0);
88 xref_map.insert(
89 id,
90 EntryType::ObjStream(last_obj_num.obj_num as u32, idx as u32),
91 );
92 }
93 }
94 }
95 }
96 } else {
97 r.read_byte();
98 }
99
100 if r.at_end() {
101 break;
102 }
103 }
104
105 (xref_map, trailer_dict.map(|d| d.data()))
106}
107
108static DUMMY_XREF: &XRef = &XRef(Inner::Dummy);
109
110#[derive(Debug, Clone)]
112pub struct XRef(Inner);
113
114impl XRef {
115 fn new(
116 data: PdfData,
117 xref_map: XrefMap,
118 trailer_dict_data: &[u8],
119 repaired: bool,
120 ) -> Result<Self, XRefError> {
121 let trailer_data = TrailerData::dummy();
125
126 let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
127 data: Arc::new(Data::new(data)),
128 map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
129 trailer_data,
130 })));
131
132 let mut r = Reader::new(trailer_dict_data);
133 let trailer_dict = r
134 .read_with_context::<Dict>(ReaderContext::new(&xref, false))
135 .ok_or(XRefError::Unknown)?;
136
137 if trailer_dict.get::<Dict>(ENCRYPT).is_some() {
138 warn!("encrypted PDF files are not yet supported");
139
140 return Err(XRefError::Encrypted);
141 }
142
143 let root = trailer_dict.get::<Dict>(ROOT).ok_or(XRefError::Unknown)?;
144 let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
145 let version = root
146 .get::<Name>(VERSION)
147 .and_then(|v| PdfVersion::from_bytes(v.deref()));
148
149 let td = TrailerData {
150 pages_ref: pages_ref.into(),
151 version,
152 };
153
154 match &mut xref.0 {
155 Inner::Dummy => unreachable!(),
156 Inner::Some(r) => {
157 Arc::make_mut(r).trailer_data = td;
158 }
159 }
160
161 Ok(xref)
162 }
163
164 fn is_repaired(&self) -> bool {
165 match &self.0 {
166 Inner::Dummy => false,
167 Inner::Some(r) => {
168 let locked = r.map.read().unwrap();
169 locked.repaired
170 }
171 }
172 }
173
174 pub(crate) fn dummy() -> &'static XRef {
175 DUMMY_XREF
176 }
177
178 pub(crate) fn len(&self) -> usize {
179 match &self.0 {
180 Inner::Dummy => 0,
181 Inner::Some(r) => r.map.read().unwrap().xref_map.len(),
182 }
183 }
184
185 pub(crate) fn trailer_data(&self) -> &TrailerData {
186 match &self.0 {
187 Inner::Dummy => unreachable!(),
188 Inner::Some(r) => &r.trailer_data,
189 }
190 }
191
192 pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
193 match &self.0 {
194 Inner::Dummy => unimplemented!(),
195 Inner::Some(r) => iter::from_fn(move || {
196 let locked = r.map.read().unwrap();
197 let mut iter = locked.xref_map.keys();
198
199 iter.next().and_then(|k| self.get(*k))
200 }),
201 }
202 }
203
204 pub(crate) fn repair(&self) {
205 let Inner::Some(r) = &self.0 else {
206 unreachable!();
207 };
208
209 let mut locked = r.map.try_write().unwrap();
210 assert!(!locked.repaired);
211
212 let (xref_map, _) = fallback_xref_map(r.data.get());
213 locked.xref_map = xref_map;
214 locked.repaired = true;
215 }
216
217 #[allow(private_bounds)]
219 pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
220 where
221 T: ObjectLike<'a>,
222 {
223 let Inner::Some(repr) = &self.0 else {
224 return None;
225 };
226
227 let locked = repr.map.try_read().unwrap();
228
229 let mut r = Reader::new(repr.data.get());
230
231 let entry = *locked.xref_map.get(&id).or({
232 None
235 })?;
236 drop(locked);
237
238 match entry {
239 EntryType::Normal(offset) => {
240 r.jump(offset);
241
242 if let Some(object) =
243 r.read_with_context::<IndirectObject<T>>(ReaderContext::new(self, false))
244 {
245 if object.id() == &id {
246 return Some(object.get());
247 }
248 } else {
249 if r.skip_not_in_content_stream::<IndirectObject<Object>>()
252 .is_some()
253 {
254 return None;
255 }
256 };
257
258 if self.is_repaired() {
260 error!(
261 "attempt was made at repairing xref, but object {id:?} still couldn't be read"
262 );
263
264 None
265 } else {
266 warn!("broken xref, attempting to repair");
267
268 self.repair();
269
270 self.get::<T>(id)
272 }
273 }
274 EntryType::ObjStream(id, index) => {
275 let id = ObjectIdentifier::new(id as i32, 0);
277
278 let stream = self.get::<Stream>(id)?;
279 let data = repr.data.get_with(id, self)?;
280 let object_stream =
281 ObjectStream::new(stream, data, ReaderContext::new(self, false))?;
282 object_stream.get(index)
283 }
284 }
285 }
286}
287
288pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
289 let mut finder = Reader::new(data);
290 let mut pos = finder.len().checked_sub(1)?;
291 finder.jump(pos);
292
293 let needle = b"startxref";
294
295 loop {
296 if finder.forward_tag(needle).is_some() {
297 finder.skip_white_spaces_and_comments();
298
299 let offset = finder.read_without_context::<i32>()?.try_into().ok()?;
300
301 return Some(offset);
302 }
303
304 pos = pos.checked_sub(1)?;
305 finder.jump(pos);
306 }
307}
308
/// Where an object referenced by the cross-reference map is stored.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum EntryType {
    /// The object starts at this byte offset of the file data.
    Normal(usize),
    /// The object lives inside an object stream: the first value is the
    /// object number of that stream, the second the index within it.
    ObjStream(u32, u32),
}
319
320type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
321
322#[derive(Debug)]
324struct MapRepr {
325 xref_map: XrefMap,
326 repaired: bool,
327}
328
329#[derive(Debug, Copy, Clone)]
330pub(crate) struct TrailerData {
331 pub pages_ref: ObjectIdentifier,
332 pub version: Option<PdfVersion>,
333}
334
335impl TrailerData {
336 pub fn dummy() -> Self {
337 Self {
338 pages_ref: ObjectIdentifier::new(0, 0),
339 version: None,
340 }
341 }
342}
343
344#[derive(Debug, Clone)]
345struct SomeRepr {
346 data: Arc<Data>,
347 map: Arc<RwLock<MapRepr>>,
348 trailer_data: TrailerData,
349}
350
351#[derive(Debug, Clone)]
352enum Inner {
353 Dummy,
355 Some(Arc<SomeRepr>),
357}
358
/// One parsed entry of a classic cross-reference table.
#[derive(Debug)]
struct XRefEntry {
    /// Value of the 10-digit first field.
    offset: usize,
    /// Value of the 5-digit second field.
    gen_number: i32,
    /// `true` if the entry keyword is `n`.
    used: bool,
}

impl XRefEntry {
    /// Parses an entry laid out as `oooooooooo ggggg k`: ten digits, a
    /// separator byte, five digits, a separator byte and a keyword byte.
    ///
    /// Returns `None` if `data` is too short, a digit field contains a
    /// non-digit, or a value does not fit its target type.
    pub(crate) fn read(data: &[u8]) -> Option<XRefEntry> {
        /// Parses an all-digits ASCII field, failing on overflow.
        fn parse_decimal(digits: &[u8]) -> Option<u64> {
            digits.iter().try_fold(0u64, |acc, &byte| match byte {
                b'0'..=b'9' => acc.checked_mul(10)?.checked_add(u64::from(byte - b'0')),
                _ => None,
            })
        }

        // Ten decimal digits can exceed `u32::MAX`, so parse wide and narrow
        // with a checked conversion afterwards.
        let offset = usize::try_from(parse_decimal(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_decimal(data.get(11..16)?)?).ok()?;

        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
396
397fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
398 let mut reader = Reader::new(data);
399 reader.jump(pos);
400
401 let mut r2 = reader.clone();
402 if reader
403 .clone()
404 .read_without_context::<ObjectIdentifier>()
405 .is_some()
406 {
407 populate_from_xref_stream(data, &mut r2, xref_map)
408 } else {
409 populate_from_xref_table(data, &mut r2, xref_map)
410 }
411}
412
413pub(super) struct SubsectionHeader {
414 pub(super) start: u32,
415 pub(super) num_entries: u32,
416}
417
418impl Readable<'_> for SubsectionHeader {
419 fn read(r: &mut Reader<'_>, _: ReaderContext) -> Option<Self> {
420 r.skip_white_spaces();
421 let start = r.read_without_context::<u32>()?;
422 r.skip_white_spaces();
423 let num_entries = r.read_without_context::<u32>()?;
424 r.skip_white_spaces();
425
426 Some(Self { start, num_entries })
427 }
428}
429
430fn populate_from_xref_table<'a>(
432 data: &'a [u8],
433 reader: &mut Reader<'a>,
434 insert_map: &mut XrefMap,
435) -> Option<&'a [u8]> {
436 let trailer = {
437 let mut reader = reader.clone();
438 read_xref_table_trailer(&mut reader, ReaderContext::dummy())?
439 };
440
441 reader.skip_white_spaces();
442 reader.forward_tag(b"xref")?;
443 reader.skip_white_spaces();
444
445 let mut max_obj = 0;
446
447 if let Some(prev) = trailer.get::<i32>(PREV) {
448 populate_xref_impl(data, prev as usize, insert_map)?;
450 }
451
452 if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
455 populate_xref_impl(data, xref_stm as usize, insert_map)?;
456 }
457
458 while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
459 reader.skip_white_spaces();
460
461 let start = header.start;
462 let end = start + header.num_entries;
463
464 for obj_number in start..end {
465 max_obj = max(max_obj, obj_number);
466 let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
467 let entry = XRefEntry::read(bytes)?;
468
469 if entry.used {
472 insert_map.insert(
473 ObjectIdentifier::new(obj_number as i32, entry.gen_number),
474 EntryType::Normal(entry.offset),
475 );
476 }
477 }
478 }
479
480 Some(trailer.data())
481}
482
483fn populate_from_xref_stream<'a>(
484 data: &'a [u8],
485 reader: &mut Reader<'a>,
486 insert_map: &mut XrefMap,
487) -> Option<&'a [u8]> {
488 let stream = reader
489 .read_with_context::<IndirectObject<Stream>>(ReaderContext::dummy())?
490 .get();
491
492 if let Some(prev) = stream.dict().get::<i32>(PREV) {
493 let _ = populate_xref_impl(data, prev as usize, insert_map)?;
495 }
496
497 let size = stream.dict().get::<u32>(SIZE)?;
498
499 let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
500
501 if f2_len > size_of::<u64>() as u8 {
502 error!("xref offset length is larger than the allowed limit");
503
504 return None;
505 }
506
507 if f1_len != 1 {
509 warn!("first field in xref stream was longer than 1");
510 }
511
512 let xref_data = stream.decoded().ok()?;
513 let mut xref_reader = Reader::new(xref_data.as_ref());
514
515 if let Some(arr) = stream.dict().get::<Array>(INDEX) {
516 let iter = arr.iter::<(u32, u32)>();
517
518 for (start, num_elements) in iter {
519 xref_stream_subsection(
520 &mut xref_reader,
521 start,
522 num_elements,
523 f1_len,
524 f2_len,
525 f3_len,
526 insert_map,
527 )?;
528 }
529 } else {
530 xref_stream_subsection(
531 &mut xref_reader,
532 0,
533 size,
534 f1_len,
535 f2_len,
536 f3_len,
537 insert_map,
538 )?;
539 }
540
541 Some(stream.dict().data())
542}
543
544fn xref_stream_num(data: &[u8]) -> Option<u32> {
545 Some(match data.len() {
546 0 => return None,
547 1 => u8::from_be(data[0]) as u32,
548 2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
549 3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
550 4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
551 8 => {
552 if let Ok(num) = u32::try_from(u64::from_be_bytes(data[0..8].try_into().ok()?)) {
553 return Some(num);
554 } else {
555 warn!("xref stream number is too large");
556
557 return None;
558 }
559 }
560 n => {
561 warn!("invalid xref stream number {n}");
562
563 return None;
564 }
565 })
566}
567
568fn xref_stream_subsection<'a>(
569 xref_reader: &mut Reader<'a>,
570 start: u32,
571 num_elements: u32,
572 f1_len: u8,
573 f2_len: u8,
574 f3_len: u8,
575 insert_map: &mut XrefMap,
576) -> Option<()> {
577 for i in 0..num_elements {
578 let f_type = if f1_len == 0 {
579 1
580 } else {
581 xref_reader.read_bytes(1)?[0]
583 };
584
585 let obj_number = start + i;
586
587 match f_type {
588 0 => {
590 xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
591 }
592 1 => {
593 let offset = if f2_len > 0 {
594 let data = xref_reader.read_bytes(f2_len as usize)?;
595 xref_stream_num(data)?
596 } else {
597 0
598 };
599
600 let gen_number = if f3_len > 0 {
601 let data = xref_reader.read_bytes(f3_len as usize)?;
602 xref_stream_num(data)?
603 } else {
604 0
605 };
606
607 insert_map.insert(
608 ObjectIdentifier::new(obj_number as i32, gen_number as i32),
609 EntryType::Normal(offset as usize),
610 );
611 }
612 2 => {
613 let obj_stream_number = {
614 let data = xref_reader.read_bytes(f2_len as usize)?;
615 xref_stream_num(data)?
616 };
617 let gen_number = 0;
618 let index = if f3_len > 0 {
619 let data = xref_reader.read_bytes(f3_len as usize)?;
620 xref_stream_num(data)?
621 } else {
622 0
623 };
624
625 insert_map.insert(
626 ObjectIdentifier::new(obj_number as i32, gen_number),
627 EntryType::ObjStream(obj_stream_number, index),
628 );
629 }
630 _ => {
631 warn!("xref has unknown field type {f_type}");
632
633 return None;
634 }
635 }
636 }
637
638 Some(())
639}
640
641fn read_xref_table_trailer<'a>(
642 reader: &mut Reader<'a>,
643 ctx: ReaderContext<'a>,
644) -> Option<Dict<'a>> {
645 reader.skip_white_spaces();
646 reader.forward_tag(b"xref")?;
647 reader.skip_white_spaces();
648
649 while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
650 reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
651 }
652
653 reader.skip_white_spaces();
654 reader.forward_tag(b"trailer")?;
655 reader.skip_white_spaces();
656
657 reader.read_with_context::<Dict>(ctx)
658}
659
660struct ObjectStream<'a> {
661 data: &'a [u8],
662 ctx: ReaderContext<'a>,
663 offsets: Vec<(u32, usize)>,
664}
665
666impl<'a> ObjectStream<'a> {
667 fn new(inner: Stream<'a>, data: &'a [u8], ctx: ReaderContext<'a>) -> Option<Self> {
668 let num_objects = inner.dict().get::<usize>(N)?;
669 let first_offset = inner.dict().get::<usize>(FIRST)?;
670
671 let mut r = Reader::new(data);
672
673 let mut offsets = vec![];
674
675 for _ in 0..num_objects {
676 r.skip_white_spaces_and_comments();
677 let obj_num = r.read_without_context::<u32>()?;
679 r.skip_white_spaces_and_comments();
680 let relative_offset = r.read_without_context::<usize>()?;
681 offsets.push((obj_num, first_offset + relative_offset));
682 }
683
684 Some(Self { data, ctx, offsets })
685 }
686
687 fn get<T>(&self, index: u32) -> Option<T>
688 where
689 T: ObjectLike<'a>,
690 {
691 let offset = self.offsets.get(index as usize)?.1;
692 let mut r = Reader::new(self.data);
693 r.jump(offset);
694
695 r.read_with_context::<T>(self.ctx)
696 }
697}