1use crate::PdfData;
4use crate::data::Data;
5use crate::object::ObjectIdentifier;
6use crate::object::array::Array;
7use crate::object::dict::Dict;
8use crate::object::dict::keys::{FIRST, INDEX, N, PAGES, PREV, ROOT, SIZE, W, XREF_STM};
9use crate::object::indirect::IndirectObject;
10use crate::object::stream::Stream;
11use crate::object::{Object, ObjectLike};
12use crate::reader::{Readable, Reader};
13use log::{error, warn};
14use rustc_hash::FxHashMap;
15use std::cmp::max;
16use std::iter;
17use std::sync::{Arc, RwLock};
18
/// Size in bytes of a single entry in a classic (table-style) xref section.
///
/// The layout read by `XRefEntry::read` is a 10-digit offset, a separator
/// byte, a 5-digit generation number, a separator byte and the `n`/`f`
/// keyword; the two trailing bytes are presumably the end-of-line marker.
pub(crate) const XREF_ENTRY_LEN: usize = 10 + 1 + 5 + 1 + 1 + 2;
20
21pub(crate) fn root_xref(data: PdfData) -> Option<XRef> {
23 let mut xref_map = FxHashMap::default();
24 let xref_pos = find_last_xref_pos(data.as_ref().as_ref())?;
25 let trailer = populate_xref_impl(data.as_ref().as_ref(), xref_pos, &mut xref_map)?;
26
27 XRef::new(data.clone(), xref_map, &trailer, false)
28}
29
30pub(crate) fn fallback(data: PdfData) -> Option<XRef> {
32 warn!("xref table was invalid, trying to manually build xref table");
33 let (xref_map, trailer_dict) = fallback_xref_map(data.as_ref().as_ref());
34
35 if let Some(trailer_dict_data) = trailer_dict {
36 warn!("rebuild xref table with {} entries", xref_map.len());
37
38 XRef::new(data.clone(), xref_map, trailer_dict_data, true)
39 } else {
40 warn!("couldn't find trailer dictionary, failed to rebuild xref table");
41
42 None
43 }
44}
45
46fn fallback_xref_map(data: &[u8]) -> (XrefMap, Option<&[u8]>) {
47 let mut xref_map = FxHashMap::default();
48 let mut trailer_dict = None;
49
50 let mut r = Reader::new(data);
51
52 loop {
53 let cur_pos = r.offset();
54
55 if let Some(obj) = r.read_without_xref::<ObjectIdentifier>() {
56 xref_map.insert(obj, EntryType::Normal(cur_pos));
57 } else if let Some(dict) = r.read::<false, Dict>(XRef::dummy()) {
58 if dict.contains_key(SIZE) && dict.contains_key(ROOT) {
59 trailer_dict = Some(dict);
60 }
61 } else {
62 r.read_byte();
63 }
64
65 if r.at_end() {
66 break;
67 }
68 }
69
70 (xref_map, trailer_dict.map(|d| d.data()))
71}
72
73static DUMMY_XREF: &'static XRef = &XRef(Inner::Dummy);
74
75#[derive(Debug)]
77pub struct XRef(Inner);
78
79impl XRef {
80 fn new(
81 data: PdfData,
82 xref_map: XrefMap,
83 trailer_dict_data: &[u8],
84 repaired: bool,
85 ) -> Option<Self> {
86 let trailer_data = TrailerData::dummy();
90
91 let mut xref = Self(Inner::Some {
92 data: Data::new(data),
93 map: Arc::new(RwLock::new(SomeRepr { xref_map, repaired })),
94 trailer_data,
95 });
96
97 let mut r = Reader::new(&trailer_dict_data);
98 let trailer_dict = r.read_with_xref::<Dict>(&xref)?;
99 let root = trailer_dict.get::<Dict>(ROOT)?;
100 let pages_ref = root.get_ref(PAGES)?;
101
102 let td = TrailerData {
103 pages_ref: pages_ref.into(),
104 };
105
106 match &mut xref.0 {
107 Inner::Dummy => unreachable!(),
108 Inner::Some { trailer_data, .. } => {
109 *trailer_data = td;
110 }
111 }
112
113 Some(xref)
114 }
115
116 pub(crate) fn dummy() -> &'static XRef {
117 DUMMY_XREF
118 }
119
120 pub(crate) fn len(&self) -> usize {
121 match &self.0 {
122 Inner::Dummy => 0,
123 Inner::Some { map, .. } => map.read().unwrap().xref_map.len(),
124 }
125 }
126
127 pub(crate) fn trailer_data(&self) -> &TrailerData {
128 match &self.0 {
129 Inner::Dummy => unreachable!(),
130 Inner::Some { trailer_data, .. } => trailer_data,
131 }
132 }
133
134 pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
135 match &self.0 {
136 Inner::Dummy => unimplemented!(),
137 Inner::Some { map, .. } => iter::from_fn(move || {
138 let locked = map.read().unwrap();
139 let mut iter = locked.xref_map.keys();
140
141 iter.next().and_then(|k| self.get(*k))
142 }),
143 }
144 }
145
146 pub(crate) fn repair(&self) {
147 let Inner::Some { map, data, .. } = &self.0 else {
148 unreachable!();
149 };
150
151 let mut locked = map.try_write().unwrap();
152 assert!(!locked.repaired);
153
154 let (xref_map, _) = fallback_xref_map(data.get());
155 locked.xref_map = xref_map;
156 locked.repaired = true;
157 }
158
159 pub(crate) fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
160 where
161 T: ObjectLike<'a>,
162 {
163 let Inner::Some { map, data, .. } = &self.0 else {
164 return None;
165 };
166
167 let locked = map.try_read().unwrap();
168 let repaired = locked.repaired;
169
170 let mut r = Reader::new(data.get());
171
172 let entry = *locked.xref_map.get(&id).or_else(|| {
173 None
176 })?;
177 drop(locked);
178
179 match entry {
180 EntryType::Normal(offset) => {
181 r.jump(offset);
182
183 if let Some(object) = r.read_with_xref::<IndirectObject<T>>(self) {
184 if object.id() == &id {
185 return Some(object.get());
186 }
187 } else {
188 if r.skip_non_plain::<IndirectObject<Object>>().is_some() {
191 return None;
192 }
193 };
194
195 if repaired {
197 error!(
198 "attempt was made at repairing xref, but object {:?} still couldn't be read",
199 id
200 );
201
202 None
203 } else {
204 warn!("broken xref, attempting to repair");
205
206 self.repair();
207
208 self.get::<T>(id)
210 }
211 }
212 EntryType::ObjStream(id, index) => {
213 let id = ObjectIdentifier::new(id as i32, 0);
215
216 let stream = self.get::<Stream>(id)?;
217 let data = data.get_with(id, self)?;
218 let object_stream = ObjectStream::new(stream, data, self)?;
219 object_stream.get(index)
220 }
221 }
222 }
223}
224
225pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
226 let mut finder = Reader::new(data);
227 let mut pos = finder.len() - 1;
228 finder.jump(pos);
229
230 let needle = b"startxref";
231
232 loop {
233 if finder.forward_tag(needle).is_some() {
234 finder.skip_white_spaces_and_comments();
235
236 let offset = finder.read_without_xref::<i32>()?.try_into().ok()?;
237
238 return Some(offset);
239 }
240
241 pos = pos.checked_sub(1)?;
242 finder.jump(pos);
243 }
244}
245
/// Where the data of an object can be found.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum EntryType {
    /// The indirect object is read from this byte offset of the file data.
    Normal(usize),
    /// The object is stored inside an object stream: the first field is the
    /// object number of that stream, the second the index of the object
    /// within it.
    ObjStream(u32, u32),
}
256
257type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
258
259#[derive(Debug)]
261struct SomeRepr {
262 xref_map: XrefMap,
263 repaired: bool,
264}
265
266#[derive(Debug, Copy, Clone)]
267pub(crate) struct TrailerData {
268 pub pages_ref: ObjectIdentifier,
269}
270
271impl TrailerData {
272 pub fn dummy() -> Self {
273 Self {
274 pages_ref: ObjectIdentifier::new(0, 0),
275 }
276 }
277}
278
279#[derive(Debug)]
280enum Inner {
281 Dummy,
283 Some {
285 data: Data,
286 map: Arc<RwLock<SomeRepr>>,
287 trailer_data: TrailerData,
288 },
289}
290
/// One parsed entry of a classic (table-style) xref section.
#[derive(Debug)]
struct XRefEntry {
    /// Value of the 10-digit first field of the entry.
    offset: usize,
    /// Value of the 5-digit second field of the entry.
    gen_number: i32,
    /// `true` if the entry's keyword byte is `n`.
    used: bool,
}

impl XRefEntry {
    /// Parses a single xref table entry.
    ///
    /// Expected layout: bytes `0..10` are the decimal offset, bytes `11..16`
    /// the decimal generation number and byte `17` the keyword (`n` marks the
    /// entry as used; anything else as unused). The separator bytes are not
    /// inspected.
    ///
    /// Returns `None` if `data` is too short, if a numeric field contains a
    /// non-digit byte, or if a value does not fit its target type.
    pub(crate) fn read(data: &[u8]) -> Option<XRefEntry> {
        /// Parses a run of ASCII digits. Accumulates in `u64` because a
        /// 10-digit field can exceed `u32::MAX`.
        fn parse_digits(digits: &[u8]) -> Option<u64> {
            digits.iter().try_fold(0u64, |accum, &byte| match byte {
                b'0'..=b'9' => accum
                    .checked_mul(10)?
                    .checked_add(u64::from(byte - b'0')),
                _ => None,
            })
        }

        let offset = usize::try_from(parse_digits(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_digits(data.get(11..16)?)?).ok()?;
        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
328
329fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
330 let mut reader = Reader::new(data);
331 reader.jump(pos);
332
333 let mut r2 = reader.clone();
334 if reader
335 .clone()
336 .read_without_xref::<ObjectIdentifier>()
337 .is_some()
338 {
339 populate_from_xref_stream(data, &mut r2, xref_map)
340 } else {
341 populate_from_xref_table(data, &mut r2, xref_map)
342 }
343}
344
345pub(super) struct SubsectionHeader {
346 pub(super) start: u32,
347 pub(super) num_entries: u32,
348}
349
350impl Readable<'_> for SubsectionHeader {
351 fn read<const PLAIN: bool>(r: &mut Reader<'_>, _: &XRef) -> Option<Self> {
352 r.skip_white_spaces();
353 let start = r.read_without_xref::<u32>()?;
354 r.skip_white_spaces();
355 let num_entries = r.read_without_xref::<u32>()?;
356 r.skip_white_spaces();
357
358 Some(Self { start, num_entries })
359 }
360}
361
362fn populate_from_xref_table<'a>(
364 data: &'a [u8],
365 reader: &mut Reader<'a>,
366 insert_map: &mut XrefMap,
367) -> Option<&'a [u8]> {
368 let trailer = {
369 let mut reader = reader.clone();
370 read_xref_table_trailer(&mut reader, XRef::dummy())?
371 };
372
373 reader.skip_white_spaces();
374 reader.forward_tag(b"xref")?;
375 reader.skip_white_spaces();
376
377 let mut max_obj = 0;
378
379 if let Some(prev) = trailer.get::<i32>(PREV) {
380 populate_xref_impl(data, prev as usize, insert_map)?;
382 }
383
384 if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
387 populate_xref_impl(data, xref_stm as usize, insert_map)?;
388 }
389
390 while let Some(header) = reader.read_without_xref::<SubsectionHeader>() {
391 reader.skip_white_spaces();
392
393 let start = header.start;
394 let end = start + header.num_entries;
395
396 for obj_number in start..end {
397 max_obj = max(max_obj, obj_number);
398 let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
399 let entry = XRefEntry::read(bytes)?;
400
401 if entry.used {
404 insert_map.insert(
405 ObjectIdentifier::new(obj_number as i32, entry.gen_number),
406 EntryType::Normal(entry.offset),
407 );
408 }
409 }
410 }
411
412 Some(trailer.data())
413}
414
415fn populate_from_xref_stream<'a>(
416 data: &'a [u8],
417 reader: &mut Reader<'a>,
418 insert_map: &mut XrefMap,
419) -> Option<&'a [u8]> {
420 let stream = reader
421 .read_with_xref::<IndirectObject<Stream>>(XRef::dummy())?
422 .get();
423
424 if let Some(prev) = stream.dict().get::<i32>(PREV) {
425 let _ = populate_xref_impl(data, prev as usize, insert_map)?;
427 }
428
429 let size = stream.dict().get::<u32>(SIZE)?;
430
431 let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
432
433 if f2_len > size_of::<u32>() as u8 {
434 error!("xref offset length is larger than the allowed limit");
435
436 return None;
437 }
438
439 if f1_len != 1 {
441 warn!("first field in xref stream was longer than 1");
442 }
443
444 let xref_data = stream.decoded()?;
445 let mut xref_reader = Reader::new(xref_data.as_ref());
446
447 if let Some(arr) = stream.dict().get::<Array>(INDEX) {
448 let mut iter = arr.iter::<(u32, u32)>();
449
450 while let Some((start, num_elements)) = iter.next() {
451 xref_stream_subsection(
452 &mut xref_reader,
453 start,
454 num_elements,
455 f1_len,
456 f2_len,
457 f3_len,
458 insert_map,
459 )?;
460 }
461 } else {
462 xref_stream_subsection(
463 &mut xref_reader,
464 0,
465 size,
466 f1_len,
467 f2_len,
468 f3_len,
469 insert_map,
470 )?;
471 }
472
473 Some(stream.dict().data())
474}
475
476fn xref_stream_num<'a>(data: &[u8]) -> Option<u32> {
477 Some(match data.len() {
478 0 => return None,
479 1 => u8::from_be(data[0]) as u32,
480 2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
481 3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
482 4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
483 n => {
484 warn!("invalid xref stream number {}", n);
485
486 return None;
487 }
488 })
489}
490
491fn xref_stream_subsection<'a>(
492 xref_reader: &mut Reader<'a>,
493 start: u32,
494 num_elements: u32,
495 f1_len: u8,
496 f2_len: u8,
497 f3_len: u8,
498 insert_map: &mut XrefMap,
499) -> Option<()> {
500 for i in 0..num_elements {
501 let f_type = if f1_len == 0 {
502 1
503 } else {
504 xref_reader.read_bytes(1)?[0]
506 };
507
508 let obj_number = start + i;
509
510 match f_type {
511 0 => {
513 xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
514 }
515 1 => {
516 let offset = if f2_len > 0 {
517 let data = xref_reader.read_bytes(f2_len as usize)?;
518 xref_stream_num(data)?
519 } else {
520 0
521 };
522
523 let gen_number = if f3_len > 0 {
524 let data = xref_reader.read_bytes(f3_len as usize)?;
525 xref_stream_num(data)?
526 } else {
527 0
528 };
529
530 insert_map.insert(
531 ObjectIdentifier::new(obj_number as i32, gen_number as i32),
532 EntryType::Normal(offset as usize),
533 );
534 }
535 2 => {
536 let obj_stream_number = {
537 let data = xref_reader.read_bytes(f2_len as usize)?;
538 xref_stream_num(data)?
539 };
540 let gen_number = 0;
541 let index = if f3_len > 0 {
542 let data = xref_reader.read_bytes(f3_len as usize)?;
543 xref_stream_num(data)?
544 } else {
545 0
546 };
547
548 insert_map.insert(
549 ObjectIdentifier::new(obj_number as i32, gen_number),
550 EntryType::ObjStream(obj_stream_number, index),
551 );
552 }
553 _ => {
554 warn!("xref has unknown field type {}", f_type);
555
556 return None;
557 }
558 }
559 }
560
561 Some(())
562}
563
564fn read_xref_table_trailer<'a>(reader: &mut Reader<'a>, xref: &'a XRef) -> Option<Dict<'a>> {
565 reader.skip_white_spaces();
566 reader.forward_tag(b"xref")?;
567 reader.skip_white_spaces();
568
569 while let Some(header) = reader.read_without_xref::<SubsectionHeader>() {
570 reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
571 }
572
573 reader.skip_white_spaces();
574 reader.forward_tag(b"trailer")?;
575 reader.skip_white_spaces();
576
577 reader.read_with_xref::<Dict>(xref)
578}
579
580struct ObjectStream<'a> {
581 data: &'a [u8],
582 xref: &'a XRef,
583 offsets: Vec<usize>,
584}
585
586impl<'a> ObjectStream<'a> {
587 pub fn new(inner: Stream<'a>, data: &'a [u8], xref: &'a XRef) -> Option<Self> {
588 let num_objects = inner.dict().get::<usize>(N)?;
589 let first_offset = inner.dict().get::<usize>(FIRST)?;
590
591 let mut r = Reader::new(data.as_ref());
592
593 let mut offsets = vec![];
594
595 for _ in 0..num_objects {
596 r.skip_white_spaces_and_comments();
597 let _ = r.read_without_xref::<u32>()?;
599 r.skip_white_spaces_and_comments();
600 let relative_offset = r.read_without_xref::<usize>()?;
601 offsets.push(first_offset + relative_offset);
602 }
603
604 Some(Self {
605 data,
606 xref,
607 offsets,
608 })
609 }
610
611 pub fn get<T>(&self, index: u32) -> Option<T>
612 where
613 T: ObjectLike<'a>,
614 {
615 let offset = *self.offsets.get(index as usize)?;
616 let mut r = Reader::new(&self.data);
617 r.jump(offset);
618
619 r.read_with_xref::<T>(&self.xref)
620 }
621}