1use std::collections::HashMap;
2
3use zpdf_core::{Error, ObjectId, ParseLimits, PdfDict, PdfObject, Result};
4
5use crate::lexer::Lexer;
6
7#[derive(Debug, Clone)]
8pub enum XrefEntry {
9 InUse {
10 offset: u64,
11 gen: u16,
12 },
13 Free {
14 next: u32,
15 gen: u16,
16 },
17 Compressed {
18 stream_obj: u32,
19 index_in_stream: u32,
20 },
21}
22
23#[derive(Debug, Clone, Default)]
24pub struct XrefTable {
25 entries: HashMap<ObjectId, XrefEntry>,
26}
27
28impl XrefTable {
29 pub fn new() -> Self {
30 Self::default()
31 }
32
33 pub fn get(&self, id: ObjectId) -> Option<&XrefEntry> {
34 self.entries.get(&id)
35 }
36
37 pub fn insert(&mut self, id: ObjectId, entry: XrefEntry) {
38 self.entries.entry(id).or_insert(entry);
39 }
40
41 pub fn insert_overwrite(&mut self, id: ObjectId, entry: XrefEntry) {
46 self.entries.insert(id, entry);
47 }
48
49 pub fn len(&self) -> usize {
50 self.entries.len()
51 }
52
53 pub fn is_empty(&self) -> bool {
54 self.entries.is_empty()
55 }
56
57 pub fn object_ids(&self) -> impl Iterator<Item = ObjectId> + '_ {
58 self.entries.keys().copied()
59 }
60}
61
62pub fn parse_xref_and_trailer(data: &[u8], limits: &ParseLimits) -> Result<(XrefTable, PdfDict)> {
63 let startxref_offset = find_startxref(data)?;
64 let mut table = XrefTable::new();
65
66 let xref_offset = startxref_offset;
67 let (trailer, next_prev) = parse_xref_section(data, xref_offset, &mut table, limits)?;
68
69 let mut visited = std::collections::HashSet::new();
73 visited.insert(xref_offset);
74 parse_hybrid_xrefstm(data, &trailer, &mut table, limits, &mut visited);
77 let mut prev = next_prev;
78 while let Some(prev_offset) = prev {
79 if !visited.insert(prev_offset as usize) {
80 break;
81 }
82 let (section_trailer, next) =
83 parse_xref_section(data, prev_offset as usize, &mut table, limits)?;
84 parse_hybrid_xrefstm(data, §ion_trailer, &mut table, limits, &mut visited);
85 prev = next;
86 }
87
88 Ok((table, trailer))
89}
90
91fn parse_hybrid_xrefstm(
99 data: &[u8],
100 trailer: &PdfDict,
101 table: &mut XrefTable,
102 limits: &ParseLimits,
103 visited: &mut std::collections::HashSet<usize>,
104) {
105 let Some(PdfObject::Integer(off)) = trailer.get("XRefStm") else {
106 return;
107 };
108 let Ok(off) = usize::try_from(*off) else {
109 tracing::warn!("/XRefStm offset {off} is negative; ignoring");
110 return;
111 };
112 if !visited.insert(off) {
114 return;
115 }
116 if let Err(e) = parse_xref_stream(data, off, table, limits) {
117 tracing::warn!("failed to parse /XRefStm at offset {off}: {e}");
118 }
119}
120
121fn parse_xref_section(
122 data: &[u8],
123 offset: usize,
124 table: &mut XrefTable,
125 limits: &ParseLimits,
126) -> Result<(PdfDict, Option<u64>)> {
127 if offset >= data.len() {
131 return Err(Error::InvalidXref(offset as u64));
132 }
133 if data[offset..].starts_with(b"xref") {
134 parse_traditional_xref(data, offset, table, limits)
135 } else {
136 parse_xref_stream(data, offset, table, limits)
137 }
138}
139
140fn parse_xref_stream(
141 data: &[u8],
142 offset: usize,
143 table: &mut XrefTable,
144 limits: &ParseLimits,
145) -> Result<(PdfDict, Option<u64>)> {
146 use crate::filters;
147 use crate::object_parser::ObjectParser;
148
149 let parser = ObjectParser::new(data, limits);
150 let obj = parser.parse_indirect_at(offset)?;
151 let stream = match obj {
152 PdfObject::Stream(s) => s,
153 _ => return Err(Error::InvalidXref(offset as u64)),
154 };
155
156 let dict = &stream.dict;
157 if dict.get_name("Type").unwrap_or("") != "XRef" {
158 return Err(Error::InvalidXref(offset as u64));
159 }
160
161 let size = dict.get_i64("Size")? as u32;
162
163 let w_arr = dict.get_array("W")?;
167 if w_arr.len() != 3 {
168 return Err(Error::InvalidXref(offset as u64));
169 }
170 let field_width = |obj: &PdfObject| -> Result<usize> {
171 let w = obj.as_i64()?;
172 if !(0..=8).contains(&w) {
173 return Err(Error::InvalidXref(offset as u64));
174 }
175 Ok(w as usize)
176 };
177 let w1 = field_width(&w_arr[0])?;
178 let w2 = field_width(&w_arr[1])?;
179 let w3 = field_width(&w_arr[2])?;
180 let entry_size = w1 + w2 + w3;
181 if entry_size == 0 {
182 return Err(Error::InvalidXref(offset as u64));
183 }
184
185 let decoded = filters::decode_stream(&stream.data, dict)?;
187
188 let index_ranges: Vec<(u32, u32)> = if let Ok(idx_arr) = dict.get_array("Index") {
190 idx_arr
191 .chunks(2)
192 .filter_map(|pair| {
193 if pair.len() == 2 {
194 Some((pair[0].as_i64().ok()? as u32, pair[1].as_i64().ok()? as u32))
195 } else {
196 None
197 }
198 })
199 .collect()
200 } else {
201 vec![(0, size)]
202 };
203
204 let mut pos = 0usize;
205 for &(start, count) in &index_ranges {
206 for i in 0..count {
207 if pos + entry_size > decoded.len() {
208 break;
209 }
210 let obj_num = start + i;
211
212 let field1 = read_field(&decoded[pos..], w1);
213 let field2 = read_field(&decoded[pos + w1..], w2);
214 let field3 = read_field(&decoded[pos + w1 + w2..], w3);
215 pos += entry_size;
216
217 let entry_type = if w1 == 0 { 1 } else { field1 as u8 };
218 let id = ObjectId(obj_num, field3 as u16);
219
220 match entry_type {
221 0 => {
222 table.insert(
223 id,
224 XrefEntry::Free {
225 next: field2 as u32,
226 gen: field3 as u16,
227 },
228 );
229 }
230 1 => {
231 table.insert(
232 id,
233 XrefEntry::InUse {
234 offset: field2,
235 gen: field3 as u16,
236 },
237 );
238 }
239 2 => {
240 table.insert(
241 ObjectId(obj_num, 0),
242 XrefEntry::Compressed {
243 stream_obj: field2 as u32,
244 index_in_stream: field3 as u32,
245 },
246 );
247 }
248 _ => {}
249 }
250 }
251 }
252
253 let trailer = dict.clone();
255 let prev = trailer.get("Prev").and_then(|obj| match obj {
256 PdfObject::Integer(n) => Some(*n as u64),
257 _ => None,
258 });
259
260 Ok((trailer, prev))
261}
262
263fn read_field(data: &[u8], width: usize) -> u64 {
264 let mut val = 0u64;
265 for &byte in &data[..width] {
266 val = (val << 8) | byte as u64;
267 }
268 val
269}
270
271fn parse_traditional_xref(
272 data: &[u8],
273 offset: usize,
274 table: &mut XrefTable,
275 limits: &ParseLimits,
276) -> Result<(PdfDict, Option<u64>)> {
277 let mut pos = offset + 4; skip_eol(data, &mut pos);
279
280 loop {
282 skip_whitespace(data, &mut pos);
283
284 if pos >= data.len() {
288 return Err(Error::InvalidXref(pos as u64));
289 }
290
291 if data[pos..].starts_with(b"trailer") {
292 pos += 7;
293 break;
294 }
295
296 let (first_obj, count) = parse_subsection_header(data, &mut pos)?;
298
299 for i in 0..count {
300 skip_whitespace(data, &mut pos);
301 let (entry_offset, gen, in_use) = parse_xref_entry_at(data, &mut pos)?;
307 let id = ObjectId(first_obj + i, gen);
308
309 if in_use {
310 table.insert(
311 id,
312 XrefEntry::InUse {
313 offset: entry_offset,
314 gen,
315 },
316 );
317 } else {
318 table.insert(
319 id,
320 XrefEntry::Free {
321 next: entry_offset as u32,
322 gen,
323 },
324 );
325 }
326 }
327 }
328
329 let mut lex = Lexer::new(data, pos, limits);
331 let trailer_obj = lex.next_token()?;
332 let trailer = match trailer_obj {
333 PdfObject::Dict(d) => d,
334 _ => return Err(Error::InvalidXref(pos as u64)),
335 };
336
337 let prev = trailer.get("Prev").and_then(|obj| match obj {
338 PdfObject::Integer(n) => Some(*n as u64),
339 _ => None,
340 });
341
342 Ok((trailer, prev))
343}
344
345fn find_startxref(data: &[u8]) -> Result<usize> {
346 let marker = b"startxref";
353 let marker_pos = data
354 .windows(marker.len())
355 .rposition(|w| w == marker)
356 .ok_or(Error::InvalidXref(0))?;
357
358 let after_marker = marker_pos + marker.len();
359 let num_start = data[after_marker..]
360 .iter()
361 .position(|b| b.is_ascii_digit())
362 .ok_or(Error::InvalidXref(0))?;
363
364 let num_bytes = &data[after_marker + num_start..];
365 let num_end = num_bytes
366 .iter()
367 .position(|b| !b.is_ascii_digit())
368 .unwrap_or(num_bytes.len());
369
370 let offset_str =
371 std::str::from_utf8(&num_bytes[..num_end]).map_err(|_| Error::InvalidXref(0))?;
372 let offset: usize = offset_str.parse().map_err(|_| Error::InvalidXref(0))?;
373
374 Ok(offset)
375}
376
377fn parse_subsection_header(data: &[u8], pos: &mut usize) -> Result<(u32, u32)> {
378 let start = *pos;
379 while *pos < data.len() && data[*pos].is_ascii_digit() {
380 *pos += 1;
381 }
382 let first: u32 = std::str::from_utf8(&data[start..*pos])
383 .map_err(|_| Error::InvalidXref(start as u64))?
384 .parse()
385 .map_err(|_| Error::InvalidXref(start as u64))?;
386
387 skip_whitespace(data, pos);
388
389 let count_start = *pos;
390 while *pos < data.len() && data[*pos].is_ascii_digit() {
391 *pos += 1;
392 }
393 let count: u32 = std::str::from_utf8(&data[count_start..*pos])
394 .map_err(|_| Error::InvalidXref(count_start as u64))?
395 .parse()
396 .map_err(|_| Error::InvalidXref(count_start as u64))?;
397
398 skip_eol(data, pos);
399 Ok((first, count))
400}
401
402fn parse_xref_entry_at(data: &[u8], pos: &mut usize) -> Result<(u64, u16, bool)> {
407 let start = *pos as u64;
408 let offset = read_decimal(data, pos).ok_or(Error::InvalidXref(start))?;
409 skip_whitespace(data, pos);
410 let gen = read_decimal(data, pos)
411 .and_then(|g| u16::try_from(g).ok())
412 .ok_or(Error::InvalidXref(start))?;
413 skip_whitespace(data, pos);
414 let in_use = match data.get(*pos) {
415 Some(b'n') => true,
416 Some(b'f') => false,
417 _ => return Err(Error::InvalidXref(start)),
418 };
419 *pos += 1;
420 Ok((offset, gen, in_use))
421}
422
423fn read_decimal(data: &[u8], pos: &mut usize) -> Option<u64> {
426 let start = *pos;
427 while *pos < data.len() && data[*pos].is_ascii_digit() {
428 *pos += 1;
429 }
430 if *pos == start {
431 return None;
432 }
433 std::str::from_utf8(&data[start..*pos]).ok()?.parse().ok()
434}
435
436fn skip_whitespace(data: &[u8], pos: &mut usize) {
437 while *pos < data.len() && matches!(data[*pos], b' ' | b'\t' | b'\r' | b'\n') {
438 *pos += 1;
439 }
440}
441
442fn skip_eol(data: &[u8], pos: &mut usize) {
443 while *pos < data.len() && matches!(data[*pos], b' ' | b'\t' | b'\r' | b'\n') {
444 *pos += 1;
445 }
446}
447
448#[cfg(test)]
449mod tests {
450 use super::*;
451
452 #[test]
453 fn find_startxref_offset() {
454 let data = b"%PDF-1.4\n...lots of content...\nstartxref\n1234\n%%EOF";
455 let offset = find_startxref(data).unwrap();
456 assert_eq!(offset, 1234);
457 }
458
459 #[test]
460 fn parse_xref_entry_in_use() {
461 let mut pos = 0usize;
462 let (offset, gen, in_use) =
463 parse_xref_entry_at(b"0000000010 00000 n \r\n", &mut pos).unwrap();
464 assert_eq!(offset, 10);
465 assert_eq!(gen, 0);
466 assert!(in_use);
467 assert_eq!(pos, 18, "cursor stops just past the type letter");
468 }
469
470 #[test]
471 fn parse_xref_entry_free() {
472 let mut pos = 0usize;
473 let (offset, gen, in_use) =
474 parse_xref_entry_at(b"0000000000 65535 f \r\n", &mut pos).unwrap();
475 assert_eq!(offset, 0);
476 assert_eq!(gen, 65535);
477 assert!(!in_use);
478 }
479
480 #[test]
481 fn parse_xref_entry_truncated_errors() {
482 let mut pos = 0usize;
483 assert!(parse_xref_entry_at(b"0000000010 000", &mut pos).is_err());
484 }
485
486 #[test]
487 fn traditional_xref_with_19_byte_entries() {
488 let mut d = Vec::new();
490 d.extend_from_slice(b"%PDF-1.4\n");
491 let off1 = d.len();
492 d.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
493 let off2 = d.len();
494 d.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");
495 let xref_off = d.len();
496 d.extend_from_slice(b"xref\n0 3\n");
497 d.extend_from_slice(b"0000000000 65535 f\n"); d.extend_from_slice(format!("{off1:010} 00000 n\n").as_bytes()); d.extend_from_slice(format!("{off2:010} 00000 n\n").as_bytes()); d.extend_from_slice(
501 format!("trailer\n<< /Size 3 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n")
502 .as_bytes(),
503 );
504
505 let (table, trailer) = parse_xref_and_trailer(&d, &ParseLimits::default()).unwrap();
506 assert_eq!(trailer.get_ref("Root").unwrap(), ObjectId(1, 0));
507 match table.get(ObjectId(1, 0)).unwrap() {
508 XrefEntry::InUse { offset, .. } => assert_eq!(*offset as usize, off1),
509 other => panic!("expected InUse, got {other:?}"),
510 }
511 match table.get(ObjectId(2, 0)).unwrap() {
512 XrefEntry::InUse { offset, .. } => assert_eq!(*offset as usize, off2),
513 other => panic!("expected InUse, got {other:?}"),
514 }
515 }
516
517 fn xref_stream_bytes(w: &str, size: u32, index: &str, body: &[u8]) -> Vec<u8> {
520 let mut d = format!(
521 "9 0 obj\n<< /Type /XRef /Size {size} /W {w} {index} /Length {} >>\nstream\n",
522 body.len()
523 )
524 .into_bytes();
525 d.extend_from_slice(body);
526 d.extend_from_slice(b"\nendstream\nendobj\n");
527 d
528 }
529
530 #[test]
531 fn xref_stream_rejects_negative_w_width() {
532 let d = xref_stream_bytes("[1 -2 2]", 1, "", &[]);
533 let mut table = XrefTable::new();
534 assert!(parse_xref_stream(&d, 0, &mut table, &ParseLimits::default()).is_err());
535 }
536
537 #[test]
538 fn xref_stream_rejects_oversized_w_width() {
539 let d = xref_stream_bytes("[9 4 2]", 1, "", &[]);
540 let mut table = XrefTable::new();
541 assert!(parse_xref_stream(&d, 0, &mut table, &ParseLimits::default()).is_err());
542 }
543
544 #[test]
545 fn xref_stream_rejects_zero_entry_size() {
546 let d = xref_stream_bytes("[0 0 0]", 1, "", &[]);
547 let mut table = XrefTable::new();
548 assert!(parse_xref_stream(&d, 0, &mut table, &ParseLimits::default()).is_err());
549 }
550
551 #[test]
552 fn hybrid_xrefstm_is_parsed_with_correct_precedence() {
553 let mut d = Vec::new();
558 d.extend_from_slice(b"%PDF-1.4\n");
559 let off1 = d.len();
560 d.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
561 let off4_table = d.len();
562 d.extend_from_slice(b"4 0 obj\n<< /Marker /FromTable >>\nendobj\n");
563 let off4_stm = d.len();
564 d.extend_from_slice(b"4 0 obj\n<< /Marker /FromStm >>\nendobj\n");
565 let off5 = d.len();
566 d.extend_from_slice(b"5 0 obj\n<< /Marker /StmOnly >>\nendobj\n");
567
568 let mut body = Vec::new();
570 for (off, gen) in [(off4_stm as u32, 0u16), (off5 as u32, 0)] {
571 body.push(1u8); body.extend_from_slice(&off.to_be_bytes());
573 body.extend_from_slice(&gen.to_be_bytes());
574 }
575 let off6 = d.len();
576 d.extend_from_slice(
577 format!(
578 "6 0 obj\n<< /Type /XRef /Size 7 /W [1 4 2] /Index [4 2] /Length {} >>\nstream\n",
579 body.len()
580 )
581 .as_bytes(),
582 );
583 d.extend_from_slice(&body);
584 d.extend_from_slice(b"\nendstream\nendobj\n");
585
586 let xref_off = d.len();
587 d.extend_from_slice(b"xref\n0 2\n0000000000 65535 f \n");
588 d.extend_from_slice(format!("{off1:010} 00000 n \n").as_bytes());
589 d.extend_from_slice(b"4 1\n");
590 d.extend_from_slice(format!("{off4_table:010} 00000 n \n").as_bytes());
591 d.extend_from_slice(
592 format!(
593 "trailer\n<< /Size 7 /Root 1 0 R /XRefStm {off6} >>\nstartxref\n{xref_off}\n%%EOF\n"
594 )
595 .as_bytes(),
596 );
597
598 let (table, trailer) = parse_xref_and_trailer(&d, &ParseLimits::default()).unwrap();
599 assert_eq!(trailer.get_ref("Root").unwrap(), ObjectId(1, 0));
600 match table.get(ObjectId(5, 0)).unwrap() {
602 XrefEntry::InUse { offset, .. } => assert_eq!(*offset as usize, off5),
603 other => panic!("expected InUse from XRefStm, got {other:?}"),
604 }
605 match table.get(ObjectId(4, 0)).unwrap() {
607 XrefEntry::InUse { offset, .. } => assert_eq!(*offset as usize, off4_table),
608 other => panic!("expected InUse, got {other:?}"),
609 }
610
611 let file = crate::PdfFile::parse(d).unwrap();
613 let o5 = file.resolve(ObjectId(5, 0)).unwrap();
614 assert_eq!(o5.as_dict().unwrap().get_name("Marker").unwrap(), "StmOnly");
615 let o4 = file.resolve(ObjectId(4, 0)).unwrap();
616 assert_eq!(
617 o4.as_dict().unwrap().get_name("Marker").unwrap(),
618 "FromTable"
619 );
620 }
621}