1use std::collections::{BTreeMap, BTreeSet};
2
3use crate::document::build_document;
4use crate::error::{PdfError, PdfResult};
5use crate::stream::decode_stream;
6use crate::types::{
7 ObjectRef, PdfDictionary, PdfFile, PdfObject, PdfStream, PdfString, PdfValue, XrefEntry,
8};
9
10pub fn parse_pdf(bytes: &[u8]) -> PdfResult<crate::document::ParsedDocument> {
11 let version = parse_header(bytes)?;
12 let startxref = find_startxref(bytes)?;
13 let (xref, trailer) = parse_xref_table(bytes, startxref)?;
14
15 let mut objects = BTreeMap::new();
16 let mut max_object_number = 0;
17 let mut compressed: Vec<(ObjectRef, u32, u32)> = Vec::new();
18
19 for (object_ref, entry) in &xref {
20 match entry {
21 XrefEntry::Free => {}
22 XrefEntry::Uncompressed { offset, .. } => {
23 if object_ref.object_number == 0 {
24 continue;
25 }
26 let object = parse_indirect_object(bytes, *offset)?;
27 max_object_number = max_object_number.max(object_ref.object_number);
28 objects.insert(*object_ref, object);
29 }
30 XrefEntry::Compressed {
31 stream_object_number,
32 index,
33 } => {
34 compressed.push((*object_ref, *stream_object_number, *index));
35 }
36 }
37 }
38
39 materialize_object_streams(&mut objects, &mut max_object_number, &compressed)?;
40
41 let file = PdfFile {
42 version,
43 objects,
44 trailer,
45 max_object_number,
46 };
47 build_document(file)
48}
49
50fn parse_header(bytes: &[u8]) -> PdfResult<String> {
51 if !bytes.starts_with(b"%PDF-") {
52 return Err(PdfError::Parse("missing PDF header".to_string()));
53 }
54 let line_end = bytes
55 .iter()
56 .position(|byte| *byte == b'\n' || *byte == b'\r')
57 .ok_or_else(|| PdfError::Parse("unterminated header".to_string()))?;
58 Ok(String::from_utf8_lossy(&bytes[5..line_end])
59 .trim()
60 .to_string())
61}
62
63fn find_startxref(bytes: &[u8]) -> PdfResult<usize> {
64 let marker = b"startxref";
65 let position = bytes
66 .windows(marker.len())
67 .rposition(|window| window == marker)
68 .ok_or_else(|| PdfError::Parse("missing startxref".to_string()))?;
69 let mut parser = Cursor::new(bytes, position + marker.len());
70 parser.skip_ws_and_comments();
71 parser.parse_usize()
72}
73
74fn parse_xref_table(
75 bytes: &[u8],
76 start_offset: usize,
77) -> PdfResult<(BTreeMap<ObjectRef, XrefEntry>, PdfDictionary)> {
78 let mut merged_entries: BTreeMap<ObjectRef, XrefEntry> = BTreeMap::new();
79 let mut newest_trailer: Option<PdfDictionary> = None;
80 let mut visited = BTreeSet::new();
81 let mut pending: Vec<usize> = vec![start_offset];
82
83 while let Some(offset) = pending.pop() {
84 if !visited.insert(offset) {
85 continue;
86 }
87 let section = parse_xref_section_at(bytes, offset)?;
88
89 for (object_ref, entry) in section.entries {
91 merged_entries.entry(object_ref).or_insert(entry);
92 }
93
94 if newest_trailer.is_none() {
95 newest_trailer = Some(section.trailer.clone());
96 }
97
98 if let Some(stm_offset) = section
99 .trailer
100 .get("XRefStm")
101 .and_then(PdfValue::as_integer)
102 {
103 pending.push(stm_offset as usize);
104 }
105 if let Some(prev_offset) = section.trailer.get("Prev").and_then(PdfValue::as_integer) {
106 pending.push(prev_offset as usize);
107 }
108 }
109
110 let trailer = newest_trailer
111 .ok_or_else(|| PdfError::Parse("xref chain produced no trailer".to_string()))?;
112 Ok((merged_entries, trailer))
113}
114
/// One cross-reference section (classic table or xref stream): the
/// object entries it declares plus the trailer dictionary that
/// accompanied it.
struct XrefSection {
    // Entries declared by this section only (not yet merged).
    entries: BTreeMap<ObjectRef, XrefEntry>,
    // Trailer dictionary; for xref streams this is the stream dict.
    trailer: PdfDictionary,
}
119
120fn parse_xref_section_at(bytes: &[u8], offset: usize) -> PdfResult<XrefSection> {
121 let mut probe = Cursor::new(bytes, offset);
122 probe.skip_ws_and_comments();
123 if probe.peek_keyword("xref") {
124 parse_classic_xref_section(bytes, offset)
125 } else {
126 parse_xref_stream_section(bytes, offset)
127 }
128}
129
/// Parse a classic `xref` table section plus its `trailer` dictionary.
///
/// The section is a sequence of subsections, each introduced by a
/// `<start> <count>` header followed by `count` fixed-format entry
/// lines (`offset generation flag`), and terminated by the `trailer`
/// keyword.
fn parse_classic_xref_section(bytes: &[u8], offset: usize) -> PdfResult<XrefSection> {
    let mut cursor = Cursor::new(bytes, offset);
    cursor.expect_keyword("xref")?;
    let mut entries = BTreeMap::new();
    loop {
        cursor.skip_ws_and_comments();
        // The subsection list ends where the trailer begins.
        if cursor.peek_keyword("trailer") {
            break;
        }
        // Subsection header: first object number and entry count.
        let start = cursor.parse_u32()?;
        cursor.skip_ws_and_comments();
        let count = cursor.parse_u32()?;
        cursor.skip_line_breaks();
        for index in 0..count {
            let line = cursor.read_line()?;
            // Spec entries are 20 bytes; require at least the 17
            // significant characters before splitting on whitespace.
            if line.len() < 17 {
                return Err(PdfError::Parse("invalid xref entry".to_string()));
            }
            let parts = String::from_utf8_lossy(line).trim().to_string();
            let mut fields = parts.split_whitespace();
            let entry_offset = fields
                .next()
                .ok_or_else(|| PdfError::Parse("invalid xref entry offset".to_string()))?
                .parse::<usize>()
                .map_err(|_| PdfError::Parse("invalid xref entry offset".to_string()))?;
            let generation = fields
                .next()
                .ok_or_else(|| PdfError::Parse("invalid xref generation".to_string()))?
                .parse::<u16>()
                .map_err(|_| PdfError::Parse("invalid xref generation".to_string()))?;
            let flag = fields
                .next()
                .ok_or_else(|| PdfError::Parse("invalid xref flag".to_string()))?;
            // Object numbers run consecutively from the subsection start.
            let object_number = start
                .checked_add(index)
                .ok_or_else(|| PdfError::Parse("xref object number overflow".to_string()))?;
            // Flag "n" marks an in-use object; anything else is treated
            // as free (the spec's "f").
            let entry = if flag == "n" {
                XrefEntry::Uncompressed {
                    offset: entry_offset,
                    generation,
                }
            } else {
                XrefEntry::Free
            };
            entries.insert(ObjectRef::new(object_number, generation), entry);
        }
    }
    cursor.expect_keyword("trailer")?;
    let trailer = match cursor.parse_value()? {
        PdfValue::Dictionary(dictionary) => dictionary,
        _ => return Err(PdfError::Parse("trailer is not a dictionary".to_string())),
    };
    Ok(XrefSection { entries, trailer })
}
184
/// Parse a PDF 1.5+ cross-reference stream section at `offset`.
///
/// The stream's dictionary doubles as the section trailer. `W` declares
/// the byte widths of the three packed entry fields, `Index` (optional,
/// defaulting to `[0 Size]`) declares which object-number ranges the
/// rows cover, and each fixed-width row describes one entry:
/// type 0 = free, type 1 = uncompressed at a byte offset,
/// type 2 = stored inside an object stream.
fn parse_xref_stream_section(bytes: &[u8], offset: usize) -> PdfResult<XrefSection> {
    let object = parse_indirect_object(bytes, offset)?;
    let stream = match object {
        PdfObject::Stream(stream) => stream,
        PdfObject::Value(_) => {
            return Err(PdfError::Parse(
                "expected xref stream object at startxref offset".to_string(),
            ));
        }
    };
    if stream.dict.get("Type").and_then(PdfValue::as_name) != Some("XRef") {
        return Err(PdfError::Parse(
            "xref stream object has wrong Type".to_string(),
        ));
    }

    // Size: one greater than the highest object number in the file.
    let size = stream
        .dict
        .get("Size")
        .and_then(PdfValue::as_integer)
        .ok_or_else(|| PdfError::Corrupt("xref stream missing Size".to_string()))?
        as u32;

    // W: the byte widths [type, field2, field3] of each packed row.
    let w = stream
        .dict
        .get("W")
        .and_then(PdfValue::as_array)
        .ok_or_else(|| PdfError::Corrupt("xref stream missing W".to_string()))?;
    if w.len() != 3 {
        return Err(PdfError::Corrupt(
            "xref stream W must have three entries".to_string(),
        ));
    }
    let w0 = w[0]
        .as_integer()
        .ok_or_else(|| PdfError::Corrupt("invalid W[0]".to_string()))? as usize;
    let w1 = w[1]
        .as_integer()
        .ok_or_else(|| PdfError::Corrupt("invalid W[1]".to_string()))? as usize;
    let w2 = w[2]
        .as_integer()
        .ok_or_else(|| PdfError::Corrupt("invalid W[2]".to_string()))? as usize;
    let row_len = w0 + w1 + w2;
    if row_len == 0 {
        return Err(PdfError::Corrupt(
            "xref stream row width is zero".to_string(),
        ));
    }

    // Index: (first object number, count) pairs; default is [0 Size].
    let index: Vec<(u32, u32)> = match stream.dict.get("Index") {
        Some(PdfValue::Array(entries)) => {
            if entries.len() % 2 != 0 {
                return Err(PdfError::Corrupt(
                    "xref stream Index must have an even number of entries".to_string(),
                ));
            }
            let mut pairs = Vec::with_capacity(entries.len() / 2);
            for chunk in entries.chunks(2) {
                let first = chunk[0]
                    .as_integer()
                    .ok_or_else(|| PdfError::Corrupt("invalid Index entry".to_string()))?
                    as u32;
                let count = chunk[1]
                    .as_integer()
                    .ok_or_else(|| PdfError::Corrupt("invalid Index entry".to_string()))?
                    as u32;
                pairs.push((first, count));
            }
            pairs
        }
        Some(_) => {
            return Err(PdfError::Corrupt(
                "xref stream Index is not an array".to_string(),
            ));
        }
        None => vec![(0, size)],
    };

    let decoded = decode_stream(&stream)?;
    // The decoded body must hold every row the Index pairs promise.
    let expected_rows: u32 = index.iter().map(|(_, count)| *count).sum();
    if decoded.len() < expected_rows as usize * row_len {
        return Err(PdfError::Corrupt(
            "xref stream body is shorter than declared entries".to_string(),
        ));
    }

    let mut entries: BTreeMap<ObjectRef, XrefEntry> = BTreeMap::new();
    let mut cursor = 0usize;
    for (first, count) in index {
        for i in 0..count {
            let row = &decoded[cursor..cursor + row_len];
            cursor += row_len;
            // A zero-width type field defaults to type 1 (uncompressed).
            let field_type = if w0 == 0 { 1u64 } else { read_be(&row[..w0])? };
            let f2 = read_be(&row[w0..w0 + w1])?;
            let f3 = read_be(&row[w0 + w1..])?;
            let object_number = first + i;
            let entry = match field_type {
                0 => XrefEntry::Free,
                1 => XrefEntry::Uncompressed {
                    offset: f2 as usize,
                    generation: f3 as u16,
                },
                2 => XrefEntry::Compressed {
                    stream_object_number: f2 as u32,
                    index: f3 as u32,
                },
                other => {
                    return Err(PdfError::Unsupported(format!(
                        "xref stream entry type {other} is not supported"
                    )));
                }
            };
            // Free and compressed entries are keyed with generation 0.
            let generation = match entry {
                XrefEntry::Uncompressed { generation, .. } => generation,
                _ => 0,
            };
            entries.insert(ObjectRef::new(object_number, generation), entry);
        }
    }

    // The stream dictionary serves as this section's trailer.
    Ok(XrefSection {
        entries,
        trailer: stream.dict,
    })
}
310
311fn read_be(bytes: &[u8]) -> PdfResult<u64> {
312 if bytes.len() > 8 {
313 return Err(PdfError::Corrupt(
314 "xref stream field width exceeds 8 bytes".to_string(),
315 ));
316 }
317 let mut value: u64 = 0;
318 for byte in bytes {
319 value = (value << 8) | *byte as u64;
320 }
321 Ok(value)
322}
323
/// Expand every object stored inside an ObjStm container into `objects`.
///
/// `compressed` lists (member ref, containing stream number, index)
/// triples collected from the xref table. Members are grouped per
/// container so each ObjStm is decoded only once; the decoded payload
/// starts with `N` pairs of `obj-number offset` integers, followed (at
/// byte `First`) by the serialized member values.
fn materialize_object_streams(
    objects: &mut BTreeMap<ObjectRef, PdfObject>,
    max_object_number: &mut u32,
    compressed: &[(ObjectRef, u32, u32)],
) -> PdfResult<()> {
    if compressed.is_empty() {
        return Ok(());
    }

    // Group requested members by their containing stream object number.
    let mut by_stream: BTreeMap<u32, Vec<(ObjectRef, u32)>> = BTreeMap::new();
    for (object_ref, stream_obj_num, index) in compressed {
        by_stream
            .entry(*stream_obj_num)
            .or_default()
            .push((*object_ref, *index));
    }

    for (stream_obj_num, mut members) in by_stream {
        // Object streams are always referenced with generation 0 here.
        let stream_ref = ObjectRef::new(stream_obj_num, 0);
        let stream = match objects.get(&stream_ref) {
            Some(PdfObject::Stream(stream)) => stream.clone(),
            Some(PdfObject::Value(_)) => {
                return Err(PdfError::Corrupt(format!(
                    "object stream {stream_obj_num} is not a stream"
                )));
            }
            None => {
                return Err(PdfError::Corrupt(format!(
                    "compressed entry references missing object stream {stream_obj_num}"
                )));
            }
        };
        if stream.dict.get("Type").and_then(PdfValue::as_name) != Some("ObjStm") {
            return Err(PdfError::Corrupt(format!(
                "object {stream_obj_num} is not marked as ObjStm"
            )));
        }
        // N: member count; First: byte offset of the first member value.
        let n = stream
            .dict
            .get("N")
            .and_then(PdfValue::as_integer)
            .ok_or_else(|| PdfError::Corrupt("ObjStm missing N".to_string()))?
            as usize;
        let first = stream
            .dict
            .get("First")
            .and_then(PdfValue::as_integer)
            .ok_or_else(|| PdfError::Corrupt("ObjStm missing First".to_string()))?
            as usize;

        let decoded = decode_stream(&stream)?;
        if first > decoded.len() {
            return Err(PdfError::Corrupt(
                "ObjStm First offset is past end of decoded data".to_string(),
            ));
        }

        // Header region: N whitespace-separated (number, offset) pairs.
        let header = &decoded[..first];
        let mut header_cursor = Cursor::new(header, 0);
        let mut entries: Vec<(u32, usize)> = Vec::with_capacity(n);
        for _ in 0..n {
            header_cursor.skip_ws_and_comments();
            let obj_num = header_cursor.parse_u32()?;
            header_cursor.skip_ws_and_comments();
            let rel_offset = header_cursor.parse_usize()?;
            entries.push((obj_num, rel_offset));
        }

        // Process members in stream order for deterministic behavior.
        members.sort_by_key(|(_, index)| *index);
        for (member_ref, index) in members {
            let idx = index as usize;
            if idx >= entries.len() {
                return Err(PdfError::Corrupt(format!(
                    "ObjStm {stream_obj_num} has no index {idx}"
                )));
            }
            let (declared_number, rel_offset) = entries[idx];
            // Cross-check the header against what the xref table claimed.
            if declared_number != member_ref.object_number {
                return Err(PdfError::Corrupt(format!(
                    "ObjStm {stream_obj_num} index {idx} has number {declared_number} but xref expected {}",
                    member_ref.object_number
                )));
            }
            let absolute_offset = first
                .checked_add(rel_offset)
                .ok_or_else(|| PdfError::Corrupt("ObjStm offset overflow".to_string()))?;
            if absolute_offset > decoded.len() {
                return Err(PdfError::Corrupt(
                    "ObjStm member offset is past end of decoded data".to_string(),
                ));
            }
            let mut value_cursor = Cursor::new(&decoded, absolute_offset);
            let value = value_cursor.parse_value()?;
            // An ObjStm nested inside an ObjStm is rejected outright.
            if let PdfValue::Dictionary(dict) = &value {
                if dict.get("Type").and_then(PdfValue::as_name) == Some("ObjStm") {
                    return Err(PdfError::Unsupported(
                        "nested object streams are not supported".to_string(),
                    ));
                }
            }
            *max_object_number = (*max_object_number).max(member_ref.object_number);
            objects.insert(member_ref, PdfObject::Value(value));
        }
    }

    Ok(())
}
432
433fn parse_indirect_object(bytes: &[u8], offset: usize) -> PdfResult<PdfObject> {
434 let mut cursor = Cursor::new(bytes, offset);
435 let _object_number = cursor.parse_u32()?;
436 cursor.skip_ws_and_comments();
437 let _generation = cursor.parse_u16()?;
438 cursor.skip_ws_and_comments();
439 cursor.expect_keyword("obj")?;
440 cursor.skip_ws_and_comments();
441
442 let value = cursor.parse_value()?;
443 cursor.skip_ws_and_comments();
444 if matches!(value, PdfValue::Dictionary(_)) && cursor.peek_keyword("stream") {
445 let dict = match value {
446 PdfValue::Dictionary(dict) => dict,
447 _ => unreachable!(),
448 };
449 cursor.expect_keyword("stream")?;
450 cursor.consume_stream_line_break();
451 let stream_start = cursor.position;
452 let length_hint = dict
458 .get("Length")
459 .and_then(PdfValue::as_integer)
460 .filter(|&len| len >= 0)
461 .map(|len| len as usize);
462 let (data, endstream_pos) = match length_hint {
463 Some(len) if stream_start + len <= bytes.len() => {
464 let mut check = stream_start + len;
467 while check < bytes.len() && matches!(bytes[check], b'\r' | b'\n') {
468 check += 1;
469 }
470 if bytes.get(check..check + 9) == Some(b"endstream") {
471 (bytes[stream_start..stream_start + len].to_vec(), check)
472 } else {
473 let pos = find_keyword(bytes, stream_start, b"endstream")
475 .ok_or_else(|| PdfError::Parse("stream missing endstream".to_string()))?;
476 (bytes[stream_start..pos].to_vec(), pos)
477 }
478 }
479 _ => {
480 let pos = find_keyword(bytes, stream_start, b"endstream")
481 .ok_or_else(|| PdfError::Parse("stream missing endstream".to_string()))?;
482 (bytes[stream_start..pos].to_vec(), pos)
483 }
484 };
485 cursor.position = endstream_pos;
486 cursor.expect_keyword("endstream")?;
487 cursor.skip_ws_and_comments();
488 cursor.expect_keyword("endobj")?;
489 Ok(PdfObject::Stream(PdfStream { dict, data }))
490 } else {
491 cursor.expect_keyword("endobj")?;
492 Ok(PdfObject::Value(value))
493 }
494}
495
/// Find the first occurrence of `keyword` at or after `start`,
/// returning its absolute byte offset within `bytes`.
fn find_keyword(bytes: &[u8], start: usize, keyword: &[u8]) -> Option<usize> {
    let haystack = &bytes[start..];
    let relative = haystack
        .windows(keyword.len())
        .position(|window| window == keyword)?;
    Some(start + relative)
}
502
/// A lightweight byte-level scanner over a PDF buffer: the slice plus
/// the absolute offset of the next byte to inspect.
struct Cursor<'a> {
    bytes: &'a [u8],
    // Absolute index into `bytes`; may equal bytes.len() at EOF.
    position: usize,
}
507
impl<'a> Cursor<'a> {
    /// Create a scanner over `bytes` starting at absolute `position`.
    fn new(bytes: &'a [u8], position: usize) -> Self {
        Self { bytes, position }
    }

    /// True when the position has reached (or passed) the end of input.
    fn eof(&self) -> bool {
        self.position >= self.bytes.len()
    }

    /// The byte at the current position, or `None` at end of input.
    fn current(&self) -> Option<u8> {
        self.bytes.get(self.position).copied()
    }

    /// Advance past PDF whitespace (space, tab, LF, CR, FF, NUL) and
    /// `%` comments, which run to the end of their line.
    fn skip_ws_and_comments(&mut self) {
        while let Some(byte) = self.current() {
            match byte {
                b' ' | b'\t' | b'\n' | b'\r' | 0x0C | 0x00 => self.position += 1,
                b'%' => {
                    // Consume the comment through its terminating EOL byte.
                    while let Some(next) = self.current() {
                        self.position += 1;
                        if next == b'\n' || next == b'\r' {
                            break;
                        }
                    }
                }
                _ => break,
            }
        }
    }

    /// Advance past any run of CR/LF bytes.
    fn skip_line_breaks(&mut self) {
        while matches!(self.current(), Some(b'\n' | b'\r')) {
            self.position += 1;
        }
    }

    /// Return the bytes up to the next EOL (exclusive) and leave the
    /// cursor positioned after the line break(s). A final unterminated
    /// line is returned as-is; errors only when already at EOF.
    fn read_line(&mut self) -> PdfResult<&'a [u8]> {
        if self.eof() {
            return Err(PdfError::Parse("unexpected end of file".to_string()));
        }
        let start = self.position;
        while let Some(byte) = self.current() {
            if byte == b'\n' || byte == b'\r' {
                let end = self.position;
                self.skip_line_breaks();
                return Ok(&self.bytes[start..end]);
            }
            self.position += 1;
        }
        Ok(&self.bytes[start..self.position])
    }

    /// True when `keyword` appears verbatim at the current position.
    /// Does not skip whitespace and does not move the cursor.
    fn peek_keyword(&self, keyword: &str) -> bool {
        self.bytes
            .get(self.position..self.position + keyword.len())
            .map(|slice| slice == keyword.as_bytes())
            .unwrap_or(false)
    }

    /// Skip leading whitespace/comments, then require and consume
    /// `keyword`; errors if it is not present.
    fn expect_keyword(&mut self, keyword: &str) -> PdfResult<()> {
        self.skip_ws_and_comments();
        if self.peek_keyword(keyword) {
            self.position += keyword.len();
            Ok(())
        } else {
            Err(PdfError::Parse(format!("expected keyword {keyword}")))
        }
    }

    /// Consume at most one EOL (CRLF, CR, or LF) after the `stream`
    /// keyword, without touching the stream data that follows.
    fn consume_stream_line_break(&mut self) {
        if self.current() == Some(b'\r') {
            self.position += 1;
        }
        if self.current() == Some(b'\n') {
            self.position += 1;
        }
    }

    /// Parse the next token as a `u32`.
    fn parse_u32(&mut self) -> PdfResult<u32> {
        let token = self.parse_token()?;
        token
            .parse::<u32>()
            .map_err(|_| PdfError::Parse(format!("invalid integer token: {token}")))
    }

    /// Parse the next token as a `u16` (e.g. a generation number).
    fn parse_u16(&mut self) -> PdfResult<u16> {
        let token = self.parse_token()?;
        token
            .parse::<u16>()
            .map_err(|_| PdfError::Parse(format!("invalid integer token: {token}")))
    }

    /// Parse the next token as a `usize` (e.g. a byte offset).
    fn parse_usize(&mut self) -> PdfResult<usize> {
        let token = self.parse_token()?;
        token
            .parse::<usize>()
            .map_err(|_| PdfError::Parse(format!("invalid offset token: {token}")))
    }

    /// Read one token: skip whitespace/comments, then take bytes up to
    /// the next delimiter or whitespace. Errors on an empty token.
    fn parse_token(&mut self) -> PdfResult<String> {
        self.skip_ws_and_comments();
        let start = self.position;
        while let Some(byte) = self.current() {
            if is_delimiter(byte) || is_whitespace(byte) {
                break;
            }
            self.position += 1;
        }
        if self.position == start {
            return Err(PdfError::Parse("expected token".to_string()));
        }
        Ok(String::from_utf8_lossy(&self.bytes[start..self.position]).to_string())
    }

    /// Parse any PDF value, dispatching on its first byte: name,
    /// literal string, array, dictionary (`<<`), hex string (`<`),
    /// the keywords true/false/null, or a number/reference.
    fn parse_value(&mut self) -> PdfResult<PdfValue> {
        self.skip_ws_and_comments();
        match self.current() {
            Some(b'/') => self.parse_name(),
            Some(b'(') => self.parse_literal_string(),
            Some(b'[') => self.parse_array(),
            // `<<` opens a dictionary; a lone `<` opens a hex string.
            Some(b'<') if self.bytes.get(self.position + 1) == Some(&b'<') => {
                self.parse_dictionary()
            }
            Some(b'<') => self.parse_hex_string(),
            Some(b't') if self.peek_keyword("true") => {
                self.position += 4;
                Ok(PdfValue::Bool(true))
            }
            Some(b'f') if self.peek_keyword("false") => {
                self.position += 5;
                Ok(PdfValue::Bool(false))
            }
            Some(b'n') if self.peek_keyword("null") => {
                self.position += 4;
                Ok(PdfValue::Null)
            }
            Some(_) => self.parse_number_or_reference(),
            None => Err(PdfError::Parse("unexpected end of file".to_string())),
        }
    }

    /// Parse a `/Name`, decoding `#XX` hex escapes in the body.
    /// Assumes the cursor sits on the leading `/`.
    fn parse_name(&mut self) -> PdfResult<PdfValue> {
        self.position += 1;
        let mut raw = Vec::new();
        while let Some(byte) = self.current() {
            if is_delimiter(byte) || is_whitespace(byte) {
                break;
            }
            if byte == b'#' {
                // Two hex digits follow the '#'; decode them to a byte.
                let high =
                    self.bytes.get(self.position + 1).copied().ok_or_else(|| {
                        PdfError::Parse("truncated #XX escape in name".to_string())
                    })?;
                let low =
                    self.bytes.get(self.position + 2).copied().ok_or_else(|| {
                        PdfError::Parse("truncated #XX escape in name".to_string())
                    })?;
                let decoded = u8::from_str_radix(&format!("{}{}", high as char, low as char), 16)
                    .map_err(|_| {
                        PdfError::Parse("invalid #XX hex escape in name".to_string())
                    })?;
                raw.push(decoded);
                self.position += 3;
            } else {
                raw.push(byte);
                self.position += 1;
            }
        }
        Ok(PdfValue::Name(String::from_utf8_lossy(&raw).to_string()))
    }

    /// Parse a `(…)` literal string: handles backslash escapes, octal
    /// escapes, escaped line continuations, and balanced nested parens.
    /// Assumes the cursor sits on the opening `(`.
    fn parse_literal_string(&mut self) -> PdfResult<PdfValue> {
        self.position += 1;
        let mut output = Vec::new();
        // Parenthesis nesting depth; the string ends when it returns to 0.
        let mut depth = 1usize;
        while let Some(byte) = self.current() {
            self.position += 1;
            match byte {
                b'\\' => {
                    let escaped = self
                        .current()
                        .ok_or_else(|| PdfError::Parse("unterminated string escape".to_string()))?;
                    self.position += 1;
                    match escaped {
                        b'n' => output.push(b'\n'),
                        b'r' => output.push(b'\r'),
                        b't' => output.push(b'\t'),
                        b'b' => output.push(0x08),
                        b'f' => output.push(0x0C),
                        b'(' | b')' | b'\\' => output.push(escaped),
                        // Backslash-EOL is a line continuation: emit nothing.
                        b'\n' => {}
                        b'\r' => {
                            if self.current() == Some(b'\n') {
                                self.position += 1;
                            }
                        }
                        // Octal escape: one to three octal digits.
                        b'0'..=b'7' => {
                            let mut octal = vec![escaped];
                            for _ in 0..2 {
                                match self.current() {
                                    Some(next @ b'0'..=b'7') => {
                                        octal.push(next);
                                        self.position += 1;
                                    }
                                    _ => break,
                                }
                            }
                            // Values above 255 wrap modulo 256.
                            let value =
                                u16::from_str_radix(std::str::from_utf8(&octal).unwrap_or("0"), 8)
                                    .unwrap_or(0);
                            output.push((value % 256) as u8);
                        }
                        // Unknown escapes pass the character through.
                        other => output.push(other),
                    }
                }
                b'(' => {
                    depth += 1;
                    output.push(byte);
                }
                b')' => {
                    depth -= 1;
                    if depth == 0 {
                        return Ok(PdfValue::String(PdfString(output)));
                    }
                    output.push(byte);
                }
                _ => output.push(byte),
            }
        }
        Err(PdfError::Parse("unterminated literal string".to_string()))
    }

    /// Parse a `<…>` hex string. Embedded whitespace is ignored and an
    /// odd final digit is padded with '0'. Assumes the cursor sits on
    /// the opening `<`.
    fn parse_hex_string(&mut self) -> PdfResult<PdfValue> {
        self.position += 1;
        let start = self.position;
        while self.current() != Some(b'>') {
            if self.eof() {
                return Err(PdfError::Parse("unterminated hex string".to_string()));
            }
            self.position += 1;
        }
        let raw = String::from_utf8_lossy(&self.bytes[start..self.position])
            .chars()
            .filter(|character| !character.is_whitespace())
            .collect::<String>();
        // Step past the closing '>'.
        self.position += 1;
        let mut chars = raw.chars().collect::<Vec<_>>();
        if chars.len() % 2 != 0 {
            chars.push('0');
        }
        let mut bytes = Vec::with_capacity(chars.len() / 2);
        for pair in chars.chunks(2) {
            let value = u8::from_str_radix(&pair.iter().collect::<String>(), 16)
                .map_err(|_| PdfError::Parse("invalid hex string".to_string()))?;
            bytes.push(value);
        }
        Ok(PdfValue::String(PdfString(bytes)))
    }

    /// Parse a `[…]` array of values. Assumes the cursor sits on `[`.
    fn parse_array(&mut self) -> PdfResult<PdfValue> {
        self.position += 1;
        let mut values = Vec::new();
        loop {
            self.skip_ws_and_comments();
            match self.current() {
                Some(b']') => {
                    self.position += 1;
                    break;
                }
                Some(_) => values.push(self.parse_value()?),
                None => return Err(PdfError::Parse("unterminated array".to_string())),
            }
        }
        Ok(PdfValue::Array(values))
    }

    /// Parse a `<< /Key value … >>` dictionary. Assumes the cursor sits
    /// on the opening `<<`.
    fn parse_dictionary(&mut self) -> PdfResult<PdfValue> {
        self.position += 2;
        let mut dictionary = PdfDictionary::new();
        loop {
            self.skip_ws_and_comments();
            if self.current() == Some(b'>') && self.bytes.get(self.position + 1) == Some(&b'>') {
                self.position += 2;
                break;
            }
            let key = match self.parse_name()? {
                PdfValue::Name(name) => name,
                // parse_name only ever returns PdfValue::Name.
                _ => unreachable!(),
            };
            let value = self.parse_value()?;
            dictionary.insert(key, value);
        }
        Ok(PdfValue::Dictionary(dictionary))
    }

    /// Parse a numeric token, resolving the three-token form
    /// `<num> <gen> R` into a reference. If the lookahead does not
    /// match a reference, the cursor is rewound past only the first
    /// token and it is returned as an integer or real number.
    fn parse_number_or_reference(&mut self) -> PdfResult<PdfValue> {
        let first_token = self.parse_token()?;
        // Anything with a decimal point or exponent is a real number
        // and can never begin a reference.
        if first_token.contains('.') || first_token.contains(['e', 'E']) {
            return first_token
                .parse::<f64>()
                .map(PdfValue::Number)
                .map_err(|_| PdfError::Parse(format!("invalid number token: {first_token}")));
        }

        // Remember where the first token ended so we can rewind.
        let checkpoint = self.position;
        self.skip_ws_and_comments();
        if let Ok(second_token) = self.parse_token() {
            self.skip_ws_and_comments();
            if self.current() == Some(b'R')
                && second_token
                    .chars()
                    .all(|character| character.is_ascii_digit())
            {
                self.position += 1;
                return Ok(PdfValue::Reference(ObjectRef::new(
                    first_token
                        .parse::<u32>()
                        .map_err(|_| PdfError::Parse("invalid reference object".to_string()))?,
                    second_token
                        .parse::<u16>()
                        .map_err(|_| PdfError::Parse("invalid reference generation".to_string()))?,
                )));
            }
        }
        // Not a reference: rewind and interpret the first token alone.
        self.position = checkpoint;
        first_token
            .parse::<i64>()
            .map(PdfValue::Integer)
            .or_else(|_| first_token.parse::<f64>().map(PdfValue::Number))
            .map_err(|_| PdfError::Parse(format!("invalid number token: {first_token}")))
    }
}
841
842fn is_whitespace(byte: u8) -> bool {
843 matches!(byte, b' ' | b'\t' | b'\n' | b'\r' | 0x0C | 0x00)
844}
845
846fn is_delimiter(byte: u8) -> bool {
847 matches!(
848 byte,
849 b'(' | b')' | b'<' | b'>' | b'[' | b']' | b'{' | b'}' | b'/' | b'%'
850 )
851}
852
#[cfg(test)]
mod tests {
    use super::parse_pdf;
    use crate::error::PdfError;
    use crate::types::PdfObject;

    // Happy path: a minimal single-page fixture parses end to end.
    #[test]
    fn parses_simple_pdf_fixture() {
        let bytes = include_bytes!("../../../tests/fixtures/simple-text.pdf");
        let document = parse_pdf(bytes).expect("fixture should parse");
        assert_eq!(document.pages.len(), 1);
    }

    // Incremental updates: the newest revision of an object must win,
    // so the updated content stream replaces the original one.
    #[test]
    fn parses_incremental_update_fixture() {
        let bytes = include_bytes!("../../../tests/fixtures/incremental-update.pdf");
        let document = parse_pdf(bytes).expect("incremental fixture should parse");
        assert_eq!(document.pages.len(), 1);

        let content_refs = &document.pages[0].content_refs;
        assert!(!content_refs.is_empty());
        let content_obj = document.file.objects.get(&content_refs[0]).unwrap();
        let stream_data = match content_obj {
            PdfObject::Stream(stream) => String::from_utf8_lossy(&stream.data),
            _ => panic!("expected stream object for page content"),
        };
        assert!(
            stream_data.contains("Updated Secret"),
            "content stream should contain updated text"
        );
        assert!(
            !stream_data.contains("Original Secret"),
            "content stream should not contain original text"
        );
    }

    // A trailer whose /Prev points back at its own xref section must
    // not send the parser into an infinite loop.
    #[test]
    fn circular_prev_chain_does_not_loop() {
        let mut pdf = Vec::new();
        pdf.extend_from_slice(b"%PDF-1.4\n");

        let obj1_offset = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

        let obj2_offset = pdf.len();
        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 0 /Kids [] >>\nendobj\n");

        let xref_offset = pdf.len();
        pdf.extend_from_slice(b"xref\n0 3\n");
        pdf.extend_from_slice(b"0000000000 65535 f \n");
        pdf.extend_from_slice(format!("{:010} 00000 n \n", obj1_offset).as_bytes());
        pdf.extend_from_slice(format!("{:010} 00000 n \n", obj2_offset).as_bytes());
        pdf.extend_from_slice(b"trailer\n");
        pdf.extend_from_slice(
            // /Prev points at this very section: a deliberate cycle.
            format!("<< /Size 3 /Root 1 0 R /Prev {} >>\n", xref_offset).as_bytes(),
        );
        pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_offset).as_bytes());

        let document = parse_pdf(&pdf).expect("circular Prev should be tolerated");
        assert_eq!(document.pages.len(), 0);
    }

    // Builds an xref STREAM (PDF 1.5) by hand with W = [1 2 1] and no
    // filter, then checks the file parses through the stream path.
    #[test]
    fn parses_uncompressed_xref_stream() {
        let mut pdf: Vec<u8> = Vec::new();
        pdf.extend_from_slice(b"%PDF-1.5\n");

        let obj1_offset = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
        let obj2_offset = pdf.len();
        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 0 /Kids [] >>\nendobj\n");

        // Pack one xref row: 1-byte type, 2-byte offset, 1-byte field 3.
        let row_for = |t: u8, off: u16, generation: u8| {
            let mut row = [0u8; 4];
            row[0] = t;
            row[1] = (off >> 8) as u8;
            row[2] = off as u8;
            row[3] = generation;
            row
        };
        let mut body = Vec::new();
        body.extend_from_slice(&row_for(0, 0, 0xFF));
        body.extend_from_slice(&row_for(1, obj1_offset as u16, 0));
        body.extend_from_slice(&row_for(1, obj2_offset as u16, 0));
        // Placeholder row for the xref stream object itself (object 3);
        // patched in place below once its offset is known.
        body.extend_from_slice(&row_for(1, 0, 0));
        let xref_obj_offset = pdf.len();
        let self_offset = xref_obj_offset as u16;
        body[12] = 1;
        body[13] = (self_offset >> 8) as u8;
        body[14] = self_offset as u8;
        body[15] = 0;

        let stream_dict = format!(
            "<< /Type /XRef /Size 4 /W [1 2 1] /Root 1 0 R /Length {} >>",
            body.len()
        );
        pdf.extend_from_slice(format!("3 0 obj\n{stream_dict}\nstream\n").as_bytes());
        pdf.extend_from_slice(&body);
        pdf.extend_from_slice(b"\nendstream\nendobj\n");
        pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_obj_offset).as_bytes());

        let document = parse_pdf(&pdf).expect("xref stream fixture should parse");
        assert_eq!(document.pages.len(), 0);
        assert!(document.file.objects.len() >= 2);
    }

    // A member object (2 0, the Pages node) stored inside a Flate
    // compressed ObjStm, located via a type-2 entry in an xref stream.
    #[test]
    fn parses_object_stream_via_xref_stream() {
        use flate2::{Compression, write::ZlibEncoder};
        use std::io::Write;

        let mut pdf: Vec<u8> = Vec::new();
        pdf.extend_from_slice(b"%PDF-1.5\n");

        let obj1_offset = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

        // ObjStm payload: header "2 0 " (object number, relative
        // offset) followed by the serialized member value.
        let member_payload = b"<< /Type /Pages /Count 0 /Kids [] >>";
        let header = b"2 0 ";
        let first = header.len();
        let mut decompressed = Vec::new();
        decompressed.extend_from_slice(header);
        decompressed.extend_from_slice(member_payload);

        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(&decompressed).unwrap();
        let compressed = encoder.finish().unwrap();

        let obj3_offset = pdf.len();
        let objstm_dict = format!(
            "<< /Type /ObjStm /N 1 /First {} /Filter /FlateDecode /Length {} >>",
            first,
            compressed.len()
        );
        pdf.extend_from_slice(format!("3 0 obj\n{objstm_dict}\nstream\n").as_bytes());
        pdf.extend_from_slice(&compressed);
        pdf.extend_from_slice(b"\nendstream\nendobj\n");

        // Pack one xref row: 1-byte type, 3-byte field 2, 1-byte field 3.
        let row_for = |t: u8, a: u32, b: u16| {
            let mut row = [0u8; 5];
            row[0] = t;
            row[1] = (a >> 16) as u8;
            row[2] = (a >> 8) as u8;
            row[3] = a as u8;
            row[4] = b as u8;
            row
        };

        let obj4_offset = pdf.len();
        let mut body = Vec::new();
        body.extend_from_slice(&row_for(0, 0, 0xFF));
        body.extend_from_slice(&row_for(1, obj1_offset as u32, 0));
        // Type-2 row: object 2 lives at index 0 of stream object 3.
        body.extend_from_slice(&row_for(2, 3, 0));
        body.extend_from_slice(&row_for(1, obj3_offset as u32, 0));
        body.extend_from_slice(&row_for(1, obj4_offset as u32, 0));

        let stream_dict = format!(
            "<< /Type /XRef /Size 5 /W [1 3 1] /Root 1 0 R /Length {} >>",
            body.len()
        );
        pdf.extend_from_slice(format!("4 0 obj\n{stream_dict}\nstream\n").as_bytes());
        pdf.extend_from_slice(&body);
        pdf.extend_from_slice(b"\nendstream\nendobj\n");
        pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", obj4_offset).as_bytes());

        let document = parse_pdf(&pdf).expect("ObjStm fixture should parse");
        assert_eq!(document.pages.len(), 0);
        let pages_ref = document.catalog.pages_ref;
        let pages_dict = document.file.get_dictionary(pages_ref).unwrap();
        assert_eq!(pages_dict.get("Type").and_then(|v| v.as_name()), Some("Pages"));
    }

    // Same layout as above, but the ObjStm member is itself declared as
    // /Type /ObjStm — the parser must reject the nested stream.
    #[test]
    fn rejects_nested_object_stream() {
        use flate2::{Compression, write::ZlibEncoder};
        use std::io::Write;

        let mut pdf: Vec<u8> = Vec::new();
        pdf.extend_from_slice(b"%PDF-1.5\n");

        let obj1_offset = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

        // The member value claims to be another ObjStm dictionary.
        let member_payload = b"<< /Type /ObjStm /N 0 /First 0 /Length 0 >>";
        let header = b"2 0 ";
        let first = header.len();
        let mut decompressed = Vec::new();
        decompressed.extend_from_slice(header);
        decompressed.extend_from_slice(member_payload);

        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(&decompressed).unwrap();
        let compressed = encoder.finish().unwrap();

        let obj3_offset = pdf.len();
        let objstm_dict = format!(
            "<< /Type /ObjStm /N 1 /First {} /Filter /FlateDecode /Length {} >>",
            first,
            compressed.len()
        );
        pdf.extend_from_slice(format!("3 0 obj\n{objstm_dict}\nstream\n").as_bytes());
        pdf.extend_from_slice(&compressed);
        pdf.extend_from_slice(b"\nendstream\nendobj\n");

        let row_for = |t: u8, a: u32, b: u16| {
            let mut row = [0u8; 5];
            row[0] = t;
            row[1] = (a >> 16) as u8;
            row[2] = (a >> 8) as u8;
            row[3] = a as u8;
            row[4] = b as u8;
            row
        };

        let obj4_offset = pdf.len();
        let mut body = Vec::new();
        body.extend_from_slice(&row_for(0, 0, 0xFF));
        body.extend_from_slice(&row_for(1, obj1_offset as u32, 0));
        body.extend_from_slice(&row_for(2, 3, 0));
        body.extend_from_slice(&row_for(1, obj3_offset as u32, 0));
        body.extend_from_slice(&row_for(1, obj4_offset as u32, 0));

        let stream_dict = format!(
            "<< /Type /XRef /Size 5 /W [1 3 1] /Root 1 0 R /Length {} >>",
            body.len()
        );
        pdf.extend_from_slice(format!("4 0 obj\n{stream_dict}\nstream\n").as_bytes());
        pdf.extend_from_slice(&body);
        pdf.extend_from_slice(b"\nendstream\nendobj\n");
        pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", obj4_offset).as_bytes());

        match parse_pdf(&pdf) {
            Err(PdfError::Unsupported(message)) => assert!(
                message.contains("nested object streams"),
                "got: {message}"
            ),
            other => panic!("expected Unsupported, got: {other:?}"),
        }
    }
}