1use std::collections::{BTreeMap, BTreeSet};
2
3use crate::document::build_document;
4use crate::error::{PdfError, PdfResult};
5use crate::types::{
6 ObjectRef, PdfDictionary, PdfFile, PdfObject, PdfStream, PdfString, PdfValue, XrefEntry,
7};
8
9pub fn parse_pdf(bytes: &[u8]) -> PdfResult<crate::document::ParsedDocument> {
10 let version = parse_header(bytes)?;
11 let startxref = find_startxref(bytes)?;
12 let (xref, trailer) = parse_xref_table(bytes, startxref)?;
13
14 let mut objects = BTreeMap::new();
15 let mut max_object_number = 0;
16 for (object_ref, entry) in xref {
17 if !entry.in_use {
18 continue;
19 }
20 if object_ref.object_number == 0 {
21 continue;
22 }
23 let object = parse_indirect_object(bytes, entry.offset)?;
24 max_object_number = max_object_number.max(object_ref.object_number);
25 objects.insert(object_ref, object);
26 }
27 let file = PdfFile {
28 version,
29 objects,
30 trailer,
31 max_object_number,
32 };
33 build_document(file)
34}
35
36fn parse_header(bytes: &[u8]) -> PdfResult<String> {
37 if !bytes.starts_with(b"%PDF-") {
38 return Err(PdfError::Parse("missing PDF header".to_string()));
39 }
40 let line_end = bytes
41 .iter()
42 .position(|byte| *byte == b'\n' || *byte == b'\r')
43 .ok_or_else(|| PdfError::Parse("unterminated header".to_string()))?;
44 Ok(String::from_utf8_lossy(&bytes[5..line_end])
45 .trim()
46 .to_string())
47}
48
49fn find_startxref(bytes: &[u8]) -> PdfResult<usize> {
50 let marker = b"startxref";
51 let position = bytes
52 .windows(marker.len())
53 .rposition(|window| window == marker)
54 .ok_or_else(|| PdfError::Parse("missing startxref".to_string()))?;
55 let mut parser = Cursor::new(bytes, position + marker.len());
56 parser.skip_ws_and_comments();
57 parser.parse_usize()
58}
59
60fn parse_xref_table(
61 bytes: &[u8],
62 start_offset: usize,
63) -> PdfResult<(BTreeMap<ObjectRef, XrefEntry>, PdfDictionary)> {
64 let mut merged_entries = BTreeMap::new();
65 let mut newest_trailer = None;
66 let mut visited = BTreeSet::new();
67 let mut offset = start_offset;
68
69 loop {
70 if !visited.insert(offset) {
71 return Err(PdfError::Parse("circular Prev chain".to_string()));
72 }
73 let (section_entries, trailer) = parse_xref_section(bytes, offset)?;
74
75 if trailer.contains_key("XRefStm") {
76 return Err(PdfError::Unsupported(
77 "xref streams are not supported".to_string(),
78 ));
79 }
80
81 for (object_ref, entry) in section_entries {
83 merged_entries.entry(object_ref).or_insert(entry);
84 }
85
86 if newest_trailer.is_none() {
87 newest_trailer = Some(trailer.clone());
88 }
89
90 match trailer.get("Prev").and_then(PdfValue::as_integer) {
91 Some(prev_offset) => offset = prev_offset as usize,
92 None => break,
93 }
94 }
95
96 Ok((merged_entries, newest_trailer.unwrap()))
97}
98
99fn parse_xref_section(
100 bytes: &[u8],
101 offset: usize,
102) -> PdfResult<(BTreeMap<ObjectRef, XrefEntry>, PdfDictionary)> {
103 let mut cursor = Cursor::new(bytes, offset);
104 cursor.expect_keyword("xref")?;
105 let mut entries = BTreeMap::new();
106 loop {
107 cursor.skip_ws_and_comments();
108 if cursor.peek_keyword("trailer") {
109 break;
110 }
111 let start = cursor.parse_u32()?;
112 cursor.skip_ws_and_comments();
113 let count = cursor.parse_u32()?;
114 cursor.skip_line_breaks();
115 for index in 0..count {
116 let line = cursor.read_line()?;
117 if line.len() < 17 {
118 return Err(PdfError::Parse("invalid xref entry".to_string()));
119 }
120 let parts = String::from_utf8_lossy(line).trim().to_string();
121 let mut fields = parts.split_whitespace();
122 let entry_offset = fields
123 .next()
124 .ok_or_else(|| PdfError::Parse("invalid xref entry offset".to_string()))?
125 .parse::<usize>()
126 .map_err(|_| PdfError::Parse("invalid xref entry offset".to_string()))?;
127 let generation = fields
128 .next()
129 .ok_or_else(|| PdfError::Parse("invalid xref generation".to_string()))?
130 .parse::<u16>()
131 .map_err(|_| PdfError::Parse("invalid xref generation".to_string()))?;
132 let flag = fields
133 .next()
134 .ok_or_else(|| PdfError::Parse("invalid xref flag".to_string()))?;
135 let object_number = start
136 .checked_add(index)
137 .ok_or_else(|| PdfError::Parse("xref object number overflow".to_string()))?;
138 entries.insert(
139 ObjectRef::new(object_number, generation),
140 XrefEntry {
141 offset: entry_offset,
142 generation,
143 in_use: flag == "n",
144 },
145 );
146 }
147 }
148 cursor.expect_keyword("trailer")?;
149 let trailer = match cursor.parse_value()? {
150 PdfValue::Dictionary(dictionary) => dictionary,
151 _ => return Err(PdfError::Parse("trailer is not a dictionary".to_string())),
152 };
153 Ok((entries, trailer))
154}
155
156fn parse_indirect_object(bytes: &[u8], offset: usize) -> PdfResult<PdfObject> {
157 let mut cursor = Cursor::new(bytes, offset);
158 let _object_number = cursor.parse_u32()?;
159 cursor.skip_ws_and_comments();
160 let _generation = cursor.parse_u16()?;
161 cursor.skip_ws_and_comments();
162 cursor.expect_keyword("obj")?;
163 cursor.skip_ws_and_comments();
164
165 let value = cursor.parse_value()?;
166 cursor.skip_ws_and_comments();
167 if matches!(value, PdfValue::Dictionary(_)) && cursor.peek_keyword("stream") {
168 let dict = match value {
169 PdfValue::Dictionary(dict) => dict,
170 _ => unreachable!(),
171 };
172 cursor.expect_keyword("stream")?;
173 cursor.consume_stream_line_break();
174 let stream_start = cursor.position;
175 let length_hint = dict
181 .get("Length")
182 .and_then(PdfValue::as_integer)
183 .filter(|&len| len >= 0)
184 .map(|len| len as usize);
185 let (data, endstream_pos) = match length_hint {
186 Some(len) if stream_start + len <= bytes.len() => {
187 let mut check = stream_start + len;
190 while check < bytes.len() && matches!(bytes[check], b'\r' | b'\n') {
191 check += 1;
192 }
193 if bytes.get(check..check + 9) == Some(b"endstream") {
194 (bytes[stream_start..stream_start + len].to_vec(), check)
195 } else {
196 let pos = find_keyword(bytes, stream_start, b"endstream")
198 .ok_or_else(|| PdfError::Parse("stream missing endstream".to_string()))?;
199 (bytes[stream_start..pos].to_vec(), pos)
200 }
201 }
202 _ => {
203 let pos = find_keyword(bytes, stream_start, b"endstream")
204 .ok_or_else(|| PdfError::Parse("stream missing endstream".to_string()))?;
205 (bytes[stream_start..pos].to_vec(), pos)
206 }
207 };
208 cursor.position = endstream_pos;
209 cursor.expect_keyword("endstream")?;
210 cursor.skip_ws_and_comments();
211 cursor.expect_keyword("endobj")?;
212 Ok(PdfObject::Stream(PdfStream { dict, data }))
213 } else {
214 cursor.expect_keyword("endobj")?;
215 Ok(PdfObject::Value(value))
216 }
217}
218
/// Returns the absolute index of the first occurrence of `keyword` in
/// `bytes` at or after `start`.
///
/// Returns `None` when there is no match, when `start` lies beyond the end
/// of `bytes`, or when `keyword` is empty (instead of panicking in the
/// latter two cases).
fn find_keyword(bytes: &[u8], start: usize, keyword: &[u8]) -> Option<usize> {
    // `windows(0)` panics, and an empty keyword has no meaningful position.
    if keyword.is_empty() {
        return None;
    }
    bytes
        .get(start..)?
        .windows(keyword.len())
        .position(|window| window == keyword)
        .map(|relative| start + relative)
}
225
/// A read position over a borrowed byte buffer, used by the tokenising and
/// value-parsing methods in this file.
struct Cursor<'a> {
    /// The complete buffer being parsed.
    bytes: &'a [u8],
    /// Index of the next byte to read; it may be at or beyond `bytes.len()`,
    /// in which case the cursor is at end of input.
    position: usize,
}
230
231impl<'a> Cursor<'a> {
232 fn new(bytes: &'a [u8], position: usize) -> Self {
233 Self { bytes, position }
234 }
235
236 fn eof(&self) -> bool {
237 self.position >= self.bytes.len()
238 }
239
240 fn current(&self) -> Option<u8> {
241 self.bytes.get(self.position).copied()
242 }
243
244 fn skip_ws_and_comments(&mut self) {
245 while let Some(byte) = self.current() {
246 match byte {
247 b' ' | b'\t' | b'\n' | b'\r' | 0x0C | 0x00 => self.position += 1,
248 b'%' => {
249 while let Some(next) = self.current() {
250 self.position += 1;
251 if next == b'\n' || next == b'\r' {
252 break;
253 }
254 }
255 }
256 _ => break,
257 }
258 }
259 }
260
261 fn skip_line_breaks(&mut self) {
262 while matches!(self.current(), Some(b'\n' | b'\r')) {
263 self.position += 1;
264 }
265 }
266
267 fn read_line(&mut self) -> PdfResult<&'a [u8]> {
268 if self.eof() {
269 return Err(PdfError::Parse("unexpected end of file".to_string()));
270 }
271 let start = self.position;
272 while let Some(byte) = self.current() {
273 if byte == b'\n' || byte == b'\r' {
274 let end = self.position;
275 self.skip_line_breaks();
276 return Ok(&self.bytes[start..end]);
277 }
278 self.position += 1;
279 }
280 Ok(&self.bytes[start..self.position])
281 }
282
283 fn peek_keyword(&self, keyword: &str) -> bool {
284 self.bytes
285 .get(self.position..self.position + keyword.len())
286 .map(|slice| slice == keyword.as_bytes())
287 .unwrap_or(false)
288 }
289
290 fn expect_keyword(&mut self, keyword: &str) -> PdfResult<()> {
291 self.skip_ws_and_comments();
292 if self.peek_keyword(keyword) {
293 self.position += keyword.len();
294 Ok(())
295 } else {
296 Err(PdfError::Parse(format!("expected keyword {keyword}")))
297 }
298 }
299
300 fn consume_stream_line_break(&mut self) {
301 if self.current() == Some(b'\r') {
302 self.position += 1;
303 }
304 if self.current() == Some(b'\n') {
305 self.position += 1;
306 }
307 }
308
309 fn parse_u32(&mut self) -> PdfResult<u32> {
310 let token = self.parse_token()?;
311 token
312 .parse::<u32>()
313 .map_err(|_| PdfError::Parse(format!("invalid integer token: {token}")))
314 }
315
316 fn parse_u16(&mut self) -> PdfResult<u16> {
317 let token = self.parse_token()?;
318 token
319 .parse::<u16>()
320 .map_err(|_| PdfError::Parse(format!("invalid integer token: {token}")))
321 }
322
323 fn parse_usize(&mut self) -> PdfResult<usize> {
324 let token = self.parse_token()?;
325 token
326 .parse::<usize>()
327 .map_err(|_| PdfError::Parse(format!("invalid offset token: {token}")))
328 }
329
330 fn parse_token(&mut self) -> PdfResult<String> {
331 self.skip_ws_and_comments();
332 let start = self.position;
333 while let Some(byte) = self.current() {
334 if is_delimiter(byte) || is_whitespace(byte) {
335 break;
336 }
337 self.position += 1;
338 }
339 if self.position == start {
340 return Err(PdfError::Parse("expected token".to_string()));
341 }
342 Ok(String::from_utf8_lossy(&self.bytes[start..self.position]).to_string())
343 }
344
345 fn parse_value(&mut self) -> PdfResult<PdfValue> {
346 self.skip_ws_and_comments();
347 match self.current() {
348 Some(b'/') => self.parse_name(),
349 Some(b'(') => self.parse_literal_string(),
350 Some(b'[') => self.parse_array(),
351 Some(b'<') if self.bytes.get(self.position + 1) == Some(&b'<') => {
352 self.parse_dictionary()
353 }
354 Some(b'<') => self.parse_hex_string(),
355 Some(b't') if self.peek_keyword("true") => {
356 self.position += 4;
357 Ok(PdfValue::Bool(true))
358 }
359 Some(b'f') if self.peek_keyword("false") => {
360 self.position += 5;
361 Ok(PdfValue::Bool(false))
362 }
363 Some(b'n') if self.peek_keyword("null") => {
364 self.position += 4;
365 Ok(PdfValue::Null)
366 }
367 Some(_) => self.parse_number_or_reference(),
368 None => Err(PdfError::Parse("unexpected end of file".to_string())),
369 }
370 }
371
372 fn parse_name(&mut self) -> PdfResult<PdfValue> {
373 self.position += 1;
374 let mut raw = Vec::new();
375 while let Some(byte) = self.current() {
376 if is_delimiter(byte) || is_whitespace(byte) {
377 break;
378 }
379 if byte == b'#' {
380 let high =
381 self.bytes.get(self.position + 1).copied().ok_or_else(|| {
382 PdfError::Parse("truncated #XX escape in name".to_string())
383 })?;
384 let low =
385 self.bytes.get(self.position + 2).copied().ok_or_else(|| {
386 PdfError::Parse("truncated #XX escape in name".to_string())
387 })?;
388 let decoded = u8::from_str_radix(&format!("{}{}", high as char, low as char), 16)
389 .map_err(|_| {
390 PdfError::Parse("invalid #XX hex escape in name".to_string())
391 })?;
392 raw.push(decoded);
393 self.position += 3;
394 } else {
395 raw.push(byte);
396 self.position += 1;
397 }
398 }
399 Ok(PdfValue::Name(String::from_utf8_lossy(&raw).to_string()))
400 }
401
402 fn parse_literal_string(&mut self) -> PdfResult<PdfValue> {
403 self.position += 1;
404 let mut output = Vec::new();
405 let mut depth = 1usize;
406 while let Some(byte) = self.current() {
407 self.position += 1;
408 match byte {
409 b'\\' => {
410 let escaped = self
411 .current()
412 .ok_or_else(|| PdfError::Parse("unterminated string escape".to_string()))?;
413 self.position += 1;
414 match escaped {
415 b'n' => output.push(b'\n'),
416 b'r' => output.push(b'\r'),
417 b't' => output.push(b'\t'),
418 b'b' => output.push(0x08),
419 b'f' => output.push(0x0C),
420 b'(' | b')' | b'\\' => output.push(escaped),
421 b'\n' => {}
422 b'\r' => {
423 if self.current() == Some(b'\n') {
424 self.position += 1;
425 }
426 }
427 b'0'..=b'7' => {
428 let mut octal = vec![escaped];
429 for _ in 0..2 {
430 match self.current() {
431 Some(next @ b'0'..=b'7') => {
432 octal.push(next);
433 self.position += 1;
434 }
435 _ => break,
436 }
437 }
438 let value =
440 u16::from_str_radix(std::str::from_utf8(&octal).unwrap_or("0"), 8)
441 .unwrap_or(0);
442 output.push((value % 256) as u8);
443 }
444 other => output.push(other),
445 }
446 }
447 b'(' => {
448 depth += 1;
449 output.push(byte);
450 }
451 b')' => {
452 depth -= 1;
453 if depth == 0 {
454 return Ok(PdfValue::String(PdfString(output)));
455 }
456 output.push(byte);
457 }
458 _ => output.push(byte),
459 }
460 }
461 Err(PdfError::Parse("unterminated literal string".to_string()))
462 }
463
464 fn parse_hex_string(&mut self) -> PdfResult<PdfValue> {
465 self.position += 1;
466 let start = self.position;
467 while self.current() != Some(b'>') {
468 if self.eof() {
469 return Err(PdfError::Parse("unterminated hex string".to_string()));
470 }
471 self.position += 1;
472 }
473 let raw = String::from_utf8_lossy(&self.bytes[start..self.position])
474 .chars()
475 .filter(|character| !character.is_whitespace())
476 .collect::<String>();
477 self.position += 1;
478 let mut chars = raw.chars().collect::<Vec<_>>();
479 if chars.len() % 2 != 0 {
480 chars.push('0');
481 }
482 let mut bytes = Vec::with_capacity(chars.len() / 2);
483 for pair in chars.chunks(2) {
484 let value = u8::from_str_radix(&pair.iter().collect::<String>(), 16)
485 .map_err(|_| PdfError::Parse("invalid hex string".to_string()))?;
486 bytes.push(value);
487 }
488 Ok(PdfValue::String(PdfString(bytes)))
489 }
490
491 fn parse_array(&mut self) -> PdfResult<PdfValue> {
492 self.position += 1;
493 let mut values = Vec::new();
494 loop {
495 self.skip_ws_and_comments();
496 match self.current() {
497 Some(b']') => {
498 self.position += 1;
499 break;
500 }
501 Some(_) => values.push(self.parse_value()?),
502 None => return Err(PdfError::Parse("unterminated array".to_string())),
503 }
504 }
505 Ok(PdfValue::Array(values))
506 }
507
508 fn parse_dictionary(&mut self) -> PdfResult<PdfValue> {
509 self.position += 2;
510 let mut dictionary = PdfDictionary::new();
511 loop {
512 self.skip_ws_and_comments();
513 if self.current() == Some(b'>') && self.bytes.get(self.position + 1) == Some(&b'>') {
514 self.position += 2;
515 break;
516 }
517 let key = match self.parse_name()? {
518 PdfValue::Name(name) => name,
519 _ => unreachable!(),
520 };
521 let value = self.parse_value()?;
522 dictionary.insert(key, value);
523 }
524 Ok(PdfValue::Dictionary(dictionary))
525 }
526
527 fn parse_number_or_reference(&mut self) -> PdfResult<PdfValue> {
528 let first_token = self.parse_token()?;
529 if first_token.contains('.') || first_token.contains(['e', 'E']) {
530 return first_token
531 .parse::<f64>()
532 .map(PdfValue::Number)
533 .map_err(|_| PdfError::Parse(format!("invalid number token: {first_token}")));
534 }
535
536 let checkpoint = self.position;
537 self.skip_ws_and_comments();
538 if let Ok(second_token) = self.parse_token() {
539 self.skip_ws_and_comments();
540 if self.current() == Some(b'R')
541 && second_token
542 .chars()
543 .all(|character| character.is_ascii_digit())
544 {
545 self.position += 1;
546 return Ok(PdfValue::Reference(ObjectRef::new(
547 first_token
548 .parse::<u32>()
549 .map_err(|_| PdfError::Parse("invalid reference object".to_string()))?,
550 second_token
551 .parse::<u16>()
552 .map_err(|_| PdfError::Parse("invalid reference generation".to_string()))?,
553 )));
554 }
555 }
556 self.position = checkpoint;
557 first_token
558 .parse::<i64>()
559 .map(PdfValue::Integer)
560 .or_else(|_| first_token.parse::<f64>().map(PdfValue::Number))
561 .map_err(|_| PdfError::Parse(format!("invalid number token: {first_token}")))
562 }
563}
564
/// Reports whether `byte` is one of the six whitespace bytes recognised by
/// this parser: space, tab, line feed, carriage return, form feed, or NUL.
fn is_whitespace(byte: u8) -> bool {
    const WHITESPACE: &[u8] = b" \t\n\r\x0C\x00";
    WHITESPACE.contains(&byte)
}
568
/// Reports whether `byte` is one of the delimiter bytes that end a token:
/// `(`, `)`, `<`, `>`, `[`, `]`, `{`, `}`, `/`, or `%`.
fn is_delimiter(byte: u8) -> bool {
    const DELIMITERS: &[u8] = b"()<>[]{}/%";
    DELIMITERS.contains(&byte)
}
575
#[cfg(test)]
mod tests {
    use super::parse_pdf;
    use crate::error::PdfError;
    use crate::types::PdfObject;

    /// The minimal fixture parses and exposes exactly one page.
    #[test]
    fn parses_simple_pdf_fixture() {
        let document = parse_pdf(include_bytes!("../../../tests/fixtures/simple-text.pdf"))
            .expect("fixture should parse");
        assert_eq!(document.pages.len(), 1);
    }

    /// After an incremental update, the page content must come from the
    /// newest revision of the content stream.
    #[test]
    fn parses_incremental_update_fixture() {
        let document = parse_pdf(include_bytes!(
            "../../../tests/fixtures/incremental-update.pdf"
        ))
        .expect("incremental fixture should parse");
        assert_eq!(document.pages.len(), 1);

        let page = &document.pages[0];
        assert!(!page.content_refs.is_empty());

        let first_ref = &page.content_refs[0];
        let text = match document.file.objects.get(first_ref).unwrap() {
            PdfObject::Stream(stream) => String::from_utf8_lossy(&stream.data),
            _ => panic!("expected stream object for page content"),
        };
        assert!(
            text.contains("Updated Secret"),
            "content stream should contain updated text"
        );
        assert!(
            !text.contains("Original Secret"),
            "content stream should not contain original text"
        );
    }

    /// A trailer whose `/Prev` points back at its own xref section must be
    /// rejected instead of looping forever.
    #[test]
    fn rejects_circular_prev_chain() {
        let mut pdf: Vec<u8> = b"%PDF-1.4\n".to_vec();

        let obj1_offset = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

        let obj2_offset = pdf.len();
        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 0 /Kids [] >>\nendobj\n");

        // The xref section starts here and its trailer's /Prev refers to itself.
        let xref_offset = pdf.len();
        let tail = [
            "xref\n0 3\n".to_string(),
            "0000000000 65535 f \n".to_string(),
            format!("{obj1_offset:010} 00000 n \n"),
            format!("{obj2_offset:010} 00000 n \n"),
            "trailer\n".to_string(),
            format!("<< /Size 3 /Root 1 0 R /Prev {xref_offset} >>\n"),
            format!("startxref\n{xref_offset}\n%%EOF\n"),
        ]
        .concat();
        pdf.extend_from_slice(tail.as_bytes());

        let message = match parse_pdf(&pdf) {
            Err(PdfError::Parse(message)) => message,
            other => panic!("expected Parse error, got: {other:?}"),
        };
        assert!(
            message.contains("circular Prev chain"),
            "expected circular chain error, got: {message}"
        );
    }
}