use std::collections::{BTreeMap, BTreeSet};
use std::convert::TryFrom;

use crate::crypto::{BytesKind, StandardSecurityHandler};
use crate::document::build_document;
use crate::error::{PdfError, PdfResult};
use crate::stream::decode_stream;
use crate::types::{
    ObjectRef, PdfDictionary, PdfFile, PdfObject, PdfStream, PdfString, PdfValue, XrefEntry,
};
10
11pub fn parse_pdf(bytes: &[u8]) -> PdfResult<crate::document::ParsedDocument> {
15 parse_pdf_with_password(bytes, b"")
16}
17
18pub fn parse_pdf_with_password(
24 bytes: &[u8],
25 password: &[u8],
26) -> PdfResult<crate::document::ParsedDocument> {
27 let version = parse_header(bytes)?;
28 let startxref = find_startxref(bytes)?;
29 let (xref, mut trailer) = parse_xref_table(bytes, startxref)?;
30
31 let mut objects = BTreeMap::new();
32 let mut max_object_number = 0;
33 let mut compressed: Vec<(ObjectRef, u32, u32)> = Vec::new();
34
35 for (object_ref, entry) in &xref {
36 match entry {
37 XrefEntry::Free => {}
38 XrefEntry::Uncompressed { offset, .. } => {
39 if object_ref.object_number == 0 {
40 continue;
41 }
42 let object = parse_indirect_object(bytes, *offset)?;
43 max_object_number = max_object_number.max(object_ref.object_number);
44 objects.insert(*object_ref, object);
45 }
46 XrefEntry::Compressed {
47 stream_object_number,
48 index,
49 } => {
50 compressed.push((*object_ref, *stream_object_number, *index));
51 }
52 }
53 }
54
55 decrypt_document_if_encrypted(&mut objects, &mut trailer, password)?;
61
62 materialize_object_streams(&mut objects, &mut max_object_number, &compressed)?;
63
64 let file = PdfFile {
65 version,
66 objects,
67 trailer,
68 max_object_number,
69 };
70 build_document(file)
71}
72
73fn decrypt_document_if_encrypted(
74 objects: &mut BTreeMap<ObjectRef, PdfObject>,
75 trailer: &mut PdfDictionary,
76 password: &[u8],
77) -> PdfResult<()> {
78 let encrypt_ref = match trailer.get("Encrypt") {
79 Some(PdfValue::Reference(object_ref)) => *object_ref,
80 Some(PdfValue::Dictionary(_)) => {
81 return Err(PdfError::Unsupported(
82 "direct (non-indirect) /Encrypt dictionaries are not supported".to_string(),
83 ));
84 }
85 Some(_) => {
86 return Err(PdfError::Corrupt(
87 "trailer /Encrypt is not a reference".to_string(),
88 ));
89 }
90 None => return Ok(()),
91 };
92
93 let encrypt_dict = match objects.get(&encrypt_ref) {
94 Some(PdfObject::Value(PdfValue::Dictionary(dict))) => dict.clone(),
95 _ => {
96 return Err(PdfError::Corrupt(
97 "trailer /Encrypt does not point at a dictionary".to_string(),
98 ));
99 }
100 };
101
102 let id_first = extract_id_first(trailer)?;
103
104 let handler = StandardSecurityHandler::open(&encrypt_dict, &id_first, password)?
105 .ok_or(PdfError::InvalidPassword)?;
106
107 let refs: Vec<ObjectRef> = objects.keys().copied().collect();
108 for object_ref in refs {
109 if object_ref == encrypt_ref {
110 continue;
113 }
114 let object = objects
115 .get_mut(&object_ref)
116 .expect("ref obtained from map keys must still be present");
117 match object {
118 PdfObject::Stream(stream) => {
119 let type_name = stream.dict.get("Type").and_then(PdfValue::as_name);
123 let is_xref_stream = type_name == Some("XRef");
124 let is_exempt_metadata =
125 !handler.encrypts_metadata() && type_name == Some("Metadata");
126 decrypt_strings_in_dict(&mut stream.dict, &handler, object_ref)?;
127 if !is_xref_stream && !is_exempt_metadata {
128 stream.data =
129 handler.decrypt_bytes(&stream.data, object_ref, BytesKind::Stream)?;
130 }
131 }
132 PdfObject::Value(value) => {
133 decrypt_strings_in_value(value, &handler, object_ref)?;
134 }
135 }
136 }
137
138 trailer.remove("Encrypt");
139 Ok(())
140}
141
142fn extract_id_first(trailer: &PdfDictionary) -> PdfResult<Vec<u8>> {
143 match trailer.get("ID") {
144 Some(PdfValue::Array(entries)) => match entries.first() {
145 Some(PdfValue::String(value)) => Ok(value.0.clone()),
146 _ => Err(PdfError::Corrupt(
147 "trailer /ID[0] is not a string — cannot derive encryption key".to_string(),
148 )),
149 },
150 _ => Err(PdfError::Corrupt(
151 "encrypted PDF is missing the trailer /ID array required for key derivation"
152 .to_string(),
153 )),
154 }
155}
156
157fn decrypt_strings_in_value(
158 value: &mut PdfValue,
159 handler: &StandardSecurityHandler,
160 object_ref: ObjectRef,
161) -> PdfResult<()> {
162 match value {
163 PdfValue::String(string) => {
164 string.0 = handler.decrypt_bytes(&string.0, object_ref, BytesKind::String)?;
165 }
166 PdfValue::Array(items) => {
167 for item in items {
168 decrypt_strings_in_value(item, handler, object_ref)?;
169 }
170 }
171 PdfValue::Dictionary(dict) => {
172 decrypt_strings_in_dict(dict, handler, object_ref)?;
173 }
174 _ => {}
175 }
176 Ok(())
177}
178
179fn decrypt_strings_in_dict(
180 dict: &mut PdfDictionary,
181 handler: &StandardSecurityHandler,
182 object_ref: ObjectRef,
183) -> PdfResult<()> {
184 for value in dict.values_mut() {
185 decrypt_strings_in_value(value, handler, object_ref)?;
186 }
187 Ok(())
188}
189
190fn parse_header(bytes: &[u8]) -> PdfResult<String> {
191 if !bytes.starts_with(b"%PDF-") {
192 return Err(PdfError::Parse("missing PDF header".to_string()));
193 }
194 let line_end = bytes
195 .iter()
196 .position(|byte| *byte == b'\n' || *byte == b'\r')
197 .ok_or_else(|| PdfError::Parse("unterminated header".to_string()))?;
198 Ok(String::from_utf8_lossy(&bytes[5..line_end])
199 .trim()
200 .to_string())
201}
202
203fn find_startxref(bytes: &[u8]) -> PdfResult<usize> {
204 let marker = b"startxref";
205 let position = bytes
206 .windows(marker.len())
207 .rposition(|window| window == marker)
208 .ok_or_else(|| PdfError::Parse("missing startxref".to_string()))?;
209 let mut parser = Cursor::new(bytes, position + marker.len());
210 parser.skip_ws_and_comments();
211 parser.parse_usize()
212}
213
214fn parse_xref_table(
215 bytes: &[u8],
216 start_offset: usize,
217) -> PdfResult<(BTreeMap<ObjectRef, XrefEntry>, PdfDictionary)> {
218 let mut merged_entries: BTreeMap<ObjectRef, XrefEntry> = BTreeMap::new();
219 let mut newest_trailer: Option<PdfDictionary> = None;
220 let mut visited = BTreeSet::new();
221 let mut pending: Vec<usize> = vec![start_offset];
222
223 while let Some(offset) = pending.pop() {
224 if !visited.insert(offset) {
225 continue;
226 }
227 let section = parse_xref_section_at(bytes, offset)?;
228
229 for (object_ref, entry) in section.entries {
231 merged_entries.entry(object_ref).or_insert(entry);
232 }
233
234 if newest_trailer.is_none() {
235 newest_trailer = Some(section.trailer.clone());
236 }
237
238 if let Some(stm_offset) = section
239 .trailer
240 .get("XRefStm")
241 .and_then(PdfValue::as_integer)
242 {
243 pending.push(stm_offset as usize);
244 }
245 if let Some(prev_offset) = section.trailer.get("Prev").and_then(PdfValue::as_integer) {
246 pending.push(prev_offset as usize);
247 }
248 }
249
250 let trailer = newest_trailer
251 .ok_or_else(|| PdfError::Parse("xref chain produced no trailer".to_string()))?;
252 Ok((merged_entries, trailer))
253}
254
255struct XrefSection {
256 entries: BTreeMap<ObjectRef, XrefEntry>,
257 trailer: PdfDictionary,
258}
259
260fn parse_xref_section_at(bytes: &[u8], offset: usize) -> PdfResult<XrefSection> {
261 let mut probe = Cursor::new(bytes, offset);
262 probe.skip_ws_and_comments();
263 if probe.peek_keyword("xref") {
264 parse_classic_xref_section(bytes, offset)
265 } else {
266 parse_xref_stream_section(bytes, offset)
267 }
268}
269
270fn parse_classic_xref_section(bytes: &[u8], offset: usize) -> PdfResult<XrefSection> {
271 let mut cursor = Cursor::new(bytes, offset);
272 cursor.expect_keyword("xref")?;
273 let mut entries = BTreeMap::new();
274 loop {
275 cursor.skip_ws_and_comments();
276 if cursor.peek_keyword("trailer") {
277 break;
278 }
279 let start = cursor.parse_u32()?;
280 cursor.skip_ws_and_comments();
281 let count = cursor.parse_u32()?;
282 cursor.skip_line_breaks();
283 for index in 0..count {
284 let line = cursor.read_line()?;
285 if line.len() < 17 {
286 return Err(PdfError::Parse("invalid xref entry".to_string()));
287 }
288 let parts = String::from_utf8_lossy(line).trim().to_string();
289 let mut fields = parts.split_whitespace();
290 let entry_offset = fields
291 .next()
292 .ok_or_else(|| PdfError::Parse("invalid xref entry offset".to_string()))?
293 .parse::<usize>()
294 .map_err(|_| PdfError::Parse("invalid xref entry offset".to_string()))?;
295 let generation = fields
296 .next()
297 .ok_or_else(|| PdfError::Parse("invalid xref generation".to_string()))?
298 .parse::<u16>()
299 .map_err(|_| PdfError::Parse("invalid xref generation".to_string()))?;
300 let flag = fields
301 .next()
302 .ok_or_else(|| PdfError::Parse("invalid xref flag".to_string()))?;
303 let object_number = start
304 .checked_add(index)
305 .ok_or_else(|| PdfError::Parse("xref object number overflow".to_string()))?;
306 let entry = if flag == "n" {
307 XrefEntry::Uncompressed {
308 offset: entry_offset,
309 generation,
310 }
311 } else {
312 XrefEntry::Free
313 };
314 entries.insert(ObjectRef::new(object_number, generation), entry);
315 }
316 }
317 cursor.expect_keyword("trailer")?;
318 let trailer = match cursor.parse_value()? {
319 PdfValue::Dictionary(dictionary) => dictionary,
320 _ => return Err(PdfError::Parse("trailer is not a dictionary".to_string())),
321 };
322 Ok(XrefSection { entries, trailer })
323}
324
325fn parse_xref_stream_section(bytes: &[u8], offset: usize) -> PdfResult<XrefSection> {
326 let object = parse_indirect_object(bytes, offset)?;
327 let stream = match object {
328 PdfObject::Stream(stream) => stream,
329 PdfObject::Value(_) => {
330 return Err(PdfError::Parse(
331 "expected xref stream object at startxref offset".to_string(),
332 ));
333 }
334 };
335 if stream.dict.get("Type").and_then(PdfValue::as_name) != Some("XRef") {
336 return Err(PdfError::Parse(
337 "xref stream object has wrong Type".to_string(),
338 ));
339 }
340
341 let size = stream
342 .dict
343 .get("Size")
344 .and_then(PdfValue::as_integer)
345 .ok_or_else(|| PdfError::Corrupt("xref stream missing Size".to_string()))?
346 as u32;
347
348 let w = stream
349 .dict
350 .get("W")
351 .and_then(PdfValue::as_array)
352 .ok_or_else(|| PdfError::Corrupt("xref stream missing W".to_string()))?;
353 if w.len() != 3 {
354 return Err(PdfError::Corrupt(
355 "xref stream W must have three entries".to_string(),
356 ));
357 }
358 let w0 = w[0]
359 .as_integer()
360 .ok_or_else(|| PdfError::Corrupt("invalid W[0]".to_string()))? as usize;
361 let w1 = w[1]
362 .as_integer()
363 .ok_or_else(|| PdfError::Corrupt("invalid W[1]".to_string()))? as usize;
364 let w2 = w[2]
365 .as_integer()
366 .ok_or_else(|| PdfError::Corrupt("invalid W[2]".to_string()))? as usize;
367 let row_len = w0 + w1 + w2;
368 if row_len == 0 {
369 return Err(PdfError::Corrupt(
370 "xref stream row width is zero".to_string(),
371 ));
372 }
373
374 let index: Vec<(u32, u32)> = match stream.dict.get("Index") {
375 Some(PdfValue::Array(entries)) => {
376 if entries.len() % 2 != 0 {
377 return Err(PdfError::Corrupt(
378 "xref stream Index must have an even number of entries".to_string(),
379 ));
380 }
381 let mut pairs = Vec::with_capacity(entries.len() / 2);
382 for chunk in entries.chunks(2) {
383 let first = chunk[0]
384 .as_integer()
385 .ok_or_else(|| PdfError::Corrupt("invalid Index entry".to_string()))?
386 as u32;
387 let count = chunk[1]
388 .as_integer()
389 .ok_or_else(|| PdfError::Corrupt("invalid Index entry".to_string()))?
390 as u32;
391 pairs.push((first, count));
392 }
393 pairs
394 }
395 Some(_) => {
396 return Err(PdfError::Corrupt(
397 "xref stream Index is not an array".to_string(),
398 ));
399 }
400 None => vec![(0, size)],
401 };
402
403 let decoded = decode_stream(&stream)?;
404 let expected_rows: u32 = index.iter().map(|(_, count)| *count).sum();
405 if decoded.len() < expected_rows as usize * row_len {
406 return Err(PdfError::Corrupt(
407 "xref stream body is shorter than declared entries".to_string(),
408 ));
409 }
410
411 let mut entries: BTreeMap<ObjectRef, XrefEntry> = BTreeMap::new();
412 let mut cursor = 0usize;
413 for (first, count) in index {
414 for i in 0..count {
415 let row = &decoded[cursor..cursor + row_len];
416 cursor += row_len;
417 let field_type = if w0 == 0 { 1u64 } else { read_be(&row[..w0])? };
418 let f2 = read_be(&row[w0..w0 + w1])?;
419 let f3 = read_be(&row[w0 + w1..])?;
420 let object_number = first + i;
421 let entry = match field_type {
422 0 => XrefEntry::Free,
423 1 => XrefEntry::Uncompressed {
424 offset: f2 as usize,
425 generation: f3 as u16,
426 },
427 2 => XrefEntry::Compressed {
428 stream_object_number: f2 as u32,
429 index: f3 as u32,
430 },
431 other => {
432 return Err(PdfError::Unsupported(format!(
433 "xref stream entry type {other} is not supported"
434 )));
435 }
436 };
437 let generation = match entry {
438 XrefEntry::Uncompressed { generation, .. } => generation,
439 _ => 0,
440 };
441 entries.insert(ObjectRef::new(object_number, generation), entry);
442 }
443 }
444
445 Ok(XrefSection {
446 entries,
447 trailer: stream.dict,
448 })
449}
450
451fn read_be(bytes: &[u8]) -> PdfResult<u64> {
452 if bytes.len() > 8 {
453 return Err(PdfError::Corrupt(
454 "xref stream field width exceeds 8 bytes".to_string(),
455 ));
456 }
457 let mut value: u64 = 0;
458 for byte in bytes {
459 value = (value << 8) | *byte as u64;
460 }
461 Ok(value)
462}
463
464fn materialize_object_streams(
465 objects: &mut BTreeMap<ObjectRef, PdfObject>,
466 max_object_number: &mut u32,
467 compressed: &[(ObjectRef, u32, u32)],
468) -> PdfResult<()> {
469 if compressed.is_empty() {
470 return Ok(());
471 }
472
473 let mut by_stream: BTreeMap<u32, Vec<(ObjectRef, u32)>> = BTreeMap::new();
474 for (object_ref, stream_obj_num, index) in compressed {
475 by_stream
476 .entry(*stream_obj_num)
477 .or_default()
478 .push((*object_ref, *index));
479 }
480
481 for (stream_obj_num, mut members) in by_stream {
482 let stream_ref = ObjectRef::new(stream_obj_num, 0);
483 let stream = match objects.get(&stream_ref) {
484 Some(PdfObject::Stream(stream)) => stream.clone(),
485 Some(PdfObject::Value(_)) => {
486 return Err(PdfError::Corrupt(format!(
487 "object stream {stream_obj_num} is not a stream"
488 )));
489 }
490 None => {
491 return Err(PdfError::Corrupt(format!(
492 "compressed entry references missing object stream {stream_obj_num}"
493 )));
494 }
495 };
496 if stream.dict.get("Type").and_then(PdfValue::as_name) != Some("ObjStm") {
497 return Err(PdfError::Corrupt(format!(
498 "object {stream_obj_num} is not marked as ObjStm"
499 )));
500 }
501 let n = stream
502 .dict
503 .get("N")
504 .and_then(PdfValue::as_integer)
505 .ok_or_else(|| PdfError::Corrupt("ObjStm missing N".to_string()))?
506 as usize;
507 let first = stream
508 .dict
509 .get("First")
510 .and_then(PdfValue::as_integer)
511 .ok_or_else(|| PdfError::Corrupt("ObjStm missing First".to_string()))?
512 as usize;
513
514 let decoded = decode_stream(&stream)?;
515 if first > decoded.len() {
516 return Err(PdfError::Corrupt(
517 "ObjStm First offset is past end of decoded data".to_string(),
518 ));
519 }
520
521 let header = &decoded[..first];
522 let mut header_cursor = Cursor::new(header, 0);
523 let mut entries: Vec<(u32, usize)> = Vec::with_capacity(n);
524 for _ in 0..n {
525 header_cursor.skip_ws_and_comments();
526 let obj_num = header_cursor.parse_u32()?;
527 header_cursor.skip_ws_and_comments();
528 let rel_offset = header_cursor.parse_usize()?;
529 entries.push((obj_num, rel_offset));
530 }
531
532 members.sort_by_key(|(_, index)| *index);
534 for (member_ref, index) in members {
535 let idx = index as usize;
536 if idx >= entries.len() {
537 return Err(PdfError::Corrupt(format!(
538 "ObjStm {stream_obj_num} has no index {idx}"
539 )));
540 }
541 let (declared_number, rel_offset) = entries[idx];
542 if declared_number != member_ref.object_number {
543 return Err(PdfError::Corrupt(format!(
544 "ObjStm {stream_obj_num} index {idx} has number {declared_number} but xref expected {}",
545 member_ref.object_number
546 )));
547 }
548 let absolute_offset = first
549 .checked_add(rel_offset)
550 .ok_or_else(|| PdfError::Corrupt("ObjStm offset overflow".to_string()))?;
551 if absolute_offset > decoded.len() {
552 return Err(PdfError::Corrupt(
553 "ObjStm member offset is past end of decoded data".to_string(),
554 ));
555 }
556 let mut value_cursor = Cursor::new(&decoded, absolute_offset);
557 let value = value_cursor.parse_value()?;
558 if let PdfValue::Dictionary(dict) = &value {
559 if dict.get("Type").and_then(PdfValue::as_name) == Some("ObjStm") {
560 return Err(PdfError::Unsupported(
561 "nested object streams are not supported".to_string(),
562 ));
563 }
564 }
565 *max_object_number = (*max_object_number).max(member_ref.object_number);
566 objects.insert(member_ref, PdfObject::Value(value));
567 }
568 }
569
570 Ok(())
571}
572
573fn parse_indirect_object(bytes: &[u8], offset: usize) -> PdfResult<PdfObject> {
574 let mut cursor = Cursor::new(bytes, offset);
575 let _object_number = cursor.parse_u32()?;
576 cursor.skip_ws_and_comments();
577 let _generation = cursor.parse_u16()?;
578 cursor.skip_ws_and_comments();
579 cursor.expect_keyword("obj")?;
580 cursor.skip_ws_and_comments();
581
582 let value = cursor.parse_value()?;
583 cursor.skip_ws_and_comments();
584 if matches!(value, PdfValue::Dictionary(_)) && cursor.peek_keyword("stream") {
585 let dict = match value {
586 PdfValue::Dictionary(dict) => dict,
587 _ => unreachable!(),
588 };
589 cursor.expect_keyword("stream")?;
590 cursor.consume_stream_line_break();
591 let stream_start = cursor.position;
592 let length_hint = dict
598 .get("Length")
599 .and_then(PdfValue::as_integer)
600 .filter(|&len| len >= 0)
601 .map(|len| len as usize);
602 let (data, endstream_pos) = match length_hint {
603 Some(len) if stream_start + len <= bytes.len() => {
604 let mut check = stream_start + len;
607 while check < bytes.len() && matches!(bytes[check], b'\r' | b'\n') {
608 check += 1;
609 }
610 if bytes.get(check..check + 9) == Some(b"endstream") {
611 (bytes[stream_start..stream_start + len].to_vec(), check)
612 } else {
613 let pos = find_keyword(bytes, stream_start, b"endstream")
615 .ok_or_else(|| PdfError::Parse("stream missing endstream".to_string()))?;
616 (bytes[stream_start..pos].to_vec(), pos)
617 }
618 }
619 _ => {
620 let pos = find_keyword(bytes, stream_start, b"endstream")
621 .ok_or_else(|| PdfError::Parse("stream missing endstream".to_string()))?;
622 (bytes[stream_start..pos].to_vec(), pos)
623 }
624 };
625 cursor.position = endstream_pos;
626 cursor.expect_keyword("endstream")?;
627 cursor.skip_ws_and_comments();
628 cursor.expect_keyword("endobj")?;
629 Ok(PdfObject::Stream(PdfStream { dict, data }))
630 } else {
631 cursor.expect_keyword("endobj")?;
632 Ok(PdfObject::Value(value))
633 }
634}
635
/// Returns the absolute index of the first occurrence of `keyword` in `bytes`
/// at or after `start`.
///
/// Returns `None` when the keyword does not occur, when `start` lies beyond
/// the end of `bytes`, or when `keyword` is empty. The function never panics.
fn find_keyword(bytes: &[u8], start: usize, keyword: &[u8]) -> Option<usize> {
    // `windows(0)` panics, so an empty keyword is treated as "no match".
    if keyword.is_empty() {
        return None;
    }
    bytes
        .get(start..)?
        .windows(keyword.len())
        .position(|window| window == keyword)
        .map(|relative| start + relative)
}
642
/// A read position inside a borrowed byte buffer, used by the tokenizer and
/// the object parsers.
struct Cursor<'a> {
    /// Index of the next byte to read; may be at or past the end of `bytes`.
    position: usize,
    /// The buffer being parsed.
    bytes: &'a [u8],
}
647
648impl<'a> Cursor<'a> {
649 fn new(bytes: &'a [u8], position: usize) -> Self {
650 Self { bytes, position }
651 }
652
653 fn eof(&self) -> bool {
654 self.position >= self.bytes.len()
655 }
656
657 fn current(&self) -> Option<u8> {
658 self.bytes.get(self.position).copied()
659 }
660
661 fn skip_ws_and_comments(&mut self) {
662 while let Some(byte) = self.current() {
663 match byte {
664 b' ' | b'\t' | b'\n' | b'\r' | 0x0C | 0x00 => self.position += 1,
665 b'%' => {
666 while let Some(next) = self.current() {
667 self.position += 1;
668 if next == b'\n' || next == b'\r' {
669 break;
670 }
671 }
672 }
673 _ => break,
674 }
675 }
676 }
677
678 fn skip_line_breaks(&mut self) {
679 while matches!(self.current(), Some(b'\n' | b'\r')) {
680 self.position += 1;
681 }
682 }
683
684 fn read_line(&mut self) -> PdfResult<&'a [u8]> {
685 if self.eof() {
686 return Err(PdfError::Parse("unexpected end of file".to_string()));
687 }
688 let start = self.position;
689 while let Some(byte) = self.current() {
690 if byte == b'\n' || byte == b'\r' {
691 let end = self.position;
692 self.skip_line_breaks();
693 return Ok(&self.bytes[start..end]);
694 }
695 self.position += 1;
696 }
697 Ok(&self.bytes[start..self.position])
698 }
699
700 fn peek_keyword(&self, keyword: &str) -> bool {
701 self.bytes
702 .get(self.position..self.position + keyword.len())
703 .map(|slice| slice == keyword.as_bytes())
704 .unwrap_or(false)
705 }
706
707 fn expect_keyword(&mut self, keyword: &str) -> PdfResult<()> {
708 self.skip_ws_and_comments();
709 if self.peek_keyword(keyword) {
710 self.position += keyword.len();
711 Ok(())
712 } else {
713 Err(PdfError::Parse(format!("expected keyword {keyword}")))
714 }
715 }
716
717 fn consume_stream_line_break(&mut self) {
718 if self.current() == Some(b'\r') {
719 self.position += 1;
720 }
721 if self.current() == Some(b'\n') {
722 self.position += 1;
723 }
724 }
725
726 fn parse_u32(&mut self) -> PdfResult<u32> {
727 let token = self.parse_token()?;
728 token
729 .parse::<u32>()
730 .map_err(|_| PdfError::Parse(format!("invalid integer token: {token}")))
731 }
732
733 fn parse_u16(&mut self) -> PdfResult<u16> {
734 let token = self.parse_token()?;
735 token
736 .parse::<u16>()
737 .map_err(|_| PdfError::Parse(format!("invalid integer token: {token}")))
738 }
739
740 fn parse_usize(&mut self) -> PdfResult<usize> {
741 let token = self.parse_token()?;
742 token
743 .parse::<usize>()
744 .map_err(|_| PdfError::Parse(format!("invalid offset token: {token}")))
745 }
746
747 fn parse_token(&mut self) -> PdfResult<String> {
748 self.skip_ws_and_comments();
749 let start = self.position;
750 while let Some(byte) = self.current() {
751 if is_delimiter(byte) || is_whitespace(byte) {
752 break;
753 }
754 self.position += 1;
755 }
756 if self.position == start {
757 return Err(PdfError::Parse("expected token".to_string()));
758 }
759 Ok(String::from_utf8_lossy(&self.bytes[start..self.position]).to_string())
760 }
761
762 fn parse_value(&mut self) -> PdfResult<PdfValue> {
763 self.skip_ws_and_comments();
764 match self.current() {
765 Some(b'/') => self.parse_name(),
766 Some(b'(') => self.parse_literal_string(),
767 Some(b'[') => self.parse_array(),
768 Some(b'<') if self.bytes.get(self.position + 1) == Some(&b'<') => {
769 self.parse_dictionary()
770 }
771 Some(b'<') => self.parse_hex_string(),
772 Some(b't') if self.peek_keyword("true") => {
773 self.position += 4;
774 Ok(PdfValue::Bool(true))
775 }
776 Some(b'f') if self.peek_keyword("false") => {
777 self.position += 5;
778 Ok(PdfValue::Bool(false))
779 }
780 Some(b'n') if self.peek_keyword("null") => {
781 self.position += 4;
782 Ok(PdfValue::Null)
783 }
784 Some(_) => self.parse_number_or_reference(),
785 None => Err(PdfError::Parse("unexpected end of file".to_string())),
786 }
787 }
788
789 fn parse_name(&mut self) -> PdfResult<PdfValue> {
790 self.position += 1;
791 let mut raw = Vec::new();
792 while let Some(byte) = self.current() {
793 if is_delimiter(byte) || is_whitespace(byte) {
794 break;
795 }
796 if byte == b'#' {
797 let high =
798 self.bytes.get(self.position + 1).copied().ok_or_else(|| {
799 PdfError::Parse("truncated #XX escape in name".to_string())
800 })?;
801 let low =
802 self.bytes.get(self.position + 2).copied().ok_or_else(|| {
803 PdfError::Parse("truncated #XX escape in name".to_string())
804 })?;
805 let decoded = u8::from_str_radix(&format!("{}{}", high as char, low as char), 16)
806 .map_err(|_| {
807 PdfError::Parse("invalid #XX hex escape in name".to_string())
808 })?;
809 raw.push(decoded);
810 self.position += 3;
811 } else {
812 raw.push(byte);
813 self.position += 1;
814 }
815 }
816 Ok(PdfValue::Name(String::from_utf8_lossy(&raw).to_string()))
817 }
818
819 fn parse_literal_string(&mut self) -> PdfResult<PdfValue> {
820 self.position += 1;
821 let mut output = Vec::new();
822 let mut depth = 1usize;
823 while let Some(byte) = self.current() {
824 self.position += 1;
825 match byte {
826 b'\\' => {
827 let escaped = self
828 .current()
829 .ok_or_else(|| PdfError::Parse("unterminated string escape".to_string()))?;
830 self.position += 1;
831 match escaped {
832 b'n' => output.push(b'\n'),
833 b'r' => output.push(b'\r'),
834 b't' => output.push(b'\t'),
835 b'b' => output.push(0x08),
836 b'f' => output.push(0x0C),
837 b'(' | b')' | b'\\' => output.push(escaped),
838 b'\n' => {}
839 b'\r' => {
840 if self.current() == Some(b'\n') {
841 self.position += 1;
842 }
843 }
844 b'0'..=b'7' => {
845 let mut octal = vec![escaped];
846 for _ in 0..2 {
847 match self.current() {
848 Some(next @ b'0'..=b'7') => {
849 octal.push(next);
850 self.position += 1;
851 }
852 _ => break,
853 }
854 }
855 let value =
857 u16::from_str_radix(std::str::from_utf8(&octal).unwrap_or("0"), 8)
858 .unwrap_or(0);
859 output.push((value % 256) as u8);
860 }
861 other => output.push(other),
862 }
863 }
864 b'(' => {
865 depth += 1;
866 output.push(byte);
867 }
868 b')' => {
869 depth -= 1;
870 if depth == 0 {
871 return Ok(PdfValue::String(PdfString(output)));
872 }
873 output.push(byte);
874 }
875 _ => output.push(byte),
876 }
877 }
878 Err(PdfError::Parse("unterminated literal string".to_string()))
879 }
880
881 fn parse_hex_string(&mut self) -> PdfResult<PdfValue> {
882 self.position += 1;
883 let start = self.position;
884 while self.current() != Some(b'>') {
885 if self.eof() {
886 return Err(PdfError::Parse("unterminated hex string".to_string()));
887 }
888 self.position += 1;
889 }
890 let raw = String::from_utf8_lossy(&self.bytes[start..self.position])
891 .chars()
892 .filter(|character| !character.is_whitespace())
893 .collect::<String>();
894 self.position += 1;
895 let mut chars = raw.chars().collect::<Vec<_>>();
896 if chars.len() % 2 != 0 {
897 chars.push('0');
898 }
899 let mut bytes = Vec::with_capacity(chars.len() / 2);
900 for pair in chars.chunks(2) {
901 let value = u8::from_str_radix(&pair.iter().collect::<String>(), 16)
902 .map_err(|_| PdfError::Parse("invalid hex string".to_string()))?;
903 bytes.push(value);
904 }
905 Ok(PdfValue::String(PdfString(bytes)))
906 }
907
908 fn parse_array(&mut self) -> PdfResult<PdfValue> {
909 self.position += 1;
910 let mut values = Vec::new();
911 loop {
912 self.skip_ws_and_comments();
913 match self.current() {
914 Some(b']') => {
915 self.position += 1;
916 break;
917 }
918 Some(_) => values.push(self.parse_value()?),
919 None => return Err(PdfError::Parse("unterminated array".to_string())),
920 }
921 }
922 Ok(PdfValue::Array(values))
923 }
924
925 fn parse_dictionary(&mut self) -> PdfResult<PdfValue> {
926 self.position += 2;
927 let mut dictionary = PdfDictionary::new();
928 loop {
929 self.skip_ws_and_comments();
930 if self.current() == Some(b'>') && self.bytes.get(self.position + 1) == Some(&b'>') {
931 self.position += 2;
932 break;
933 }
934 let key = match self.parse_name()? {
935 PdfValue::Name(name) => name,
936 _ => unreachable!(),
937 };
938 let value = self.parse_value()?;
939 dictionary.insert(key, value);
940 }
941 Ok(PdfValue::Dictionary(dictionary))
942 }
943
944 fn parse_number_or_reference(&mut self) -> PdfResult<PdfValue> {
945 let first_token = self.parse_token()?;
946 if first_token.contains('.') || first_token.contains(['e', 'E']) {
947 return first_token
948 .parse::<f64>()
949 .map(PdfValue::Number)
950 .map_err(|_| PdfError::Parse(format!("invalid number token: {first_token}")));
951 }
952
953 let checkpoint = self.position;
954 self.skip_ws_and_comments();
955 if let Ok(second_token) = self.parse_token() {
956 self.skip_ws_and_comments();
957 if self.current() == Some(b'R')
958 && second_token
959 .chars()
960 .all(|character| character.is_ascii_digit())
961 {
962 self.position += 1;
963 return Ok(PdfValue::Reference(ObjectRef::new(
964 first_token
965 .parse::<u32>()
966 .map_err(|_| PdfError::Parse("invalid reference object".to_string()))?,
967 second_token
968 .parse::<u16>()
969 .map_err(|_| PdfError::Parse("invalid reference generation".to_string()))?,
970 )));
971 }
972 }
973 self.position = checkpoint;
974 first_token
975 .parse::<i64>()
976 .map(PdfValue::Integer)
977 .or_else(|_| first_token.parse::<f64>().map(PdfValue::Number))
978 .map_err(|_| PdfError::Parse(format!("invalid number token: {first_token}")))
979 }
980}
981
/// Returns `true` for the six bytes treated as whitespace by the tokenizer:
/// NUL, tab, line feed, form feed, carriage return and space.
fn is_whitespace(byte: u8) -> bool {
    const WHITESPACE_BYTES: [u8; 6] = [0x00, b'\t', b'\n', 0x0C, b'\r', b' '];
    WHITESPACE_BYTES.contains(&byte)
}
985
/// Returns `true` for the bytes that terminate a token without being
/// whitespace: `(`, `)`, `<`, `>`, `[`, `]`, `{`, `}`, `/` and `%`.
fn is_delimiter(byte: u8) -> bool {
    const DELIMITER_BYTES: &[u8] = b"()<>[]{}/%";
    DELIMITER_BYTES.contains(&byte)
}
992
993#[cfg(test)]
994mod tests {
995 use super::{parse_pdf, parse_pdf_with_password};
996 use crate::error::PdfError;
997 use crate::types::PdfObject;
998
999 #[test]
1000 fn parses_simple_pdf_fixture() {
1001 let bytes = include_bytes!("../../../tests/fixtures/simple-text.pdf");
1002 let document = parse_pdf(bytes).expect("fixture should parse");
1003 assert_eq!(document.pages.len(), 1);
1004 }
1005
1006 #[test]
1007 fn parses_incremental_update_fixture() {
1008 let bytes = include_bytes!("../../../tests/fixtures/incremental-update.pdf");
1009 let document = parse_pdf(bytes).expect("incremental fixture should parse");
1010 assert_eq!(document.pages.len(), 1);
1011
1012 let content_refs = &document.pages[0].content_refs;
1015 assert!(!content_refs.is_empty());
1016 let content_obj = document.file.objects.get(&content_refs[0]).unwrap();
1017 let stream_data = match content_obj {
1018 PdfObject::Stream(stream) => String::from_utf8_lossy(&stream.data),
1019 _ => panic!("expected stream object for page content"),
1020 };
1021 assert!(
1022 stream_data.contains("Updated Secret"),
1023 "content stream should contain updated text"
1024 );
1025 assert!(
1026 !stream_data.contains("Original Secret"),
1027 "content stream should not contain original text"
1028 );
1029 }
1030
1031 #[test]
1032 fn circular_prev_chain_does_not_loop() {
1033 let mut pdf = Vec::new();
1037 pdf.extend_from_slice(b"%PDF-1.4\n");
1038
1039 let obj1_offset = pdf.len();
1041 pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1042
1043 let obj2_offset = pdf.len();
1045 pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 0 /Kids [] >>\nendobj\n");
1046
1047 let xref_offset = pdf.len();
1048 pdf.extend_from_slice(b"xref\n0 3\n");
1049 pdf.extend_from_slice(b"0000000000 65535 f \n");
1050 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj1_offset).as_bytes());
1051 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj2_offset).as_bytes());
1052 pdf.extend_from_slice(b"trailer\n");
1053 pdf.extend_from_slice(
1055 format!("<< /Size 3 /Root 1 0 R /Prev {} >>\n", xref_offset).as_bytes(),
1056 );
1057 pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_offset).as_bytes());
1058
1059 let document = parse_pdf(&pdf).expect("circular Prev should be tolerated");
1060 assert_eq!(document.pages.len(), 0);
1061 }
1062
/// Verifies that a file whose cross-reference data lives in an unfiltered
/// `/Type /XRef` stream (object 3) parses and yields an empty page tree.
///
/// With `/W [1 2 1]` every entry is four bytes: a one-byte type, a two-byte
/// big-endian offset and a one-byte generation.
#[test]
fn parses_uncompressed_xref_stream() {
    let mut pdf: Vec<u8> = Vec::new();
    pdf.extend_from_slice(b"%PDF-1.5\n");

    let obj1_offset = pdf.len();
    pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
    let obj2_offset = pdf.len();
    pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 0 /Kids [] >>\nendobj\n");

    // Nothing else is written before object 3, so the xref stream's own
    // offset is known up front and its self-referencing row can be emitted
    // directly instead of being patched into the body afterwards.
    let xref_obj_offset = pdf.len();

    let row_for = |kind: u8, offset: usize, generation: u8| -> [u8; 4] {
        // Fail loudly rather than silently truncating if the fixture ever
        // outgrows the two-byte offset column.
        assert!(
            offset <= usize::from(u16::MAX),
            "offset {offset} does not fit the 2-byte /W field"
        );
        // Guarded by the assertion above, so this cast cannot truncate.
        let [hi, lo] = (offset as u16).to_be_bytes();
        [kind, hi, lo, generation]
    };

    // One row per object number 0..=3: the free head entry, the two plain
    // objects, and the xref stream itself.
    let body: Vec<u8> = [
        row_for(0, 0, 0xFF),
        row_for(1, obj1_offset, 0),
        row_for(1, obj2_offset, 0),
        row_for(1, xref_obj_offset, 0),
    ]
    .concat();

    let stream_dict = format!(
        "<< /Type /XRef /Size 4 /W [1 2 1] /Root 1 0 R /Length {} >>",
        body.len()
    );
    pdf.extend_from_slice(format!("3 0 obj\n{stream_dict}\nstream\n").as_bytes());
    pdf.extend_from_slice(&body);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");
    pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_obj_offset).as_bytes());

    let document = parse_pdf(&pdf).expect("xref stream fixture should parse");
    assert_eq!(document.pages.len(), 0);
    assert!(document.file.objects.len() >= 2);
}
1113
/// Verifies that an object stored inside a Flate-compressed `/Type /ObjStm`
/// (object 3) is reachable when the xref stream (object 4) lists it as a
/// type-2 entry.
///
/// The catalog (object 1) points at `2 0 R`; object 2 only exists as the
/// single member of the object stream, so resolving the page tree root
/// exercises the object-stream path.
#[test]
fn parses_object_stream_via_xref_stream() {
    use flate2::{Compression, write::ZlibEncoder};
    use std::io::Write;

    let mut pdf: Vec<u8> = Vec::new();
    pdf.extend_from_slice(b"%PDF-1.5\n");

    let obj1_offset = pdf.len();
    pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

    // Object-stream payload: the "<object number> <offset>" header followed
    // by the member itself; `/First` is the length of that header.
    let member_payload = b"<< /Type /Pages /Count 0 /Kids [] >>";
    let header = b"2 0 ";
    let first = header.len();
    let decompressed = [header.as_slice(), member_payload.as_slice()].concat();

    let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
    encoder
        .write_all(&decompressed)
        .expect("zlib write into a Vec should succeed");
    let compressed = encoder
        .finish()
        .expect("zlib stream should finish cleanly");

    let obj3_offset = pdf.len();
    let objstm_dict = format!(
        "<< /Type /ObjStm /N 1 /First {} /Filter /FlateDecode /Length {} >>",
        first,
        compressed.len()
    );
    pdf.extend_from_slice(format!("3 0 obj\n{objstm_dict}\nstream\n").as_bytes());
    pdf.extend_from_slice(&compressed);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");

    // `/W [1 3 1]`: one type byte, a three-byte big-endian second field and a
    // one-byte third field.
    let row_for = |kind: u8, field2: usize, field3: u8| -> [u8; 5] {
        // Fail loudly rather than silently dropping high bits if the fixture
        // ever outgrows the three-byte column.
        assert!(
            field2 <= 0x00FF_FFFF,
            "xref field {field2} does not fit the 3-byte /W column"
        );
        // Guarded by the assertion above, so this cast cannot truncate.
        let [_, hi, mid, lo] = (field2 as u32).to_be_bytes();
        [kind, hi, mid, lo, field3]
    };

    let obj4_offset = pdf.len();
    // Rows for object numbers 0..=4. Row 2 is the type-2 entry: it lives in
    // object stream 3 at index 0.
    let body: Vec<u8> = [
        row_for(0, 0, 0xFF),
        row_for(1, obj1_offset, 0),
        row_for(2, 3, 0),
        row_for(1, obj3_offset, 0),
        row_for(1, obj4_offset, 0),
    ]
    .concat();

    let stream_dict = format!(
        "<< /Type /XRef /Size 5 /W [1 3 1] /Root 1 0 R /Length {} >>",
        body.len()
    );
    pdf.extend_from_slice(format!("4 0 obj\n{stream_dict}\nstream\n").as_bytes());
    pdf.extend_from_slice(&body);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");
    pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", obj4_offset).as_bytes());

    let document = parse_pdf(&pdf).expect("ObjStm fixture should parse");
    assert_eq!(document.pages.len(), 0);
    let pages_ref = document.catalog.pages_ref;
    let pages_dict = document
        .file
        .get_dictionary(pages_ref)
        .expect("page tree root should resolve to a dictionary");
    assert_eq!(
        pages_dict.get("Type").and_then(|v| v.as_name()),
        Some("Pages")
    );
}
1193
/// Verifies that an object stream whose member is itself declared as
/// `/Type /ObjStm` is refused with `PdfError::Unsupported` and a message
/// mentioning "nested object streams".
///
/// The file layout mirrors the regular object-stream fixture: catalog
/// (object 1), compressed object stream (object 3) holding object 2, and an
/// xref stream (object 4).
#[test]
fn rejects_nested_object_stream() {
    use flate2::{Compression, write::ZlibEncoder};
    use std::io::Write;

    let mut pdf: Vec<u8> = Vec::new();
    pdf.extend_from_slice(b"%PDF-1.5\n");

    let obj1_offset = pdf.len();
    pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

    // The single member is a dictionary that claims to be an object stream.
    let member_payload = b"<< /Type /ObjStm /N 0 /First 0 /Length 0 >>";
    let header = b"2 0 ";
    let first = header.len();
    let decompressed = [header.as_slice(), member_payload.as_slice()].concat();

    let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
    encoder
        .write_all(&decompressed)
        .expect("zlib write into a Vec should succeed");
    let compressed = encoder
        .finish()
        .expect("zlib stream should finish cleanly");

    let obj3_offset = pdf.len();
    let objstm_dict = format!(
        "<< /Type /ObjStm /N 1 /First {} /Filter /FlateDecode /Length {} >>",
        first,
        compressed.len()
    );
    pdf.extend_from_slice(format!("3 0 obj\n{objstm_dict}\nstream\n").as_bytes());
    pdf.extend_from_slice(&compressed);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");

    // `/W [1 3 1]`: one type byte, a three-byte big-endian second field and a
    // one-byte third field.
    let row_for = |kind: u8, field2: usize, field3: u8| -> [u8; 5] {
        // Fail loudly rather than silently dropping high bits if the fixture
        // ever outgrows the three-byte column.
        assert!(
            field2 <= 0x00FF_FFFF,
            "xref field {field2} does not fit the 3-byte /W column"
        );
        // Guarded by the assertion above, so this cast cannot truncate.
        let [_, hi, mid, lo] = (field2 as u32).to_be_bytes();
        [kind, hi, mid, lo, field3]
    };

    let obj4_offset = pdf.len();
    // Rows for object numbers 0..=4. Row 2 is the type-2 entry pointing into
    // object stream 3 at index 0.
    let body: Vec<u8> = [
        row_for(0, 0, 0xFF),
        row_for(1, obj1_offset, 0),
        row_for(2, 3, 0),
        row_for(1, obj3_offset, 0),
        row_for(1, obj4_offset, 0),
    ]
    .concat();

    let stream_dict = format!(
        "<< /Type /XRef /Size 5 /W [1 3 1] /Root 1 0 R /Length {} >>",
        body.len()
    );
    pdf.extend_from_slice(format!("4 0 obj\n{stream_dict}\nstream\n").as_bytes());
    pdf.extend_from_slice(&body);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");
    pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", obj4_offset).as_bytes());

    match parse_pdf(&pdf) {
        Err(PdfError::Unsupported(message)) => {
            assert!(message.contains("nested object streams"), "got: {message}")
        }
        other => panic!("expected Unsupported, got: {other:?}"),
    }
}
1261
1262 fn build_rc4_encrypted_pdf(
1268 user_password: &[u8],
1269 owner_password: &[u8],
1270 ) -> (Vec<u8>, &'static [u8]) {
1271 use crate::crypto::SecurityRevision;
1272 use crate::crypto::test_helpers::{
1273 compute_file_key, compute_o, compute_u_r3, object_key, rc4,
1274 };
1275
1276 let id_first: [u8; 16] = [
1277 0x6e, 0x05, 0xb1, 0x20, 0x63, 0x94, 0x69, 0x1f, 0x22, 0x2c, 0x32, 0xac, 0x61, 0x8b,
1278 0xe6, 0x8d,
1279 ];
1280 let permissions: i32 = -4;
1281 let key_length_bytes = 16;
1282
1283 let owner_entry = compute_o(
1284 owner_password,
1285 user_password,
1286 SecurityRevision::R3,
1287 key_length_bytes,
1288 );
1289 let file_key = compute_file_key(
1290 user_password,
1291 &owner_entry,
1292 permissions,
1293 &id_first,
1294 key_length_bytes,
1295 );
1296 let u_entry = compute_u_r3(&file_key, &id_first);
1297
1298 let escape_literal = |bytes: &[u8]| -> Vec<u8> {
1299 let mut out = Vec::with_capacity(bytes.len() + 2);
1300 out.push(b'(');
1301 for &byte in bytes {
1302 match byte {
1303 b'(' | b')' | b'\\' => {
1304 out.push(b'\\');
1305 out.push(byte);
1306 }
1307 _ => out.push(byte),
1308 }
1309 }
1310 out.push(b')');
1311 out
1312 };
1313
1314 let content_plain: &'static [u8] = b"BT\n/F1 24 Tf\n72 700 Td\n(CIPHERED SECRET) Tj\nET\n";
1315 let content_cipher = rc4(&object_key(&file_key, 4, 0), content_plain);
1316
1317 let mut pdf: Vec<u8> = Vec::new();
1318 pdf.extend_from_slice(b"%PDF-1.4\n");
1319
1320 let catalog_offset = pdf.len();
1321 pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1322
1323 let pages_offset = pdf.len();
1324 pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 1 /Kids [3 0 R] >>\nendobj\n");
1325
1326 let page_offset = pdf.len();
1327 pdf.extend_from_slice(
1328 b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
1329 /Resources << /Font << /F1 5 0 R >> >> /Contents 4 0 R >>\nendobj\n",
1330 );
1331
1332 let content_offset = pdf.len();
1333 pdf.extend_from_slice(
1334 format!("4 0 obj\n<< /Length {} >>\nstream\n", content_cipher.len()).as_bytes(),
1335 );
1336 pdf.extend_from_slice(&content_cipher);
1337 pdf.extend_from_slice(b"\nendstream\nendobj\n");
1338
1339 let font_offset = pdf.len();
1340 pdf.extend_from_slice(
1341 b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica \
1342 /Encoding /WinAnsiEncoding >>\nendobj\n",
1343 );
1344
1345 let encrypt_offset = pdf.len();
1346 pdf.extend_from_slice(b"6 0 obj\n<< /Filter /Standard /V 2 /R 3 /Length 128 ");
1347 pdf.extend_from_slice(format!("/P {permissions} ").as_bytes());
1348 pdf.extend_from_slice(b"/O ");
1349 pdf.extend_from_slice(&escape_literal(&owner_entry));
1350 pdf.extend_from_slice(b" /U ");
1351 pdf.extend_from_slice(&escape_literal(&u_entry));
1352 pdf.extend_from_slice(b" >>\nendobj\n");
1353
1354 let xref_offset = pdf.len();
1355 pdf.extend_from_slice(b"xref\n0 7\n");
1356 pdf.extend_from_slice(b"0000000000 65535 f \n");
1357 for offset in [
1358 catalog_offset,
1359 pages_offset,
1360 page_offset,
1361 content_offset,
1362 font_offset,
1363 encrypt_offset,
1364 ] {
1365 pdf.extend_from_slice(format!("{offset:010} 00000 n \n").as_bytes());
1366 }
1367 pdf.extend_from_slice(b"trailer\n<< /Size 7 /Root 1 0 R /Encrypt 6 0 R /ID [");
1368 pdf.extend_from_slice(&escape_literal(&id_first));
1369 pdf.extend_from_slice(&escape_literal(&id_first));
1370 pdf.extend_from_slice(b"] >>\n");
1371 pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());
1372
1373 (pdf, content_plain)
1374 }
1375
1376 fn assert_decrypts_content_stream(document: &crate::document::ParsedDocument, expected: &[u8]) {
1377 assert_eq!(document.pages.len(), 1);
1378 assert!(
1379 !document.file.trailer.contains_key("Encrypt"),
1380 "trailer /Encrypt must be stripped once the document is decrypted in place"
1381 );
1382 let content_ref = document.pages[0].content_refs[0];
1383 let stream = match document.file.get_object(content_ref).unwrap() {
1384 PdfObject::Stream(stream) => stream,
1385 _ => panic!("page content must be a stream"),
1386 };
1387 assert_eq!(stream.data, expected);
1388 }
1389
/// An RC4 fixture built with an empty user password must open through the
/// password-less `parse_pdf` entry point.
#[test]
fn parses_rc4_encrypted_pdf_with_empty_password() {
    let (encrypted, expected_plaintext) =
        build_rc4_encrypted_pdf(b"", b"arbitrary-owner-password");
    let parsed = parse_pdf(&encrypted).expect("empty-password PDF should decrypt");
    assert_decrypts_content_stream(&parsed, expected_plaintext);
}
1400
/// Supplying the user password opens the RC4 fixture.
#[test]
fn parses_rc4_encrypted_pdf_with_user_password() {
    let (encrypted, expected_plaintext) = build_rc4_encrypted_pdf(b"userpw", b"ownerpw");
    let outcome = parse_pdf_with_password(&encrypted, b"userpw");
    let parsed = outcome.expect("correct user password should decrypt");
    assert_decrypts_content_stream(&parsed, expected_plaintext);
}
1408
/// Supplying the owner password (instead of the user password) also opens
/// the RC4 fixture.
#[test]
fn parses_rc4_encrypted_pdf_with_owner_password() {
    let (encrypted, expected_plaintext) = build_rc4_encrypted_pdf(b"userpw", b"ownerpw");
    let outcome = parse_pdf_with_password(&encrypted, b"ownerpw");
    let parsed = outcome.expect("correct owner password should decrypt");
    assert_decrypts_content_stream(&parsed, expected_plaintext);
}
1416
/// A password matching neither the user nor the owner password must surface
/// as `PdfError::InvalidPassword`.
#[test]
fn rejects_wrong_password_with_invalid_password_error() {
    let (encrypted, _) = build_rc4_encrypted_pdf(b"userpw", b"ownerpw");
    let outcome = parse_pdf_with_password(&encrypted, b"wrongpw");
    let error = outcome.expect_err("wrong password must not decrypt");
    assert_eq!(error, PdfError::InvalidPassword);
}
1424
/// A user password containing multi-byte UTF-8 characters round-trips when
/// the same bytes are used to build and to open the RC4 fixture.
#[test]
fn parses_rc4_encrypted_pdf_with_utf8_password() {
    let user_password = "pässwörd".as_bytes();
    let (encrypted, expected_plaintext) = build_rc4_encrypted_pdf(user_password, b"ownerpw");
    let outcome = parse_pdf_with_password(&encrypted, user_password);
    let parsed = outcome.expect("UTF-8 user password should decrypt");
    assert_decrypts_content_stream(&parsed, expected_plaintext);
}
1433
1434 fn build_aes_128_encrypted_pdf(
1439 user_password: &[u8],
1440 owner_password: &[u8],
1441 encrypt_metadata: bool,
1442 ) -> (Vec<u8>, &'static [u8]) {
1443 use crate::crypto::SecurityRevision;
1444 use crate::crypto::test_helpers::{
1445 aes_128_cbc_encrypt, compute_file_key_r4, compute_o, compute_u_r3, object_key_aes,
1446 };
1447
1448 let id_first: [u8; 16] = [
1449 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88,
1450 0x99, 0x00,
1451 ];
1452 let permissions: i32 = -4;
1453
1454 let owner_entry = compute_o(owner_password, user_password, SecurityRevision::R4, 16);
1455 let file_key = compute_file_key_r4(
1456 user_password,
1457 &owner_entry,
1458 permissions,
1459 &id_first,
1460 encrypt_metadata,
1461 );
1462 let u_entry = compute_u_r3(&file_key, &id_first);
1463
1464 let content_iv = [0x42u8; 16];
1468 let content_plain: &'static [u8] =
1469 b"BT\n/F1 24 Tf\n72 700 Td\n(AES SECRET REMOVED) Tj\nET\n";
1470 let content_key = object_key_aes(&file_key, 4, 0);
1471 let content_cipher = aes_128_cbc_encrypt(&content_key, &content_iv, content_plain);
1472
1473 let escape_literal = |bytes: &[u8]| -> Vec<u8> {
1474 let mut out = Vec::with_capacity(bytes.len() + 2);
1475 out.push(b'(');
1476 for &byte in bytes {
1477 match byte {
1478 b'(' | b')' | b'\\' => {
1479 out.push(b'\\');
1480 out.push(byte);
1481 }
1482 _ => out.push(byte),
1483 }
1484 }
1485 out.push(b')');
1486 out
1487 };
1488
1489 let mut pdf: Vec<u8> = Vec::new();
1490 pdf.extend_from_slice(b"%PDF-1.5\n");
1491
1492 let catalog_offset = pdf.len();
1493 pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1494
1495 let pages_offset = pdf.len();
1496 pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 1 /Kids [3 0 R] >>\nendobj\n");
1497
1498 let page_offset = pdf.len();
1499 pdf.extend_from_slice(
1500 b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
1501 /Resources << /Font << /F1 5 0 R >> >> /Contents 4 0 R >>\nendobj\n",
1502 );
1503
1504 let content_offset = pdf.len();
1505 pdf.extend_from_slice(
1506 format!("4 0 obj\n<< /Length {} >>\nstream\n", content_cipher.len()).as_bytes(),
1507 );
1508 pdf.extend_from_slice(&content_cipher);
1509 pdf.extend_from_slice(b"\nendstream\nendobj\n");
1510
1511 let font_offset = pdf.len();
1512 pdf.extend_from_slice(
1513 b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica \
1514 /Encoding /WinAnsiEncoding >>\nendobj\n",
1515 );
1516
1517 let encrypt_offset = pdf.len();
1518 pdf.extend_from_slice(
1519 b"6 0 obj\n<< /Filter /Standard /V 4 /R 4 /Length 128 \
1520 /CF << /StdCF << /CFM /AESV2 /Length 16 /AuthEvent /DocOpen >> >> \
1521 /StmF /StdCF /StrF /StdCF ",
1522 );
1523 pdf.extend_from_slice(format!("/P {permissions} ").as_bytes());
1524 if !encrypt_metadata {
1525 pdf.extend_from_slice(b"/EncryptMetadata false ");
1526 }
1527 pdf.extend_from_slice(b"/O ");
1528 pdf.extend_from_slice(&escape_literal(&owner_entry));
1529 pdf.extend_from_slice(b" /U ");
1530 pdf.extend_from_slice(&escape_literal(&u_entry));
1531 pdf.extend_from_slice(b" >>\nendobj\n");
1532
1533 let xref_offset = pdf.len();
1534 pdf.extend_from_slice(b"xref\n0 7\n");
1535 pdf.extend_from_slice(b"0000000000 65535 f \n");
1536 for offset in [
1537 catalog_offset,
1538 pages_offset,
1539 page_offset,
1540 content_offset,
1541 font_offset,
1542 encrypt_offset,
1543 ] {
1544 pdf.extend_from_slice(format!("{offset:010} 00000 n \n").as_bytes());
1545 }
1546 pdf.extend_from_slice(b"trailer\n<< /Size 7 /Root 1 0 R /Encrypt 6 0 R /ID [");
1547 pdf.extend_from_slice(&escape_literal(&id_first));
1548 pdf.extend_from_slice(&escape_literal(&id_first));
1549 pdf.extend_from_slice(b"] >>\n");
1550 pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());
1551
1552 (pdf, content_plain)
1553 }
1554
/// An AES-128 fixture built with an empty user password must open through
/// the password-less `parse_pdf` entry point.
#[test]
fn parses_aes_128_encrypted_pdf_with_empty_password() {
    let (encrypted, expected_plaintext) =
        build_aes_128_encrypted_pdf(b"", b"arbitrary-owner-password", true);
    let parsed = parse_pdf(&encrypted).expect("empty-password AES-128 PDF should decrypt");
    assert_decrypts_content_stream(&parsed, expected_plaintext);
}
1561
/// Supplying the user password opens the AES-128 fixture.
#[test]
fn parses_aes_128_encrypted_pdf_with_user_password() {
    let (encrypted, expected_plaintext) =
        build_aes_128_encrypted_pdf(b"userpw", b"ownerpw", true);
    let outcome = parse_pdf_with_password(&encrypted, b"userpw");
    let parsed = outcome.expect("correct user password should decrypt AES-128 PDF");
    assert_decrypts_content_stream(&parsed, expected_plaintext);
}
1569
/// Supplying the owner password (instead of the user password) also opens
/// the AES-128 fixture.
#[test]
fn parses_aes_128_encrypted_pdf_with_owner_password() {
    let (encrypted, expected_plaintext) =
        build_aes_128_encrypted_pdf(b"userpw", b"ownerpw", true);
    let outcome = parse_pdf_with_password(&encrypted, b"ownerpw");
    let parsed = outcome.expect("correct owner password should decrypt AES-128 PDF");
    assert_decrypts_content_stream(&parsed, expected_plaintext);
}
1577
/// A password matching neither the user nor the owner password must surface
/// as `PdfError::InvalidPassword` for the AES-128 fixture.
#[test]
fn aes_128_rejects_wrong_password() {
    let (encrypted, _) = build_aes_128_encrypted_pdf(b"userpw", b"ownerpw", true);
    let outcome = parse_pdf_with_password(&encrypted, b"wrongpw");
    let error = outcome.expect_err("wrong password must not decrypt AES-128 PDF");
    assert_eq!(error, PdfError::InvalidPassword);
}
1585
1586 fn build_aes_256_encrypted_pdf(
1590 user_password: &[u8],
1591 owner_password: &[u8],
1592 revision: crate::crypto::SecurityRevision,
1593 ) -> (Vec<u8>, &'static [u8]) {
1594 use crate::crypto::test_helpers::{
1595 aes_256_cbc_encrypt, compute_v5_o_and_oe, compute_v5_u_and_ue,
1596 };
1597
1598 let permissions: i32 = -4;
1599 let file_key = [0x13u8; 32];
1600 let u_validation_salt = [0xAAu8; 8];
1601 let u_key_salt = [0xBBu8; 8];
1602 let o_validation_salt = [0xCCu8; 8];
1603 let o_key_salt = [0xDDu8; 8];
1604
1605 let (u_entry, ue_entry) = compute_v5_u_and_ue(
1606 user_password,
1607 &u_validation_salt,
1608 &u_key_salt,
1609 &file_key,
1610 revision,
1611 );
1612 let u_vector: [u8; 48] = u_entry.as_slice().try_into().expect("U is 48 bytes");
1613 let (o_entry, oe_entry) = compute_v5_o_and_oe(
1614 owner_password,
1615 &o_validation_salt,
1616 &o_key_salt,
1617 &u_vector,
1618 &file_key,
1619 revision,
1620 );
1621
1622 let content_iv = [0x42u8; 16];
1623 let content_plain: &'static [u8] = b"BT\n/F1 24 Tf\n72 700 Td\n(AES-256 SECRET) Tj\nET\n";
1624 let content_cipher = aes_256_cbc_encrypt(&file_key, &content_iv, content_plain);
1625
1626 let escape_literal = |bytes: &[u8]| -> Vec<u8> {
1627 let mut out = Vec::with_capacity(bytes.len() + 2);
1628 out.push(b'(');
1629 for &byte in bytes {
1630 match byte {
1631 b'(' | b')' | b'\\' => {
1632 out.push(b'\\');
1633 out.push(byte);
1634 }
1635 _ => out.push(byte),
1636 }
1637 }
1638 out.push(b')');
1639 out
1640 };
1641
1642 let mut pdf: Vec<u8> = Vec::new();
1643 pdf.extend_from_slice(b"%PDF-2.0\n");
1644
1645 let catalog_offset = pdf.len();
1646 pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1647
1648 let pages_offset = pdf.len();
1649 pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 1 /Kids [3 0 R] >>\nendobj\n");
1650
1651 let page_offset = pdf.len();
1652 pdf.extend_from_slice(
1653 b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
1654 /Resources << /Font << /F1 5 0 R >> >> /Contents 4 0 R >>\nendobj\n",
1655 );
1656
1657 let content_offset = pdf.len();
1658 pdf.extend_from_slice(
1659 format!("4 0 obj\n<< /Length {} >>\nstream\n", content_cipher.len()).as_bytes(),
1660 );
1661 pdf.extend_from_slice(&content_cipher);
1662 pdf.extend_from_slice(b"\nendstream\nendobj\n");
1663
1664 let font_offset = pdf.len();
1665 pdf.extend_from_slice(
1666 b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica \
1667 /Encoding /WinAnsiEncoding >>\nendobj\n",
1668 );
1669
1670 let r_value = match revision {
1671 crate::crypto::SecurityRevision::R5 => 5,
1672 crate::crypto::SecurityRevision::R6 => 6,
1673 _ => panic!("V=5 fixture requires R=5 or R=6"),
1674 };
1675
1676 let encrypt_offset = pdf.len();
1677 pdf.extend_from_slice(
1678 format!(
1679 "6 0 obj\n<< /Filter /Standard /V 5 /R {r_value} /Length 256 \
1680 /CF << /StdCF << /CFM /AESV3 /Length 32 /AuthEvent /DocOpen >> >> \
1681 /StmF /StdCF /StrF /StdCF /P {permissions} "
1682 )
1683 .as_bytes(),
1684 );
1685 pdf.extend_from_slice(b"/O ");
1686 pdf.extend_from_slice(&escape_literal(&o_entry));
1687 pdf.extend_from_slice(b" /U ");
1688 pdf.extend_from_slice(&escape_literal(&u_entry));
1689 pdf.extend_from_slice(b" /OE ");
1690 pdf.extend_from_slice(&escape_literal(&oe_entry));
1691 pdf.extend_from_slice(b" /UE ");
1692 pdf.extend_from_slice(&escape_literal(&ue_entry));
1693 pdf.extend_from_slice(b" >>\nendobj\n");
1694
1695 let xref_offset = pdf.len();
1696 pdf.extend_from_slice(b"xref\n0 7\n");
1697 pdf.extend_from_slice(b"0000000000 65535 f \n");
1698 for offset in [
1699 catalog_offset,
1700 pages_offset,
1701 page_offset,
1702 content_offset,
1703 font_offset,
1704 encrypt_offset,
1705 ] {
1706 pdf.extend_from_slice(format!("{offset:010} 00000 n \n").as_bytes());
1707 }
1708 let id_literal: [u8; 16] = [
1711 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE,
1712 0xFF, 0x00,
1713 ];
1714 pdf.extend_from_slice(b"trailer\n<< /Size 7 /Root 1 0 R /Encrypt 6 0 R /ID [");
1715 pdf.extend_from_slice(&escape_literal(&id_literal));
1716 pdf.extend_from_slice(&escape_literal(&id_literal));
1717 pdf.extend_from_slice(b"] >>\n");
1718 pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());
1719
1720 (pdf, content_plain)
1721 }
1722
/// Supplying the user password opens the AES-256 fixture built with `R6`.
#[test]
fn parses_aes_256_r6_encrypted_pdf_with_user_password() {
    let revision = crate::crypto::SecurityRevision::R6;
    let (encrypted, expected_plaintext) =
        build_aes_256_encrypted_pdf(b"userpw", b"ownerpw", revision);
    let outcome = parse_pdf_with_password(&encrypted, b"userpw");
    let parsed = outcome.expect("correct user password should decrypt AES-256 R=6 PDF");
    assert_decrypts_content_stream(&parsed, expected_plaintext);
}
1731
/// Supplying the owner password (instead of the user password) also opens
/// the AES-256 fixture built with `R6`.
#[test]
fn parses_aes_256_r6_encrypted_pdf_with_owner_password() {
    let revision = crate::crypto::SecurityRevision::R6;
    let (encrypted, expected_plaintext) =
        build_aes_256_encrypted_pdf(b"userpw", b"ownerpw", revision);
    let outcome = parse_pdf_with_password(&encrypted, b"ownerpw");
    let parsed = outcome.expect("correct owner password should decrypt AES-256 R=6 PDF");
    assert_decrypts_content_stream(&parsed, expected_plaintext);
}
1740
/// An AES-256 fixture built with `R5` and an empty user password must open
/// through the password-less `parse_pdf` entry point.
#[test]
fn parses_aes_256_r5_encrypted_pdf_with_empty_password() {
    let revision = crate::crypto::SecurityRevision::R5;
    let (encrypted, expected_plaintext) =
        build_aes_256_encrypted_pdf(b"", b"ownerpw", revision);
    let parsed =
        parse_pdf(&encrypted).expect("empty-password AES-256 R=5 PDF should decrypt");
    assert_decrypts_content_stream(&parsed, expected_plaintext);
}
1748
/// A password matching neither the user nor the owner password must surface
/// as `PdfError::InvalidPassword` for the AES-256 fixture.
#[test]
fn aes_256_rejects_wrong_password() {
    let revision = crate::crypto::SecurityRevision::R6;
    let (encrypted, _) = build_aes_256_encrypted_pdf(b"userpw", b"ownerpw", revision);
    let outcome = parse_pdf_with_password(&encrypted, b"wrongpw");
    let error = outcome.expect_err("wrong password must not decrypt AES-256 PDF");
    assert_eq!(error, PdfError::InvalidPassword);
}
1757
/// The AES-128 fixture still opens with the empty password when it is built
/// with `encrypt_metadata = false`, i.e. when the encryption dictionary
/// carries `/EncryptMetadata false`.
#[test]
fn parses_aes_128_with_encrypt_metadata_false() {
    let (encrypted, expected_plaintext) = build_aes_128_encrypted_pdf(b"", b"ownerpw", false);
    let outcome = parse_pdf(&encrypted);
    let parsed = outcome.expect("empty-password AES-128 PDF should decrypt with metadata off");
    assert_decrypts_content_stream(&parsed, expected_plaintext);
}
1768}