1use std::collections::{BTreeMap, BTreeSet};
2
3use crate::crypto::{BytesKind, StandardSecurityHandler};
4use crate::document::build_document;
5use crate::error::{PdfError, PdfResult};
6use crate::stream::decode_stream;
7use crate::types::{
8 ObjectRef, PdfDictionary, PdfFile, PdfObject, PdfStream, PdfString, PdfValue, XrefEntry,
9};
10
11pub fn parse_pdf(bytes: &[u8]) -> PdfResult<crate::document::ParsedDocument> {
15 parse_pdf_with_password(bytes, b"")
16}
17
18pub fn parse_pdf_with_password(
24 bytes: &[u8],
25 password: &[u8],
26) -> PdfResult<crate::document::ParsedDocument> {
27 let version = parse_header(bytes)?;
28 let startxref = find_startxref(bytes)?;
29 let (xref, mut trailer) = parse_xref_table(bytes, startxref)?;
30
31 let mut objects = BTreeMap::new();
32 let mut max_object_number = 0;
33 let mut compressed: Vec<(ObjectRef, u32, u32)> = Vec::new();
34
35 for (object_ref, entry) in &xref {
36 match entry {
37 XrefEntry::Free => {}
38 XrefEntry::Uncompressed { offset, .. } => {
39 if object_ref.object_number == 0 {
40 continue;
41 }
42 let object = parse_indirect_object(bytes, *offset, Some(&xref))?;
43 max_object_number = max_object_number.max(object_ref.object_number);
44 objects.insert(*object_ref, object);
45 }
46 XrefEntry::Compressed {
47 stream_object_number,
48 index,
49 } => {
50 compressed.push((*object_ref, *stream_object_number, *index));
51 }
52 }
53 }
54
55 decrypt_document_if_encrypted(&mut objects, &mut trailer, password)?;
61
62 materialize_object_streams(&mut objects, &mut max_object_number, &compressed)?;
63
64 let file = PdfFile {
65 version,
66 objects,
67 trailer,
68 max_object_number,
69 };
70 build_document(file)
71}
72
73fn decrypt_document_if_encrypted(
74 objects: &mut BTreeMap<ObjectRef, PdfObject>,
75 trailer: &mut PdfDictionary,
76 password: &[u8],
77) -> PdfResult<()> {
78 let encrypt_ref = match trailer.get("Encrypt") {
79 Some(PdfValue::Reference(object_ref)) => *object_ref,
80 Some(PdfValue::Dictionary(_)) => {
81 return Err(PdfError::Unsupported(
82 "direct (non-indirect) /Encrypt dictionaries are not supported".to_string(),
83 ));
84 }
85 Some(_) => {
86 return Err(PdfError::Corrupt(
87 "trailer /Encrypt is not a reference".to_string(),
88 ));
89 }
90 None => return Ok(()),
91 };
92
93 let encrypt_dict = match objects.get(&encrypt_ref) {
94 Some(PdfObject::Value(PdfValue::Dictionary(dict))) => dict.clone(),
95 _ => {
96 return Err(PdfError::Corrupt(
97 "trailer /Encrypt does not point at a dictionary".to_string(),
98 ));
99 }
100 };
101
102 let id_first = extract_id_first(trailer)?;
103
104 let handler = StandardSecurityHandler::open(&encrypt_dict, &id_first, password)?
105 .ok_or(PdfError::InvalidPassword)?;
106
107 let refs: Vec<ObjectRef> = objects.keys().copied().collect();
108 for object_ref in refs {
109 if object_ref == encrypt_ref {
110 continue;
113 }
114 let object = objects
115 .get_mut(&object_ref)
116 .expect("ref obtained from map keys must still be present");
117 match object {
118 PdfObject::Stream(stream) => {
119 let type_name = stream.dict.get("Type").and_then(PdfValue::as_name);
123 let is_xref_stream = type_name == Some("XRef");
124 let is_exempt_metadata =
125 !handler.encrypts_metadata() && type_name == Some("Metadata");
126 decrypt_strings_in_dict(&mut stream.dict, &handler, object_ref)?;
127 if !is_xref_stream && !is_exempt_metadata {
128 stream.data =
129 handler.decrypt_bytes(&stream.data, object_ref, BytesKind::Stream)?;
130 }
131 }
132 PdfObject::Value(value) => {
133 decrypt_strings_in_value(value, &handler, object_ref)?;
134 }
135 }
136 }
137
138 trailer.remove("Encrypt");
139 Ok(())
140}
141
142fn extract_id_first(trailer: &PdfDictionary) -> PdfResult<Vec<u8>> {
143 match trailer.get("ID") {
144 Some(PdfValue::Array(entries)) => match entries.first() {
145 Some(PdfValue::String(value)) => Ok(value.0.clone()),
146 _ => Err(PdfError::Corrupt(
147 "trailer /ID[0] is not a string — cannot derive encryption key".to_string(),
148 )),
149 },
150 _ => Err(PdfError::Corrupt(
151 "encrypted PDF is missing the trailer /ID array required for key derivation"
152 .to_string(),
153 )),
154 }
155}
156
157fn decrypt_strings_in_value(
158 value: &mut PdfValue,
159 handler: &StandardSecurityHandler,
160 object_ref: ObjectRef,
161) -> PdfResult<()> {
162 match value {
163 PdfValue::String(string) => {
164 string.0 = handler.decrypt_bytes(&string.0, object_ref, BytesKind::String)?;
165 }
166 PdfValue::Array(items) => {
167 for item in items {
168 decrypt_strings_in_value(item, handler, object_ref)?;
169 }
170 }
171 PdfValue::Dictionary(dict) => {
172 decrypt_strings_in_dict(dict, handler, object_ref)?;
173 }
174 _ => {}
175 }
176 Ok(())
177}
178
179fn decrypt_strings_in_dict(
180 dict: &mut PdfDictionary,
181 handler: &StandardSecurityHandler,
182 object_ref: ObjectRef,
183) -> PdfResult<()> {
184 for value in dict.values_mut() {
185 decrypt_strings_in_value(value, handler, object_ref)?;
186 }
187 Ok(())
188}
189
190fn parse_header(bytes: &[u8]) -> PdfResult<String> {
191 if !bytes.starts_with(b"%PDF-") {
192 return Err(PdfError::Parse("missing PDF header".to_string()));
193 }
194 let line_end = bytes
195 .iter()
196 .position(|byte| *byte == b'\n' || *byte == b'\r')
197 .ok_or_else(|| PdfError::Parse("unterminated header".to_string()))?;
198 Ok(String::from_utf8_lossy(&bytes[5..line_end])
199 .trim()
200 .to_string())
201}
202
203fn find_startxref(bytes: &[u8]) -> PdfResult<usize> {
204 let marker = b"startxref";
205 let position = bytes
206 .windows(marker.len())
207 .rposition(|window| window == marker)
208 .ok_or_else(|| PdfError::Parse("missing startxref".to_string()))?;
209 let mut parser = Cursor::new(bytes, position + marker.len());
210 parser.skip_ws_and_comments();
211 parser.parse_usize()
212}
213
214fn parse_xref_table(
215 bytes: &[u8],
216 start_offset: usize,
217) -> PdfResult<(BTreeMap<ObjectRef, XrefEntry>, PdfDictionary)> {
218 let mut merged_entries: BTreeMap<ObjectRef, XrefEntry> = BTreeMap::new();
219 let mut newest_trailer: Option<PdfDictionary> = None;
220 let mut visited = BTreeSet::new();
221 let mut pending: Vec<usize> = vec![start_offset];
222
223 while let Some(offset) = pending.pop() {
224 if !visited.insert(offset) {
225 continue;
226 }
227 let section = parse_xref_section_at(bytes, offset)?;
228
229 for (object_ref, entry) in section.entries {
231 merged_entries.entry(object_ref).or_insert(entry);
232 }
233
234 if newest_trailer.is_none() {
235 newest_trailer = Some(section.trailer.clone());
236 }
237
238 if let Some(stm_offset) = section
239 .trailer
240 .get("XRefStm")
241 .and_then(PdfValue::as_integer)
242 {
243 pending.push(stm_offset as usize);
244 }
245 if let Some(prev_offset) = section.trailer.get("Prev").and_then(PdfValue::as_integer) {
246 pending.push(prev_offset as usize);
247 }
248 }
249
250 let trailer = newest_trailer
251 .ok_or_else(|| PdfError::Parse("xref chain produced no trailer".to_string()))?;
252 Ok((merged_entries, trailer))
253}
254
/// One cross-reference section together with the dictionary that accompanies it.
struct XrefSection {
    /// Entries declared by this section, keyed by object number and generation.
    entries: BTreeMap<ObjectRef, XrefEntry>,
    /// The dictionary after the `trailer` keyword for a classic table, or the
    /// stream dictionary for a cross-reference stream.
    trailer: PdfDictionary,
}
259
260fn parse_xref_section_at(bytes: &[u8], offset: usize) -> PdfResult<XrefSection> {
261 let mut probe = Cursor::new(bytes, offset);
262 probe.skip_ws_and_comments();
263 if probe.peek_keyword("xref") {
264 parse_classic_xref_section(bytes, offset)
265 } else {
266 parse_xref_stream_section(bytes, offset)
267 }
268}
269
270fn parse_classic_xref_section(bytes: &[u8], offset: usize) -> PdfResult<XrefSection> {
271 let mut cursor = Cursor::new(bytes, offset);
272 cursor.expect_keyword("xref")?;
273 let mut entries = BTreeMap::new();
274 loop {
275 cursor.skip_ws_and_comments();
276 if cursor.peek_keyword("trailer") {
277 break;
278 }
279 let start = cursor.parse_u32()?;
280 cursor.skip_ws_and_comments();
281 let count = cursor.parse_u32()?;
282 cursor.skip_line_breaks();
283 for index in 0..count {
284 let line = cursor.read_line()?;
285 if line.len() < 17 {
286 return Err(PdfError::Parse("invalid xref entry".to_string()));
287 }
288 let parts = String::from_utf8_lossy(line).trim().to_string();
289 let mut fields = parts.split_whitespace();
290 let entry_offset = fields
291 .next()
292 .ok_or_else(|| PdfError::Parse("invalid xref entry offset".to_string()))?
293 .parse::<usize>()
294 .map_err(|_| PdfError::Parse("invalid xref entry offset".to_string()))?;
295 let generation = fields
296 .next()
297 .ok_or_else(|| PdfError::Parse("invalid xref generation".to_string()))?
298 .parse::<u16>()
299 .map_err(|_| PdfError::Parse("invalid xref generation".to_string()))?;
300 let flag = fields
301 .next()
302 .ok_or_else(|| PdfError::Parse("invalid xref flag".to_string()))?;
303 let object_number = start
304 .checked_add(index)
305 .ok_or_else(|| PdfError::Parse("xref object number overflow".to_string()))?;
306 let entry = if flag == "n" {
307 XrefEntry::Uncompressed {
308 offset: entry_offset,
309 generation,
310 }
311 } else {
312 XrefEntry::Free
313 };
314 entries.insert(ObjectRef::new(object_number, generation), entry);
315 }
316 }
317 cursor.expect_keyword("trailer")?;
318 let trailer = match cursor.parse_value()? {
319 PdfValue::Dictionary(dictionary) => dictionary,
320 _ => return Err(PdfError::Parse("trailer is not a dictionary".to_string())),
321 };
322 Ok(XrefSection { entries, trailer })
323}
324
325fn parse_xref_stream_section(bytes: &[u8], offset: usize) -> PdfResult<XrefSection> {
326 let object = parse_indirect_object(bytes, offset, None)?;
331 let stream = match object {
332 PdfObject::Stream(stream) => stream,
333 PdfObject::Value(_) => {
334 return Err(PdfError::Parse(
335 "expected xref stream object at startxref offset".to_string(),
336 ));
337 }
338 };
339 if stream.dict.get("Type").and_then(PdfValue::as_name) != Some("XRef") {
340 return Err(PdfError::Parse(
341 "xref stream object has wrong Type".to_string(),
342 ));
343 }
344
345 let size = stream
346 .dict
347 .get("Size")
348 .and_then(PdfValue::as_integer)
349 .ok_or_else(|| PdfError::Corrupt("xref stream missing Size".to_string()))?
350 as u32;
351
352 let w = stream
353 .dict
354 .get("W")
355 .and_then(PdfValue::as_array)
356 .ok_or_else(|| PdfError::Corrupt("xref stream missing W".to_string()))?;
357 if w.len() != 3 {
358 return Err(PdfError::Corrupt(
359 "xref stream W must have three entries".to_string(),
360 ));
361 }
362 let w0 = w[0]
363 .as_integer()
364 .ok_or_else(|| PdfError::Corrupt("invalid W[0]".to_string()))? as usize;
365 let w1 = w[1]
366 .as_integer()
367 .ok_or_else(|| PdfError::Corrupt("invalid W[1]".to_string()))? as usize;
368 let w2 = w[2]
369 .as_integer()
370 .ok_or_else(|| PdfError::Corrupt("invalid W[2]".to_string()))? as usize;
371 let row_len = w0 + w1 + w2;
372 if row_len == 0 {
373 return Err(PdfError::Corrupt(
374 "xref stream row width is zero".to_string(),
375 ));
376 }
377
378 let index: Vec<(u32, u32)> = match stream.dict.get("Index") {
379 Some(PdfValue::Array(entries)) => {
380 if entries.len() % 2 != 0 {
381 return Err(PdfError::Corrupt(
382 "xref stream Index must have an even number of entries".to_string(),
383 ));
384 }
385 let mut pairs = Vec::with_capacity(entries.len() / 2);
386 for chunk in entries.chunks(2) {
387 let first = chunk[0]
388 .as_integer()
389 .ok_or_else(|| PdfError::Corrupt("invalid Index entry".to_string()))?
390 as u32;
391 let count = chunk[1]
392 .as_integer()
393 .ok_or_else(|| PdfError::Corrupt("invalid Index entry".to_string()))?
394 as u32;
395 pairs.push((first, count));
396 }
397 pairs
398 }
399 Some(_) => {
400 return Err(PdfError::Corrupt(
401 "xref stream Index is not an array".to_string(),
402 ));
403 }
404 None => vec![(0, size)],
405 };
406
407 let decoded = decode_stream(&stream)?;
408 let expected_rows: u32 = index.iter().map(|(_, count)| *count).sum();
409 if decoded.len() < expected_rows as usize * row_len {
410 return Err(PdfError::Corrupt(
411 "xref stream body is shorter than declared entries".to_string(),
412 ));
413 }
414
415 let mut entries: BTreeMap<ObjectRef, XrefEntry> = BTreeMap::new();
416 let mut cursor = 0usize;
417 for (first, count) in index {
418 for i in 0..count {
419 let row = &decoded[cursor..cursor + row_len];
420 cursor += row_len;
421 let field_type = if w0 == 0 { 1u64 } else { read_be(&row[..w0])? };
422 let f2 = read_be(&row[w0..w0 + w1])?;
423 let f3 = read_be(&row[w0 + w1..])?;
424 let object_number = first + i;
425 let entry = match field_type {
426 0 => XrefEntry::Free,
427 1 => XrefEntry::Uncompressed {
428 offset: f2 as usize,
429 generation: f3 as u16,
430 },
431 2 => XrefEntry::Compressed {
432 stream_object_number: f2 as u32,
433 index: f3 as u32,
434 },
435 other => {
436 return Err(PdfError::Unsupported(format!(
437 "xref stream entry type {other} is not supported"
438 )));
439 }
440 };
441 let generation = match entry {
442 XrefEntry::Uncompressed { generation, .. } => generation,
443 _ => 0,
444 };
445 entries.insert(ObjectRef::new(object_number, generation), entry);
446 }
447 }
448
449 Ok(XrefSection {
450 entries,
451 trailer: stream.dict,
452 })
453}
454
455fn read_be(bytes: &[u8]) -> PdfResult<u64> {
456 if bytes.len() > 8 {
457 return Err(PdfError::Corrupt(
458 "xref stream field width exceeds 8 bytes".to_string(),
459 ));
460 }
461 let mut value: u64 = 0;
462 for byte in bytes {
463 value = (value << 8) | *byte as u64;
464 }
465 Ok(value)
466}
467
468fn materialize_object_streams(
469 objects: &mut BTreeMap<ObjectRef, PdfObject>,
470 max_object_number: &mut u32,
471 compressed: &[(ObjectRef, u32, u32)],
472) -> PdfResult<()> {
473 if compressed.is_empty() {
474 return Ok(());
475 }
476
477 let mut by_stream: BTreeMap<u32, Vec<(ObjectRef, u32)>> = BTreeMap::new();
478 for (object_ref, stream_obj_num, index) in compressed {
479 by_stream
480 .entry(*stream_obj_num)
481 .or_default()
482 .push((*object_ref, *index));
483 }
484
485 for (stream_obj_num, mut members) in by_stream {
486 let stream_ref = ObjectRef::new(stream_obj_num, 0);
487 let stream = match objects.get(&stream_ref) {
488 Some(PdfObject::Stream(stream)) => stream.clone(),
489 Some(PdfObject::Value(_)) => {
490 return Err(PdfError::Corrupt(format!(
491 "object stream {stream_obj_num} is not a stream"
492 )));
493 }
494 None => {
495 return Err(PdfError::Corrupt(format!(
496 "compressed entry references missing object stream {stream_obj_num}"
497 )));
498 }
499 };
500 if stream.dict.get("Type").and_then(PdfValue::as_name) != Some("ObjStm") {
501 return Err(PdfError::Corrupt(format!(
502 "object {stream_obj_num} is not marked as ObjStm"
503 )));
504 }
505 let n = stream
506 .dict
507 .get("N")
508 .and_then(PdfValue::as_integer)
509 .ok_or_else(|| PdfError::Corrupt("ObjStm missing N".to_string()))?
510 as usize;
511 let first = stream
512 .dict
513 .get("First")
514 .and_then(PdfValue::as_integer)
515 .ok_or_else(|| PdfError::Corrupt("ObjStm missing First".to_string()))?
516 as usize;
517
518 let decoded = decode_stream(&stream)?;
519 if first > decoded.len() {
520 return Err(PdfError::Corrupt(
521 "ObjStm First offset is past end of decoded data".to_string(),
522 ));
523 }
524
525 let header = &decoded[..first];
526 let mut header_cursor = Cursor::new(header, 0);
527 let mut entries: Vec<(u32, usize)> = Vec::with_capacity(n);
528 for _ in 0..n {
529 header_cursor.skip_ws_and_comments();
530 let obj_num = header_cursor.parse_u32()?;
531 header_cursor.skip_ws_and_comments();
532 let rel_offset = header_cursor.parse_usize()?;
533 entries.push((obj_num, rel_offset));
534 }
535
536 members.sort_by_key(|(_, index)| *index);
538 for (member_ref, index) in members {
539 let idx = index as usize;
540 if idx >= entries.len() {
541 return Err(PdfError::Corrupt(format!(
542 "ObjStm {stream_obj_num} has no index {idx}"
543 )));
544 }
545 let (declared_number, rel_offset) = entries[idx];
546 if declared_number != member_ref.object_number {
547 return Err(PdfError::Corrupt(format!(
548 "ObjStm {stream_obj_num} index {idx} has number {declared_number} but xref expected {}",
549 member_ref.object_number
550 )));
551 }
552 let absolute_offset = first
553 .checked_add(rel_offset)
554 .ok_or_else(|| PdfError::Corrupt("ObjStm offset overflow".to_string()))?;
555 if absolute_offset > decoded.len() {
556 return Err(PdfError::Corrupt(
557 "ObjStm member offset is past end of decoded data".to_string(),
558 ));
559 }
560 let mut value_cursor = Cursor::new(&decoded, absolute_offset);
561 let value = value_cursor.parse_value()?;
562 if let PdfValue::Dictionary(dict) = &value {
563 if dict.get("Type").and_then(PdfValue::as_name) == Some("ObjStm") {
564 return Err(PdfError::Unsupported(
565 "nested object streams are not supported".to_string(),
566 ));
567 }
568 }
569 *max_object_number = (*max_object_number).max(member_ref.object_number);
570 objects.insert(member_ref, PdfObject::Value(value));
571 }
572 }
573
574 Ok(())
575}
576
577fn parse_indirect_object(
578 bytes: &[u8],
579 offset: usize,
580 xref: Option<&BTreeMap<ObjectRef, XrefEntry>>,
581) -> PdfResult<PdfObject> {
582 let mut cursor = Cursor::new(bytes, offset);
583 let _object_number = cursor.parse_u32()?;
584 cursor.skip_ws_and_comments();
585 let _generation = cursor.parse_u16()?;
586 cursor.skip_ws_and_comments();
587 cursor.expect_keyword("obj")?;
588 cursor.skip_ws_and_comments();
589
590 let value = cursor.parse_value()?;
591 cursor.skip_ws_and_comments();
592 if matches!(value, PdfValue::Dictionary(_)) && cursor.peek_keyword("stream") {
593 let dict = match value {
594 PdfValue::Dictionary(dict) => dict,
595 _ => unreachable!(),
596 };
597 cursor.expect_keyword("stream")?;
598 cursor.consume_stream_line_break();
599 let stream_start = cursor.position;
600 let length_hint = match dict.get("Length") {
608 Some(PdfValue::Integer(len)) if *len >= 0 => Some(*len as usize),
609 Some(PdfValue::Reference(target)) => {
610 xref.and_then(|map| resolve_stream_length_ref(bytes, map, *target))
611 }
612 _ => None,
613 };
614 let (data, endstream_pos) = match length_hint {
615 Some(len) if stream_start + len <= bytes.len() => {
616 let mut check = stream_start + len;
619 while check < bytes.len() && matches!(bytes[check], b'\r' | b'\n') {
620 check += 1;
621 }
622 if bytes.get(check..check + 9) == Some(b"endstream") {
623 (bytes[stream_start..stream_start + len].to_vec(), check)
624 } else {
625 let pos = find_keyword(bytes, stream_start, b"endstream")
627 .ok_or_else(|| PdfError::Parse("stream missing endstream".to_string()))?;
628 (bytes[stream_start..pos].to_vec(), pos)
629 }
630 }
631 _ => {
632 let pos = find_keyword(bytes, stream_start, b"endstream")
633 .ok_or_else(|| PdfError::Parse("stream missing endstream".to_string()))?;
634 (bytes[stream_start..pos].to_vec(), pos)
635 }
636 };
637 cursor.position = endstream_pos;
638 cursor.expect_keyword("endstream")?;
639 cursor.skip_ws_and_comments();
640 cursor.expect_keyword("endobj")?;
641 Ok(PdfObject::Stream(PdfStream { dict, data }))
642 } else {
643 cursor.expect_keyword("endobj")?;
644 Ok(PdfObject::Value(value))
645 }
646}
647
648fn resolve_stream_length_ref(
657 bytes: &[u8],
658 xref: &BTreeMap<ObjectRef, XrefEntry>,
659 target: ObjectRef,
660) -> Option<usize> {
661 let entry = xref.get(&target)?;
662 let offset = match entry {
663 XrefEntry::Uncompressed { offset, .. } => *offset,
664 XrefEntry::Compressed { .. } | XrefEntry::Free => return None,
667 };
668 let object = parse_indirect_object(bytes, offset, None).ok()?;
672 match object {
673 PdfObject::Value(PdfValue::Integer(len)) if len >= 0 => Some(len as usize),
674 _ => None,
675 }
676}
677
/// Returns the absolute offset of the first occurrence of `keyword` in `bytes`
/// at or after `start`.
///
/// Returns `None` when the keyword does not occur, when `start` lies beyond the
/// end of `bytes`, or when `keyword` is empty. The last two cases previously
/// panicked (out-of-range slice and zero-sized `windows`).
fn find_keyword(bytes: &[u8], start: usize, keyword: &[u8]) -> Option<usize> {
    if keyword.is_empty() {
        return None;
    }
    bytes
        .get(start..)?
        .windows(keyword.len())
        .position(|window| window == keyword)
        .map(|relative| start + relative)
}
684
/// Byte-level reader over a buffer of PDF syntax, tracking a current offset.
struct Cursor<'a> {
    /// The buffer being read.
    bytes: &'a [u8],
    /// Offset of the next byte to read; may be at or beyond `bytes.len()`, in
    /// which case reads report end of input.
    position: usize,
}
689
690impl<'a> Cursor<'a> {
691 fn new(bytes: &'a [u8], position: usize) -> Self {
692 Self { bytes, position }
693 }
694
695 fn eof(&self) -> bool {
696 self.position >= self.bytes.len()
697 }
698
699 fn current(&self) -> Option<u8> {
700 self.bytes.get(self.position).copied()
701 }
702
703 fn skip_ws_and_comments(&mut self) {
704 while let Some(byte) = self.current() {
705 match byte {
706 b' ' | b'\t' | b'\n' | b'\r' | 0x0C | 0x00 => self.position += 1,
707 b'%' => {
708 while let Some(next) = self.current() {
709 self.position += 1;
710 if next == b'\n' || next == b'\r' {
711 break;
712 }
713 }
714 }
715 _ => break,
716 }
717 }
718 }
719
720 fn skip_line_breaks(&mut self) {
721 while matches!(self.current(), Some(b'\n' | b'\r')) {
722 self.position += 1;
723 }
724 }
725
726 fn read_line(&mut self) -> PdfResult<&'a [u8]> {
727 if self.eof() {
728 return Err(PdfError::Parse("unexpected end of file".to_string()));
729 }
730 let start = self.position;
731 while let Some(byte) = self.current() {
732 if byte == b'\n' || byte == b'\r' {
733 let end = self.position;
734 self.skip_line_breaks();
735 return Ok(&self.bytes[start..end]);
736 }
737 self.position += 1;
738 }
739 Ok(&self.bytes[start..self.position])
740 }
741
742 fn peek_keyword(&self, keyword: &str) -> bool {
743 self.bytes
744 .get(self.position..self.position + keyword.len())
745 .map(|slice| slice == keyword.as_bytes())
746 .unwrap_or(false)
747 }
748
749 fn expect_keyword(&mut self, keyword: &str) -> PdfResult<()> {
750 self.skip_ws_and_comments();
751 if self.peek_keyword(keyword) {
752 self.position += keyword.len();
753 Ok(())
754 } else {
755 Err(PdfError::Parse(format!("expected keyword {keyword}")))
756 }
757 }
758
759 fn consume_stream_line_break(&mut self) {
760 if self.current() == Some(b'\r') {
761 self.position += 1;
762 }
763 if self.current() == Some(b'\n') {
764 self.position += 1;
765 }
766 }
767
768 fn parse_u32(&mut self) -> PdfResult<u32> {
769 let token = self.parse_token()?;
770 token
771 .parse::<u32>()
772 .map_err(|_| PdfError::Parse(format!("invalid integer token: {token}")))
773 }
774
775 fn parse_u16(&mut self) -> PdfResult<u16> {
776 let token = self.parse_token()?;
777 token
778 .parse::<u16>()
779 .map_err(|_| PdfError::Parse(format!("invalid integer token: {token}")))
780 }
781
782 fn parse_usize(&mut self) -> PdfResult<usize> {
783 let token = self.parse_token()?;
784 token
785 .parse::<usize>()
786 .map_err(|_| PdfError::Parse(format!("invalid offset token: {token}")))
787 }
788
789 fn parse_token(&mut self) -> PdfResult<String> {
790 self.skip_ws_and_comments();
791 let start = self.position;
792 while let Some(byte) = self.current() {
793 if is_delimiter(byte) || is_whitespace(byte) {
794 break;
795 }
796 self.position += 1;
797 }
798 if self.position == start {
799 return Err(PdfError::Parse("expected token".to_string()));
800 }
801 Ok(String::from_utf8_lossy(&self.bytes[start..self.position]).to_string())
802 }
803
804 fn parse_value(&mut self) -> PdfResult<PdfValue> {
805 self.skip_ws_and_comments();
806 match self.current() {
807 Some(b'/') => self.parse_name(),
808 Some(b'(') => self.parse_literal_string(),
809 Some(b'[') => self.parse_array(),
810 Some(b'<') if self.bytes.get(self.position + 1) == Some(&b'<') => {
811 self.parse_dictionary()
812 }
813 Some(b'<') => self.parse_hex_string(),
814 Some(b't') if self.peek_keyword("true") => {
815 self.position += 4;
816 Ok(PdfValue::Bool(true))
817 }
818 Some(b'f') if self.peek_keyword("false") => {
819 self.position += 5;
820 Ok(PdfValue::Bool(false))
821 }
822 Some(b'n') if self.peek_keyword("null") => {
823 self.position += 4;
824 Ok(PdfValue::Null)
825 }
826 Some(_) => self.parse_number_or_reference(),
827 None => Err(PdfError::Parse("unexpected end of file".to_string())),
828 }
829 }
830
831 fn parse_name(&mut self) -> PdfResult<PdfValue> {
832 self.position += 1;
833 let mut raw = Vec::new();
834 while let Some(byte) = self.current() {
835 if is_delimiter(byte) || is_whitespace(byte) {
836 break;
837 }
838 if byte == b'#' {
839 let high =
840 self.bytes.get(self.position + 1).copied().ok_or_else(|| {
841 PdfError::Parse("truncated #XX escape in name".to_string())
842 })?;
843 let low =
844 self.bytes.get(self.position + 2).copied().ok_or_else(|| {
845 PdfError::Parse("truncated #XX escape in name".to_string())
846 })?;
847 let decoded = u8::from_str_radix(&format!("{}{}", high as char, low as char), 16)
848 .map_err(|_| {
849 PdfError::Parse("invalid #XX hex escape in name".to_string())
850 })?;
851 raw.push(decoded);
852 self.position += 3;
853 } else {
854 raw.push(byte);
855 self.position += 1;
856 }
857 }
858 Ok(PdfValue::Name(String::from_utf8_lossy(&raw).to_string()))
859 }
860
861 fn parse_literal_string(&mut self) -> PdfResult<PdfValue> {
862 self.position += 1;
863 let mut output = Vec::new();
864 let mut depth = 1usize;
865 while let Some(byte) = self.current() {
866 self.position += 1;
867 match byte {
868 b'\\' => {
869 let escaped = self
870 .current()
871 .ok_or_else(|| PdfError::Parse("unterminated string escape".to_string()))?;
872 self.position += 1;
873 match escaped {
874 b'n' => output.push(b'\n'),
875 b'r' => output.push(b'\r'),
876 b't' => output.push(b'\t'),
877 b'b' => output.push(0x08),
878 b'f' => output.push(0x0C),
879 b'(' | b')' | b'\\' => output.push(escaped),
880 b'\n' => {}
881 b'\r' => {
882 if self.current() == Some(b'\n') {
883 self.position += 1;
884 }
885 }
886 b'0'..=b'7' => {
887 let mut octal = vec![escaped];
888 for _ in 0..2 {
889 match self.current() {
890 Some(next @ b'0'..=b'7') => {
891 octal.push(next);
892 self.position += 1;
893 }
894 _ => break,
895 }
896 }
897 let value =
899 u16::from_str_radix(std::str::from_utf8(&octal).unwrap_or("0"), 8)
900 .unwrap_or(0);
901 output.push((value % 256) as u8);
902 }
903 other => output.push(other),
904 }
905 }
906 b'(' => {
907 depth += 1;
908 output.push(byte);
909 }
910 b')' => {
911 depth -= 1;
912 if depth == 0 {
913 return Ok(PdfValue::String(PdfString(output)));
914 }
915 output.push(byte);
916 }
917 _ => output.push(byte),
918 }
919 }
920 Err(PdfError::Parse("unterminated literal string".to_string()))
921 }
922
923 fn parse_hex_string(&mut self) -> PdfResult<PdfValue> {
924 self.position += 1;
925 let start = self.position;
926 while self.current() != Some(b'>') {
927 if self.eof() {
928 return Err(PdfError::Parse("unterminated hex string".to_string()));
929 }
930 self.position += 1;
931 }
932 let raw = String::from_utf8_lossy(&self.bytes[start..self.position])
933 .chars()
934 .filter(|character| !character.is_whitespace())
935 .collect::<String>();
936 self.position += 1;
937 let mut chars = raw.chars().collect::<Vec<_>>();
938 if chars.len() % 2 != 0 {
939 chars.push('0');
940 }
941 let mut bytes = Vec::with_capacity(chars.len() / 2);
942 for pair in chars.chunks(2) {
943 let value = u8::from_str_radix(&pair.iter().collect::<String>(), 16)
944 .map_err(|_| PdfError::Parse("invalid hex string".to_string()))?;
945 bytes.push(value);
946 }
947 Ok(PdfValue::String(PdfString(bytes)))
948 }
949
950 fn parse_array(&mut self) -> PdfResult<PdfValue> {
951 self.position += 1;
952 let mut values = Vec::new();
953 loop {
954 self.skip_ws_and_comments();
955 match self.current() {
956 Some(b']') => {
957 self.position += 1;
958 break;
959 }
960 Some(_) => values.push(self.parse_value()?),
961 None => return Err(PdfError::Parse("unterminated array".to_string())),
962 }
963 }
964 Ok(PdfValue::Array(values))
965 }
966
967 fn parse_dictionary(&mut self) -> PdfResult<PdfValue> {
968 self.position += 2;
969 let mut dictionary = PdfDictionary::new();
970 loop {
971 self.skip_ws_and_comments();
972 if self.current() == Some(b'>') && self.bytes.get(self.position + 1) == Some(&b'>') {
973 self.position += 2;
974 break;
975 }
976 let key = match self.parse_name()? {
977 PdfValue::Name(name) => name,
978 _ => unreachable!(),
979 };
980 let value = self.parse_value()?;
981 dictionary.insert(key, value);
982 }
983 Ok(PdfValue::Dictionary(dictionary))
984 }
985
986 fn parse_number_or_reference(&mut self) -> PdfResult<PdfValue> {
987 let first_token = self.parse_token()?;
988 if first_token.contains('.') || first_token.contains(['e', 'E']) {
989 return first_token
990 .parse::<f64>()
991 .map(PdfValue::Number)
992 .map_err(|_| PdfError::Parse(format!("invalid number token: {first_token}")));
993 }
994
995 let checkpoint = self.position;
996 self.skip_ws_and_comments();
997 if let Ok(second_token) = self.parse_token() {
998 self.skip_ws_and_comments();
999 if self.current() == Some(b'R')
1000 && second_token
1001 .chars()
1002 .all(|character| character.is_ascii_digit())
1003 {
1004 self.position += 1;
1005 return Ok(PdfValue::Reference(ObjectRef::new(
1006 first_token
1007 .parse::<u32>()
1008 .map_err(|_| PdfError::Parse("invalid reference object".to_string()))?,
1009 second_token
1010 .parse::<u16>()
1011 .map_err(|_| PdfError::Parse("invalid reference generation".to_string()))?,
1012 )));
1013 }
1014 }
1015 self.position = checkpoint;
1016 first_token
1017 .parse::<i64>()
1018 .map(PdfValue::Integer)
1019 .or_else(|_| first_token.parse::<f64>().map(PdfValue::Number))
1020 .map_err(|_| PdfError::Parse(format!("invalid number token: {first_token}")))
1021 }
1022}
1023
/// Returns `true` for the bytes this parser treats as whitespace: space, tab,
/// line feed, carriage return, form feed and NUL.
fn is_whitespace(byte: u8) -> bool {
    const WHITESPACE: [u8; 6] = [b' ', b'\t', b'\n', b'\r', 0x0C, 0x00];
    WHITESPACE.contains(&byte)
}
1027
/// Returns `true` for the bytes this parser treats as delimiters:
/// `( ) < > [ ] { } / %`.
fn is_delimiter(byte: u8) -> bool {
    const DELIMITERS: &[u8] = b"()<>[]{}/%";
    DELIMITERS.contains(&byte)
}
1034
1035#[cfg(test)]
1036mod tests {
1037 use super::{parse_pdf, parse_pdf_with_password};
1038 use crate::error::PdfError;
1039 use crate::types::PdfObject;
1040
1041 #[test]
1042 fn parses_simple_pdf_fixture() {
1043 let bytes = include_bytes!("../../../tests/fixtures/simple-text.pdf");
1044 let document = parse_pdf(bytes).expect("fixture should parse");
1045 assert_eq!(document.pages.len(), 1);
1046 }
1047
1048 #[test]
1049 fn parses_incremental_update_fixture() {
1050 let bytes = include_bytes!("../../../tests/fixtures/incremental-update.pdf");
1051 let document = parse_pdf(bytes).expect("incremental fixture should parse");
1052 assert_eq!(document.pages.len(), 1);
1053
1054 let content_refs = &document.pages[0].content_refs;
1057 assert!(!content_refs.is_empty());
1058 let content_obj = document.file.objects.get(&content_refs[0]).unwrap();
1059 let stream_data = match content_obj {
1060 PdfObject::Stream(stream) => String::from_utf8_lossy(&stream.data),
1061 _ => panic!("expected stream object for page content"),
1062 };
1063 assert!(
1064 stream_data.contains("Updated Secret"),
1065 "content stream should contain updated text"
1066 );
1067 assert!(
1068 !stream_data.contains("Original Secret"),
1069 "content stream should not contain original text"
1070 );
1071 }
1072
1073 #[test]
1074 fn circular_prev_chain_does_not_loop() {
1075 let mut pdf = Vec::new();
1079 pdf.extend_from_slice(b"%PDF-1.4\n");
1080
1081 let obj1_offset = pdf.len();
1083 pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1084
1085 let obj2_offset = pdf.len();
1087 pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 0 /Kids [] >>\nendobj\n");
1088
1089 let xref_offset = pdf.len();
1090 pdf.extend_from_slice(b"xref\n0 3\n");
1091 pdf.extend_from_slice(b"0000000000 65535 f \n");
1092 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj1_offset).as_bytes());
1093 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj2_offset).as_bytes());
1094 pdf.extend_from_slice(b"trailer\n");
1095 pdf.extend_from_slice(
1097 format!("<< /Size 3 /Root 1 0 R /Prev {} >>\n", xref_offset).as_bytes(),
1098 );
1099 pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_offset).as_bytes());
1100
1101 let document = parse_pdf(&pdf).expect("circular Prev should be tolerated");
1102 assert_eq!(document.pages.len(), 0);
1103 }
1104
1105 #[test]
1106 fn stream_length_indirect_reference_is_resolved() {
1107 let payload = b"--endstream--HIDDEN";
1112 let mut pdf = Vec::new();
1113 pdf.extend_from_slice(b"%PDF-1.4\n");
1114
1115 let obj1_offset = pdf.len();
1116 pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1117
1118 let obj2_offset = pdf.len();
1119 pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 1 /Kids [3 0 R] >>\nendobj\n");
1120
1121 let obj3_offset = pdf.len();
1122 pdf.extend_from_slice(
1123 b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << >> /Contents 4 0 R >>\nendobj\n",
1124 );
1125
1126 let obj4_offset = pdf.len();
1127 pdf.extend_from_slice(b"4 0 obj\n<< /Length 5 0 R >>\nstream\n");
1128 pdf.extend_from_slice(payload);
1129 pdf.extend_from_slice(b"\nendstream\nendobj\n");
1130
1131 let obj5_offset = pdf.len();
1132 pdf.extend_from_slice(format!("5 0 obj\n{}\nendobj\n", payload.len()).as_bytes());
1133
1134 let xref_offset = pdf.len();
1135 pdf.extend_from_slice(b"xref\n0 6\n");
1136 pdf.extend_from_slice(b"0000000000 65535 f \n");
1137 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj1_offset).as_bytes());
1138 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj2_offset).as_bytes());
1139 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj3_offset).as_bytes());
1140 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj4_offset).as_bytes());
1141 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj5_offset).as_bytes());
1142 pdf.extend_from_slice(b"trailer\n<< /Size 6 /Root 1 0 R >>\n");
1143 pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_offset).as_bytes());
1144
1145 let document = parse_pdf(&pdf).expect("indirect-length fixture should parse");
1146 let content_refs = &document.pages[0].content_refs;
1147 let content_obj = document.file.objects.get(&content_refs[0]).unwrap();
1148 let data = match content_obj {
1149 PdfObject::Stream(stream) => &stream.data,
1150 _ => panic!("expected stream object for page content"),
1151 };
1152 assert_eq!(
1153 data.as_slice(),
1154 payload,
1155 "resolved indirect /Length should yield the exact original payload bytes"
1156 );
1157 }
1158
1159 #[test]
1160 fn parses_uncompressed_xref_stream() {
1161 let mut pdf: Vec<u8> = Vec::new();
1164 pdf.extend_from_slice(b"%PDF-1.5\n");
1165
1166 let obj1_offset = pdf.len();
1167 pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1168 let obj2_offset = pdf.len();
1169 pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 0 /Kids [] >>\nendobj\n");
1170
1171 let row_for = |t: u8, off: u16, generation: u8| {
1174 let mut row = [0u8; 4];
1175 row[0] = t;
1176 row[1] = (off >> 8) as u8;
1177 row[2] = off as u8;
1178 row[3] = generation;
1179 row
1180 };
1181 let mut body = Vec::new();
1182 body.extend_from_slice(&row_for(0, 0, 0xFF)); body.extend_from_slice(&row_for(1, obj1_offset as u16, 0));
1184 body.extend_from_slice(&row_for(1, obj2_offset as u16, 0));
1185 body.extend_from_slice(&row_for(1, 0, 0)); let xref_obj_offset = pdf.len();
1188 let self_offset = xref_obj_offset as u16;
1190 body[12] = 1;
1191 body[13] = (self_offset >> 8) as u8;
1192 body[14] = self_offset as u8;
1193 body[15] = 0;
1194
1195 let stream_dict = format!(
1196 "<< /Type /XRef /Size 4 /W [1 2 1] /Root 1 0 R /Length {} >>",
1197 body.len()
1198 );
1199 pdf.extend_from_slice(format!("3 0 obj\n{stream_dict}\nstream\n").as_bytes());
1200 pdf.extend_from_slice(&body);
1201 pdf.extend_from_slice(b"\nendstream\nendobj\n");
1202 pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_obj_offset).as_bytes());
1203
1204 let document = parse_pdf(&pdf).expect("xref stream fixture should parse");
1205 assert_eq!(document.pages.len(), 0);
1206 assert!(document.file.objects.len() >= 2);
1208 }
1209
1210 #[test]
1211 fn parses_object_stream_via_xref_stream() {
1212 use flate2::{Compression, write::ZlibEncoder};
1213 use std::io::Write;
1214
1215 let mut pdf: Vec<u8> = Vec::new();
1222 pdf.extend_from_slice(b"%PDF-1.5\n");
1223
1224 let obj1_offset = pdf.len();
1225 pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1226
1227 let member_payload = b"<< /Type /Pages /Count 0 /Kids [] >>";
1229 let header = b"2 0 ";
1230 let first = header.len();
1231 let mut decompressed = Vec::new();
1232 decompressed.extend_from_slice(header);
1233 decompressed.extend_from_slice(member_payload);
1234
1235 let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
1236 encoder.write_all(&decompressed).unwrap();
1237 let compressed = encoder.finish().unwrap();
1238
1239 let obj3_offset = pdf.len();
1240 let objstm_dict = format!(
1241 "<< /Type /ObjStm /N 1 /First {} /Filter /FlateDecode /Length {} >>",
1242 first,
1243 compressed.len()
1244 );
1245 pdf.extend_from_slice(format!("3 0 obj\n{objstm_dict}\nstream\n").as_bytes());
1246 pdf.extend_from_slice(&compressed);
1247 pdf.extend_from_slice(b"\nendstream\nendobj\n");
1248
1249 let row_for = |t: u8, a: u32, b: u16| {
1253 let mut row = [0u8; 5];
1254 row[0] = t;
1255 row[1] = (a >> 16) as u8;
1256 row[2] = (a >> 8) as u8;
1257 row[3] = a as u8;
1258 row[4] = b as u8;
1259 row
1260 };
1261
1262 let obj4_offset = pdf.len();
1263 let mut body = Vec::new();
1264 body.extend_from_slice(&row_for(0, 0, 0xFF));
1265 body.extend_from_slice(&row_for(1, obj1_offset as u32, 0));
1266 body.extend_from_slice(&row_for(2, 3, 0));
1267 body.extend_from_slice(&row_for(1, obj3_offset as u32, 0));
1268 body.extend_from_slice(&row_for(1, obj4_offset as u32, 0));
1269
1270 let stream_dict = format!(
1271 "<< /Type /XRef /Size 5 /W [1 3 1] /Root 1 0 R /Length {} >>",
1272 body.len()
1273 );
1274 pdf.extend_from_slice(format!("4 0 obj\n{stream_dict}\nstream\n").as_bytes());
1275 pdf.extend_from_slice(&body);
1276 pdf.extend_from_slice(b"\nendstream\nendobj\n");
1277 pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", obj4_offset).as_bytes());
1278
1279 let document = parse_pdf(&pdf).expect("ObjStm fixture should parse");
1280 assert_eq!(document.pages.len(), 0);
1281 let pages_ref = document.catalog.pages_ref;
1283 let pages_dict = document.file.get_dictionary(pages_ref).unwrap();
1284 assert_eq!(
1285 pages_dict.get("Type").and_then(|v| v.as_name()),
1286 Some("Pages")
1287 );
1288 }
1289
1290 #[test]
1291 fn rejects_nested_object_stream() {
1292 use flate2::{Compression, write::ZlibEncoder};
1293 use std::io::Write;
1294
1295 let mut pdf: Vec<u8> = Vec::new();
1297 pdf.extend_from_slice(b"%PDF-1.5\n");
1298
1299 let obj1_offset = pdf.len();
1300 pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1301
1302 let member_payload = b"<< /Type /ObjStm /N 0 /First 0 /Length 0 >>";
1303 let header = b"2 0 ";
1304 let first = header.len();
1305 let mut decompressed = Vec::new();
1306 decompressed.extend_from_slice(header);
1307 decompressed.extend_from_slice(member_payload);
1308
1309 let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
1310 encoder.write_all(&decompressed).unwrap();
1311 let compressed = encoder.finish().unwrap();
1312
1313 let obj3_offset = pdf.len();
1314 let objstm_dict = format!(
1315 "<< /Type /ObjStm /N 1 /First {} /Filter /FlateDecode /Length {} >>",
1316 first,
1317 compressed.len()
1318 );
1319 pdf.extend_from_slice(format!("3 0 obj\n{objstm_dict}\nstream\n").as_bytes());
1320 pdf.extend_from_slice(&compressed);
1321 pdf.extend_from_slice(b"\nendstream\nendobj\n");
1322
1323 let row_for = |t: u8, a: u32, b: u16| {
1324 let mut row = [0u8; 5];
1325 row[0] = t;
1326 row[1] = (a >> 16) as u8;
1327 row[2] = (a >> 8) as u8;
1328 row[3] = a as u8;
1329 row[4] = b as u8;
1330 row
1331 };
1332
1333 let obj4_offset = pdf.len();
1334 let mut body = Vec::new();
1335 body.extend_from_slice(&row_for(0, 0, 0xFF));
1336 body.extend_from_slice(&row_for(1, obj1_offset as u32, 0));
1337 body.extend_from_slice(&row_for(2, 3, 0));
1338 body.extend_from_slice(&row_for(1, obj3_offset as u32, 0));
1339 body.extend_from_slice(&row_for(1, obj4_offset as u32, 0));
1340
1341 let stream_dict = format!(
1342 "<< /Type /XRef /Size 5 /W [1 3 1] /Root 1 0 R /Length {} >>",
1343 body.len()
1344 );
1345 pdf.extend_from_slice(format!("4 0 obj\n{stream_dict}\nstream\n").as_bytes());
1346 pdf.extend_from_slice(&body);
1347 pdf.extend_from_slice(b"\nendstream\nendobj\n");
1348 pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", obj4_offset).as_bytes());
1349
1350 match parse_pdf(&pdf) {
1351 Err(PdfError::Unsupported(message)) => {
1352 assert!(message.contains("nested object streams"), "got: {message}")
1353 }
1354 other => panic!("expected Unsupported, got: {other:?}"),
1355 }
1356 }
1357
1358 fn build_rc4_encrypted_pdf(
1364 user_password: &[u8],
1365 owner_password: &[u8],
1366 ) -> (Vec<u8>, &'static [u8]) {
1367 use crate::crypto::SecurityRevision;
1368 use crate::crypto::test_helpers::{
1369 compute_file_key, compute_o, compute_u_r3, object_key, rc4,
1370 };
1371
1372 let id_first: [u8; 16] = [
1373 0x6e, 0x05, 0xb1, 0x20, 0x63, 0x94, 0x69, 0x1f, 0x22, 0x2c, 0x32, 0xac, 0x61, 0x8b,
1374 0xe6, 0x8d,
1375 ];
1376 let permissions: i32 = -4;
1377 let key_length_bytes = 16;
1378
1379 let owner_entry = compute_o(
1380 owner_password,
1381 user_password,
1382 SecurityRevision::R3,
1383 key_length_bytes,
1384 );
1385 let file_key = compute_file_key(
1386 user_password,
1387 &owner_entry,
1388 permissions,
1389 &id_first,
1390 key_length_bytes,
1391 );
1392 let u_entry = compute_u_r3(&file_key, &id_first);
1393
1394 let escape_literal = |bytes: &[u8]| -> Vec<u8> {
1395 let mut out = Vec::with_capacity(bytes.len() + 2);
1396 out.push(b'(');
1397 for &byte in bytes {
1398 match byte {
1399 b'(' | b')' | b'\\' => {
1400 out.push(b'\\');
1401 out.push(byte);
1402 }
1403 _ => out.push(byte),
1404 }
1405 }
1406 out.push(b')');
1407 out
1408 };
1409
1410 let content_plain: &'static [u8] = b"BT\n/F1 24 Tf\n72 700 Td\n(CIPHERED SECRET) Tj\nET\n";
1411 let content_cipher = rc4(&object_key(&file_key, 4, 0), content_plain);
1412
1413 let mut pdf: Vec<u8> = Vec::new();
1414 pdf.extend_from_slice(b"%PDF-1.4\n");
1415
1416 let catalog_offset = pdf.len();
1417 pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1418
1419 let pages_offset = pdf.len();
1420 pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 1 /Kids [3 0 R] >>\nendobj\n");
1421
1422 let page_offset = pdf.len();
1423 pdf.extend_from_slice(
1424 b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
1425 /Resources << /Font << /F1 5 0 R >> >> /Contents 4 0 R >>\nendobj\n",
1426 );
1427
1428 let content_offset = pdf.len();
1429 pdf.extend_from_slice(
1430 format!("4 0 obj\n<< /Length {} >>\nstream\n", content_cipher.len()).as_bytes(),
1431 );
1432 pdf.extend_from_slice(&content_cipher);
1433 pdf.extend_from_slice(b"\nendstream\nendobj\n");
1434
1435 let font_offset = pdf.len();
1436 pdf.extend_from_slice(
1437 b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica \
1438 /Encoding /WinAnsiEncoding >>\nendobj\n",
1439 );
1440
1441 let encrypt_offset = pdf.len();
1442 pdf.extend_from_slice(b"6 0 obj\n<< /Filter /Standard /V 2 /R 3 /Length 128 ");
1443 pdf.extend_from_slice(format!("/P {permissions} ").as_bytes());
1444 pdf.extend_from_slice(b"/O ");
1445 pdf.extend_from_slice(&escape_literal(&owner_entry));
1446 pdf.extend_from_slice(b" /U ");
1447 pdf.extend_from_slice(&escape_literal(&u_entry));
1448 pdf.extend_from_slice(b" >>\nendobj\n");
1449
1450 let xref_offset = pdf.len();
1451 pdf.extend_from_slice(b"xref\n0 7\n");
1452 pdf.extend_from_slice(b"0000000000 65535 f \n");
1453 for offset in [
1454 catalog_offset,
1455 pages_offset,
1456 page_offset,
1457 content_offset,
1458 font_offset,
1459 encrypt_offset,
1460 ] {
1461 pdf.extend_from_slice(format!("{offset:010} 00000 n \n").as_bytes());
1462 }
1463 pdf.extend_from_slice(b"trailer\n<< /Size 7 /Root 1 0 R /Encrypt 6 0 R /ID [");
1464 pdf.extend_from_slice(&escape_literal(&id_first));
1465 pdf.extend_from_slice(&escape_literal(&id_first));
1466 pdf.extend_from_slice(b"] >>\n");
1467 pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());
1468
1469 (pdf, content_plain)
1470 }
1471
1472 fn assert_decrypts_content_stream(document: &crate::document::ParsedDocument, expected: &[u8]) {
1473 assert_eq!(document.pages.len(), 1);
1474 assert!(
1475 !document.file.trailer.contains_key("Encrypt"),
1476 "trailer /Encrypt must be stripped once the document is decrypted in place"
1477 );
1478 let content_ref = document.pages[0].content_refs[0];
1479 let stream = match document.file.get_object(content_ref).unwrap() {
1480 PdfObject::Stream(stream) => stream,
1481 _ => panic!("page content must be a stream"),
1482 };
1483 assert_eq!(stream.data, expected);
1484 }
1485
1486 #[test]
1487 fn parses_rc4_encrypted_pdf_with_empty_password() {
1488 let (pdf, plain) = build_rc4_encrypted_pdf(b"", b"arbitrary-owner-password");
1493 let document = parse_pdf(&pdf).expect("empty-password PDF should decrypt");
1494 assert_decrypts_content_stream(&document, plain);
1495 }
1496
1497 #[test]
1498 fn parses_rc4_encrypted_pdf_with_user_password() {
1499 let (pdf, plain) = build_rc4_encrypted_pdf(b"userpw", b"ownerpw");
1500 let document =
1501 parse_pdf_with_password(&pdf, b"userpw").expect("correct user password should decrypt");
1502 assert_decrypts_content_stream(&document, plain);
1503 }
1504
1505 #[test]
1506 fn parses_rc4_encrypted_pdf_with_owner_password() {
1507 let (pdf, plain) = build_rc4_encrypted_pdf(b"userpw", b"ownerpw");
1508 let document = parse_pdf_with_password(&pdf, b"ownerpw")
1509 .expect("correct owner password should decrypt");
1510 assert_decrypts_content_stream(&document, plain);
1511 }
1512
1513 #[test]
1514 fn rejects_wrong_password_with_invalid_password_error() {
1515 let (pdf, _) = build_rc4_encrypted_pdf(b"userpw", b"ownerpw");
1516 let err =
1517 parse_pdf_with_password(&pdf, b"wrongpw").expect_err("wrong password must not decrypt");
1518 assert_eq!(err, PdfError::InvalidPassword);
1519 }
1520
1521 #[test]
1522 fn parses_rc4_encrypted_pdf_with_utf8_password() {
1523 let password = "pässwörd".as_bytes();
1524 let (pdf, plain) = build_rc4_encrypted_pdf(password, b"ownerpw");
1525 let document =
1526 parse_pdf_with_password(&pdf, password).expect("UTF-8 user password should decrypt");
1527 assert_decrypts_content_stream(&document, plain);
1528 }
1529
1530 fn build_aes_128_encrypted_pdf(
1535 user_password: &[u8],
1536 owner_password: &[u8],
1537 encrypt_metadata: bool,
1538 ) -> (Vec<u8>, &'static [u8]) {
1539 use crate::crypto::SecurityRevision;
1540 use crate::crypto::test_helpers::{
1541 aes_128_cbc_encrypt, compute_file_key_r4, compute_o, compute_u_r3, object_key_aes,
1542 };
1543
1544 let id_first: [u8; 16] = [
1545 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88,
1546 0x99, 0x00,
1547 ];
1548 let permissions: i32 = -4;
1549
1550 let owner_entry = compute_o(owner_password, user_password, SecurityRevision::R4, 16);
1551 let file_key = compute_file_key_r4(
1552 user_password,
1553 &owner_entry,
1554 permissions,
1555 &id_first,
1556 encrypt_metadata,
1557 );
1558 let u_entry = compute_u_r3(&file_key, &id_first);
1559
1560 let content_iv = [0x42u8; 16];
1564 let content_plain: &'static [u8] =
1565 b"BT\n/F1 24 Tf\n72 700 Td\n(AES SECRET REMOVED) Tj\nET\n";
1566 let content_key = object_key_aes(&file_key, 4, 0);
1567 let content_cipher = aes_128_cbc_encrypt(&content_key, &content_iv, content_plain);
1568
1569 let escape_literal = |bytes: &[u8]| -> Vec<u8> {
1570 let mut out = Vec::with_capacity(bytes.len() + 2);
1571 out.push(b'(');
1572 for &byte in bytes {
1573 match byte {
1574 b'(' | b')' | b'\\' => {
1575 out.push(b'\\');
1576 out.push(byte);
1577 }
1578 _ => out.push(byte),
1579 }
1580 }
1581 out.push(b')');
1582 out
1583 };
1584
1585 let mut pdf: Vec<u8> = Vec::new();
1586 pdf.extend_from_slice(b"%PDF-1.5\n");
1587
1588 let catalog_offset = pdf.len();
1589 pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1590
1591 let pages_offset = pdf.len();
1592 pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 1 /Kids [3 0 R] >>\nendobj\n");
1593
1594 let page_offset = pdf.len();
1595 pdf.extend_from_slice(
1596 b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
1597 /Resources << /Font << /F1 5 0 R >> >> /Contents 4 0 R >>\nendobj\n",
1598 );
1599
1600 let content_offset = pdf.len();
1601 pdf.extend_from_slice(
1602 format!("4 0 obj\n<< /Length {} >>\nstream\n", content_cipher.len()).as_bytes(),
1603 );
1604 pdf.extend_from_slice(&content_cipher);
1605 pdf.extend_from_slice(b"\nendstream\nendobj\n");
1606
1607 let font_offset = pdf.len();
1608 pdf.extend_from_slice(
1609 b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica \
1610 /Encoding /WinAnsiEncoding >>\nendobj\n",
1611 );
1612
1613 let encrypt_offset = pdf.len();
1614 pdf.extend_from_slice(
1615 b"6 0 obj\n<< /Filter /Standard /V 4 /R 4 /Length 128 \
1616 /CF << /StdCF << /CFM /AESV2 /Length 16 /AuthEvent /DocOpen >> >> \
1617 /StmF /StdCF /StrF /StdCF ",
1618 );
1619 pdf.extend_from_slice(format!("/P {permissions} ").as_bytes());
1620 if !encrypt_metadata {
1621 pdf.extend_from_slice(b"/EncryptMetadata false ");
1622 }
1623 pdf.extend_from_slice(b"/O ");
1624 pdf.extend_from_slice(&escape_literal(&owner_entry));
1625 pdf.extend_from_slice(b" /U ");
1626 pdf.extend_from_slice(&escape_literal(&u_entry));
1627 pdf.extend_from_slice(b" >>\nendobj\n");
1628
1629 let xref_offset = pdf.len();
1630 pdf.extend_from_slice(b"xref\n0 7\n");
1631 pdf.extend_from_slice(b"0000000000 65535 f \n");
1632 for offset in [
1633 catalog_offset,
1634 pages_offset,
1635 page_offset,
1636 content_offset,
1637 font_offset,
1638 encrypt_offset,
1639 ] {
1640 pdf.extend_from_slice(format!("{offset:010} 00000 n \n").as_bytes());
1641 }
1642 pdf.extend_from_slice(b"trailer\n<< /Size 7 /Root 1 0 R /Encrypt 6 0 R /ID [");
1643 pdf.extend_from_slice(&escape_literal(&id_first));
1644 pdf.extend_from_slice(&escape_literal(&id_first));
1645 pdf.extend_from_slice(b"] >>\n");
1646 pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());
1647
1648 (pdf, content_plain)
1649 }
1650
1651 #[test]
1652 fn parses_aes_128_encrypted_pdf_with_empty_password() {
1653 let (pdf, plain) = build_aes_128_encrypted_pdf(b"", b"arbitrary-owner-password", true);
1654 let document = parse_pdf(&pdf).expect("empty-password AES-128 PDF should decrypt");
1655 assert_decrypts_content_stream(&document, plain);
1656 }
1657
1658 #[test]
1659 fn parses_aes_128_encrypted_pdf_with_user_password() {
1660 let (pdf, plain) = build_aes_128_encrypted_pdf(b"userpw", b"ownerpw", true);
1661 let document = parse_pdf_with_password(&pdf, b"userpw")
1662 .expect("correct user password should decrypt AES-128 PDF");
1663 assert_decrypts_content_stream(&document, plain);
1664 }
1665
1666 #[test]
1667 fn parses_aes_128_encrypted_pdf_with_owner_password() {
1668 let (pdf, plain) = build_aes_128_encrypted_pdf(b"userpw", b"ownerpw", true);
1669 let document = parse_pdf_with_password(&pdf, b"ownerpw")
1670 .expect("correct owner password should decrypt AES-128 PDF");
1671 assert_decrypts_content_stream(&document, plain);
1672 }
1673
1674 #[test]
1675 fn aes_128_rejects_wrong_password() {
1676 let (pdf, _) = build_aes_128_encrypted_pdf(b"userpw", b"ownerpw", true);
1677 let err = parse_pdf_with_password(&pdf, b"wrongpw")
1678 .expect_err("wrong password must not decrypt AES-128 PDF");
1679 assert_eq!(err, PdfError::InvalidPassword);
1680 }
1681
1682 fn build_aes_256_encrypted_pdf(
1686 user_password: &[u8],
1687 owner_password: &[u8],
1688 revision: crate::crypto::SecurityRevision,
1689 ) -> (Vec<u8>, &'static [u8]) {
1690 use crate::crypto::test_helpers::{
1691 aes_256_cbc_encrypt, compute_v5_o_and_oe, compute_v5_u_and_ue,
1692 };
1693
1694 let permissions: i32 = -4;
1695 let file_key = [0x13u8; 32];
1696 let u_validation_salt = [0xAAu8; 8];
1697 let u_key_salt = [0xBBu8; 8];
1698 let o_validation_salt = [0xCCu8; 8];
1699 let o_key_salt = [0xDDu8; 8];
1700
1701 let (u_entry, ue_entry) = compute_v5_u_and_ue(
1702 user_password,
1703 &u_validation_salt,
1704 &u_key_salt,
1705 &file_key,
1706 revision,
1707 );
1708 let u_vector: [u8; 48] = u_entry.as_slice().try_into().expect("U is 48 bytes");
1709 let (o_entry, oe_entry) = compute_v5_o_and_oe(
1710 owner_password,
1711 &o_validation_salt,
1712 &o_key_salt,
1713 &u_vector,
1714 &file_key,
1715 revision,
1716 );
1717
1718 let content_iv = [0x42u8; 16];
1719 let content_plain: &'static [u8] = b"BT\n/F1 24 Tf\n72 700 Td\n(AES-256 SECRET) Tj\nET\n";
1720 let content_cipher = aes_256_cbc_encrypt(&file_key, &content_iv, content_plain);
1721
1722 let escape_literal = |bytes: &[u8]| -> Vec<u8> {
1723 let mut out = Vec::with_capacity(bytes.len() + 2);
1724 out.push(b'(');
1725 for &byte in bytes {
1726 match byte {
1727 b'(' | b')' | b'\\' => {
1728 out.push(b'\\');
1729 out.push(byte);
1730 }
1731 _ => out.push(byte),
1732 }
1733 }
1734 out.push(b')');
1735 out
1736 };
1737
1738 let mut pdf: Vec<u8> = Vec::new();
1739 pdf.extend_from_slice(b"%PDF-2.0\n");
1740
1741 let catalog_offset = pdf.len();
1742 pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1743
1744 let pages_offset = pdf.len();
1745 pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 1 /Kids [3 0 R] >>\nendobj\n");
1746
1747 let page_offset = pdf.len();
1748 pdf.extend_from_slice(
1749 b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
1750 /Resources << /Font << /F1 5 0 R >> >> /Contents 4 0 R >>\nendobj\n",
1751 );
1752
1753 let content_offset = pdf.len();
1754 pdf.extend_from_slice(
1755 format!("4 0 obj\n<< /Length {} >>\nstream\n", content_cipher.len()).as_bytes(),
1756 );
1757 pdf.extend_from_slice(&content_cipher);
1758 pdf.extend_from_slice(b"\nendstream\nendobj\n");
1759
1760 let font_offset = pdf.len();
1761 pdf.extend_from_slice(
1762 b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica \
1763 /Encoding /WinAnsiEncoding >>\nendobj\n",
1764 );
1765
1766 let r_value = match revision {
1767 crate::crypto::SecurityRevision::R5 => 5,
1768 crate::crypto::SecurityRevision::R6 => 6,
1769 _ => panic!("V=5 fixture requires R=5 or R=6"),
1770 };
1771
1772 let encrypt_offset = pdf.len();
1773 pdf.extend_from_slice(
1774 format!(
1775 "6 0 obj\n<< /Filter /Standard /V 5 /R {r_value} /Length 256 \
1776 /CF << /StdCF << /CFM /AESV3 /Length 32 /AuthEvent /DocOpen >> >> \
1777 /StmF /StdCF /StrF /StdCF /P {permissions} "
1778 )
1779 .as_bytes(),
1780 );
1781 pdf.extend_from_slice(b"/O ");
1782 pdf.extend_from_slice(&escape_literal(&o_entry));
1783 pdf.extend_from_slice(b" /U ");
1784 pdf.extend_from_slice(&escape_literal(&u_entry));
1785 pdf.extend_from_slice(b" /OE ");
1786 pdf.extend_from_slice(&escape_literal(&oe_entry));
1787 pdf.extend_from_slice(b" /UE ");
1788 pdf.extend_from_slice(&escape_literal(&ue_entry));
1789 pdf.extend_from_slice(b" >>\nendobj\n");
1790
1791 let xref_offset = pdf.len();
1792 pdf.extend_from_slice(b"xref\n0 7\n");
1793 pdf.extend_from_slice(b"0000000000 65535 f \n");
1794 for offset in [
1795 catalog_offset,
1796 pages_offset,
1797 page_offset,
1798 content_offset,
1799 font_offset,
1800 encrypt_offset,
1801 ] {
1802 pdf.extend_from_slice(format!("{offset:010} 00000 n \n").as_bytes());
1803 }
1804 let id_literal: [u8; 16] = [
1807 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE,
1808 0xFF, 0x00,
1809 ];
1810 pdf.extend_from_slice(b"trailer\n<< /Size 7 /Root 1 0 R /Encrypt 6 0 R /ID [");
1811 pdf.extend_from_slice(&escape_literal(&id_literal));
1812 pdf.extend_from_slice(&escape_literal(&id_literal));
1813 pdf.extend_from_slice(b"] >>\n");
1814 pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());
1815
1816 (pdf, content_plain)
1817 }
1818
1819 #[test]
1820 fn parses_aes_256_r6_encrypted_pdf_with_user_password() {
1821 let (pdf, plain) =
1822 build_aes_256_encrypted_pdf(b"userpw", b"ownerpw", crate::crypto::SecurityRevision::R6);
1823 let document = parse_pdf_with_password(&pdf, b"userpw")
1824 .expect("correct user password should decrypt AES-256 R=6 PDF");
1825 assert_decrypts_content_stream(&document, plain);
1826 }
1827
1828 #[test]
1829 fn parses_aes_256_r6_encrypted_pdf_with_owner_password() {
1830 let (pdf, plain) =
1831 build_aes_256_encrypted_pdf(b"userpw", b"ownerpw", crate::crypto::SecurityRevision::R6);
1832 let document = parse_pdf_with_password(&pdf, b"ownerpw")
1833 .expect("correct owner password should decrypt AES-256 R=6 PDF");
1834 assert_decrypts_content_stream(&document, plain);
1835 }
1836
1837 #[test]
1838 fn parses_aes_256_r5_encrypted_pdf_with_empty_password() {
1839 let (pdf, plain) =
1840 build_aes_256_encrypted_pdf(b"", b"ownerpw", crate::crypto::SecurityRevision::R5);
1841 let document = parse_pdf(&pdf).expect("empty-password AES-256 R=5 PDF should decrypt");
1842 assert_decrypts_content_stream(&document, plain);
1843 }
1844
1845 #[test]
1846 fn aes_256_rejects_wrong_password() {
1847 let (pdf, _) =
1848 build_aes_256_encrypted_pdf(b"userpw", b"ownerpw", crate::crypto::SecurityRevision::R6);
1849 let err = parse_pdf_with_password(&pdf, b"wrongpw")
1850 .expect_err("wrong password must not decrypt AES-256 PDF");
1851 assert_eq!(err, PdfError::InvalidPassword);
1852 }
1853
1854 #[test]
1855 fn parses_aes_128_with_encrypt_metadata_false() {
1856 let (pdf, plain) = build_aes_128_encrypted_pdf(b"", b"ownerpw", false);
1860 let document =
1861 parse_pdf(&pdf).expect("empty-password AES-128 PDF should decrypt with metadata off");
1862 assert_decrypts_content_stream(&document, plain);
1863 }
1864}