1use std::collections::{BTreeMap, BTreeSet};
2
3use crate::crypto::{BytesKind, StandardSecurityHandler};
4use crate::document::build_document;
5use crate::error::{PdfError, PdfResult};
6use crate::pubsec::{PubSecCredential, open_pubsec};
7use crate::stream::decode_stream;
8use crate::types::{
9 ObjectRef, PdfDictionary, PdfFile, PdfObject, PdfStream, PdfString, PdfValue, XrefEntry,
10 XrefForm,
11};
12
/// Credential a caller supplies when opening a PDF that may be encrypted.
///
/// Both variants only borrow their bytes, so the enum is cheap to copy.
#[derive(Clone, Copy)]
pub enum PdfCredential<'a> {
    /// Password bytes; accepted when the document uses `/Filter /Standard`.
    Password(&'a [u8]),
    /// Certificate plus private key; accepted when the document uses
    /// `/Filter /Adobe.PubSec`.
    Certificate {
        /// Certificate bytes (DER, going by the field name).
        cert_der: &'a [u8],
        /// Private key bytes (DER, going by the field name).
        private_key_der: &'a [u8],
    },
}
26
27pub fn parse_pdf(bytes: &[u8]) -> PdfResult<crate::document::ParsedDocument> {
32 parse_pdf_with_credential(bytes, PdfCredential::Password(b""))
33}
34
35pub fn parse_pdf_with_password(
41 bytes: &[u8],
42 password: &[u8],
43) -> PdfResult<crate::document::ParsedDocument> {
44 parse_pdf_with_credential(bytes, PdfCredential::Password(password))
45}
46
47pub fn parse_pdf_with_certificate(
55 bytes: &[u8],
56 cert_der: &[u8],
57 private_key_der: &[u8],
58) -> PdfResult<crate::document::ParsedDocument> {
59 parse_pdf_with_credential(
60 bytes,
61 PdfCredential::Certificate {
62 cert_der,
63 private_key_der,
64 },
65 )
66}
67
68pub fn parse_pdf_with_credential(
72 bytes: &[u8],
73 credential: PdfCredential,
74) -> PdfResult<crate::document::ParsedDocument> {
75 let version = parse_header(bytes)?;
76 let startxref = find_startxref(bytes)?;
77 let (xref, mut trailer, xref_form) = parse_xref_table(bytes, startxref)?;
78
79 let mut objects = BTreeMap::new();
80 let mut max_object_number = 0;
81 let mut compressed: Vec<(ObjectRef, u32, u32)> = Vec::new();
82
83 for (object_ref, entry) in &xref {
84 match entry {
85 XrefEntry::Free => {}
86 XrefEntry::Uncompressed { offset, .. } => {
87 if object_ref.object_number == 0 {
88 continue;
89 }
90 let object = parse_indirect_object(bytes, *offset, Some(&xref))?;
91 max_object_number = max_object_number.max(object_ref.object_number);
92 objects.insert(*object_ref, object);
93 }
94 XrefEntry::Compressed {
95 stream_object_number,
96 index,
97 } => {
98 compressed.push((*object_ref, *stream_object_number, *index));
99 }
100 }
101 }
102
103 decrypt_document_if_encrypted(&mut objects, &mut trailer, credential)?;
109
110 materialize_object_streams(&mut objects, &mut max_object_number, &compressed)?;
111
112 let file = PdfFile {
113 version,
114 objects,
115 trailer,
116 max_object_number,
117 xref_form,
118 };
119 build_document(file)
120}
121
122fn decrypt_document_if_encrypted(
123 objects: &mut BTreeMap<ObjectRef, PdfObject>,
124 trailer: &mut PdfDictionary,
125 credential: PdfCredential,
126) -> PdfResult<()> {
127 let encrypt_ref = match trailer.get("Encrypt") {
128 Some(PdfValue::Reference(object_ref)) => *object_ref,
129 Some(PdfValue::Dictionary(_)) => {
130 return Err(PdfError::Unsupported(
131 "direct (non-indirect) /Encrypt dictionaries are not supported".to_string(),
132 ));
133 }
134 Some(_) => {
135 return Err(PdfError::Corrupt(
136 "trailer /Encrypt is not a reference".to_string(),
137 ));
138 }
139 None => return Ok(()),
140 };
141
142 let encrypt_dict = match objects.get(&encrypt_ref) {
143 Some(PdfObject::Value(PdfValue::Dictionary(dict))) => dict.clone(),
144 _ => {
145 return Err(PdfError::Corrupt(
146 "trailer /Encrypt does not point at a dictionary".to_string(),
147 ));
148 }
149 };
150
151 let filter_name = encrypt_dict
152 .get("Filter")
153 .and_then(PdfValue::as_name)
154 .unwrap_or("");
155
156 let handler = match filter_name {
157 "Standard" => match credential {
158 PdfCredential::Password(password) => {
159 let id_first = extract_id_first(trailer)?;
160 StandardSecurityHandler::open(&encrypt_dict, &id_first, password)?
161 .ok_or(PdfError::InvalidPassword)?
162 }
163 PdfCredential::Certificate { .. } => {
164 return Err(PdfError::Unsupported(
165 "/Filter /Standard requires a password, not a certificate".to_string(),
166 ));
167 }
168 },
169 "Adobe.PubSec" => match credential {
170 PdfCredential::Certificate {
171 cert_der,
172 private_key_der,
173 } => open_pubsec(
174 &encrypt_dict,
175 &PubSecCredential {
176 certificate_der: cert_der,
177 private_key_der,
178 },
179 )?,
180 PdfCredential::Password(_) => {
181 return Err(PdfError::Unsupported(
182 "/Filter /Adobe.PubSec requires a certificate, not a password".to_string(),
183 ));
184 }
185 },
186 other => {
187 return Err(PdfError::Unsupported(format!(
188 "encryption filter /{other} is not supported"
189 )));
190 }
191 };
192
193 let refs: Vec<ObjectRef> = objects.keys().copied().collect();
194 for object_ref in refs {
195 if object_ref == encrypt_ref {
196 continue;
199 }
200 let object = objects
201 .get_mut(&object_ref)
202 .expect("ref obtained from map keys must still be present");
203 match object {
204 PdfObject::Stream(stream) => {
205 let type_name = stream.dict.get("Type").and_then(PdfValue::as_name);
209 let is_xref_stream = type_name == Some("XRef");
210 let is_exempt_metadata =
211 !handler.encrypts_metadata() && type_name == Some("Metadata");
212 decrypt_strings_in_dict(&mut stream.dict, &handler, object_ref)?;
213 if !is_xref_stream && !is_exempt_metadata {
214 stream.data =
215 handler.decrypt_bytes(&stream.data, object_ref, BytesKind::Stream)?;
216 }
217 }
218 PdfObject::Value(value) => {
219 decrypt_strings_in_value(value, &handler, object_ref)?;
220 }
221 }
222 }
223
224 trailer.remove("Encrypt");
225 objects.remove(&encrypt_ref);
229 Ok(())
230}
231
232fn extract_id_first(trailer: &PdfDictionary) -> PdfResult<Vec<u8>> {
233 match trailer.get("ID") {
234 Some(PdfValue::Array(entries)) => match entries.first() {
235 Some(PdfValue::String(value)) => Ok(value.0.clone()),
236 _ => Err(PdfError::Corrupt(
237 "trailer /ID[0] is not a string — cannot derive encryption key".to_string(),
238 )),
239 },
240 _ => Err(PdfError::Corrupt(
241 "encrypted PDF is missing the trailer /ID array required for key derivation"
242 .to_string(),
243 )),
244 }
245}
246
247fn decrypt_strings_in_value(
248 value: &mut PdfValue,
249 handler: &StandardSecurityHandler,
250 object_ref: ObjectRef,
251) -> PdfResult<()> {
252 match value {
253 PdfValue::String(string) => {
254 string.0 = handler.decrypt_bytes(&string.0, object_ref, BytesKind::String)?;
255 }
256 PdfValue::Array(items) => {
257 for item in items {
258 decrypt_strings_in_value(item, handler, object_ref)?;
259 }
260 }
261 PdfValue::Dictionary(dict) => {
262 decrypt_strings_in_dict(dict, handler, object_ref)?;
263 }
264 _ => {}
265 }
266 Ok(())
267}
268
269fn decrypt_strings_in_dict(
270 dict: &mut PdfDictionary,
271 handler: &StandardSecurityHandler,
272 object_ref: ObjectRef,
273) -> PdfResult<()> {
274 for value in dict.values_mut() {
275 decrypt_strings_in_value(value, handler, object_ref)?;
276 }
277 Ok(())
278}
279
280fn parse_header(bytes: &[u8]) -> PdfResult<String> {
281 if !bytes.starts_with(b"%PDF-") {
282 return Err(PdfError::Parse("missing PDF header".to_string()));
283 }
284 let line_end = bytes
285 .iter()
286 .position(|byte| *byte == b'\n' || *byte == b'\r')
287 .ok_or_else(|| PdfError::Parse("unterminated header".to_string()))?;
288 Ok(String::from_utf8_lossy(&bytes[5..line_end])
289 .trim()
290 .to_string())
291}
292
293fn find_startxref(bytes: &[u8]) -> PdfResult<usize> {
294 let marker = b"startxref";
295 let position = bytes
296 .windows(marker.len())
297 .rposition(|window| window == marker)
298 .ok_or_else(|| PdfError::Parse("missing startxref".to_string()))?;
299 let mut parser = Cursor::new(bytes, position + marker.len());
300 parser.skip_ws_and_comments();
301 parser.parse_usize()
302}
303
304fn parse_xref_table(
305 bytes: &[u8],
306 start_offset: usize,
307) -> PdfResult<(BTreeMap<ObjectRef, XrefEntry>, PdfDictionary, XrefForm)> {
308 let mut merged_entries: BTreeMap<ObjectRef, XrefEntry> = BTreeMap::new();
309 let mut newest_trailer: Option<PdfDictionary> = None;
310 let mut top_form: Option<XrefForm> = None;
315 let mut visited = BTreeSet::new();
316 let mut pending: Vec<usize> = vec![start_offset];
317
318 while let Some(offset) = pending.pop() {
319 if !visited.insert(offset) {
320 continue;
321 }
322 let section = parse_xref_section_at(bytes, offset)?;
323
324 for (object_ref, entry) in section.entries {
326 merged_entries.entry(object_ref).or_insert(entry);
327 }
328
329 if newest_trailer.is_none() {
330 newest_trailer = Some(section.trailer.clone());
331 top_form = Some(section.form);
332 }
333
334 if let Some(stm_offset) = section
335 .trailer
336 .get("XRefStm")
337 .and_then(PdfValue::as_integer)
338 {
339 pending.push(stm_offset as usize);
340 }
341 if let Some(prev_offset) = section.trailer.get("Prev").and_then(PdfValue::as_integer) {
342 pending.push(prev_offset as usize);
343 }
344 }
345
346 let trailer = newest_trailer
347 .ok_or_else(|| PdfError::Parse("xref chain produced no trailer".to_string()))?;
348 let form = top_form.unwrap_or(XrefForm::Classic);
349 Ok((merged_entries, trailer, form))
350}
351
/// One cross-reference section, as produced by either the classic-table
/// parser or the xref-stream parser.
struct XrefSection {
    /// Entries declared by this section alone, before merging with others.
    entries: BTreeMap<ObjectRef, XrefEntry>,
    /// Trailer dictionary of the section; for an xref stream this is the
    /// stream's own dictionary.
    trailer: PdfDictionary,
    /// Which syntax the section was written in (classic table or stream).
    form: XrefForm,
}
357
358fn parse_xref_section_at(bytes: &[u8], offset: usize) -> PdfResult<XrefSection> {
359 let mut probe = Cursor::new(bytes, offset);
360 probe.skip_ws_and_comments();
361 if probe.peek_keyword("xref") {
362 parse_classic_xref_section(bytes, offset)
363 } else {
364 parse_xref_stream_section(bytes, offset)
365 }
366}
367
368fn parse_classic_xref_section(bytes: &[u8], offset: usize) -> PdfResult<XrefSection> {
369 let mut cursor = Cursor::new(bytes, offset);
370 cursor.expect_keyword("xref")?;
371 let mut entries = BTreeMap::new();
372 loop {
373 cursor.skip_ws_and_comments();
374 if cursor.peek_keyword("trailer") {
375 break;
376 }
377 let start = cursor.parse_u32()?;
378 cursor.skip_ws_and_comments();
379 let count = cursor.parse_u32()?;
380 cursor.skip_line_breaks();
381 for index in 0..count {
382 let line = cursor.read_line()?;
383 if line.len() < 17 {
384 return Err(PdfError::Parse("invalid xref entry".to_string()));
385 }
386 let parts = String::from_utf8_lossy(line).trim().to_string();
387 let mut fields = parts.split_whitespace();
388 let entry_offset = fields
389 .next()
390 .ok_or_else(|| PdfError::Parse("invalid xref entry offset".to_string()))?
391 .parse::<usize>()
392 .map_err(|_| PdfError::Parse("invalid xref entry offset".to_string()))?;
393 let generation = fields
394 .next()
395 .ok_or_else(|| PdfError::Parse("invalid xref generation".to_string()))?
396 .parse::<u16>()
397 .map_err(|_| PdfError::Parse("invalid xref generation".to_string()))?;
398 let flag = fields
399 .next()
400 .ok_or_else(|| PdfError::Parse("invalid xref flag".to_string()))?;
401 let object_number = start
402 .checked_add(index)
403 .ok_or_else(|| PdfError::Parse("xref object number overflow".to_string()))?;
404 let entry = if flag == "n" {
405 XrefEntry::Uncompressed {
406 offset: entry_offset,
407 generation,
408 }
409 } else {
410 XrefEntry::Free
411 };
412 entries.insert(ObjectRef::new(object_number, generation), entry);
413 }
414 }
415 cursor.expect_keyword("trailer")?;
416 let trailer = match cursor.parse_value()? {
417 PdfValue::Dictionary(dictionary) => dictionary,
418 _ => return Err(PdfError::Parse("trailer is not a dictionary".to_string())),
419 };
420 Ok(XrefSection {
421 entries,
422 trailer,
423 form: XrefForm::Classic,
424 })
425}
426
427fn parse_xref_stream_section(bytes: &[u8], offset: usize) -> PdfResult<XrefSection> {
428 let object = parse_indirect_object(bytes, offset, None)?;
433 let stream = match object {
434 PdfObject::Stream(stream) => stream,
435 PdfObject::Value(_) => {
436 return Err(PdfError::Parse(
437 "expected xref stream object at startxref offset".to_string(),
438 ));
439 }
440 };
441 if stream.dict.get("Type").and_then(PdfValue::as_name) != Some("XRef") {
442 return Err(PdfError::Parse(
443 "xref stream object has wrong Type".to_string(),
444 ));
445 }
446
447 let size = stream
448 .dict
449 .get("Size")
450 .and_then(PdfValue::as_integer)
451 .ok_or_else(|| PdfError::Corrupt("xref stream missing Size".to_string()))?
452 as u32;
453
454 let w = stream
455 .dict
456 .get("W")
457 .and_then(PdfValue::as_array)
458 .ok_or_else(|| PdfError::Corrupt("xref stream missing W".to_string()))?;
459 if w.len() != 3 {
460 return Err(PdfError::Corrupt(
461 "xref stream W must have three entries".to_string(),
462 ));
463 }
464 let w0 = w[0]
465 .as_integer()
466 .ok_or_else(|| PdfError::Corrupt("invalid W[0]".to_string()))? as usize;
467 let w1 = w[1]
468 .as_integer()
469 .ok_or_else(|| PdfError::Corrupt("invalid W[1]".to_string()))? as usize;
470 let w2 = w[2]
471 .as_integer()
472 .ok_or_else(|| PdfError::Corrupt("invalid W[2]".to_string()))? as usize;
473 let row_len = w0 + w1 + w2;
474 if row_len == 0 {
475 return Err(PdfError::Corrupt(
476 "xref stream row width is zero".to_string(),
477 ));
478 }
479
480 let index: Vec<(u32, u32)> = match stream.dict.get("Index") {
481 Some(PdfValue::Array(entries)) => {
482 if entries.len() % 2 != 0 {
483 return Err(PdfError::Corrupt(
484 "xref stream Index must have an even number of entries".to_string(),
485 ));
486 }
487 let mut pairs = Vec::with_capacity(entries.len() / 2);
488 for chunk in entries.chunks(2) {
489 let first = chunk[0]
490 .as_integer()
491 .ok_or_else(|| PdfError::Corrupt("invalid Index entry".to_string()))?
492 as u32;
493 let count = chunk[1]
494 .as_integer()
495 .ok_or_else(|| PdfError::Corrupt("invalid Index entry".to_string()))?
496 as u32;
497 pairs.push((first, count));
498 }
499 pairs
500 }
501 Some(_) => {
502 return Err(PdfError::Corrupt(
503 "xref stream Index is not an array".to_string(),
504 ));
505 }
506 None => vec![(0, size)],
507 };
508
509 let decoded = decode_stream(&stream)?;
510 let expected_rows: u32 = index.iter().map(|(_, count)| *count).sum();
511 if decoded.len() < expected_rows as usize * row_len {
512 return Err(PdfError::Corrupt(
513 "xref stream body is shorter than declared entries".to_string(),
514 ));
515 }
516
517 let mut entries: BTreeMap<ObjectRef, XrefEntry> = BTreeMap::new();
518 let mut cursor = 0usize;
519 for (first, count) in index {
520 for i in 0..count {
521 let row = &decoded[cursor..cursor + row_len];
522 cursor += row_len;
523 let field_type = if w0 == 0 { 1u64 } else { read_be(&row[..w0])? };
524 let f2 = read_be(&row[w0..w0 + w1])?;
525 let f3 = read_be(&row[w0 + w1..])?;
526 let object_number = first + i;
527 let entry = match field_type {
528 0 => XrefEntry::Free,
529 1 => XrefEntry::Uncompressed {
530 offset: f2 as usize,
531 generation: f3 as u16,
532 },
533 2 => XrefEntry::Compressed {
534 stream_object_number: f2 as u32,
535 index: f3 as u32,
536 },
537 other => {
538 return Err(PdfError::Unsupported(format!(
539 "xref stream entry type {other} is not supported"
540 )));
541 }
542 };
543 let generation = match entry {
544 XrefEntry::Uncompressed { generation, .. } => generation,
545 _ => 0,
546 };
547 entries.insert(ObjectRef::new(object_number, generation), entry);
548 }
549 }
550
551 Ok(XrefSection {
552 entries,
553 trailer: stream.dict,
554 form: XrefForm::Stream,
555 })
556}
557
558fn read_be(bytes: &[u8]) -> PdfResult<u64> {
559 if bytes.len() > 8 {
560 return Err(PdfError::Corrupt(
561 "xref stream field width exceeds 8 bytes".to_string(),
562 ));
563 }
564 let mut value: u64 = 0;
565 for byte in bytes {
566 value = (value << 8) | *byte as u64;
567 }
568 Ok(value)
569}
570
571fn materialize_object_streams(
572 objects: &mut BTreeMap<ObjectRef, PdfObject>,
573 max_object_number: &mut u32,
574 compressed: &[(ObjectRef, u32, u32)],
575) -> PdfResult<()> {
576 if compressed.is_empty() {
577 return Ok(());
578 }
579
580 let mut by_stream: BTreeMap<u32, Vec<(ObjectRef, u32)>> = BTreeMap::new();
581 for (object_ref, stream_obj_num, index) in compressed {
582 by_stream
583 .entry(*stream_obj_num)
584 .or_default()
585 .push((*object_ref, *index));
586 }
587
588 for (stream_obj_num, mut members) in by_stream {
589 let stream_ref = ObjectRef::new(stream_obj_num, 0);
590 let stream = match objects.get(&stream_ref) {
591 Some(PdfObject::Stream(stream)) => stream.clone(),
592 Some(PdfObject::Value(_)) => {
593 return Err(PdfError::Corrupt(format!(
594 "object stream {stream_obj_num} is not a stream"
595 )));
596 }
597 None => {
598 return Err(PdfError::Corrupt(format!(
599 "compressed entry references missing object stream {stream_obj_num}"
600 )));
601 }
602 };
603 if stream.dict.get("Type").and_then(PdfValue::as_name) != Some("ObjStm") {
604 return Err(PdfError::Corrupt(format!(
605 "object {stream_obj_num} is not marked as ObjStm"
606 )));
607 }
608 let n = stream
609 .dict
610 .get("N")
611 .and_then(PdfValue::as_integer)
612 .ok_or_else(|| PdfError::Corrupt("ObjStm missing N".to_string()))?
613 as usize;
614 let first = stream
615 .dict
616 .get("First")
617 .and_then(PdfValue::as_integer)
618 .ok_or_else(|| PdfError::Corrupt("ObjStm missing First".to_string()))?
619 as usize;
620
621 let decoded = decode_stream(&stream)?;
622 if first > decoded.len() {
623 return Err(PdfError::Corrupt(
624 "ObjStm First offset is past end of decoded data".to_string(),
625 ));
626 }
627
628 let header = &decoded[..first];
629 let mut header_cursor = Cursor::new(header, 0);
630 let mut entries: Vec<(u32, usize)> = Vec::with_capacity(n);
631 for _ in 0..n {
632 header_cursor.skip_ws_and_comments();
633 let obj_num = header_cursor.parse_u32()?;
634 header_cursor.skip_ws_and_comments();
635 let rel_offset = header_cursor.parse_usize()?;
636 entries.push((obj_num, rel_offset));
637 }
638
639 members.sort_by_key(|(_, index)| *index);
641 for (member_ref, index) in members {
642 let idx = index as usize;
643 if idx >= entries.len() {
644 return Err(PdfError::Corrupt(format!(
645 "ObjStm {stream_obj_num} has no index {idx}"
646 )));
647 }
648 let (declared_number, rel_offset) = entries[idx];
649 if declared_number != member_ref.object_number {
650 return Err(PdfError::Corrupt(format!(
651 "ObjStm {stream_obj_num} index {idx} has number {declared_number} but xref expected {}",
652 member_ref.object_number
653 )));
654 }
655 let absolute_offset = first
656 .checked_add(rel_offset)
657 .ok_or_else(|| PdfError::Corrupt("ObjStm offset overflow".to_string()))?;
658 if absolute_offset > decoded.len() {
659 return Err(PdfError::Corrupt(
660 "ObjStm member offset is past end of decoded data".to_string(),
661 ));
662 }
663 let mut value_cursor = Cursor::new(&decoded, absolute_offset);
664 let value = value_cursor.parse_value()?;
665 if let PdfValue::Dictionary(dict) = &value {
666 if dict.get("Type").and_then(PdfValue::as_name) == Some("ObjStm") {
667 return Err(PdfError::Unsupported(
668 "nested object streams are not supported".to_string(),
669 ));
670 }
671 }
672 *max_object_number = (*max_object_number).max(member_ref.object_number);
673 objects.insert(member_ref, PdfObject::Value(value));
674 }
675 objects.remove(&stream_ref);
681 }
682
683 Ok(())
684}
685
686fn parse_indirect_object(
687 bytes: &[u8],
688 offset: usize,
689 xref: Option<&BTreeMap<ObjectRef, XrefEntry>>,
690) -> PdfResult<PdfObject> {
691 let mut cursor = Cursor::new(bytes, offset);
692 let _object_number = cursor.parse_u32()?;
693 cursor.skip_ws_and_comments();
694 let _generation = cursor.parse_u16()?;
695 cursor.skip_ws_and_comments();
696 cursor.expect_keyword("obj")?;
697 cursor.skip_ws_and_comments();
698
699 let value = cursor.parse_value()?;
700 cursor.skip_ws_and_comments();
701 if matches!(value, PdfValue::Dictionary(_)) && cursor.peek_keyword("stream") {
702 let dict = match value {
703 PdfValue::Dictionary(dict) => dict,
704 _ => unreachable!(),
705 };
706 cursor.expect_keyword("stream")?;
707 cursor.consume_stream_line_break();
708 let stream_start = cursor.position;
709 let length_hint = match dict.get("Length") {
717 Some(PdfValue::Integer(len)) if *len >= 0 => Some(*len as usize),
718 Some(PdfValue::Reference(target)) => {
719 xref.and_then(|map| resolve_stream_length_ref(bytes, map, *target))
720 }
721 _ => None,
722 };
723 let (data, endstream_pos) = match length_hint {
724 Some(len) if stream_start + len <= bytes.len() => {
725 let mut check = stream_start + len;
728 while check < bytes.len() && matches!(bytes[check], b'\r' | b'\n') {
729 check += 1;
730 }
731 if bytes.get(check..check + 9) == Some(b"endstream") {
732 (bytes[stream_start..stream_start + len].to_vec(), check)
733 } else {
734 let pos = find_keyword(bytes, stream_start, b"endstream")
736 .ok_or_else(|| PdfError::Parse("stream missing endstream".to_string()))?;
737 (bytes[stream_start..pos].to_vec(), pos)
738 }
739 }
740 _ => {
741 let pos = find_keyword(bytes, stream_start, b"endstream")
742 .ok_or_else(|| PdfError::Parse("stream missing endstream".to_string()))?;
743 (bytes[stream_start..pos].to_vec(), pos)
744 }
745 };
746 cursor.position = endstream_pos;
747 cursor.expect_keyword("endstream")?;
748 cursor.skip_ws_and_comments();
749 cursor.expect_keyword("endobj")?;
750 Ok(PdfObject::Stream(PdfStream { dict, data }))
751 } else {
752 cursor.expect_keyword("endobj")?;
753 Ok(PdfObject::Value(value))
754 }
755}
756
757fn resolve_stream_length_ref(
766 bytes: &[u8],
767 xref: &BTreeMap<ObjectRef, XrefEntry>,
768 target: ObjectRef,
769) -> Option<usize> {
770 let entry = xref.get(&target)?;
771 let offset = match entry {
772 XrefEntry::Uncompressed { offset, .. } => *offset,
773 XrefEntry::Compressed { .. } | XrefEntry::Free => return None,
776 };
777 let object = parse_indirect_object(bytes, offset, None).ok()?;
781 match object {
782 PdfObject::Value(PdfValue::Integer(len)) if len >= 0 => Some(len as usize),
783 _ => None,
784 }
785}
786
/// Returns the absolute index of the first occurrence of `keyword` in
/// `bytes` at or after `start`, or `None` when there is none.
///
/// # Panics
///
/// Panics when `start` is greater than `bytes.len()` or `keyword` is empty.
fn find_keyword(bytes: &[u8], start: usize, keyword: &[u8]) -> Option<usize> {
    let haystack = &bytes[start..];
    let relative = haystack
        .windows(keyword.len())
        .position(|candidate| candidate == keyword)?;
    Some(start + relative)
}
793
794struct Cursor<'a> {
795 bytes: &'a [u8],
796 position: usize,
797}
798
799impl<'a> Cursor<'a> {
800 fn new(bytes: &'a [u8], position: usize) -> Self {
801 Self { bytes, position }
802 }
803
804 fn eof(&self) -> bool {
805 self.position >= self.bytes.len()
806 }
807
808 fn current(&self) -> Option<u8> {
809 self.bytes.get(self.position).copied()
810 }
811
812 fn skip_ws_and_comments(&mut self) {
813 while let Some(byte) = self.current() {
814 match byte {
815 b' ' | b'\t' | b'\n' | b'\r' | 0x0C | 0x00 => self.position += 1,
816 b'%' => {
817 while let Some(next) = self.current() {
818 self.position += 1;
819 if next == b'\n' || next == b'\r' {
820 break;
821 }
822 }
823 }
824 _ => break,
825 }
826 }
827 }
828
829 fn skip_line_breaks(&mut self) {
830 while matches!(self.current(), Some(b'\n' | b'\r')) {
831 self.position += 1;
832 }
833 }
834
835 fn read_line(&mut self) -> PdfResult<&'a [u8]> {
836 if self.eof() {
837 return Err(PdfError::Parse("unexpected end of file".to_string()));
838 }
839 let start = self.position;
840 while let Some(byte) = self.current() {
841 if byte == b'\n' || byte == b'\r' {
842 let end = self.position;
843 self.skip_line_breaks();
844 return Ok(&self.bytes[start..end]);
845 }
846 self.position += 1;
847 }
848 Ok(&self.bytes[start..self.position])
849 }
850
851 fn peek_keyword(&self, keyword: &str) -> bool {
852 self.bytes
853 .get(self.position..self.position + keyword.len())
854 .map(|slice| slice == keyword.as_bytes())
855 .unwrap_or(false)
856 }
857
858 fn expect_keyword(&mut self, keyword: &str) -> PdfResult<()> {
859 self.skip_ws_and_comments();
860 if self.peek_keyword(keyword) {
861 self.position += keyword.len();
862 Ok(())
863 } else {
864 Err(PdfError::Parse(format!("expected keyword {keyword}")))
865 }
866 }
867
868 fn consume_stream_line_break(&mut self) {
869 if self.current() == Some(b'\r') {
870 self.position += 1;
871 }
872 if self.current() == Some(b'\n') {
873 self.position += 1;
874 }
875 }
876
877 fn parse_u32(&mut self) -> PdfResult<u32> {
878 let token = self.parse_token()?;
879 token
880 .parse::<u32>()
881 .map_err(|_| PdfError::Parse(format!("invalid integer token: {token}")))
882 }
883
884 fn parse_u16(&mut self) -> PdfResult<u16> {
885 let token = self.parse_token()?;
886 token
887 .parse::<u16>()
888 .map_err(|_| PdfError::Parse(format!("invalid integer token: {token}")))
889 }
890
891 fn parse_usize(&mut self) -> PdfResult<usize> {
892 let token = self.parse_token()?;
893 token
894 .parse::<usize>()
895 .map_err(|_| PdfError::Parse(format!("invalid offset token: {token}")))
896 }
897
898 fn parse_token(&mut self) -> PdfResult<String> {
899 self.skip_ws_and_comments();
900 let start = self.position;
901 while let Some(byte) = self.current() {
902 if is_delimiter(byte) || is_whitespace(byte) {
903 break;
904 }
905 self.position += 1;
906 }
907 if self.position == start {
908 return Err(PdfError::Parse("expected token".to_string()));
909 }
910 Ok(String::from_utf8_lossy(&self.bytes[start..self.position]).to_string())
911 }
912
913 fn parse_value(&mut self) -> PdfResult<PdfValue> {
914 self.skip_ws_and_comments();
915 match self.current() {
916 Some(b'/') => self.parse_name(),
917 Some(b'(') => self.parse_literal_string(),
918 Some(b'[') => self.parse_array(),
919 Some(b'<') if self.bytes.get(self.position + 1) == Some(&b'<') => {
920 self.parse_dictionary()
921 }
922 Some(b'<') => self.parse_hex_string(),
923 Some(b't') if self.peek_keyword("true") => {
924 self.position += 4;
925 Ok(PdfValue::Bool(true))
926 }
927 Some(b'f') if self.peek_keyword("false") => {
928 self.position += 5;
929 Ok(PdfValue::Bool(false))
930 }
931 Some(b'n') if self.peek_keyword("null") => {
932 self.position += 4;
933 Ok(PdfValue::Null)
934 }
935 Some(_) => self.parse_number_or_reference(),
936 None => Err(PdfError::Parse("unexpected end of file".to_string())),
937 }
938 }
939
940 fn parse_name(&mut self) -> PdfResult<PdfValue> {
941 self.position += 1;
942 let mut raw = Vec::new();
943 while let Some(byte) = self.current() {
944 if is_delimiter(byte) || is_whitespace(byte) {
945 break;
946 }
947 if byte == b'#' {
948 let high =
949 self.bytes.get(self.position + 1).copied().ok_or_else(|| {
950 PdfError::Parse("truncated #XX escape in name".to_string())
951 })?;
952 let low =
953 self.bytes.get(self.position + 2).copied().ok_or_else(|| {
954 PdfError::Parse("truncated #XX escape in name".to_string())
955 })?;
956 let decoded = u8::from_str_radix(&format!("{}{}", high as char, low as char), 16)
957 .map_err(|_| {
958 PdfError::Parse("invalid #XX hex escape in name".to_string())
959 })?;
960 raw.push(decoded);
961 self.position += 3;
962 } else {
963 raw.push(byte);
964 self.position += 1;
965 }
966 }
967 Ok(PdfValue::Name(String::from_utf8_lossy(&raw).to_string()))
968 }
969
970 fn parse_literal_string(&mut self) -> PdfResult<PdfValue> {
971 self.position += 1;
972 let mut output = Vec::new();
973 let mut depth = 1usize;
974 while let Some(byte) = self.current() {
975 self.position += 1;
976 match byte {
977 b'\\' => {
978 let escaped = self
979 .current()
980 .ok_or_else(|| PdfError::Parse("unterminated string escape".to_string()))?;
981 self.position += 1;
982 match escaped {
983 b'n' => output.push(b'\n'),
984 b'r' => output.push(b'\r'),
985 b't' => output.push(b'\t'),
986 b'b' => output.push(0x08),
987 b'f' => output.push(0x0C),
988 b'(' | b')' | b'\\' => output.push(escaped),
989 b'\n' => {}
990 b'\r' => {
991 if self.current() == Some(b'\n') {
992 self.position += 1;
993 }
994 }
995 b'0'..=b'7' => {
996 let mut octal = vec![escaped];
997 for _ in 0..2 {
998 match self.current() {
999 Some(next @ b'0'..=b'7') => {
1000 octal.push(next);
1001 self.position += 1;
1002 }
1003 _ => break,
1004 }
1005 }
1006 let value =
1008 u16::from_str_radix(std::str::from_utf8(&octal).unwrap_or("0"), 8)
1009 .unwrap_or(0);
1010 output.push((value % 256) as u8);
1011 }
1012 other => output.push(other),
1013 }
1014 }
1015 b'(' => {
1016 depth += 1;
1017 output.push(byte);
1018 }
1019 b')' => {
1020 depth -= 1;
1021 if depth == 0 {
1022 return Ok(PdfValue::String(PdfString(output)));
1023 }
1024 output.push(byte);
1025 }
1026 _ => output.push(byte),
1027 }
1028 }
1029 Err(PdfError::Parse("unterminated literal string".to_string()))
1030 }
1031
1032 fn parse_hex_string(&mut self) -> PdfResult<PdfValue> {
1033 self.position += 1;
1034 let start = self.position;
1035 while self.current() != Some(b'>') {
1036 if self.eof() {
1037 return Err(PdfError::Parse("unterminated hex string".to_string()));
1038 }
1039 self.position += 1;
1040 }
1041 let raw = String::from_utf8_lossy(&self.bytes[start..self.position])
1042 .chars()
1043 .filter(|character| !character.is_whitespace())
1044 .collect::<String>();
1045 self.position += 1;
1046 let mut chars = raw.chars().collect::<Vec<_>>();
1047 if chars.len() % 2 != 0 {
1048 chars.push('0');
1049 }
1050 let mut bytes = Vec::with_capacity(chars.len() / 2);
1051 for pair in chars.chunks(2) {
1052 let value = u8::from_str_radix(&pair.iter().collect::<String>(), 16)
1053 .map_err(|_| PdfError::Parse("invalid hex string".to_string()))?;
1054 bytes.push(value);
1055 }
1056 Ok(PdfValue::String(PdfString(bytes)))
1057 }
1058
1059 fn parse_array(&mut self) -> PdfResult<PdfValue> {
1060 self.position += 1;
1061 let mut values = Vec::new();
1062 loop {
1063 self.skip_ws_and_comments();
1064 match self.current() {
1065 Some(b']') => {
1066 self.position += 1;
1067 break;
1068 }
1069 Some(_) => values.push(self.parse_value()?),
1070 None => return Err(PdfError::Parse("unterminated array".to_string())),
1071 }
1072 }
1073 Ok(PdfValue::Array(values))
1074 }
1075
1076 fn parse_dictionary(&mut self) -> PdfResult<PdfValue> {
1077 self.position += 2;
1078 let mut dictionary = PdfDictionary::new();
1079 loop {
1080 self.skip_ws_and_comments();
1081 if self.current() == Some(b'>') && self.bytes.get(self.position + 1) == Some(&b'>') {
1082 self.position += 2;
1083 break;
1084 }
1085 let key = match self.parse_name()? {
1086 PdfValue::Name(name) => name,
1087 _ => unreachable!(),
1088 };
1089 let value = self.parse_value()?;
1090 dictionary.insert(key, value);
1091 }
1092 Ok(PdfValue::Dictionary(dictionary))
1093 }
1094
1095 fn parse_number_or_reference(&mut self) -> PdfResult<PdfValue> {
1096 let first_token = self.parse_token()?;
1097 if first_token.contains('.') || first_token.contains(['e', 'E']) {
1098 return first_token
1099 .parse::<f64>()
1100 .map(PdfValue::Number)
1101 .map_err(|_| PdfError::Parse(format!("invalid number token: {first_token}")));
1102 }
1103
1104 let checkpoint = self.position;
1105 self.skip_ws_and_comments();
1106 if let Ok(second_token) = self.parse_token() {
1107 self.skip_ws_and_comments();
1108 if self.current() == Some(b'R')
1109 && second_token
1110 .chars()
1111 .all(|character| character.is_ascii_digit())
1112 {
1113 self.position += 1;
1114 return Ok(PdfValue::Reference(ObjectRef::new(
1115 first_token
1116 .parse::<u32>()
1117 .map_err(|_| PdfError::Parse("invalid reference object".to_string()))?,
1118 second_token
1119 .parse::<u16>()
1120 .map_err(|_| PdfError::Parse("invalid reference generation".to_string()))?,
1121 )));
1122 }
1123 }
1124 self.position = checkpoint;
1125 first_token
1126 .parse::<i64>()
1127 .map(PdfValue::Integer)
1128 .or_else(|_| first_token.parse::<f64>().map(PdfValue::Number))
1129 .map_err(|_| PdfError::Parse(format!("invalid number token: {first_token}")))
1130 }
1131}
1132
/// Returns `true` for the six bytes this parser treats as whitespace:
/// space, tab, LF, CR, form feed and NUL.
fn is_whitespace(byte: u8) -> bool {
    const WHITESPACE: [u8; 6] = [b' ', b'\t', b'\n', b'\r', 0x0C, 0x00];
    WHITESPACE.contains(&byte)
}
1136
/// Returns `true` for the bytes that end a token even without whitespace:
/// `(`, `)`, `<`, `>`, `[`, `]`, `{`, `}`, `/` and `%`.
fn is_delimiter(byte: u8) -> bool {
    b"()<>[]{}/%".contains(&byte)
}
1143
1144#[cfg(test)]
1145mod tests {
1146 use super::{parse_pdf, parse_pdf_with_certificate, parse_pdf_with_password};
1147 use crate::error::PdfError;
1148 use crate::types::{PdfObject, PdfValue};
1149
1150 #[test]
1151 fn parses_simple_pdf_fixture() {
1152 let bytes = include_bytes!("../../../tests/fixtures/simple-text.pdf");
1153 let document = parse_pdf(bytes).expect("fixture should parse");
1154 assert_eq!(document.pages.len(), 1);
1155 }
1156
/// The incremental-update fixture must resolve to the *updated* page content:
/// the first content stream contains "Updated Secret" and no longer contains
/// "Original Secret".
#[test]
fn parses_incremental_update_fixture() {
    let fixture = include_bytes!("../../../tests/fixtures/incremental-update.pdf");
    let parsed = parse_pdf(fixture).expect("incremental fixture should parse");
    assert_eq!(parsed.pages.len(), 1);

    let refs = &parsed.pages[0].content_refs;
    assert!(!refs.is_empty());

    let PdfObject::Stream(stream) = parsed.file.objects.get(&refs[0]).unwrap() else {
        panic!("expected stream object for page content")
    };
    // Lossy decoding is enough here: only ASCII markers are searched for.
    let text = String::from_utf8_lossy(&stream.data);

    assert!(
        text.contains("Updated Secret"),
        "content stream should contain updated text"
    );
    assert!(
        !text.contains("Original Secret"),
        "content stream should not contain original text"
    );
}
1181
/// A trailer whose `/Prev` points back at its own xref section must not send
/// the parser into an endless loop; the document still parses (with an empty
/// page tree).
#[test]
fn circular_prev_chain_does_not_loop() {
    let mut pdf = b"%PDF-1.4\n".to_vec();

    // Write the two indirect objects, remembering where each one starts.
    let object_sources: [&[u8]; 2] = [
        b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
        b"2 0 obj\n<< /Type /Pages /Count 0 /Kids [] >>\nendobj\n",
    ];
    let mut object_offsets = Vec::with_capacity(object_sources.len());
    for source in object_sources {
        object_offsets.push(pdf.len());
        pdf.extend_from_slice(source);
    }

    // Classic xref table; the trailer's /Prev refers to this very table.
    let xref_offset = pdf.len();
    pdf.extend_from_slice(b"xref\n0 3\n");
    pdf.extend_from_slice(b"0000000000 65535 f \n");
    for offset in &object_offsets {
        pdf.extend_from_slice(format!("{:010} 00000 n \n", offset).as_bytes());
    }
    pdf.extend_from_slice(b"trailer\n");
    pdf.extend_from_slice(
        format!("<< /Size 3 /Root 1 0 R /Prev {} >>\n", xref_offset).as_bytes(),
    );
    pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_offset).as_bytes());

    let parsed = parse_pdf(&pdf).expect("circular Prev should be tolerated");
    assert_eq!(parsed.pages.len(), 0);
}
1213
/// A stream whose `/Length` is an indirect reference must be read using the
/// referenced length, even when the payload itself contains the text
/// `endstream`.
#[test]
fn stream_length_indirect_reference_is_resolved() {
    let payload = b"--endstream--HIDDEN";
    let mut pdf = b"%PDF-1.4\n".to_vec();
    // Start offsets of objects 1..=5, in object-number order.
    let mut offsets: Vec<usize> = Vec::with_capacity(5);

    let plain_objects: [&[u8]; 3] = [
        b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
        b"2 0 obj\n<< /Type /Pages /Count 1 /Kids [3 0 R] >>\nendobj\n",
        b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << >> /Contents 4 0 R >>\nendobj\n",
    ];
    for source in plain_objects {
        offsets.push(pdf.len());
        pdf.extend_from_slice(source);
    }

    // Object 4: the content stream, with its length deferred to object 5.
    offsets.push(pdf.len());
    pdf.extend_from_slice(b"4 0 obj\n<< /Length 5 0 R >>\nstream\n");
    pdf.extend_from_slice(payload);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");

    // Object 5: the bare integer length.
    offsets.push(pdf.len());
    pdf.extend_from_slice(format!("5 0 obj\n{}\nendobj\n", payload.len()).as_bytes());

    let xref_offset = pdf.len();
    pdf.extend_from_slice(b"xref\n0 6\n");
    pdf.extend_from_slice(b"0000000000 65535 f \n");
    for offset in &offsets {
        pdf.extend_from_slice(format!("{:010} 00000 n \n", offset).as_bytes());
    }
    pdf.extend_from_slice(b"trailer\n<< /Size 6 /Root 1 0 R >>\n");
    pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_offset).as_bytes());

    let parsed = parse_pdf(&pdf).expect("indirect-length fixture should parse");
    let refs = &parsed.pages[0].content_refs;
    let PdfObject::Stream(stream) = parsed.file.objects.get(&refs[0]).unwrap() else {
        panic!("expected stream object for page content")
    };
    let data = &stream.data;
    assert_eq!(
        data.as_slice(),
        payload,
        "resolved indirect /Length should yield the exact original payload bytes"
    );
}
1267
/// A PDF 1.5 file indexed only by an (unfiltered) cross-reference stream must
/// parse: the catalog and the empty page tree are both reachable.
#[test]
fn parses_uncompressed_xref_stream() {
    let mut pdf: Vec<u8> = b"%PDF-1.5\n".to_vec();

    let catalog_offset = pdf.len();
    pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
    let pages_offset = pdf.len();
    pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 0 /Kids [] >>\nendobj\n");

    // The xref stream (object 3) is written next, so its own offset is already
    // known and its row can be encoded directly.
    let xref_obj_offset = pdf.len();

    // One row per /W [1 2 1]: type byte, big-endian 16-bit field, 1-byte field.
    let encode_row = |kind: u8, field: u16, generation: u8| -> [u8; 4] {
        let [high, low] = field.to_be_bytes();
        [kind, high, low, generation]
    };
    let body: Vec<u8> = [
        encode_row(0, 0, 0xFF),
        encode_row(1, catalog_offset as u16, 0),
        encode_row(1, pages_offset as u16, 0),
        encode_row(1, xref_obj_offset as u16, 0),
    ]
    .concat();

    let stream_dict = format!(
        "<< /Type /XRef /Size 4 /W [1 2 1] /Root 1 0 R /Length {} >>",
        body.len()
    );
    pdf.extend_from_slice(format!("3 0 obj\n{stream_dict}\nstream\n").as_bytes());
    pdf.extend_from_slice(&body);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");
    pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_obj_offset).as_bytes());

    let parsed = parse_pdf(&pdf).expect("xref stream fixture should parse");
    assert_eq!(parsed.pages.len(), 0);
    assert!(parsed.file.objects.len() >= 2);
}
1318
/// An object stored inside a Flate-compressed object stream (type-2 xref
/// entry) must be materialised: the `/Pages` dictionary lives only inside
/// object stream 3 and still has to be reachable through the catalog.
#[test]
fn parses_object_stream_via_xref_stream() {
    use flate2::{Compression, write::ZlibEncoder};
    use std::io::Write;

    let mut pdf: Vec<u8> = b"%PDF-1.5\n".to_vec();

    let catalog_offset = pdf.len();
    pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

    // Object-stream payload: the "<objnum> <offset>" index, then the member.
    let index = b"2 0 ";
    let member = b"<< /Type /Pages /Count 0 /Kids [] >>";
    let first = index.len();
    let plain = [&index[..], &member[..]].concat();

    let mut deflater = ZlibEncoder::new(Vec::new(), Compression::default());
    deflater.write_all(&plain).unwrap();
    let compressed = deflater.finish().unwrap();

    let objstm_offset = pdf.len();
    let objstm_dict = format!(
        "<< /Type /ObjStm /N 1 /First {} /Filter /FlateDecode /Length {} >>",
        first,
        compressed.len()
    );
    pdf.extend_from_slice(format!("3 0 obj\n{objstm_dict}\nstream\n").as_bytes());
    pdf.extend_from_slice(&compressed);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");

    // One row per /W [1 3 1]: type byte, big-endian 24-bit field, 1-byte field.
    let encode_row = |kind: u8, middle: u32, last: u16| -> [u8; 5] {
        let [_, b2, b1, b0] = middle.to_be_bytes();
        [kind, b2, b1, b0, last as u8]
    };

    let xref_offset = pdf.len();
    let body: Vec<u8> = [
        encode_row(0, 0, 0xFF),
        encode_row(1, catalog_offset as u32, 0),
        encode_row(2, 3, 0),
        encode_row(1, objstm_offset as u32, 0),
        encode_row(1, xref_offset as u32, 0),
    ]
    .concat();

    let stream_dict = format!(
        "<< /Type /XRef /Size 5 /W [1 3 1] /Root 1 0 R /Length {} >>",
        body.len()
    );
    pdf.extend_from_slice(format!("4 0 obj\n{stream_dict}\nstream\n").as_bytes());
    pdf.extend_from_slice(&body);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");
    pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_offset).as_bytes());

    let parsed = parse_pdf(&pdf).expect("ObjStm fixture should parse");
    assert_eq!(parsed.pages.len(), 0);
    let pages_ref = parsed.catalog.pages_ref;
    let pages_dict = parsed.file.get_dictionary(pages_ref).unwrap();
    let type_name = pages_dict.get("Type").and_then(|value| value.as_name());
    assert_eq!(type_name, Some("Pages"));
}
1398
/// An object stream whose member is itself declared as `/Type /ObjStm` must be
/// rejected with `PdfError::Unsupported` mentioning "nested object streams".
#[test]
fn rejects_nested_object_stream() {
    use flate2::{Compression, write::ZlibEncoder};
    use std::io::Write;

    let mut pdf: Vec<u8> = b"%PDF-1.5\n".to_vec();

    let catalog_offset = pdf.len();
    pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

    // The single member of the outer object stream claims to be an ObjStm too.
    let index = b"2 0 ";
    let member = b"<< /Type /ObjStm /N 0 /First 0 /Length 0 >>";
    let first = index.len();
    let plain = [&index[..], &member[..]].concat();

    let mut deflater = ZlibEncoder::new(Vec::new(), Compression::default());
    deflater.write_all(&plain).unwrap();
    let compressed = deflater.finish().unwrap();

    let objstm_offset = pdf.len();
    let objstm_dict = format!(
        "<< /Type /ObjStm /N 1 /First {} /Filter /FlateDecode /Length {} >>",
        first,
        compressed.len()
    );
    pdf.extend_from_slice(format!("3 0 obj\n{objstm_dict}\nstream\n").as_bytes());
    pdf.extend_from_slice(&compressed);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");

    // One row per /W [1 3 1]: type byte, big-endian 24-bit field, 1-byte field.
    let encode_row = |kind: u8, middle: u32, last: u16| -> [u8; 5] {
        let [_, b2, b1, b0] = middle.to_be_bytes();
        [kind, b2, b1, b0, last as u8]
    };

    let xref_offset = pdf.len();
    let body: Vec<u8> = [
        encode_row(0, 0, 0xFF),
        encode_row(1, catalog_offset as u32, 0),
        encode_row(2, 3, 0),
        encode_row(1, objstm_offset as u32, 0),
        encode_row(1, xref_offset as u32, 0),
    ]
    .concat();

    let stream_dict = format!(
        "<< /Type /XRef /Size 5 /W [1 3 1] /Root 1 0 R /Length {} >>",
        body.len()
    );
    pdf.extend_from_slice(format!("4 0 obj\n{stream_dict}\nstream\n").as_bytes());
    pdf.extend_from_slice(&body);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");
    pdf.extend_from_slice(format!("startxref\n{}\n%%EOF\n", xref_offset).as_bytes());

    let message = match parse_pdf(&pdf) {
        Err(PdfError::Unsupported(message)) => message,
        other => panic!("expected Unsupported, got: {other:?}"),
    };
    assert!(message.contains("nested object streams"), "got: {message}");
}
1466
/// Builds an in-memory single-page PDF protected by the Standard security
/// handler with RC4 (`/V 2 /R 3`, `/Length 128`).
///
/// The `/O` and `/U` entries are derived from `owner_password` and
/// `user_password` through the crate's crypto test helpers, and the page's
/// content stream (object 4, generation 0) is encrypted with the matching
/// per-object RC4 key.
///
/// Returns the serialized PDF together with the plaintext of the content
/// stream, so callers can compare it against what the parser decrypts.
fn build_rc4_encrypted_pdf(
    user_password: &[u8],
    owner_password: &[u8],
) -> (Vec<u8>, &'static [u8]) {
    use crate::crypto::SecurityRevision;
    use crate::crypto::test_helpers::{
        compute_file_key, compute_o, compute_u_r3, object_key, rc4,
    };

    // Fixed first element of the trailer /ID array; it feeds the key derivation.
    let id_first: [u8; 16] = [
        0x6e, 0x05, 0xb1, 0x20, 0x63, 0x94, 0x69, 0x1f, 0x22, 0x2c, 0x32, 0xac, 0x61, 0x8b,
        0xe6, 0x8d,
    ];
    let permissions: i32 = -4;
    // 16 bytes, matching the /Length 128 (bits) written into the Encrypt dictionary.
    let key_length_bytes = 16;

    let owner_entry = compute_o(
        owner_password,
        user_password,
        SecurityRevision::R3,
        key_length_bytes,
    );
    let file_key = compute_file_key(
        user_password,
        &owner_entry,
        permissions,
        &id_first,
        key_length_bytes,
    );
    let u_entry = compute_u_r3(&file_key, &id_first);

    // Serializes raw bytes as a PDF literal string, backslash-escaping only
    // `(`, `)` and `\`.
    // NOTE(review): other bytes (including CR) are written raw; a parser that
    // normalises end-of-line markers inside literal strings would alter them —
    // confirm this matches the parser's literal-string handling.
    let escape_literal = |bytes: &[u8]| -> Vec<u8> {
        let mut out = Vec::with_capacity(bytes.len() + 2);
        out.push(b'(');
        for &byte in bytes {
            match byte {
                b'(' | b')' | b'\\' => {
                    out.push(b'\\');
                    out.push(byte);
                }
                _ => out.push(byte),
            }
        }
        out.push(b')');
        out
    };

    let content_plain: &'static [u8] = b"BT\n/F1 24 Tf\n72 700 Td\n(CIPHERED SECRET) Tj\nET\n";
    // The key is derived for object 4, generation 0 — the content stream below.
    let content_cipher = rc4(&object_key(&file_key, 4, 0), content_plain);

    let mut pdf: Vec<u8> = Vec::new();
    pdf.extend_from_slice(b"%PDF-1.4\n");

    // Each `*_offset` records where the next object starts; the xref table
    // below is generated from these values, so write order matters.
    let catalog_offset = pdf.len();
    pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

    let pages_offset = pdf.len();
    pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 1 /Kids [3 0 R] >>\nendobj\n");

    let page_offset = pdf.len();
    pdf.extend_from_slice(
        b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
          /Resources << /Font << /F1 5 0 R >> >> /Contents 4 0 R >>\nendobj\n",
    );

    let content_offset = pdf.len();
    pdf.extend_from_slice(
        format!("4 0 obj\n<< /Length {} >>\nstream\n", content_cipher.len()).as_bytes(),
    );
    pdf.extend_from_slice(&content_cipher);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");

    let font_offset = pdf.len();
    pdf.extend_from_slice(
        b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica \
          /Encoding /WinAnsiEncoding >>\nendobj\n",
    );

    // Object 6: the Encrypt dictionary referenced from the trailer.
    let encrypt_offset = pdf.len();
    pdf.extend_from_slice(b"6 0 obj\n<< /Filter /Standard /V 2 /R 3 /Length 128 ");
    pdf.extend_from_slice(format!("/P {permissions} ").as_bytes());
    pdf.extend_from_slice(b"/O ");
    pdf.extend_from_slice(&escape_literal(&owner_entry));
    pdf.extend_from_slice(b" /U ");
    pdf.extend_from_slice(&escape_literal(&u_entry));
    pdf.extend_from_slice(b" >>\nendobj\n");

    // Classic xref table: the free entry for object 0, then objects 1..=6.
    let xref_offset = pdf.len();
    pdf.extend_from_slice(b"xref\n0 7\n");
    pdf.extend_from_slice(b"0000000000 65535 f \n");
    for offset in [
        catalog_offset,
        pages_offset,
        page_offset,
        content_offset,
        font_offset,
        encrypt_offset,
    ] {
        pdf.extend_from_slice(format!("{offset:010} 00000 n \n").as_bytes());
    }
    // Both /ID elements carry the same bytes.
    pdf.extend_from_slice(b"trailer\n<< /Size 7 /Root 1 0 R /Encrypt 6 0 R /ID [");
    pdf.extend_from_slice(&escape_literal(&id_first));
    pdf.extend_from_slice(&escape_literal(&id_first));
    pdf.extend_from_slice(b"] >>\n");
    pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());

    (pdf, content_plain)
}
1580
1581 fn assert_decrypts_content_stream(document: &crate::document::ParsedDocument, expected: &[u8]) {
1582 assert_eq!(document.pages.len(), 1);
1583 assert!(
1584 !document.file.trailer.contains_key("Encrypt"),
1585 "trailer /Encrypt must be stripped once the document is decrypted in place"
1586 );
1587 let content_ref = document.pages[0].content_refs[0];
1588 let stream = match document.file.get_object(content_ref).unwrap() {
1589 PdfObject::Stream(stream) => stream,
1590 _ => panic!("page content must be a stream"),
1591 };
1592 assert_eq!(stream.data, expected);
1593 }
1594
/// An RC4 document with an empty user password opens through plain
/// `parse_pdf`, with no credential supplied by the caller.
#[test]
fn parses_rc4_encrypted_pdf_with_empty_password() {
    let (encrypted, expected_plaintext) =
        build_rc4_encrypted_pdf(b"", b"arbitrary-owner-password");
    let decrypted = parse_pdf(&encrypted).expect("empty-password PDF should decrypt");
    assert_decrypts_content_stream(&decrypted, expected_plaintext);
}
1605
/// Supplying the user password decrypts an RC4 document.
#[test]
fn parses_rc4_encrypted_pdf_with_user_password() {
    let (encrypted, expected_plaintext) = build_rc4_encrypted_pdf(b"userpw", b"ownerpw");
    let outcome = parse_pdf_with_password(&encrypted, b"userpw");
    let decrypted = outcome.expect("correct user password should decrypt");
    assert_decrypts_content_stream(&decrypted, expected_plaintext);
}
1613
/// Supplying the owner password decrypts an RC4 document as well.
#[test]
fn parses_rc4_encrypted_pdf_with_owner_password() {
    let (encrypted, expected_plaintext) = build_rc4_encrypted_pdf(b"userpw", b"ownerpw");
    let outcome = parse_pdf_with_password(&encrypted, b"ownerpw");
    let decrypted = outcome.expect("correct owner password should decrypt");
    assert_decrypts_content_stream(&decrypted, expected_plaintext);
}
1621
/// A password that is neither the user nor the owner password must surface as
/// `PdfError::InvalidPassword`.
#[test]
fn rejects_wrong_password_with_invalid_password_error() {
    let (encrypted, _plaintext) = build_rc4_encrypted_pdf(b"userpw", b"ownerpw");
    let outcome = parse_pdf_with_password(&encrypted, b"wrongpw");
    let failure = outcome.expect_err("wrong password must not decrypt");
    assert_eq!(failure, PdfError::InvalidPassword);
}
1629
/// A user password containing non-ASCII characters (passed as its UTF-8
/// bytes) round-trips through RC4 encryption and decryption.
#[test]
fn parses_rc4_encrypted_pdf_with_utf8_password() {
    let utf8_password: &[u8] = "pässwörd".as_bytes();
    let (encrypted, expected_plaintext) = build_rc4_encrypted_pdf(utf8_password, b"ownerpw");
    let outcome = parse_pdf_with_password(&encrypted, utf8_password);
    let decrypted = outcome.expect("UTF-8 user password should decrypt");
    assert_decrypts_content_stream(&decrypted, expected_plaintext);
}
1638
/// Builds an in-memory single-page PDF protected by the Standard security
/// handler with an AESV2 crypt filter (`/V 4 /R 4`, `/StmF` and `/StrF` both
/// `/StdCF`).
///
/// `encrypt_metadata` is fed into the file-key derivation; when it is `false`
/// the Encrypt dictionary additionally carries `/EncryptMetadata false`.
///
/// Returns the serialized PDF together with the plaintext of the content
/// stream (object 4), so callers can compare it against what the parser
/// decrypts.
fn build_aes_128_encrypted_pdf(
    user_password: &[u8],
    owner_password: &[u8],
    encrypt_metadata: bool,
) -> (Vec<u8>, &'static [u8]) {
    use crate::crypto::SecurityRevision;
    use crate::crypto::test_helpers::{
        aes_128_cbc_encrypt, compute_file_key_r4, compute_o, compute_u_r3, object_key_aes,
    };

    // Fixed first element of the trailer /ID array; it feeds the key derivation.
    let id_first: [u8; 16] = [
        0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88,
        0x99, 0x00,
    ];
    let permissions: i32 = -4;

    let owner_entry = compute_o(owner_password, user_password, SecurityRevision::R4, 16);
    let file_key = compute_file_key_r4(
        user_password,
        &owner_entry,
        permissions,
        &id_first,
        encrypt_metadata,
    );
    let u_entry = compute_u_r3(&file_key, &id_first);

    // A constant IV keeps the generated fixture deterministic.
    let content_iv = [0x42u8; 16];
    let content_plain: &'static [u8] =
        b"BT\n/F1 24 Tf\n72 700 Td\n(AES SECRET REMOVED) Tj\nET\n";
    // The key is derived for object 4, generation 0 — the content stream below.
    let content_key = object_key_aes(&file_key, 4, 0);
    let content_cipher = aes_128_cbc_encrypt(&content_key, &content_iv, content_plain);

    // Serializes raw bytes as a PDF literal string, backslash-escaping only
    // `(`, `)` and `\`.
    // NOTE(review): other bytes (including CR) are written raw; a parser that
    // normalises end-of-line markers inside literal strings would alter them —
    // confirm this matches the parser's literal-string handling.
    let escape_literal = |bytes: &[u8]| -> Vec<u8> {
        let mut out = Vec::with_capacity(bytes.len() + 2);
        out.push(b'(');
        for &byte in bytes {
            match byte {
                b'(' | b')' | b'\\' => {
                    out.push(b'\\');
                    out.push(byte);
                }
                _ => out.push(byte),
            }
        }
        out.push(b')');
        out
    };

    let mut pdf: Vec<u8> = Vec::new();
    pdf.extend_from_slice(b"%PDF-1.5\n");

    // Each `*_offset` records where the next object starts; the xref table
    // below is generated from these values, so write order matters.
    let catalog_offset = pdf.len();
    pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

    let pages_offset = pdf.len();
    pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 1 /Kids [3 0 R] >>\nendobj\n");

    let page_offset = pdf.len();
    pdf.extend_from_slice(
        b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
          /Resources << /Font << /F1 5 0 R >> >> /Contents 4 0 R >>\nendobj\n",
    );

    let content_offset = pdf.len();
    pdf.extend_from_slice(
        format!("4 0 obj\n<< /Length {} >>\nstream\n", content_cipher.len()).as_bytes(),
    );
    pdf.extend_from_slice(&content_cipher);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");

    let font_offset = pdf.len();
    pdf.extend_from_slice(
        b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica \
          /Encoding /WinAnsiEncoding >>\nendobj\n",
    );

    // Object 6: the Encrypt dictionary referenced from the trailer.
    let encrypt_offset = pdf.len();
    pdf.extend_from_slice(
        b"6 0 obj\n<< /Filter /Standard /V 4 /R 4 /Length 128 \
          /CF << /StdCF << /CFM /AESV2 /Length 16 /AuthEvent /DocOpen >> >> \
          /StmF /StdCF /StrF /StdCF ",
    );
    pdf.extend_from_slice(format!("/P {permissions} ").as_bytes());
    // The key is only written for the `false` case; otherwise it is omitted.
    if !encrypt_metadata {
        pdf.extend_from_slice(b"/EncryptMetadata false ");
    }
    pdf.extend_from_slice(b"/O ");
    pdf.extend_from_slice(&escape_literal(&owner_entry));
    pdf.extend_from_slice(b" /U ");
    pdf.extend_from_slice(&escape_literal(&u_entry));
    pdf.extend_from_slice(b" >>\nendobj\n");

    // Classic xref table: the free entry for object 0, then objects 1..=6.
    let xref_offset = pdf.len();
    pdf.extend_from_slice(b"xref\n0 7\n");
    pdf.extend_from_slice(b"0000000000 65535 f \n");
    for offset in [
        catalog_offset,
        pages_offset,
        page_offset,
        content_offset,
        font_offset,
        encrypt_offset,
    ] {
        pdf.extend_from_slice(format!("{offset:010} 00000 n \n").as_bytes());
    }
    // Both /ID elements carry the same bytes.
    pdf.extend_from_slice(b"trailer\n<< /Size 7 /Root 1 0 R /Encrypt 6 0 R /ID [");
    pdf.extend_from_slice(&escape_literal(&id_first));
    pdf.extend_from_slice(&escape_literal(&id_first));
    pdf.extend_from_slice(b"] >>\n");
    pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());

    (pdf, content_plain)
}
1759
/// An AES-128 document with an empty user password opens through plain
/// `parse_pdf`.
#[test]
fn parses_aes_128_encrypted_pdf_with_empty_password() {
    let (encrypted, expected_plaintext) =
        build_aes_128_encrypted_pdf(b"", b"arbitrary-owner-password", true);
    let decrypted = parse_pdf(&encrypted).expect("empty-password AES-128 PDF should decrypt");
    assert_decrypts_content_stream(&decrypted, expected_plaintext);
}
1766
/// Supplying the user password decrypts an AES-128 document.
#[test]
fn parses_aes_128_encrypted_pdf_with_user_password() {
    let (encrypted, expected_plaintext) =
        build_aes_128_encrypted_pdf(b"userpw", b"ownerpw", true);
    let outcome = parse_pdf_with_password(&encrypted, b"userpw");
    let decrypted = outcome.expect("correct user password should decrypt AES-128 PDF");
    assert_decrypts_content_stream(&decrypted, expected_plaintext);
}
1774
/// Supplying the owner password decrypts an AES-128 document as well.
#[test]
fn parses_aes_128_encrypted_pdf_with_owner_password() {
    let (encrypted, expected_plaintext) =
        build_aes_128_encrypted_pdf(b"userpw", b"ownerpw", true);
    let outcome = parse_pdf_with_password(&encrypted, b"ownerpw");
    let decrypted = outcome.expect("correct owner password should decrypt AES-128 PDF");
    assert_decrypts_content_stream(&decrypted, expected_plaintext);
}
1782
/// A wrong password on an AES-128 document must surface as
/// `PdfError::InvalidPassword`.
#[test]
fn aes_128_rejects_wrong_password() {
    let (encrypted, _plaintext) = build_aes_128_encrypted_pdf(b"userpw", b"ownerpw", true);
    let outcome = parse_pdf_with_password(&encrypted, b"wrongpw");
    let failure = outcome.expect_err("wrong password must not decrypt AES-128 PDF");
    assert_eq!(failure, PdfError::InvalidPassword);
}
1790
/// Builds an in-memory single-page PDF protected by the Standard security
/// handler with an AESV3 crypt filter (`/V 5`, `/R 5` or `/R 6` depending on
/// `revision`).
///
/// The file key and all salts are fixed constants; `/U`, `/UE`, `/O` and `/OE`
/// are derived from them and the given passwords through the crate's crypto
/// test helpers. The content stream is encrypted directly with the file key.
///
/// Returns the serialized PDF together with the plaintext of the content
/// stream (object 4), so callers can compare it against what the parser
/// decrypts.
///
/// # Panics
///
/// Panics if `revision` is neither `SecurityRevision::R5` nor
/// `SecurityRevision::R6`, or if the computed `/U` entry is not 48 bytes long.
fn build_aes_256_encrypted_pdf(
    user_password: &[u8],
    owner_password: &[u8],
    revision: crate::crypto::SecurityRevision,
) -> (Vec<u8>, &'static [u8]) {
    use crate::crypto::test_helpers::{
        aes_256_cbc_encrypt, compute_v5_o_and_oe, compute_v5_u_and_ue,
    };

    let permissions: i32 = -4;
    // Constant key material keeps the generated fixture deterministic.
    let file_key = [0x13u8; 32];
    let u_validation_salt = [0xAAu8; 8];
    let u_key_salt = [0xBBu8; 8];
    let o_validation_salt = [0xCCu8; 8];
    let o_key_salt = [0xDDu8; 8];

    let (u_entry, ue_entry) = compute_v5_u_and_ue(
        user_password,
        &u_validation_salt,
        &u_key_salt,
        &file_key,
        revision,
    );
    // The owner-side computation takes the complete 48-byte /U value as input.
    let u_vector: [u8; 48] = u_entry.as_slice().try_into().expect("U is 48 bytes");
    let (o_entry, oe_entry) = compute_v5_o_and_oe(
        owner_password,
        &o_validation_salt,
        &o_key_salt,
        &u_vector,
        &file_key,
        revision,
    );

    let content_iv = [0x42u8; 16];
    let content_plain: &'static [u8] = b"BT\n/F1 24 Tf\n72 700 Td\n(AES-256 SECRET) Tj\nET\n";
    // No per-object key here: the file key is used as-is.
    let content_cipher = aes_256_cbc_encrypt(&file_key, &content_iv, content_plain);

    // Serializes raw bytes as a PDF literal string, backslash-escaping only
    // `(`, `)` and `\`.
    // NOTE(review): other bytes (including CR) are written raw; a parser that
    // normalises end-of-line markers inside literal strings would alter them —
    // confirm this matches the parser's literal-string handling.
    let escape_literal = |bytes: &[u8]| -> Vec<u8> {
        let mut out = Vec::with_capacity(bytes.len() + 2);
        out.push(b'(');
        for &byte in bytes {
            match byte {
                b'(' | b')' | b'\\' => {
                    out.push(b'\\');
                    out.push(byte);
                }
                _ => out.push(byte),
            }
        }
        out.push(b')');
        out
    };

    let mut pdf: Vec<u8> = Vec::new();
    pdf.extend_from_slice(b"%PDF-2.0\n");

    // Each `*_offset` records where the next object starts; the xref table
    // below is generated from these values, so write order matters.
    let catalog_offset = pdf.len();
    pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

    let pages_offset = pdf.len();
    pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 1 /Kids [3 0 R] >>\nendobj\n");

    let page_offset = pdf.len();
    pdf.extend_from_slice(
        b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
          /Resources << /Font << /F1 5 0 R >> >> /Contents 4 0 R >>\nendobj\n",
    );

    let content_offset = pdf.len();
    pdf.extend_from_slice(
        format!("4 0 obj\n<< /Length {} >>\nstream\n", content_cipher.len()).as_bytes(),
    );
    pdf.extend_from_slice(&content_cipher);
    pdf.extend_from_slice(b"\nendstream\nendobj\n");

    let font_offset = pdf.len();
    pdf.extend_from_slice(
        b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica \
          /Encoding /WinAnsiEncoding >>\nendobj\n",
    );

    // Numeric /R value for the Encrypt dictionary; only R5 and R6 are valid here.
    let r_value = match revision {
        crate::crypto::SecurityRevision::R5 => 5,
        crate::crypto::SecurityRevision::R6 => 6,
        _ => panic!("V=5 fixture requires R=5 or R=6"),
    };

    // Object 6: the Encrypt dictionary referenced from the trailer.
    let encrypt_offset = pdf.len();
    pdf.extend_from_slice(
        format!(
            "6 0 obj\n<< /Filter /Standard /V 5 /R {r_value} /Length 256 \
             /CF << /StdCF << /CFM /AESV3 /Length 32 /AuthEvent /DocOpen >> >> \
             /StmF /StdCF /StrF /StdCF /P {permissions} "
        )
        .as_bytes(),
    );
    pdf.extend_from_slice(b"/O ");
    pdf.extend_from_slice(&escape_literal(&o_entry));
    pdf.extend_from_slice(b" /U ");
    pdf.extend_from_slice(&escape_literal(&u_entry));
    pdf.extend_from_slice(b" /OE ");
    pdf.extend_from_slice(&escape_literal(&oe_entry));
    pdf.extend_from_slice(b" /UE ");
    pdf.extend_from_slice(&escape_literal(&ue_entry));
    pdf.extend_from_slice(b" >>\nendobj\n");

    // Classic xref table: the free entry for object 0, then objects 1..=6.
    let xref_offset = pdf.len();
    pdf.extend_from_slice(b"xref\n0 7\n");
    pdf.extend_from_slice(b"0000000000 65535 f \n");
    for offset in [
        catalog_offset,
        pages_offset,
        page_offset,
        content_offset,
        font_offset,
        encrypt_offset,
    ] {
        pdf.extend_from_slice(format!("{offset:010} 00000 n \n").as_bytes());
    }
    // The trailer /ID is written but, unlike the RC4 and AES-128 builders, it is
    // not an input to any key computation in this function.
    let id_literal: [u8; 16] = [
        0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE,
        0xFF, 0x00,
    ];
    pdf.extend_from_slice(b"trailer\n<< /Size 7 /Root 1 0 R /Encrypt 6 0 R /ID [");
    pdf.extend_from_slice(&escape_literal(&id_literal));
    pdf.extend_from_slice(&escape_literal(&id_literal));
    pdf.extend_from_slice(b"] >>\n");
    pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());

    (pdf, content_plain)
}
1927
/// Supplying the user password decrypts an AES-256 (R=6) document.
#[test]
fn parses_aes_256_r6_encrypted_pdf_with_user_password() {
    use crate::crypto::SecurityRevision;

    let (encrypted, expected_plaintext) =
        build_aes_256_encrypted_pdf(b"userpw", b"ownerpw", SecurityRevision::R6);
    let outcome = parse_pdf_with_password(&encrypted, b"userpw");
    let decrypted = outcome.expect("correct user password should decrypt AES-256 R=6 PDF");
    assert_decrypts_content_stream(&decrypted, expected_plaintext);
}
1936
/// Supplying the owner password decrypts an AES-256 (R=6) document as well.
#[test]
fn parses_aes_256_r6_encrypted_pdf_with_owner_password() {
    use crate::crypto::SecurityRevision;

    let (encrypted, expected_plaintext) =
        build_aes_256_encrypted_pdf(b"userpw", b"ownerpw", SecurityRevision::R6);
    let outcome = parse_pdf_with_password(&encrypted, b"ownerpw");
    let decrypted = outcome.expect("correct owner password should decrypt AES-256 R=6 PDF");
    assert_decrypts_content_stream(&decrypted, expected_plaintext);
}
1945
/// An AES-256 (R=5) document with an empty user password opens through plain
/// `parse_pdf`.
#[test]
fn parses_aes_256_r5_encrypted_pdf_with_empty_password() {
    use crate::crypto::SecurityRevision;

    let (encrypted, expected_plaintext) =
        build_aes_256_encrypted_pdf(b"", b"ownerpw", SecurityRevision::R5);
    let decrypted =
        parse_pdf(&encrypted).expect("empty-password AES-256 R=5 PDF should decrypt");
    assert_decrypts_content_stream(&decrypted, expected_plaintext);
}
1953
/// A wrong password on an AES-256 document must surface as
/// `PdfError::InvalidPassword`.
#[test]
fn aes_256_rejects_wrong_password() {
    use crate::crypto::SecurityRevision;

    let (encrypted, _plaintext) =
        build_aes_256_encrypted_pdf(b"userpw", b"ownerpw", SecurityRevision::R6);
    let outcome = parse_pdf_with_password(&encrypted, b"wrongpw");
    let failure = outcome.expect_err("wrong password must not decrypt AES-256 PDF");
    assert_eq!(failure, PdfError::InvalidPassword);
}
1962
/// An AES-128 document built with `/EncryptMetadata false` still decrypts with
/// the empty password.
#[test]
fn parses_aes_128_with_encrypt_metadata_false() {
    let (encrypted, expected_plaintext) = build_aes_128_encrypted_pdf(b"", b"ownerpw", false);
    let outcome = parse_pdf(&encrypted);
    let decrypted = outcome.expect("empty-password AES-128 PDF should decrypt with metadata off");
    assert_decrypts_content_stream(&decrypted, expected_plaintext);
}
1973
/// After decryption no object in the file may still look like the original
/// Encrypt dictionary, i.e. a dictionary carrying `/O`, `/U` and
/// `/Filter /Standard` together.
#[test]
fn decryption_drops_original_encrypt_dictionary_object() {
    let (encrypted, _plaintext) = build_aes_128_encrypted_pdf(b"", b"ownerpw", true);
    let parsed = parse_pdf(&encrypted).expect("encrypted PDF should decrypt");

    for (object_ref, object) in &parsed.file.objects {
        // Only plain dictionary objects can be a leftover Encrypt dictionary.
        let PdfObject::Value(PdfValue::Dictionary(dict)) = object else {
            continue;
        };
        let has_o = dict.contains_key("O");
        let has_u = dict.contains_key("U");
        let has_filter_standard =
            dict.get("Filter").and_then(PdfValue::as_name) == Some("Standard");
        let looks_like_encrypt_dict = has_o && has_u && has_filter_standard;
        assert!(
            !looks_like_encrypt_dict,
            "Encrypt dictionary at {} {} survived parse",
            object_ref.object_number,
            object_ref.generation
        );
    }
}
1997
/// Once compressed objects have been materialised, no stream object with
/// `/Type /ObjStm` may remain in the parsed file.
#[test]
fn materialize_drops_objstm_containers() {
    let fixture = include_bytes!("../../../tests/fixtures/xref-object-stream.pdf");
    let parsed = parse_pdf(fixture).expect("xref+ObjStm fixture should parse");

    for (object_ref, object) in &parsed.file.objects {
        // Non-stream objects cannot be object-stream containers.
        let PdfObject::Stream(stream) = object else {
            continue;
        };
        let declared_type = stream.dict.get("Type").and_then(PdfValue::as_name);
        assert_ne!(
            declared_type,
            Some("ObjStm"),
            "ObjStm container at {} {} survived parse",
            object_ref.object_number,
            object_ref.generation
        );
    }
}
2019
/// Artefacts returned by `build_pubsec_encrypted_pdf` for the public-key
/// security handler tests.
///
/// NOTE(review): the code that fills this struct lies outside the visible part
/// of the builder; the field descriptions below follow the field names and the
/// builder's visible locals — confirm against the full function.
struct PubSecFixture {
    /// Serialized PDF bytes (presumably the encrypted document).
    pdf: Vec<u8>,
    /// DER-encoded recipient certificate (presumably the builder's `cert_der`).
    cert_der: Vec<u8>,
    /// DER-encoded private key (presumably the PKCS#8 encoding the builder
    /// produces for its generated RSA key).
    private_key_der: Vec<u8>,
    /// Unencrypted content-stream bytes the tests expect after decryption
    /// (presumably the builder's `plaintext_content`).
    plaintext: Vec<u8>,
}
2030
2031 fn build_pubsec_encrypted_pdf(sub_filter: &str) -> PubSecFixture {
2038 use cms::builder::{
2039 ContentEncryptionAlgorithm, EnvelopedDataBuilder, KeyEncryptionInfo,
2040 KeyTransRecipientInfoBuilder,
2041 };
2042 use cms::cert::IssuerAndSerialNumber;
2043 use cms::content_info::ContentInfo;
2044 use cms::enveloped_data::RecipientIdentifier;
2045 use const_oid::ObjectIdentifier;
2046 use der::asn1::{Any, PrintableString, SetOfVec};
2047 use der::{Decode, Encode};
2048 use rand_chacha::ChaCha8Rng;
2049 use rand_core::SeedableRng;
2050 use rsa::pkcs1v15::SigningKey;
2051 use rsa::pkcs8::{EncodePrivateKey, EncodePublicKey};
2052 use rsa::{RsaPrivateKey, RsaPublicKey};
2053 use sha2::Sha256;
2054 use spki::SubjectPublicKeyInfoOwned;
2055 use std::time::Duration;
2056 use x509_cert::Certificate;
2057 use x509_cert::attr::AttributeTypeAndValue;
2058 use x509_cert::builder::{Builder, CertificateBuilder, Profile};
2059 use x509_cert::name::{Name, RdnSequence, RelativeDistinguishedName};
2060 use x509_cert::serial_number::SerialNumber;
2061 use x509_cert::time::Validity;
2062
2063 let mut rng = ChaCha8Rng::from_seed([0x42u8; 32]);
2064 let private_key = RsaPrivateKey::new(&mut rng, 2048).expect("RSA-2048 keygen must succeed");
2065 let public_key = RsaPublicKey::from(&private_key);
2066 let private_key_der = private_key
2067 .to_pkcs8_der()
2068 .expect("PKCS#8 encode")
2069 .as_bytes()
2070 .to_vec();
2071
2072 let serial_number = SerialNumber::from(0x01020304u32);
2074 let validity = Validity::from_now(Duration::from_secs(3600 * 24 * 30))
2075 .expect("validity computation must succeed");
2076 let cn = AttributeTypeAndValue {
2077 oid: const_oid::db::rfc4519::CN,
2078 value: Any::from(
2079 &PrintableString::new(b"open-redact-pdf-test-recipient").expect("printable string"),
2080 ),
2081 };
2082 let rdn_set = SetOfVec::try_from(vec![cn]).expect("rdn set");
2083 let mut subject = RdnSequence::default();
2084 subject.0.push(RelativeDistinguishedName::from(rdn_set));
2085 let subject_name =
2086 Name::from_der(&subject.to_der().expect("subject encode")).expect("subject re-decode");
2087
2088 let signer: SigningKey<Sha256> = SigningKey::new(private_key.clone());
2089 let pub_key_der = public_key.to_public_key_der().expect("RSA public key DER");
2090 let pub_key_info =
2091 SubjectPublicKeyInfoOwned::try_from(pub_key_der.as_bytes()).expect("SPKI from DER");
2092 let cert_builder = CertificateBuilder::new(
2093 Profile::Root,
2094 serial_number.clone(),
2095 validity,
2096 subject_name.clone(),
2097 pub_key_info.clone(),
2098 &signer,
2099 )
2100 .expect("CertificateBuilder::new");
2101 let certificate: Certificate = cert_builder.build().expect("cert build");
2102 let cert_der = certificate.to_der().expect("cert DER");
2103
2104 let mut seed_and_perms = [0u8; 24];
2106 rsa::rand_core::RngCore::fill_bytes(&mut rng, &mut seed_and_perms);
2107 seed_and_perms[20..24].copy_from_slice(&[0xFFu8, 0xFF, 0xFF, 0xFF]);
2108
2109 let recipient_identifier =
2111 RecipientIdentifier::IssuerAndSerialNumber(IssuerAndSerialNumber {
2112 issuer: certificate.tbs_certificate.issuer.clone(),
2113 serial_number: certificate.tbs_certificate.serial_number.clone(),
2114 });
2115 let recipient_info_builder = KeyTransRecipientInfoBuilder::new(
2116 recipient_identifier,
2117 KeyEncryptionInfo::Rsa(public_key.clone()),
2118 &mut rng,
2119 )
2120 .expect("KeyTransRecipientInfoBuilder::new");
2121
2122 let mut enveloped_builder = EnvelopedDataBuilder::new(
2123 None,
2124 &seed_and_perms,
2125 ContentEncryptionAlgorithm::Aes128Cbc,
2126 None,
2127 )
2128 .expect("EnvelopedDataBuilder::new");
2129 let mut envelope_rng = ChaCha8Rng::from_seed([0xA5u8; 32]);
2134 let enveloped_data = enveloped_builder
2135 .add_recipient_info(recipient_info_builder)
2136 .expect("add_recipient_info")
2137 .build_with_rng(&mut envelope_rng)
2138 .expect("build_with_rng");
2139
2140 const ID_ENVELOPED: ObjectIdentifier = ObjectIdentifier::new_unwrap("1.2.840.113549.1.7.3");
2142 let enveloped_der = enveloped_data.to_der().expect("envelope DER");
2143 let content_info = ContentInfo {
2144 content_type: ID_ENVELOPED,
2145 content: Any::from_der(&enveloped_der).expect("Any from envelope DER"),
2146 };
2147 let recipient_blob = content_info.to_der().expect("content_info DER");
2148
2149 let plaintext_content: Vec<u8> =
2151 b"BT\n/F1 24 Tf\n72 700 Td\n(PUBSEC SECRET) Tj\nET\n".to_vec();
2152 let (file_key, content_cipher, sub_filter_str, v_value, r_value, length_bits, cfm_name) =
2153 match sub_filter {
2154 "adbe.pkcs7.s5" => {
2155 use crate::crypto::test_helpers::aes_256_cbc_encrypt;
2156 use sha2::Digest as _;
2157 let mut hasher = sha2::Sha256::new();
2158 hasher.update(&seed_and_perms[..20]);
2159 hasher.update(&recipient_blob);
2160 hasher.update(&seed_and_perms[20..24]);
2161 let file_key: [u8; 32] = hasher.finalize().into();
2162 let iv = [0x55u8; 16];
2163 let cipher = aes_256_cbc_encrypt(&file_key, &iv, &plaintext_content);
2164 (
2165 file_key.to_vec(),
2166 cipher,
2167 "adbe.pkcs7.s5",
2168 5i32,
2169 5i32,
2170 256i32,
2171 "AESV3",
2172 )
2173 }
2174 "adbe.pkcs7.s4" => {
2175 use crate::crypto::test_helpers::{aes_128_cbc_encrypt, object_key_aes};
2176 use sha1::{Digest as _, Sha1};
2177 let mut hasher = Sha1::new();
2178 hasher.update(&seed_and_perms[..20]);
2179 hasher.update(&recipient_blob);
2180 hasher.update(&seed_and_perms[20..24]);
2181 let hash = hasher.finalize();
2182 let file_key: [u8; 16] = hash[..16].try_into().expect("16 bytes");
2183 let object_key = object_key_aes(&file_key, 4, 0);
2184 let iv = [0x77u8; 16];
2185 let cipher = aes_128_cbc_encrypt(&object_key, &iv, &plaintext_content);
2186 (
2187 file_key.to_vec(),
2188 cipher,
2189 "adbe.pkcs7.s4",
2190 4i32,
2191 4i32,
2192 128i32,
2193 "AESV2",
2194 )
2195 }
2196 other => panic!("unsupported sub_filter for fixture builder: {other}"),
2197 };
2198 let _ = (file_key, length_bits); let blob_hex_string = {
2203 let mut s = String::from("<");
2204 for byte in &recipient_blob {
2205 s.push_str(&format!("{byte:02X}"));
2206 }
2207 s.push('>');
2208 s
2209 };
2210
2211 let mut pdf: Vec<u8> = Vec::new();
2212 pdf.extend_from_slice(b"%PDF-1.7\n");
2213
2214 let catalog_offset = pdf.len();
2215 pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
2216 let pages_offset = pdf.len();
2217 pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Count 1 /Kids [3 0 R] >>\nendobj\n");
2218 let page_offset = pdf.len();
2219 pdf.extend_from_slice(
2220 b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
2221 /Resources << /Font << /F1 5 0 R >> >> /Contents 4 0 R >>\nendobj\n",
2222 );
2223 let content_offset = pdf.len();
2224 pdf.extend_from_slice(
2225 format!("4 0 obj\n<< /Length {} >>\nstream\n", content_cipher.len()).as_bytes(),
2226 );
2227 pdf.extend_from_slice(&content_cipher);
2228 pdf.extend_from_slice(b"\nendstream\nendobj\n");
2229 let font_offset = pdf.len();
2230 pdf.extend_from_slice(
2231 b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica \
2232 /Encoding /WinAnsiEncoding >>\nendobj\n",
2233 );
2234
2235 let encrypt_offset = pdf.len();
2236 if v_value == 5 {
2237 pdf.extend_from_slice(
2238 format!(
2239 "6 0 obj\n<< /Filter /Adobe.PubSec /SubFilter /{sub_filter_str} \
2240 /V {v_value} /R {r_value} /Length {length_bits} \
2241 /CF << /DefaultCryptFilter << /CFM /{cfm_name} /Length 32 \
2242 /AuthEvent /DocOpen /Recipients [{blob_hex_string}] >> >> \
2243 /StmF /DefaultCryptFilter /StrF /DefaultCryptFilter \
2244 /EncryptMetadata true >>\nendobj\n"
2245 )
2246 .as_bytes(),
2247 );
2248 } else {
2249 pdf.extend_from_slice(
2251 format!(
2252 "6 0 obj\n<< /Filter /Adobe.PubSec /SubFilter /{sub_filter_str} \
2253 /V {v_value} /R {r_value} /Length {length_bits} \
2254 /CF << /DefaultCryptFilter << /CFM /{cfm_name} /Length 16 \
2255 /AuthEvent /DocOpen >> >> \
2256 /StmF /DefaultCryptFilter /StrF /DefaultCryptFilter \
2257 /Recipients [{blob_hex_string}] /EncryptMetadata true >>\nendobj\n"
2258 )
2259 .as_bytes(),
2260 );
2261 }
2262
2263 let xref_offset = pdf.len();
2264 pdf.extend_from_slice(b"xref\n0 7\n");
2265 pdf.extend_from_slice(b"0000000000 65535 f \n");
2266 for offset in [
2267 catalog_offset,
2268 pages_offset,
2269 page_offset,
2270 content_offset,
2271 font_offset,
2272 encrypt_offset,
2273 ] {
2274 pdf.extend_from_slice(format!("{offset:010} 00000 n \n").as_bytes());
2275 }
2276 pdf.extend_from_slice(
2277 b"trailer\n<< /Size 7 /Root 1 0 R /Encrypt 6 0 R /ID [<00112233445566778899AABBCCDDEEFF><00112233445566778899AABBCCDDEEFF>] >>\n",
2278 );
2279 pdf.extend_from_slice(format!("startxref\n{xref_offset}\n%%EOF\n").as_bytes());
2280
2281 PubSecFixture {
2282 pdf,
2283 cert_der,
2284 private_key_der,
2285 plaintext: plaintext_content,
2286 }
2287 }
2288
/// Round-trip check for the `adbe.pkcs7.s5` PubSec fixture: parsing with the
/// certificate and private key returned by the fixture builder must succeed,
/// and the resulting document is then checked against the fixture's plaintext
/// via `assert_decrypts_content_stream`.
#[test]
fn parses_pubsec_s5_encrypted_pdf() {
    let PubSecFixture {
        pdf,
        cert_der,
        private_key_der,
        plaintext,
    } = build_pubsec_encrypted_pdf("adbe.pkcs7.s5");

    let parsed = parse_pdf_with_certificate(&pdf, &cert_der, &private_key_der);
    let document = parsed.expect("PubSec s5 PDF should decrypt with matching certificate");

    assert_decrypts_content_stream(&document, &plaintext);
}
2297
/// Round-trip check for the `adbe.pkcs7.s4` PubSec fixture: parsing with the
/// certificate and private key returned by the fixture builder must succeed,
/// and the resulting document is then checked against the fixture's plaintext
/// via `assert_decrypts_content_stream`.
#[test]
fn parses_pubsec_s4_encrypted_pdf() {
    let PubSecFixture {
        pdf,
        cert_der,
        private_key_der,
        plaintext,
    } = build_pubsec_encrypted_pdf("adbe.pkcs7.s4");

    let parsed = parse_pdf_with_certificate(&pdf, &cert_der, &private_key_der);
    let document = parsed.expect("PubSec s4 PDF should decrypt with matching certificate");

    assert_decrypts_content_stream(&document, &plaintext);
}
2306
/// Opening the `adbe.pkcs7.s5` PubSec fixture with a password credential must
/// fail with `PdfError::Unsupported`, and the error message must contain the
/// word `certificate`.
#[test]
fn pubsec_rejects_password_credential() {
    let PubSecFixture { pdf, .. } = build_pubsec_encrypted_pdf("adbe.pkcs7.s5");

    let outcome = parse_pdf_with_password(&pdf, b"any-password");

    // Any variant other than `Unsupported` is a test failure in its own right.
    let message = match outcome.expect_err("PubSec PDF must reject a password credential") {
        PdfError::Unsupported(message) => message,
        other => panic!("expected Unsupported, got {other:?}"),
    };

    assert!(
        message.contains("certificate"),
        "error should mention certificate, got: {message}"
    );
}
2322
/// Verifies that a certificate unrelated to the fixture's recipient cannot
/// open an `adbe.pkcs7.s5` PubSec PDF.
///
/// A second RSA-2048 key pair is generated from a different RNG seed than the
/// fixture builder uses, wrapped in its own `Profile::Root` certificate with a
/// different subject and serial number, and offered as the credential. The
/// parser must fail with `PdfError::InvalidPassword`.
#[test]
fn pubsec_s5_rejects_unknown_certificate() {
    use der::asn1::{Any, PrintableString, SetOfVec};
    use der::{Decode, Encode};
    use rand_chacha::ChaCha8Rng;
    use rand_core::SeedableRng;
    use rsa::RsaPrivateKey;
    use rsa::pkcs1v15::SigningKey;
    use rsa::pkcs8::{EncodePrivateKey, EncodePublicKey};
    use sha2::Sha256;
    use spki::SubjectPublicKeyInfoOwned;
    use std::time::Duration;
    use x509_cert::attr::AttributeTypeAndValue;
    use x509_cert::builder::{Builder, CertificateBuilder, Profile};
    use x509_cert::name::{Name, RdnSequence, RelativeDistinguishedName};
    use x509_cert::serial_number::SerialNumber;
    use x509_cert::time::Validity;

    let fixture = build_pubsec_encrypted_pdf("adbe.pkcs7.s5");

    // Fixed seed keeps the test deterministic; it differs from the seed used
    // by the fixture builder so the resulting key pair is not the recipient's.
    let mut rng = ChaCha8Rng::from_seed([0x99u8; 32]);
    let other_private = RsaPrivateKey::new(&mut rng, 2048).expect("other RSA-2048 keygen");
    let other_public = rsa::RsaPublicKey::from(&other_private);
    let other_pkcs8 = other_private
        .to_pkcs8_der()
        .expect("PKCS#8 encode")
        .as_bytes()
        .to_vec();

    // Subject name: a single CN attribute, round-tripped through DER to obtain
    // a `Name` value.
    let cn = AttributeTypeAndValue {
        oid: const_oid::db::rfc4519::CN,
        value: Any::from(&PrintableString::new(b"unrelated-cert").expect("printable string")),
    };
    let rdn_set = SetOfVec::try_from(vec![cn]).expect("rdn set");
    let mut subject = RdnSequence::default();
    subject.0.push(RelativeDistinguishedName::from(rdn_set));
    let subject_name =
        Name::from_der(&subject.to_der().expect("subject encode")).expect("subject re-decode");

    // The private key is not needed after this point, so it is moved into the
    // signer rather than cloned.
    let signer: SigningKey<Sha256> = SigningKey::new(other_private);
    let other_pub_der = other_public
        .to_public_key_der()
        .expect("RSA public key DER");
    let pub_key_info =
        SubjectPublicKeyInfoOwned::try_from(other_pub_der.as_bytes()).expect("SPKI from DER");
    let cert_builder = CertificateBuilder::new(
        Profile::Root,
        SerialNumber::from(0x55u32),
        Validity::from_now(Duration::from_secs(3600 * 24 * 30)).expect("validity"),
        subject_name,
        pub_key_info,
        &signer,
    )
    .expect("CertificateBuilder::new");
    let other_cert: x509_cert::Certificate = cert_builder.build().expect("cert build");
    let other_cert_der = other_cert.to_der().expect("cert DER");

    let err = parse_pdf_with_certificate(&fixture.pdf, &other_cert_der, &other_pkcs8)
        .expect_err("unrelated certificate must not unlock the PubSec PDF");
    assert_eq!(err, PdfError::InvalidPassword);
}
2387
/// Offering a certificate credential to the PDF produced by
/// `build_aes_128_encrypted_pdf` must fail with `PdfError::Unsupported`, and
/// the error message must contain the word `password`.
#[test]
fn standard_pdf_rejects_certificate_credential() {
    // `30 00` is the DER encoding of an empty SEQUENCE; the same two bytes
    // stand in for both the certificate and the private key.
    const PLACEHOLDER_DER: [u8; 2] = [0x30, 0x00];

    let pdf = build_aes_128_encrypted_pdf(b"", b"ownerpw", true).0;
    let outcome = parse_pdf_with_certificate(&pdf, &PLACEHOLDER_DER, &PLACEHOLDER_DER);

    // Any variant other than `Unsupported` is a test failure in its own right.
    let message = match outcome
        .expect_err("Standard-encrypted PDF must reject a certificate credential")
    {
        PdfError::Unsupported(message) => message,
        other => panic!("expected Unsupported, got {other:?}"),
    };

    assert!(
        message.contains("password"),
        "error should mention password, got: {message}"
    );
}
2405}