1use log::{error, warn};
2use std::cmp;
3use std::collections::{BTreeMap, HashSet};
4use std::convert::TryInto;
5#[cfg(not(feature = "async"))]
6use std::fs::File;
7#[cfg(not(feature = "async"))]
8use std::io::Read;
9use std::path::Path;
10use std::sync::Mutex;
11
12#[cfg(feature = "rayon")]
13use rayon::prelude::*;
14#[cfg(feature = "async")]
15use tokio::fs::File;
16#[cfg(feature = "async")]
17use tokio::io::{AsyncRead, AsyncReadExt};
18#[cfg(feature = "async")]
19use tokio::pin;
20
21use crate::encryption::{self, EncryptionState};
22use crate::error::{ParseError, XrefError};
23use crate::load_options::LoadOptions;
24use crate::object_stream::ObjectStream;
25use crate::parser::{self, ParserInput};
26use crate::xref::XrefEntry;
27use crate::{Dictionary, Document, Error, IncrementalDocument, Object, ObjectId, Result};
28
/// Callback applied to objects while a document is being loaded.
///
/// It receives the object id and a mutable reference to the parsed object.
/// Returning `None` drops the object from the loaded document. For objects
/// expanded out of object streams, the returned `(id, object)` pair is what
/// gets stored.
type FilterFunc = fn((u32, u16), &mut Object) -> Option<((u32, u16), Object)>;
30
31#[cfg(not(feature = "async"))]
32impl Document {
33 #[inline]
35 pub fn load<P: AsRef<Path>>(path: P) -> Result<Document> {
36 let file = File::open(path)?;
37 let capacity = Some(file.metadata()?.len() as usize);
38 Self::load_internal(file, capacity, None, None)
39 }
40
41 #[inline]
43 pub fn load_with_password<P: AsRef<Path>>(path: P, password: &str) -> Result<Document> {
44 let file = File::open(path)?;
45 let capacity = Some(file.metadata()?.len() as usize);
46 Self::load_internal(file, capacity, None, Some(password.to_string()))
47 }
48
49 #[inline]
50 pub fn load_filtered<P: AsRef<Path>>(path: P, filter_func: FilterFunc) -> Result<Document> {
51 let file = File::open(path)?;
52 let capacity = Some(file.metadata()?.len() as usize);
53 Self::load_internal(file, capacity, Some(filter_func), None)
54 }
55
56 #[inline]
58 pub fn load_from<R: Read>(source: R) -> Result<Document> {
59 Self::load_internal(source, None, None, None)
60 }
61
62 #[inline]
64 pub fn load_from_with_password<R: Read>(source: R, password: &str) -> Result<Document> {
65 Self::load_internal(source, None, None, Some(password.to_string()))
66 }
67
68 fn load_internal<R: Read>(
69 mut source: R,
70 capacity: Option<usize>,
71 filter_func: Option<FilterFunc>,
72 password: Option<String>,
73 ) -> Result<Document> {
74 let mut buffer = capacity.map(Vec::with_capacity).unwrap_or_default();
75 source.read_to_end(&mut buffer)?;
76
77 Reader {
78 buffer: &buffer,
79 document: Document::new(),
80 encryption_state: None,
81 raw_objects: BTreeMap::new(),
82 password,
83 options: LoadOptions::default(),
84 }
85 .read(filter_func)
86 }
87
88 pub fn load_mem(buffer: &[u8]) -> Result<Document> {
90 buffer.try_into()
91 }
92
93 pub fn load_mem_with_options(buffer: &[u8], opts: &LoadOptions) -> Result<Document> {
100 if let Some(limit) = opts.max_file_bytes {
103 if buffer.len() > limit {
104 return Err(Error::DocumentTooLarge {
105 size: buffer.len(),
106 limit,
107 });
108 }
109 }
110 Reader {
111 buffer,
112 document: Document::new(),
113 encryption_state: None,
114 raw_objects: BTreeMap::new(),
115 password: None,
116 options: opts.clone(),
117 }
118 .read(None)
119 }
120
121 pub fn load_with_options<P: AsRef<Path>>(path: P, opts: &LoadOptions) -> Result<Document> {
128 let file = File::open(path.as_ref())?;
129 let file_size = file.metadata()?.len() as usize;
130 if let Some(limit) = opts.max_file_bytes {
131 if file_size > limit {
132 return Err(Error::DocumentTooLarge {
133 size: file_size,
134 limit,
135 });
136 }
137 }
138 let mut buffer = Vec::with_capacity(file_size);
139 let mut f = file;
140 f.read_to_end(&mut buffer)?;
141 Reader {
142 buffer: &buffer,
143 document: Document::new(),
144 encryption_state: None,
145 raw_objects: BTreeMap::new(),
146 password: None,
147 options: opts.clone(),
148 }
149 .read(None)
150 }
151
152 pub fn load_mem_with_password(buffer: &[u8], password: &str) -> Result<Document> {
154 Reader {
155 buffer,
156 document: Document::new(),
157 encryption_state: None,
158 raw_objects: BTreeMap::new(),
159 password: Some(password.to_string()),
160 options: LoadOptions::default(),
161 }
162 .read(None)
163 }
164
165 #[inline]
168 pub fn load_metadata<P: AsRef<Path>>(path: P) -> Result<PdfMetadata> {
169 let file = File::open(path)?;
170 let capacity = Some(file.metadata()?.len() as usize);
171 Self::load_metadata_internal(file, capacity, None)
172 }
173
174 #[inline]
176 pub fn load_metadata_with_password<P: AsRef<Path>>(
177 path: P,
178 password: &str,
179 ) -> Result<PdfMetadata> {
180 let file = File::open(path)?;
181 let capacity = Some(file.metadata()?.len() as usize);
182 Self::load_metadata_internal(file, capacity, Some(password.to_string()))
183 }
184
185 #[inline]
187 pub fn load_metadata_from<R: Read>(source: R) -> Result<PdfMetadata> {
188 Self::load_metadata_internal(source, None, None)
189 }
190
191 #[inline]
193 pub fn load_metadata_from_with_password<R: Read>(
194 source: R,
195 password: &str,
196 ) -> Result<PdfMetadata> {
197 Self::load_metadata_internal(source, None, Some(password.to_string()))
198 }
199
200 #[inline]
202 pub fn load_metadata_mem(buffer: &[u8]) -> Result<PdfMetadata> {
203 Reader {
204 buffer,
205 document: Document::new(),
206 encryption_state: None,
207 raw_objects: BTreeMap::new(),
208 password: None,
209 options: LoadOptions::default(),
210 }
211 .read_metadata()
212 }
213
214 #[inline]
216 pub fn load_metadata_mem_with_password(buffer: &[u8], password: &str) -> Result<PdfMetadata> {
217 Reader {
218 buffer,
219 document: Document::new(),
220 encryption_state: None,
221 raw_objects: BTreeMap::new(),
222 password: Some(password.to_string()),
223 options: LoadOptions::default(),
224 }
225 .read_metadata()
226 }
227
228 fn load_metadata_internal<R: Read>(
229 mut source: R,
230 capacity: Option<usize>,
231 password: Option<String>,
232 ) -> Result<PdfMetadata> {
233 let mut buffer = capacity.map(Vec::with_capacity).unwrap_or_default();
234 source.read_to_end(&mut buffer)?;
235
236 Reader {
237 buffer: &buffer,
238 document: Document::new(),
239 encryption_state: None,
240 raw_objects: BTreeMap::new(),
241 password,
242 options: LoadOptions::default(),
243 }
244 .read_metadata()
245 }
246}
247
#[cfg(feature = "async")]
impl Document {
    /// Loads a PDF document from the file at `path`.
    ///
    /// The whole file is read into memory before it is parsed.
    ///
    /// # Errors
    /// Returns an error if the file cannot be opened or read, or if parsing fails.
    pub async fn load<P: AsRef<Path>>(path: P) -> Result<Document> {
        let file = File::open(path).await?;
        let metadata = file.metadata().await?;
        // The length is only a pre-allocation hint: drop it when it does not fit
        // in `usize` rather than truncating it with an `as` cast.
        let capacity = usize::try_from(metadata.len()).ok();
        Self::load_internal(file, capacity, None, None).await
    }

    /// Loads a PDF document from the file at `path`, supplying `password` to the reader.
    pub async fn load_with_password<P: AsRef<Path>>(path: P, password: &str) -> Result<Document> {
        let file = File::open(path).await?;
        let metadata = file.metadata().await?;
        let capacity = usize::try_from(metadata.len()).ok();
        Self::load_internal(file, capacity, None, Some(password.to_string())).await
    }

    /// Loads a PDF document from the file at `path`, passing every loaded object
    /// through `filter_func`.
    pub async fn load_filtered<P: AsRef<Path>>(
        path: P,
        filter_func: FilterFunc,
    ) -> Result<Document> {
        let file = File::open(path).await?;
        let metadata = file.metadata().await?;
        let capacity = usize::try_from(metadata.len()).ok();
        Self::load_internal(file, capacity, Some(filter_func), None).await
    }

    /// Reads `source` to its end and parses the bytes with default [`LoadOptions`].
    ///
    /// `capacity` is an optional pre-allocation hint for the read buffer.
    async fn load_internal<R: AsyncRead>(
        source: R,
        capacity: Option<usize>,
        filter_func: Option<FilterFunc>,
        password: Option<String>,
    ) -> Result<Document> {
        // Pin the source on the stack so `read_to_end` can be called on it.
        pin!(source);

        let mut buffer = capacity.map(Vec::with_capacity).unwrap_or_default();
        source.read_to_end(&mut buffer).await?;

        Reader {
            buffer: &buffer,
            document: Document::new(),
            encryption_state: None,
            raw_objects: BTreeMap::new(),
            password,
            options: LoadOptions::default(),
        }
        .read(filter_func)
    }

    /// Loads a PDF document from an in-memory byte slice.
    pub fn load_mem(buffer: &[u8]) -> Result<Document> {
        buffer.try_into()
    }

    /// Loads a PDF document from an in-memory byte slice, supplying `password` to the reader.
    pub fn load_mem_with_password(buffer: &[u8], password: &str) -> Result<Document> {
        Reader {
            buffer,
            document: Document::new(),
            encryption_state: None,
            raw_objects: BTreeMap::new(),
            password: Some(password.to_string()),
            options: LoadOptions::default(),
        }
        .read(None)
    }

    /// Reads only the metadata (info fields, page count, version) of the PDF at `path`.
    #[inline]
    pub async fn load_metadata<P: AsRef<Path>>(path: P) -> Result<PdfMetadata> {
        let file = File::open(path).await?;
        let metadata = file.metadata().await?;
        let capacity = usize::try_from(metadata.len()).ok();
        Self::load_metadata_internal(file, capacity, None).await
    }

    /// Reads only the metadata of the PDF at `path`, supplying `password` to the reader.
    #[inline]
    pub async fn load_metadata_with_password<P: AsRef<Path>>(
        path: P,
        password: &str,
    ) -> Result<PdfMetadata> {
        let file = File::open(path).await?;
        let metadata = file.metadata().await?;
        let capacity = usize::try_from(metadata.len()).ok();
        Self::load_metadata_internal(file, capacity, Some(password.to_string())).await
    }

    /// Reads only the metadata of a PDF provided by an arbitrary async reader.
    #[inline]
    pub async fn load_metadata_from<R: AsyncRead>(source: R) -> Result<PdfMetadata> {
        Self::load_metadata_internal(source, None, None).await
    }

    /// Reads only the metadata of a PDF provided by an arbitrary async reader,
    /// supplying `password` to the reader.
    #[inline]
    pub async fn load_metadata_from_with_password<R: AsyncRead>(
        source: R,
        password: &str,
    ) -> Result<PdfMetadata> {
        Self::load_metadata_internal(source, None, Some(password.to_string())).await
    }

    /// Reads only the metadata of a PDF held in an in-memory byte slice.
    #[inline]
    pub fn load_metadata_mem(buffer: &[u8]) -> Result<PdfMetadata> {
        Reader {
            buffer,
            document: Document::new(),
            encryption_state: None,
            raw_objects: BTreeMap::new(),
            password: None,
            options: LoadOptions::default(),
        }
        .read_metadata()
    }

    /// Reads only the metadata of a PDF held in an in-memory byte slice,
    /// supplying `password` to the reader.
    #[inline]
    pub fn load_metadata_mem_with_password(buffer: &[u8], password: &str) -> Result<PdfMetadata> {
        Reader {
            buffer,
            document: Document::new(),
            encryption_state: None,
            raw_objects: BTreeMap::new(),
            password: Some(password.to_string()),
            options: LoadOptions::default(),
        }
        .read_metadata()
    }

    /// Reads `source` to its end and extracts metadata with default [`LoadOptions`].
    ///
    /// `capacity` is an optional pre-allocation hint for the read buffer.
    async fn load_metadata_internal<R: AsyncRead>(
        source: R,
        capacity: Option<usize>,
        password: Option<String>,
    ) -> Result<PdfMetadata> {
        pin!(source);

        let mut buffer = capacity.map(Vec::with_capacity).unwrap_or_default();
        source.read_to_end(&mut buffer).await?;

        Reader {
            buffer: &buffer,
            document: Document::new(),
            encryption_state: None,
            raw_objects: BTreeMap::new(),
            password,
            options: LoadOptions::default(),
        }
        .read_metadata()
    }
}
405
406impl TryInto<Document> for &[u8] {
407 type Error = Error;
408
409 fn try_into(self) -> Result<Document> {
410 Reader {
411 buffer: self,
412 document: Document::new(),
413 encryption_state: None,
414 raw_objects: BTreeMap::new(),
415 password: None,
416 options: LoadOptions::default(),
417 }
418 .read(None)
419 }
420}
421
422#[cfg(not(feature = "async"))]
423impl IncrementalDocument {
424 #[inline]
426 pub fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
427 let file = File::open(path)?;
428 let capacity = Some(file.metadata()?.len() as usize);
429 Self::load_internal(file, capacity)
430 }
431
432 #[inline]
434 pub fn load_from<R: Read>(source: R) -> Result<Self> {
435 Self::load_internal(source, None)
436 }
437
438 fn load_internal<R: Read>(mut source: R, capacity: Option<usize>) -> Result<Self> {
439 let mut buffer = capacity.map(Vec::with_capacity).unwrap_or_default();
440 source.read_to_end(&mut buffer)?;
441
442 let document = Reader {
443 buffer: &buffer,
444 document: Document::new(),
445 encryption_state: None,
446 raw_objects: BTreeMap::new(),
447 password: None,
448 options: LoadOptions::default(),
449 }
450 .read(None)?;
451
452 Ok(IncrementalDocument::create_from(buffer, document))
453 }
454
455 pub fn load_mem(buffer: &[u8]) -> Result<Document> {
457 buffer.try_into()
458 }
459}
460
#[cfg(feature = "async")]
impl IncrementalDocument {
    /// Loads a PDF from the file at `path` for incremental updating.
    ///
    /// # Errors
    /// Returns an error if the file cannot be opened or read, or if parsing fails.
    #[inline]
    pub async fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
        let file = File::open(path).await?;
        let metadata = file.metadata().await?;
        // The length is only a pre-allocation hint: drop it when it does not fit
        // in `usize` rather than truncating it with an `as` cast.
        let capacity = usize::try_from(metadata.len()).ok();
        Self::load_internal(file, capacity).await
    }

    /// Loads a PDF from an arbitrary async reader for incremental updating.
    #[inline]
    pub async fn load_from<R: AsyncRead>(source: R) -> Result<Self> {
        Self::load_internal(source, None).await
    }

    /// Reads `source` to its end, parses it, and pairs the parsed document with
    /// the raw bytes it was read from.
    async fn load_internal<R: AsyncRead>(source: R, capacity: Option<usize>) -> Result<Self> {
        // Pin the source on the stack so `read_to_end` can be called on it.
        pin!(source);

        let mut buffer = capacity.map(Vec::with_capacity).unwrap_or_default();
        source.read_to_end(&mut buffer).await?;

        let document = Reader {
            buffer: &buffer,
            document: Document::new(),
            encryption_state: None,
            raw_objects: BTreeMap::new(),
            password: None,
            options: LoadOptions::default(),
        }
        .read(None)?;

        Ok(IncrementalDocument::create_from(buffer, document))
    }

    /// Loads a PDF document from an in-memory byte slice.
    ///
    // NOTE(review): this returns a plain `Document`, not `Self`, even though
    // `TryInto<IncrementalDocument>` exists for `&[u8]`. Changing the return
    // type would break callers, so confirm the intended type before touching it.
    pub fn load_mem(buffer: &[u8]) -> Result<Document> {
        buffer.try_into()
    }
}
502
503impl TryInto<IncrementalDocument> for &[u8] {
504 type Error = Error;
505
506 fn try_into(self) -> Result<IncrementalDocument> {
507 let document = Reader {
508 buffer: self,
509 document: Document::new(),
510 encryption_state: None,
511 raw_objects: BTreeMap::new(),
512 password: None,
513 options: LoadOptions::default(),
514 }
515 .read(None)?;
516
517 Ok(IncrementalDocument::create_from(self.to_vec(), document))
518 }
519}
520
/// Parsing state for loading a PDF from a byte buffer.
///
/// A `Reader` is consumed by [`Reader::read`] or [`Reader::read_metadata`].
pub struct Reader<'a> {
    /// Bytes being parsed. `read`/`read_metadata` re-slice this to start at the
    /// `%PDF-` header when one is found.
    pub buffer: &'a [u8],
    /// Document being populated while reading.
    pub document: Document,
    /// Decryption state; when set, `get_object` decrypts objects it reads.
    pub encryption_state: Option<EncryptionState>,
    /// Raw bytes of indirect objects keyed by object id, collected when loading
    /// an encrypted document.
    pub raw_objects: BTreeMap<ObjectId, Vec<u8>>,
    /// Password supplied by the `*_with_password` loaders, if any.
    pub password: Option<String>,
    /// Load options (for example whether object streams are expanded lazily).
    pub options: LoadOptions,
}
529
/// Upper bound of 100 used elsewhere in the crate.
// NOTE(review): not referenced in this part of the file; presumably a nesting
// limit enforced by the parser — confirm against its users.
pub const MAX_BRACKET: usize = 100;
532
/// Metadata extracted by [`Reader::read_metadata`].
///
/// The optional text fields come from the trailer's `Info` dictionary and are
/// `None` when the entry is missing or is not a string.
#[derive(Debug, Clone)]
pub struct PdfMetadata {
    /// `Title` entry of the Info dictionary.
    pub title: Option<String>,
    /// `Author` entry of the Info dictionary.
    pub author: Option<String>,
    /// `Subject` entry of the Info dictionary.
    pub subject: Option<String>,
    /// `Keywords` entry of the Info dictionary.
    pub keywords: Option<String>,
    /// `Creator` entry of the Info dictionary.
    pub creator: Option<String>,
    /// `Producer` entry of the Info dictionary.
    pub producer: Option<String>,
    /// `CreationDate` entry of the Info dictionary, as the raw decoded string.
    pub creation_date: Option<String>,
    /// `ModDate` entry of the Info dictionary, as the raw decoded string.
    pub modification_date: Option<String>,
    /// Number of pages found via the catalog's page tree; 0 when it cannot be determined.
    pub page_count: u32,
    /// Version string parsed from the file header.
    pub version: String,
}
558
/// Text fields read from the trailer's `Info` dictionary.
///
/// Internal intermediate result of `extract_info_metadata`; its fields are
/// copied one-to-one into [`PdfMetadata`].
struct InfoMetadata {
    // `Title` entry.
    title: Option<String>,
    // `Author` entry.
    author: Option<String>,
    // `Subject` entry.
    subject: Option<String>,
    // `Keywords` entry.
    keywords: Option<String>,
    // `Creator` entry.
    creator: Option<String>,
    // `Producer` entry.
    producer: Option<String>,
    // `CreationDate` entry.
    creation_date: Option<String>,
    // `ModDate` entry.
    modification_date: Option<String>,
}
569
570impl Reader<'_> {
571 pub fn read_metadata(mut self) -> Result<PdfMetadata> {
576 let offset = self
577 .buffer
578 .windows(5)
579 .position(|w| w == b"%PDF-")
580 .unwrap_or(0);
581 self.buffer = &self.buffer[offset..];
582
583 let version = parser::header(ParserInput::new_extra(self.buffer, "header"))
584 .ok_or(ParseError::InvalidFileHeader)?;
585
586 let xref_start = Self::get_xref_start(self.buffer)?;
587 if xref_start > self.buffer.len() {
588 return Err(Error::Xref(XrefError::Start));
589 }
590
591 let (mut xref, mut trailer) = parser::xref_and_trailer(
592 ParserInput::new_extra(&self.buffer[xref_start..], "xref"),
593 &self,
594 )?;
595
596 let mut already_seen = HashSet::new();
597 let mut prev_xref_start = trailer.remove(b"Prev");
598 while let Some(prev) = prev_xref_start.and_then(|offset| offset.as_i64().ok()) {
599 if already_seen.contains(&prev) {
600 break;
601 }
602 already_seen.insert(prev);
603 if prev < 0 || prev as usize > self.buffer.len() {
604 return Err(Error::Xref(XrefError::PrevStart));
605 }
606
607 let (prev_xref, prev_trailer) = parser::xref_and_trailer(
608 ParserInput::new_extra(&self.buffer[prev as usize..], ""),
609 &self,
610 )?;
611 xref.merge(prev_xref);
612
613 let prev_xref_stream_start = trailer.remove(b"XRefStm");
614 if let Some(prev) = prev_xref_stream_start.and_then(|offset| offset.as_i64().ok()) {
615 if prev < 0 || prev as usize > self.buffer.len() {
616 return Err(Error::Xref(XrefError::StreamStart));
617 }
618
619 let (prev_xref, _) = parser::xref_and_trailer(
620 ParserInput::new_extra(&self.buffer[prev as usize..], ""),
621 &self,
622 )?;
623 xref.merge(prev_xref);
624 }
625
626 prev_xref_start = prev_trailer.get(b"Prev").cloned().ok();
627 }
628 let xref_entry_count = xref
629 .max_id()
630 .checked_add(1)
631 .ok_or(ParseError::InvalidXref)?;
632 if xref.size != xref_entry_count {
633 warn!(
634 "Size entry of trailer dictionary is {}, correct value is {}.",
635 xref.size, xref_entry_count
636 );
637 xref.size = xref_entry_count;
638 }
639
640 self.document.reference_table = xref;
641 self.document.trailer = trailer.clone();
642
643 if self.document.trailer.get(b"Encrypt").is_ok() {
644 self.setup_encryption_for_metadata()?;
645 }
646
647 let info_metadata = self.extract_info_metadata()?;
648 let page_count = self.extract_page_count()?;
649
650 Ok(PdfMetadata {
651 title: info_metadata.title,
652 author: info_metadata.author,
653 subject: info_metadata.subject,
654 keywords: info_metadata.keywords,
655 creator: info_metadata.creator,
656 producer: info_metadata.producer,
657 creation_date: info_metadata.creation_date,
658 modification_date: info_metadata.modification_date,
659 page_count,
660 version,
661 })
662 }
663
664 fn extract_info_metadata(&self) -> Result<InfoMetadata> {
665 let info_ref = match self.document.trailer.get(b"Info") {
666 Ok(obj) => obj.as_reference().ok(),
667 Err(_) => {
668 return Ok(InfoMetadata {
669 title: None,
670 author: None,
671 subject: None,
672 keywords: None,
673 creator: None,
674 producer: None,
675 creation_date: None,
676 modification_date: None,
677 });
678 }
679 };
680
681 let info_id = match info_ref {
682 Some(id) => id,
683 None => {
684 return Ok(InfoMetadata {
685 title: None,
686 author: None,
687 subject: None,
688 keywords: None,
689 creator: None,
690 producer: None,
691 creation_date: None,
692 modification_date: None,
693 });
694 }
695 };
696
697 let mut already_seen = HashSet::new();
698 let info_obj = match self.get_object(info_id, &mut already_seen) {
699 Ok(obj) => obj,
700 Err(_) => {
701 return Ok(InfoMetadata {
702 title: None,
703 author: None,
704 subject: None,
705 keywords: None,
706 creator: None,
707 producer: None,
708 creation_date: None,
709 modification_date: None,
710 });
711 }
712 };
713
714 let info_dict = match info_obj.as_dict() {
715 Ok(dict) => dict,
716 Err(_) => {
717 return Ok(InfoMetadata {
718 title: None,
719 author: None,
720 subject: None,
721 keywords: None,
722 creator: None,
723 producer: None,
724 creation_date: None,
725 modification_date: None,
726 });
727 }
728 };
729
730 Ok(InfoMetadata {
731 title: Self::extract_string_field(info_dict, b"Title"),
732 author: Self::extract_string_field(info_dict, b"Author"),
733 subject: Self::extract_string_field(info_dict, b"Subject"),
734 keywords: Self::extract_string_field(info_dict, b"Keywords"),
735 creator: Self::extract_string_field(info_dict, b"Creator"),
736 producer: Self::extract_string_field(info_dict, b"Producer"),
737 creation_date: Self::extract_string_field(info_dict, b"CreationDate"),
738 modification_date: Self::extract_string_field(info_dict, b"ModDate"),
739 })
740 }
741
742 fn extract_string_field(dict: &Dictionary, key: &[u8]) -> Option<String> {
743 match dict.get(key) {
744 Ok(obj) => match obj {
745 Object::String(bytes, _) => {
746 let s = if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
747 let utf16_bytes: Vec<u16> = bytes[2..]
748 .chunks_exact(2)
749 .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]]))
750 .collect();
751 String::from_utf16_lossy(&utf16_bytes)
752 } else {
753 String::from_utf8_lossy(bytes).to_string()
754 };
755 Some(s)
756 }
757 _ => None,
758 },
759 Err(_) => None,
760 }
761 }
762
763 fn extract_page_count(&self) -> Result<u32> {
764 let root_ref = match self
765 .document
766 .trailer
767 .get(b"Root")
768 .and_then(Object::as_reference)
769 {
770 Ok(id) => id,
771 Err(_) => return Ok(0),
772 };
773
774 let mut already_seen = HashSet::new();
775 let catalog_obj = match self.get_object(root_ref, &mut already_seen) {
776 Ok(obj) => obj,
777 Err(_) => return Ok(0),
778 };
779
780 let catalog_dict = match catalog_obj.as_dict() {
781 Ok(dict) => dict,
782 Err(_) => return Ok(0),
783 };
784
785 let pages_ref = match catalog_dict.get(b"Pages").and_then(Object::as_reference) {
786 Ok(id) => id,
787 Err(_) => return Ok(0),
788 };
789
790 self.get_pages_tree_count(pages_ref, &mut HashSet::new())
791 .or(Ok(0))
792 }
793
794 fn get_pages_tree_count(
795 &self,
796 pages_id: ObjectId,
797 seen: &mut HashSet<ObjectId>,
798 ) -> Result<u32> {
799 if seen.contains(&pages_id) {
800 return Err(Error::ReferenceCycle(pages_id));
801 }
802 seen.insert(pages_id);
803
804 let mut already_seen = HashSet::new();
805 let pages_obj = match self.get_object(pages_id, &mut already_seen) {
806 Ok(obj) => obj,
807 Err(_) => return Ok(0),
808 };
809
810 let pages_dict = match pages_obj.as_dict() {
811 Ok(dict) => dict,
812 Err(_) => return Ok(0),
813 };
814
815 match pages_dict.get_type() {
816 Ok(type_name) if type_name == b"Page" => Ok(1),
817 Ok(type_name) if type_name == b"Pages" => {
818 if let Ok(count_obj) = pages_dict.get(b"Count") {
819 if let Ok(count) = count_obj.as_i64() {
820 if count >= 0 {
821 return Ok(count as u32);
822 }
823 }
824 }
825
826 let kids = match pages_dict.get(b"Kids").and_then(Object::as_array) {
827 Ok(arr) => arr,
828 Err(_) => return Ok(0),
829 };
830
831 let mut total = 0u32;
832 for kid in kids.iter() {
833 if let Ok(kid_ref) = kid.as_reference() {
834 if let Ok(count) = self.get_pages_tree_count(kid_ref, seen) {
835 total += count;
836 }
837 }
838 }
839 Ok(total)
840 }
841 _ => Ok(1),
842 }
843 }
844
    /// Parses the whole buffer into a [`Document`].
    ///
    /// Steps: locate and parse the header, pick up the optional binary mark,
    /// parse the cross-reference section at the start offset, merge every older
    /// section reachable through `Prev`/`XRefStm`, then load the objects
    /// (through the encrypted path when the trailer has an `Encrypt` entry).
    ///
    /// `filter_func`, when given, is handed to the object-loading step.
    ///
    /// # Errors
    /// Returns an error when the header is invalid, when a cross-reference
    /// offset lies outside the buffer, when a cross-reference section cannot be
    /// parsed, or when object loading fails.
    pub fn read(mut self, filter_func: Option<FilterFunc>) -> Result<Document> {
        // Skip anything that precedes the "%PDF-" marker; if the marker is
        // absent, header parsing below reports the error.
        let offset = self
            .buffer
            .windows(5)
            .position(|w| w == b"%PDF-")
            .unwrap_or(0);
        self.buffer = &self.buffer[offset..];

        let version = parser::header(ParserInput::new_extra(self.buffer, "header"))
            .ok_or(ParseError::InvalidFileHeader)?;

        // The binary mark, if any, is looked for right after the first newline.
        if let Some(pos) = self.buffer.iter().position(|&byte| byte == b'\n') {
            if let Some(binary_mark) = parser::binary_mark(ParserInput::new_extra(
                &self.buffer[pos + 1..],
                "binary_mark",
            )) {
                // Only keep it when every byte has the high bit set.
                if binary_mark.iter().all(|&byte| byte >= 128) {
                    self.document.binary_mark = binary_mark;
                }
            }
        }

        let xref_start = Self::get_xref_start(self.buffer)?;
        if xref_start > self.buffer.len() {
            return Err(Error::Xref(XrefError::Start));
        }
        self.document.xref_start = xref_start;

        let (mut xref, mut trailer) = parser::xref_and_trailer(
            ParserInput::new_extra(&self.buffer[xref_start..], "xref"),
            &self,
        )?;

        // Follow the chain of `Prev` offsets, merging each older
        // cross-reference section. `already_seen` stops a chain that loops
        // back on itself.
        let mut already_seen = HashSet::new();
        let mut prev_xref_start = trailer.remove(b"Prev");
        while let Some(prev) = prev_xref_start.and_then(|offset| offset.as_i64().ok()) {
            if already_seen.contains(&prev) {
                break;
            }
            already_seen.insert(prev);
            if prev < 0 || prev as usize > self.buffer.len() {
                return Err(Error::Xref(XrefError::PrevStart));
            }

            let (prev_xref, prev_trailer) = parser::xref_and_trailer(
                ParserInput::new_extra(&self.buffer[prev as usize..], ""),
                &self,
            )?;
            xref.merge(prev_xref);

            // An `XRefStm` entry points at an additional cross-reference stream.
            // NOTE(review): this is taken from the newest `trailer`, not from
            // `prev_trailer`, so it can only fire once — confirm intended.
            let prev_xref_stream_start = trailer.remove(b"XRefStm");
            if let Some(prev) = prev_xref_stream_start.and_then(|offset| offset.as_i64().ok()) {
                if prev < 0 || prev as usize > self.buffer.len() {
                    return Err(Error::Xref(XrefError::StreamStart));
                }

                let (prev_xref, _) = parser::xref_and_trailer(
                    ParserInput::new_extra(&self.buffer[prev as usize..], ""),
                    &self,
                )?;
                xref.merge(prev_xref);
            }

            prev_xref_start = prev_trailer.get(b"Prev").cloned().ok();
        }
        // Trust the entries actually seen over the declared `Size`.
        let xref_entry_count = xref
            .max_id()
            .checked_add(1)
            .ok_or(ParseError::InvalidXref)?;
        if xref.size != xref_entry_count {
            warn!(
                "Size entry of trailer dictionary is {}, correct value is {}.",
                xref.size, xref_entry_count
            );
            xref.size = xref_entry_count;
        }

        self.document.version = version;
        // `xref.size` equals `max_id + 1` at this point, so this cannot underflow.
        self.document.max_id = xref.size - 1;
        self.document.trailer = trailer;
        self.document.reference_table = xref;

        let is_encrypted = self.document.trailer.get(b"Encrypt").is_ok();

        if is_encrypted {
            self.load_encrypted_document(filter_func)?;
        } else {
            self.load_objects_raw(filter_func)?;
        }

        Ok(self.document)
    }
946
    /// Loads the objects of a document whose trailer has an `Encrypt` entry.
    ///
    /// Phases:
    /// 1. Collect the raw bytes of every normal (uncompressed) object and note
    ///    which objects live in object streams.
    /// 2. Parse the encryption dictionary and authenticate.
    /// 3. Parse and decrypt each raw object, then pull compressed objects out
    ///    of their (now decrypted) container streams.
    /// 4. Store the encryption state on the document and drop the `Encrypt`
    ///    entry and object.
    ///
    /// The filter argument is accepted for signature parity but is not applied
    /// on this path (note the leading underscore).
    ///
    /// # Errors
    /// Propagates errors from parsing the encryption dictionary and from
    /// authentication. Per-object extraction, parsing and decryption failures
    /// are skipped silently.
    fn load_encrypted_document(&mut self, _filter_func: Option<FilterFunc>) -> Result<()> {
        // Snapshot the entries so `self` can be mutated while iterating.
        let entries: Vec<_> = self
            .document
            .reference_table
            .entries
            .iter()
            .map(|(k, v)| (*k, v.clone()))
            .collect();

        // (object number, container object number, index) of compressed objects.
        let mut object_streams = Vec::new();

        for (obj_num, entry) in entries {
            match entry {
                XrefEntry::Normal { offset, .. } => {
                    // Objects whose raw bytes cannot be extracted are skipped.
                    if let Ok((obj_id, raw_bytes)) = self.extract_raw_object(offset as usize) {
                        self.raw_objects.insert(obj_id, raw_bytes);
                    }
                }
                XrefEntry::Compressed { container, index } => {
                    object_streams.push((obj_num, container, index));
                }
                XrefEntry::Free | XrefEntry::UnusableFree => {
                    // Free entries carry no object.
                }
            }
        }

        self.parse_encryption_dictionary()?;

        // If authentication yields `None`, stop here without decrypting anything.
        if self.authenticate_and_setup_encryption(false)?.is_none() {
            return Ok(());
        }

        if let Some(ref state) = self.encryption_state {
            let encrypt_ref = self
                .document
                .trailer
                .get(b"Encrypt")
                .ok()
                .and_then(|o| o.as_reference().ok());

            for (obj_id, raw_bytes) in &self.raw_objects {
                // The encryption dictionary itself is not decrypted or re-inserted here.
                if let Some(enc_ref) = encrypt_ref {
                    if *obj_id == enc_ref {
                        continue;
                    }
                }

                if let Ok((id, mut obj)) = self.parse_raw_object(raw_bytes) {
                    // Decryption failures are ignored; the object is stored as parsed.
                    let _ = encryption::decrypt_object(state, *obj_id, &mut obj);
                    self.document.objects.insert(id, obj);
                }
            }

            // Group compressed objects by the stream that contains them so each
            // container is expanded once.
            let mut streams_to_process: std::collections::HashMap<u32, Vec<(u32, u16)>> =
                std::collections::HashMap::new();
            for (obj_num, container_id, index) in object_streams {
                streams_to_process
                    .entry(container_id)
                    .or_default()
                    .push((obj_num, index));
            }

            for (container_id, objects_in_stream) in streams_to_process {
                // Containers are looked up with generation 0.
                if let Some(container_obj) = self.document.objects.get_mut(&(container_id, 0)) {
                    if let Ok(stream) = container_obj.as_stream_mut() {
                        match ObjectStream::new(stream) {
                            Ok(object_stream) => {
                                for (obj_num, _index) in objects_in_stream {
                                    let obj_id = (obj_num, 0);
                                    if let Some(obj) = object_stream.objects.get(&obj_id) {
                                        self.document.objects.insert(obj_id, obj.clone());
                                    }
                                }
                            }
                            // A container that cannot be expanded is skipped.
                            Err(_e) => {}
                        }
                    }
                }
            }

            self.document.encryption_state = Some(state.clone());

            // The loaded document no longer carries the encryption dictionary.
            if let Some(enc_ref) = encrypt_ref {
                self.document.objects.remove(&enc_ref);
            }
            self.document.trailer.remove(b"Encrypt");
        }

        Ok(())
    }
1040
1041 fn parse_raw_object(&self, raw_bytes: &[u8]) -> Result<(ObjectId, Object)> {
1042 parser::indirect_object(
1044 ParserInput::new_extra(raw_bytes, "indirect object"),
1045 0,
1046 None,
1047 self,
1048 &mut HashSet::new(),
1049 )
1050 }
1051
    /// Loads every object that has a normal cross-reference entry into
    /// `self.document.objects`.
    ///
    /// Object streams (`ObjStm`) are either recorded for later expansion (when
    /// `options.lazy_objstm` is set) or expanded immediately, in which case
    /// the container itself is not stored. Streams that were parsed with empty
    /// content get their content read afterwards from the `Length` entry.
    ///
    /// `filter_func`, when given, is applied to every top-level object and to
    /// every object expanded out of an object stream; returning `None` drops
    /// that object.
    ///
    /// Objects that fail to parse are logged and skipped; this function itself
    /// always returns `Ok(())`.
    fn load_objects_raw(&mut self, filter_func: Option<FilterFunc>) -> Result<()> {
        let is_encrypted = self.document.trailer.get(b"Encrypt").is_ok();
        // Accumulators sit behind `Mutex` so the closure below can record into
        // them through shared references (it runs under `par_iter` when the
        // `rayon` feature is enabled).
        let zero_length_streams = Mutex::new(vec![]);
        let object_streams = Mutex::new(vec![]);
        let pending_obj_stream_ids: Mutex<Vec<ObjectId>> = Mutex::new(vec![]);
        let lazy_objstm = self.options.lazy_objstm;

        // Maps one cross-reference entry to the object it describes, or `None`
        // when the entry is not a normal entry or the object is dropped.
        let entries_filter_map = |(_, entry): (&_, &_)| {
            if let XrefEntry::Normal { offset, .. } = *entry {
                let result = self.read_object(offset as usize, None, &mut HashSet::new());
                let (object_id, mut object) = match result {
                    Ok(obj) => obj,
                    Err(e) => {
                        if is_encrypted {
                            warn!("Skipping encrypted object at offset {}: {:?}", offset, e);
                        } else {
                            error!("Object load error at offset {}: {e:?}", offset);
                        }
                        return None;
                    }
                };
                if let Some(filter_func) = filter_func {
                    // `?` drops the object when the filter returns `None`; the
                    // filter may also have modified `object` in place.
                    filter_func(object_id, &mut object)?;
                }

                if let Ok(ref mut stream) = object.as_stream_mut() {
                    if stream.dict.has_type(b"ObjStm") && !is_encrypted {
                        if lazy_objstm {
                            // Keep the container object and remember it for
                            // later expansion.
                            pending_obj_stream_ids.lock().unwrap().push(object_id);
                        } else {
                            if let Ok(obj_stream) = ObjectStream::new(stream) {
                                let container_id = object_id;
                                // Keep only the nested objects that the
                                // reference table assigns to this container.
                                let owned_objects = obj_stream.objects.into_iter().filter(
                                    |(nested_object_id, _)| {
                                        self.document.reference_table.compressed_object_belongs_to(
                                            *nested_object_id,
                                            container_id,
                                        )
                                    },
                                );
                                let mut object_streams = object_streams.lock().unwrap();
                                if let Some(filter_func) = filter_func {
                                    let objects: BTreeMap<(u32, u16), Object> = owned_objects
                                        .filter_map(|(object_id, mut object)| {
                                            filter_func(object_id, &mut object)
                                        })
                                        .collect();
                                    object_streams.extend(objects);
                                } else {
                                    object_streams.extend(owned_objects);
                                }
                            }
                            // The container itself is not stored, whether or
                            // not it could be expanded.
                            return None;
                        }
                    } else if stream.content.is_empty() {
                        // Content is filled in after all objects are loaded.
                        let mut zero_length_streams = zero_length_streams.lock().unwrap();
                        zero_length_streams.push(object_id);
                    }
                }

                Some((object_id, object))
            } else {
                None
            }
        };

        #[cfg(feature = "rayon")]
        {
            self.document.objects = self
                .document
                .reference_table
                .entries
                .par_iter()
                .filter_map(entries_filter_map)
                .collect();
        }
        #[cfg(not(feature = "rayon"))]
        {
            self.document.objects = self
                .document
                .reference_table
                .entries
                .iter()
                .filter_map(entries_filter_map)
                .collect();
        }

        // Objects expanded from object streams never replace an object that
        // was already loaded under the same id.
        for (id, entry) in object_streams.into_inner().unwrap() {
            self.document.objects.entry(id).or_insert(entry);
        }

        // Best effort: a stream whose content cannot be read stays empty.
        for object_id in zero_length_streams.into_inner().unwrap() {
            let _ = self.read_stream_content(object_id);
        }

        self.document.pending_obj_streams = pending_obj_stream_ids.into_inner().unwrap();

        Ok(())
    }
1170
1171 fn read_stream_content(&mut self, object_id: ObjectId) -> Result<()> {
1172 let length = self.get_stream_length(object_id)?;
1173 let stream = self
1174 .document
1175 .get_object_mut(object_id)
1176 .and_then(Object::as_stream_mut)?;
1177 let start = stream
1178 .start_position
1179 .ok_or(Error::InvalidStream("missing start position".to_string()))?;
1180
1181 if length < 0 {
1182 return Err(Error::InvalidStream("negative stream length.".to_string()));
1183 }
1184
1185 let length = usize::try_from(length).map_err(|e| Error::NumericCast(e.to_string()))?;
1186 let end = start + length;
1187
1188 if end > self.buffer.len() {
1189 return Err(Error::InvalidStream(
1190 "stream extends after document end.".to_string(),
1191 ));
1192 }
1193
1194 stream.set_content(self.buffer[start..end].to_vec());
1195 Ok(())
1196 }
1197
1198 fn get_stream_length(&self, object_id: ObjectId) -> Result<i64> {
1199 let object = self.document.get_object(object_id)?;
1200 let stream = object.as_stream()?;
1201 stream
1202 .dict
1203 .get(b"Length")
1204 .and_then(|value| self.document.dereference(value))
1205 .and_then(|(_id, obj)| obj.as_i64())
1206 .inspect_err(|_err| {
1207 error!(
1208 "stream dictionary of '{} {} R' is missing the Length entry",
1209 object_id.0, object_id.1
1210 );
1211 })
1212 }
1213
1214 fn get_offset(&self, id: ObjectId) -> Result<u32> {
1216 let entry = self
1217 .document
1218 .reference_table
1219 .get(id.0)
1220 .ok_or(Error::MissingXrefEntry)?;
1221 match *entry {
1222 XrefEntry::Normal { offset, generation } if generation == id.1 => Ok(offset),
1223 _ => Err(Error::MissingXrefEntry),
1224 }
1225 }
1226
1227 fn get_compressed_object(&self, id: ObjectId) -> Result<Object> {
1229 let entry = self
1230 .document
1231 .reference_table
1232 .get(id.0)
1233 .ok_or(Error::MissingXrefEntry)?;
1234
1235 let container_id = match entry {
1236 XrefEntry::Compressed { container, .. } => *container,
1237 _ => return Err(Error::MissingXrefEntry),
1238 };
1239
1240 let container_id = (container_id, 0);
1241 let mut already_seen = HashSet::new();
1242 let container_obj = self.get_object(container_id, &mut already_seen)?;
1243 let mut container_stream = container_obj.as_stream()?.clone();
1244 let object_stream = ObjectStream::new(&mut container_stream)?;
1245 object_stream
1246 .objects
1247 .get(&id)
1248 .cloned()
1249 .ok_or(Error::MissingXrefEntry)
1250 }
1251
1252 pub fn get_object(&self, id: ObjectId, already_seen: &mut HashSet<ObjectId>) -> Result<Object> {
1253 if already_seen.contains(&id) {
1254 warn!(
1255 "reference cycle detected resolving object {} {}",
1256 id.0, id.1
1257 );
1258 return Err(Error::ReferenceCycle(id));
1259 }
1260 already_seen.insert(id);
1261
1262 if let Some(entry) = self.document.reference_table.get(id.0) {
1263 if matches!(entry, XrefEntry::Compressed { .. }) {
1264 return self.get_compressed_object(id);
1265 }
1266 }
1267
1268 let offset = self.get_offset(id)?;
1269 let (_, mut obj) = self.read_object(offset as usize, Some(id), already_seen)?;
1270
1271 if let Some(ref state) = self.encryption_state {
1272 let encrypt_ref = self
1273 .document
1274 .trailer
1275 .get(b"Encrypt")
1276 .ok()
1277 .and_then(|o| o.as_reference().ok());
1278 if let Some(enc_ref) = encrypt_ref {
1279 if id != enc_ref {
1280 encryption::decrypt_object(state, id, &mut obj).map_err(Error::Decryption)?;
1281 }
1282 }
1283 }
1284
1285 Ok(obj)
1286 }
1287
1288 fn parse_encryption_dictionary(&mut self) -> Result<()> {
1289 if let Ok(encrypt_ref) = self
1290 .document
1291 .trailer
1292 .get(b"Encrypt")
1293 .and_then(|o| o.as_reference())
1294 {
1295 if self.raw_objects.is_empty() {
1296 let offset = self.get_offset(encrypt_ref)?;
1297 let (_, encrypt_obj) =
1298 self.read_object(offset as usize, Some(encrypt_ref), &mut HashSet::new())?;
1299 self.document.objects.insert(encrypt_ref, encrypt_obj);
1300 } else if let Some(raw_bytes) = self.raw_objects.get(&encrypt_ref) {
1301 if let Ok((_, obj)) = self.parse_raw_object(raw_bytes) {
1302 self.document.objects.insert(encrypt_ref, obj);
1303 }
1304 }
1305 }
1306 Ok(())
1307 }
1308
1309 fn authenticate_and_setup_encryption(
1310 &mut self,
1311 require_password: bool,
1312 ) -> Result<Option<String>> {
1313 let password_to_use: Option<String> = if self.document.authenticate_password("").is_ok() {
1314 Some(String::new())
1315 } else if let Some(ref pwd) = self.password {
1316 if self.document.authenticate_password(pwd).is_ok() {
1317 Some(pwd.clone())
1318 } else if require_password {
1319 return Err(Error::InvalidPassword);
1320 } else {
1321 warn!("Invalid password provided for encrypted PDF");
1322 return Err(Error::InvalidPassword);
1323 }
1324 } else if require_password {
1325 return Err(Error::Unimplemented(
1326 "PDF is encrypted and requires a password. Use Document::load_metadata_with_password() instead.",
1327 ));
1328 } else {
1329 warn!("PDF is encrypted and requires a password");
1330 return Ok(None);
1331 };
1332
1333 if let Some(ref password) = password_to_use {
1334 let state = EncryptionState::decode(&self.document, password)?;
1335 self.encryption_state = Some(state);
1336 }
1337
1338 Ok(password_to_use)
1339 }
1340
1341 fn setup_encryption_for_metadata(&mut self) -> Result<()> {
1342 self.parse_encryption_dictionary()?;
1343 self.authenticate_and_setup_encryption(true)?;
1344 Ok(())
1345 }
1346
1347 fn extract_raw_object(&mut self, offset: usize) -> Result<(ObjectId, Vec<u8>)> {
1348 if offset > self.buffer.len() {
1349 return Err(Error::InvalidOffset(offset));
1350 }
1351
1352 let slice = &self.buffer[offset..];
1354
1355 let mut pos = 0;
1357 while pos < slice.len() && slice[pos].is_ascii_whitespace() {
1358 pos += 1;
1359 }
1360
1361 let num_start = pos;
1363 while pos < slice.len() && slice[pos].is_ascii_digit() {
1364 pos += 1;
1365 }
1366 let obj_num: u32 = std::str::from_utf8(&slice[num_start..pos])
1367 .ok()
1368 .and_then(|s| s.parse().ok())
1369 .ok_or(Error::Parse(ParseError::InvalidXref))?;
1370
1371 while pos < slice.len() && slice[pos].is_ascii_whitespace() {
1373 pos += 1;
1374 }
1375
1376 let gen_start = pos;
1378 while pos < slice.len() && slice[pos].is_ascii_digit() {
1379 pos += 1;
1380 }
1381 let obj_gen: u16 = std::str::from_utf8(&slice[gen_start..pos])
1382 .ok()
1383 .and_then(|s| s.parse().ok())
1384 .ok_or(Error::Parse(ParseError::InvalidXref))?;
1385
1386 while pos < slice.len() && slice[pos].is_ascii_whitespace() {
1388 pos += 1;
1389 }
1390 if pos + 3 > slice.len() || &slice[pos..pos + 3] != b"obj" {
1391 return Err(Error::Parse(ParseError::InvalidXref));
1392 }
1393 pos += 3;
1394
1395 let endobj_pattern = b"endobj";
1397 let mut end_pos = pos;
1398 while end_pos + endobj_pattern.len() <= slice.len() {
1399 if &slice[end_pos..end_pos + endobj_pattern.len()] == endobj_pattern {
1400 end_pos += endobj_pattern.len();
1401 break;
1402 }
1403 end_pos += 1;
1404 }
1405
1406 if end_pos > slice.len() {
1407 return Err(Error::Parse(ParseError::InvalidXref));
1408 }
1409
1410 let raw_bytes = slice[0..end_pos].to_vec();
1412
1413 Ok(((obj_num, obj_gen), raw_bytes))
1414 }
1415
1416 fn read_object(
1417 &self,
1418 offset: usize,
1419 expected_id: Option<ObjectId>,
1420 already_seen: &mut HashSet<ObjectId>,
1421 ) -> Result<(ObjectId, Object)> {
1422 if offset > self.buffer.len() {
1423 return Err(Error::InvalidOffset(offset));
1424 }
1425
1426 parser::indirect_object(
1428 ParserInput::new_extra(self.buffer, "indirect object"),
1429 offset,
1430 expected_id,
1431 self,
1432 already_seen,
1433 )
1434 }
1435
1436 fn get_xref_start(buffer: &[u8]) -> Result<usize> {
1437 let seek_pos = buffer.len() - cmp::min(buffer.len(), 512);
1438 Self::search_substring(buffer, b"%%EOF", seek_pos)
1439 .and_then(|eof_pos| if eof_pos > 25 { Some(eof_pos) } else { None })
1440 .and_then(|eof_pos| Self::search_substring(buffer, b"startxref", eof_pos - 25))
1441 .ok_or(Error::Xref(XrefError::Start))
1442 .and_then(|xref_pos| {
1443 if xref_pos <= buffer.len() {
1444 match parser::xref_start(ParserInput::new_extra(&buffer[xref_pos..], "xref")) {
1445 Some(startxref) => Ok(startxref as usize),
1446 None => Err(Error::Xref(XrefError::Start)),
1447 }
1448 } else {
1449 Err(Error::Xref(XrefError::Start))
1450 }
1451 })
1452 }
1453
1454 fn search_substring(buffer: &[u8], pattern: &[u8], start_pos: usize) -> Option<usize> {
1455 buffer
1456 .get(start_pos..)?
1457 .windows(pattern.len())
1458 .rposition(|window| window == pattern)
1459 .map(|pos| start_pos + pos)
1460 }
1461}
1462
#[cfg(all(test, not(feature = "async")))]
#[test]
fn load_document() {
    let mut document = Document::load("assets/example.pdf").unwrap();
    assert_eq!(document.version, "1.5");

    // The loaded document must also be writable; a scratch directory keeps the tree clean.
    let scratch = tempfile::tempdir().unwrap();
    let target = scratch.path().join("test_2_load.pdf");
    document.save(target).unwrap();
}
1474
#[cfg(all(test, feature = "async"))]
#[tokio::test]
async fn load_document() {
    let mut document = Document::load("assets/example.pdf").await.unwrap();
    assert_eq!(document.version, "1.5");

    // The loaded document must also be writable; a scratch directory keeps the tree clean.
    let scratch = tempfile::tempdir().unwrap();
    let target = scratch.path().join("test_2_load.pdf");
    document.save(target).unwrap();
}
1486
#[test]
#[should_panic(expected = "Xref(Start)")]
fn load_short_document() {
    // A header followed directly by the end marker leaves no room for `startxref`.
    let too_short: &[u8] = b"%PDF-1.5\n%%EOF\n";
    let _document = Document::load_mem(too_short).unwrap();
}
1492
#[test]
fn load_document_with_preceding_bytes() {
    // Junk placed in front of the header must not prevent loading.
    let content = [
        &b"garbage"[..],
        &include_bytes!("../assets/example.pdf")[..],
    ]
    .concat();

    let document = Document::load_mem(&content).unwrap();
    assert_eq!(document.version, "1.5");
}
1501
#[test]
fn load_many_shallow_brackets() {
    // Number of content-stream bytes that surround the bracket text.
    const STREAM_CRUFT: usize = 33;

    // Many balanced pairs that never nest deeper than one level.
    let content = "()".repeat(MAX_BRACKET * 10);
    let body = format!(
        "%PDF-1.5
1 0 obj<</Type/Pages/Kids[5 0 R]/Count 1/Resources 3 0 R/MediaBox[0 0 595 842]>>endobj
2 0 obj<</Type/Font/Subtype/Type1/BaseFont/Courier>>endobj
3 0 obj<</Font<</F1 2 0 R>>>>endobj
5 0 obj<</Type/Page/Parent 1 0 R/Contents[4 0 R]>>endobj
6 0 obj<</Type/Catalog/Pages 1 0 R>>endobj
4 0 obj<</Length {}>>stream
BT
/F1 48 Tf
100 600 Td
({}) Tj
ET
endstream endobj\n",
        content.len() + STREAM_CRUFT,
        content
    );
    // The cross-reference table starts right after the body, hence `body.len()`.
    let pdf = format!(
        "{}xref
0 7
0000000000 65535 f
0000000009 00000 n
0000000096 00000 n
0000000155 00000 n
0000000291 00000 n
0000000191 00000 n
0000000248 00000 n
trailer
<</Root 6 0 R/Size 7>>
startxref
{}
%%EOF",
        body,
        body.len()
    );

    let _document = Document::load_mem(pdf.as_bytes()).unwrap();
}
1546
#[test]
fn load_too_deep_brackets() {
    // Number of content-stream bytes that surround the bracket text.
    const STREAM_CRUFT: usize = 33;

    // One nesting level more than `MAX_BRACKET`, opened and then closed again.
    let depth = MAX_BRACKET + 1;
    let content = format!("{}{}", "(".repeat(depth), ")".repeat(depth));
    let body = format!(
        "%PDF-1.5
1 0 obj<</Type/Pages/Kids[5 0 R]/Count 1/Resources 3 0 R/MediaBox[0 0 595 842]>>endobj
2 0 obj<</Type/Font/Subtype/Type1/BaseFont/Courier>>endobj
3 0 obj<</Font<</F1 2 0 R>>>>endobj
5 0 obj<</Type/Page/Parent 1 0 R/Contents[7 0 R 4 0 R]>>endobj
6 0 obj<</Type/Catalog/Pages 1 0 R>>endobj
7 0 obj<</Length 45>>stream
BT /F1 48 Tf 100 600 Td (Hello World!) Tj ET
endstream
endobj
4 0 obj<</Length {}>>stream
BT
/F1 48 Tf
100 600 Td
({}) Tj
ET
endstream endobj\n",
        content.len() + STREAM_CRUFT,
        content
    );
    // The cross-reference table starts right after the body, hence `body.len()`.
    let pdf = format!(
        "{}xref
0 7
0000000000 65535 f
0000000009 00000 n
0000000096 00000 n
0000000155 00000 n
0000000387 00000 n
0000000191 00000 n
0000000254 00000 n
0000000297 00000 n
trailer
<</Root 6 0 R/Size 7>>
startxref
{}
%%EOF",
        body,
        body.len()
    );

    let document = Document::load_mem(pdf.as_bytes()).unwrap();
    let page_numbers: Vec<_> = document.get_pages().keys().cloned().collect();
    let text = document.extract_text(&page_numbers).unwrap();
    assert_eq!("Hello World!\n", text);
}
1599
#[cfg(all(test, not(feature = "async")))]
#[test]
fn search_substring_finds_last_occurrence() {
    let find = |haystack: &[u8], needle: &[u8], from: usize| {
        Reader::search_substring(haystack, needle, from)
    };

    // Absent and single occurrences.
    assert_eq!(find(b"hello world", b"xyz", 0), None);
    assert_eq!(find(b"hello world", b"world", 0), Some(6));
    assert_eq!(find(b"%%EOF", b"%%EOF", 0), Some(0));

    // With two markers the later one wins, as long as the start position still covers it.
    let two_markers = b"%%EOF\ntest%%EOF\nend";
    assert_eq!(find(two_markers, b"%%EOF", 0), Some(10));
    assert_eq!(find(two_markers, b"%%EOF", 6), Some(10));
    assert_eq!(find(two_markers, b"%%EOF", 15), None);

    // Stray percent signs in front of the marker must not confuse the search.
    let many_percents = b"%%%PDF-1.3%%%comment%%%more%%EOF";
    assert_eq!(find(many_percents, b"%%EOF", 0), Some(27));
}
1621
/// Bytes of the bundled `assets/example.pdf`, shared by the `LoadOptions` tests.
#[cfg(all(test, not(feature = "async")))]
fn minimal_pdf_bytes() -> &'static [u8] {
    const EXAMPLE_PDF: &[u8] = include_bytes!("../assets/example.pdf");
    EXAMPLE_PDF
}
1630
#[cfg(all(test, not(feature = "async")))]
#[test]
fn load_with_options_accepts_normal_document() {
    let options = LoadOptions::new();
    let loaded = Document::load_mem_with_options(minimal_pdf_bytes(), &options);

    let document = loaded.expect("example.pdf should be accepted by default options");
    assert_eq!(document.version, "1.5");
}
1641
#[cfg(all(test, not(feature = "async")))]
#[test]
fn load_with_options_rejects_oversized_document() {
    let data = minimal_pdf_bytes();
    // A one-byte limit is smaller than any real document.
    let options = LoadOptions::new().max_file_bytes(1usize);

    let err = Document::load_mem_with_options(data, &options)
        .expect_err("document larger than 1 byte must be rejected");

    if let Error::DocumentTooLarge { size, limit } = &err {
        assert_eq!(*limit, 1);
        assert_eq!(*size, data.len());
    } else {
        panic!("expected DocumentTooLarge, got {err:?}");
    }
}
1658
#[cfg(all(test, not(feature = "async")))]
#[test]
fn load_with_options_unlimited() {
    // `None` switches the size limit off.
    let options = LoadOptions::new().max_file_bytes(None);
    let loaded = Document::load_mem_with_options(minimal_pdf_bytes(), &options);

    let document = loaded.expect("unlimited options must not reject documents");
    assert_eq!(document.version, "1.5");
}
1669
#[cfg(all(test, not(feature = "async")))]
#[test]
fn load_mem_with_options_lazy_objstm_no_objects_lost() {
    let data = minimal_pdf_bytes();

    // Reference: the same bytes loaded through the plain in-memory path.
    let eager = Document::load_mem(data).expect("eager load of example.pdf should succeed");

    let lazy_options = LoadOptions::new().lazy_objstm(true).max_file_bytes(None);
    let mut lazy = Document::load_mem_with_options(data, &lazy_options)
        .expect("lazy load of example.pdf should succeed");
    lazy.resolve_pending_object_streams()
        .expect("resolve_pending_object_streams should not fail on valid data");

    assert_eq!(
        lazy.objects.len(),
        eager.objects.len(),
        "after resolve, lazy doc must have same object count as eager doc"
    );
    assert!(
        lazy.pending_obj_streams.is_empty(),
        "pending_obj_streams must be empty after resolve"
    );
}
1702
#[cfg(all(test, not(feature = "async")))]
#[test]
fn resolve_pending_object_streams_skips_objects_reassigned_to_newer_container() {
    const OLD_CONTAINER: ObjectId = (10, 0);
    const NEW_CONTAINER: ObjectId = (20, 0);
    const SHARED_OBJECT: ObjectId = (7, 0);

    // Builds an uncompressed object stream that holds `SHARED_OBJECT` with the given value.
    let objstm_holding = |value: i64, failure: &str| -> Object {
        let mut stream = ObjectStream::builder().compression_level(0).build();
        stream
            .add_object(SHARED_OBJECT, Object::Integer(value))
            .expect(failure);
        Object::Stream(stream.to_stream_object().unwrap())
    };

    let mut doc = Document::new();
    // The cross-reference table assigns the shared object to the newer container.
    doc.reference_table.insert(
        SHARED_OBJECT.0,
        XrefEntry::Compressed {
            container: NEW_CONTAINER.0,
            index: 0,
        },
    );
    doc.objects.insert(
        OLD_CONTAINER,
        objstm_holding(1, "old ObjStm should accept object"),
    );
    doc.objects.insert(
        NEW_CONTAINER,
        objstm_holding(2, "new ObjStm should accept object"),
    );

    doc.pending_obj_streams = vec![OLD_CONTAINER, NEW_CONTAINER];
    doc.resolve_pending_object_streams()
        .expect("lazy ObjStm resolution should succeed");

    let value = doc
        .get_object(SHARED_OBJECT)
        .expect("object should resolve from the current ObjStm")
        .as_i64()
        .expect("resolved object should stay an integer");
    assert_eq!(value, 2);

    assert!(
        !doc.objects.contains_key(&OLD_CONTAINER),
        "old ObjStm container should be dropped after resolution"
    );
    assert!(
        !doc.objects.contains_key(&NEW_CONTAINER),
        "new ObjStm container should be dropped after resolution"
    );
}
1755
#[test]
fn load_options_builder() {
    const SIXTY_FOUR_MIB: usize = 64 * 1024 * 1024;

    // Explicit limit plus lazy object streams.
    let configured = LoadOptions::new()
        .max_file_bytes(SIXTY_FOUR_MIB)
        .lazy_objstm(true);
    assert_eq!(configured.max_file_bytes, Some(SIXTY_FOUR_MIB));
    assert!(configured.lazy_objstm);

    // `None` removes the limit.
    let unlimited = LoadOptions::new().max_file_bytes(None);
    assert_eq!(unlimited.max_file_bytes, None);

    // Defaults: crate-wide size limit, eager object streams.
    let defaults = LoadOptions::default();
    assert_eq!(
        defaults.max_file_bytes,
        Some(crate::load_options::DEFAULT_MAX_FILE_BYTES)
    );
    assert!(!defaults.lazy_objstm);
}