1#[cfg(feature = "decrypt")]
4mod encryption;
5
6use std::{
7 borrow::Cow,
8 collections::BTreeMap,
9 fmt,
10 io::{Read, Seek, SeekFrom, Write},
11 num::NonZeroU32,
12 sync::{Arc, OnceLock},
13};
14
15use serde::{Deserialize, Serialize};
16use tempfile::NamedTempFile;
17
18#[cfg(not(feature = "decrypt"))]
19use crate::Identifier;
20use crate::{
21 BoundedText, ConfigError, Identifier, ObjectKey, ObjectLocation, ParseError, ParseFact,
22 PasswordSecret, PdfVersion, ResourceLimits, Result, StreamFact, ValidationWarning, XrefFact,
23};
24
// Byte markers delimiting the major syntactic landmarks of a PDF file.
const HEADER_MARKER: &[u8] = b"%PDF-";
const EOF_MARKER: &[u8] = b"%%EOF";
const STREAM_MARKER: &[u8] = b"stream";
const ENDSTREAM_MARKER: &[u8] = b"endstream";
const ENDOBJ_MARKER: &[u8] = b"endobj";
// Read size used when scanning a spill file for a byte pattern.
const SPILL_SEARCH_CHUNK_BYTES: usize = 8192;

// Handle type backing `SourceStorage::SpillFile`.
#[allow(
    clippy::disallowed_types,
    reason = "parser source storage is synchronous Read+Seek; async file handles do not fit this \
              API"
)]
type SpillFileHandle = std::fs::File;
38
/// Marker trait for parser input: any synchronous, seekable byte source.
pub trait PdfSource: Read + Seek {}

/// Blanket impl: every `Read + Seek` type is automatically a `PdfSource`.
impl<T> PdfSource for T where T: Read + Seek {}
43
/// Entry point for parsing PDF bytes under configurable resource limits.
#[derive(Clone, Debug)]
pub struct Parser {
    // Hard caps (file size, object count, decode sizes, ...) enforced
    // throughout parsing.
    limits: ResourceLimits,
    // Custom filter decoders; `None` falls back to the process-wide default.
    decoder_registry: Option<DecoderRegistry>,
}
50
51impl Parser {
52 #[must_use]
54 pub fn new(limits: ResourceLimits) -> Self {
55 Self {
56 limits,
57 decoder_registry: None,
58 }
59 }
60
61 #[must_use]
63 pub fn with_decoder_registry(
64 limits: ResourceLimits,
65 decoder_registry: DecoderRegistry,
66 ) -> Self {
67 Self {
68 limits,
69 decoder_registry: Some(decoder_registry),
70 }
71 }
72
73 pub fn parse<R: PdfSource>(&self, source: R) -> Result<ParsedDocument> {
80 self.parse_with_options(source, ParseOptions::default())
81 }
82
83 pub fn parse_with_options<R: PdfSource>(
90 &self,
91 mut source: R,
92 options: ParseOptions<'_>,
93 ) -> Result<ParsedDocument> {
94 let byte_len = source
95 .seek(SeekFrom::End(0))
96 .map_err(|source| crate::PdfvError::Io { path: None, source })?;
97 if byte_len > self.limits.max_file_bytes {
98 return Err(ParseError::LimitExceeded {
99 limit: "max_file_bytes",
100 }
101 .into());
102 }
103 source
104 .rewind()
105 .map_err(|source| crate::PdfvError::Io { path: None, source })?;
106
107 let storage = SourceStorage::from_source(
108 source,
109 byte_len,
110 self.limits.memory_source_threshold_bytes,
111 )?;
112
113 ByteParser::new(
114 storage,
115 self.limits.clone(),
116 self.decoder_registry.clone(),
117 options,
118 )
119 .parse_document()
120 }
121}
122
impl Default for Parser {
    /// A parser with [`ResourceLimits::default`] and the default decoders.
    fn default() -> Self {
        Self::new(ResourceLimits::default())
    }
}
128
/// Returns the lazily-initialized, process-wide default decoder registry.
fn default_decoder_registry() -> &'static DecoderRegistry {
    static REGISTRY: OnceLock<DecoderRegistry> = OnceLock::new();
    REGISTRY.get_or_init(DecoderRegistry::default)
}
133
/// A single PDF stream filter implementation (e.g. FlateDecode).
pub trait StreamDecoder: fmt::Debug {
    /// Decodes `input` under `params`, honoring `limits`.
    ///
    /// # Errors
    /// Returns a `ParseError` when the data is malformed or a limit is hit.
    fn decode(
        &self,
        input: &[u8],
        params: &DecodeParams,
        limits: &ResourceLimits,
    ) -> std::result::Result<DecoderOutput, ParseError>;
}

/// Result of running one stream filter stage.
#[derive(Clone, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub struct DecoderOutput {
    /// Bytes produced by the filter.
    pub bytes: Vec<u8>,
    /// True when the filter only records metadata rather than decoding the
    /// payload (used for image codecs such as DCTDecode/JPXDecode).
    pub metadata_mode: bool,
}
159
/// Maps PDF filter names (including abbreviations) to decoder
/// implementations.
#[derive(Clone)]
pub struct DecoderRegistry {
    decoders: BTreeMap<PdfName, Arc<dyn StreamDecoder + Send + Sync>>,
}

/// Backing storage for the bytes being parsed.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub enum SourceStorage {
    /// Entire source held in memory (sources under the spill threshold).
    Memory(Arc<[u8]>),
    /// Source spilled to a temporary file (large sources).
    SpillFile {
        /// Read handle onto the spill file.
        file: Arc<SpillFileHandle>,
        /// Total source length in bytes.
        len: usize,
        /// Owning the `TempPath` keeps the file on disk until drop.
        path: Arc<tempfile::TempPath>,
    },
}
182
impl SourceStorage {
    /// Reads `source` fully into memory when it fits under
    /// `memory_threshold`; otherwise spills it to a temporary file.
    ///
    /// # Errors
    /// I/O failures, a source whose length changed mid-copy, or a length
    /// that does not fit in `usize`.
    fn from_source<R: PdfSource>(
        mut source: R,
        byte_len: u64,
        memory_threshold: u64,
    ) -> Result<Self> {
        if byte_len <= memory_threshold {
            let capacity = usize::try_from(byte_len).map_err(|_| ParseError::LimitExceeded {
                limit: "max_file_bytes",
            })?;
            let mut bytes = Vec::with_capacity(capacity);
            source
                .read_to_end(&mut bytes)
                .map_err(|source| crate::PdfvError::Io { path: None, source })?;
            return Ok(Self::Memory(Arc::from(bytes)));
        }

        let mut tempfile =
            NamedTempFile::new().map_err(|source| crate::PdfvError::Io { path: None, source })?;
        let copied = std::io::copy(&mut source, &mut tempfile)
            .map_err(|source| crate::PdfvError::Io { path: None, source })?;
        // Guard against a source that produced more/fewer bytes than the
        // length measured earlier (e.g. concurrent modification).
        if copied != byte_len {
            return Err(ParseError::Malformed {
                message: bounded("source length changed while spilling"),
            }
            .into());
        }
        tempfile
            .as_file_mut()
            .flush()
            .map_err(|source| crate::PdfvError::Io { path: None, source })?;
        // Reopen so reads use a handle independent of the writer.
        let file = tempfile
            .reopen()
            .map_err(|source| crate::PdfvError::Io { path: None, source })?;
        Ok(Self::SpillFile {
            file: Arc::new(file),
            len: usize::try_from(byte_len).map_err(|_| ParseError::LimitExceeded {
                limit: "max_file_bytes",
            })?,
            path: Arc::new(tempfile.into_temp_path()),
        })
    }

    /// Total length of the stored source in bytes.
    fn len(&self) -> usize {
        match self {
            Self::Memory(bytes) => bytes.len(),
            Self::SpillFile { len, .. } => *len,
        }
    }

    /// Returns the bytes in `[start, end)`: borrowed zero-copy from memory,
    /// or read (owned) from the spill file.
    fn slice(&self, start: usize, end: usize) -> std::result::Result<Cow<'_, [u8]>, ParseError> {
        if start > end || end > self.len() {
            return Err(ParseError::Malformed {
                message: bounded("byte range out of bounds"),
            });
        }
        match self {
            Self::Memory(bytes) => {
                bytes
                    .get(start..end)
                    .map(Cow::Borrowed)
                    .ok_or(ParseError::Malformed {
                        message: bounded("byte range out of bounds"),
                    })
            }
            Self::SpillFile { file, .. } => {
                let mut buffer = vec![0_u8; end.saturating_sub(start)];
                read_exact_at(file, &mut buffer, start)?;
                Ok(Cow::Owned(buffer))
            }
        }
    }

    /// Single-byte lookup; `None` when out of range or the read fails.
    fn byte(&self, pos: usize) -> Option<u8> {
        if pos >= self.len() {
            return None;
        }
        match self {
            Self::Memory(bytes) => bytes.get(pos).copied(),
            Self::SpillFile { file, .. } => {
                let mut byte = [0_u8; 1];
                read_exact_at(file, &mut byte, pos).ok()?;
                Some(byte[0])
            }
        }
    }

    /// True when the bytes starting at `pos` equal `expected`.
    fn starts_with(&self, pos: usize, expected: &[u8]) -> bool {
        let Some(end) = pos.checked_add(expected.len()) else {
            return false;
        };
        self.slice(pos, end)
            .is_ok_and(|bytes| bytes.as_ref() == expected)
    }

    /// Finds the first occurrence of `needle` within `[start, end)`,
    /// returning its absolute offset.
    fn find_bytes(&self, needle: &[u8], start: usize, end: usize) -> Option<usize> {
        if needle.is_empty() || start > end || end > self.len() {
            return None;
        }
        match self {
            Self::Memory(_) => {
                let bytes = self.slice(start, end).ok()?;
                find_bytes(bytes.as_ref(), needle, 0)
                    .and_then(|relative| start.checked_add(relative))
            }
            // Spill files are scanned in chunks to bound memory use.
            Self::SpillFile { file, .. } => find_bytes_in_spill_file(file, needle, start, end),
        }
    }

    /// Returns a backing byte source plus the range of the stream data
    /// within it, for use in `StreamObject::raw_source`/`raw_range`.
    ///
    /// Memory storage shares its backing `Arc` zero-copy; spill storage
    /// copies the range out, so the returned range is rebased to start at 0.
    fn stream_source(
        &self,
        start: usize,
        end: usize,
    ) -> std::result::Result<(Arc<[u8]>, StreamRange), ParseError> {
        match self {
            Self::Memory(bytes) => Ok((
                Arc::clone(bytes),
                StreamRange {
                    start: u64::try_from(start).map_err(|_| ParseError::ArithmeticOverflow {
                        context: "stream start",
                    })?,
                    end: u64::try_from(end).map_err(|_| ParseError::ArithmeticOverflow {
                        context: "stream end",
                    })?,
                },
            )),
            Self::SpillFile { .. } => {
                let bytes = self.slice(start, end)?.into_owned();
                let end =
                    u64::try_from(bytes.len()).map_err(|_| ParseError::ArithmeticOverflow {
                        context: "stream end",
                    })?;
                Ok((Arc::from(bytes), StreamRange { start: 0, end }))
            }
        }
    }
}
320
321fn read_exact_at(
322 file: &SpillFileHandle,
323 buffer: &mut [u8],
324 offset: usize,
325) -> std::result::Result<(), ParseError> {
326 let mut file = file.try_clone().map_err(|_| ParseError::Malformed {
327 message: bounded("failed to clone spill file handle"),
328 })?;
329 file.seek(SeekFrom::Start(u64::try_from(offset).map_err(|_| {
330 ParseError::ArithmeticOverflow {
331 context: "spill file offset",
332 }
333 })?))
334 .map_err(|_| ParseError::Malformed {
335 message: bounded("failed to seek spill file"),
336 })?;
337 file.read_exact(buffer).map_err(|_| ParseError::Malformed {
338 message: bounded("failed to read spill file"),
339 })
340}
341
/// Searches a spill file for `needle` within `[start, end)` using
/// fixed-size chunked reads, carrying `needle.len() - 1` trailing bytes
/// between chunks so a match spanning a chunk boundary is still found.
///
/// Returns the absolute offset of the first match, or `None` when the
/// needle is absent or any read fails.
fn find_bytes_in_spill_file(
    file: &SpillFileHandle,
    needle: &[u8],
    start: usize,
    end: usize,
) -> Option<usize> {
    let overlap = needle.len().saturating_sub(1);
    let mut pos = start;
    // Overlap tail from the previous chunk plus the current chunk.
    let mut carried = Vec::new();
    while pos < end {
        let read_len = end.saturating_sub(pos).min(SPILL_SEARCH_CHUNK_BYTES);
        let mut chunk = vec![0_u8; read_len];
        read_exact_at(file, &mut chunk, pos).ok()?;
        // Absolute file offset corresponding to carried[0].
        let search_base = pos.saturating_sub(carried.len());
        let carried_len = carried.len();
        carried.extend_from_slice(&chunk);
        if let Some(relative) = find_bytes(&carried, needle, 0) {
            return search_base.checked_add(relative);
        }
        // Keep only the overlap tail for the next iteration.
        if carried.len() > overlap {
            let keep_start = carried.len().saturating_sub(overlap);
            carried = carried.get(keep_start..)?.to_vec();
        }
        pos = pos.checked_add(read_len)?;
        // Defensive stop: `read_len` cannot be 0 while `pos < end`, but
        // this guarantees termination if the bounds logic ever changes.
        if read_len == 0 || carried_len == carried.len() && chunk.is_empty() {
            break;
        }
    }
    None
}
372
373impl DecoderRegistry {
374 #[must_use]
376 pub fn new() -> Self {
377 let mut registry = Self {
378 decoders: BTreeMap::new(),
379 };
380 let flate: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(FlateDecoder);
381 let ascii_hex: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(AsciiHexDecoder);
382 let ascii85: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(Ascii85Decoder);
383 let run_length: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(RunLengthDecoder);
384 let lzw: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(LzwDecoder);
385 let crypt: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(CryptDecoder);
386 let metadata_mode: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(MetadataModeDecoder);
387 registry.register_many(["FlateDecode", "Fl"], &flate);
388 registry.register_many(["ASCIIHexDecode", "AHx"], &ascii_hex);
389 registry.register_many(["ASCII85Decode", "A85"], &ascii85);
390 registry.register_many(["RunLengthDecode", "RL"], &run_length);
391 registry.register_many(["LZWDecode", "LZW"], &lzw);
392 registry.register_many(["Crypt"], &crypt);
393 registry.register_many(
394 ["DCTDecode", "JPXDecode", "JBIG2Decode", "CCITTFaxDecode"],
395 &metadata_mode,
396 );
397 registry
398 }
399
400 pub fn register(&mut self, name: PdfName, decoder: &Arc<dyn StreamDecoder + Send + Sync>) {
402 self.decoders.insert(name, Arc::clone(decoder));
403 }
404
405 fn register_many<const N: usize>(
406 &mut self,
407 names: [&'static str; N],
408 decoder: &Arc<dyn StreamDecoder + Send + Sync>,
409 ) {
410 for name in names {
411 self.register(PdfName::from_static(name), decoder);
412 }
413 }
414
415 fn decoder(&self, name: &PdfName) -> Option<&Arc<dyn StreamDecoder + Send + Sync>> {
416 self.decoders.get(name)
417 }
418}
419
impl Default for DecoderRegistry {
    /// Equivalent to [`DecoderRegistry::new`]: all built-in filters.
    fn default() -> Self {
        Self::new()
    }
}

impl fmt::Debug for DecoderRegistry {
    // The decoder trait objects have no useful Debug form here, so only the
    // number of registered decoders is reported.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter
            .debug_struct("DecoderRegistry")
            .field("decoder_count", &self.decoders.len())
            .finish()
    }
}
434
/// Parameters taken from a stream's `/DecodeParms` dictionary.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct DecodeParams {
    /// Predictor id (1 = none; interpretation of other values is up to the
    /// individual decoders).
    pub predictor: u16,
    /// Color components per sample.
    pub colors: u16,
    /// Bits per color component.
    pub bits_per_component: u16,
    /// Samples per row.
    pub columns: u32,
    /// LZW `/EarlyChange` flag.
    pub early_change: u8,
    /// Named crypt filter for `/Crypt` streams, when present.
    pub crypt_filter_name: Option<PdfName>,
}

impl Default for DecodeParams {
    /// Values mirroring the PDF `DecodeParms` defaults (no predictor,
    /// 1 color, 8 bits per component, 1 column, EarlyChange = 1).
    fn default() -> Self {
        Self {
            predictor: 1,
            colors: 1,
            bits_per_component: 8,
            columns: 1,
            early_change: 1,
            crypt_filter_name: None,
        }
    }
}
466
/// Per-call parsing options.
#[derive(Clone, Copy, Debug, Default)]
#[non_exhaustive]
pub struct ParseOptions<'a> {
    /// Password to attempt decryption with (consumed when the `decrypt`
    /// feature is enabled).
    pub password: Option<&'a PasswordSecret>,
}

/// The result of a parse: objects, trailers, and diagnostic facts.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct ParsedDocument {
    /// Version taken from the `%PDF-` header.
    pub version: PdfVersion,
    /// Document catalog reference (`/Root` from the newest trailer), if any.
    pub catalog: Option<ObjectKey>,
    /// All indirect objects, keyed by object number/generation.
    pub objects: ObjectStore,
    /// Every trailer dictionary, in the order encountered.
    pub trailers: Vec<Trailer>,
    /// Structured observations recorded while parsing.
    pub parse_facts: Vec<ParseFact>,
    /// Non-fatal problems found while parsing.
    pub warnings: Vec<ValidationWarning>,
}
493
494impl ParsedDocument {
495 #[must_use]
497 pub fn is_encrypted(&self) -> bool {
498 self.parse_facts.iter().any(|fact| {
499 matches!(
500 fact,
501 ParseFact::Encryption {
502 encrypted: true,
503 decrypted: false,
504 handler: _,
505 ..
506 }
507 )
508 })
509 }
510}
511
/// Indirect objects keyed by `ObjectKey`, iterated in key order.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
#[serde(transparent)]
pub struct ObjectStore(BTreeMap<ObjectKey, IndirectObject>);

impl ObjectStore {
    /// Inserts `object`, replacing any existing object with the same key.
    pub fn insert(&mut self, object: IndirectObject) {
        self.0.insert(object.key, object);
    }

    /// Looks up an object by key.
    #[must_use]
    pub fn get(&self, key: &ObjectKey) -> Option<&IndirectObject> {
        self.0.get(key)
    }

    /// Iterates objects in ascending key order.
    pub fn values(&self) -> impl Iterator<Item = &IndirectObject> {
        self.0.values()
    }

    /// Mutable iteration over stored objects (crate-internal).
    pub(crate) fn values_mut(&mut self) -> impl Iterator<Item = &mut IndirectObject> {
        self.0.values_mut()
    }

    /// Number of stored objects.
    #[must_use]
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// True when no objects are stored.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}
550
/// An indirect object together with where it was found.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct IndirectObject {
    /// Object number and generation.
    pub key: ObjectKey,
    /// Byte offset of the object in the source.
    pub offset: u64,
    /// The parsed object value.
    pub object: CosObject,
}

/// A trailer dictionary and the offset where it appeared.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct Trailer {
    /// The trailer's key/value entries.
    pub dictionary: Dictionary,
    /// Byte offset of the dictionary in the source.
    pub offset: u64,
}

/// A COS value: one node of the PDF object model.
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", tag = "type", content = "value")]
pub enum CosObject {
    Null,
    Boolean(bool),
    Integer(i64),
    Real(f64),
    Name(PdfName),
    String(PdfString),
    Array(Vec<CosObject>),
    Dictionary(Dictionary),
    Stream(StreamObject),
    /// Reference to an indirect object.
    Reference(ObjectKey),
}
601
602impl CosObject {
603 #[must_use]
605 pub fn as_dictionary(&self) -> Option<&Dictionary> {
606 match self {
607 Self::Dictionary(dictionary) => Some(dictionary),
608 Self::Stream(stream) => Some(&stream.dictionary),
609 _ => None,
610 }
611 }
612}
613
/// A PDF dictionary: name keys mapping to COS values, kept in key order.
#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)]
#[serde(transparent)]
pub struct Dictionary(BTreeMap<PdfName, CosObject>);

impl Dictionary {
    /// Inserts `value` under `key`, replacing any previous entry.
    pub fn insert(&mut self, key: PdfName, value: CosObject) {
        self.0.insert(key, value);
    }

    /// Looks up an entry by its UTF-8 key.
    ///
    /// Note: builds a temporary `PdfName` per call, which allocates.
    #[must_use]
    pub fn get(&self, key: &str) -> Option<&CosObject> {
        self.0.get(&PdfName::from_static(key))
    }

    /// Iterates entries in ascending key order.
    pub fn iter(&self) -> impl Iterator<Item = (&PdfName, &CosObject)> {
        self.0.iter()
    }

    /// Mutable iteration over the values (crate-internal).
    pub(crate) fn values_mut(&mut self) -> impl Iterator<Item = &mut CosObject> {
        self.0.values_mut()
    }

    /// Number of entries.
    #[must_use]
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// True when the dictionary has no entries.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}
652
/// A PDF name object, stored as raw bytes.
///
/// Ordered and hashable so it can key `BTreeMap`s; serialized as a lossy
/// UTF-8 string.
#[derive(Clone, Debug, Deserialize, Eq, Hash, Ord, PartialEq, PartialOrd, Serialize)]
#[serde(try_from = "String", into = "String")]
pub struct PdfName(Vec<u8>);
657
658impl PdfName {
659 pub fn new(bytes: Vec<u8>, limits: &ResourceLimits) -> std::result::Result<Self, ParseError> {
665 if bytes.len() > limits.max_name_bytes {
666 return Err(ParseError::LimitExceeded {
667 limit: "max_name_bytes",
668 });
669 }
670 Ok(Self(bytes))
671 }
672
673 #[must_use]
675 pub fn as_bytes(&self) -> &[u8] {
676 &self.0
677 }
678
679 #[must_use]
681 pub fn matches(&self, value: &str) -> bool {
682 self.0.as_slice() == value.as_bytes()
683 }
684
685 fn from_static(value: &str) -> Self {
686 Self(value.as_bytes().to_vec())
687 }
688}
689
impl TryFrom<String> for PdfName {
    type Error = ConfigError;

    /// Deserialization path; infallible in practice.
    ///
    /// NOTE(review): unlike [`PdfName::new`], this does not enforce
    /// `max_name_bytes` — limits are unavailable during deserialization.
    fn try_from(value: String) -> std::result::Result<Self, Self::Error> {
        Ok(Self(value.into_bytes()))
    }
}

impl From<PdfName> for String {
    /// Serialization path: lossy UTF-8 conversion of the raw name bytes.
    fn from(value: PdfName) -> Self {
        String::from_utf8_lossy(&value.0).into_owned()
    }
}

/// A PDF string object, stored as raw (possibly non-UTF-8) bytes.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[serde(try_from = "Vec<u8>", into = "Vec<u8>")]
pub struct PdfString(Vec<u8>);
708
709impl PdfString {
710 pub fn new(bytes: Vec<u8>, limits: &ResourceLimits) -> std::result::Result<Self, ParseError> {
716 if bytes.len() > limits.max_string_bytes {
717 return Err(ParseError::LimitExceeded {
718 limit: "max_string_bytes",
719 });
720 }
721 Ok(Self(bytes))
722 }
723
724 #[must_use]
726 pub fn as_bytes(&self) -> &[u8] {
727 &self.0
728 }
729}
730
impl TryFrom<Vec<u8>> for PdfString {
    type Error = ConfigError;

    /// Deserialization path; infallible in practice.
    ///
    /// NOTE(review): unlike [`PdfString::new`], this does not enforce
    /// `max_string_bytes` — limits are unavailable during deserialization.
    fn try_from(value: Vec<u8>) -> std::result::Result<Self, Self::Error> {
        Ok(Self(value))
    }
}

impl From<PdfString> for Vec<u8> {
    /// Serialization path: yields the raw bytes.
    fn from(value: PdfString) -> Self {
        value.0
    }
}
744
/// A PDF stream: a dictionary plus a byte range into the raw source.
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct StreamObject {
    /// The stream's header dictionary.
    pub dictionary: Dictionary,
    /// Range of the raw (still-encoded) data within `raw_source`.
    pub raw_range: StreamRange,
    /// `/Length` as declared in the dictionary, when present.
    pub declared_length: Option<u64>,
    /// Length of the stream data as discovered by the parser.
    pub discovered_length: u64,
    /// Filter chain in application (decode) order.
    pub filters: Vec<PdfName>,
    /// Per-filter decode parameters, positionally parallel to `filters`.
    #[serde(default)]
    pub decode_params: Vec<DecodeParams>,
    /// Backing bytes that `raw_range` indexes into; not serialized.
    #[serde(skip, default = "empty_source")]
    pub raw_source: Arc<[u8]>,
    /// Whether the `stream` keyword's line ending was spec-compliant.
    pub stream_keyword_crlf_compliant: bool,
    /// Whether the `endstream` keyword's line ending was spec-compliant.
    pub endstream_keyword_eol_compliant: bool,
}
771
impl StreamObject {
    /// Drops every `Crypt` filter (and its params) from the filter chain,
    /// keeping the surviving filters aligned with their decode parameters.
    pub(crate) fn remove_crypt_filters(&mut self) {
        let mut next_filters = Vec::with_capacity(self.filters.len());
        let mut next_params = Vec::with_capacity(self.decode_params.len());
        for (index, filter) in self.filters.iter().enumerate() {
            if filter.matches("Crypt") {
                continue;
            }
            next_filters.push(filter.clone());
            // Missing params entries default, preserving positional pairing.
            next_params.push(self.decode_params.get(index).cloned().unwrap_or_default());
        }
        self.filters = next_filters;
        self.decode_params = next_params;
    }

    /// Borrows the raw (still-encoded) stream bytes out of `raw_source`.
    ///
    /// # Errors
    /// Fails when `raw_range` does not fit `usize` or exceeds the source.
    pub(crate) fn raw_bytes(&self) -> std::result::Result<&[u8], ParseError> {
        let raw_start =
            usize::try_from(self.raw_range.start).map_err(|_| ParseError::ArithmeticOverflow {
                context: "stream raw range",
            })?;
        let raw_end =
            usize::try_from(self.raw_range.end).map_err(|_| ParseError::ArithmeticOverflow {
                context: "stream raw range",
            })?;
        self.raw_source
            .get(raw_start..raw_end)
            .ok_or(ParseError::Malformed {
                message: bounded("stream raw range out of bounds"),
            })
    }

    /// Decodes the stream through its full filter chain using the default
    /// decoder registry.
    ///
    /// # Errors
    /// Unsupported filters, malformed data, or exceeded decode limits.
    pub fn decoded_bytes(
        &self,
        limits: &ResourceLimits,
    ) -> std::result::Result<Vec<u8>, ParseError> {
        self.decoded_bytes_with_registry(limits, default_decoder_registry())
            .map(|decoded| decoded.bytes)
    }

    /// Runs each filter in order, enforcing `max_stream_decode_bytes` after
    /// every stage and recording one `StreamFact` per filter.
    fn decoded_bytes_with_registry(
        &self,
        limits: &ResourceLimits,
        registry: &DecoderRegistry,
    ) -> std::result::Result<DecodedStream, ParseError> {
        let mut current = self.raw_bytes()?.to_vec();
        let mut facts = Vec::new();
        for (index, filter) in self.filters.iter().enumerate() {
            let params = self.decode_params.get(index).cloned().unwrap_or_default();
            let decoder = registry
                .decoder(filter)
                .ok_or(ParseError::UnsupportedFilter {
                    filter: BoundedText::unchecked(String::from_utf8_lossy(filter.as_bytes())),
                })?;
            let input_len = checked_u64_len(current.len(), "stream filter input length")?;
            let output = decoder.decode(&current, &params, limits)?;
            let output_len = checked_u64_len(output.bytes.len(), "stream filter output length")?;
            enforce_decoded_len(output_len, limits.max_stream_decode_bytes)?;
            // Shadow the PdfName with its reporting identifier for the fact.
            let filter = filter_identifier(filter)?;
            facts.push(if output.metadata_mode {
                StreamFact::FilterMetadataMode {
                    filter,
                    bytes: output_len,
                }
            } else {
                StreamFact::FilterDecoded {
                    filter,
                    input_bytes: input_len,
                    output_bytes: output_len,
                }
            });
            // The output of this stage feeds the next filter.
            current = output.bytes;
        }
        let decoded_len = checked_u64_len(current.len(), "decoded stream length")?;
        enforce_decoded_len(decoded_len, limits.max_stream_decode_bytes)?;
        Ok(DecodedStream {
            bytes: current,
            facts,
        })
    }
}
858
/// Fully decoded stream bytes plus the facts recorded per filter stage.
#[derive(Clone, Debug, Eq, PartialEq)]
struct DecodedStream {
    bytes: Vec<u8>,
    facts: Vec<StreamFact>,
}

/// Serde default for `StreamObject::raw_source` (skipped on the wire).
fn empty_source() -> Arc<[u8]> {
    Arc::from(Vec::<u8>::new())
}

/// Half-open byte range `[start, end)` into a stream's backing source.
#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct StreamRange {
    /// Inclusive start offset.
    pub start: u64,
    /// Exclusive end offset.
    pub end: u64,
}
879
/// Stateful byte-level parser over a `SourceStorage`.
struct ByteParser<'a> {
    source: SourceStorage,
    limits: ResourceLimits,
    decoder_registry: Option<DecoderRegistry>,
    options: ParseOptions<'a>,
    // Current byte offset into `source`.
    pos: usize,
    // Facts and warnings accumulated during the parse.
    parse_facts: Vec<ParseFact>,
    warnings: Vec<ValidationWarning>,
}

// A numeric token: PDF distinguishes integers from reals.
#[derive(Clone, Copy, Debug)]
enum NumberToken {
    Integer(i64),
    Real(f64),
}

// Aggregate results from walking an xref stream's entry table.
#[derive(Clone, Copy, Debug)]
struct XrefStreamSummary {
    // Length of the decoded xref stream payload.
    decoded_bytes: usize,
    // Total entries declared by the index pairs.
    entries: u64,
    // Entries whose type field is 2 (compressed objects).
    compressed_entries: u64,
}
902
903impl<'a> ByteParser<'a> {
    /// Creates a parser positioned at byte 0 with empty fact/warning logs.
    fn new(
        source: SourceStorage,
        limits: ResourceLimits,
        decoder_registry: Option<DecoderRegistry>,
        options: ParseOptions<'a>,
    ) -> Self {
        Self {
            source,
            limits,
            decoder_registry,
            options,
            pos: 0,
            parse_facts: Vec::new(),
            warnings: Vec::new(),
        }
    }
920
    /// Drives the full parse: header, top-level objects, encryption
    /// handling, then expansion of stream-backed structures.
    ///
    /// Encrypted documents are returned early, still encrypted, unless the
    /// `decrypt` feature is on and a supplied password decrypts them.
    fn parse_document(mut self) -> Result<ParsedDocument> {
        let (header_offset, version) = self.parse_header()?;
        self.push_fact(ParseFact::Header {
            offset: header_offset,
            version,
            // Any bytes before the "%PDF-" marker count as leading junk.
            had_leading_bytes: header_offset != 0,
        });

        let mut objects = ObjectStore::default();
        let mut trailers = Vec::new();
        self.parse_top_level_objects(&mut objects, &mut trailers)?;

        // Catalog reference as seen pre-decryption, from the most recently
        // encountered trailer naming a /Root.
        let encrypted_catalog = trailers
            .iter()
            .rev()
            .find_map(|trailer| object_ref_from_dictionary(&trailer.dictionary, "Root"));

        if encryption_reference(&trailers).is_some() {
            let fact = encryption_fact(&objects, &trailers);
            // Surface classification failures other than the ordinary
            // "document is encrypted" status.
            #[cfg(feature = "decrypt")]
            if let Err(error) = encryption::classify_encryption(&objects, &trailers, &self.limits)
                && !error.is_encrypted_status()
            {
                return Err(error.into_parse_error().into());
            }
            #[cfg(feature = "decrypt")]
            if let Some(password) = self.options.password {
                match encryption::decrypt_document(
                    &mut objects,
                    &mut trailers,
                    &self.limits,
                    password,
                ) {
                    Ok(summary) => {
                        self.push_fact(summary.into_fact(true));
                    }
                    // Failed decryption attempt: record a warning and fact,
                    // then return the still-encrypted document.
                    Err(error) if error.is_encrypted_status() => {
                        self.warnings.push(ValidationWarning::General {
                            message: BoundedText::unchecked(error.safe_message()),
                        });
                        self.push_fact(error.into_fact(fact));
                        return Ok(ParsedDocument {
                            version,
                            catalog: encrypted_catalog,
                            objects,
                            trailers,
                            parse_facts: self.parse_facts,
                            warnings: self.warnings,
                        });
                    }
                    Err(error) => return Err(error.into_parse_error().into()),
                }
            } else if self.options.password.is_none() {
                // No password supplied: report the encrypted document as-is.
                self.push_fact(fact);
                return Ok(ParsedDocument {
                    version,
                    catalog: encrypted_catalog,
                    objects,
                    trailers,
                    parse_facts: self.parse_facts,
                    warnings: self.warnings,
                });
            }

            // Without the `decrypt` feature nothing more can be done for an
            // encrypted document.
            #[cfg(not(feature = "decrypt"))]
            {
                self.push_fact(fact);
                return Ok(ParsedDocument {
                    version,
                    catalog: encrypted_catalog,
                    objects,
                    trailers,
                    parse_facts: self.parse_facts,
                    warnings: self.warnings,
                });
            }
        }

        self.materialize_stream_backed_structures(&mut objects, &mut trailers)?;
        // Re-resolve the catalog: decryption/expansion may add trailers.
        let catalog = trailers
            .iter()
            .rev()
            .find_map(|trailer| object_ref_from_dictionary(&trailer.dictionary, "Root"));

        Ok(ParsedDocument {
            version,
            catalog,
            objects,
            trailers,
            parse_facts: self.parse_facts,
            warnings: self.warnings,
        })
    }
1014
    /// Scans the byte stream for top-level constructs — xref tables,
    /// trailers, `startxref`, and indirect objects — until `%%EOF` or the
    /// end of input is reached.
    fn parse_top_level_objects(
        &mut self,
        objects: &mut ObjectStore,
        trailers: &mut Vec<Trailer>,
    ) -> Result<()> {
        while self.pos < self.source.len() {
            self.skip_ws_and_comments();
            if self.starts_with(EOF_MARKER) {
                self.parse_post_eof_fact()?;
                return Ok(());
            }
            if self.starts_with(b"startxref") {
                // The startxref offset is not needed for this linear scan.
                self.skip_line();
                continue;
            }
            if self.starts_with(b"xref") {
                self.parse_xref_and_trailer(trailers)?;
                continue;
            }
            if self.starts_with(b"trailer") {
                self.consume_bytes(b"trailer")?;
                self.skip_ws_and_comments();
                let offset = self.offset()?;
                let dictionary = self.parse_dictionary(0)?;
                self.push_xref_chain_facts(None, offset, &dictionary)?;
                trailers.push(Trailer { dictionary, offset });
                continue;
            }

            let before = self.pos;
            match self.parse_indirect_object()? {
                Some(object) => {
                    // Enforce max_objects with overflow-checked arithmetic.
                    let object_count = u64::try_from(objects.len())
                        .map_err(|_| ParseError::ArithmeticOverflow {
                            context: "object count",
                        })?
                        .checked_add(1)
                        .ok_or(ParseError::ArithmeticOverflow {
                            context: "object count",
                        })?;
                    if object_count > self.limits.max_objects {
                        return Err(ParseError::LimitExceeded {
                            limit: "max_objects",
                        }
                        .into());
                    }
                    objects.insert(object);
                }
                None => {
                    // No object here: resynchronize one byte further on.
                    self.pos = before.saturating_add(1);
                }
            }
        }
        Ok(())
    }
1070
1071 fn materialize_stream_backed_structures(
1072 &mut self,
1073 objects: &mut ObjectStore,
1074 trailers: &mut Vec<Trailer>,
1075 ) -> std::result::Result<(), ParseError> {
1076 let streams = objects
1077 .values()
1078 .filter_map(|object| match &object.object {
1079 CosObject::Stream(stream) => Some((object.key, object.offset, stream.clone())),
1080 _ => None,
1081 })
1082 .collect::<Vec<_>>();
1083 let mut expanded_objects = Vec::new();
1084 for (key, offset, stream) in streams {
1085 if matches!(stream.dictionary.get("Type"), Some(CosObject::Name(name)) if name.matches("XRef"))
1086 {
1087 let summary = self.parse_xref_stream(key, &stream)?;
1088 let decoded_len = u64::try_from(summary.decoded_bytes).map_err(|_| {
1089 ParseError::ArithmeticOverflow {
1090 context: "decoded xref stream length",
1091 }
1092 })?;
1093 self.push_fact(ParseFact::Stream {
1094 object: key,
1095 fact: StreamFact::Decoded { bytes: decoded_len },
1096 });
1097 trailers.push(Trailer {
1098 dictionary: stream.dictionary.clone(),
1099 offset,
1100 });
1101 self.push_xref_chain_facts(Some(key), offset, &stream.dictionary)?;
1102 self.push_fact(ParseFact::Xref {
1103 section: ObjectLocation {
1104 object: Some(key),
1105 offset: Some(offset),
1106 path: None,
1107 },
1108 fact: XrefFact::XrefStreamParsed {
1109 entries: summary.entries,
1110 compressed_entries: summary.compressed_entries,
1111 },
1112 });
1113 }
1114 if matches!(stream.dictionary.get("Type"), Some(CosObject::Name(name)) if name.matches("ObjStm"))
1115 {
1116 let decoded = self.decode_stream(key, &stream)?;
1117 let decoded_len = checked_u64_len(decoded.bytes.len(), "decoded stream length")?;
1118 self.push_fact(ParseFact::Stream {
1119 object: key,
1120 fact: StreamFact::Decoded { bytes: decoded_len },
1121 });
1122 let mut parsed_objects = self.parse_object_stream(key, &stream, &decoded.bytes)?;
1123 expanded_objects.append(&mut parsed_objects);
1124 self.push_fact(ParseFact::Xref {
1125 section: ObjectLocation {
1126 object: Some(key),
1127 offset: Some(offset),
1128 path: None,
1129 },
1130 fact: XrefFact::ObjectStreamParsed,
1131 });
1132 }
1133 }
1134 for object in expanded_objects {
1135 if objects.get(&object.key).is_none() {
1136 let next_count =
1137 u64::try_from(objects.len()).map_err(|_| ParseError::ArithmeticOverflow {
1138 context: "object count",
1139 })? + 1;
1140 if next_count > self.limits.max_objects {
1141 return Err(ParseError::LimitExceeded {
1142 limit: "max_objects",
1143 });
1144 }
1145 objects.insert(object);
1146 }
1147 }
1148 Ok(())
1149 }
1150
    /// Validates an xref stream's declared shape (`Size`, field widths,
    /// index pairs), decodes its payload, and walks every entry to count
    /// total and compressed entries.
    ///
    /// # Errors
    /// Limit violations, arithmetic overflow, or a payload shorter than the
    /// declared entry table.
    fn parse_xref_stream(
        &mut self,
        stream_key: ObjectKey,
        stream: &StreamObject,
    ) -> std::result::Result<XrefStreamSummary, ParseError> {
        let size = non_negative_u64_from_dictionary(&stream.dictionary, "Size")?;
        if size > self.limits.max_objects {
            return Err(ParseError::LimitExceeded {
                limit: "max_objects",
            });
        }
        let widths = xref_widths(&stream.dictionary)?;
        let indexes = xref_indexes(&stream.dictionary, size)?;
        // Bytes per entry = sum of the per-field widths.
        let entry_width = widths
            .iter()
            .try_fold(0_usize, |sum, width| sum.checked_add(*width))
            .ok_or(ParseError::ArithmeticOverflow {
                context: "xref stream entry width",
            })?;
        if entry_width == 0 {
            return Err(ParseError::Malformed {
                message: bounded("xref stream entry width must be non-zero"),
            });
        }
        let decoded = self.decode_stream(stream_key, stream)?.bytes;
        let total_entries = indexes
            .iter()
            .try_fold(0_u64, |sum, (_, count)| sum.checked_add(*count))
            .ok_or(ParseError::ArithmeticOverflow {
                context: "xref stream entries",
            })?;
        if total_entries > self.limits.max_objects {
            return Err(ParseError::LimitExceeded {
                limit: "max_objects",
            });
        }
        let required_bytes = usize::try_from(total_entries)
            .ok()
            .and_then(|entries| entries.checked_mul(entry_width))
            .ok_or(ParseError::ArithmeticOverflow {
                context: "xref stream bytes",
            })?;
        if decoded.len() < required_bytes {
            return Err(ParseError::Malformed {
                message: bounded("xref stream data shorter than declared entries"),
            });
        }

        let mut pos = 0_usize;
        let mut compressed_entries = 0_u64;
        for (_first_object, count) in indexes {
            for _ in 0..count {
                // A zero-width type field defaults the entry type to 1.
                let entry_type = if widths[0] == 0 {
                    1
                } else {
                    read_be_uint(&decoded, &mut pos, widths[0])?
                };
                // Fields two and three are advanced past but unused here.
                let _field_two = read_be_uint(&decoded, &mut pos, widths[1])?;
                let _field_three = read_be_uint(&decoded, &mut pos, widths[2])?;
                if entry_type == 2 {
                    compressed_entries = compressed_entries.checked_add(1).ok_or(
                        ParseError::ArithmeticOverflow {
                            context: "compressed xref entries",
                        },
                    )?;
                }
            }
        }
        Ok(XrefStreamSummary {
            decoded_bytes: decoded.len(),
            entries: total_entries,
            compressed_entries,
        })
    }
1225
1226 fn decode_stream(
1227 &mut self,
1228 key: ObjectKey,
1229 stream: &StreamObject,
1230 ) -> std::result::Result<DecodedStream, ParseError> {
1231 let decoded = if let Some(registry) = &self.decoder_registry {
1232 stream.decoded_bytes_with_registry(&self.limits, registry)?
1233 } else {
1234 stream.decoded_bytes_with_registry(&self.limits, default_decoder_registry())?
1235 };
1236 for fact in &decoded.facts {
1237 self.push_fact(ParseFact::Stream {
1238 object: key,
1239 fact: fact.clone(),
1240 });
1241 }
1242 Ok(decoded)
1243 }
1244
    /// Parses the header and payload of an `/ObjStm` object stream,
    /// returning the indirect objects embedded in its decoded bytes.
    ///
    /// # Errors
    /// Limit violations, malformed headers or offsets, or an object stream
    /// that claims to contain itself.
    fn parse_object_stream(
        &self,
        stream_key: ObjectKey,
        stream: &StreamObject,
        decoded: &[u8],
    ) -> std::result::Result<Vec<IndirectObject>, ParseError> {
        let count_u64 = non_negative_u64_from_dictionary(&stream.dictionary, "N")?;
        if count_u64 > self.limits.max_objects {
            return Err(ParseError::LimitExceeded {
                limit: "max_objects",
            });
        }
        // /First: offset of the first object, i.e. the header's byte length.
        let first = non_negative_usize_from_dictionary(&stream.dictionary, "First")?;
        if first > decoded.len() {
            return Err(ParseError::Malformed {
                message: bounded("object stream first offset exceeds decoded bytes"),
            });
        }
        let count = usize::try_from(count_u64).map_err(|_| ParseError::LimitExceeded {
            limit: "max_objects",
        })?;
        // Sanity check — assumes each header pair needs at least 4 bytes.
        if count > 0 && count > first / 4 {
            return Err(ParseError::Malformed {
                message: bounded("object stream header too short for object count"),
            });
        }
        // Re-parse the decoded payload with a nested in-memory parser.
        let mut parser = ByteParser::new(
            SourceStorage::Memory(Arc::from(decoded.to_vec())),
            self.limits.clone(),
            None,
            ParseOptions::default(),
        );
        let mut headers = Vec::with_capacity(count);
        for _ in 0..count {
            // Each header pair is "<object number> <relative offset>".
            let Some(number) = parser.parse_unsigned_u32()? else {
                return Err(ParseError::Malformed {
                    message: bounded("object stream missing object number"),
                });
            };
            parser.skip_required_ws()?;
            let Some(relative_offset) = parser.parse_unsigned::<usize>()? else {
                return Err(ParseError::Malformed {
                    message: bounded("object stream missing object offset"),
                });
            };
            let Some(number) = NonZeroU32::new(number) else {
                return Err(ParseError::Malformed {
                    message: bounded("object number must be non-zero"),
                });
            };
            headers.push((
                ObjectKey {
                    number,
                    // Objects inside an object stream get generation 0.
                    generation: 0,
                },
                relative_offset,
            ));
        }

        let mut objects = Vec::with_capacity(count);
        for (key, relative_offset) in headers {
            let object_pos =
                first
                    .checked_add(relative_offset)
                    .ok_or(ParseError::ArithmeticOverflow {
                        context: "object stream offset",
                    })?;
            if object_pos >= decoded.len() {
                return Err(ParseError::Malformed {
                    message: bounded("object stream object offset exceeds decoded bytes"),
                });
            }
            parser.pos = object_pos;
            let object = parser.parse_object(0)?;
            // Report each object's offset relative to the enclosing
            // stream's position in the original file.
            let offset = u64::try_from(object_pos)
                .ok()
                .and_then(|relative| stream.raw_range.start.checked_add(relative))
                .ok_or(ParseError::ArithmeticOverflow {
                    context: "object stream object offset",
                })?;
            if key == stream_key {
                return Err(ParseError::Malformed {
                    message: bounded("object stream cannot contain itself"),
                });
            }
            objects.push(IndirectObject {
                key,
                offset,
                object,
            });
        }
        Ok(objects)
    }
1338
    /// Locates the `%PDF-` marker and parses the `major.minor` version
    /// that follows it.
    ///
    /// Recovery behaviour: a missing/invalid major digit falls back to
    /// `1`, a missing/invalid minor digit to `4`, and a missing dot is
    /// tolerated; any such repair records a warning instead of failing
    /// the parse. Returns the header byte offset and the (possibly
    /// recovered) version.
    fn parse_header(&mut self) -> std::result::Result<(u64, PdfVersion), ParseError> {
        let Some(header_pos) = self.source.find_bytes(HEADER_MARKER, 0, self.source.len()) else {
            return Err(ParseError::Malformed {
                message: bounded("missing PDF header"),
            });
        };
        self.pos = header_pos
            .checked_add(HEADER_MARKER.len())
            .ok_or(ParseError::ArithmeticOverflow { context: "header" })?;
        let mut malformed = false;
        let major = if let Some(value) = self.parse_version_digit() {
            value
        } else {
            malformed = true;
            1
        };
        if self.peek_byte() == Some(b'.') {
            self.pos = self.pos.saturating_add(1);
        } else {
            malformed = true;
        }
        let minor = if let Some(value) = self.parse_version_digit() {
            value
        } else {
            malformed = true;
            4
        };
        if malformed {
            self.warnings.push(ValidationWarning::General {
                message: BoundedText::unchecked("malformed PDF header version recovered as 1.4"),
            });
        }
        Ok((
            u64::try_from(header_pos).map_err(|_| ParseError::ArithmeticOverflow {
                context: "header offset",
            })?,
            PdfVersion { major, minor },
        ))
    }
1378
1379 fn parse_version_digit(&mut self) -> Option<u8> {
1380 let byte = self.peek_byte()?;
1381 if byte.is_ascii_digit() {
1382 self.pos = self.pos.saturating_add(1);
1383 Some(byte.saturating_sub(b'0'))
1384 } else {
1385 None
1386 }
1387 }
1388
    /// Attempts to parse `N G obj … endobj` at the cursor.
    ///
    /// Returns `Ok(None)` — with the cursor restored to its entry
    /// position — when the bytes at the cursor do not form an
    /// indirect-object header, so callers can probe speculatively. A
    /// dictionary immediately followed by a `stream` keyword is upgraded
    /// to a stream object. A missing `endobj` keyword is tolerated.
    fn parse_indirect_object(&mut self) -> std::result::Result<Option<IndirectObject>, ParseError> {
        let start = self.pos;
        let Some(number) = self.parse_unsigned_u32()? else {
            return Ok(None);
        };
        self.skip_required_ws()?;
        let Some(generation) = self.parse_unsigned_u16()? else {
            self.pos = start;
            return Ok(None);
        };
        self.skip_required_ws()?;
        if !self.starts_with(b"obj") {
            self.pos = start;
            return Ok(None);
        }
        self.consume_bytes(b"obj")?;
        let Some(number) = NonZeroU32::new(number) else {
            return Err(ParseError::Malformed {
                message: bounded("object number must be non-zero"),
            });
        };
        let key = ObjectKey { number, generation };
        let object_start = u64::try_from(start).map_err(|_| ParseError::ArithmeticOverflow {
            context: "object offset",
        })?;

        let parsed = self.parse_object(0)?;
        let object = match parsed {
            // `<< … >> stream` introduces a stream body.
            CosObject::Dictionary(dictionary) if self.peek_stream_marker() => {
                CosObject::Stream(self.parse_stream(key, dictionary)?)
            }
            other => other,
        };
        self.skip_ws_and_comments();
        if self.starts_with(ENDOBJ_MARKER) {
            self.consume_bytes(ENDOBJ_MARKER)?;
        }
        Ok(Some(IndirectObject {
            key,
            offset: object_start,
            object,
        }))
    }
1432
    /// Parses a single COS object at the cursor, dispatching on its
    /// first token.
    ///
    /// `depth` counts container nesting and is checked against
    /// `max_object_depth` before any work, bounding recursion through
    /// dictionaries and arrays.
    fn parse_object(&mut self, depth: u32) -> std::result::Result<CosObject, ParseError> {
        if depth > self.limits.max_object_depth {
            return Err(ParseError::LimitExceeded {
                limit: "max_object_depth",
            });
        }
        self.skip_ws_and_comments();
        // `<<` must be checked before the single-`<` hex-string case below.
        if self.starts_with(b"<<") {
            return Ok(CosObject::Dictionary(self.parse_dictionary(depth)?));
        }
        if self.starts_with(b"[") {
            return Ok(CosObject::Array(self.parse_array(depth)?));
        }
        match self.peek_byte() {
            Some(b'/') => self.parse_name().map(CosObject::Name),
            Some(b'(') => self.parse_literal_string().map(CosObject::String),
            Some(b'<') => self.parse_hex_string().map(CosObject::String),
            Some(b't') if self.starts_with(b"true") => {
                self.consume_bytes(b"true")?;
                Ok(CosObject::Boolean(true))
            }
            Some(b'f') if self.starts_with(b"false") => {
                self.consume_bytes(b"false")?;
                Ok(CosObject::Boolean(false))
            }
            Some(b'n') if self.starts_with(b"null") => {
                self.consume_bytes(b"null")?;
                Ok(CosObject::Null)
            }
            Some(b'-' | b'+' | b'.' | b'0'..=b'9') => self.parse_number_or_reference(),
            _ => Err(ParseError::Malformed {
                message: BoundedText::unchecked(format!("unexpected token at {}", self.pos)),
            }),
        }
    }
1468
1469 fn parse_dictionary(&mut self, depth: u32) -> std::result::Result<Dictionary, ParseError> {
1470 self.consume_bytes(b"<<")?;
1471 let mut dictionary = Dictionary::default();
1472 loop {
1473 self.skip_ws_and_comments();
1474 if self.starts_with(b">>") {
1475 self.consume_bytes(b">>")?;
1476 return Ok(dictionary);
1477 }
1478 let key = self.parse_name()?;
1479 let value = self.parse_object(depth.saturating_add(1))?;
1480 let next_len =
1481 u64::try_from(dictionary.len()).map_err(|_| ParseError::ArithmeticOverflow {
1482 context: "dictionary length",
1483 })? + 1;
1484 if next_len > self.limits.max_dict_entries {
1485 return Err(ParseError::LimitExceeded {
1486 limit: "max_dict_entries",
1487 });
1488 }
1489 dictionary.insert(key, value);
1490 }
1491 }
1492
1493 fn parse_array(&mut self, depth: u32) -> std::result::Result<Vec<CosObject>, ParseError> {
1494 self.consume_bytes(b"[")?;
1495 let mut values = Vec::new();
1496 loop {
1497 self.skip_ws_and_comments();
1498 if self.starts_with(b"]") {
1499 self.consume_bytes(b"]")?;
1500 return Ok(values);
1501 }
1502 let value = self.parse_object(depth.saturating_add(1))?;
1503 let next_len =
1504 u64::try_from(values.len()).map_err(|_| ParseError::ArithmeticOverflow {
1505 context: "array length",
1506 })? + 1;
1507 if next_len > self.limits.max_array_len {
1508 return Err(ParseError::LimitExceeded {
1509 limit: "max_array_len",
1510 });
1511 }
1512 values.push(value);
1513 }
1514 }
1515
    /// Parses a `/Name` token, decoding `#xx` hex escapes.
    ///
    /// Stops at the first delimiter or whitespace byte. The decoded byte
    /// count is capped at `max_name_bytes` (checked after each byte).
    fn parse_name(&mut self) -> std::result::Result<PdfName, ParseError> {
        self.consume_bytes(b"/")?;
        let mut bytes = Vec::new();
        while let Some(byte) = self.peek_byte() {
            if is_delimiter(byte) || is_ws(byte) {
                break;
            }
            self.pos = self.pos.saturating_add(1);
            if byte == b'#' {
                // `#xx` escape: exactly two hex digits must follow.
                let high = self.next_byte().ok_or(ParseError::Malformed {
                    message: bounded("truncated name escape"),
                })?;
                let low = self.next_byte().ok_or(ParseError::Malformed {
                    message: bounded("truncated name escape"),
                })?;
                let decoded = decode_hex_pair(high, low).ok_or(ParseError::Malformed {
                    message: bounded("invalid name escape"),
                })?;
                bytes.push(decoded);
            } else {
                bytes.push(byte);
            }
            if bytes.len() > self.limits.max_name_bytes {
                return Err(ParseError::LimitExceeded {
                    limit: "max_name_bytes",
                });
            }
        }
        PdfName::new(bytes, &self.limits)
    }
1546
1547 fn parse_literal_string(&mut self) -> std::result::Result<PdfString, ParseError> {
1548 self.consume_bytes(b"(")?;
1549 let mut depth = 1_u32;
1550 let mut bytes = Vec::new();
1551 while let Some(byte) = self.next_byte() {
1552 match byte {
1553 b'\\' => {
1554 let Some(escaped) = self.next_byte() else {
1555 return Err(ParseError::Malformed {
1556 message: bounded("truncated string escape"),
1557 });
1558 };
1559 bytes.push(match escaped {
1560 b'n' => b'\n',
1561 b'r' => b'\r',
1562 b't' => b'\t',
1563 b'b' => 0x08,
1564 b'f' => 0x0c,
1565 other => other,
1566 });
1567 }
1568 b'(' => {
1569 depth = depth.checked_add(1).ok_or(ParseError::ArithmeticOverflow {
1570 context: "string nesting",
1571 })?;
1572 bytes.push(byte);
1573 }
1574 b')' => {
1575 depth = depth.saturating_sub(1);
1576 if depth == 0 {
1577 return PdfString::new(bytes, &self.limits);
1578 }
1579 bytes.push(byte);
1580 }
1581 other => bytes.push(other),
1582 }
1583 if bytes.len() > self.limits.max_string_bytes {
1584 return Err(ParseError::LimitExceeded {
1585 limit: "max_string_bytes",
1586 });
1587 }
1588 }
1589 Err(ParseError::Malformed {
1590 message: bounded("unterminated literal string"),
1591 })
1592 }
1593
1594 fn parse_hex_string(&mut self) -> std::result::Result<PdfString, ParseError> {
1595 self.consume_bytes(b"<")?;
1596 let mut nibbles = Vec::new();
1597 while let Some(byte) = self.peek_byte() {
1598 if byte == b'>' {
1599 self.pos = self.pos.saturating_add(1);
1600 break;
1601 }
1602 self.pos = self.pos.saturating_add(1);
1603 if !is_ws(byte) {
1604 nibbles.push(byte);
1605 }
1606 }
1607 if nibbles.len() % 2 != 0 {
1608 nibbles.push(b'0');
1609 }
1610 let mut bytes = Vec::with_capacity(nibbles.len() / 2);
1611 for pair in nibbles.chunks(2) {
1612 let high = pair.first().copied().ok_or(ParseError::Malformed {
1613 message: bounded("invalid hex string"),
1614 })?;
1615 let low = pair.get(1).copied().ok_or(ParseError::Malformed {
1616 message: bounded("invalid hex string"),
1617 })?;
1618 let decoded = decode_hex_pair(high, low).ok_or(ParseError::Malformed {
1619 message: bounded("invalid hex string"),
1620 })?;
1621 bytes.push(decoded);
1622 }
1623 PdfString::new(bytes, &self.limits)
1624 }
1625
    /// Parses a number, upgrading `N G R` triples to indirect
    /// references.
    ///
    /// Backtracking: the cursor is checkpointed before each speculative
    /// step (`first_start`, `after_first`, `second_start`); when the
    /// `G R` tail is absent or the object number is 0, the cursor is
    /// rewound to the first token, which is then re-parsed as a plain
    /// number.
    fn parse_number_or_reference(&mut self) -> std::result::Result<CosObject, ParseError> {
        let first_start = self.pos;
        let first = self.parse_number_token()?;
        if let NumberToken::Integer(first_integer) = first {
            let after_first = self.pos;
            if self.skip_required_ws().is_ok() {
                let second_start = self.pos;
                if let Some(generation) = self.parse_unsigned_u16()?
                    && self.skip_required_ws().is_ok()
                    && self.starts_with(b"R")
                {
                    self.consume_bytes(b"R")?;
                    // Object number 0 is not a valid reference target; the
                    // `None` case falls through to plain-number parsing.
                    if let Some(number) =
                        NonZeroU32::new(u32::try_from(first_integer).map_err(|_| {
                            ParseError::Malformed {
                                message: bounded("reference object number out of range"),
                            }
                        })?)
                    {
                        return Ok(CosObject::Reference(ObjectKey { number, generation }));
                    }
                }
                self.pos = second_start;
            }
            self.pos = after_first;
        }
        self.pos = first_start;
        match self.parse_number_token()? {
            NumberToken::Integer(value) => Ok(CosObject::Integer(value)),
            NumberToken::Real(value) => Ok(CosObject::Real(value)),
        }
    }
1658
1659 fn parse_number_token(&mut self) -> std::result::Result<NumberToken, ParseError> {
1660 self.skip_ws_and_comments();
1661 let start = self.pos;
1662 if matches!(self.peek_byte(), Some(b'+' | b'-')) {
1663 self.pos = self.pos.saturating_add(1);
1664 }
1665 let mut has_dot = false;
1666 while let Some(byte) = self.peek_byte() {
1667 if byte == b'.' {
1668 has_dot = true;
1669 self.pos = self.pos.saturating_add(1);
1670 } else if byte.is_ascii_digit() {
1671 self.pos = self.pos.saturating_add(1);
1672 } else {
1673 break;
1674 }
1675 }
1676 let token = self.slice(start, self.pos)?;
1677 let text = std::str::from_utf8(token.as_ref()).map_err(|_| ParseError::Malformed {
1678 message: bounded("number is not valid ASCII"),
1679 })?;
1680 if has_dot {
1681 let value = text.parse::<f64>().map_err(|_| ParseError::Malformed {
1682 message: bounded("invalid real number"),
1683 })?;
1684 if !value.is_finite() {
1685 return Err(ParseError::Malformed {
1686 message: bounded("non-finite real number"),
1687 });
1688 }
1689 Ok(NumberToken::Real(value))
1690 } else {
1691 let value = text.parse::<i64>().map_err(|_| ParseError::Malformed {
1692 message: bounded("invalid integer"),
1693 })?;
1694 Ok(NumberToken::Integer(value))
1695 }
1696 }
1697
    /// Parses a stream body once its dictionary is known and the cursor
    /// sits before the `stream` keyword.
    ///
    /// Data extent resolution: the declared `/Length` is tried first and
    /// accepted only when an `endstream` keyword follows it (after an
    /// optional EOL); otherwise the end is recovered by scanning forward
    /// for the first `endstream` within `max_stream_declared_bytes`.
    /// Records length and keyword-spacing facts either way.
    fn parse_stream(
        &mut self,
        key: ObjectKey,
        dictionary: Dictionary,
    ) -> std::result::Result<StreamObject, ParseError> {
        self.skip_ws_and_comments();
        self.consume_bytes(STREAM_MARKER)?;
        // Remember whether `stream` was followed by CRLF for the
        // compliance fact; a bare LF or CR is accepted too.
        let stream_keyword_crlf_compliant = self.starts_with(b"\r\n");
        if self.starts_with(b"\r\n") {
            self.consume_bytes(b"\r\n")?;
        } else if self.starts_with(b"\n") || self.starts_with(b"\r") {
            self.pos = self.pos.saturating_add(1);
        }
        let data_start = self.pos;
        // A negative or non-integer /Length is treated as absent.
        let declared_length = integer_from_dictionary(&dictionary, "Length")
            .and_then(|value| u64::try_from(value).ok());
        if let Some(declared) = declared_length
            && declared > self.limits.max_stream_declared_bytes
        {
            return Err(ParseError::LimitExceeded {
                limit: "max_stream_declared_bytes",
            });
        }

        let declared_end = declared_length
            .and_then(|length| usize::try_from(length).ok())
            .and_then(|length| data_start.checked_add(length));
        // Only trust /Length when `endstream` actually sits at the
        // declared end (optionally after one EOL sequence).
        let declared_keyword =
            declared_end.and_then(|offset| endstream_after_optional_eol(&self.source, offset));
        let (data_end, endstream_pos) =
            if let (Some(data_end), Some(keyword_pos)) = (declared_end, declared_keyword) {
                (data_end, keyword_pos)
            } else {
                // Recovery path: scan for the first `endstream`, bounded
                // by max_stream_declared_bytes.
                let max_scan =
                    usize::try_from(self.limits.max_stream_declared_bytes).map_err(|_| {
                        ParseError::LimitExceeded {
                            limit: "max_stream_declared_bytes",
                        }
                    })?;
                let scan_end = data_start
                    .checked_add(max_scan)
                    .map_or(self.source.len(), |end| end.min(self.source.len()));
                let keyword_pos = self
                    .source
                    .find_bytes(ENDSTREAM_MARKER, data_start, scan_end)
                    .ok_or(ParseError::Malformed {
                        message: bounded("missing endstream"),
                    })?;
                (
                    trim_eol_before(&self.source, data_start, keyword_pos),
                    keyword_pos,
                )
            };
        let endstream_keyword_eol_compliant = has_eol_before(&self.source, endstream_pos);
        let discovered_length =
            u64::try_from(data_end.saturating_sub(data_start)).map_err(|_| {
                ParseError::ArithmeticOverflow {
                    context: "stream length",
                }
            })?;
        self.pos = endstream_pos;
        self.consume_bytes(ENDSTREAM_MARKER)?;

        let filters = stream_filters(&dictionary);
        self.push_fact(ParseFact::Stream {
            object: key,
            fact: StreamFact::Length {
                declared: declared_length.unwrap_or(discovered_length),
                discovered: discovered_length,
            },
        });
        self.push_fact(ParseFact::Stream {
            object: key,
            fact: StreamFact::KeywordSpacing {
                stream_keyword_crlf_compliant,
                endstream_keyword_eol_compliant,
            },
        });

        let decode_params = stream_decode_params(&dictionary, filters.len());
        let (raw_source, raw_range) = self.source.stream_source(data_start, data_end)?;
        Ok(StreamObject {
            dictionary,
            raw_range,
            declared_length,
            discovered_length,
            filters,
            decode_params,
            raw_source,
            stream_keyword_crlf_compliant,
            endstream_keyword_eol_compliant,
        })
    }
1791
    /// Parses a classic `xref` table and the `trailer` dictionary that
    /// follows it.
    ///
    /// Entries are scanned for structural compliance (fixed-width
    /// fields, `n`/`f` markers, EOL discipline) rather than
    /// materialized: values are parsed and discarded, and a single
    /// compliance fact is recorded for the whole section. Scanning stops
    /// at `trailer`, `startxref`, or `%%EOF`.
    fn parse_xref_and_trailer(
        &mut self,
        trailers: &mut Vec<Trailer>,
    ) -> std::result::Result<(), ParseError> {
        let section_offset = self.offset()?;
        self.consume_bytes(b"xref")?;
        let mut compliant = true;
        let mut parsed_entries = 0_u64;
        loop {
            self.skip_ws_and_comments();
            if self.pos >= self.source.len() || self.starts_with(b"trailer") {
                break;
            }
            if self.starts_with(b"startxref") || self.starts_with(EOF_MARKER) {
                break;
            }
            // Sub-section header: `first-object count`. A malformed
            // header marks the section non-compliant and skips the line.
            let Some(_first_object) = self.parse_unsigned_u32()? else {
                compliant = false;
                self.skip_line();
                continue;
            };
            self.skip_required_ws()?;
            let Some(count) = self.parse_unsigned_u32()? else {
                compliant = false;
                self.skip_line();
                continue;
            };
            self.skip_line();
            // Entry lines: 10-digit offset, 5-digit generation, n/f marker.
            for _ in 0..count {
                let line_start = self.pos;
                let offset = self.parse_fixed_digits(10);
                self.skip_xref_spaces();
                let generation = self.parse_fixed_digits(5);
                self.skip_xref_spaces();
                let marker = self.next_byte();
                if offset.is_none()
                    || generation.is_none()
                    || !matches!(marker, Some(b'n' | b'f'))
                    || !line_had_eol(&self.source, line_start)
                {
                    compliant = false;
                }
                parsed_entries =
                    parsed_entries
                        .checked_add(1)
                        .ok_or(ParseError::ArithmeticOverflow {
                            context: "xref entries",
                        })?;
                if parsed_entries > self.limits.max_objects {
                    return Err(ParseError::LimitExceeded {
                        limit: "max_objects",
                    });
                }
                self.skip_line();
            }
        }
        self.push_fact(ParseFact::Xref {
            section: ObjectLocation {
                object: None,
                offset: Some(section_offset),
                path: None,
            },
            fact: if compliant {
                XrefFact::EolMarkersComply
            } else {
                XrefFact::MalformedClassic
            },
        });
        // Hunt for the trailer dictionary; unknown lines are skipped.
        loop {
            self.skip_ws_and_comments();
            if self.pos >= self.source.len() {
                return Ok(());
            }
            if self.starts_with(b"trailer") {
                self.consume_bytes(b"trailer")?;
                self.skip_ws_and_comments();
                let offset = self.offset()?;
                let dictionary = self.parse_dictionary(0)?;
                self.push_xref_chain_facts(None, offset, &dictionary)?;
                trailers.push(Trailer { dictionary, offset });
                return Ok(());
            }
            if self.starts_with(b"startxref") || self.starts_with(EOF_MARKER) {
                return Ok(());
            }
            self.skip_line();
        }
    }
1880
1881 fn parse_post_eof_fact(&mut self) -> std::result::Result<(), ParseError> {
1882 self.consume_bytes(EOF_MARKER)?;
1883 let remaining =
1884 self.source
1885 .len()
1886 .saturating_sub(self.pos)
1887 .saturating_sub(count_trailing_ws(
1888 self.slice(self.pos, self.source.len())?.as_ref(),
1889 ));
1890 if remaining > 0 {
1891 self.push_fact(ParseFact::PostEofData {
1892 bytes: u64::try_from(remaining).map_err(|_| ParseError::ArithmeticOverflow {
1893 context: "post eof bytes",
1894 })?,
1895 });
1896 }
1897 Ok(())
1898 }
1899
    /// Records xref chain facts (`/Prev`, `/XRefStm`) found in a trailer
    /// or xref-stream dictionary located at `offset`.
    ///
    /// Ordering note: the `Prev` fact is pushed before `XRefStm` is
    /// validated, so an invalid `XRefStm` errors out with the `Prev`
    /// fact already recorded.
    fn push_xref_chain_facts(
        &mut self,
        object: Option<ObjectKey>,
        offset: u64,
        dictionary: &Dictionary,
    ) -> std::result::Result<(), ParseError> {
        let section = ObjectLocation {
            object,
            offset: Some(offset),
            path: None,
        };
        if let Some(prev) = optional_non_negative_offset(dictionary, "Prev")? {
            self.push_fact(ParseFact::Xref {
                section: section.clone(),
                fact: XrefFact::PrevChain { offset: prev },
            });
        }
        if let Some(hybrid) = optional_non_negative_offset(dictionary, "XRefStm")? {
            self.push_fact(ParseFact::Xref {
                section,
                fact: XrefFact::HybridReference { offset: hybrid },
            });
        }
        Ok(())
    }
1925
1926 fn push_fact(&mut self, fact: ParseFact) {
1927 if self.parse_facts.len() >= self.limits.max_parse_facts {
1928 if !self
1929 .warnings
1930 .iter()
1931 .any(|warning| matches!(warning, ValidationWarning::ParseFactCapReached { .. }))
1932 {
1933 self.warnings.push(ValidationWarning::ParseFactCapReached {
1934 cap: self.limits.max_parse_facts,
1935 });
1936 }
1937 return;
1938 }
1939 self.parse_facts.push(fact);
1940 }
1941
1942 fn peek_stream_marker(&mut self) -> bool {
1943 let saved = self.pos;
1944 self.skip_ws_and_comments();
1945 let found = self.starts_with(STREAM_MARKER);
1946 self.pos = saved;
1947 found
1948 }
1949
    /// Parses an unsigned decimal token as `u32` (object numbers).
    fn parse_unsigned_u32(&mut self) -> std::result::Result<Option<u32>, ParseError> {
        self.parse_unsigned::<u32>()
    }

    /// Parses an unsigned decimal token as `u16` (generation numbers).
    fn parse_unsigned_u16(&mut self) -> std::result::Result<Option<u16>, ParseError> {
        self.parse_unsigned::<u16>()
    }
1957
1958 fn parse_fixed_digits(&mut self, len: usize) -> Option<u64> {
1959 let end = self.pos.checked_add(len)?;
1960 let slice = self.source.slice(self.pos, end).ok()?;
1961 if !slice.iter().all(u8::is_ascii_digit) {
1962 return None;
1963 }
1964 self.pos = end;
1965 std::str::from_utf8(slice.as_ref())
1966 .ok()?
1967 .parse::<u64>()
1968 .ok()
1969 }
1970
1971 fn parse_unsigned<T>(&mut self) -> std::result::Result<Option<T>, ParseError>
1972 where
1973 T: std::str::FromStr,
1974 {
1975 self.skip_ws_and_comments();
1976 let start = self.pos;
1977 while let Some(byte) = self.peek_byte() {
1978 if byte.is_ascii_digit() {
1979 self.pos = self.pos.saturating_add(1);
1980 } else {
1981 break;
1982 }
1983 }
1984 if start == self.pos {
1985 return Ok(None);
1986 }
1987 let token = self.slice(start, self.pos)?;
1988 let text = std::str::from_utf8(token.as_ref()).map_err(|_| ParseError::Malformed {
1989 message: bounded("unsigned integer is not ASCII"),
1990 })?;
1991 text.parse::<T>()
1992 .map(Some)
1993 .map_err(|_| ParseError::Malformed {
1994 message: bounded("unsigned integer out of range"),
1995 })
1996 }
1997
    /// Skips whitespace and `%`-comment lines.
    ///
    /// `%%EOF` is deliberately NOT skipped as a comment: the loop stops
    /// there so callers can observe the end-of-file marker.
    fn skip_ws_and_comments(&mut self) {
        loop {
            while self.peek_byte().is_some_and(is_ws) {
                self.pos = self.pos.saturating_add(1);
            }
            if self.starts_with(EOF_MARKER) {
                break;
            }
            if self.peek_byte() == Some(b'%') {
                self.skip_line();
            } else {
                break;
            }
        }
    }
2013
2014 fn skip_xref_spaces(&mut self) {
2015 while matches!(self.peek_byte(), Some(b'\t' | b' ')) {
2016 self.pos = self.pos.saturating_add(1);
2017 }
2018 }
2019
2020 fn skip_required_ws(&mut self) -> std::result::Result<(), ParseError> {
2021 let start = self.pos;
2022 while self.peek_byte().is_some_and(is_ws) {
2023 self.pos = self.pos.saturating_add(1);
2024 }
2025 if self.pos == start {
2026 return Err(ParseError::Malformed {
2027 message: bounded("expected whitespace"),
2028 });
2029 }
2030 Ok(())
2031 }
2032
2033 fn skip_line(&mut self) {
2034 while let Some(byte) = self.peek_byte() {
2035 self.pos = self.pos.saturating_add(1);
2036 if matches!(byte, b'\n' | b'\r') {
2037 break;
2038 }
2039 }
2040 }
2041
2042 fn consume_bytes(&mut self, expected: &[u8]) -> std::result::Result<(), ParseError> {
2043 if !self.starts_with(expected) {
2044 return Err(ParseError::Malformed {
2045 message: BoundedText::unchecked(format!("unexpected token at {}", self.pos)),
2046 });
2047 }
2048 self.pos = self
2049 .pos
2050 .checked_add(expected.len())
2051 .ok_or(ParseError::ArithmeticOverflow { context: "offset" })?;
2052 Ok(())
2053 }
2054
    /// True when the bytes at the cursor equal `expected`.
    fn starts_with(&self, expected: &[u8]) -> bool {
        self.source.starts_with(self.pos, expected)
    }

    /// Returns the byte at the cursor without advancing, if in bounds.
    fn peek_byte(&self) -> Option<u8> {
        self.source.byte(self.pos)
    }

    /// Returns the byte at the cursor and advances past it.
    fn next_byte(&mut self) -> Option<u8> {
        let byte = self.peek_byte()?;
        self.pos = self.pos.saturating_add(1);
        Some(byte)
    }

    /// Returns the bytes in `start..end` as a `Cow`.
    fn slice(&self, start: usize, end: usize) -> std::result::Result<Cow<'_, [u8]>, ParseError> {
        self.source.slice(start, end)
    }

    /// Current cursor position as a `u64` file offset.
    fn offset(&self) -> std::result::Result<u64, ParseError> {
        u64::try_from(self.pos).map_err(|_| ParseError::ArithmeticOverflow { context: "offset" })
    }
2076}
2077
/// Shorthand for building a `BoundedText` error message from a static
/// string literal.
fn bounded(value: &str) -> BoundedText {
    BoundedText::unchecked(value)
}
2081
/// PDF whitespace set: NUL, tab, LF, FF, CR, space.
fn is_ws(byte: u8) -> bool {
    b"\x00\t\n\x0c\r ".contains(&byte)
}
2085
/// PDF delimiter characters that terminate tokens such as names.
fn is_delimiter(byte: u8) -> bool {
    b"()<>[]{}/%".contains(&byte)
}
2092
2093fn decode_hex_pair(high: u8, low: u8) -> Option<u8> {
2094 let high = decode_hex_digit(high)?;
2095 let low = decode_hex_digit(low)?;
2096 Some(high.saturating_mul(16).saturating_add(low))
2097}
2098
/// Maps an ASCII hex digit (either case) to its value 0–15.
fn decode_hex_digit(byte: u8) -> Option<u8> {
    if byte.is_ascii_digit() {
        Some(byte.saturating_sub(b'0'))
    } else if byte.is_ascii_hexdigit() {
        // Only a-f / A-F remain once digits are excluded.
        Some(
            byte.to_ascii_lowercase()
                .saturating_sub(b'a')
                .saturating_add(10),
        )
    } else {
        None
    }
}
2107
/// Finds the first occurrence of `needle` in `haystack` at or after
/// `start`, returning its absolute index.
///
/// Returns `None` when `start` is out of bounds or no match exists. An
/// empty needle trivially matches at `start` (when in bounds); the
/// previous implementation panicked there because `slice::windows`
/// requires a non-zero window size.
fn find_bytes(haystack: &[u8], needle: &[u8], start: usize) -> Option<usize> {
    if needle.is_empty() {
        // Guard: `windows(0)` panics; treat the empty needle as an
        // immediate match at `start` when it is in bounds.
        return (start <= haystack.len()).then_some(start);
    }
    haystack
        .get(start..)?
        .windows(needle.len())
        .position(|window| window == needle)
        .and_then(|relative| start.checked_add(relative))
}
2115
2116fn has_eol_before(source: &SourceStorage, pos: usize) -> bool {
2117 matches!(
2118 pos.checked_sub(1).and_then(|index| source.byte(index)),
2119 Some(b'\n' | b'\r')
2120 )
2121}
2122
/// Reports whether an EOL byte exists at or after `line_start`.
///
/// NOTE(review): `find_bytes` returns an absolute index, so when it
/// matches, `relative >= line_start` always holds — the comparison
/// appears vacuous. As written, this returns true whenever any `\n`
/// (or, failing that, any `\r`) occurs anywhere between `line_start`
/// and the end of the source, not specifically on the current entry's
/// line — confirm this matches the intended xref EOL-compliance check.
fn line_had_eol(source: &SourceStorage, line_start: usize) -> bool {
    let Some(relative) = source.find_bytes(b"\n", line_start, source.len()) else {
        return source.find_bytes(b"\r", line_start, source.len()).is_some();
    };
    relative >= line_start
}
2129
2130fn endstream_after_optional_eol(source: &SourceStorage, offset: usize) -> Option<usize> {
2131 if source.starts_with(offset, ENDSTREAM_MARKER) {
2132 return Some(offset);
2133 }
2134 if source.starts_with(offset, b"\r\nendstream") {
2135 return offset.checked_add(2);
2136 }
2137 if source.starts_with(offset, b"\nendstream") || source.starts_with(offset, b"\rendstream") {
2138 return offset.checked_add(1);
2139 }
2140 None
2141}
2142
2143fn trim_eol_before(source: &SourceStorage, data_start: usize, keyword_pos: usize) -> usize {
2144 if keyword_pos >= data_start.saturating_add(2)
2145 && source
2146 .slice(keyword_pos.saturating_sub(2), keyword_pos)
2147 .is_ok_and(|bytes| bytes.as_ref() == b"\r\n")
2148 {
2149 return keyword_pos.saturating_sub(2);
2150 }
2151 if keyword_pos > data_start
2152 && matches!(
2153 source.byte(keyword_pos.saturating_sub(1)),
2154 Some(b'\n' | b'\r')
2155 )
2156 {
2157 return keyword_pos.saturating_sub(1);
2158 }
2159 keyword_pos
2160}
2161
2162fn count_trailing_ws(bytes: &[u8]) -> usize {
2163 bytes.iter().rev().take_while(|byte| is_ws(**byte)).count()
2164}
2165
2166fn integer_from_dictionary(dictionary: &Dictionary, key: &str) -> Option<i64> {
2167 match dictionary.get(key) {
2168 Some(CosObject::Integer(value)) => Some(*value),
2169 _ => None,
2170 }
2171}
2172
2173fn non_negative_usize_from_dictionary(
2174 dictionary: &Dictionary,
2175 key: &'static str,
2176) -> std::result::Result<usize, ParseError> {
2177 let value = non_negative_u64_from_dictionary(dictionary, key)?;
2178 usize::try_from(value).map_err(|_| ParseError::Malformed {
2179 message: BoundedText::unchecked(format!("invalid object stream {key}")),
2180 })
2181}
2182
2183fn non_negative_u64_from_dictionary(
2184 dictionary: &Dictionary,
2185 key: &'static str,
2186) -> std::result::Result<u64, ParseError> {
2187 let Some(value) = integer_from_dictionary(dictionary, key) else {
2188 return Err(ParseError::Malformed {
2189 message: BoundedText::unchecked(format!("missing integer dictionary key {key}")),
2190 });
2191 };
2192 u64::try_from(value).map_err(|_| ParseError::Malformed {
2193 message: BoundedText::unchecked(format!("invalid non-negative dictionary key {key}")),
2194 })
2195}
2196
2197fn optional_non_negative_offset(
2198 dictionary: &Dictionary,
2199 key: &'static str,
2200) -> std::result::Result<Option<u64>, ParseError> {
2201 let Some(value) = integer_from_dictionary(dictionary, key) else {
2202 return Ok(None);
2203 };
2204 u64::try_from(value)
2205 .map(Some)
2206 .map_err(|_| ParseError::Malformed {
2207 message: BoundedText::unchecked(format!("invalid xref offset dictionary key {key}")),
2208 })
2209}
2210
2211fn xref_widths(dictionary: &Dictionary) -> std::result::Result<[usize; 3], ParseError> {
2212 let Some(CosObject::Array(values)) = dictionary.get("W") else {
2213 return Err(ParseError::Malformed {
2214 message: bounded("xref stream missing W array"),
2215 });
2216 };
2217 if values.len() != 3 {
2218 return Err(ParseError::Malformed {
2219 message: bounded("xref stream W array must have three entries"),
2220 });
2221 }
2222 let mut widths = [0_usize; 3];
2223 for (index, value) in values.iter().enumerate() {
2224 let CosObject::Integer(width) = value else {
2225 return Err(ParseError::Malformed {
2226 message: bounded("xref stream W entry must be integer"),
2227 });
2228 };
2229 let width = usize::try_from(*width).map_err(|_| ParseError::Malformed {
2230 message: bounded("xref stream W entry must be non-negative"),
2231 })?;
2232 if width > 8 {
2233 return Err(ParseError::Malformed {
2234 message: bounded("xref stream W entry exceeds supported width"),
2235 });
2236 }
2237 let Some(slot) = widths.get_mut(index) else {
2238 return Err(ParseError::Malformed {
2239 message: bounded("xref stream W index out of bounds"),
2240 });
2241 };
2242 *slot = width;
2243 }
2244 Ok(widths)
2245}
2246
2247fn xref_indexes(
2248 dictionary: &Dictionary,
2249 size: u64,
2250) -> std::result::Result<Vec<(u64, u64)>, ParseError> {
2251 let Some(index_object) = dictionary.get("Index") else {
2252 return Ok(vec![(0, size)]);
2253 };
2254 let CosObject::Array(values) = index_object else {
2255 return Err(ParseError::Malformed {
2256 message: bounded("xref stream Index must be an array"),
2257 });
2258 };
2259 if values.len() % 2 != 0 {
2260 return Err(ParseError::Malformed {
2261 message: bounded("xref stream Index must contain pairs"),
2262 });
2263 }
2264 let mut indexes = Vec::with_capacity(values.len() / 2);
2265 for pair in values.chunks(2) {
2266 let first = integer_value(pair.first(), "xref stream Index first")?;
2267 let count = integer_value(pair.get(1), "xref stream Index count")?;
2268 let first = u64::try_from(first).map_err(|_| ParseError::Malformed {
2269 message: bounded("xref stream Index first must be non-negative"),
2270 })?;
2271 let count = u64::try_from(count).map_err(|_| ParseError::Malformed {
2272 message: bounded("xref stream Index count must be non-negative"),
2273 })?;
2274 first
2275 .checked_add(count)
2276 .ok_or(ParseError::ArithmeticOverflow {
2277 context: "xref stream Index",
2278 })?;
2279 indexes.push((first, count));
2280 }
2281 Ok(indexes)
2282}
2283
2284fn integer_value(
2285 value: Option<&CosObject>,
2286 context: &'static str,
2287) -> std::result::Result<i64, ParseError> {
2288 match value {
2289 Some(CosObject::Integer(value)) => Ok(*value),
2290 _ => Err(ParseError::Malformed {
2291 message: BoundedText::unchecked(format!("{context} must be integer")),
2292 }),
2293 }
2294}
2295
2296fn read_be_uint(
2297 bytes: &[u8],
2298 pos: &mut usize,
2299 width: usize,
2300) -> std::result::Result<u64, ParseError> {
2301 let end = pos
2302 .checked_add(width)
2303 .ok_or(ParseError::ArithmeticOverflow {
2304 context: "xref stream field",
2305 })?;
2306 let field = bytes.get(*pos..end).ok_or(ParseError::Malformed {
2307 message: bounded("xref stream field out of bounds"),
2308 })?;
2309 let mut value = 0_u64;
2310 for byte in field {
2311 value = value
2312 .checked_mul(256)
2313 .and_then(|current| current.checked_add(u64::from(*byte)))
2314 .ok_or(ParseError::ArithmeticOverflow {
2315 context: "xref stream field",
2316 })?;
2317 }
2318 *pos = end;
2319 Ok(value)
2320}
2321
2322fn object_ref_from_dictionary(dictionary: &Dictionary, key: &str) -> Option<ObjectKey> {
2323 match dictionary.get(key) {
2324 Some(CosObject::Reference(value)) => Some(*value),
2325 _ => None,
2326 }
2327}
2328
2329fn stream_filters(dictionary: &Dictionary) -> Vec<PdfName> {
2330 match dictionary.get("Filter") {
2331 Some(CosObject::Name(_)) if is_identity_crypt_filter(dictionary, 0) => Vec::new(),
2332 Some(CosObject::Name(name)) => vec![name.clone()],
2333 Some(CosObject::Array(values)) => values
2334 .iter()
2335 .enumerate()
2336 .filter_map(|(index, value)| match value {
2337 CosObject::Name(name) if is_identity_crypt_filter(dictionary, index) => None,
2338 CosObject::Name(name) => Some(name.clone()),
2339 _ => None,
2340 })
2341 .collect(),
2342 _ => Vec::new(),
2343 }
2344}
2345
2346fn is_identity_crypt_filter(dictionary: &Dictionary, filter_index: usize) -> bool {
2347 let Some(filter) = dictionary.get("Filter") else {
2348 return false;
2349 };
2350 let filter_is_crypt = match filter {
2351 CosObject::Name(name) => name.matches("Crypt"),
2352 CosObject::Array(filters) => matches!(
2353 filters.get(filter_index),
2354 Some(CosObject::Name(name)) if name.matches("Crypt")
2355 ),
2356 _ => false,
2357 };
2358 if !filter_is_crypt {
2359 return false;
2360 }
2361 match dictionary.get("DecodeParms") {
2362 Some(CosObject::Dictionary(params)) => matches!(
2363 params.get("Name"),
2364 Some(CosObject::Name(name)) if name.matches("Identity")
2365 ),
2366 Some(CosObject::Array(params)) => matches!(
2367 params.get(filter_index),
2368 Some(CosObject::Dictionary(params)) if matches!(
2369 params.get("Name"),
2370 Some(CosObject::Name(name)) if name.matches("Identity")
2371 )
2372 ),
2373 _ => false,
2374 }
2375}
2376
2377fn stream_decode_params(dictionary: &Dictionary, filter_count: usize) -> Vec<DecodeParams> {
2378 match dictionary.get("DecodeParms") {
2379 Some(CosObject::Dictionary(params)) => vec![decode_params_from_dictionary(params)],
2380 Some(CosObject::Array(values)) => values
2381 .iter()
2382 .take(filter_count)
2383 .map(|value| match value {
2384 CosObject::Dictionary(params) => decode_params_from_dictionary(params),
2385 _ => DecodeParams::default(),
2386 })
2387 .collect(),
2388 _ => vec![DecodeParams::default(); filter_count],
2389 }
2390}
2391
2392fn decode_params_from_dictionary(dictionary: &Dictionary) -> DecodeParams {
2393 DecodeParams {
2394 predictor: integer_from_dictionary(dictionary, "Predictor")
2395 .and_then(|value| u16::try_from(value).ok())
2396 .unwrap_or(1),
2397 colors: integer_from_dictionary(dictionary, "Colors")
2398 .and_then(|value| u16::try_from(value).ok())
2399 .unwrap_or(1),
2400 bits_per_component: integer_from_dictionary(dictionary, "BitsPerComponent")
2401 .and_then(|value| u16::try_from(value).ok())
2402 .unwrap_or(8),
2403 columns: integer_from_dictionary(dictionary, "Columns")
2404 .and_then(|value| u32::try_from(value).ok())
2405 .unwrap_or(1),
2406 early_change: integer_from_dictionary(dictionary, "EarlyChange")
2407 .and_then(|value| u8::try_from(value).ok())
2408 .unwrap_or(1),
2409 crypt_filter_name: match dictionary.get("Name") {
2410 Some(CosObject::Name(name)) => Some(name.clone()),
2411 _ => None,
2412 },
2413 }
2414}
2415
2416fn checked_u64_len(len: usize, context: &'static str) -> std::result::Result<u64, ParseError> {
2417 u64::try_from(len).map_err(|_| ParseError::ArithmeticOverflow { context })
2418}
2419
2420fn enforce_decoded_len(len: u64, max_decode_bytes: u64) -> std::result::Result<(), ParseError> {
2421 if len > max_decode_bytes {
2422 return Err(ParseError::LimitExceeded {
2423 limit: "max_stream_decode_bytes",
2424 });
2425 }
2426 Ok(())
2427}
2428
2429fn filter_identifier(filter: &PdfName) -> std::result::Result<Identifier, ParseError> {
2430 Identifier::new(String::from_utf8_lossy(filter.as_bytes()).into_owned()).map_err(|_| {
2431 ParseError::Malformed {
2432 message: bounded("stream filter name is not a valid identifier"),
2433 }
2434 })
2435}
2436
2437#[derive(Debug)]
2438struct FlateDecoder;
2439
2440impl StreamDecoder for FlateDecoder {
2441 fn decode(
2442 &self,
2443 input: &[u8],
2444 params: &DecodeParams,
2445 limits: &ResourceLimits,
2446 ) -> std::result::Result<DecoderOutput, ParseError> {
2447 let decoded = decode_flate_limited(input, limits.max_stream_decode_bytes)?;
2448 Ok(DecoderOutput {
2449 bytes: apply_predictor(decoded, params, limits.max_stream_decode_bytes)?,
2450 metadata_mode: false,
2451 })
2452 }
2453}
2454
2455#[derive(Debug)]
2456struct AsciiHexDecoder;
2457
2458impl StreamDecoder for AsciiHexDecoder {
2459 fn decode(
2460 &self,
2461 input: &[u8],
2462 _params: &DecodeParams,
2463 limits: &ResourceLimits,
2464 ) -> std::result::Result<DecoderOutput, ParseError> {
2465 let mut output = Vec::new();
2466 let mut high: Option<u8> = None;
2467 for byte in input {
2468 if is_ws(*byte) {
2469 continue;
2470 }
2471 if *byte == b'>' {
2472 break;
2473 }
2474 let Some(nibble) = decode_hex_digit(*byte) else {
2475 return Err(ParseError::StreamDecode {
2476 message: bounded("invalid ASCIIHex digit"),
2477 });
2478 };
2479 if let Some(previous) = high.take() {
2480 push_limited_byte(
2481 &mut output,
2482 previous.saturating_mul(16).saturating_add(nibble),
2483 limits.max_stream_decode_bytes,
2484 )?;
2485 } else {
2486 high = Some(nibble);
2487 }
2488 }
2489 if let Some(previous) = high {
2490 push_limited_byte(
2491 &mut output,
2492 previous.saturating_mul(16),
2493 limits.max_stream_decode_bytes,
2494 )?;
2495 }
2496 Ok(DecoderOutput {
2497 bytes: output,
2498 metadata_mode: false,
2499 })
2500 }
2501}
2502
/// `/ASCII85Decode`: groups of five base-85 digits expand to four
/// bytes; `z` abbreviates four zero bytes and `~>` terminates the data.
#[derive(Debug)]
struct Ascii85Decoder;

impl StreamDecoder for Ascii85Decoder {
    /// Decodes the full ASCII85 body, with output growth capped via
    /// `extend_limited` / `append_ascii85_group`.
    fn decode(
        &self,
        input: &[u8],
        _params: &DecodeParams,
        limits: &ResourceLimits,
    ) -> std::result::Result<DecoderOutput, ParseError> {
        let mut output = Vec::new();
        // Pending base-85 digit values (0..=84) of the current group.
        let mut group = Vec::with_capacity(5);
        let mut iter = input.iter().copied().peekable();
        while let Some(byte) = iter.next() {
            if is_ws(byte) {
                continue;
            }
            // `~>` marks end of data; a lone `~` falls through to the
            // digit-range check below and is rejected there.
            if byte == b'~' && iter.peek() == Some(&b'>') {
                break;
            }
            if byte == b'z' {
                // `z` is only legal on a group boundary.
                if !group.is_empty() {
                    return Err(ParseError::StreamDecode {
                        message: bounded("ASCII85 z inside a partial group"),
                    });
                }
                extend_limited(&mut output, &[0, 0, 0, 0], limits.max_stream_decode_bytes)?;
                continue;
            }
            if !(b'!'..=b'u').contains(&byte) {
                return Err(ParseError::StreamDecode {
                    message: bounded("invalid ASCII85 digit"),
                });
            }
            // Digits are stored as offsets from `!`.
            group.push(byte.saturating_sub(b'!'));
            if group.len() == 5 {
                append_ascii85_group(&mut output, &group, 4, limits.max_stream_decode_bytes)?;
                group.clear();
            }
        }
        if !group.is_empty() {
            // A trailing partial group of n digits encodes n - 1 bytes;
            // the missing digits are padded with `u` (84) before decoding.
            let output_bytes = group.len().saturating_sub(1);
            while group.len() < 5 {
                group.push(84);
            }
            append_ascii85_group(
                &mut output,
                &group,
                output_bytes,
                limits.max_stream_decode_bytes,
            )?;
        }
        Ok(DecoderOutput {
            bytes: output,
            metadata_mode: false,
        })
    }
}
2561
2562#[derive(Debug)]
2563struct RunLengthDecoder;
2564
2565impl StreamDecoder for RunLengthDecoder {
2566 fn decode(
2567 &self,
2568 input: &[u8],
2569 _params: &DecodeParams,
2570 limits: &ResourceLimits,
2571 ) -> std::result::Result<DecoderOutput, ParseError> {
2572 let mut output = Vec::new();
2573 let mut pos = 0_usize;
2574 while let Some(length) = input.get(pos).copied() {
2575 pos = pos.saturating_add(1);
2576 match length {
2577 128 => break,
2578 0..=127 => {
2579 let count = usize::from(length).saturating_add(1);
2580 let end = pos
2581 .checked_add(count)
2582 .ok_or(ParseError::ArithmeticOverflow {
2583 context: "RunLength literal",
2584 })?;
2585 let literal = input.get(pos..end).ok_or(ParseError::StreamDecode {
2586 message: bounded("RunLength literal exceeds input"),
2587 })?;
2588 extend_limited(&mut output, literal, limits.max_stream_decode_bytes)?;
2589 pos = end;
2590 }
2591 _ => {
2592 let Some(value) = input.get(pos).copied() else {
2593 return Err(ParseError::StreamDecode {
2594 message: bounded("RunLength repeat missing byte"),
2595 });
2596 };
2597 pos = pos.saturating_add(1);
2598 let count = 257_usize.saturating_sub(usize::from(length));
2599 for _ in 0..count {
2600 push_limited_byte(&mut output, value, limits.max_stream_decode_bytes)?;
2601 }
2602 }
2603 }
2604 }
2605 Ok(DecoderOutput {
2606 bytes: output,
2607 metadata_mode: false,
2608 })
2609 }
2610}
2611
2612#[derive(Debug)]
2613struct LzwDecoder;
2614
2615impl StreamDecoder for LzwDecoder {
2616 fn decode(
2617 &self,
2618 input: &[u8],
2619 params: &DecodeParams,
2620 limits: &ResourceLimits,
2621 ) -> std::result::Result<DecoderOutput, ParseError> {
2622 let decoded = decode_lzw(input, params.early_change, limits.max_stream_decode_bytes)?;
2623 Ok(DecoderOutput {
2624 bytes: apply_predictor(decoded, params, limits.max_stream_decode_bytes)?,
2625 metadata_mode: false,
2626 })
2627 }
2628}
2629
2630#[derive(Debug)]
2631struct CryptDecoder;
2632
2633impl StreamDecoder for CryptDecoder {
2634 fn decode(
2635 &self,
2636 input: &[u8],
2637 params: &DecodeParams,
2638 _limits: &ResourceLimits,
2639 ) -> std::result::Result<DecoderOutput, ParseError> {
2640 if params
2641 .crypt_filter_name
2642 .as_ref()
2643 .is_none_or(|name| name.matches("Identity"))
2644 {
2645 return Ok(DecoderOutput {
2646 bytes: input.to_vec(),
2647 metadata_mode: false,
2648 });
2649 }
2650 Err(ParseError::UnsupportedFilter {
2651 filter: BoundedText::unchecked("Crypt"),
2652 })
2653 }
2654}
2655
2656#[derive(Debug)]
2657struct MetadataModeDecoder;
2658
2659impl StreamDecoder for MetadataModeDecoder {
2660 fn decode(
2661 &self,
2662 input: &[u8],
2663 _params: &DecodeParams,
2664 _limits: &ResourceLimits,
2665 ) -> std::result::Result<DecoderOutput, ParseError> {
2666 Ok(DecoderOutput {
2667 bytes: input.to_vec(),
2668 metadata_mode: true,
2669 })
2670 }
2671}
2672
2673fn append_ascii85_group(
2674 output: &mut Vec<u8>,
2675 group: &[u8],
2676 output_bytes: usize,
2677 max_decode_bytes: u64,
2678) -> std::result::Result<(), ParseError> {
2679 let mut value = 0_u32;
2680 for digit in group {
2681 value = value
2682 .checked_mul(85)
2683 .and_then(|current| current.checked_add(u32::from(*digit)))
2684 .ok_or(ParseError::StreamDecode {
2685 message: bounded("ASCII85 group overflows"),
2686 })?;
2687 }
2688 let bytes = value.to_be_bytes();
2689 let Some(slice) = bytes.get(..output_bytes) else {
2690 return Err(ParseError::StreamDecode {
2691 message: bounded("invalid ASCII85 group length"),
2692 });
2693 };
2694 extend_limited(output, slice, max_decode_bytes)
2695}
2696
2697fn push_limited_byte(
2698 output: &mut Vec<u8>,
2699 byte: u8,
2700 max_decode_bytes: u64,
2701) -> std::result::Result<(), ParseError> {
2702 let next_len = checked_u64_len(output.len(), "decoded stream length")?
2703 .checked_add(1)
2704 .ok_or(ParseError::ArithmeticOverflow {
2705 context: "decoded stream length",
2706 })?;
2707 enforce_decoded_len(next_len, max_decode_bytes)?;
2708 output.push(byte);
2709 Ok(())
2710}
2711
2712fn extend_limited(
2713 output: &mut Vec<u8>,
2714 bytes: &[u8],
2715 max_decode_bytes: u64,
2716) -> std::result::Result<(), ParseError> {
2717 let next_len = checked_u64_len(output.len(), "decoded stream length")?
2718 .checked_add(checked_u64_len(bytes.len(), "decoded stream length")?)
2719 .ok_or(ParseError::ArithmeticOverflow {
2720 context: "decoded stream length",
2721 })?;
2722 enforce_decoded_len(next_len, max_decode_bytes)?;
2723 output.extend_from_slice(bytes);
2724 Ok(())
2725}
2726
2727fn apply_predictor(
2728 bytes: Vec<u8>,
2729 params: &DecodeParams,
2730 max_decode_bytes: u64,
2731) -> std::result::Result<Vec<u8>, ParseError> {
2732 match params.predictor {
2733 1 => Ok(bytes),
2734 2 => apply_tiff_predictor(bytes, params, max_decode_bytes),
2735 10..=15 => apply_png_predictor(&bytes, params, max_decode_bytes),
2736 _ => Err(ParseError::StreamDecode {
2737 message: bounded("unsupported predictor"),
2738 }),
2739 }
2740}
2741
2742fn predictor_geometry(params: &DecodeParams) -> std::result::Result<(usize, usize), ParseError> {
2743 if params.colors == 0 || params.bits_per_component == 0 || params.columns == 0 {
2744 return Err(ParseError::StreamDecode {
2745 message: bounded("invalid predictor geometry"),
2746 });
2747 }
2748 let bits_per_row = u64::from(params.colors)
2749 .checked_mul(u64::from(params.bits_per_component))
2750 .and_then(|bits| bits.checked_mul(u64::from(params.columns)))
2751 .ok_or(ParseError::ArithmeticOverflow {
2752 context: "predictor row size",
2753 })?;
2754 let row_bytes = bits_per_row
2755 .checked_add(7)
2756 .ok_or(ParseError::ArithmeticOverflow {
2757 context: "predictor row size",
2758 })?
2759 / 8;
2760 let bytes_per_pixel_bits = u64::from(params.colors)
2761 .checked_mul(u64::from(params.bits_per_component))
2762 .ok_or(ParseError::ArithmeticOverflow {
2763 context: "predictor pixel size",
2764 })?;
2765 let bytes_per_pixel =
2766 bytes_per_pixel_bits
2767 .checked_add(7)
2768 .ok_or(ParseError::ArithmeticOverflow {
2769 context: "predictor pixel size",
2770 })?
2771 / 8;
2772 Ok((
2773 usize::try_from(row_bytes).map_err(|_| ParseError::LimitExceeded {
2774 limit: "max_stream_decode_bytes",
2775 })?,
2776 usize::try_from(bytes_per_pixel.max(1)).map_err(|_| ParseError::LimitExceeded {
2777 limit: "max_stream_decode_bytes",
2778 })?,
2779 ))
2780}
2781
2782fn apply_tiff_predictor(
2783 mut bytes: Vec<u8>,
2784 params: &DecodeParams,
2785 max_decode_bytes: u64,
2786) -> std::result::Result<Vec<u8>, ParseError> {
2787 enforce_decoded_len(
2788 checked_u64_len(bytes.len(), "predictor output length")?,
2789 max_decode_bytes,
2790 )?;
2791 let (row_bytes, bytes_per_pixel) = predictor_geometry(params)?;
2792 if row_bytes == 0 || !bytes.len().is_multiple_of(row_bytes) {
2793 return Err(ParseError::StreamDecode {
2794 message: bounded("TIFF predictor row length mismatch"),
2795 });
2796 }
2797 for row in bytes.chunks_mut(row_bytes) {
2798 for index in bytes_per_pixel..row.len() {
2799 let left = row
2800 .get(index.saturating_sub(bytes_per_pixel))
2801 .copied()
2802 .ok_or(ParseError::StreamDecode {
2803 message: bounded("TIFF predictor left byte missing"),
2804 })?;
2805 let Some(byte) = row.get_mut(index) else {
2806 return Err(ParseError::StreamDecode {
2807 message: bounded("TIFF predictor byte missing"),
2808 });
2809 };
2810 *byte = byte.wrapping_add(left);
2811 }
2812 }
2813 Ok(bytes)
2814}
2815
/// Reverses PNG row filters (predictors 10..=15).
///
/// Each encoded row is one filter byte followed by `row_bytes` of
/// filtered data; rows are reconstructed against previously decoded
/// rows in `output`. The output size is checked against
/// `max_decode_bytes` before allocation.
fn apply_png_predictor(
    bytes: &[u8],
    params: &DecodeParams,
    max_decode_bytes: u64,
) -> std::result::Result<Vec<u8>, ParseError> {
    let (row_bytes, bytes_per_pixel) = predictor_geometry(params)?;
    // Encoded rows carry a leading filter-type byte.
    let encoded_row = row_bytes
        .checked_add(1)
        .ok_or(ParseError::ArithmeticOverflow {
            context: "PNG predictor row size",
        })?;
    if encoded_row == 0 || !bytes.len().is_multiple_of(encoded_row) {
        return Err(ParseError::StreamDecode {
            message: bounded("PNG predictor row length mismatch"),
        });
    }
    let row_count = bytes.len() / encoded_row;
    let output_capacity =
        row_count
            .checked_mul(row_bytes)
            .ok_or(ParseError::ArithmeticOverflow {
                context: "PNG predictor output length",
            })?;
    enforce_decoded_len(
        checked_u64_len(output_capacity, "PNG predictor output length")?,
        max_decode_bytes,
    )?;
    let mut output = vec![0_u8; output_capacity];
    for row_index in 0..row_count {
        let encoded_start =
            row_index
                .checked_mul(encoded_row)
                .ok_or(ParseError::ArithmeticOverflow {
                    context: "PNG predictor row offset",
                })?;
        // First byte of the encoded row selects the filter (0..=4).
        let filter = *bytes.get(encoded_start).ok_or(ParseError::StreamDecode {
            message: bounded("PNG predictor filter byte missing"),
        })?;
        let encoded = bytes
            .get(encoded_start.saturating_add(1)..encoded_start.saturating_add(encoded_row))
            .ok_or(ParseError::StreamDecode {
                message: bounded("PNG predictor row missing"),
            })?;
        let output_start =
            row_index
                .checked_mul(row_bytes)
                .ok_or(ParseError::ArithmeticOverflow {
                    context: "PNG predictor output row offset",
                })?;
        for index in 0..row_bytes {
            let raw = *encoded.get(index).ok_or(ParseError::StreamDecode {
                message: bounded("PNG predictor source byte missing"),
            })?;
            // Neighbor lookups: the guards (`index >= bytes_per_pixel`,
            // `row_index > 0`) keep the unchecked subtractions below
            // from underflowing; out-of-window neighbors default to 0.
            let left = if index >= bytes_per_pixel {
                output
                    .get(output_start + index - bytes_per_pixel)
                    .copied()
                    .ok_or(ParseError::StreamDecode {
                        message: bounded("PNG predictor left byte missing"),
                    })?
            } else {
                0
            };
            let up = if row_index > 0 {
                output
                    .get(output_start + index - row_bytes)
                    .copied()
                    .ok_or(ParseError::StreamDecode {
                        message: bounded("PNG predictor upper byte missing"),
                    })?
            } else {
                0
            };
            let up_left = if row_index > 0 && index >= bytes_per_pixel {
                output
                    .get(output_start + index - row_bytes - bytes_per_pixel)
                    .copied()
                    .ok_or(ParseError::StreamDecode {
                        message: bounded("PNG predictor upper-left byte missing"),
                    })?
            } else {
                0
            };
            let value = png_predictor_value(filter, raw, left, up, up_left)?;
            let Some(target) = output.get_mut(output_start + index) else {
                return Err(ParseError::StreamDecode {
                    message: bounded("PNG predictor target byte missing"),
                });
            };
            *target = value;
        }
    }
    Ok(output)
}
2910
2911fn png_predictor_value(
2912 filter: u8,
2913 raw: u8,
2914 left: u8,
2915 up: u8,
2916 up_left: u8,
2917) -> std::result::Result<u8, ParseError> {
2918 match filter {
2919 0 => Ok(raw),
2920 1 => Ok(raw.wrapping_add(left)),
2921 2 => Ok(raw.wrapping_add(up)),
2922 3 => {
2923 let average =
2924 u8::try_from(u16::midpoint(u16::from(left), u16::from(up))).map_err(|_| {
2925 ParseError::StreamDecode {
2926 message: bounded("PNG predictor average byte out of range"),
2927 }
2928 })?;
2929 Ok(raw.wrapping_add(average))
2930 }
2931 4 => Ok(raw.wrapping_add(paeth_predictor(left, up, up_left))),
2932 _ => Err(ParseError::StreamDecode {
2933 message: bounded("invalid PNG predictor filter"),
2934 }),
2935 }
2936}
2937
/// PNG Paeth predictor: picks whichever of left, up, or up-left is
/// closest to `left + up - up_left`, breaking ties in that order.
fn paeth_predictor(left: u8, up: u8, up_left: u8) -> u8 {
    let (a, b, c) = (i16::from(left), i16::from(up), i16::from(up_left));
    let estimate = a + b - c;
    let distance_a = (estimate - a).abs();
    let distance_b = (estimate - b).abs();
    let distance_c = (estimate - c).abs();
    // Each candidate came from a u8, so the conversion back is
    // lossless; the 0 fallback is unreachable in practice.
    if distance_a <= distance_b && distance_a <= distance_c {
        u8::try_from(a).unwrap_or(0)
    } else if distance_b <= distance_c {
        u8::try_from(b).unwrap_or(0)
    } else {
        u8::try_from(c).unwrap_or(0)
    }
}
2954
/// Decodes PDF LZW data: MSB-first codes growing from 9 to 12 bits,
/// clear code 256, end-of-data code 257.
///
/// `early_change` mirrors the PDF `EarlyChange` parameter — any
/// non-zero value widens the code size one table entry early (values
/// above 1 are clamped to 1). Output growth is capped by
/// `max_decode_bytes` via `extend_limited`.
fn decode_lzw(
    input: &[u8],
    early_change: u8,
    max_decode_bytes: u64,
) -> std::result::Result<Vec<u8>, ParseError> {
    let mut reader = MsbBitReader::new(input);
    let mut dictionary = initial_lzw_dictionary();
    let mut code_bits = 9_u8;
    let mut next_code = 258_u16;
    // Previously emitted entry, needed for the KwKwK special case and
    // for building new table entries.
    let mut previous: Option<Vec<u8>> = None;
    let mut output = Vec::new();
    while let Some(code) = reader.read_bits(code_bits)? {
        match code {
            256 => {
                // Clear code: reset the table and code width.
                dictionary = initial_lzw_dictionary();
                code_bits = 9;
                next_code = 258;
                previous = None;
            }
            257 => break,
            _ => {
                let entry = if let Some(value) = dictionary.get(usize::from(code)).cloned() {
                    value
                } else if code == next_code {
                    // KwKwK case: the code refers to the entry about to
                    // be created — previous entry plus its first byte.
                    let mut value = previous.clone().ok_or(ParseError::StreamDecode {
                        message: bounded("LZW missing previous entry"),
                    })?;
                    let first = *value.first().ok_or(ParseError::StreamDecode {
                        message: bounded("LZW empty previous entry"),
                    })?;
                    value.push(first);
                    value
                } else {
                    return Err(ParseError::StreamDecode {
                        message: bounded("invalid LZW code"),
                    });
                };
                extend_limited(&mut output, &entry, max_decode_bytes)?;
                if let Some(previous_entry) = previous {
                    // New table entry: previous sequence + first byte of
                    // the current one. The table is capped at 4096 codes.
                    let mut new_entry = previous_entry;
                    let first = *entry.first().ok_or(ParseError::StreamDecode {
                        message: bounded("LZW empty entry"),
                    })?;
                    new_entry.push(first);
                    if dictionary.len() < 4096 {
                        dictionary.push(new_entry);
                        next_code = next_code.saturating_add(1);
                        // With EarlyChange the width grows one entry
                        // before the table actually fills the width.
                        let threshold =
                            (1_u16 << code_bits).saturating_sub(u16::from(early_change.min(1)));
                        if next_code >= threshold && code_bits < 12 {
                            code_bits = code_bits.saturating_add(1);
                        }
                    }
                }
                previous = Some(entry);
            }
        }
    }
    Ok(output)
}
3015
/// Builds the initial LZW table: codes 0..=255 map to their single
/// byte, followed by two empty placeholders for the reserved clear
/// (256) and end-of-data (257) codes.
fn initial_lzw_dictionary() -> Vec<Vec<u8>> {
    let mut dictionary = Vec::with_capacity(258);
    dictionary.extend((0_u8..=255).map(|byte| vec![byte]));
    // Reserved codes never expand to data; keep them empty.
    dictionary.extend([Vec::new(), Vec::new()]);
    dictionary
}
3025
3026#[derive(Clone, Copy, Debug)]
3027struct MsbBitReader<'a> {
3028 input: &'a [u8],
3029 bit_pos: usize,
3030}
3031
3032impl<'a> MsbBitReader<'a> {
3033 fn new(input: &'a [u8]) -> Self {
3034 Self { input, bit_pos: 0 }
3035 }
3036
3037 fn read_bits(&mut self, bits: u8) -> std::result::Result<Option<u16>, ParseError> {
3038 let remaining_bits = self
3039 .input
3040 .len()
3041 .checked_mul(8)
3042 .and_then(|total| total.checked_sub(self.bit_pos))
3043 .ok_or(ParseError::ArithmeticOverflow {
3044 context: "LZW bit position",
3045 })?;
3046 if remaining_bits < usize::from(bits) {
3047 return Ok(None);
3048 }
3049 let mut value = 0_u16;
3050 for _ in 0..bits {
3051 let byte_index = self.bit_pos / 8;
3052 let bit_index = 7_usize.saturating_sub(self.bit_pos % 8);
3053 let byte = self
3054 .input
3055 .get(byte_index)
3056 .copied()
3057 .ok_or(ParseError::StreamDecode {
3058 message: bounded("LZW bit read out of bounds"),
3059 })?;
3060 value = value.checked_shl(1).ok_or(ParseError::ArithmeticOverflow {
3061 context: "LZW code",
3062 })? | u16::from((byte >> bit_index) & 1);
3063 self.bit_pos = self.bit_pos.saturating_add(1);
3064 }
3065 Ok(Some(value))
3066 }
3067}
3068
3069fn encryption_reference(trailers: &[Trailer]) -> Option<&CosObject> {
3070 trailers
3071 .iter()
3072 .rev()
3073 .find_map(|trailer| trailer.dictionary.get("Encrypt"))
3074}
3075
/// Produces the `ParseFact::Encryption` summary for the document.
///
/// With the `decrypt` feature the full summary comes from the
/// `encryption` module; without it, only the security handler name is
/// recovered from the newest trailer's `/Encrypt` entry (following one
/// level of indirection through the object store).
///
/// NOTE(review): the non-`decrypt` path reports `encrypted: true`
/// unconditionally — presumably callers invoke this only after finding
/// an `/Encrypt` entry; confirm at the call site.
fn encryption_fact(objects: &ObjectStore, trailers: &[Trailer]) -> ParseFact {
    #[cfg(feature = "decrypt")]
    {
        encryption::encryption_summary(objects, trailers).into_fact(false)
    }
    #[cfg(not(feature = "decrypt"))]
    {
        // Newest trailer wins; /Encrypt may be inline or a reference.
        let handler = trailers.iter().rev().find_map(|trailer| {
            let encrypt = trailer.dictionary.get("Encrypt")?;
            match encrypt {
                CosObject::Dictionary(dictionary) => encryption_handler(dictionary),
                CosObject::Reference(key) => objects
                    .get(key)
                    .and_then(|object| object.object.as_dictionary())
                    .and_then(encryption_handler),
                _ => None,
            }
        });
        // Without the decrypt feature no version/revision/algorithm
        // details are extracted.
        ParseFact::Encryption {
            encrypted: true,
            handler,
            version: None,
            revision: None,
            algorithm: None,
            decrypted: false,
        }
    }
}
3104
3105#[cfg(not(feature = "decrypt"))]
3106fn encryption_handler(dictionary: &Dictionary) -> Option<Identifier> {
3107 let Some(CosObject::Name(filter)) = dictionary.get("Filter") else {
3108 return None;
3109 };
3110 Identifier::new(String::from_utf8_lossy(filter.as_bytes()).into_owned()).ok()
3111}
3112
/// Inflates `bytes`, trying zlib framing first and falling back to raw
/// deflate, with output capped at `max_decode_bytes`.
#[cfg(feature = "flate")]
fn decode_flate_limited(
    bytes: &[u8],
    max_decode_bytes: u64,
) -> std::result::Result<Vec<u8>, ParseError> {
    use flate2::read::{DeflateDecoder, ZlibDecoder};

    let zlib_attempt = read_limited(
        ZlibDecoder::new(std::io::Cursor::new(bytes)),
        max_decode_bytes,
    );
    match zlib_attempt {
        Ok(decoded) => Ok(decoded),
        // Some producers emit raw deflate without the zlib header.
        Err(_) => read_limited(
            DeflateDecoder::new(std::io::Cursor::new(bytes)),
            max_decode_bytes,
        ),
    }
}
3131
3132#[cfg(not(feature = "flate"))]
3133fn decode_flate_limited(
3134 _bytes: &[u8],
3135 _max_decode_bytes: u64,
3136) -> std::result::Result<Vec<u8>, ParseError> {
3137 Err(ParseError::UnsupportedFilter {
3138 filter: BoundedText::unchecked("FlateDecode"),
3139 })
3140}
3141
/// Drains `reader` into a vector, failing with `LimitExceeded` as soon
/// as the output would pass `max_decode_bytes` and surfacing I/O
/// errors as `StreamDecode`.
#[cfg(feature = "flate")]
fn read_limited(
    mut reader: impl Read,
    max_decode_bytes: u64,
) -> std::result::Result<Vec<u8>, ParseError> {
    let mut collected = Vec::new();
    let mut chunk = [0_u8; 8192];
    loop {
        let count = match reader.read(&mut chunk) {
            Ok(count) => count,
            Err(source) => {
                return Err(ParseError::StreamDecode {
                    message: BoundedText::unchecked(source.to_string()),
                });
            }
        };
        if count == 0 {
            break;
        }
        // Check the budget before copying the chunk in.
        let next_len = u64::try_from(collected.len())
            .ok()
            .zip(u64::try_from(count).ok())
            .and_then(|(len, read)| len.checked_add(read))
            .ok_or(ParseError::ArithmeticOverflow {
                context: "decoded stream length",
            })?;
        if next_len > max_decode_bytes {
            return Err(ParseError::LimitExceeded {
                limit: "max_stream_decode_bytes",
            });
        }
        let filled = chunk.get(..count).ok_or(ParseError::Malformed {
            message: bounded("decode buffer range out of bounds"),
        })?;
        collected.extend_from_slice(filled);
    }
    Ok(collected)
}
3179
3180#[cfg(test)]
3181mod tests {
3182 use std::{error::Error, io::Cursor, num::NonZeroU32};
3183
3184 use proptest::prelude::*;
3185 use rstest::rstest;
3186
3187 use super::{CosObject, ParsedDocument, Parser, StreamObject};
3188 use crate::{ParseFact, ResourceLimits, StreamFact};
3189
3190 fn minimal_pdf() -> Vec<u8> {
3191 br"%PDF-1.7
31921 0 obj
3193<< /Type /Catalog >>
3194endobj
3195xref
31960 2
31970000000000 65535 f
31980000000009 00000 n
3199trailer
3200<< /Root 1 0 R /Size 2 >>
3201startxref
320245
3203%%EOF
3204"
3205 .to_vec()
3206 }
3207
3208 fn xref_stream_data(entries: &[(u8, u32, u16)]) -> Vec<u8> {
3209 let mut data = Vec::with_capacity(entries.len() * 7);
3210 for (entry_type, field_two, field_three) in entries {
3211 data.push(*entry_type);
3212 data.extend(field_two.to_be_bytes());
3213 data.extend(field_three.to_be_bytes());
3214 }
3215 data
3216 }
3217
3218 #[test]
3219 fn test_should_parse_header_and_catalog_from_m0_fixture() -> crate::Result<()> {
3220 let document = Parser::default().parse(Cursor::new(minimal_pdf()))?;
3221
3222 assert_eq!(document.version.major, 1);
3223 assert_eq!(document.version.minor, 7);
3224 assert!(document.catalog.is_some());
3225 assert_eq!(document.objects.len(), 1);
3226 Ok(())
3227 }
3228
3229 #[test]
3230 fn test_should_record_leading_header_bytes() -> crate::Result<()> {
3231 let mut bytes = b"junk".to_vec();
3232 bytes.extend(minimal_pdf());
3233
3234 let document = Parser::default().parse(Cursor::new(bytes))?;
3235
3236 assert!(document.parse_facts.iter().any(|fact| {
3237 matches!(
3238 fact,
3239 ParseFact::Header {
3240 offset: 4,
3241 had_leading_bytes: true,
3242 ..
3243 }
3244 )
3245 }));
3246 Ok(())
3247 }
3248
3249 #[test]
3250 fn test_should_warn_on_malformed_recoverable_header() -> crate::Result<()> {
3251 let bytes = br"%PDF-x.y
32521 0 obj
3253<< /Type /Catalog >>
3254endobj
3255trailer
3256<< /Root 1 0 R >>
3257%%EOF
3258";
3259
3260 let document = Parser::default().parse(Cursor::new(bytes))?;
3261
3262 assert!(!document.warnings.is_empty());
3263 assert_eq!(document.version, crate::PdfVersion { major: 1, minor: 4 });
3264 Ok(())
3265 }
3266
3267 #[test]
3268 fn test_should_parse_names_strings_numbers_arrays_and_dictionaries() -> crate::Result<()> {
3269 let bytes = br"%PDF-1.7
32701 0 obj
3271<< /Type /Catalog /Name /A#20B /Title (hello\nworld) /Nums [1 -2 3.5 true false null] >>
3272endobj
3273trailer
3274<< /Root 1 0 R >>
3275%%EOF
3276";
3277
3278 let document = Parser::default().parse(Cursor::new(bytes))?;
3279 let object =
3280 document
3281 .objects
3282 .values()
3283 .next()
3284 .ok_or_else(|| crate::ParseError::MissingObject {
3285 message: crate::BoundedText::unchecked("missing object"),
3286 })?;
3287 let dictionary =
3288 object
3289 .object
3290 .as_dictionary()
3291 .ok_or_else(|| crate::ParseError::Malformed {
3292 message: crate::BoundedText::unchecked("missing dictionary"),
3293 })?;
3294
3295 assert!(
3296 matches!(dictionary.get("Nums"), Some(CosObject::Array(values)) if values.len() == 6)
3297 );
3298 Ok(())
3299 }
3300
3301 #[test]
3302 fn test_should_scan_bad_stream_length_and_emit_facts() -> crate::Result<()> {
3303 let bytes = br"%PDF-1.7
33041 0 obj
3305<< /Length 99 >>
3306stream
3307abc
3308endstream
3309endobj
3310trailer
3311<< /Root 1 0 R >>
3312%%EOF
3313";
3314
3315 let document = Parser::default().parse(Cursor::new(bytes))?;
3316
3317 assert!(document.parse_facts.iter().any(|fact| {
3318 matches!(
3319 fact,
3320 ParseFact::Stream {
3321 fact: StreamFact::Length {
3322 declared: 99,
3323 discovered: 3
3324 },
3325 ..
3326 }
3327 )
3328 }));
3329 Ok(())
3330 }
3331
3332 #[test]
3333 fn test_should_treat_identity_crypt_stream_filter_as_passthrough() -> crate::Result<()> {
3334 let bytes = br"%PDF-1.7
33351 0 obj
3336<< /Type /Catalog >>
3337endobj
33382 0 obj
3339<< /Length 3 /Filter /Crypt /DecodeParms << /Name /Identity >> >>
3340stream
3341abc
3342endstream
3343endobj
3344trailer
3345<< /Root 1 0 R >>
3346%%EOF
3347";
3348
3349 let document = Parser::default().parse(Cursor::new(bytes))?;
3350 let object = document
3351 .objects
3352 .get(&crate::ObjectKey::new(
3353 NonZeroU32::new(2).unwrap_or(NonZeroU32::MIN),
3354 0,
3355 ))
3356 .ok_or_else(|| crate::ParseError::MissingObject {
3357 message: crate::BoundedText::unchecked("missing stream object"),
3358 })?;
3359 let CosObject::Stream(stream) = &object.object else {
3360 return Err(crate::ParseError::Malformed {
3361 message: crate::BoundedText::unchecked("missing stream"),
3362 }
3363 .into());
3364 };
3365
3366 assert_eq!(stream.decoded_bytes(&ResourceLimits::default())?, b"abc");
3367 Ok(())
3368 }
3369
3370 #[test]
3371 fn test_should_parse_xref_stream_as_trailer_source() -> crate::Result<()> {
3372 let xref_data = xref_stream_data(&[(0, 0, 65_535), (1, 9, 0), (1, 45, 0)]);
3373 let mut bytes = br"%PDF-1.7
33741 0 obj
3375<< /Type /Catalog >>
3376endobj
33772 0 obj
3378<< /Type /XRef /Size 3 /W [1 4 2] /Index [0 3] /Length "
3379 .to_vec();
3380 bytes.extend(xref_data.len().to_string().as_bytes());
3381 bytes.extend(
3382 br" /Root 1 0 R >>
3383stream
3384",
3385 );
3386 bytes.extend(xref_data);
3387 bytes.extend(
3388 br"
3389endstream
3390endobj
3391%%EOF
3392",
3393 );
3394
3395 let document = Parser::default().parse(Cursor::new(bytes))?;
3396
3397 assert!(document.catalog.is_some());
3398 assert!(document.parse_facts.iter().any(|fact| {
3399 matches!(
3400 fact,
3401 ParseFact::Xref {
3402 fact: crate::XrefFact::XrefStreamParsed { .. },
3403 ..
3404 }
3405 )
3406 }));
3407 Ok(())
3408 }
3409
3410 #[test]
3411 fn test_should_parse_flate_xref_stream_with_compressed_entry() -> Result<(), Box<dyn Error>> {
3412 use std::io::Write;
3413
3414 use flate2::{Compression, write::ZlibEncoder};
3415
3416 let xref_data = xref_stream_data(&[(2, 2, 0), (1, 3, 0)]);
3417 let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
3418 encoder.write_all(&xref_data)?;
3419 let compressed = encoder.finish()?;
3420 let mut bytes = br"%PDF-1.7
34211 0 obj
3422<< /Type /Catalog >>
3423endobj
34242 0 obj
3425<< /Type /ObjStm /N 0 /First 0 /Length 0 >>
3426stream
3427endstream
3428endobj
34293 0 obj
3430<< /Type /XRef /Size 3 /W [1 4 2] /Index [1 2] /Filter /FlateDecode /Length "
3431 .to_vec();
3432 bytes.extend(compressed.len().to_string().as_bytes());
3433 bytes.extend(
3434 br" /Root 1 0 R >>
3435stream
3436",
3437 );
3438 bytes.extend(compressed);
3439 bytes.extend(
3440 br"
3441endstream
3442endobj
3443%%EOF
3444",
3445 );
3446
3447 let document = Parser::default().parse(Cursor::new(bytes))?;
3448
3449 assert!(document.parse_facts.iter().any(|fact| {
3450 matches!(
3451 fact,
3452 ParseFact::Xref {
3453 fact: crate::XrefFact::XrefStreamParsed {
3454 entries: 2,
3455 compressed_entries: 1
3456 },
3457 ..
3458 }
3459 )
3460 }));
3461 Ok(())
3462 }
3463
    #[test]
    fn test_should_emit_xref_prev_and_hybrid_reference_facts() -> crate::Result<()> {
        // Two classic xref tables back to back. The second trailer points at
        // the first table via /Prev 40 and at an xref stream via /XRefStm 120
        // (hybrid-reference layout); both offsets should surface as facts.
        let bytes = br"%PDF-1.7
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
trailer
<< /Size 2 /Root 1 0 R >>
xref
0 1
0000000000 65535 f
trailer
<< /Size 2 /Root 1 0 R /Prev 40 /XRefStm 120 >>
%%EOF
";

        let document = Parser::default().parse(Cursor::new(bytes))?;

        // /Prev 40 must be recorded as a previous-table chain link.
        assert!(document.parse_facts.iter().any(|fact| {
            matches!(
                fact,
                ParseFact::Xref {
                    fact: crate::XrefFact::PrevChain { offset: 40 },
                    ..
                }
            )
        }));
        // /XRefStm 120 must be recorded as a hybrid-reference pointer.
        assert!(document.parse_facts.iter().any(|fact| {
            matches!(
                fact,
                ParseFact::Xref {
                    fact: crate::XrefFact::HybridReference { offset: 120 },
                    ..
                }
            )
        }));
        Ok(())
    }
3506
    #[test]
    fn test_should_expand_unfiltered_object_stream() -> crate::Result<()> {
        // ObjStm payload: the "1 0" prefix is the (object number, offset)
        // pair table, and /First 4 below marks where the object data begins.
        let object_stream = b"1 0 << /Type /Catalog >>";
        // Uncompressed object stream (obj 2) plus an empty xref stream (obj 3)
        // that carries /Root. /Length for obj 2 is spliced in below.
        let mut bytes = br"%PDF-1.7
2 0 obj
<< /Type /ObjStm /N 1 /First 4 /Length "
        .to_vec();
        bytes.extend(object_stream.len().to_string().as_bytes());
        bytes.extend(
            br" >>
stream
",
        );
        bytes.extend(object_stream);
        bytes.extend(
            br"
endstream
endobj
3 0 obj
<< /Type /XRef /Size 4 /W [1 1 1] /Index [0 0] /Length 0 /Root 1 0 R >>
stream
endstream
endobj
%%EOF
",
        );

        let document = Parser::default().parse(Cursor::new(bytes))?;

        // The catalog only exists inside the object stream, so finding it
        // proves the stream was expanded; 3 objects = obj 2, obj 3, plus the
        // embedded obj 1.
        assert!(document.catalog.is_some());
        assert_eq!(document.objects.len(), 3);
        assert!(document.parse_facts.iter().any(|fact| {
            matches!(
                fact,
                ParseFact::Xref {
                    fact: crate::XrefFact::ObjectStreamParsed,
                    ..
                }
            )
        }));
        Ok(())
    }
3549
    #[test]
    fn test_should_decode_flate_object_stream_with_limit() -> Result<(), Box<dyn Error>> {
        use std::io::Write;

        use flate2::{Compression, write::ZlibEncoder};

        // 24-byte ObjStm payload: "1 0" pair table + the catalog object.
        let object_stream = b"1 0 << /Type /Catalog >>";
        // Deflate it so the object stream declares /Filter /FlateDecode.
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(object_stream)?;
        let compressed = encoder.finish()?;
        // Compressed object stream (obj 2) plus an empty xref stream (obj 3)
        // carrying /Root; /Length is spliced in from the compressed size.
        let mut bytes = br"%PDF-1.7
2 0 obj
<< /Type /ObjStm /N 1 /First 4 /Filter /FlateDecode /Length "
        .to_vec();
        bytes.extend(compressed.len().to_string().as_bytes());
        bytes.extend(
            br" >>
stream
",
        );
        bytes.extend(compressed);
        bytes.extend(
            br"
endstream
endobj
3 0 obj
<< /Type /XRef /Size 4 /W [1 1 1] /Index [0 0] /Length 0 /Root 1 0 R >>
stream
endstream
endobj
%%EOF
",
        );

        let document = Parser::default().parse(Cursor::new(bytes))?;

        assert!(document.catalog.is_some());
        // bytes: 24 == object_stream.len(), i.e. the fully inflated payload.
        assert!(document.parse_facts.iter().any(|fact| {
            matches!(
                fact,
                ParseFact::Stream {
                    fact: StreamFact::Decoded { bytes: 24 },
                    ..
                }
            )
        }));
        Ok(())
    }
3598
3599 #[test]
3600 fn test_should_decode_asciihex_ascii85_runlength_and_lzw_streams() -> crate::Result<()> {
3601 let cases: [(&str, Vec<u8>, &[u8]); 4] = [
3602 ("ASCIIHexDecode", b"61 62>".to_vec(), b"ab"),
3603 ("ASCII85Decode", b"9jqo~>".to_vec(), b"Man"),
3604 (
3605 "RunLengthDecode",
3606 vec![2, b'a', b'b', b'c', 254, b'x', 128],
3607 b"abcxxx",
3608 ),
3609 (
3610 "LZWDecode",
3611 pack_lzw_codes(&[(256, 9), (97, 9), (98, 9), (97, 9), (257, 9)]),
3612 b"aba",
3613 ),
3614 ];
3615 for (filter, encoded, expected) in cases {
3616 let document =
3617 Parser::default().parse(Cursor::new(single_stream_pdf(filter, "", &encoded)))?;
3618 let stream = parsed_stream(&document)?;
3619
3620 assert_eq!(stream.decoded_bytes(&ResourceLimits::default())?, expected);
3621 }
3622 Ok(())
3623 }
3624
3625 #[test]
3626 fn test_should_apply_flate_png_predictor() -> Result<(), Box<dyn Error>> {
3627 use std::io::Write;
3628
3629 use flate2::{Compression, write::ZlibEncoder};
3630
3631 let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
3632 encoder.write_all(&[1, b'a', 1, 1])?;
3633 let compressed = encoder.finish()?;
3634 let document = Parser::default().parse(Cursor::new(single_stream_pdf(
3635 "FlateDecode",
3636 "/DecodeParms << /Predictor 12 /Columns 3 >>",
3637 &compressed,
3638 )))?;
3639 let stream = parsed_stream(&document)?;
3640
3641 assert_eq!(stream.decoded_bytes(&ResourceLimits::default())?, b"abc");
3642 Ok(())
3643 }
3644
    #[test]
    fn test_should_emit_per_filter_decode_facts_for_object_stream() -> crate::Result<()> {
        // 24-byte ObjStm payload, hex-encoded so the object stream goes
        // through ASCIIHexDecode and should emit a per-filter fact.
        let object_stream = b"1 0 << /Type /Catalog >>";
        let encoded = hex_bytes(object_stream);
        // Hex-encoded object stream (obj 2) plus an empty xref stream (obj 3)
        // carrying /Root; /Length is spliced in from the encoded size.
        let mut bytes = br"%PDF-1.7
2 0 obj
<< /Type /ObjStm /N 1 /First 4 /Filter /ASCIIHexDecode /Length "
        .to_vec();
        bytes.extend(encoded.len().to_string().as_bytes());
        bytes.extend(
            br" >>
stream
",
        );
        bytes.extend(encoded);
        bytes.extend(
            br"
endstream
endobj
3 0 obj
<< /Type /XRef /Size 4 /W [1 1 1] /Index [0 0] /Length 0 /Root 1 0 R >>
stream
endstream
endobj
%%EOF
",
        );

        let document = Parser::default().parse(Cursor::new(bytes))?;

        assert!(document.catalog.is_some());
        // output_bytes: 24 == object_stream.len(), the decoded payload size
        // attributed specifically to the ASCIIHexDecode filter stage.
        assert!(document.parse_facts.iter().any(|fact| {
            matches!(
                fact,
                ParseFact::Stream {
                    fact: StreamFact::FilterDecoded {
                        filter,
                        output_bytes: 24,
                        ..
                    },
                    ..
                } if filter.as_str() == "ASCIIHexDecode"
            )
        }));
        Ok(())
    }
3691
3692 #[test]
3693 fn test_should_preserve_image_filter_bytes_in_metadata_mode() -> crate::Result<()> {
3694 let document =
3695 Parser::default().parse(Cursor::new(single_stream_pdf("DCTDecode", "", b"image")))?;
3696 let stream = parsed_stream(&document)?;
3697
3698 assert_eq!(stream.decoded_bytes(&ResourceLimits::default())?, b"image");
3699 Ok(())
3700 }
3701
3702 #[test]
3703 fn test_should_parse_with_spill_file_source_storage_above_threshold() -> crate::Result<()> {
3704 let limits = ResourceLimits {
3705 memory_source_threshold_bytes: 0,
3706 ..ResourceLimits::default()
3707 };
3708 let document = Parser::new(limits.clone()).parse(Cursor::new(single_stream_pdf(
3709 "ASCIIHexDecode",
3710 "",
3711 b"61 62>",
3712 )))?;
3713 let stream = parsed_stream(&document)?;
3714
3715 assert_eq!(stream.decoded_bytes(&limits)?, b"ab");
3716 Ok(())
3717 }
3718
3719 #[test]
3720 fn test_should_enforce_name_limit() {
3721 let limits = ResourceLimits {
3722 max_name_bytes: 2,
3723 ..ResourceLimits::default()
3724 };
3725 let bytes = br"%PDF-1.7
37261 0 obj
3727<< /Long /Name >>
3728endobj
3729%%EOF
3730";
3731
3732 let result = Parser::new(limits).parse(Cursor::new(bytes));
3733
3734 assert!(result.is_err());
3735 }
3736
3737 #[rstest]
3738 #[case(b"/A", "A")]
3739 #[case(b"/A#20B", "A B")]
3740 fn test_should_parse_name_escape_matrix(
3741 #[case] token: &[u8],
3742 #[case] expected: &str,
3743 ) -> crate::Result<()> {
3744 let mut bytes = b"%PDF-1.7\n1 0 obj\n<< /Name ".to_vec();
3745 bytes.extend(token);
3746 bytes.extend(b" >>\nendobj\n%%EOF\n");
3747
3748 let document = Parser::default().parse(Cursor::new(bytes))?;
3749 let object =
3750 document
3751 .objects
3752 .values()
3753 .next()
3754 .ok_or_else(|| crate::ParseError::MissingObject {
3755 message: crate::BoundedText::unchecked("missing object"),
3756 })?;
3757 let dictionary =
3758 object
3759 .object
3760 .as_dictionary()
3761 .ok_or_else(|| crate::ParseError::Malformed {
3762 message: crate::BoundedText::unchecked("missing dictionary"),
3763 })?;
3764
3765 assert!(
3766 matches!(dictionary.get("Name"), Some(CosObject::Name(name)) if name.as_bytes() == expected.as_bytes())
3767 );
3768 Ok(())
3769 }
3770
    proptest! {
        // Fuzz smoke test: whatever bytes come in, the parser must return a
        // Result instead of panicking; the Result itself is deliberately
        // discarded because errors are an acceptable outcome here.
        #[test]
        fn test_should_not_panic_on_arbitrary_bytes(input in proptest::collection::vec(any::<u8>(), 0..512)) {
            let _ = Parser::default().parse(Cursor::new(input));
        }
    }
3777
3778 fn single_stream_pdf(filter: &str, params: &str, encoded: &[u8]) -> Vec<u8> {
3779 let mut bytes = format!(
3780 "%PDF-1.7\n1 0 obj\n<< /Length {} /Filter /{filter} {params} >>\nstream\n",
3781 encoded.len()
3782 )
3783 .into_bytes();
3784 bytes.extend(encoded);
3785 bytes.extend(b"\nendstream\nendobj\n%%EOF\n");
3786 bytes
3787 }
3788
3789 fn parsed_stream(document: &ParsedDocument) -> crate::Result<&StreamObject> {
3790 let object =
3791 document
3792 .objects
3793 .values()
3794 .next()
3795 .ok_or_else(|| crate::ParseError::MissingObject {
3796 message: crate::BoundedText::unchecked("missing stream object"),
3797 })?;
3798 let CosObject::Stream(stream) = &object.object else {
3799 return Err(crate::ParseError::Malformed {
3800 message: crate::BoundedText::unchecked("missing stream"),
3801 }
3802 .into());
3803 };
3804 Ok(stream)
3805 }
3806
3807 fn pack_lzw_codes(codes: &[(u16, u8)]) -> Vec<u8> {
3808 let mut output = Vec::new();
3809 let mut current = 0_u8;
3810 let mut used = 0_u8;
3811 for (code, bits) in codes {
3812 for bit in (0..*bits).rev() {
3813 current <<= 1;
3814 current |= u8::try_from((code >> bit) & 1).unwrap_or(0);
3815 used = used.saturating_add(1);
3816 if used == 8 {
3817 output.push(current);
3818 current = 0;
3819 used = 0;
3820 }
3821 }
3822 }
3823 if used != 0 {
3824 current <<= 8_u8.saturating_sub(used);
3825 output.push(current);
3826 }
3827 output
3828 }
3829
3830 fn hex_bytes(bytes: &[u8]) -> Vec<u8> {
3831 const HEX: &[u8; 16] = b"0123456789abcdef";
3832 let mut output = Vec::with_capacity(bytes.len().saturating_mul(2).saturating_add(1));
3833 for byte in bytes {
3834 output.push(HEX.get(usize::from(byte >> 4)).copied().unwrap_or(b'0'));
3835 output.push(HEX.get(usize::from(byte & 0x0f)).copied().unwrap_or(b'0'));
3836 }
3837 output.push(b'>');
3838 output
3839 }
3840}