// pdfv_core/parser.rs
1//! Tolerant byte-level PDF parser used by the validator.
2
3#[cfg(feature = "decrypt")]
4mod encryption;
5
6use std::{
7    borrow::Cow,
8    collections::BTreeMap,
9    fmt,
10    io::{Read, Seek, SeekFrom, Write},
11    num::NonZeroU32,
12    sync::{Arc, OnceLock},
13};
14
15use serde::{Deserialize, Serialize};
16use tempfile::NamedTempFile;
17
18#[cfg(not(feature = "decrypt"))]
19use crate::Identifier;
20use crate::{
21    BoundedText, ConfigError, Identifier, ObjectKey, ObjectLocation, ParseError, ParseFact,
22    PasswordSecret, PdfVersion, ResourceLimits, Result, StreamFact, ValidationWarning, XrefFact,
23};
24
/// Byte pattern that opens a PDF header (`%PDF-`).
const HEADER_MARKER: &[u8] = b"%PDF-";
/// End-of-document marker (`%%EOF`).
const EOF_MARKER: &[u8] = b"%%EOF";
/// Keyword that introduces raw stream data.
const STREAM_MARKER: &[u8] = b"stream";
/// Keyword that terminates raw stream data.
const ENDSTREAM_MARKER: &[u8] = b"endstream";
/// Keyword that terminates an indirect object body.
const ENDOBJ_MARKER: &[u8] = b"endobj";
/// Chunk size used when scanning a spill file for a byte pattern.
const SPILL_SEARCH_CHUNK_BYTES: usize = 8192;
31
/// Synchronous file handle backing [`SourceStorage::SpillFile`].
#[allow(
    clippy::disallowed_types,
    reason = "parser source storage is synchronous Read+Seek; async file handles do not fit this \
              API"
)]
type SpillFileHandle = std::fs::File;
38
/// Seekable PDF source accepted by [`Parser`].
pub trait PdfSource: Read + Seek {}

/// Blanket implementation: any `Read + Seek` value is a valid PDF source.
impl<T> PdfSource for T where T: Read + Seek {}
43
/// M0 PDF parser.
#[derive(Clone, Debug)]
pub struct Parser {
    // Resource ceilings (file size, memory spill threshold, …) enforced while
    // parsing.
    limits: ResourceLimits,
    // Optional custom stream decoders forwarded to the byte parser; `None`
    // presumably selects the default registry — confirm in `ByteParser`.
    decoder_registry: Option<DecoderRegistry>,
}
50
51impl Parser {
52    /// Creates a parser with the supplied resource limits.
53    #[must_use]
54    pub fn new(limits: ResourceLimits) -> Self {
55        Self {
56            limits,
57            decoder_registry: None,
58        }
59    }
60
61    /// Creates a parser with explicit resource limits and stream decoders.
62    #[must_use]
63    pub fn with_decoder_registry(
64        limits: ResourceLimits,
65        decoder_registry: DecoderRegistry,
66    ) -> Self {
67        Self {
68            limits,
69            decoder_registry: Some(decoder_registry),
70        }
71    }
72
73    /// Parses a seekable PDF source into a tolerant document model.
74    ///
75    /// # Errors
76    ///
77    /// Returns [`crate::PdfvError`] when input cannot be read, exceeds a resource
78    /// limit, or is too malformed for M0 recovery.
79    pub fn parse<R: PdfSource>(&self, source: R) -> Result<ParsedDocument> {
80        self.parse_with_options(source, ParseOptions::default())
81    }
82
83    /// Parses a seekable PDF source with optional password state.
84    ///
85    /// # Errors
86    ///
87    /// Returns [`crate::PdfvError`] when input cannot be read, exceeds a resource
88    /// limit, or is too malformed for recovery.
89    pub fn parse_with_options<R: PdfSource>(
90        &self,
91        mut source: R,
92        options: ParseOptions<'_>,
93    ) -> Result<ParsedDocument> {
94        let byte_len = source
95            .seek(SeekFrom::End(0))
96            .map_err(|source| crate::PdfvError::Io { path: None, source })?;
97        if byte_len > self.limits.max_file_bytes {
98            return Err(ParseError::LimitExceeded {
99                limit: "max_file_bytes",
100            }
101            .into());
102        }
103        source
104            .rewind()
105            .map_err(|source| crate::PdfvError::Io { path: None, source })?;
106
107        let storage = SourceStorage::from_source(
108            source,
109            byte_len,
110            self.limits.memory_source_threshold_bytes,
111        )?;
112
113        ByteParser::new(
114            storage,
115            self.limits.clone(),
116            self.decoder_registry.clone(),
117            options,
118        )
119        .parse_document()
120    }
121}
122
123impl Default for Parser {
124    fn default() -> Self {
125        Self::new(ResourceLimits::default())
126    }
127}
128
/// Returns the process-wide, lazily built default [`DecoderRegistry`].
fn default_decoder_registry() -> &'static DecoderRegistry {
    // OnceLock guarantees the registry is constructed at most once, even under
    // concurrent first calls.
    static REGISTRY: OnceLock<DecoderRegistry> = OnceLock::new();
    REGISTRY.get_or_init(DecoderRegistry::default)
}
133
/// Stream decoder extension point used by [`DecoderRegistry`].
pub trait StreamDecoder: fmt::Debug {
    /// Decodes one PDF stream filter under parser resource limits.
    ///
    /// `input` holds the encoded bytes for this filter stage, `params` the
    /// structured decode parameters, and `limits` the output-size budget.
    ///
    /// # Errors
    ///
    /// Returns [`ParseError`] when the encoded bytes are malformed or the
    /// decoded output exceeds configured limits.
    fn decode(
        &self,
        input: &[u8],
        params: &DecodeParams,
        limits: &ResourceLimits,
    ) -> std::result::Result<DecoderOutput, ParseError>;
}
149
/// Result of applying one stream decoder.
#[derive(Clone, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub struct DecoderOutput {
    /// Decoded or byte-preserved output bytes.
    pub bytes: Vec<u8>,
    /// Whether the decoder deliberately preserved encoded bytes in metadata
    /// mode (used for image filters such as `DCTDecode` that are registered to
    /// the metadata-mode decoder rather than decompressed).
    pub metadata_mode: bool,
}
159
/// Registry mapping PDF filter names to bounded decoders.
///
/// Backed by a `BTreeMap` so iteration order is deterministic; decoders are
/// shared trait objects, making the registry cheap to clone.
#[derive(Clone)]
pub struct DecoderRegistry {
    decoders: BTreeMap<PdfName, Arc<dyn StreamDecoder + Send + Sync>>,
}
165
/// Parser-owned storage for source bytes.
///
/// Sources at or below the configured memory threshold are kept in memory;
/// larger sources are spilled to a temporary file (see
/// [`SourceStorage::from_source`]).
#[derive(Clone, Debug)]
#[non_exhaustive]
pub enum SourceStorage {
    /// Source bytes are held in memory.
    Memory(Arc<[u8]>),
    /// Source bytes are held in a temporary spill file.
    SpillFile {
        /// Spill file handle.
        file: Arc<SpillFileHandle>,
        /// Total source byte length.
        len: usize,
        /// Temporary path retained for deterministic cleanup on drop.
        path: Arc<tempfile::TempPath>,
    },
}
182
183impl SourceStorage {
184    fn from_source<R: PdfSource>(
185        mut source: R,
186        byte_len: u64,
187        memory_threshold: u64,
188    ) -> Result<Self> {
189        if byte_len <= memory_threshold {
190            let capacity = usize::try_from(byte_len).map_err(|_| ParseError::LimitExceeded {
191                limit: "max_file_bytes",
192            })?;
193            let mut bytes = Vec::with_capacity(capacity);
194            source
195                .read_to_end(&mut bytes)
196                .map_err(|source| crate::PdfvError::Io { path: None, source })?;
197            return Ok(Self::Memory(Arc::from(bytes)));
198        }
199
200        let mut tempfile =
201            NamedTempFile::new().map_err(|source| crate::PdfvError::Io { path: None, source })?;
202        let copied = std::io::copy(&mut source, &mut tempfile)
203            .map_err(|source| crate::PdfvError::Io { path: None, source })?;
204        if copied != byte_len {
205            return Err(ParseError::Malformed {
206                message: bounded("source length changed while spilling"),
207            }
208            .into());
209        }
210        tempfile
211            .as_file_mut()
212            .flush()
213            .map_err(|source| crate::PdfvError::Io { path: None, source })?;
214        let file = tempfile
215            .reopen()
216            .map_err(|source| crate::PdfvError::Io { path: None, source })?;
217        Ok(Self::SpillFile {
218            file: Arc::new(file),
219            len: usize::try_from(byte_len).map_err(|_| ParseError::LimitExceeded {
220                limit: "max_file_bytes",
221            })?,
222            path: Arc::new(tempfile.into_temp_path()),
223        })
224    }
225
226    fn len(&self) -> usize {
227        match self {
228            Self::Memory(bytes) => bytes.len(),
229            Self::SpillFile { len, .. } => *len,
230        }
231    }
232
233    fn slice(&self, start: usize, end: usize) -> std::result::Result<Cow<'_, [u8]>, ParseError> {
234        if start > end || end > self.len() {
235            return Err(ParseError::Malformed {
236                message: bounded("byte range out of bounds"),
237            });
238        }
239        match self {
240            Self::Memory(bytes) => {
241                bytes
242                    .get(start..end)
243                    .map(Cow::Borrowed)
244                    .ok_or(ParseError::Malformed {
245                        message: bounded("byte range out of bounds"),
246                    })
247            }
248            Self::SpillFile { file, .. } => {
249                let mut buffer = vec![0_u8; end.saturating_sub(start)];
250                read_exact_at(file, &mut buffer, start)?;
251                Ok(Cow::Owned(buffer))
252            }
253        }
254    }
255
256    fn byte(&self, pos: usize) -> Option<u8> {
257        if pos >= self.len() {
258            return None;
259        }
260        match self {
261            Self::Memory(bytes) => bytes.get(pos).copied(),
262            Self::SpillFile { file, .. } => {
263                let mut byte = [0_u8; 1];
264                read_exact_at(file, &mut byte, pos).ok()?;
265                Some(byte[0])
266            }
267        }
268    }
269
270    fn starts_with(&self, pos: usize, expected: &[u8]) -> bool {
271        let Some(end) = pos.checked_add(expected.len()) else {
272            return false;
273        };
274        self.slice(pos, end)
275            .is_ok_and(|bytes| bytes.as_ref() == expected)
276    }
277
278    fn find_bytes(&self, needle: &[u8], start: usize, end: usize) -> Option<usize> {
279        if needle.is_empty() || start > end || end > self.len() {
280            return None;
281        }
282        match self {
283            Self::Memory(_) => {
284                let bytes = self.slice(start, end).ok()?;
285                find_bytes(bytes.as_ref(), needle, 0)
286                    .and_then(|relative| start.checked_add(relative))
287            }
288            Self::SpillFile { file, .. } => find_bytes_in_spill_file(file, needle, start, end),
289        }
290    }
291
292    fn stream_source(
293        &self,
294        start: usize,
295        end: usize,
296    ) -> std::result::Result<(Arc<[u8]>, StreamRange), ParseError> {
297        match self {
298            Self::Memory(bytes) => Ok((
299                Arc::clone(bytes),
300                StreamRange {
301                    start: u64::try_from(start).map_err(|_| ParseError::ArithmeticOverflow {
302                        context: "stream start",
303                    })?,
304                    end: u64::try_from(end).map_err(|_| ParseError::ArithmeticOverflow {
305                        context: "stream end",
306                    })?,
307                },
308            )),
309            Self::SpillFile { .. } => {
310                let bytes = self.slice(start, end)?.into_owned();
311                let end =
312                    u64::try_from(bytes.len()).map_err(|_| ParseError::ArithmeticOverflow {
313                        context: "stream end",
314                    })?;
315                Ok((Arc::from(bytes), StreamRange { start: 0, end }))
316            }
317        }
318    }
319}
320
321fn read_exact_at(
322    file: &SpillFileHandle,
323    buffer: &mut [u8],
324    offset: usize,
325) -> std::result::Result<(), ParseError> {
326    let mut file = file.try_clone().map_err(|_| ParseError::Malformed {
327        message: bounded("failed to clone spill file handle"),
328    })?;
329    file.seek(SeekFrom::Start(u64::try_from(offset).map_err(|_| {
330        ParseError::ArithmeticOverflow {
331            context: "spill file offset",
332        }
333    })?))
334    .map_err(|_| ParseError::Malformed {
335        message: bounded("failed to seek spill file"),
336    })?;
337    file.read_exact(buffer).map_err(|_| ParseError::Malformed {
338        message: bounded("failed to read spill file"),
339    })
340}
341
/// Searches for `needle` within `start..end` of a spill file using fixed-size
/// chunks, carrying `needle.len() - 1` trailing bytes between chunks so that
/// matches spanning a chunk boundary are still found.
fn find_bytes_in_spill_file(
    file: &SpillFileHandle,
    needle: &[u8],
    start: usize,
    end: usize,
) -> Option<usize> {
    // The carried overlap is shorter than the needle, so a full match can
    // never sit entirely inside it — no hit is ever reported twice.
    let overlap = needle.len().saturating_sub(1);
    let mut pos = start;
    let mut carried = Vec::new();
    while pos < end {
        let read_len = end.saturating_sub(pos).min(SPILL_SEARCH_CHUNK_BYTES);
        let mut chunk = vec![0_u8; read_len];
        read_exact_at(file, &mut chunk, pos).ok()?;
        // `carried[0]` corresponds to absolute offset `pos - carried.len()`,
        // so a relative match index maps back via `search_base + relative`.
        let search_base = pos.saturating_sub(carried.len());
        let carried_len = carried.len();
        carried.extend_from_slice(&chunk);
        if let Some(relative) = find_bytes(&carried, needle, 0) {
            return search_base.checked_add(relative);
        }
        // Keep only the trailing overlap for the next iteration.
        if carried.len() > overlap {
            let keep_start = carried.len().saturating_sub(overlap);
            carried = carried.get(keep_start..)?.to_vec();
        }
        pos = pos.checked_add(read_len)?;
        // Defensive stop: bail out if no forward progress was made.
        if read_len == 0 || carried_len == carried.len() && chunk.is_empty() {
            break;
        }
    }
    None
}
372
373impl DecoderRegistry {
374    /// Creates the default pure-Rust decoder registry.
375    #[must_use]
376    pub fn new() -> Self {
377        let mut registry = Self {
378            decoders: BTreeMap::new(),
379        };
380        let flate: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(FlateDecoder);
381        let ascii_hex: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(AsciiHexDecoder);
382        let ascii85: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(Ascii85Decoder);
383        let run_length: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(RunLengthDecoder);
384        let lzw: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(LzwDecoder);
385        let crypt: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(CryptDecoder);
386        let metadata_mode: Arc<dyn StreamDecoder + Send + Sync> = Arc::new(MetadataModeDecoder);
387        registry.register_many(["FlateDecode", "Fl"], &flate);
388        registry.register_many(["ASCIIHexDecode", "AHx"], &ascii_hex);
389        registry.register_many(["ASCII85Decode", "A85"], &ascii85);
390        registry.register_many(["RunLengthDecode", "RL"], &run_length);
391        registry.register_many(["LZWDecode", "LZW"], &lzw);
392        registry.register_many(["Crypt"], &crypt);
393        registry.register_many(
394            ["DCTDecode", "JPXDecode", "JBIG2Decode", "CCITTFaxDecode"],
395            &metadata_mode,
396        );
397        registry
398    }
399
400    /// Registers a decoder for a PDF filter name.
401    pub fn register(&mut self, name: PdfName, decoder: &Arc<dyn StreamDecoder + Send + Sync>) {
402        self.decoders.insert(name, Arc::clone(decoder));
403    }
404
405    fn register_many<const N: usize>(
406        &mut self,
407        names: [&'static str; N],
408        decoder: &Arc<dyn StreamDecoder + Send + Sync>,
409    ) {
410        for name in names {
411            self.register(PdfName::from_static(name), decoder);
412        }
413    }
414
415    fn decoder(&self, name: &PdfName) -> Option<&Arc<dyn StreamDecoder + Send + Sync>> {
416        self.decoders.get(name)
417    }
418}
419
420impl Default for DecoderRegistry {
421    fn default() -> Self {
422        Self::new()
423    }
424}
425
426impl fmt::Debug for DecoderRegistry {
427    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
428        formatter
429            .debug_struct("DecoderRegistry")
430            .field("decoder_count", &self.decoders.len())
431            .finish()
432    }
433}
434
/// Structured parameters for one stream filter.
///
/// The `Default` values mirror the PDF `/DecodeParms` defaults
/// (`Predictor 1`, `Colors 1`, `BitsPerComponent 8`, `Columns 1`,
/// `EarlyChange 1`).
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct DecodeParams {
    /// Predictor algorithm, where 1 means no predictor.
    pub predictor: u16,
    /// Number of colour components for predictor decoding.
    pub colors: u16,
    /// Bits per component for predictor decoding.
    pub bits_per_component: u16,
    /// Predictor column count.
    pub columns: u32,
    /// LZW early-change mode.
    pub early_change: u8,
    /// Named crypt filter, when `/Filter /Crypt` carries one.
    pub crypt_filter_name: Option<PdfName>,
}

impl Default for DecodeParams {
    // Spec-default parameters; used whenever a filter has no explicit
    // `/DecodeParms` entry.
    fn default() -> Self {
        Self {
            predictor: 1,
            colors: 1,
            bits_per_component: 8,
            columns: 1,
            early_change: 1,
            crypt_filter_name: None,
        }
    }
}
466
/// Parser options for password-capable parsing.
///
/// `Default` carries no password; encrypted documents are then returned
/// without decryption.
#[derive(Clone, Copy, Debug, Default)]
#[non_exhaustive]
pub struct ParseOptions<'a> {
    /// Optional redacted password for Standard security handler decryption.
    pub password: Option<&'a PasswordSecret>,
}
474
/// Parsed PDF document produced by [`Parser`].
#[derive(Clone, Debug, Deserialize, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct ParsedDocument {
    /// Parsed PDF header version.
    pub version: PdfVersion,
    /// Catalog object referenced by the latest trailer, when available.
    pub catalog: Option<ObjectKey>,
    /// Indirect object store.
    pub objects: ObjectStore,
    /// Parsed trailers.
    pub trailers: Vec<Trailer>,
    /// Parser facts retained for validation.
    pub parse_facts: Vec<ParseFact>,
    /// Recoverable parser warnings.
    pub warnings: Vec<ValidationWarning>,
}
493
494impl ParsedDocument {
495    /// Returns true when the document trailer declares encryption.
496    #[must_use]
497    pub fn is_encrypted(&self) -> bool {
498        self.parse_facts.iter().any(|fact| {
499            matches!(
500                fact,
501                ParseFact::Encryption {
502                    encrypted: true,
503                    decrypted: false,
504                    handler: _,
505                    ..
506                }
507            )
508        })
509    }
510}
511
512/// Indirect object storage keyed by object number and generation.
513#[derive(Clone, Debug, Default, Deserialize, Serialize)]
514#[serde(transparent)]
515pub struct ObjectStore(BTreeMap<ObjectKey, IndirectObject>);
516
517impl ObjectStore {
518    /// Inserts an indirect object.
519    pub fn insert(&mut self, object: IndirectObject) {
520        self.0.insert(object.key, object);
521    }
522
523    /// Returns an indirect object by key.
524    #[must_use]
525    pub fn get(&self, key: &ObjectKey) -> Option<&IndirectObject> {
526        self.0.get(key)
527    }
528
529    /// Iterates over indirect objects in key order.
530    pub fn values(&self) -> impl Iterator<Item = &IndirectObject> {
531        self.0.values()
532    }
533
534    pub(crate) fn values_mut(&mut self) -> impl Iterator<Item = &mut IndirectObject> {
535        self.0.values_mut()
536    }
537
538    /// Returns the number of stored objects.
539    #[must_use]
540    pub fn len(&self) -> usize {
541        self.0.len()
542    }
543
544    /// Returns true when no objects are stored.
545    #[must_use]
546    pub fn is_empty(&self) -> bool {
547        self.0.is_empty()
548    }
549}
550
/// Parsed indirect object.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct IndirectObject {
    /// Object key.
    pub key: ObjectKey,
    /// Byte offset where the indirect object starts.
    pub offset: u64,
    /// Materialized COS object.
    pub object: CosObject,
}

/// Parsed trailer dictionary.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct Trailer {
    /// Trailer dictionary.
    pub dictionary: Dictionary,
    /// Byte offset where `trailer` was parsed.
    pub offset: u64,
}
574
575/// PDF COS object.
576#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
577#[non_exhaustive]
578#[serde(rename_all = "camelCase", tag = "type", content = "value")]
579pub enum CosObject {
580    /// Null object.
581    Null,
582    /// Boolean object.
583    Boolean(bool),
584    /// Integer number.
585    Integer(i64),
586    /// Real number, stored as finite `f64`.
587    Real(f64),
588    /// Name object.
589    Name(PdfName),
590    /// String object.
591    String(PdfString),
592    /// Array object.
593    Array(Vec<CosObject>),
594    /// Dictionary object.
595    Dictionary(Dictionary),
596    /// Stream object.
597    Stream(StreamObject),
598    /// Indirect reference.
599    Reference(ObjectKey),
600}
601
602impl CosObject {
603    /// Returns this object as a dictionary when it has that shape.
604    #[must_use]
605    pub fn as_dictionary(&self) -> Option<&Dictionary> {
606        match self {
607            Self::Dictionary(dictionary) => Some(dictionary),
608            Self::Stream(stream) => Some(&stream.dictionary),
609            _ => None,
610        }
611    }
612}
613
/// PDF dictionary.
#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)]
#[serde(transparent)]
pub struct Dictionary(BTreeMap<PdfName, CosObject>);

impl Dictionary {
    /// Inserts a key/value pair, replacing any existing value for `key`.
    pub fn insert(&mut self, key: PdfName, value: CosObject) {
        self.0.insert(key, value);
    }

    /// Returns a dictionary value by PDF name.
    ///
    /// NOTE(review): each lookup allocates a temporary `PdfName` from `key`;
    /// fine for occasional access, worth revisiting if this shows up hot.
    #[must_use]
    pub fn get(&self, key: &str) -> Option<&CosObject> {
        self.0.get(&PdfName::from_static(key))
    }

    /// Iterates over dictionary entries in key order.
    pub fn iter(&self) -> impl Iterator<Item = (&PdfName, &CosObject)> {
        self.0.iter()
    }

    // Mutable access to values only; keys (and therefore map order) stay
    // fixed.
    pub(crate) fn values_mut(&mut self) -> impl Iterator<Item = &mut CosObject> {
        self.0.values_mut()
    }

    /// Returns the number of entries.
    #[must_use]
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// Returns true when the dictionary is empty.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}
652
/// PDF name bytes after hash escape decoding.
#[derive(Clone, Debug, Deserialize, Eq, Hash, Ord, PartialEq, PartialOrd, Serialize)]
#[serde(try_from = "String", into = "String")]
pub struct PdfName(Vec<u8>);

impl PdfName {
    /// Creates a name from already validated bytes.
    ///
    /// # Errors
    ///
    /// Returns [`ParseError`] when the byte limit is exceeded.
    pub fn new(bytes: Vec<u8>, limits: &ResourceLimits) -> std::result::Result<Self, ParseError> {
        if bytes.len() > limits.max_name_bytes {
            return Err(ParseError::LimitExceeded {
                limit: "max_name_bytes",
            });
        }
        Ok(Self(bytes))
    }

    /// Returns the raw name bytes.
    #[must_use]
    pub fn as_bytes(&self) -> &[u8] {
        &self.0
    }

    /// Returns true when the decoded bytes match an ASCII name.
    #[must_use]
    pub fn matches(&self, value: &str) -> bool {
        self.0.as_slice() == value.as_bytes()
    }

    // Internal constructor for literal names; skips the max_name_bytes check,
    // which is acceptable because callers pass short static strings.
    fn from_static(value: &str) -> Self {
        Self(value.as_bytes().to_vec())
    }
}

impl TryFrom<String> for PdfName {
    type Error = ConfigError;

    // NOTE(review): this serde entry point performs no max_name_bytes check,
    // so deserialized names bypass the limit enforced by `PdfName::new` —
    // confirm this is intended for trusted (already-validated) input.
    fn try_from(value: String) -> std::result::Result<Self, Self::Error> {
        Ok(Self(value.into_bytes()))
    }
}

impl From<PdfName> for String {
    /// Lossy UTF-8 rendering of the name bytes (the serde serialization form).
    fn from(value: PdfName) -> Self {
        String::from_utf8_lossy(&value.0).into_owned()
    }
}
703
/// PDF string bytes.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[serde(try_from = "Vec<u8>", into = "Vec<u8>")]
pub struct PdfString(Vec<u8>);

impl PdfString {
    /// Creates a bounded PDF string.
    ///
    /// # Errors
    ///
    /// Returns [`ParseError`] when the byte limit is exceeded.
    pub fn new(bytes: Vec<u8>, limits: &ResourceLimits) -> std::result::Result<Self, ParseError> {
        if bytes.len() > limits.max_string_bytes {
            return Err(ParseError::LimitExceeded {
                limit: "max_string_bytes",
            });
        }
        Ok(Self(bytes))
    }

    /// Returns raw string bytes.
    #[must_use]
    pub fn as_bytes(&self) -> &[u8] {
        &self.0
    }
}

impl TryFrom<Vec<u8>> for PdfString {
    type Error = ConfigError;

    // NOTE(review): this serde entry point performs no max_string_bytes check,
    // so deserialized strings bypass the limit enforced by `PdfString::new` —
    // confirm this is intended for trusted input.
    fn try_from(value: Vec<u8>) -> std::result::Result<Self, Self::Error> {
        Ok(Self(value))
    }
}

impl From<PdfString> for Vec<u8> {
    /// Hands back the owned string bytes (the serde serialization form).
    fn from(value: PdfString) -> Self {
        value.0
    }
}
744
/// Parsed stream object with raw byte range metadata.
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct StreamObject {
    /// Stream dictionary.
    pub dictionary: Dictionary,
    /// Range of raw stream bytes within the input; indexes into
    /// `raw_source` (see `raw_bytes`).
    pub raw_range: StreamRange,
    /// Declared stream length.
    pub declared_length: Option<u64>,
    /// Length discovered by scanning to `endstream`.
    pub discovered_length: u64,
    /// Stream filters as name objects.
    pub filters: Vec<PdfName>,
    /// Structured decode parameters aligned with `filters`.
    #[serde(default)]
    pub decode_params: Vec<DecodeParams>,
    /// Shared source bytes used for lazy stream decoding. Not serialized;
    /// deserialization restores an empty buffer via `empty_source`.
    #[serde(skip, default = "empty_source")]
    pub raw_source: Arc<[u8]>,
    /// Whether the `stream` keyword is followed by CRLF.
    pub stream_keyword_crlf_compliant: bool,
    /// Whether `endstream` is preceded by an EOL marker.
    pub endstream_keyword_eol_compliant: bool,
}
771
772impl StreamObject {
773    pub(crate) fn remove_crypt_filters(&mut self) {
774        let mut next_filters = Vec::with_capacity(self.filters.len());
775        let mut next_params = Vec::with_capacity(self.decode_params.len());
776        for (index, filter) in self.filters.iter().enumerate() {
777            if filter.matches("Crypt") {
778                continue;
779            }
780            next_filters.push(filter.clone());
781            next_params.push(self.decode_params.get(index).cloned().unwrap_or_default());
782        }
783        self.filters = next_filters;
784        self.decode_params = next_params;
785    }
786
787    pub(crate) fn raw_bytes(&self) -> std::result::Result<&[u8], ParseError> {
788        let raw_start =
789            usize::try_from(self.raw_range.start).map_err(|_| ParseError::ArithmeticOverflow {
790                context: "stream raw range",
791            })?;
792        let raw_end =
793            usize::try_from(self.raw_range.end).map_err(|_| ParseError::ArithmeticOverflow {
794                context: "stream raw range",
795            })?;
796        self.raw_source
797            .get(raw_start..raw_end)
798            .ok_or(ParseError::Malformed {
799                message: bounded("stream raw range out of bounds"),
800            })
801    }
802
803    /// Returns decoded stream bytes, enforcing `max_stream_decode_bytes`.
804    ///
805    /// # Errors
806    ///
807    /// Returns [`ParseError`] when a filter is unsupported, decompression fails,
808    /// or decoded output exceeds the configured limit.
809    pub fn decoded_bytes(
810        &self,
811        limits: &ResourceLimits,
812    ) -> std::result::Result<Vec<u8>, ParseError> {
813        self.decoded_bytes_with_registry(limits, default_decoder_registry())
814            .map(|decoded| decoded.bytes)
815    }
816
817    fn decoded_bytes_with_registry(
818        &self,
819        limits: &ResourceLimits,
820        registry: &DecoderRegistry,
821    ) -> std::result::Result<DecodedStream, ParseError> {
822        let mut current = self.raw_bytes()?.to_vec();
823        let mut facts = Vec::new();
824        for (index, filter) in self.filters.iter().enumerate() {
825            let params = self.decode_params.get(index).cloned().unwrap_or_default();
826            let decoder = registry
827                .decoder(filter)
828                .ok_or(ParseError::UnsupportedFilter {
829                    filter: BoundedText::unchecked(String::from_utf8_lossy(filter.as_bytes())),
830                })?;
831            let input_len = checked_u64_len(current.len(), "stream filter input length")?;
832            let output = decoder.decode(&current, &params, limits)?;
833            let output_len = checked_u64_len(output.bytes.len(), "stream filter output length")?;
834            enforce_decoded_len(output_len, limits.max_stream_decode_bytes)?;
835            let filter = filter_identifier(filter)?;
836            facts.push(if output.metadata_mode {
837                StreamFact::FilterMetadataMode {
838                    filter,
839                    bytes: output_len,
840                }
841            } else {
842                StreamFact::FilterDecoded {
843                    filter,
844                    input_bytes: input_len,
845                    output_bytes: output_len,
846                }
847            });
848            current = output.bytes;
849        }
850        let decoded_len = checked_u64_len(current.len(), "decoded stream length")?;
851        enforce_decoded_len(decoded_len, limits.max_stream_decode_bytes)?;
852        Ok(DecodedStream {
853            bytes: current,
854            facts,
855        })
856    }
857}
858
/// Decoded stream payload plus the per-filter facts gathered while decoding.
#[derive(Clone, Debug, Eq, PartialEq)]
struct DecodedStream {
    // Fully decoded payload after every filter stage has been applied.
    bytes: Vec<u8>,
    // One fact per applied filter (decoded or metadata-mode).
    facts: Vec<StreamFact>,
}
864
/// Shared empty buffer used as the serde default for
/// [`StreamObject::raw_source`].
fn empty_source() -> Arc<[u8]> {
    Arc::from(Vec::new())
}
868
/// Raw stream byte range in the source file.
///
/// Half-open range `start..end`.
#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct StreamRange {
    /// Inclusive start offset.
    pub start: u64,
    /// Exclusive end offset.
    pub end: u64,
}
879
/// Stateful byte-level parser walking a [`SourceStorage`].
struct ByteParser<'a> {
    // Backing bytes (in memory or spill file).
    source: SourceStorage,
    // Resource ceilings enforced during parsing.
    limits: ResourceLimits,
    // Custom decoders; `None` presumably falls back to the default registry —
    // confirm where streams are decoded.
    decoder_registry: Option<DecoderRegistry>,
    // Password state for encrypted documents.
    options: ParseOptions<'a>,
    // Current byte offset within `source`.
    pos: usize,
    // Facts accumulated for the validator.
    parse_facts: Vec<ParseFact>,
    // Recoverable warnings accumulated during parsing.
    warnings: Vec<ValidationWarning>,
}

/// Numeric token scanned from the byte stream.
#[derive(Clone, Copy, Debug)]
enum NumberToken {
    // Integer literal.
    Integer(i64),
    // Real (floating-point) literal.
    Real(f64),
}

/// Summary of a parsed cross-reference stream.
#[derive(Clone, Copy, Debug)]
struct XrefStreamSummary {
    // Size of the decoded xref stream payload, in bytes.
    decoded_bytes: usize,
    // Total number of entries described by the stream.
    entries: u64,
    // Entries referring to compressed objects — presumably those stored in
    // object streams; confirm in the xref-stream parser.
    compressed_entries: u64,
}
902
903impl<'a> ByteParser<'a> {
    /// Creates a byte parser positioned at the start of `source`.
    fn new(
        source: SourceStorage,
        limits: ResourceLimits,
        decoder_registry: Option<DecoderRegistry>,
        options: ParseOptions<'a>,
    ) -> Self {
        Self {
            source,
            limits,
            decoder_registry,
            options,
            // Parsing starts at byte offset zero; facts and warnings accumulate
            // as the document is walked.
            pos: 0,
            parse_facts: Vec::new(),
            warnings: Vec::new(),
        }
    }
920
    /// Drives the full parse: header, linear object scan, optional
    /// decryption handling, and materialization of stream-backed structures.
    ///
    /// Consumes the parser and returns the parsed document together with all
    /// facts and warnings gathered along the way. Encrypted documents are
    /// returned early (still encrypted) unless the `decrypt` feature is
    /// enabled and a password successfully decrypts them.
    fn parse_document(mut self) -> Result<ParsedDocument> {
        let (header_offset, version) = self.parse_header()?;
        self.push_fact(ParseFact::Header {
            offset: header_offset,
            version,
            // Bytes before `%PDF-` are tolerated but recorded as a fact.
            had_leading_bytes: header_offset != 0,
        });

        let mut objects = ObjectStore::default();
        let mut trailers = Vec::new();
        self.parse_top_level_objects(&mut objects, &mut trailers)?;

        // The last trailer in file order wins for the /Root lookup.
        let encrypted_catalog = trailers
            .iter()
            .rev()
            .find_map(|trailer| object_ref_from_dictionary(&trailer.dictionary, "Root"));

        if encryption_reference(&trailers).is_some() {
            let fact = encryption_fact(&objects, &trailers);
            // A classification failure that is not an "encrypted" status is
            // a hard parse error.
            #[cfg(feature = "decrypt")]
            if let Err(error) = encryption::classify_encryption(&objects, &trailers, &self.limits)
                && !error.is_encrypted_status()
            {
                return Err(error.into_parse_error().into());
            }
            #[cfg(feature = "decrypt")]
            if let Some(password) = self.options.password {
                match encryption::decrypt_document(
                    &mut objects,
                    &mut trailers,
                    &self.limits,
                    password,
                ) {
                    Ok(summary) => {
                        self.push_fact(summary.into_fact(true));
                    }
                    // Wrong/insufficient password: surface a warning and
                    // return the still-encrypted document early.
                    Err(error) if error.is_encrypted_status() => {
                        self.warnings.push(ValidationWarning::General {
                            message: BoundedText::unchecked(error.safe_message()),
                        });
                        self.push_fact(error.into_fact(fact));
                        return Ok(ParsedDocument {
                            version,
                            catalog: encrypted_catalog,
                            objects,
                            trailers,
                            parse_facts: self.parse_facts,
                            warnings: self.warnings,
                        });
                    }
                    Err(error) => return Err(error.into_parse_error().into()),
                }
            } else if self.options.password.is_none() {
                // No password supplied: record the encryption fact and stop.
                self.push_fact(fact);
                return Ok(ParsedDocument {
                    version,
                    catalog: encrypted_catalog,
                    objects,
                    trailers,
                    parse_facts: self.parse_facts,
                    warnings: self.warnings,
                });
            }

            // Without the `decrypt` feature an encrypted document is always
            // returned as-is with the encryption fact attached.
            #[cfg(not(feature = "decrypt"))]
            {
                self.push_fact(fact);
                return Ok(ParsedDocument {
                    version,
                    catalog: encrypted_catalog,
                    objects,
                    trailers,
                    parse_facts: self.parse_facts,
                    warnings: self.warnings,
                });
            }
        }

        // Unencrypted (or successfully decrypted) path: expand xref streams
        // and object streams, then resolve the catalog.
        self.materialize_stream_backed_structures(&mut objects, &mut trailers)?;
        let catalog = trailers
            .iter()
            .rev()
            .find_map(|trailer| object_ref_from_dictionary(&trailer.dictionary, "Root"));

        Ok(ParsedDocument {
            version,
            catalog,
            objects,
            trailers,
            parse_facts: self.parse_facts,
            warnings: self.warnings,
        })
    }
1014
    /// Scans the file linearly, collecting indirect objects, xref sections,
    /// and trailer dictionaries until `%%EOF` (or end of input) is reached.
    ///
    /// Recovery strategy: when no indirect object can be parsed at the
    /// current position, the cursor advances one byte and the scan retries
    /// instead of failing.
    fn parse_top_level_objects(
        &mut self,
        objects: &mut ObjectStore,
        trailers: &mut Vec<Trailer>,
    ) -> Result<()> {
        while self.pos < self.source.len() {
            self.skip_ws_and_comments();
            if self.starts_with(EOF_MARKER) {
                self.parse_post_eof_fact()?;
                return Ok(());
            }
            // `startxref` lines are skipped; their offsets are not followed
            // by this linear scan.
            if self.starts_with(b"startxref") {
                self.skip_line();
                continue;
            }
            if self.starts_with(b"xref") {
                self.parse_xref_and_trailer(trailers)?;
                continue;
            }
            if self.starts_with(b"trailer") {
                self.consume_bytes(b"trailer")?;
                self.skip_ws_and_comments();
                let offset = self.offset()?;
                let dictionary = self.parse_dictionary(0)?;
                self.push_xref_chain_facts(None, offset, &dictionary)?;
                trailers.push(Trailer { dictionary, offset });
                continue;
            }

            let before = self.pos;
            match self.parse_indirect_object()? {
                Some(object) => {
                    // Overflow-checked count keeps the limit guard exact.
                    let object_count = u64::try_from(objects.len())
                        .map_err(|_| ParseError::ArithmeticOverflow {
                            context: "object count",
                        })?
                        .checked_add(1)
                        .ok_or(ParseError::ArithmeticOverflow {
                            context: "object count",
                        })?;
                    if object_count > self.limits.max_objects {
                        return Err(ParseError::LimitExceeded {
                            limit: "max_objects",
                        }
                        .into());
                    }
                    objects.insert(object);
                }
                None => {
                    // Tolerant resync: step forward one byte and try again.
                    self.pos = before.saturating_add(1);
                }
            }
        }
        Ok(())
    }
1070
1071    fn materialize_stream_backed_structures(
1072        &mut self,
1073        objects: &mut ObjectStore,
1074        trailers: &mut Vec<Trailer>,
1075    ) -> std::result::Result<(), ParseError> {
1076        let streams = objects
1077            .values()
1078            .filter_map(|object| match &object.object {
1079                CosObject::Stream(stream) => Some((object.key, object.offset, stream.clone())),
1080                _ => None,
1081            })
1082            .collect::<Vec<_>>();
1083        let mut expanded_objects = Vec::new();
1084        for (key, offset, stream) in streams {
1085            if matches!(stream.dictionary.get("Type"), Some(CosObject::Name(name)) if name.matches("XRef"))
1086            {
1087                let summary = self.parse_xref_stream(key, &stream)?;
1088                let decoded_len = u64::try_from(summary.decoded_bytes).map_err(|_| {
1089                    ParseError::ArithmeticOverflow {
1090                        context: "decoded xref stream length",
1091                    }
1092                })?;
1093                self.push_fact(ParseFact::Stream {
1094                    object: key,
1095                    fact: StreamFact::Decoded { bytes: decoded_len },
1096                });
1097                trailers.push(Trailer {
1098                    dictionary: stream.dictionary.clone(),
1099                    offset,
1100                });
1101                self.push_xref_chain_facts(Some(key), offset, &stream.dictionary)?;
1102                self.push_fact(ParseFact::Xref {
1103                    section: ObjectLocation {
1104                        object: Some(key),
1105                        offset: Some(offset),
1106                        path: None,
1107                    },
1108                    fact: XrefFact::XrefStreamParsed {
1109                        entries: summary.entries,
1110                        compressed_entries: summary.compressed_entries,
1111                    },
1112                });
1113            }
1114            if matches!(stream.dictionary.get("Type"), Some(CosObject::Name(name)) if name.matches("ObjStm"))
1115            {
1116                let decoded = self.decode_stream(key, &stream)?;
1117                let decoded_len = checked_u64_len(decoded.bytes.len(), "decoded stream length")?;
1118                self.push_fact(ParseFact::Stream {
1119                    object: key,
1120                    fact: StreamFact::Decoded { bytes: decoded_len },
1121                });
1122                let mut parsed_objects = self.parse_object_stream(key, &stream, &decoded.bytes)?;
1123                expanded_objects.append(&mut parsed_objects);
1124                self.push_fact(ParseFact::Xref {
1125                    section: ObjectLocation {
1126                        object: Some(key),
1127                        offset: Some(offset),
1128                        path: None,
1129                    },
1130                    fact: XrefFact::ObjectStreamParsed,
1131                });
1132            }
1133        }
1134        for object in expanded_objects {
1135            if objects.get(&object.key).is_none() {
1136                let next_count =
1137                    u64::try_from(objects.len()).map_err(|_| ParseError::ArithmeticOverflow {
1138                        context: "object count",
1139                    })? + 1;
1140                if next_count > self.limits.max_objects {
1141                    return Err(ParseError::LimitExceeded {
1142                        limit: "max_objects",
1143                    });
1144                }
1145                objects.insert(object);
1146            }
1147        }
1148        Ok(())
1149    }
1150
    /// Validates and walks a cross-reference stream, counting its entries.
    ///
    /// Checks `/Size`, `/W`, and `/Index` against the configured limits,
    /// decodes the payload, and iterates every entry to count type-2
    /// (compressed-object) references. The second and third entry fields
    /// are read for cursor advancement but not interpreted here.
    ///
    /// # Errors
    ///
    /// Returns [`ParseError`] when limits are exceeded, the widths are
    /// inconsistent, or the decoded payload is shorter than declared.
    fn parse_xref_stream(
        &mut self,
        stream_key: ObjectKey,
        stream: &StreamObject,
    ) -> std::result::Result<XrefStreamSummary, ParseError> {
        let size = non_negative_u64_from_dictionary(&stream.dictionary, "Size")?;
        if size > self.limits.max_objects {
            return Err(ParseError::LimitExceeded {
                limit: "max_objects",
            });
        }
        let widths = xref_widths(&stream.dictionary)?;
        let indexes = xref_indexes(&stream.dictionary, size)?;
        // Bytes per entry = sum of the three /W field widths.
        let entry_width = widths
            .iter()
            .try_fold(0_usize, |sum, width| sum.checked_add(*width))
            .ok_or(ParseError::ArithmeticOverflow {
                context: "xref stream entry width",
            })?;
        if entry_width == 0 {
            return Err(ParseError::Malformed {
                message: bounded("xref stream entry width must be non-zero"),
            });
        }
        let decoded = self.decode_stream(stream_key, stream)?.bytes;
        let total_entries = indexes
            .iter()
            .try_fold(0_u64, |sum, (_, count)| sum.checked_add(*count))
            .ok_or(ParseError::ArithmeticOverflow {
                context: "xref stream entries",
            })?;
        if total_entries > self.limits.max_objects {
            return Err(ParseError::LimitExceeded {
                limit: "max_objects",
            });
        }
        // Up-front size check so the per-entry reads below cannot run past
        // the decoded buffer.
        let required_bytes = usize::try_from(total_entries)
            .ok()
            .and_then(|entries| entries.checked_mul(entry_width))
            .ok_or(ParseError::ArithmeticOverflow {
                context: "xref stream bytes",
            })?;
        if decoded.len() < required_bytes {
            return Err(ParseError::Malformed {
                message: bounded("xref stream data shorter than declared entries"),
            });
        }

        let mut pos = 0_usize;
        let mut compressed_entries = 0_u64;
        for (_first_object, count) in indexes {
            for _ in 0..count {
                // A zero-width type field defaults the entry type to 1
                // (in-use), matching the xref-stream defaulting rule.
                let entry_type = if widths[0] == 0 {
                    1
                } else {
                    read_be_uint(&decoded, &mut pos, widths[0])?
                };
                let _field_two = read_be_uint(&decoded, &mut pos, widths[1])?;
                let _field_three = read_be_uint(&decoded, &mut pos, widths[2])?;
                if entry_type == 2 {
                    compressed_entries = compressed_entries.checked_add(1).ok_or(
                        ParseError::ArithmeticOverflow {
                            context: "compressed xref entries",
                        },
                    )?;
                }
            }
        }
        Ok(XrefStreamSummary {
            decoded_bytes: decoded.len(),
            entries: total_entries,
            compressed_entries,
        })
    }
1225
1226    fn decode_stream(
1227        &mut self,
1228        key: ObjectKey,
1229        stream: &StreamObject,
1230    ) -> std::result::Result<DecodedStream, ParseError> {
1231        let decoded = if let Some(registry) = &self.decoder_registry {
1232            stream.decoded_bytes_with_registry(&self.limits, registry)?
1233        } else {
1234            stream.decoded_bytes_with_registry(&self.limits, default_decoder_registry())?
1235        };
1236        for fact in &decoded.facts {
1237            self.push_fact(ParseFact::Stream {
1238                object: key,
1239                fact: fact.clone(),
1240            });
1241        }
1242        Ok(decoded)
1243    }
1244
    /// Parses the objects packed inside a decoded `/Type /ObjStm` payload.
    ///
    /// Reads the `N` header pairs (object number, relative offset) from the
    /// start of `decoded`, then parses each object at `First + offset`.
    /// Extracted objects always carry generation 0; their file offsets are
    /// derived from the containing stream's raw range.
    ///
    /// # Errors
    ///
    /// Returns [`ParseError`] when header counts/offsets are inconsistent
    /// with the decoded payload, limits are exceeded, or the stream claims
    /// to contain itself.
    fn parse_object_stream(
        &self,
        stream_key: ObjectKey,
        stream: &StreamObject,
        decoded: &[u8],
    ) -> std::result::Result<Vec<IndirectObject>, ParseError> {
        let count_u64 = non_negative_u64_from_dictionary(&stream.dictionary, "N")?;
        if count_u64 > self.limits.max_objects {
            return Err(ParseError::LimitExceeded {
                limit: "max_objects",
            });
        }
        let first = non_negative_usize_from_dictionary(&stream.dictionary, "First")?;
        if first > decoded.len() {
            return Err(ParseError::Malformed {
                message: bounded("object stream first offset exceeds decoded bytes"),
            });
        }
        let count = usize::try_from(count_u64).map_err(|_| ParseError::LimitExceeded {
            limit: "max_objects",
        })?;
        // Sanity bound: each header pair needs at least ~4 bytes, so a count
        // larger than First/4 cannot fit before the first object.
        if count > 0 && count > first / 4 {
            return Err(ParseError::Malformed {
                message: bounded("object stream header too short for object count"),
            });
        }
        // Re-use the byte parser on the decoded payload; no decoder registry
        // or password is needed for the embedded objects.
        let mut parser = ByteParser::new(
            SourceStorage::Memory(Arc::from(decoded.to_vec())),
            self.limits.clone(),
            None,
            ParseOptions::default(),
        );
        let mut headers = Vec::with_capacity(count);
        for _ in 0..count {
            let Some(number) = parser.parse_unsigned_u32()? else {
                return Err(ParseError::Malformed {
                    message: bounded("object stream missing object number"),
                });
            };
            parser.skip_required_ws()?;
            let Some(relative_offset) = parser.parse_unsigned::<usize>()? else {
                return Err(ParseError::Malformed {
                    message: bounded("object stream missing object offset"),
                });
            };
            let Some(number) = NonZeroU32::new(number) else {
                return Err(ParseError::Malformed {
                    message: bounded("object number must be non-zero"),
                });
            };
            headers.push((
                ObjectKey {
                    number,
                    // Objects inside an object stream always use generation 0.
                    generation: 0,
                },
                relative_offset,
            ));
        }

        let mut objects = Vec::with_capacity(count);
        for (key, relative_offset) in headers {
            let object_pos =
                first
                    .checked_add(relative_offset)
                    .ok_or(ParseError::ArithmeticOverflow {
                        context: "object stream offset",
                    })?;
            if object_pos >= decoded.len() {
                return Err(ParseError::Malformed {
                    message: bounded("object stream object offset exceeds decoded bytes"),
                });
            }
            parser.pos = object_pos;
            let object = parser.parse_object(0)?;
            // Absolute offset = stream raw-range start + position inside the
            // decoded payload. NOTE(review): this maps into the *raw*
            // (possibly filtered) range, so it is approximate for encoded
            // streams — confirm consumers only use it for reporting.
            let offset = u64::try_from(object_pos)
                .ok()
                .and_then(|relative| stream.raw_range.start.checked_add(relative))
                .ok_or(ParseError::ArithmeticOverflow {
                    context: "object stream object offset",
                })?;
            if key == stream_key {
                return Err(ParseError::Malformed {
                    message: bounded("object stream cannot contain itself"),
                });
            }
            objects.push(IndirectObject {
                key,
                offset,
                object,
            });
        }
        Ok(objects)
    }
1338
    /// Locates the `%PDF-` marker and parses the `major.minor` version.
    ///
    /// Missing or malformed version components are recovered (defaulting
    /// toward 1.4) with a warning instead of failing the parse. Returns the
    /// header's byte offset and the (possibly recovered) version.
    fn parse_header(&mut self) -> std::result::Result<(u64, PdfVersion), ParseError> {
        let Some(header_pos) = self.source.find_bytes(HEADER_MARKER, 0, self.source.len()) else {
            return Err(ParseError::Malformed {
                message: bounded("missing PDF header"),
            });
        };
        self.pos = header_pos
            .checked_add(HEADER_MARKER.len())
            .ok_or(ParseError::ArithmeticOverflow { context: "header" })?;
        let mut malformed = false;
        // Recover a missing major digit as 1.
        let major = if let Some(value) = self.parse_version_digit() {
            value
        } else {
            malformed = true;
            1
        };
        if self.peek_byte() == Some(b'.') {
            self.pos = self.pos.saturating_add(1);
        } else {
            malformed = true;
        }
        // Recover a missing minor digit as 4.
        let minor = if let Some(value) = self.parse_version_digit() {
            value
        } else {
            malformed = true;
            4
        };
        if malformed {
            self.warnings.push(ValidationWarning::General {
                message: BoundedText::unchecked("malformed PDF header version recovered as 1.4"),
            });
        }
        Ok((
            u64::try_from(header_pos).map_err(|_| ParseError::ArithmeticOverflow {
                context: "header offset",
            })?,
            PdfVersion { major, minor },
        ))
    }
1378
1379    fn parse_version_digit(&mut self) -> Option<u8> {
1380        let byte = self.peek_byte()?;
1381        if byte.is_ascii_digit() {
1382            self.pos = self.pos.saturating_add(1);
1383            Some(byte.saturating_sub(b'0'))
1384        } else {
1385            None
1386        }
1387    }
1388
    /// Attempts to parse `N G obj … endobj` at the current position.
    ///
    /// Returns `Ok(None)` (with the cursor restored) when the bytes do not
    /// form an object header, letting the caller resynchronize. A dictionary
    /// followed by the `stream` keyword is promoted to a stream object, and
    /// a missing `endobj` keyword is tolerated.
    fn parse_indirect_object(&mut self) -> std::result::Result<Option<IndirectObject>, ParseError> {
        let start = self.pos;
        let Some(number) = self.parse_unsigned_u32()? else {
            return Ok(None);
        };
        self.skip_required_ws()?;
        let Some(generation) = self.parse_unsigned_u16()? else {
            self.pos = start;
            return Ok(None);
        };
        self.skip_required_ws()?;
        if !self.starts_with(b"obj") {
            self.pos = start;
            return Ok(None);
        }
        self.consume_bytes(b"obj")?;
        // A zero object number is rejected as malformed once the `obj`
        // keyword has committed us to an object header.
        let Some(number) = NonZeroU32::new(number) else {
            return Err(ParseError::Malformed {
                message: bounded("object number must be non-zero"),
            });
        };
        let key = ObjectKey { number, generation };
        let object_start = u64::try_from(start).map_err(|_| ParseError::ArithmeticOverflow {
            context: "object offset",
        })?;

        let parsed = self.parse_object(0)?;
        let object = match parsed {
            CosObject::Dictionary(dictionary) if self.peek_stream_marker() => {
                CosObject::Stream(self.parse_stream(key, dictionary)?)
            }
            other => other,
        };
        self.skip_ws_and_comments();
        // `endobj` is consumed if present; its absence is not an error.
        if self.starts_with(ENDOBJ_MARKER) {
            self.consume_bytes(ENDOBJ_MARKER)?;
        }
        Ok(Some(IndirectObject {
            key,
            offset: object_start,
            object,
        }))
    }
1432
    /// Parses a single COS object at the cursor, dispatching on the leading
    /// token bytes.
    ///
    /// `depth` guards recursion through containers against
    /// `max_object_depth`. Note the ordering: `<<` (dictionary) must be
    /// tested before the single-`<` hex-string case.
    fn parse_object(&mut self, depth: u32) -> std::result::Result<CosObject, ParseError> {
        if depth > self.limits.max_object_depth {
            return Err(ParseError::LimitExceeded {
                limit: "max_object_depth",
            });
        }
        self.skip_ws_and_comments();
        if self.starts_with(b"<<") {
            return Ok(CosObject::Dictionary(self.parse_dictionary(depth)?));
        }
        if self.starts_with(b"[") {
            return Ok(CosObject::Array(self.parse_array(depth)?));
        }
        match self.peek_byte() {
            Some(b'/') => self.parse_name().map(CosObject::Name),
            Some(b'(') => self.parse_literal_string().map(CosObject::String),
            Some(b'<') => self.parse_hex_string().map(CosObject::String),
            Some(b't') if self.starts_with(b"true") => {
                self.consume_bytes(b"true")?;
                Ok(CosObject::Boolean(true))
            }
            Some(b'f') if self.starts_with(b"false") => {
                self.consume_bytes(b"false")?;
                Ok(CosObject::Boolean(false))
            }
            Some(b'n') if self.starts_with(b"null") => {
                self.consume_bytes(b"null")?;
                Ok(CosObject::Null)
            }
            Some(b'-' | b'+' | b'.' | b'0'..=b'9') => self.parse_number_or_reference(),
            _ => Err(ParseError::Malformed {
                message: BoundedText::unchecked(format!("unexpected token at {}", self.pos)),
            }),
        }
    }
1468
1469    fn parse_dictionary(&mut self, depth: u32) -> std::result::Result<Dictionary, ParseError> {
1470        self.consume_bytes(b"<<")?;
1471        let mut dictionary = Dictionary::default();
1472        loop {
1473            self.skip_ws_and_comments();
1474            if self.starts_with(b">>") {
1475                self.consume_bytes(b">>")?;
1476                return Ok(dictionary);
1477            }
1478            let key = self.parse_name()?;
1479            let value = self.parse_object(depth.saturating_add(1))?;
1480            let next_len =
1481                u64::try_from(dictionary.len()).map_err(|_| ParseError::ArithmeticOverflow {
1482                    context: "dictionary length",
1483                })? + 1;
1484            if next_len > self.limits.max_dict_entries {
1485                return Err(ParseError::LimitExceeded {
1486                    limit: "max_dict_entries",
1487                });
1488            }
1489            dictionary.insert(key, value);
1490        }
1491    }
1492
1493    fn parse_array(&mut self, depth: u32) -> std::result::Result<Vec<CosObject>, ParseError> {
1494        self.consume_bytes(b"[")?;
1495        let mut values = Vec::new();
1496        loop {
1497            self.skip_ws_and_comments();
1498            if self.starts_with(b"]") {
1499                self.consume_bytes(b"]")?;
1500                return Ok(values);
1501            }
1502            let value = self.parse_object(depth.saturating_add(1))?;
1503            let next_len =
1504                u64::try_from(values.len()).map_err(|_| ParseError::ArithmeticOverflow {
1505                    context: "array length",
1506                })? + 1;
1507            if next_len > self.limits.max_array_len {
1508                return Err(ParseError::LimitExceeded {
1509                    limit: "max_array_len",
1510                });
1511            }
1512            values.push(value);
1513        }
1514    }
1515
1516    fn parse_name(&mut self) -> std::result::Result<PdfName, ParseError> {
1517        self.consume_bytes(b"/")?;
1518        let mut bytes = Vec::new();
1519        while let Some(byte) = self.peek_byte() {
1520            if is_delimiter(byte) || is_ws(byte) {
1521                break;
1522            }
1523            self.pos = self.pos.saturating_add(1);
1524            if byte == b'#' {
1525                let high = self.next_byte().ok_or(ParseError::Malformed {
1526                    message: bounded("truncated name escape"),
1527                })?;
1528                let low = self.next_byte().ok_or(ParseError::Malformed {
1529                    message: bounded("truncated name escape"),
1530                })?;
1531                let decoded = decode_hex_pair(high, low).ok_or(ParseError::Malformed {
1532                    message: bounded("invalid name escape"),
1533                })?;
1534                bytes.push(decoded);
1535            } else {
1536                bytes.push(byte);
1537            }
1538            if bytes.len() > self.limits.max_name_bytes {
1539                return Err(ParseError::LimitExceeded {
1540                    limit: "max_name_bytes",
1541                });
1542            }
1543        }
1544        PdfName::new(bytes, &self.limits)
1545    }
1546
    /// Parses a literal string `(...)`, honoring backslash escapes and
    /// balanced nested parentheses.
    ///
    /// Only the single-character escapes `\n \r \t \b \f` are translated;
    /// any other escaped byte (including octal digits and escaped line
    /// endings) is kept verbatim — a deliberately tolerant subset of the
    /// full PDF escape rules.
    fn parse_literal_string(&mut self) -> std::result::Result<PdfString, ParseError> {
        self.consume_bytes(b"(")?;
        // Tracks unescaped parenthesis nesting; the string ends when the
        // depth drops back to zero.
        let mut depth = 1_u32;
        let mut bytes = Vec::new();
        while let Some(byte) = self.next_byte() {
            match byte {
                b'\\' => {
                    let Some(escaped) = self.next_byte() else {
                        return Err(ParseError::Malformed {
                            message: bounded("truncated string escape"),
                        });
                    };
                    bytes.push(match escaped {
                        b'n' => b'\n',
                        b'r' => b'\r',
                        b't' => b'\t',
                        b'b' => 0x08,
                        b'f' => 0x0c,
                        // Any other escaped byte passes through unchanged;
                        // this also neutralizes `\(` and `\)` for nesting.
                        other => other,
                    });
                }
                b'(' => {
                    depth = depth.checked_add(1).ok_or(ParseError::ArithmeticOverflow {
                        context: "string nesting",
                    })?;
                    bytes.push(byte);
                }
                b')' => {
                    depth = depth.saturating_sub(1);
                    if depth == 0 {
                        return PdfString::new(bytes, &self.limits);
                    }
                    bytes.push(byte);
                }
                other => bytes.push(other),
            }
            // Length limit is enforced per byte so huge strings fail early.
            if bytes.len() > self.limits.max_string_bytes {
                return Err(ParseError::LimitExceeded {
                    limit: "max_string_bytes",
                });
            }
        }
        Err(ParseError::Malformed {
            message: bounded("unterminated literal string"),
        })
    }
1593
1594    fn parse_hex_string(&mut self) -> std::result::Result<PdfString, ParseError> {
1595        self.consume_bytes(b"<")?;
1596        let mut nibbles = Vec::new();
1597        while let Some(byte) = self.peek_byte() {
1598            if byte == b'>' {
1599                self.pos = self.pos.saturating_add(1);
1600                break;
1601            }
1602            self.pos = self.pos.saturating_add(1);
1603            if !is_ws(byte) {
1604                nibbles.push(byte);
1605            }
1606        }
1607        if nibbles.len() % 2 != 0 {
1608            nibbles.push(b'0');
1609        }
1610        let mut bytes = Vec::with_capacity(nibbles.len() / 2);
1611        for pair in nibbles.chunks(2) {
1612            let high = pair.first().copied().ok_or(ParseError::Malformed {
1613                message: bounded("invalid hex string"),
1614            })?;
1615            let low = pair.get(1).copied().ok_or(ParseError::Malformed {
1616                message: bounded("invalid hex string"),
1617            })?;
1618            let decoded = decode_hex_pair(high, low).ok_or(ParseError::Malformed {
1619                message: bounded("invalid hex string"),
1620            })?;
1621            bytes.push(decoded);
1622        }
1623        PdfString::new(bytes, &self.limits)
1624    }
1625
1626    fn parse_number_or_reference(&mut self) -> std::result::Result<CosObject, ParseError> {
1627        let first_start = self.pos;
1628        let first = self.parse_number_token()?;
1629        if let NumberToken::Integer(first_integer) = first {
1630            let after_first = self.pos;
1631            if self.skip_required_ws().is_ok() {
1632                let second_start = self.pos;
1633                if let Some(generation) = self.parse_unsigned_u16()?
1634                    && self.skip_required_ws().is_ok()
1635                    && self.starts_with(b"R")
1636                {
1637                    self.consume_bytes(b"R")?;
1638                    if let Some(number) =
1639                        NonZeroU32::new(u32::try_from(first_integer).map_err(|_| {
1640                            ParseError::Malformed {
1641                                message: bounded("reference object number out of range"),
1642                            }
1643                        })?)
1644                    {
1645                        return Ok(CosObject::Reference(ObjectKey { number, generation }));
1646                    }
1647                }
1648                self.pos = second_start;
1649            }
1650            self.pos = after_first;
1651        }
1652        self.pos = first_start;
1653        match self.parse_number_token()? {
1654            NumberToken::Integer(value) => Ok(CosObject::Integer(value)),
1655            NumberToken::Real(value) => Ok(CosObject::Real(value)),
1656        }
1657    }
1658
1659    fn parse_number_token(&mut self) -> std::result::Result<NumberToken, ParseError> {
1660        self.skip_ws_and_comments();
1661        let start = self.pos;
1662        if matches!(self.peek_byte(), Some(b'+' | b'-')) {
1663            self.pos = self.pos.saturating_add(1);
1664        }
1665        let mut has_dot = false;
1666        while let Some(byte) = self.peek_byte() {
1667            if byte == b'.' {
1668                has_dot = true;
1669                self.pos = self.pos.saturating_add(1);
1670            } else if byte.is_ascii_digit() {
1671                self.pos = self.pos.saturating_add(1);
1672            } else {
1673                break;
1674            }
1675        }
1676        let token = self.slice(start, self.pos)?;
1677        let text = std::str::from_utf8(token.as_ref()).map_err(|_| ParseError::Malformed {
1678            message: bounded("number is not valid ASCII"),
1679        })?;
1680        if has_dot {
1681            let value = text.parse::<f64>().map_err(|_| ParseError::Malformed {
1682                message: bounded("invalid real number"),
1683            })?;
1684            if !value.is_finite() {
1685                return Err(ParseError::Malformed {
1686                    message: bounded("non-finite real number"),
1687                });
1688            }
1689            Ok(NumberToken::Real(value))
1690        } else {
1691            let value = text.parse::<i64>().map_err(|_| ParseError::Malformed {
1692                message: bounded("invalid integer"),
1693            })?;
1694            Ok(NumberToken::Integer(value))
1695        }
1696    }
1697
    /// Parses a stream body whose dictionary has already been read,
    /// returning the raw (undecoded) stream description.
    ///
    /// The data end is taken from a direct integer `/Length` when that
    /// offset is followed by `endstream` (optionally after one EOL);
    /// otherwise the parser scans forward for the `endstream` keyword,
    /// bounded by `max_stream_declared_bytes`. Length and keyword-spacing
    /// facts are pushed as a side effect.
    fn parse_stream(
        &mut self,
        key: ObjectKey,
        dictionary: Dictionary,
    ) -> std::result::Result<StreamObject, ParseError> {
        self.skip_ws_and_comments();
        self.consume_bytes(STREAM_MARKER)?;
        // Only CRLF after `stream` counts as compliant; a bare LF or lone CR
        // is tolerated but recorded as non-compliant.
        let stream_keyword_crlf_compliant = self.starts_with(b"\r\n");
        if self.starts_with(b"\r\n") {
            self.consume_bytes(b"\r\n")?;
        } else if self.starts_with(b"\n") || self.starts_with(b"\r") {
            self.pos = self.pos.saturating_add(1);
        }
        let data_start = self.pos;
        // Only a direct, non-negative integer /Length is honored here.
        let declared_length = integer_from_dictionary(&dictionary, "Length")
            .and_then(|value| u64::try_from(value).ok());
        if let Some(declared) = declared_length
            && declared > self.limits.max_stream_declared_bytes
        {
            return Err(ParseError::LimitExceeded {
                limit: "max_stream_declared_bytes",
            });
        }

        // Trust the declared length only when `endstream` actually sits at
        // (or just after an EOL at) the declared end; otherwise fall back to
        // a bounded forward scan.
        let declared_end = declared_length
            .and_then(|length| usize::try_from(length).ok())
            .and_then(|length| data_start.checked_add(length));
        let declared_keyword =
            declared_end.and_then(|offset| endstream_after_optional_eol(&self.source, offset));
        let (data_end, endstream_pos) =
            if let (Some(data_end), Some(keyword_pos)) = (declared_end, declared_keyword) {
                (data_end, keyword_pos)
            } else {
                let max_scan =
                    usize::try_from(self.limits.max_stream_declared_bytes).map_err(|_| {
                        ParseError::LimitExceeded {
                            limit: "max_stream_declared_bytes",
                        }
                    })?;
                let scan_end = data_start
                    .checked_add(max_scan)
                    .map_or(self.source.len(), |end| end.min(self.source.len()));
                let keyword_pos = self
                    .source
                    .find_bytes(ENDSTREAM_MARKER, data_start, scan_end)
                    .ok_or(ParseError::Malformed {
                        message: bounded("missing endstream"),
                    })?;
                (
                    // The EOL before `endstream` is a separator, not data.
                    trim_eol_before(&self.source, data_start, keyword_pos),
                    keyword_pos,
                )
            };
        let endstream_keyword_eol_compliant = has_eol_before(&self.source, endstream_pos);
        let discovered_length =
            u64::try_from(data_end.saturating_sub(data_start)).map_err(|_| {
                ParseError::ArithmeticOverflow {
                    context: "stream length",
                }
            })?;
        self.pos = endstream_pos;
        self.consume_bytes(ENDSTREAM_MARKER)?;

        let filters = stream_filters(&dictionary);
        // When /Length is missing, report the discovered length as declared
        // so the fact always carries both values.
        self.push_fact(ParseFact::Stream {
            object: key,
            fact: StreamFact::Length {
                declared: declared_length.unwrap_or(discovered_length),
                discovered: discovered_length,
            },
        });
        self.push_fact(ParseFact::Stream {
            object: key,
            fact: StreamFact::KeywordSpacing {
                stream_keyword_crlf_compliant,
                endstream_keyword_eol_compliant,
            },
        });

        let decode_params = stream_decode_params(&dictionary, filters.len());
        let (raw_source, raw_range) = self.source.stream_source(data_start, data_end)?;
        Ok(StreamObject {
            dictionary,
            raw_range,
            declared_length,
            discovered_length,
            filters,
            decode_params,
            raw_source,
            stream_keyword_crlf_compliant,
            endstream_keyword_eol_compliant,
        })
    }
1791
    /// Parses a classic `xref` table and, when present, its `trailer`
    /// dictionary, appending the trailer to `trailers`.
    ///
    /// Malformed subsection headers or entries mark the section fact as
    /// `MalformedClassic` instead of aborting; parsing stops at `trailer`,
    /// `startxref`, `%%EOF`, or end of input. The total entry count is
    /// capped by `limits.max_objects`.
    fn parse_xref_and_trailer(
        &mut self,
        trailers: &mut Vec<Trailer>,
    ) -> std::result::Result<(), ParseError> {
        let section_offset = self.offset()?;
        self.consume_bytes(b"xref")?;
        let mut compliant = true;
        let mut parsed_entries = 0_u64;
        // Subsection loop: each iteration handles one "first count" header
        // and its fixed-width entry lines.
        loop {
            self.skip_ws_and_comments();
            if self.pos >= self.source.len() || self.starts_with(b"trailer") {
                break;
            }
            if self.starts_with(b"startxref") || self.starts_with(EOF_MARKER) {
                break;
            }
            // An unparsable header line is skipped rather than fatal.
            let Some(_first_object) = self.parse_unsigned_u32()? else {
                compliant = false;
                self.skip_line();
                continue;
            };
            self.skip_required_ws()?;
            let Some(count) = self.parse_unsigned_u32()? else {
                compliant = false;
                self.skip_line();
                continue;
            };
            self.skip_line();
            for _ in 0..count {
                let line_start = self.pos;
                // Entries are fixed-width: 10-digit offset, 5-digit
                // generation, and an `n`/`f` marker.
                let offset = self.parse_fixed_digits(10);
                self.skip_xref_spaces();
                let generation = self.parse_fixed_digits(5);
                self.skip_xref_spaces();
                let marker = self.next_byte();
                if offset.is_none()
                    || generation.is_none()
                    || !matches!(marker, Some(b'n' | b'f'))
                    || !line_had_eol(&self.source, line_start)
                {
                    compliant = false;
                }
                parsed_entries =
                    parsed_entries
                        .checked_add(1)
                        .ok_or(ParseError::ArithmeticOverflow {
                            context: "xref entries",
                        })?;
                if parsed_entries > self.limits.max_objects {
                    return Err(ParseError::LimitExceeded {
                        limit: "max_objects",
                    });
                }
                self.skip_line();
            }
        }
        self.push_fact(ParseFact::Xref {
            section: ObjectLocation {
                object: None,
                offset: Some(section_offset),
                path: None,
            },
            fact: if compliant {
                XrefFact::EolMarkersComply
            } else {
                XrefFact::MalformedClassic
            },
        });
        // Hunt for the trailer dictionary, skipping junk lines until one of
        // the terminators is found.
        loop {
            self.skip_ws_and_comments();
            if self.pos >= self.source.len() {
                return Ok(());
            }
            if self.starts_with(b"trailer") {
                self.consume_bytes(b"trailer")?;
                self.skip_ws_and_comments();
                let offset = self.offset()?;
                let dictionary = self.parse_dictionary(0)?;
                self.push_xref_chain_facts(None, offset, &dictionary)?;
                trailers.push(Trailer { dictionary, offset });
                return Ok(());
            }
            if self.starts_with(b"startxref") || self.starts_with(EOF_MARKER) {
                return Ok(());
            }
            self.skip_line();
        }
    }
1880
1881    fn parse_post_eof_fact(&mut self) -> std::result::Result<(), ParseError> {
1882        self.consume_bytes(EOF_MARKER)?;
1883        let remaining =
1884            self.source
1885                .len()
1886                .saturating_sub(self.pos)
1887                .saturating_sub(count_trailing_ws(
1888                    self.slice(self.pos, self.source.len())?.as_ref(),
1889                ));
1890        if remaining > 0 {
1891            self.push_fact(ParseFact::PostEofData {
1892                bytes: u64::try_from(remaining).map_err(|_| ParseError::ArithmeticOverflow {
1893                    context: "post eof bytes",
1894                })?,
1895            });
1896        }
1897        Ok(())
1898    }
1899
1900    fn push_xref_chain_facts(
1901        &mut self,
1902        object: Option<ObjectKey>,
1903        offset: u64,
1904        dictionary: &Dictionary,
1905    ) -> std::result::Result<(), ParseError> {
1906        let section = ObjectLocation {
1907            object,
1908            offset: Some(offset),
1909            path: None,
1910        };
1911        if let Some(prev) = optional_non_negative_offset(dictionary, "Prev")? {
1912            self.push_fact(ParseFact::Xref {
1913                section: section.clone(),
1914                fact: XrefFact::PrevChain { offset: prev },
1915            });
1916        }
1917        if let Some(hybrid) = optional_non_negative_offset(dictionary, "XRefStm")? {
1918            self.push_fact(ParseFact::Xref {
1919                section,
1920                fact: XrefFact::HybridReference { offset: hybrid },
1921            });
1922        }
1923        Ok(())
1924    }
1925
1926    fn push_fact(&mut self, fact: ParseFact) {
1927        if self.parse_facts.len() >= self.limits.max_parse_facts {
1928            if !self
1929                .warnings
1930                .iter()
1931                .any(|warning| matches!(warning, ValidationWarning::ParseFactCapReached { .. }))
1932            {
1933                self.warnings.push(ValidationWarning::ParseFactCapReached {
1934                    cap: self.limits.max_parse_facts,
1935                });
1936            }
1937            return;
1938        }
1939        self.parse_facts.push(fact);
1940    }
1941
1942    fn peek_stream_marker(&mut self) -> bool {
1943        let saved = self.pos;
1944        self.skip_ws_and_comments();
1945        let found = self.starts_with(STREAM_MARKER);
1946        self.pos = saved;
1947        found
1948    }
1949
    /// Parses an unsigned decimal token as `u32`; `Ok(None)` when no digit
    /// is present at the cursor.
    fn parse_unsigned_u32(&mut self) -> std::result::Result<Option<u32>, ParseError> {
        self.parse_unsigned::<u32>()
    }
1953
    /// Parses an unsigned decimal token as `u16`; `Ok(None)` when no digit
    /// is present at the cursor.
    fn parse_unsigned_u16(&mut self) -> std::result::Result<Option<u16>, ParseError> {
        self.parse_unsigned::<u16>()
    }
1957
1958    fn parse_fixed_digits(&mut self, len: usize) -> Option<u64> {
1959        let end = self.pos.checked_add(len)?;
1960        let slice = self.source.slice(self.pos, end).ok()?;
1961        if !slice.iter().all(u8::is_ascii_digit) {
1962            return None;
1963        }
1964        self.pos = end;
1965        std::str::from_utf8(slice.as_ref())
1966            .ok()?
1967            .parse::<u64>()
1968            .ok()
1969    }
1970
1971    fn parse_unsigned<T>(&mut self) -> std::result::Result<Option<T>, ParseError>
1972    where
1973        T: std::str::FromStr,
1974    {
1975        self.skip_ws_and_comments();
1976        let start = self.pos;
1977        while let Some(byte) = self.peek_byte() {
1978            if byte.is_ascii_digit() {
1979                self.pos = self.pos.saturating_add(1);
1980            } else {
1981                break;
1982            }
1983        }
1984        if start == self.pos {
1985            return Ok(None);
1986        }
1987        let token = self.slice(start, self.pos)?;
1988        let text = std::str::from_utf8(token.as_ref()).map_err(|_| ParseError::Malformed {
1989            message: bounded("unsigned integer is not ASCII"),
1990        })?;
1991        text.parse::<T>()
1992            .map(Some)
1993            .map_err(|_| ParseError::Malformed {
1994                message: bounded("unsigned integer out of range"),
1995            })
1996    }
1997
    /// Skips whitespace and `%` comment lines, but stops at `%%EOF` so the
    /// EOF marker is left for the caller to consume.
    fn skip_ws_and_comments(&mut self) {
        loop {
            while self.peek_byte().is_some_and(is_ws) {
                self.pos = self.pos.saturating_add(1);
            }
            // `%%EOF` starts with `%`; test it before the comment branch so
            // it is not consumed as an ordinary comment line.
            if self.starts_with(EOF_MARKER) {
                break;
            }
            if self.peek_byte() == Some(b'%') {
                self.skip_line();
            } else {
                break;
            }
        }
    }
2013
2014    fn skip_xref_spaces(&mut self) {
2015        while matches!(self.peek_byte(), Some(b'\t' | b' ')) {
2016            self.pos = self.pos.saturating_add(1);
2017        }
2018    }
2019
2020    fn skip_required_ws(&mut self) -> std::result::Result<(), ParseError> {
2021        let start = self.pos;
2022        while self.peek_byte().is_some_and(is_ws) {
2023            self.pos = self.pos.saturating_add(1);
2024        }
2025        if self.pos == start {
2026            return Err(ParseError::Malformed {
2027                message: bounded("expected whitespace"),
2028            });
2029        }
2030        Ok(())
2031    }
2032
2033    fn skip_line(&mut self) {
2034        while let Some(byte) = self.peek_byte() {
2035            self.pos = self.pos.saturating_add(1);
2036            if matches!(byte, b'\n' | b'\r') {
2037                break;
2038            }
2039        }
2040    }
2041
2042    fn consume_bytes(&mut self, expected: &[u8]) -> std::result::Result<(), ParseError> {
2043        if !self.starts_with(expected) {
2044            return Err(ParseError::Malformed {
2045                message: BoundedText::unchecked(format!("unexpected token at {}", self.pos)),
2046            });
2047        }
2048        self.pos = self
2049            .pos
2050            .checked_add(expected.len())
2051            .ok_or(ParseError::ArithmeticOverflow { context: "offset" })?;
2052        Ok(())
2053    }
2054
    /// Tests whether the bytes at the cursor begin with `expected`.
    fn starts_with(&self, expected: &[u8]) -> bool {
        self.source.starts_with(self.pos, expected)
    }
2058
    /// Returns the byte at the cursor without advancing.
    fn peek_byte(&self) -> Option<u8> {
        self.source.byte(self.pos)
    }
2062
2063    fn next_byte(&mut self) -> Option<u8> {
2064        let byte = self.peek_byte()?;
2065        self.pos = self.pos.saturating_add(1);
2066        Some(byte)
2067    }
2068
    /// Returns the bytes in `start..end` as a `Cow` from the backing source.
    fn slice(&self, start: usize, end: usize) -> std::result::Result<Cow<'_, [u8]>, ParseError> {
        self.source.slice(start, end)
    }
2072
    /// Returns the cursor position as a `u64` byte offset.
    fn offset(&self) -> std::result::Result<u64, ParseError> {
        u64::try_from(self.pos).map_err(|_| ParseError::ArithmeticOverflow { context: "offset" })
    }
2076}
2077
/// Wraps a static message into the bounded error-text type.
fn bounded(value: &str) -> BoundedText {
    BoundedText::unchecked(value)
}
2081
/// Returns `true` for the six PDF whitespace bytes: NUL, TAB, LF, FF, CR,
/// and space.
fn is_ws(byte: u8) -> bool {
    const WHITESPACE: [u8; 6] = [b'\0', b'\t', b'\n', b'\x0c', b'\r', b' '];
    WHITESPACE.contains(&byte)
}
2085
/// Returns `true` for the PDF delimiter bytes: `( ) < > [ ] { } / %`.
fn is_delimiter(byte: u8) -> bool {
    b"()<>[]{}/%".contains(&byte)
}
2092
2093fn decode_hex_pair(high: u8, low: u8) -> Option<u8> {
2094    let high = decode_hex_digit(high)?;
2095    let low = decode_hex_digit(low)?;
2096    Some(high.saturating_mul(16).saturating_add(low))
2097}
2098
/// Decodes a single ASCII hex digit (either case) to its 0-15 value.
fn decode_hex_digit(byte: u8) -> Option<u8> {
    if byte.is_ascii_digit() {
        Some(byte.saturating_sub(b'0'))
    } else if (b'a'..=b'f').contains(&byte) {
        Some(byte.saturating_sub(b'a').saturating_add(10))
    } else if (b'A'..=b'F').contains(&byte) {
        Some(byte.saturating_sub(b'A').saturating_add(10))
    } else {
        None
    }
}
2107
/// Finds the first occurrence of `needle` in `haystack` at or after
/// `start`, returning its absolute position.
///
/// Returns `None` when `start` is past the end or the needle is absent.
/// An empty needle matches immediately at `start`; the explicit guard is
/// required because `slice::windows` panics on a window size of zero.
fn find_bytes(haystack: &[u8], needle: &[u8], start: usize) -> Option<usize> {
    let tail = haystack.get(start..)?;
    if needle.is_empty() {
        return Some(start);
    }
    tail.windows(needle.len())
        .position(|window| window == needle)
        .and_then(|relative| start.checked_add(relative))
}
2115
2116fn has_eol_before(source: &SourceStorage, pos: usize) -> bool {
2117    matches!(
2118        pos.checked_sub(1).and_then(|index| source.byte(index)),
2119        Some(b'\n' | b'\r')
2120    )
2121}
2122
/// Reports whether a line-ending byte exists at or after `line_start`.
///
/// NOTE(review): `relative >= line_start` is always true because
/// `find_bytes` only searches from `line_start` onward, so this currently
/// reduces to "some EOL byte exists after the line start" rather than
/// "this specific entry line ended with an EOL" — confirm that is the
/// intended compliance check for classic xref entries.
fn line_had_eol(source: &SourceStorage, line_start: usize) -> bool {
    let Some(relative) = source.find_bytes(b"\n", line_start, source.len()) else {
        return source.find_bytes(b"\r", line_start, source.len()).is_some();
    };
    relative >= line_start
}
2129
2130fn endstream_after_optional_eol(source: &SourceStorage, offset: usize) -> Option<usize> {
2131    if source.starts_with(offset, ENDSTREAM_MARKER) {
2132        return Some(offset);
2133    }
2134    if source.starts_with(offset, b"\r\nendstream") {
2135        return offset.checked_add(2);
2136    }
2137    if source.starts_with(offset, b"\nendstream") || source.starts_with(offset, b"\rendstream") {
2138        return offset.checked_add(1);
2139    }
2140    None
2141}
2142
/// Steps `keyword_pos` back over the single EOL separating stream data
/// from `endstream`, never moving before `data_start`.
///
/// CRLF is checked first so it is trimmed as one two-byte separator rather
/// than leaving a stray `\r` in the data.
fn trim_eol_before(source: &SourceStorage, data_start: usize, keyword_pos: usize) -> usize {
    if keyword_pos >= data_start.saturating_add(2)
        && source
            .slice(keyword_pos.saturating_sub(2), keyword_pos)
            .is_ok_and(|bytes| bytes.as_ref() == b"\r\n")
    {
        return keyword_pos.saturating_sub(2);
    }
    if keyword_pos > data_start
        && matches!(
            source.byte(keyword_pos.saturating_sub(1)),
            Some(b'\n' | b'\r')
        )
    {
        return keyword_pos.saturating_sub(1);
    }
    // No EOL separator: the data runs right up to the keyword.
    keyword_pos
}
2161
2162fn count_trailing_ws(bytes: &[u8]) -> usize {
2163    bytes.iter().rev().take_while(|byte| is_ws(**byte)).count()
2164}
2165
2166fn integer_from_dictionary(dictionary: &Dictionary, key: &str) -> Option<i64> {
2167    match dictionary.get(key) {
2168        Some(CosObject::Integer(value)) => Some(*value),
2169        _ => None,
2170    }
2171}
2172
2173fn non_negative_usize_from_dictionary(
2174    dictionary: &Dictionary,
2175    key: &'static str,
2176) -> std::result::Result<usize, ParseError> {
2177    let value = non_negative_u64_from_dictionary(dictionary, key)?;
2178    usize::try_from(value).map_err(|_| ParseError::Malformed {
2179        message: BoundedText::unchecked(format!("invalid object stream {key}")),
2180    })
2181}
2182
2183fn non_negative_u64_from_dictionary(
2184    dictionary: &Dictionary,
2185    key: &'static str,
2186) -> std::result::Result<u64, ParseError> {
2187    let Some(value) = integer_from_dictionary(dictionary, key) else {
2188        return Err(ParseError::Malformed {
2189            message: BoundedText::unchecked(format!("missing integer dictionary key {key}")),
2190        });
2191    };
2192    u64::try_from(value).map_err(|_| ParseError::Malformed {
2193        message: BoundedText::unchecked(format!("invalid non-negative dictionary key {key}")),
2194    })
2195}
2196
2197fn optional_non_negative_offset(
2198    dictionary: &Dictionary,
2199    key: &'static str,
2200) -> std::result::Result<Option<u64>, ParseError> {
2201    let Some(value) = integer_from_dictionary(dictionary, key) else {
2202        return Ok(None);
2203    };
2204    u64::try_from(value)
2205        .map(Some)
2206        .map_err(|_| ParseError::Malformed {
2207            message: BoundedText::unchecked(format!("invalid xref offset dictionary key {key}")),
2208        })
2209}
2210
2211fn xref_widths(dictionary: &Dictionary) -> std::result::Result<[usize; 3], ParseError> {
2212    let Some(CosObject::Array(values)) = dictionary.get("W") else {
2213        return Err(ParseError::Malformed {
2214            message: bounded("xref stream missing W array"),
2215        });
2216    };
2217    if values.len() != 3 {
2218        return Err(ParseError::Malformed {
2219            message: bounded("xref stream W array must have three entries"),
2220        });
2221    }
2222    let mut widths = [0_usize; 3];
2223    for (index, value) in values.iter().enumerate() {
2224        let CosObject::Integer(width) = value else {
2225            return Err(ParseError::Malformed {
2226                message: bounded("xref stream W entry must be integer"),
2227            });
2228        };
2229        let width = usize::try_from(*width).map_err(|_| ParseError::Malformed {
2230            message: bounded("xref stream W entry must be non-negative"),
2231        })?;
2232        if width > 8 {
2233            return Err(ParseError::Malformed {
2234                message: bounded("xref stream W entry exceeds supported width"),
2235            });
2236        }
2237        let Some(slot) = widths.get_mut(index) else {
2238            return Err(ParseError::Malformed {
2239                message: bounded("xref stream W index out of bounds"),
2240            });
2241        };
2242        *slot = width;
2243    }
2244    Ok(widths)
2245}
2246
/// Reads the `/Index` array of an xref stream as `(first, count)` pairs,
/// defaulting to a single `(0, size)` span when the key is absent.
///
/// Entries must come in pairs of non-negative integers; each pair's sum is
/// overflow-checked (the sum itself is discarded) before the pair is kept.
fn xref_indexes(
    dictionary: &Dictionary,
    size: u64,
) -> std::result::Result<Vec<(u64, u64)>, ParseError> {
    let Some(index_object) = dictionary.get("Index") else {
        return Ok(vec![(0, size)]);
    };
    let CosObject::Array(values) = index_object else {
        return Err(ParseError::Malformed {
            message: bounded("xref stream Index must be an array"),
        });
    };
    if values.len() % 2 != 0 {
        return Err(ParseError::Malformed {
            message: bounded("xref stream Index must contain pairs"),
        });
    }
    let mut indexes = Vec::with_capacity(values.len() / 2);
    for pair in values.chunks(2) {
        // Both entries are fetched (and type-checked) before either range
        // check, so "must be integer" errors win over sign errors.
        let first = integer_value(pair.first(), "xref stream Index first")?;
        let count = integer_value(pair.get(1), "xref stream Index count")?;
        let first = u64::try_from(first).map_err(|_| ParseError::Malformed {
            message: bounded("xref stream Index first must be non-negative"),
        })?;
        let count = u64::try_from(count).map_err(|_| ParseError::Malformed {
            message: bounded("xref stream Index count must be non-negative"),
        })?;
        // Overflow check only; the subsection end offset is recomputed later.
        first
            .checked_add(count)
            .ok_or(ParseError::ArithmeticOverflow {
                context: "xref stream Index",
            })?;
        indexes.push((first, count));
    }
    Ok(indexes)
}
2283
2284fn integer_value(
2285    value: Option<&CosObject>,
2286    context: &'static str,
2287) -> std::result::Result<i64, ParseError> {
2288    match value {
2289        Some(CosObject::Integer(value)) => Ok(*value),
2290        _ => Err(ParseError::Malformed {
2291            message: BoundedText::unchecked(format!("{context} must be integer")),
2292        }),
2293    }
2294}
2295
2296fn read_be_uint(
2297    bytes: &[u8],
2298    pos: &mut usize,
2299    width: usize,
2300) -> std::result::Result<u64, ParseError> {
2301    let end = pos
2302        .checked_add(width)
2303        .ok_or(ParseError::ArithmeticOverflow {
2304            context: "xref stream field",
2305        })?;
2306    let field = bytes.get(*pos..end).ok_or(ParseError::Malformed {
2307        message: bounded("xref stream field out of bounds"),
2308    })?;
2309    let mut value = 0_u64;
2310    for byte in field {
2311        value = value
2312            .checked_mul(256)
2313            .and_then(|current| current.checked_add(u64::from(*byte)))
2314            .ok_or(ParseError::ArithmeticOverflow {
2315                context: "xref stream field",
2316            })?;
2317    }
2318    *pos = end;
2319    Ok(value)
2320}
2321
2322fn object_ref_from_dictionary(dictionary: &Dictionary, key: &str) -> Option<ObjectKey> {
2323    match dictionary.get(key) {
2324        Some(CosObject::Reference(value)) => Some(*value),
2325        _ => None,
2326    }
2327}
2328
2329fn stream_filters(dictionary: &Dictionary) -> Vec<PdfName> {
2330    match dictionary.get("Filter") {
2331        Some(CosObject::Name(_)) if is_identity_crypt_filter(dictionary, 0) => Vec::new(),
2332        Some(CosObject::Name(name)) => vec![name.clone()],
2333        Some(CosObject::Array(values)) => values
2334            .iter()
2335            .enumerate()
2336            .filter_map(|(index, value)| match value {
2337                CosObject::Name(name) if is_identity_crypt_filter(dictionary, index) => None,
2338                CosObject::Name(name) => Some(name.clone()),
2339                _ => None,
2340            })
2341            .collect(),
2342        _ => Vec::new(),
2343    }
2344}
2345
/// Detects a `/Crypt` filter whose `/DecodeParms` names the `Identity`
/// crypt filter — a no-op transformation that callers drop from the chain.
///
/// `filter_index` selects the entry when `/Filter` (and the matching
/// `/DecodeParms`) is an array; for a single name it is 0.
fn is_identity_crypt_filter(dictionary: &Dictionary, filter_index: usize) -> bool {
    let Some(filter) = dictionary.get("Filter") else {
        return false;
    };
    // First require the selected filter entry to be the Crypt filter.
    let filter_is_crypt = match filter {
        CosObject::Name(name) => name.matches("Crypt"),
        CosObject::Array(filters) => matches!(
            filters.get(filter_index),
            Some(CosObject::Name(name)) if name.matches("Crypt")
        ),
        _ => false,
    };
    if !filter_is_crypt {
        return false;
    }
    // Then require the parallel DecodeParms entry to name Identity.
    match dictionary.get("DecodeParms") {
        Some(CosObject::Dictionary(params)) => matches!(
            params.get("Name"),
            Some(CosObject::Name(name)) if name.matches("Identity")
        ),
        Some(CosObject::Array(params)) => matches!(
            params.get(filter_index),
            Some(CosObject::Dictionary(params)) if matches!(
                params.get("Name"),
                Some(CosObject::Name(name)) if name.matches("Identity")
            )
        ),
        _ => false,
    }
}
2376
2377fn stream_decode_params(dictionary: &Dictionary, filter_count: usize) -> Vec<DecodeParams> {
2378    match dictionary.get("DecodeParms") {
2379        Some(CosObject::Dictionary(params)) => vec![decode_params_from_dictionary(params)],
2380        Some(CosObject::Array(values)) => values
2381            .iter()
2382            .take(filter_count)
2383            .map(|value| match value {
2384                CosObject::Dictionary(params) => decode_params_from_dictionary(params),
2385                _ => DecodeParams::default(),
2386            })
2387            .collect(),
2388        _ => vec![DecodeParams::default(); filter_count],
2389    }
2390}
2391
2392fn decode_params_from_dictionary(dictionary: &Dictionary) -> DecodeParams {
2393    DecodeParams {
2394        predictor: integer_from_dictionary(dictionary, "Predictor")
2395            .and_then(|value| u16::try_from(value).ok())
2396            .unwrap_or(1),
2397        colors: integer_from_dictionary(dictionary, "Colors")
2398            .and_then(|value| u16::try_from(value).ok())
2399            .unwrap_or(1),
2400        bits_per_component: integer_from_dictionary(dictionary, "BitsPerComponent")
2401            .and_then(|value| u16::try_from(value).ok())
2402            .unwrap_or(8),
2403        columns: integer_from_dictionary(dictionary, "Columns")
2404            .and_then(|value| u32::try_from(value).ok())
2405            .unwrap_or(1),
2406        early_change: integer_from_dictionary(dictionary, "EarlyChange")
2407            .and_then(|value| u8::try_from(value).ok())
2408            .unwrap_or(1),
2409        crypt_filter_name: match dictionary.get("Name") {
2410            Some(CosObject::Name(name)) => Some(name.clone()),
2411            _ => None,
2412        },
2413    }
2414}
2415
/// Converts a `usize` length to `u64`, mapping failure to an overflow
/// error tagged with `context`.
fn checked_u64_len(len: usize, context: &'static str) -> std::result::Result<u64, ParseError> {
    u64::try_from(len).map_err(|_| ParseError::ArithmeticOverflow { context })
}
2419
2420fn enforce_decoded_len(len: u64, max_decode_bytes: u64) -> std::result::Result<(), ParseError> {
2421    if len > max_decode_bytes {
2422        return Err(ParseError::LimitExceeded {
2423            limit: "max_stream_decode_bytes",
2424        });
2425    }
2426    Ok(())
2427}
2428
2429fn filter_identifier(filter: &PdfName) -> std::result::Result<Identifier, ParseError> {
2430    Identifier::new(String::from_utf8_lossy(filter.as_bytes()).into_owned()).map_err(|_| {
2431        ParseError::Malformed {
2432            message: bounded("stream filter name is not a valid identifier"),
2433        }
2434    })
2435}
2436
/// `FlateDecode` filter.
#[derive(Debug)]
struct FlateDecoder;

impl StreamDecoder for FlateDecoder {
    /// Inflates `input` under the decode-byte cap, then applies the
    /// predictor configured in `params`.
    fn decode(
        &self,
        input: &[u8],
        params: &DecodeParams,
        limits: &ResourceLimits,
    ) -> std::result::Result<DecoderOutput, ParseError> {
        let decoded = decode_flate_limited(input, limits.max_stream_decode_bytes)?;
        Ok(DecoderOutput {
            bytes: apply_predictor(decoded, params, limits.max_stream_decode_bytes)?,
            metadata_mode: false,
        })
    }
}
2454
/// `ASCIIHexDecode` filter.
#[derive(Debug)]
struct AsciiHexDecoder;

impl StreamDecoder for AsciiHexDecoder {
    /// Decodes hex digit pairs, skipping whitespace and stopping at the
    /// `>` terminator.
    ///
    /// An odd trailing digit is treated as the high nibble of a final byte
    /// (as if followed by `0`); output growth is capped by
    /// `limits.max_stream_decode_bytes`.
    fn decode(
        &self,
        input: &[u8],
        _params: &DecodeParams,
        limits: &ResourceLimits,
    ) -> std::result::Result<DecoderOutput, ParseError> {
        let mut output = Vec::new();
        // Pending high nibble awaiting its low partner.
        let mut high: Option<u8> = None;
        for byte in input {
            if is_ws(*byte) {
                continue;
            }
            if *byte == b'>' {
                break;
            }
            let Some(nibble) = decode_hex_digit(*byte) else {
                return Err(ParseError::StreamDecode {
                    message: bounded("invalid ASCIIHex digit"),
                });
            };
            if let Some(previous) = high.take() {
                push_limited_byte(
                    &mut output,
                    previous.saturating_mul(16).saturating_add(nibble),
                    limits.max_stream_decode_bytes,
                )?;
            } else {
                high = Some(nibble);
            }
        }
        // Odd digit count: emit the dangling nibble as the high half.
        if let Some(previous) = high {
            push_limited_byte(
                &mut output,
                previous.saturating_mul(16),
                limits.max_stream_decode_bytes,
            )?;
        }
        Ok(DecoderOutput {
            bytes: output,
            metadata_mode: false,
        })
    }
}
2502
/// `ASCII85Decode` filter.
#[derive(Debug)]
struct Ascii85Decoder;

impl StreamDecoder for Ascii85Decoder {
    /// Decodes base-85 groups of five digits into four bytes.
    ///
    /// Handles the `z` shorthand (four zero bytes, only valid between
    /// groups), skips whitespace, and stops at the `~>` terminator. A
    /// trailing partial group of n digits is padded with `u` (84) and
    /// yields n-1 bytes. Output growth is capped by
    /// `limits.max_stream_decode_bytes`.
    fn decode(
        &self,
        input: &[u8],
        _params: &DecodeParams,
        limits: &ResourceLimits,
    ) -> std::result::Result<DecoderOutput, ParseError> {
        let mut output = Vec::new();
        let mut group = Vec::with_capacity(5);
        let mut iter = input.iter().copied().peekable();
        while let Some(byte) = iter.next() {
            if is_ws(byte) {
                continue;
            }
            if byte == b'~' && iter.peek() == Some(&b'>') {
                break;
            }
            if byte == b'z' {
                // `z` abbreviates an all-zero group and may not appear
                // inside a partially accumulated one.
                if !group.is_empty() {
                    return Err(ParseError::StreamDecode {
                        message: bounded("ASCII85 z inside a partial group"),
                    });
                }
                extend_limited(&mut output, &[0, 0, 0, 0], limits.max_stream_decode_bytes)?;
                continue;
            }
            if !(b'!'..=b'u').contains(&byte) {
                return Err(ParseError::StreamDecode {
                    message: bounded("invalid ASCII85 digit"),
                });
            }
            group.push(byte.saturating_sub(b'!'));
            if group.len() == 5 {
                append_ascii85_group(&mut output, &group, 4, limits.max_stream_decode_bytes)?;
                group.clear();
            }
        }
        // Final partial group: n digits decode to n-1 bytes after padding
        // with the maximum digit value (84, i.e. `u`).
        if !group.is_empty() {
            let output_bytes = group.len().saturating_sub(1);
            while group.len() < 5 {
                group.push(84);
            }
            append_ascii85_group(
                &mut output,
                &group,
                output_bytes,
                limits.max_stream_decode_bytes,
            )?;
        }
        Ok(DecoderOutput {
            bytes: output,
            metadata_mode: false,
        })
    }
}
2561
2562#[derive(Debug)]
2563struct RunLengthDecoder;
2564
2565impl StreamDecoder for RunLengthDecoder {
2566    fn decode(
2567        &self,
2568        input: &[u8],
2569        _params: &DecodeParams,
2570        limits: &ResourceLimits,
2571    ) -> std::result::Result<DecoderOutput, ParseError> {
2572        let mut output = Vec::new();
2573        let mut pos = 0_usize;
2574        while let Some(length) = input.get(pos).copied() {
2575            pos = pos.saturating_add(1);
2576            match length {
2577                128 => break,
2578                0..=127 => {
2579                    let count = usize::from(length).saturating_add(1);
2580                    let end = pos
2581                        .checked_add(count)
2582                        .ok_or(ParseError::ArithmeticOverflow {
2583                            context: "RunLength literal",
2584                        })?;
2585                    let literal = input.get(pos..end).ok_or(ParseError::StreamDecode {
2586                        message: bounded("RunLength literal exceeds input"),
2587                    })?;
2588                    extend_limited(&mut output, literal, limits.max_stream_decode_bytes)?;
2589                    pos = end;
2590                }
2591                _ => {
2592                    let Some(value) = input.get(pos).copied() else {
2593                        return Err(ParseError::StreamDecode {
2594                            message: bounded("RunLength repeat missing byte"),
2595                        });
2596                    };
2597                    pos = pos.saturating_add(1);
2598                    let count = 257_usize.saturating_sub(usize::from(length));
2599                    for _ in 0..count {
2600                        push_limited_byte(&mut output, value, limits.max_stream_decode_bytes)?;
2601                    }
2602                }
2603            }
2604        }
2605        Ok(DecoderOutput {
2606            bytes: output,
2607            metadata_mode: false,
2608        })
2609    }
2610}
2611
2612#[derive(Debug)]
2613struct LzwDecoder;
2614
2615impl StreamDecoder for LzwDecoder {
2616    fn decode(
2617        &self,
2618        input: &[u8],
2619        params: &DecodeParams,
2620        limits: &ResourceLimits,
2621    ) -> std::result::Result<DecoderOutput, ParseError> {
2622        let decoded = decode_lzw(input, params.early_change, limits.max_stream_decode_bytes)?;
2623        Ok(DecoderOutput {
2624            bytes: apply_predictor(decoded, params, limits.max_stream_decode_bytes)?,
2625            metadata_mode: false,
2626        })
2627    }
2628}
2629
2630#[derive(Debug)]
2631struct CryptDecoder;
2632
2633impl StreamDecoder for CryptDecoder {
2634    fn decode(
2635        &self,
2636        input: &[u8],
2637        params: &DecodeParams,
2638        _limits: &ResourceLimits,
2639    ) -> std::result::Result<DecoderOutput, ParseError> {
2640        if params
2641            .crypt_filter_name
2642            .as_ref()
2643            .is_none_or(|name| name.matches("Identity"))
2644        {
2645            return Ok(DecoderOutput {
2646                bytes: input.to_vec(),
2647                metadata_mode: false,
2648            });
2649        }
2650        Err(ParseError::UnsupportedFilter {
2651            filter: BoundedText::unchecked("Crypt"),
2652        })
2653    }
2654}
2655
2656#[derive(Debug)]
2657struct MetadataModeDecoder;
2658
2659impl StreamDecoder for MetadataModeDecoder {
2660    fn decode(
2661        &self,
2662        input: &[u8],
2663        _params: &DecodeParams,
2664        _limits: &ResourceLimits,
2665    ) -> std::result::Result<DecoderOutput, ParseError> {
2666        Ok(DecoderOutput {
2667            bytes: input.to_vec(),
2668            metadata_mode: true,
2669        })
2670    }
2671}
2672
2673fn append_ascii85_group(
2674    output: &mut Vec<u8>,
2675    group: &[u8],
2676    output_bytes: usize,
2677    max_decode_bytes: u64,
2678) -> std::result::Result<(), ParseError> {
2679    let mut value = 0_u32;
2680    for digit in group {
2681        value = value
2682            .checked_mul(85)
2683            .and_then(|current| current.checked_add(u32::from(*digit)))
2684            .ok_or(ParseError::StreamDecode {
2685                message: bounded("ASCII85 group overflows"),
2686            })?;
2687    }
2688    let bytes = value.to_be_bytes();
2689    let Some(slice) = bytes.get(..output_bytes) else {
2690        return Err(ParseError::StreamDecode {
2691            message: bounded("invalid ASCII85 group length"),
2692        });
2693    };
2694    extend_limited(output, slice, max_decode_bytes)
2695}
2696
2697fn push_limited_byte(
2698    output: &mut Vec<u8>,
2699    byte: u8,
2700    max_decode_bytes: u64,
2701) -> std::result::Result<(), ParseError> {
2702    let next_len = checked_u64_len(output.len(), "decoded stream length")?
2703        .checked_add(1)
2704        .ok_or(ParseError::ArithmeticOverflow {
2705            context: "decoded stream length",
2706        })?;
2707    enforce_decoded_len(next_len, max_decode_bytes)?;
2708    output.push(byte);
2709    Ok(())
2710}
2711
2712fn extend_limited(
2713    output: &mut Vec<u8>,
2714    bytes: &[u8],
2715    max_decode_bytes: u64,
2716) -> std::result::Result<(), ParseError> {
2717    let next_len = checked_u64_len(output.len(), "decoded stream length")?
2718        .checked_add(checked_u64_len(bytes.len(), "decoded stream length")?)
2719        .ok_or(ParseError::ArithmeticOverflow {
2720            context: "decoded stream length",
2721        })?;
2722    enforce_decoded_len(next_len, max_decode_bytes)?;
2723    output.extend_from_slice(bytes);
2724    Ok(())
2725}
2726
2727fn apply_predictor(
2728    bytes: Vec<u8>,
2729    params: &DecodeParams,
2730    max_decode_bytes: u64,
2731) -> std::result::Result<Vec<u8>, ParseError> {
2732    match params.predictor {
2733        1 => Ok(bytes),
2734        2 => apply_tiff_predictor(bytes, params, max_decode_bytes),
2735        10..=15 => apply_png_predictor(&bytes, params, max_decode_bytes),
2736        _ => Err(ParseError::StreamDecode {
2737            message: bounded("unsupported predictor"),
2738        }),
2739    }
2740}
2741
2742fn predictor_geometry(params: &DecodeParams) -> std::result::Result<(usize, usize), ParseError> {
2743    if params.colors == 0 || params.bits_per_component == 0 || params.columns == 0 {
2744        return Err(ParseError::StreamDecode {
2745            message: bounded("invalid predictor geometry"),
2746        });
2747    }
2748    let bits_per_row = u64::from(params.colors)
2749        .checked_mul(u64::from(params.bits_per_component))
2750        .and_then(|bits| bits.checked_mul(u64::from(params.columns)))
2751        .ok_or(ParseError::ArithmeticOverflow {
2752            context: "predictor row size",
2753        })?;
2754    let row_bytes = bits_per_row
2755        .checked_add(7)
2756        .ok_or(ParseError::ArithmeticOverflow {
2757            context: "predictor row size",
2758        })?
2759        / 8;
2760    let bytes_per_pixel_bits = u64::from(params.colors)
2761        .checked_mul(u64::from(params.bits_per_component))
2762        .ok_or(ParseError::ArithmeticOverflow {
2763            context: "predictor pixel size",
2764        })?;
2765    let bytes_per_pixel =
2766        bytes_per_pixel_bits
2767            .checked_add(7)
2768            .ok_or(ParseError::ArithmeticOverflow {
2769                context: "predictor pixel size",
2770            })?
2771            / 8;
2772    Ok((
2773        usize::try_from(row_bytes).map_err(|_| ParseError::LimitExceeded {
2774            limit: "max_stream_decode_bytes",
2775        })?,
2776        usize::try_from(bytes_per_pixel.max(1)).map_err(|_| ParseError::LimitExceeded {
2777            limit: "max_stream_decode_bytes",
2778        })?,
2779    ))
2780}
2781
2782fn apply_tiff_predictor(
2783    mut bytes: Vec<u8>,
2784    params: &DecodeParams,
2785    max_decode_bytes: u64,
2786) -> std::result::Result<Vec<u8>, ParseError> {
2787    enforce_decoded_len(
2788        checked_u64_len(bytes.len(), "predictor output length")?,
2789        max_decode_bytes,
2790    )?;
2791    let (row_bytes, bytes_per_pixel) = predictor_geometry(params)?;
2792    if row_bytes == 0 || !bytes.len().is_multiple_of(row_bytes) {
2793        return Err(ParseError::StreamDecode {
2794            message: bounded("TIFF predictor row length mismatch"),
2795        });
2796    }
2797    for row in bytes.chunks_mut(row_bytes) {
2798        for index in bytes_per_pixel..row.len() {
2799            let left = row
2800                .get(index.saturating_sub(bytes_per_pixel))
2801                .copied()
2802                .ok_or(ParseError::StreamDecode {
2803                    message: bounded("TIFF predictor left byte missing"),
2804                })?;
2805            let Some(byte) = row.get_mut(index) else {
2806                return Err(ParseError::StreamDecode {
2807                    message: bounded("TIFF predictor byte missing"),
2808                });
2809            };
2810            *byte = byte.wrapping_add(left);
2811        }
2812    }
2813    Ok(bytes)
2814}
2815
/// Reverses PNG predictor filtering (predictor codes 10..=15).
///
/// Encoded data is a sequence of rows, each prefixed by one PNG filter-type
/// byte followed by `row_bytes` of filtered data. Rows are reconstructed in
/// order because filters 2..=4 reference the previous reconstructed row.
fn apply_png_predictor(
    bytes: &[u8],
    params: &DecodeParams,
    max_decode_bytes: u64,
) -> std::result::Result<Vec<u8>, ParseError> {
    let (row_bytes, bytes_per_pixel) = predictor_geometry(params)?;
    // Each encoded row carries one extra filter-type byte.
    let encoded_row = row_bytes
        .checked_add(1)
        .ok_or(ParseError::ArithmeticOverflow {
            context: "PNG predictor row size",
        })?;
    if encoded_row == 0 || !bytes.len().is_multiple_of(encoded_row) {
        return Err(ParseError::StreamDecode {
            message: bounded("PNG predictor row length mismatch"),
        });
    }
    let row_count = bytes.len() / encoded_row;
    let output_capacity =
        row_count
            .checked_mul(row_bytes)
            .ok_or(ParseError::ArithmeticOverflow {
                context: "PNG predictor output length",
            })?;
    // Enforce the decode budget before allocating the output buffer.
    enforce_decoded_len(
        checked_u64_len(output_capacity, "PNG predictor output length")?,
        max_decode_bytes,
    )?;
    let mut output = vec![0_u8; output_capacity];
    for row_index in 0..row_count {
        let encoded_start =
            row_index
                .checked_mul(encoded_row)
                .ok_or(ParseError::ArithmeticOverflow {
                    context: "PNG predictor row offset",
                })?;
        // The first byte of each encoded row selects the filter (0..=4).
        let filter = *bytes.get(encoded_start).ok_or(ParseError::StreamDecode {
            message: bounded("PNG predictor filter byte missing"),
        })?;
        let encoded = bytes
            .get(encoded_start.saturating_add(1)..encoded_start.saturating_add(encoded_row))
            .ok_or(ParseError::StreamDecode {
                message: bounded("PNG predictor row missing"),
            })?;
        let output_start =
            row_index
                .checked_mul(row_bytes)
                .ok_or(ParseError::ArithmeticOverflow {
                    context: "PNG predictor output row offset",
                })?;
        for index in 0..row_bytes {
            let raw = *encoded.get(index).ok_or(ParseError::StreamDecode {
                message: bounded("PNG predictor source byte missing"),
            })?;
            // Reconstructed byte one pixel to the left (0 at the row start).
            let left = if index >= bytes_per_pixel {
                output
                    .get(output_start + index - bytes_per_pixel)
                    .copied()
                    .ok_or(ParseError::StreamDecode {
                        message: bounded("PNG predictor left byte missing"),
                    })?
            } else {
                0
            };
            // Same byte position in the previous reconstructed row (0 for
            // the first row).
            let up = if row_index > 0 {
                output
                    .get(output_start + index - row_bytes)
                    .copied()
                    .ok_or(ParseError::StreamDecode {
                        message: bounded("PNG predictor upper byte missing"),
                    })?
            } else {
                0
            };
            // Previous row, one pixel to the left (used by the Paeth filter).
            let up_left = if row_index > 0 && index >= bytes_per_pixel {
                output
                    .get(output_start + index - row_bytes - bytes_per_pixel)
                    .copied()
                    .ok_or(ParseError::StreamDecode {
                        message: bounded("PNG predictor upper-left byte missing"),
                    })?
            } else {
                0
            };
            let value = png_predictor_value(filter, raw, left, up, up_left)?;
            let Some(target) = output.get_mut(output_start + index) else {
                return Err(ParseError::StreamDecode {
                    message: bounded("PNG predictor target byte missing"),
                });
            };
            *target = value;
        }
    }
    Ok(output)
}
2910
2911fn png_predictor_value(
2912    filter: u8,
2913    raw: u8,
2914    left: u8,
2915    up: u8,
2916    up_left: u8,
2917) -> std::result::Result<u8, ParseError> {
2918    match filter {
2919        0 => Ok(raw),
2920        1 => Ok(raw.wrapping_add(left)),
2921        2 => Ok(raw.wrapping_add(up)),
2922        3 => {
2923            let average =
2924                u8::try_from(u16::midpoint(u16::from(left), u16::from(up))).map_err(|_| {
2925                    ParseError::StreamDecode {
2926                        message: bounded("PNG predictor average byte out of range"),
2927                    }
2928                })?;
2929            Ok(raw.wrapping_add(average))
2930        }
2931        4 => Ok(raw.wrapping_add(paeth_predictor(left, up, up_left))),
2932        _ => Err(ParseError::StreamDecode {
2933            message: bounded("invalid PNG predictor filter"),
2934        }),
2935    }
2936}
2937
/// PNG Paeth filter: picks whichever neighbour (left, up, up-left) is
/// closest to the linear estimate `left + up - up_left`, breaking ties in
/// that order.
fn paeth_predictor(left: u8, up: u8, up_left: u8) -> u8 {
    let (a, b, c) = (i16::from(left), i16::from(up), i16::from(up_left));
    let estimate = a + b - c;
    let distance_left = (estimate - a).abs();
    let distance_up = (estimate - b).abs();
    let distance_up_left = (estimate - c).abs();
    if distance_left <= distance_up && distance_left <= distance_up_left {
        left
    } else if distance_up <= distance_up_left {
        up
    } else {
        up_left
    }
}
2954
/// Decodes LZW data (the core of the PDF `LZWDecode` filter, without any
/// predictor handling).
///
/// Codes are read MSB-first starting at 9 bits. Code 256 clears the table,
/// code 257 is end-of-data. `early_change` (clamped to 0 or 1, matching
/// `/EarlyChange`) shifts the point at which the code width grows by one.
fn decode_lzw(
    input: &[u8],
    early_change: u8,
    max_decode_bytes: u64,
) -> std::result::Result<Vec<u8>, ParseError> {
    let mut reader = MsbBitReader::new(input);
    let mut dictionary = initial_lzw_dictionary();
    let mut code_bits = 9_u8;
    let mut next_code = 258_u16;
    let mut previous: Option<Vec<u8>> = None;
    let mut output = Vec::new();
    while let Some(code) = reader.read_bits(code_bits)? {
        match code {
            256 => {
                // Clear-table marker: reset to the initial 9-bit state.
                dictionary = initial_lzw_dictionary();
                code_bits = 9;
                next_code = 258;
                previous = None;
            }
            257 => break,
            _ => {
                let entry = if let Some(value) = dictionary.get(usize::from(code)).cloned() {
                    value
                } else if code == next_code {
                    // KwKwK case: the code about to be defined; its expansion
                    // is the previous entry plus that entry's first byte.
                    let mut value = previous.clone().ok_or(ParseError::StreamDecode {
                        message: bounded("LZW missing previous entry"),
                    })?;
                    let first = *value.first().ok_or(ParseError::StreamDecode {
                        message: bounded("LZW empty previous entry"),
                    })?;
                    value.push(first);
                    value
                } else {
                    return Err(ParseError::StreamDecode {
                        message: bounded("invalid LZW code"),
                    });
                };
                extend_limited(&mut output, &entry, max_decode_bytes)?;
                if let Some(previous_entry) = previous {
                    // Define the next code as previous entry + first byte of
                    // the current entry; the table is capped at 4096 codes.
                    let mut new_entry = previous_entry;
                    let first = *entry.first().ok_or(ParseError::StreamDecode {
                        message: bounded("LZW empty entry"),
                    })?;
                    new_entry.push(first);
                    if dictionary.len() < 4096 {
                        dictionary.push(new_entry);
                        next_code = next_code.saturating_add(1);
                        // With early change the width grows one code sooner.
                        let threshold =
                            (1_u16 << code_bits).saturating_sub(u16::from(early_change.min(1)));
                        if next_code >= threshold && code_bits < 12 {
                            code_bits = code_bits.saturating_add(1);
                        }
                    }
                }
                previous = Some(entry);
            }
        }
    }
    Ok(output)
}
3015
/// Builds the 258-entry initial LZW table: codes 0..=255 map to their single
/// byte; 256 (clear) and 257 (end-of-data) are empty placeholders.
fn initial_lzw_dictionary() -> Vec<Vec<u8>> {
    let mut dictionary: Vec<Vec<u8>> = Vec::with_capacity(258);
    dictionary.extend((0_u8..=255).map(|byte| vec![byte]));
    dictionary.extend([Vec::new(), Vec::new()]);
    dictionary
}
3025
3026#[derive(Clone, Copy, Debug)]
3027struct MsbBitReader<'a> {
3028    input: &'a [u8],
3029    bit_pos: usize,
3030}
3031
3032impl<'a> MsbBitReader<'a> {
3033    fn new(input: &'a [u8]) -> Self {
3034        Self { input, bit_pos: 0 }
3035    }
3036
3037    fn read_bits(&mut self, bits: u8) -> std::result::Result<Option<u16>, ParseError> {
3038        let remaining_bits = self
3039            .input
3040            .len()
3041            .checked_mul(8)
3042            .and_then(|total| total.checked_sub(self.bit_pos))
3043            .ok_or(ParseError::ArithmeticOverflow {
3044                context: "LZW bit position",
3045            })?;
3046        if remaining_bits < usize::from(bits) {
3047            return Ok(None);
3048        }
3049        let mut value = 0_u16;
3050        for _ in 0..bits {
3051            let byte_index = self.bit_pos / 8;
3052            let bit_index = 7_usize.saturating_sub(self.bit_pos % 8);
3053            let byte = self
3054                .input
3055                .get(byte_index)
3056                .copied()
3057                .ok_or(ParseError::StreamDecode {
3058                    message: bounded("LZW bit read out of bounds"),
3059                })?;
3060            value = value.checked_shl(1).ok_or(ParseError::ArithmeticOverflow {
3061                context: "LZW code",
3062            })? | u16::from((byte >> bit_index) & 1);
3063            self.bit_pos = self.bit_pos.saturating_add(1);
3064        }
3065        Ok(Some(value))
3066    }
3067}
3068
3069fn encryption_reference(trailers: &[Trailer]) -> Option<&CosObject> {
3070    trailers
3071        .iter()
3072        .rev()
3073        .find_map(|trailer| trailer.dictionary.get("Encrypt"))
3074}
3075
3076fn encryption_fact(objects: &ObjectStore, trailers: &[Trailer]) -> ParseFact {
3077    #[cfg(feature = "decrypt")]
3078    {
3079        encryption::encryption_summary(objects, trailers).into_fact(false)
3080    }
3081    #[cfg(not(feature = "decrypt"))]
3082    {
3083        let handler = trailers.iter().rev().find_map(|trailer| {
3084            let encrypt = trailer.dictionary.get("Encrypt")?;
3085            match encrypt {
3086                CosObject::Dictionary(dictionary) => encryption_handler(dictionary),
3087                CosObject::Reference(key) => objects
3088                    .get(key)
3089                    .and_then(|object| object.object.as_dictionary())
3090                    .and_then(encryption_handler),
3091                _ => None,
3092            }
3093        });
3094        ParseFact::Encryption {
3095            encrypted: true,
3096            handler,
3097            version: None,
3098            revision: None,
3099            algorithm: None,
3100            decrypted: false,
3101        }
3102    }
3103}
3104
3105#[cfg(not(feature = "decrypt"))]
3106fn encryption_handler(dictionary: &Dictionary) -> Option<Identifier> {
3107    let Some(CosObject::Name(filter)) = dictionary.get("Filter") else {
3108        return None;
3109    };
3110    Identifier::new(String::from_utf8_lossy(filter.as_bytes()).into_owned()).ok()
3111}
3112
/// Inflates `bytes` under the decode budget, trying zlib framing first.
#[cfg(feature = "flate")]
fn decode_flate_limited(
    bytes: &[u8],
    max_decode_bytes: u64,
) -> std::result::Result<Vec<u8>, ParseError> {
    use flate2::read::{DeflateDecoder, ZlibDecoder};

    let zlib_attempt = read_limited(
        ZlibDecoder::new(std::io::Cursor::new(bytes)),
        max_decode_bytes,
    );
    match zlib_attempt {
        Ok(decoded) => Ok(decoded),
        // Fall back to raw deflate framing when zlib decoding fails.
        Err(_) => read_limited(
            DeflateDecoder::new(std::io::Cursor::new(bytes)),
            max_decode_bytes,
        ),
    }
}
3131
3132#[cfg(not(feature = "flate"))]
3133fn decode_flate_limited(
3134    _bytes: &[u8],
3135    _max_decode_bytes: u64,
3136) -> std::result::Result<Vec<u8>, ParseError> {
3137    Err(ParseError::UnsupportedFilter {
3138        filter: BoundedText::unchecked("FlateDecode"),
3139    })
3140}
3141
/// Drains `reader` into a buffer, failing once the output would exceed the
/// decode budget.
#[cfg(feature = "flate")]
fn read_limited(
    mut reader: impl Read,
    max_decode_bytes: u64,
) -> std::result::Result<Vec<u8>, ParseError> {
    let mut decoded = Vec::new();
    let mut chunk = [0_u8; 8192];
    loop {
        let count = match reader.read(&mut chunk) {
            Ok(count) => count,
            Err(source) => {
                return Err(ParseError::StreamDecode {
                    message: BoundedText::unchecked(source.to_string()),
                });
            }
        };
        if count == 0 {
            break;
        }
        // Check the running total against the budget before copying.
        let next_len = u64::try_from(decoded.len())
            .ok()
            .zip(u64::try_from(count).ok())
            .and_then(|(len, more)| len.checked_add(more))
            .ok_or(ParseError::ArithmeticOverflow {
                context: "decoded stream length",
            })?;
        if next_len > max_decode_bytes {
            return Err(ParseError::LimitExceeded {
                limit: "max_stream_decode_bytes",
            });
        }
        let bytes = chunk.get(..count).ok_or(ParseError::Malformed {
            message: bounded("decode buffer range out of bounds"),
        })?;
        decoded.extend_from_slice(bytes);
    }
    Ok(decoded)
}
3179
3180#[cfg(test)]
3181mod tests {
3182    use std::{error::Error, io::Cursor, num::NonZeroU32};
3183
3184    use proptest::prelude::*;
3185    use rstest::rstest;
3186
3187    use super::{CosObject, ParsedDocument, Parser, StreamObject};
3188    use crate::{ParseFact, ResourceLimits, StreamFact};
3189
    /// Smallest well-formed fixture: one catalog object, a classic xref
    /// table, and a trailer pointing at the catalog.
    fn minimal_pdf() -> Vec<u8> {
        br"%PDF-1.7
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f 
0000000009 00000 n 
trailer
<< /Root 1 0 R /Size 2 >>
startxref
45
%%EOF
"
        .to_vec()
    }
3207
    /// Packs xref-stream entries as `/W [1 4 2]` rows: a one-byte type, a
    /// four-byte second field, and a two-byte third field.
    fn xref_stream_data(entries: &[(u8, u32, u16)]) -> Vec<u8> {
        let mut data = Vec::with_capacity(entries.len() * 7);
        for (entry_type, field_two, field_three) in entries {
            data.push(*entry_type);
            // Fields are big-endian, matching the xref-stream binary layout.
            data.extend(field_two.to_be_bytes());
            data.extend(field_three.to_be_bytes());
        }
        data
    }
3217
    #[test]
    fn test_should_parse_header_and_catalog_from_m0_fixture() -> crate::Result<()> {
        let document = Parser::default().parse(Cursor::new(minimal_pdf()))?;

        // Header version and the single catalog object should be recovered.
        assert_eq!(document.version.major, 1);
        assert_eq!(document.version.minor, 7);
        assert!(document.catalog.is_some());
        assert_eq!(document.objects.len(), 1);
        Ok(())
    }
3228
    #[test]
    fn test_should_record_leading_header_bytes() -> crate::Result<()> {
        // Prepend garbage so the %PDF- marker is found at offset 4.
        let mut bytes = b"junk".to_vec();
        bytes.extend(minimal_pdf());

        let document = Parser::default().parse(Cursor::new(bytes))?;

        assert!(document.parse_facts.iter().any(|fact| {
            matches!(
                fact,
                ParseFact::Header {
                    offset: 4,
                    had_leading_bytes: true,
                    ..
                }
            )
        }));
        Ok(())
    }
3248
    #[test]
    fn test_should_warn_on_malformed_recoverable_header() -> crate::Result<()> {
        // Non-numeric version digits should warn and fall back to 1.4.
        let bytes = br"%PDF-x.y
1 0 obj
<< /Type /Catalog >>
endobj
trailer
<< /Root 1 0 R >>
%%EOF
";

        let document = Parser::default().parse(Cursor::new(bytes))?;

        assert!(!document.warnings.is_empty());
        assert_eq!(document.version, crate::PdfVersion { major: 1, minor: 4 });
        Ok(())
    }
3266
    #[test]
    fn test_should_parse_names_strings_numbers_arrays_and_dictionaries() -> crate::Result<()> {
        // Covers name escapes (#20), a literal string, and a mixed array.
        let bytes = br"%PDF-1.7
1 0 obj
<< /Type /Catalog /Name /A#20B /Title (hello\nworld) /Nums [1 -2 3.5 true false null] >>
endobj
trailer
<< /Root 1 0 R >>
%%EOF
";

        let document = Parser::default().parse(Cursor::new(bytes))?;
        let object =
            document
                .objects
                .values()
                .next()
                .ok_or_else(|| crate::ParseError::MissingObject {
                    message: crate::BoundedText::unchecked("missing object"),
                })?;
        let dictionary =
            object
                .object
                .as_dictionary()
                .ok_or_else(|| crate::ParseError::Malformed {
                    message: crate::BoundedText::unchecked("missing dictionary"),
                })?;

        // All six heterogeneous array elements must be preserved.
        assert!(
            matches!(dictionary.get("Nums"), Some(CosObject::Array(values)) if values.len() == 6)
        );
        Ok(())
    }
3300
    #[test]
    fn test_should_scan_bad_stream_length_and_emit_facts() -> crate::Result<()> {
        // /Length claims 99 bytes but the stream body is only "abc".
        let bytes = br"%PDF-1.7
1 0 obj
<< /Length 99 >>
stream
abc
endstream
endobj
trailer
<< /Root 1 0 R >>
%%EOF
";

        let document = Parser::default().parse(Cursor::new(bytes))?;

        assert!(document.parse_facts.iter().any(|fact| {
            matches!(
                fact,
                ParseFact::Stream {
                    fact: StreamFact::Length {
                        declared: 99,
                        discovered: 3
                    },
                    ..
                }
            )
        }));
        Ok(())
    }
3331
    #[test]
    fn test_should_treat_identity_crypt_stream_filter_as_passthrough() -> crate::Result<()> {
        let bytes = br"%PDF-1.7
1 0 obj
<< /Type /Catalog >>
endobj
2 0 obj
<< /Length 3 /Filter /Crypt /DecodeParms << /Name /Identity >> >>
stream
abc
endstream
endobj
trailer
<< /Root 1 0 R >>
%%EOF
";

        let document = Parser::default().parse(Cursor::new(bytes))?;
        let object = document
            .objects
            .get(&crate::ObjectKey::new(
                NonZeroU32::new(2).unwrap_or(NonZeroU32::MIN),
                0,
            ))
            .ok_or_else(|| crate::ParseError::MissingObject {
                message: crate::BoundedText::unchecked("missing stream object"),
            })?;
        let CosObject::Stream(stream) = &object.object else {
            return Err(crate::ParseError::Malformed {
                message: crate::BoundedText::unchecked("missing stream"),
            }
            .into());
        };

        // /Crypt with the Identity name must leave the bytes untouched.
        assert_eq!(stream.decoded_bytes(&ResourceLimits::default())?, b"abc");
        Ok(())
    }
3369
    #[test]
    fn test_should_parse_xref_stream_as_trailer_source() -> crate::Result<()> {
        // Build a /Type /XRef stream whose dictionary doubles as the trailer.
        let xref_data = xref_stream_data(&[(0, 0, 65_535), (1, 9, 0), (1, 45, 0)]);
        let mut bytes = br"%PDF-1.7
1 0 obj
<< /Type /Catalog >>
endobj
2 0 obj
<< /Type /XRef /Size 3 /W [1 4 2] /Index [0 3] /Length "
            .to_vec();
        bytes.extend(xref_data.len().to_string().as_bytes());
        bytes.extend(
            br" /Root 1 0 R >>
stream
",
        );
        bytes.extend(xref_data);
        bytes.extend(
            br"
endstream
endobj
%%EOF
",
        );

        let document = Parser::default().parse(Cursor::new(bytes))?;

        assert!(document.catalog.is_some());
        assert!(document.parse_facts.iter().any(|fact| {
            matches!(
                fact,
                ParseFact::Xref {
                    fact: crate::XrefFact::XrefStreamParsed { .. },
                    ..
                }
            )
        }));
        Ok(())
    }
3409
    #[test]
    fn test_should_parse_flate_xref_stream_with_compressed_entry() -> Result<(), Box<dyn Error>> {
        use std::io::Write;

        use flate2::{Compression, write::ZlibEncoder};

        // Entry type 2 marks an object stored inside the object stream.
        let xref_data = xref_stream_data(&[(2, 2, 0), (1, 3, 0)]);
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(&xref_data)?;
        let compressed = encoder.finish()?;
        let mut bytes = br"%PDF-1.7
1 0 obj
<< /Type /Catalog >>
endobj
2 0 obj
<< /Type /ObjStm /N 0 /First 0 /Length 0 >>
stream
endstream
endobj
3 0 obj
<< /Type /XRef /Size 3 /W [1 4 2] /Index [1 2] /Filter /FlateDecode /Length "
            .to_vec();
        bytes.extend(compressed.len().to_string().as_bytes());
        bytes.extend(
            br" /Root 1 0 R >>
stream
",
        );
        bytes.extend(compressed);
        bytes.extend(
            br"
endstream
endobj
%%EOF
",
        );

        let document = Parser::default().parse(Cursor::new(bytes))?;

        assert!(document.parse_facts.iter().any(|fact| {
            matches!(
                fact,
                ParseFact::Xref {
                    fact: crate::XrefFact::XrefStreamParsed {
                        entries: 2,
                        compressed_entries: 1
                    },
                    ..
                }
            )
        }));
        Ok(())
    }
3463
    #[test]
    fn test_should_emit_xref_prev_and_hybrid_reference_facts() -> crate::Result<()> {
        // The second trailer carries /Prev and /XRefStm pointers.
        let bytes = br"%PDF-1.7
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f 
0000000009 00000 n 
trailer
<< /Size 2 /Root 1 0 R >>
xref
0 1
0000000000 65535 f 
trailer
<< /Size 2 /Root 1 0 R /Prev 40 /XRefStm 120 >>
%%EOF
";

        let document = Parser::default().parse(Cursor::new(bytes))?;

        assert!(document.parse_facts.iter().any(|fact| {
            matches!(
                fact,
                ParseFact::Xref {
                    fact: crate::XrefFact::PrevChain { offset: 40 },
                    ..
                }
            )
        }));
        assert!(document.parse_facts.iter().any(|fact| {
            matches!(
                fact,
                ParseFact::Xref {
                    fact: crate::XrefFact::HybridReference { offset: 120 },
                    ..
                }
            )
        }));
        Ok(())
    }
3506
3507    #[test]
3508    fn test_should_expand_unfiltered_object_stream() -> crate::Result<()> {
3509        let object_stream = b"1 0 << /Type /Catalog >>";
3510        let mut bytes = br"%PDF-1.7
35112 0 obj
3512<< /Type /ObjStm /N 1 /First 4 /Length "
3513            .to_vec();
3514        bytes.extend(object_stream.len().to_string().as_bytes());
3515        bytes.extend(
3516            br" >>
3517stream
3518",
3519        );
3520        bytes.extend(object_stream);
3521        bytes.extend(
3522            br"
3523endstream
3524endobj
35253 0 obj
3526<< /Type /XRef /Size 4 /W [1 1 1] /Index [0 0] /Length 0 /Root 1 0 R >>
3527stream
3528endstream
3529endobj
3530%%EOF
3531",
3532        );
3533
3534        let document = Parser::default().parse(Cursor::new(bytes))?;
3535
3536        assert!(document.catalog.is_some());
3537        assert_eq!(document.objects.len(), 3);
3538        assert!(document.parse_facts.iter().any(|fact| {
3539            matches!(
3540                fact,
3541                ParseFact::Xref {
3542                    fact: crate::XrefFact::ObjectStreamParsed,
3543                    ..
3544                }
3545            )
3546        }));
3547        Ok(())
3548    }
3549
3550    #[test]
3551    fn test_should_decode_flate_object_stream_with_limit() -> Result<(), Box<dyn Error>> {
3552        use std::io::Write;
3553
3554        use flate2::{Compression, write::ZlibEncoder};
3555
3556        let object_stream = b"1 0 << /Type /Catalog >>";
3557        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
3558        encoder.write_all(object_stream)?;
3559        let compressed = encoder.finish()?;
3560        let mut bytes = br"%PDF-1.7
35612 0 obj
3562<< /Type /ObjStm /N 1 /First 4 /Filter /FlateDecode /Length "
3563            .to_vec();
3564        bytes.extend(compressed.len().to_string().as_bytes());
3565        bytes.extend(
3566            br" >>
3567stream
3568",
3569        );
3570        bytes.extend(compressed);
3571        bytes.extend(
3572            br"
3573endstream
3574endobj
35753 0 obj
3576<< /Type /XRef /Size 4 /W [1 1 1] /Index [0 0] /Length 0 /Root 1 0 R >>
3577stream
3578endstream
3579endobj
3580%%EOF
3581",
3582        );
3583
3584        let document = Parser::default().parse(Cursor::new(bytes))?;
3585
3586        assert!(document.catalog.is_some());
3587        assert!(document.parse_facts.iter().any(|fact| {
3588            matches!(
3589                fact,
3590                ParseFact::Stream {
3591                    fact: StreamFact::Decoded { bytes: 24 },
3592                    ..
3593                }
3594            )
3595        }));
3596        Ok(())
3597    }
3598
3599    #[test]
3600    fn test_should_decode_asciihex_ascii85_runlength_and_lzw_streams() -> crate::Result<()> {
3601        let cases: [(&str, Vec<u8>, &[u8]); 4] = [
3602            ("ASCIIHexDecode", b"61 62>".to_vec(), b"ab"),
3603            ("ASCII85Decode", b"9jqo~>".to_vec(), b"Man"),
3604            (
3605                "RunLengthDecode",
3606                vec![2, b'a', b'b', b'c', 254, b'x', 128],
3607                b"abcxxx",
3608            ),
3609            (
3610                "LZWDecode",
3611                pack_lzw_codes(&[(256, 9), (97, 9), (98, 9), (97, 9), (257, 9)]),
3612                b"aba",
3613            ),
3614        ];
3615        for (filter, encoded, expected) in cases {
3616            let document =
3617                Parser::default().parse(Cursor::new(single_stream_pdf(filter, "", &encoded)))?;
3618            let stream = parsed_stream(&document)?;
3619
3620            assert_eq!(stream.decoded_bytes(&ResourceLimits::default())?, expected);
3621        }
3622        Ok(())
3623    }
3624
3625    #[test]
3626    fn test_should_apply_flate_png_predictor() -> Result<(), Box<dyn Error>> {
3627        use std::io::Write;
3628
3629        use flate2::{Compression, write::ZlibEncoder};
3630
3631        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
3632        encoder.write_all(&[1, b'a', 1, 1])?;
3633        let compressed = encoder.finish()?;
3634        let document = Parser::default().parse(Cursor::new(single_stream_pdf(
3635            "FlateDecode",
3636            "/DecodeParms << /Predictor 12 /Columns 3 >>",
3637            &compressed,
3638        )))?;
3639        let stream = parsed_stream(&document)?;
3640
3641        assert_eq!(stream.decoded_bytes(&ResourceLimits::default())?, b"abc");
3642        Ok(())
3643    }
3644
3645    #[test]
3646    fn test_should_emit_per_filter_decode_facts_for_object_stream() -> crate::Result<()> {
3647        let object_stream = b"1 0 << /Type /Catalog >>";
3648        let encoded = hex_bytes(object_stream);
3649        let mut bytes = br"%PDF-1.7
36502 0 obj
3651<< /Type /ObjStm /N 1 /First 4 /Filter /ASCIIHexDecode /Length "
3652            .to_vec();
3653        bytes.extend(encoded.len().to_string().as_bytes());
3654        bytes.extend(
3655            br" >>
3656stream
3657",
3658        );
3659        bytes.extend(encoded);
3660        bytes.extend(
3661            br"
3662endstream
3663endobj
36643 0 obj
3665<< /Type /XRef /Size 4 /W [1 1 1] /Index [0 0] /Length 0 /Root 1 0 R >>
3666stream
3667endstream
3668endobj
3669%%EOF
3670",
3671        );
3672
3673        let document = Parser::default().parse(Cursor::new(bytes))?;
3674
3675        assert!(document.catalog.is_some());
3676        assert!(document.parse_facts.iter().any(|fact| {
3677            matches!(
3678                fact,
3679                ParseFact::Stream {
3680                    fact: StreamFact::FilterDecoded {
3681                        filter,
3682                        output_bytes: 24,
3683                        ..
3684                    },
3685                    ..
3686                } if filter.as_str() == "ASCIIHexDecode"
3687            )
3688        }));
3689        Ok(())
3690    }
3691
3692    #[test]
3693    fn test_should_preserve_image_filter_bytes_in_metadata_mode() -> crate::Result<()> {
3694        let document =
3695            Parser::default().parse(Cursor::new(single_stream_pdf("DCTDecode", "", b"image")))?;
3696        let stream = parsed_stream(&document)?;
3697
3698        assert_eq!(stream.decoded_bytes(&ResourceLimits::default())?, b"image");
3699        Ok(())
3700    }
3701
3702    #[test]
3703    fn test_should_parse_with_spill_file_source_storage_above_threshold() -> crate::Result<()> {
3704        let limits = ResourceLimits {
3705            memory_source_threshold_bytes: 0,
3706            ..ResourceLimits::default()
3707        };
3708        let document = Parser::new(limits.clone()).parse(Cursor::new(single_stream_pdf(
3709            "ASCIIHexDecode",
3710            "",
3711            b"61 62>",
3712        )))?;
3713        let stream = parsed_stream(&document)?;
3714
3715        assert_eq!(stream.decoded_bytes(&limits)?, b"ab");
3716        Ok(())
3717    }
3718
3719    #[test]
3720    fn test_should_enforce_name_limit() {
3721        let limits = ResourceLimits {
3722            max_name_bytes: 2,
3723            ..ResourceLimits::default()
3724        };
3725        let bytes = br"%PDF-1.7
37261 0 obj
3727<< /Long /Name >>
3728endobj
3729%%EOF
3730";
3731
3732        let result = Parser::new(limits).parse(Cursor::new(bytes));
3733
3734        assert!(result.is_err());
3735    }
3736
3737    #[rstest]
3738    #[case(b"/A", "A")]
3739    #[case(b"/A#20B", "A B")]
3740    fn test_should_parse_name_escape_matrix(
3741        #[case] token: &[u8],
3742        #[case] expected: &str,
3743    ) -> crate::Result<()> {
3744        let mut bytes = b"%PDF-1.7\n1 0 obj\n<< /Name ".to_vec();
3745        bytes.extend(token);
3746        bytes.extend(b" >>\nendobj\n%%EOF\n");
3747
3748        let document = Parser::default().parse(Cursor::new(bytes))?;
3749        let object =
3750            document
3751                .objects
3752                .values()
3753                .next()
3754                .ok_or_else(|| crate::ParseError::MissingObject {
3755                    message: crate::BoundedText::unchecked("missing object"),
3756                })?;
3757        let dictionary =
3758            object
3759                .object
3760                .as_dictionary()
3761                .ok_or_else(|| crate::ParseError::Malformed {
3762                    message: crate::BoundedText::unchecked("missing dictionary"),
3763                })?;
3764
3765        assert!(
3766            matches!(dictionary.get("Name"), Some(CosObject::Name(name)) if name.as_bytes() == expected.as_bytes())
3767        );
3768        Ok(())
3769    }
3770
    // Fuzz harness: the parser must tolerate arbitrary byte soup (up to 512 bytes)
    // without panicking. Errors are expected and fine; only panics are failures.
    proptest! {
        #[test]
        fn test_should_not_panic_on_arbitrary_bytes(input in proptest::collection::vec(any::<u8>(), 0..512)) {
            // Result deliberately discarded: absence of panics is the assertion.
            let _ = Parser::default().parse(Cursor::new(input));
        }
    }
3777
3778    fn single_stream_pdf(filter: &str, params: &str, encoded: &[u8]) -> Vec<u8> {
3779        let mut bytes = format!(
3780            "%PDF-1.7\n1 0 obj\n<< /Length {} /Filter /{filter} {params} >>\nstream\n",
3781            encoded.len()
3782        )
3783        .into_bytes();
3784        bytes.extend(encoded);
3785        bytes.extend(b"\nendstream\nendobj\n%%EOF\n");
3786        bytes
3787    }
3788
3789    fn parsed_stream(document: &ParsedDocument) -> crate::Result<&StreamObject> {
3790        let object =
3791            document
3792                .objects
3793                .values()
3794                .next()
3795                .ok_or_else(|| crate::ParseError::MissingObject {
3796                    message: crate::BoundedText::unchecked("missing stream object"),
3797                })?;
3798        let CosObject::Stream(stream) = &object.object else {
3799            return Err(crate::ParseError::Malformed {
3800                message: crate::BoundedText::unchecked("missing stream"),
3801            }
3802            .into());
3803        };
3804        Ok(stream)
3805    }
3806
3807    fn pack_lzw_codes(codes: &[(u16, u8)]) -> Vec<u8> {
3808        let mut output = Vec::new();
3809        let mut current = 0_u8;
3810        let mut used = 0_u8;
3811        for (code, bits) in codes {
3812            for bit in (0..*bits).rev() {
3813                current <<= 1;
3814                current |= u8::try_from((code >> bit) & 1).unwrap_or(0);
3815                used = used.saturating_add(1);
3816                if used == 8 {
3817                    output.push(current);
3818                    current = 0;
3819                    used = 0;
3820                }
3821            }
3822        }
3823        if used != 0 {
3824            current <<= 8_u8.saturating_sub(used);
3825            output.push(current);
3826        }
3827        output
3828    }
3829
3830    fn hex_bytes(bytes: &[u8]) -> Vec<u8> {
3831        const HEX: &[u8; 16] = b"0123456789abcdef";
3832        let mut output = Vec::with_capacity(bytes.len().saturating_mul(2).saturating_add(1));
3833        for byte in bytes {
3834            output.push(HEX.get(usize::from(byte >> 4)).copied().unwrap_or(b'0'));
3835            output.push(HEX.get(usize::from(byte & 0x0f)).copied().unwrap_or(b'0'));
3836        }
3837        output.push(b'>');
3838        output
3839    }
3840}