Skip to main content

pdfluent_lopdf/
object.rs

1use crate::encodings;
2use crate::encodings::Encoding;
3use crate::encodings::cmap::ToUnicodeCMap;
4use crate::error::DecompressError;
5use crate::{Document, Error, Result};
6use indexmap::IndexMap;
7use log::warn;
8use std::cmp::max;
9use std::fmt;
10use std::str;
11
/// Hard cap, in bytes, on decompressed stream data (LOPDF-ZBOMB-01).
///
/// Prevents zip-bomb DoS via crafted FlateDecode/LZWDecode streams: the
/// decoders in this file return `Error::StreamTooLarge` once their output
/// exceeds this limit.
/// 256 MiB is generous for any real-world PDF stream content.
const MAX_DECOMPRESSED_BYTES: usize = 256 * 1024 * 1024;
16
/// Object identifier: a pair of `(object number, generation number)`.
pub type ObjectId = (u32, u16);
19
/// Dictionary object.
///
/// A newtype around an [`IndexMap`] that maps raw-byte keys to [`Object`] values.
#[derive(Clone, Default, PartialEq)]
pub struct Dictionary(IndexMap<Vec<u8>, Object>);
23
/// Stream object.
///
/// Warning: all streams must be indirect objects, while the stream
/// dictionary may be a direct object.
#[derive(Debug, Clone, PartialEq)]
pub struct Stream {
    /// Dictionary associated with the stream.
    pub dict: Dictionary,
    /// Contents of the stream, in bytes.
    pub content: Vec<u8>,
    /// Whether the stream may be compressed by the `Document::compress()` function.
    /// Font streams may not be compressed, for example.
    pub allows_compression: bool,
    /// Position of the stream data in the PDF file, if known.
    pub start_position: Option<usize>,
}
39
/// Basic PDF object types defined in an enum.
#[derive(Clone, PartialEq)]
pub enum Object {
    /// The null object.
    Null,
    /// A boolean value.
    Boolean(bool),
    /// An integer number.
    Integer(i64),
    /// A real number, stored in single precision.
    Real(f32),
    /// A name, stored as raw bytes.
    Name(Vec<u8>),
    /// A string, stored as raw bytes together with the format it is written in.
    String(Vec<u8>, StringFormat),
    /// An array of objects.
    Array(Vec<Object>),
    /// A dictionary.
    Dictionary(Dictionary),
    /// A stream (dictionary plus byte content).
    Stream(Stream),
    /// A reference to another object by its identifier.
    Reference(ObjectId),
}
54
/// String objects can be written in two formats.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum StringFormat {
    /// Literal form (the default); shown as `(...)` by `Object`'s `Debug` output.
    #[default]
    Literal,
    /// Hexadecimal form; shown as `<...>` by `Object`'s `Debug` output.
    Hexadecimal,
}
62
63impl From<bool> for Object {
64    fn from(value: bool) -> Self {
65        Object::Boolean(value)
66    }
67}
68
69impl From<i64> for Object {
70    fn from(number: i64) -> Self {
71        Object::Integer(number)
72    }
73}
74
// Generates `impl From<$Int> for Object` for every listed integer type.
// Each value is widened losslessly through `i64::from` and stored as
// `Object::Integer`.
macro_rules! from_smaller_ints {
	($( $Int: ty )+) => {
		$(
			impl From<$Int> for Object {
				fn from(number: $Int) -> Self {
					Object::Integer(i64::from(number))
				}
			}
		)+
	}
}

// Integer types narrower than `i64` that convert into `Object::Integer`.
from_smaller_ints! {
    i8 i16 i32
    u8 u16 u32
}
91
92impl From<f64> for Object {
93    fn from(number: f64) -> Self {
94        Object::Real(number as f32)
95    }
96}
97
98impl From<f32> for Object {
99    fn from(number: f32) -> Self {
100        Object::Real(number)
101    }
102}
103
104impl From<String> for Object {
105    fn from(name: String) -> Self {
106        Object::Name(name.into_bytes())
107    }
108}
109
110impl<'a> From<&'a str> for Object {
111    fn from(name: &'a str) -> Self {
112        Object::Name(name.as_bytes().to_vec())
113    }
114}
115
116impl From<Vec<Object>> for Object {
117    fn from(array: Vec<Object>) -> Self {
118        Object::Array(array)
119    }
120}
121
122impl From<Dictionary> for Object {
123    fn from(dict: Dictionary) -> Self {
124        Object::Dictionary(dict)
125    }
126}
127
128impl From<Stream> for Object {
129    fn from(stream: Stream) -> Self {
130        Object::Stream(stream)
131    }
132}
133
134impl From<ObjectId> for Object {
135    fn from(id: ObjectId) -> Self {
136        Object::Reference(id)
137    }
138}
139
140impl Object {
141    pub fn string_literal<S: Into<Vec<u8>>>(s: S) -> Self {
142        Object::String(s.into(), StringFormat::Literal)
143    }
144
145    pub fn is_null(&self) -> bool {
146        matches!(*self, Object::Null)
147    }
148
149    pub fn as_bool(&self) -> Result<bool> {
150        match self {
151            Object::Boolean(value) => Ok(*value),
152            _ => Err(Error::ObjectType {
153                expected: "Boolean",
154                found: self.enum_variant(),
155            }),
156        }
157    }
158
159    pub fn as_i64(&self) -> Result<i64> {
160        match self {
161            Object::Integer(value) => Ok(*value),
162            _ => Err(Error::ObjectType {
163                expected: "Integer",
164                found: self.enum_variant(),
165            }),
166        }
167    }
168
169    pub fn as_f32(&self) -> Result<f32> {
170        match self {
171            Object::Real(value) => Ok(*value),
172            _ => Err(Error::ObjectType {
173                expected: "Real",
174                found: self.enum_variant(),
175            }),
176        }
177    }
178
179    /// Get the object value as a float.
180    /// Unlike [`Object::as_f32`] this will also cast an Integer to a Real.
181    pub fn as_float(&self) -> Result<f32> {
182        match self {
183            Object::Integer(value) => Ok(*value as f32),
184            Object::Real(value) => Ok(*value),
185            _ => Err(Error::ObjectType {
186                expected: "Integer or Real",
187                found: self.enum_variant(),
188            }),
189        }
190    }
191
192    pub fn as_name(&self) -> Result<&[u8]> {
193        match self {
194            Object::Name(name) => Ok(name),
195            _ => Err(Error::ObjectType {
196                expected: "Name",
197                found: self.enum_variant(),
198            }),
199        }
200    }
201
202    pub fn as_str(&self) -> Result<&[u8]> {
203        match self {
204            Object::String(string, _) => Ok(string),
205            _ => Err(Error::ObjectType {
206                expected: "String",
207                found: self.enum_variant(),
208            }),
209        }
210    }
211
212    pub fn as_str_mut(&mut self) -> Result<&mut Vec<u8>> {
213        match self {
214            Object::String(string, _) => Ok(string),
215            _ => Err(Error::ObjectType {
216                expected: "String",
217                found: self.enum_variant(),
218            }),
219        }
220    }
221
222    pub fn as_reference(&self) -> Result<ObjectId> {
223        match self {
224            Object::Reference(id) => Ok(*id),
225            _ => Err(Error::ObjectType {
226                expected: "Reference",
227                found: self.enum_variant(),
228            }),
229        }
230    }
231
232    pub fn as_array(&self) -> Result<&Vec<Object>> {
233        match self {
234            Object::Array(arr) => Ok(arr),
235            _ => Err(Error::ObjectType {
236                expected: "Array",
237                found: self.enum_variant(),
238            }),
239        }
240    }
241
242    pub fn as_array_mut(&mut self) -> Result<&mut Vec<Object>> {
243        match self {
244            Object::Array(arr) => Ok(arr),
245            _ => Err(Error::ObjectType {
246                expected: "Array",
247                found: self.enum_variant(),
248            }),
249        }
250    }
251
252    pub fn as_dict(&self) -> Result<&Dictionary> {
253        match self {
254            Object::Dictionary(dict) => Ok(dict),
255            _ => Err(Error::ObjectType {
256                expected: "Dictionary",
257                found: self.enum_variant(),
258            }),
259        }
260    }
261
262    pub fn as_dict_mut(&mut self) -> Result<&mut Dictionary> {
263        match self {
264            Object::Dictionary(dict) => Ok(dict),
265            _ => Err(Error::ObjectType {
266                expected: "Dictionary",
267                found: self.enum_variant(),
268            }),
269        }
270    }
271
272    pub fn as_stream(&self) -> Result<&Stream> {
273        match self {
274            Object::Stream(stream) => Ok(stream),
275            _ => Err(Error::ObjectType {
276                expected: "Stream",
277                found: self.enum_variant(),
278            }),
279        }
280    }
281
282    pub fn as_stream_mut(&mut self) -> Result<&mut Stream> {
283        match self {
284            Object::Stream(stream) => Ok(stream),
285            _ => Err(Error::ObjectType {
286                expected: "Stream",
287                found: self.enum_variant(),
288            }),
289        }
290    }
291
292    // TODO: maybe remove
293    pub fn type_name(&self) -> Result<&[u8]> {
294        match self {
295            Object::Dictionary(dict) => dict.get_type(),
296            Object::Stream(stream) => stream.dict.get_type(),
297            obj => Err(Error::ObjectType {
298                expected: "Dictionary or Stream",
299                found: obj.enum_variant(),
300            }),
301        }
302    }
303
304    pub fn enum_variant(&self) -> &'static str {
305        match self {
306            Object::Null => "Null",
307            Object::Boolean(_) => "Boolean",
308            Object::Integer(_) => "Integer",
309            Object::Real(_) => "Real",
310            Object::Name(_) => "Name",
311            Object::String(_, _) => "String",
312            Object::Array(_) => "Array",
313            Object::Dictionary(_) => "Dictionary",
314            Object::Stream(_) => "Stream",
315            Object::Reference(_) => "Reference",
316        }
317    }
318}
319
320impl fmt::Debug for Object {
321    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
322        match self {
323            Object::Null => write!(f, "Null"),
324            Object::Boolean(value) => write!(f, "{value}"),
325            Object::Integer(value) => write!(f, "{value}"),
326            Object::Real(value) => write!(f, "{value}"),
327            Object::Name(name) => write!(f, "/{}", String::from_utf8_lossy(name)),
328            Object::String(text, StringFormat::Literal) => {
329                write!(f, "({})", String::from_utf8_lossy(text))
330            }
331            Object::String(text, StringFormat::Hexadecimal) => {
332                write!(f, "<")?;
333                for b in text {
334                    write!(f, "{b:02x}")?
335                }
336                write!(f, ">")
337            }
338            Object::Array(array) => {
339                let items = array
340                    .iter()
341                    .map(|item| format!("{item:?}"))
342                    .collect::<Vec<String>>();
343                write!(f, "[{}]", items.join(" "))
344            }
345            Object::Dictionary(dict) => write!(f, "{dict:?}"),
346            Object::Stream(stream) => write!(f, "{:?}stream...endstream", stream.dict),
347            Object::Reference(id) => write!(f, "{} {} R", id.0, id.1),
348        }
349    }
350}
351
352impl Dictionary {
353    pub fn new() -> Dictionary {
354        Dictionary(IndexMap::new())
355    }
356
357    pub fn has(&self, key: &[u8]) -> bool {
358        self.0.contains_key(key)
359    }
360
361    pub fn get(&self, key: &[u8]) -> Result<&Object> {
362        self.0
363            .get(key)
364            .ok_or(Error::DictKey(String::from_utf8_lossy(key).to_string()))
365    }
366
367    /// Extract object from dictionary, dereferencing
368    /// the object if it is a reference.
369    pub fn get_deref<'a>(&'a self, key: &[u8], doc: &'a Document) -> Result<&'a Object> {
370        doc.dereference(self.get(key)?).map(|(_, object)| object)
371    }
372
373    pub fn get_mut(&mut self, key: &[u8]) -> Result<&mut Object> {
374        self.0
375            .get_mut(key)
376            .ok_or(Error::DictKey(String::from_utf8_lossy(key).to_string()))
377    }
378
379    pub fn set<K, V>(&mut self, key: K, value: V)
380    where
381        K: Into<Vec<u8>>,
382        V: Into<Object>,
383    {
384        self.0.insert(key.into(), value.into());
385    }
386
387    pub fn len(&self) -> usize {
388        self.0.len()
389    }
390
391    pub fn is_empty(&self) -> bool {
392        self.0.len() == 0
393    }
394
395    pub fn remove(&mut self, key: &[u8]) -> Option<Object> {
396        self.0.swap_remove(key)
397    }
398
399    pub fn has_type(&self, type_name: &[u8]) -> bool {
400        self.get(b"Type").and_then(|s| s.as_name()).ok() == Some(type_name)
401    }
402
403    pub fn get_type(&self) -> Result<&[u8]> {
404        self.get(b"Type")
405            .and_then(Object::as_name)
406            .or_else(|_| self.get(b"Linearized").and(Ok(b"Linearized")))
407    }
408
409    pub fn iter(&'_ self) -> indexmap::map::Iter<'_, Vec<u8>, Object> {
410        self.0.iter()
411    }
412
413    pub fn iter_mut(&'_ mut self) -> indexmap::map::IterMut<'_, Vec<u8>, Object> {
414        self.0.iter_mut()
415    }
416
417    pub fn get_font_encoding(&'_ self, doc: &Document) -> Result<Encoding<'_>> {
418        if !self.has_type(b"Font") {
419            return Err(Error::DictType {
420                expected: "Font",
421                found: String::from_utf8_lossy(self.get_type().unwrap_or(b"None")).to_string(),
422            });
423        }
424
425        // Note: currently not all encodings are handled, not implemented:
426        // - dictionary differences encoding
427        // - default base encoding in dictionary differences encoding
428        // - TrueType cmap tables
429        // - DescendantFonts in CID-Keyed fonts
430        // - predefined CJK CMAP other than indicated in SimpleEncoding
431        match self.get(b"Encoding").and_then(Object::as_name) {
432            Ok(b"StandardEncoding") => Ok(Encoding::OneByteEncoding(&encodings::STANDARD_ENCODING)),
433            Ok(b"MacRomanEncoding") => {
434                Ok(Encoding::OneByteEncoding(&encodings::MAC_ROMAN_ENCODING))
435            }
436            Ok(b"MacExpertEncoding") => {
437                Ok(Encoding::OneByteEncoding(&encodings::MAC_EXPERT_ENCODING))
438            }
439            Ok(b"WinAnsiEncoding") => Ok(Encoding::OneByteEncoding(&encodings::WIN_ANSI_ENCODING)),
440            Ok(b"PDFDocEncoding") => {
441                log::warn!("PDFDocEncoding is not a valid character encoding for a font");
442                Ok(Encoding::OneByteEncoding(&encodings::PDF_DOC_ENCODING))
443            }
444            Ok(b"Identity-H") | Ok(b"Identity-V") => {
445                let stream = self.get_deref(b"ToUnicode", doc)?.as_stream()?;
446                self.get_encoding_from_to_unicode_cmap(stream)
447            }
448            Ok(name) => Ok(Encoding::SimpleEncoding(name)),
449            Err(err) => {
450                warn!(
451                    "Could not parse the encoding, error: {err:#?}\nFont: {self:#?}\nTrying to retrieve ToUnicode."
452                );
453                let stream = self
454                    .get_deref(b"ToUnicode", doc)
455                    .and_then(Object::as_stream);
456                if let Ok(stream) = stream {
457                    return self.get_encoding_from_to_unicode_cmap(stream);
458                }
459
460                warn!("Using standard encoding as a fallback!");
461                Ok(Encoding::OneByteEncoding(&encodings::STANDARD_ENCODING))
462            }
463        }
464    }
465
466    fn get_encoding_from_to_unicode_cmap(&'_ self, stream: &Stream) -> Result<Encoding<'_>> {
467        let content = stream.get_plain_content()?;
468        let cmap = ToUnicodeCMap::parse(content)?;
469        Ok(Encoding::UnicodeMapEncoding(cmap))
470    }
471
472    pub fn extend(&mut self, other: &Dictionary) {
473        let keep_both_objects = |new_dict: &mut IndexMap<Vec<u8>, Object>,
474                                 key: &Vec<u8>,
475                                 value: &Object,
476                                 old_value: Object| {
477            let mut final_array;
478
479            match value {
480                Object::Array(array) => {
481                    final_array = Vec::with_capacity(array.len() + 1);
482                    final_array.push(old_value);
483                    final_array.extend(array.to_owned());
484                }
485                _ => {
486                    final_array = vec![value.to_owned(), old_value];
487                }
488            }
489
490            new_dict.insert(key.to_owned(), Object::Array(final_array));
491        };
492
493        let mut new_dict = std::mem::take(&mut self.0);
494        new_dict.reserve_exact(other.0.len());
495
496        for (key, value) in other.0.iter() {
497            if let Some(old_value) = new_dict.get(key) {
498                let old_value = old_value.to_owned();
499                match (&old_value, value) {
500                    (Object::Dictionary(old_dict), Object::Dictionary(dict)) => {
501                        let mut replaced_dict = old_dict.to_owned();
502                        replaced_dict.extend(dict);
503                        new_dict.insert(key.to_owned(), Object::Dictionary(replaced_dict));
504                    }
505                    (Object::Array(old_array), Object::Array(array)) => {
506                        let mut replaced_array = old_array.to_owned();
507                        replaced_array.extend(array.to_owned());
508                        new_dict.insert(key.to_owned(), Object::Array(replaced_array));
509                    }
510                    (Object::Integer(old_id), Object::Integer(id)) => {
511                        let array = vec![Object::Integer(*old_id), Object::Integer(*id)];
512                        new_dict.insert(key.to_owned(), Object::Array(array));
513                    }
514                    (Object::Real(old_id), Object::Real(id)) => {
515                        let array = vec![Object::Real(*old_id), Object::Real(*id)];
516                        new_dict.insert(key.to_owned(), Object::Array(array));
517                    }
518                    (Object::String(old_ids, old_format), Object::String(ids, format)) => {
519                        let array = vec![
520                            Object::String(old_ids.to_owned(), old_format.to_owned()),
521                            Object::String(ids.to_owned(), format.to_owned()),
522                        ];
523                        new_dict.insert(key.to_owned(), Object::Array(array));
524                    }
525                    (Object::Reference(old_object_id), Object::Reference(object_id)) => {
526                        let array = vec![
527                            Object::Reference(*old_object_id),
528                            Object::Reference(*object_id),
529                        ];
530                        new_dict.insert(key.to_owned(), Object::Array(array));
531                    }
532                    (Object::Null, _)
533                    | (Object::Boolean(_), _)
534                    | (Object::Name(_), _)
535                    | (Object::Stream(_), _) => {
536                        new_dict.insert(key.to_owned(), old_value);
537                    }
538                    (_, _) => keep_both_objects(&mut new_dict, key, value, old_value),
539                }
540            } else {
541                new_dict.insert(key.to_owned(), value.to_owned());
542            }
543        }
544
545        self.0 = new_dict;
546    }
547
548    /// Return a reference to the inner  [`IndexMap`].
549    pub fn as_hashmap(&self) -> &IndexMap<Vec<u8>, Object> {
550        &self.0
551    }
552
553    /// Return a mut reference to the inner [`IndexMap`].
554    pub fn as_hashmap_mut(&mut self) -> &mut IndexMap<Vec<u8>, Object> {
555        &mut self.0
556    }
557}
558
/// Build a [`Dictionary`](crate::Dictionary) from `key => value` pairs.
///
/// With no arguments the macro yields an empty dictionary. Otherwise each pair
/// is inserted in order through `Dictionary::set`, so keys and values may be
/// anything that method accepts. A trailing comma is allowed.
#[macro_export]
macro_rules! dictionary {
	() => {
		$crate::Dictionary::new()
	};
	// Trailing comma: strip it and delegate to the rule below.
	($( $key: expr => $value: expr ),+ ,) => {
		dictionary!( $($key => $value),+ )
	};
	($( $key: expr => $value: expr ),*) => {{
		let mut dict = $crate::Dictionary::new();
		$(
			dict.set($key, $value);
		)*
		dict
	}}
}
575
576impl fmt::Debug for Dictionary {
577    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
578        let entries = self
579            .into_iter()
580            .map(|(key, value)| format!("/{} {:?}", String::from_utf8_lossy(key), value))
581            .collect::<Vec<String>>();
582        write!(f, "<<{}>>", entries.concat())
583    }
584}
585
586impl IntoIterator for Dictionary {
587    type Item = (Vec<u8>, Object);
588    type IntoIter = indexmap::map::IntoIter<Vec<u8>, Object>;
589
590    fn into_iter(self) -> Self::IntoIter {
591        self.0.into_iter()
592    }
593}
594
595impl<'a> IntoIterator for &'a Dictionary {
596    type Item = (&'a Vec<u8>, &'a Object);
597    type IntoIter = indexmap::map::Iter<'a, Vec<u8>, Object>;
598
599    fn into_iter(self) -> Self::IntoIter {
600        self.0.iter()
601    }
602}
603
604impl<'a> IntoIterator for &'a mut Dictionary {
605    type Item = (&'a Vec<u8>, &'a mut Object);
606    type IntoIter = indexmap::map::IterMut<'a, Vec<u8>, Object>;
607
608    fn into_iter(self) -> Self::IntoIter {
609        self.0.iter_mut()
610    }
611}
612
613use std::iter::FromIterator;
614impl<K: Into<Vec<u8>>> FromIterator<(K, Object)> for Dictionary {
615    fn from_iter<I: IntoIterator<Item = (K, Object)>>(iter: I) -> Self {
616        let mut dict = Dictionary::new();
617        for (k, v) in iter {
618            dict.set(k, v);
619        }
620        dict
621    }
622}
623
624impl Stream {
625    pub fn new(mut dict: Dictionary, content: Vec<u8>) -> Stream {
626        dict.set("Length", content.len() as i64);
627        Stream {
628            dict,
629            content,
630            allows_compression: true,
631            start_position: None,
632        }
633    }
634
635    pub fn with_position(dict: Dictionary, position: usize) -> Stream {
636        Stream {
637            dict,
638            content: vec![],
639            allows_compression: true,
640            start_position: Some(position),
641        }
642    }
643
644    /// Default is that the stream may be compressed. On font streams,
645    /// set this to false, otherwise the font will be corrupt
646    #[inline]
647    pub fn with_compression(mut self, allows_compression: bool) -> Stream {
648        self.allows_compression = allows_compression;
649        self
650    }
651
652    pub fn filters(&self) -> Result<Vec<&[u8]>> {
653        let filter = self.dict.get(b"Filter")?;
654
655        if let Ok(name) = filter.as_name() {
656            Ok(vec![name])
657        } else if let Ok(names) = filter.as_array() {
658            names.iter().map(Object::as_name).collect()
659        } else {
660            Err(Error::ObjectType {
661                expected: "Name or Array",
662                found: filter.enum_variant(),
663            })
664        }
665    }
666
667    pub fn set_content(&mut self, content: Vec<u8>) {
668        self.content = content;
669        self.dict.set("Length", self.content.len() as i64);
670    }
671
672    pub fn set_plain_content(&mut self, content: Vec<u8>) {
673        self.dict.remove(b"DecodeParms");
674        self.dict.remove(b"Filter");
675        self.dict.set("Length", content.len() as i64);
676        self.content = content;
677    }
678
679    pub fn get_plain_content(&self) -> Result<Vec<u8>> {
680        match self.filters() {
681            Ok(vec) if !vec.is_empty() => self.decompressed_content(),
682            _ => Ok(self.content.clone()),
683        }
684    }
685
686    pub fn compress(&mut self) -> Result<()> {
687        self.compress_with_level(9)
688    }
689
690    /// Compress with a specific DEFLATE level (1=fast, 9=best).
691    /// Use level 1 for intermediate pipeline passes where output size matters
692    /// less than throughput. (#534 perf)
693    pub fn compress_with_level(&mut self, level: u32) -> Result<()> {
694        use flate2::Compression;
695        use flate2::write::ZlibEncoder;
696        use std::io::prelude::*;
697
698        if self.dict.get(b"Filter").is_err() {
699            let mut encoder = ZlibEncoder::new(Vec::new(), Compression::new(level));
700            encoder.write_all(self.content.as_slice())?;
701            let compressed = encoder.finish()?;
702            if compressed.len() + 19 < self.content.len() {
703                self.dict.set("Filter", "FlateDecode");
704                self.set_content(compressed);
705            }
706        }
707        Ok(())
708    }
709
710    pub fn decompressed_content(&self) -> Result<Vec<u8>> {
711        let params = self.dict.get(b"DecodeParms").and_then(Object::as_dict).ok();
712        let filters = self.filters()?;
713
714        let mut input = self.content.as_slice();
715        let mut output = vec![];
716
717        // Filters are in decoding order.
718        for filter in filters {
719            output = match filter {
720                b"FlateDecode" => Self::decompress_zlib(input, params)?,
721                b"LZWDecode" => Self::decompress_lzw(input, params)?,
722                b"ASCII85Decode" => Self::decode_ascii85(input)?,
723                b"ASCIIHexDecode" | b"AHx" => Self::decode_ascii_hex(input)?,
724                b"RunLengthDecode" | b"RL" => Self::decode_run_length(input)?,
725                #[cfg(feature = "embed_image")]
726                b"CCITTFaxDecode" | b"CCF" => Self::decode_ccitt_fax(input, params)?,
727                #[cfg(feature = "embed_image")]
728                b"JBIG2Decode" => Self::decode_jbig2(input)?,
729                #[cfg(feature = "embed_image")]
730                b"JPXDecode" => Self::decode_jpx(input)?,
731                b"DCTDecode" | b"DCT" => input.to_vec(), // JPEG passthrough
732                _ => return Err(Error::Unimplemented("decompression algorithms")),
733            };
734            input = &output;
735        }
736        Ok(output)
737    }
738
739    fn decompress_lzw(input: &[u8], params: Option<&Dictionary>) -> Result<Vec<u8>> {
740        use weezl::{BitOrder, decode::Decoder};
741        const MIN_BITS: u8 = 9;
742
743        let early_change = params
744            .and_then(|p| p.get(b"EarlyChange").ok())
745            .and_then(|p| Object::as_i64(p).ok())
746            .map(|v| v != 0)
747            .unwrap_or(true);
748
749        let mut decoder = if early_change {
750            Decoder::with_tiff_size_switch(BitOrder::Msb, MIN_BITS - 1)
751        } else {
752            Decoder::new(BitOrder::Msb, MIN_BITS - 1)
753        };
754
755        let output = Self::decompress_lzw_loop(input, &mut decoder)?;
756        Self::decompress_predictor(output, params)
757    }
758
759    fn decompress_lzw_loop(input: &[u8], decoder: &mut weezl::decode::Decoder) -> Result<Vec<u8>> {
760        let mut output = vec![];
761
762        let result = decoder.into_stream(&mut output).decode_all(input);
763        if let Err(err) = result.status {
764            warn!("{err}");
765        }
766        if output.len() > MAX_DECOMPRESSED_BYTES {
767            return Err(Error::StreamTooLarge {
768                limit: MAX_DECOMPRESSED_BYTES,
769            });
770        }
771
772        Ok(output)
773    }
774
775    fn decompress_zlib(input: &[u8], params: Option<&Dictionary>) -> Result<Vec<u8>> {
776        use flate2::read::ZlibDecoder;
777        use std::io::prelude::*;
778
779        let mut output = Vec::with_capacity(input.len().min(4096) * 2);
780        let decoder = ZlibDecoder::new(input);
781
782        if !input.is_empty() {
783            // take(limit + 1): if we read limit+1 bytes the stream exceeds the cap.
784            decoder
785                .take(MAX_DECOMPRESSED_BYTES as u64 + 1)
786                .read_to_end(&mut output)
787                .unwrap_or_else(|err| {
788                    warn!("{err}");
789                    0
790                });
791            if output.len() > MAX_DECOMPRESSED_BYTES {
792                return Err(Error::StreamTooLarge {
793                    limit: MAX_DECOMPRESSED_BYTES,
794                });
795            }
796        }
797        Self::decompress_predictor(output, params)
798    }
799
800    fn decode_ascii85(input: &[u8]) -> Result<Vec<u8>> {
801        let mut output = vec![];
802        let mut buffer: u32 = 0;
803        let mut count = 0;
804        // Check for EOD marker
805        let input_no_eod = if input.len() >= 2 && &input[input.len() - 2..] == b"~>" {
806            &input[..input.len() - 2]
807        } else {
808            log::warn!("ASCII85 stream is missing its EOD marker");
809            input
810        };
811        for &ch in input_no_eod {
812            if ch == b'z' {
813                if count != 0 {
814                    return Err(DecompressError::Ascii85(
815                        "z character is not allowed in the middle of a group",
816                    )
817                    .into());
818                }
819                output.extend_from_slice(&[0, 0, 0, 0]);
820                continue;
821            }
822
823            if ch.is_ascii_whitespace() {
824                continue;
825            }
826
827            if !(b'!'..=b'u').contains(&ch) {
828                break;
829            }
830            buffer = buffer
831                .checked_mul(85)
832                .ok_or(DecompressError::Ascii85("multiplication overflow"))?;
833            buffer += (ch - b'!') as u32;
834            count += 1;
835
836            if count == 5 {
837                output.extend_from_slice(&buffer.to_be_bytes());
838                buffer = 0;
839                count = 0;
840            }
841        }
842
843        if count > 0 {
844            for _ in count..5 {
845                buffer = buffer
846                    .checked_mul(85)
847                    .ok_or(DecompressError::Ascii85("multiplication overflow"))?;
848                buffer += 84;
849            }
850
851            let bytes = buffer.to_be_bytes();
852            output.extend_from_slice(&bytes[..count - 1]);
853        }
854
855        Ok(output)
856    }
857
858    fn decode_ascii_hex(input: &[u8]) -> Result<Vec<u8>> {
859        let mut output = Vec::with_capacity(input.len() / 2);
860        let mut hi: Option<u8> = None;
861
862        for &ch in input {
863            if ch == b'>' {
864                break; // EOD marker
865            }
866            if ch.is_ascii_whitespace() {
867                continue;
868            }
869            let nibble = match ch {
870                b'0'..=b'9' => ch - b'0',
871                b'A'..=b'F' => ch - b'A' + 10,
872                b'a'..=b'f' => ch - b'a' + 10,
873                _ => return Err(DecompressError::AsciiHex("invalid hex digit").into()),
874            };
875            match hi {
876                None => hi = Some(nibble),
877                Some(h) => {
878                    output.push((h << 4) | nibble);
879                    hi = None;
880                }
881            }
882        }
883        // Odd trailing nibble: pad with 0 (per PDF spec).
884        if let Some(h) = hi {
885            output.push(h << 4);
886        }
887        Ok(output)
888    }
889
890    fn decode_run_length(input: &[u8]) -> Result<Vec<u8>> {
891        let mut output = Vec::new();
892        let mut i = 0;
893        while i < input.len() {
894            let length = input[i];
895            i += 1;
896            match length {
897                128 => break, // EOD
898                0..=127 => {
899                    let count = length as usize + 1;
900                    let end = (i + count).min(input.len());
901                    output.extend_from_slice(&input[i..end]);
902                    i = end;
903                }
904                _ => {
905                    // 129..=255: repeat next byte (257 - length) times
906                    if i >= input.len() {
907                        break;
908                    }
909                    let count = 257 - length as usize;
910                    let byte = input[i];
911                    i += 1;
912                    output.extend(std::iter::repeat_n(byte, count));
913                }
914            }
915        }
916        Ok(output)
917    }
918
919    #[cfg(feature = "embed_image")]
920    fn decode_ccitt_fax(input: &[u8], params: Option<&Dictionary>) -> Result<Vec<u8>> {
921        let k = params
922            .and_then(|p| p.get(b"K").ok())
923            .and_then(|o| Object::as_i64(o).ok())
924            .unwrap_or(0);
925        let columns = params
926            .and_then(|p| p.get(b"Columns").ok())
927            .and_then(|o| Object::as_i64(o).ok())
928            .unwrap_or(1728) as u32;
929        let rows = params
930            .and_then(|p| p.get(b"Rows").ok())
931            .and_then(|o| Object::as_i64(o).ok())
932            .unwrap_or(0) as u32;
933        let end_of_block = params
934            .and_then(|p| p.get(b"EndOfBlock").ok())
935            .and_then(|o| Object::as_bool(o).ok())
936            .unwrap_or(true);
937        let end_of_line = params
938            .and_then(|p| p.get(b"EndOfLine").ok())
939            .and_then(|o| Object::as_bool(o).ok())
940            .unwrap_or(false);
941        let byte_align = params
942            .and_then(|p| p.get(b"EncodedByteAlign").ok())
943            .and_then(|o| Object::as_bool(o).ok())
944            .unwrap_or(false);
945        let black_is_1 = params
946            .and_then(|p| p.get(b"BlackIs1").ok())
947            .and_then(|o| Object::as_bool(o).ok())
948            .unwrap_or(false);
949
950        let encoding = if k < 0 {
951            hayro_ccitt::EncodingMode::Group4
952        } else if k == 0 {
953            hayro_ccitt::EncodingMode::Group3_1D
954        } else {
955            hayro_ccitt::EncodingMode::Group3_2D { k: k as u32 }
956        };
957
958        let settings = hayro_ccitt::DecodeSettings {
959            columns,
960            rows,
961            end_of_block,
962            end_of_line,
963            rows_are_byte_aligned: byte_align,
964            encoding,
965            invert_black: black_is_1,
966        };
967
968        struct ByteDecoder {
969            output: Vec<u8>,
970            buffer: u8,
971            bit_count: u8,
972        }
973
974        impl ByteDecoder {
975            fn flush(&mut self) {
976                if self.bit_count > 0 {
977                    self.output.push(self.buffer << (8 - self.bit_count));
978                    self.buffer = 0;
979                    self.bit_count = 0;
980                }
981            }
982        }
983
984        impl hayro_ccitt::Decoder for ByteDecoder {
985            fn push_pixel(&mut self, white: bool) {
986                self.buffer = (self.buffer << 1) | u8::from(white);
987                self.bit_count += 1;
988                if self.bit_count == 8 {
989                    self.output.push(self.buffer);
990                    self.buffer = 0;
991                    self.bit_count = 0;
992                }
993            }
994
995            fn push_pixel_chunk(&mut self, white: bool, chunk_count: u32) {
996                let byte = if white { 0xFF } else { 0x00 };
997                self.output
998                    .extend(std::iter::repeat_n(byte, chunk_count as usize));
999            }
1000
1001            fn next_line(&mut self) {
1002                self.flush();
1003            }
1004        }
1005
1006        let mut decoder = ByteDecoder {
1007            output: Vec::new(),
1008            buffer: 0,
1009            bit_count: 0,
1010        };
1011
1012        match hayro_ccitt::decode(input, &mut decoder, &settings) {
1013            Ok(_) => Ok(decoder.output),
1014            Err(_) if !decoder.output.is_empty() => {
1015                // Partial decode — return what we got (lenient).
1016                Ok(decoder.output)
1017            }
1018            Err(_) => Err(Error::Unimplemented("CCITTFaxDecode failed")),
1019        }
1020    }
1021
1022    #[cfg(feature = "embed_image")]
1023    fn decode_jbig2(input: &[u8]) -> Result<Vec<u8>> {
1024        // Note: JBIG2Globals from DecodeParms requires Document access to resolve
1025        // the indirect stream reference. Without globals, only self-contained
1026        // JBIG2 streams can be decoded.
1027        let image = hayro_jbig2::decode_embedded(input, None)
1028            .map_err(|_| Error::Unimplemented("JBIG2Decode failed"))?;
1029
1030        let row_bytes = (image.width as usize).div_ceil(8);
1031        let mut packed = vec![0u8; row_bytes * image.height as usize];
1032
1033        struct InvertDecoder<'a> {
1034            data: &'a mut [u8],
1035            pos: usize,
1036            buffer: u8,
1037            bit_count: u8,
1038        }
1039
1040        impl hayro_jbig2::Decoder for InvertDecoder<'_> {
1041            fn push_pixel(&mut self, black: bool) {
1042                // PDF: white=1 black=0 (inverted from JBIG2)
1043                self.buffer = (self.buffer << 1) | u8::from(!black);
1044                self.bit_count += 1;
1045                if self.bit_count == 8 {
1046                    if self.pos < self.data.len() {
1047                        self.data[self.pos] = self.buffer;
1048                    }
1049                    self.pos += 1;
1050                    self.buffer = 0;
1051                    self.bit_count = 0;
1052                }
1053            }
1054
1055            fn push_pixel_chunk(&mut self, black: bool, chunk_count: u32) {
1056                let byte = if black { 0x00 } else { 0xFF };
1057                let end = (self.pos + chunk_count as usize).min(self.data.len());
1058                for b in &mut self.data[self.pos..end] {
1059                    *b = byte;
1060                }
1061                self.pos = end;
1062            }
1063
1064            fn next_line(&mut self) {
1065                if self.bit_count > 0 {
1066                    if self.pos < self.data.len() {
1067                        self.data[self.pos] = self.buffer << (8 - self.bit_count);
1068                    }
1069                    self.pos += 1;
1070                    self.buffer = 0;
1071                    self.bit_count = 0;
1072                }
1073            }
1074        }
1075
1076        let mut decoder = InvertDecoder {
1077            data: &mut packed,
1078            pos: 0,
1079            buffer: 0,
1080            bit_count: 0,
1081        };
1082        image.decode(&mut decoder);
1083
1084        Ok(packed)
1085    }
1086
1087    #[cfg(feature = "embed_image")]
1088    fn decode_jpx(input: &[u8]) -> Result<Vec<u8>> {
1089        let settings = hayro_jpeg2000::DecodeSettings {
1090            resolve_palette_indices: false,
1091            strict: false,
1092            target_resolution: None,
1093        };
1094
1095        let image = hayro_jpeg2000::Image::new(input, &settings)
1096            .map_err(|_| Error::Unimplemented("JPXDecode failed"))?;
1097        image
1098            .decode()
1099            .map_err(|_| Error::Unimplemented("JPXDecode failed"))
1100    }
1101
1102    fn decompress_predictor(mut data: Vec<u8>, params: Option<&Dictionary>) -> Result<Vec<u8>> {
1103        use crate::filters::png;
1104
1105        if let Some(params) = params {
1106            let predictor = params
1107                .get(b"Predictor")
1108                .and_then(Object::as_i64)
1109                .unwrap_or(1);
1110            if (10..=15).contains(&predictor) {
1111                let pixels_per_row = max(
1112                    1,
1113                    params.get(b"Columns").and_then(Object::as_i64).unwrap_or(1),
1114                ) as usize;
1115                let colors = max(
1116                    1,
1117                    params.get(b"Colors").and_then(Object::as_i64).unwrap_or(1),
1118                ) as usize;
1119                let bits = max(
1120                    8,
1121                    params
1122                        .get(b"BitsPerComponent")
1123                        .and_then(Object::as_i64)
1124                        .unwrap_or(8),
1125                ) as usize;
1126                let bytes_per_pixel = colors * bits / 8;
1127                data = png::decode_frame(data.as_slice(), bytes_per_pixel, pixels_per_row)?;
1128            }
1129            Ok(data)
1130        } else {
1131            Ok(data)
1132        }
1133    }
1134
1135    pub fn decompress(&mut self) -> Result<()> {
1136        let data = self.decompressed_content()?;
1137        self.dict.remove(b"DecodeParms");
1138        self.dict.remove(b"Filter");
1139        self.set_content(data);
1140        Ok(())
1141    }
1142
1143    pub fn is_compressed(&self) -> bool {
1144        self.dict.get(b"Filter").is_ok()
1145    }
1146}
1147
/// Unit tests for the stream filter decoders.
#[cfg(test)]
mod test {
    use crate::{Error, error::DecompressError};

    use super::{MAX_DECOMPRESSED_BYTES, Stream};

    #[test]
    fn test_decode_ascii85() {
        let input = r#"9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,O<
            DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKYi(
            DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIal(
            DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G>u
            D.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>"#;
        let expected = "Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, that by a perseverance of delight in the continued and indefatigable generation of knowledge, exceeds the short vehemence of any carnal pleasure.";
        let output = Stream::decode_ascii85(input.as_bytes()).unwrap();
        assert_eq!(&output, expected.as_bytes());
    }

    #[test]
    fn test_decode_ascii85_overflow() {
        // "uuuuu" encodes a value larger than u32::MAX and must be rejected.
        let input = b"uuuuu~>";
        let output = Stream::decode_ascii85(input);
        assert!(matches!(
            output,
            Err(Error::Decompress(DecompressError::Ascii85(_)))
        ));
    }

    #[test]
    fn test_decode_ascii_hex() {
        let input = b"48656C6C6F>";
        let output = Stream::decode_ascii_hex(input).unwrap();
        assert_eq!(output, b"Hello");
    }

    #[test]
    fn test_decode_ascii_hex_lowercase() {
        let input = b"48656c6c6f>";
        let output = Stream::decode_ascii_hex(input).unwrap();
        assert_eq!(output, b"Hello");
    }

    #[test]
    fn test_decode_ascii_hex_whitespace() {
        let input = b"48 65 6C 6C 6F>";
        let output = Stream::decode_ascii_hex(input).unwrap();
        assert_eq!(output, b"Hello");
    }

    #[test]
    fn test_decode_ascii_hex_odd_nibble() {
        // Trailing odd nibble → pad with 0
        let input = b"ABC>";
        let output = Stream::decode_ascii_hex(input).unwrap();
        assert_eq!(output, vec![0xAB, 0xC0]);
    }

    #[test]
    fn test_decode_run_length() {
        let input = vec![4, 10, 11, 12, 13, 14, 253, 3, 128];
        let output = Stream::decode_run_length(&input).unwrap();
        assert_eq!(output, vec![10, 11, 12, 13, 14, 3, 3, 3, 3]);
    }

    #[test]
    fn test_decode_run_length_eod() {
        // EOD marker (128) stops processing
        let input = vec![0, 42, 128, 0, 99];
        let output = Stream::decode_run_length(&input).unwrap();
        assert_eq!(output, vec![42]);
    }

    #[test]
    fn test_decode_run_length_repeat() {
        // 255 → repeat next byte (257-255)=2 times
        let input = vec![255, 0xAA, 128];
        let output = Stream::decode_run_length(&input).unwrap();
        assert_eq!(output, vec![0xAA, 0xAA]);
    }

    #[test]
    fn test_decode_run_length_truncated_literal() {
        // Header announces 5 literal bytes but only 2 follow: the decoder is
        // lenient and copies what is available.
        let input = vec![4, 1, 2];
        let output = Stream::decode_run_length(&input).unwrap();
        assert_eq!(output, vec![1, 2]);
    }

    #[test]
    fn test_decode_run_length_missing_repeat_byte() {
        // A repeat header with no byte to repeat ends decoding without error.
        let input = vec![0, 7, 254];
        let output = Stream::decode_run_length(&input).unwrap();
        assert_eq!(output, vec![7]);
    }

    #[test]
    fn test_decode_run_length_empty() {
        // Empty input (no EOD marker at all) decodes to empty output.
        let output = Stream::decode_run_length(&[]).unwrap();
        assert!(output.is_empty());
    }

    // LOPDF-ZBOMB-01 — decompress_zlib is expected to return StreamTooLarge
    // rather than allocating unbounded memory when the decompressed output
    // exceeds MAX_DECOMPRESSED_BYTES.
    //
    // The limit is a constant that cannot be overridden in tests, so the
    // rejection path itself is not exercised here. This test only covers the
    // accept side of the guard: a small synthetic zlib stream, well under the
    // limit, must still decompress to the original bytes.
    #[test]
    fn decompress_zlib_within_limit_succeeds() {
        use flate2::Compression;
        use flate2::write::ZlibEncoder;
        use std::io::Write;

        // Compress 100 bytes of zeros — well under MAX_DECOMPRESSED_BYTES.
        let plaintext = vec![0u8; 100];
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(&plaintext).unwrap();
        let compressed = encoder.finish().unwrap();

        let result = Stream::decompress_zlib(&compressed, None);
        assert!(
            result.is_ok(),
            "small stream should decompress successfully"
        );
        assert_eq!(result.unwrap(), plaintext);
    }

    // Verify that the StreamTooLarge error variant exists and carries the
    // limit it was constructed with.
    #[test]
    fn stream_too_large_error_has_correct_limit() {
        let err = Error::StreamTooLarge {
            limit: MAX_DECOMPRESSED_BYTES,
        };
        assert!(
            matches!(err, Error::StreamTooLarge { limit } if limit == MAX_DECOMPRESSED_BYTES),
            "StreamTooLarge must carry MAX_DECOMPRESSED_BYTES as the limit"
        );
    }
}