Skip to main content

pdf_syntax/object/
stream.rs

1//! Streams.
2
3use crate::crypto::DecryptionTarget;
4use crate::filter::Filter;
5use crate::object;
6use crate::object::Dict;
7use crate::object::Name;
8use crate::object::dict::keys::{DECODE_PARMS, DP, F, FILTER, LENGTH, TYPE};
9use crate::object::{Array, ObjectIdentifier};
10use crate::object::{Object, ObjectLike};
11use crate::reader::Reader;
12use crate::reader::{Readable, ReaderContext, ReaderExt, Skippable};
13use crate::sync::Arc;
14use crate::util::OptionLog;
15use alloc::borrow::Cow;
16use alloc::vec::Vec;
17use core::fmt::{Debug, Formatter};
18use log::warn;
19use smallvec::SmallVec;
20
/// The state shared by all clones of a [`Stream`].
#[derive(Clone)]
struct StreamInner<'a> {
    /// The stream dictionary.
    dict: Dict<'a>,
    /// The filters that `Stream::new` recognised in the dictionary, in the
    /// order they were listed.
    filters: SmallVec<[Filter; 2]>,
    /// One parameter dictionary per entry of `filters` (same index). A
    /// default dictionary is stored when no parameters were found.
    filter_params: SmallVec<[Dict<'a>; 2]>,
    /// The stream bytes as handed to `Stream::new`: neither decrypted nor
    /// filter-decoded.
    data: &'a [u8],
}
28
/// A stream of arbitrary data.
///
/// The state is kept behind an `Arc`, so cloning a `Stream` clones that
/// handle rather than the dictionary and filter lists themselves.
#[derive(Clone)]
pub struct Stream<'a>(Arc<StreamInner<'a>>);
32
33impl PartialEq for Stream<'_> {
34    fn eq(&self, other: &Self) -> bool {
35        self.0.dict == other.0.dict && self.0.data == other.0.data
36    }
37}
38
/// Additional parameters for decoding images.
///
/// The `Default` value (no hints, zero width/height) is what
/// `Stream::decoded` passes when decoding non-image data.
#[derive(Clone, PartialEq, Default)]
pub struct ImageDecodeParams {
    /// Whether the color space of the image is an indexed color space.
    pub is_indexed: bool,
    /// The bits per component of the image, if that information is available.
    pub bpc: Option<u8>,
    /// The number of components of the image, if that information is available.
    pub num_components: Option<u8>,
    /// A target resolution for the image. Note that this is only a hint so that
    /// in case it's possible, a version of the image will be extracted that
    /// is as close as possible to the hinted dimension.
    pub target_dimension: Option<(u32, u32)>,
    /// The width of the image as indicated by the image dictionary.
    pub width: u32,
    /// The height of the image as indicated by the image dictionary.
    pub height: u32,
}
57
58impl<'a> Stream<'a> {
59    pub(crate) fn new(data: &'a [u8], dict: Dict<'a>) -> Self {
60        let mut collected_filters = SmallVec::new();
61        let mut collected_params = SmallVec::new();
62
63        if let Some(filter) = dict
64            .get::<Name>(F)
65            .or_else(|| dict.get::<Name>(FILTER))
66            .and_then(Filter::from_name)
67        {
68            let params = dict
69                .get::<Dict<'_>>(DP)
70                .or_else(|| dict.get::<Dict<'_>>(DECODE_PARMS))
71                .unwrap_or_default();
72
73            collected_filters.push(filter);
74            collected_params.push(params);
75        } else if let Some(filters) = dict
76            .get::<Array<'_>>(F)
77            .or_else(|| dict.get::<Array<'_>>(FILTER))
78        {
79            let filters = filters.iter::<Name>().map(Filter::from_name);
80            let mut params = dict
81                .get::<Array<'_>>(DP)
82                .or_else(|| dict.get::<Array<'_>>(DECODE_PARMS))
83                .map(|a| a.iter::<Object<'_>>());
84
85            for filter in filters {
86                let params = params
87                    .as_mut()
88                    .and_then(|p| p.next())
89                    .and_then(|p| p.into_dict())
90                    .unwrap_or_default();
91
92                if let Some(filter) = filter {
93                    collected_filters.push(filter);
94                    collected_params.push(params);
95                }
96            }
97        }
98
99        Self(Arc::new(StreamInner {
100            dict,
101            filters: collected_filters,
102            filter_params: collected_params,
103            data,
104        }))
105    }
106
107    /// Return the raw, decrypted data of the stream.
108    ///
109    /// Stream filters will not be applied.
110    pub fn raw_data(&self) -> Cow<'a, [u8]> {
111        let ctx = self.0.dict.ctx();
112
113        if ctx.xref().needs_decryption(ctx)
114            && self
115                .0
116                .dict
117                .get::<object::String>(TYPE)
118                .map(|t| t.as_ref() != b"XRef")
119                .unwrap_or(true)
120        {
121            // Streams are always indirect objects and therefore always have an obj_id.
122            // If somehow absent (corrupt PDF), fall back to raw data.
123            if let Some(obj_id) = self.0.dict.obj_id() {
124                Cow::Owned(
125                    ctx.xref()
126                        .decrypt(obj_id, self.0.data, DecryptionTarget::Stream)
127                        .unwrap_or_default(),
128                )
129            } else {
130                Cow::Borrowed(self.0.data)
131            }
132        } else {
133            Cow::Borrowed(self.0.data)
134        }
135    }
136
137    /// Return the raw, underlying dictionary of the stream.
138    pub fn dict(&self) -> &Dict<'a> {
139        &self.0.dict
140    }
141
142    /// Return the object identifier of the stream, if available.
143    ///
144    /// Returns `None` if the stream is corrupt and lacks an object ID.
145    pub fn obj_id(&self) -> Option<ObjectIdentifier> {
146        self.0.dict.obj_id()
147    }
148
149    /// Return the filters that are applied to the stream.
150    pub fn filters(&self) -> &[Filter] {
151        &self.0.filters
152    }
153
154    /// Return the decoded data of the stream.
155    ///
156    /// Note that the result of this method will not be cached, so calling it multiple
157    /// times is expensive.
158    pub fn decoded(&self) -> Result<Vec<u8>, DecodeFailure> {
159        self.decoded_image(&ImageDecodeParams::default())
160            .map(|r| r.data)
161    }
162
163    /// Return the decoded data of the stream, and return image metadata
164    /// if available.
165    pub fn decoded_image(
166        &self,
167        image_params: &ImageDecodeParams,
168    ) -> Result<FilterResult, DecodeFailure> {
169        let data = self.raw_data();
170
171        let mut current: Option<FilterResult> = None;
172
173        for (filter, params) in self.0.filters.iter().zip(self.0.filter_params.iter()) {
174            let new = filter.apply(
175                current.as_ref().map(|c| c.data.as_ref()).unwrap_or(&data),
176                params.clone(),
177                image_params,
178            )?;
179            current = Some(new);
180        }
181
182        Ok(current.unwrap_or(FilterResult {
183            data: data.to_vec(),
184            image_data: None,
185        }))
186    }
187}
188
189impl Debug for Stream<'_> {
190    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
191        write!(f, "Stream (len: {:?})", self.0.data.len())
192    }
193}
194
impl Skippable for Stream<'_> {
    /// Skipping a stream is not supported: this logs a warning and always
    /// returns `None` without touching the reader.
    fn skip(_: &mut Reader<'_>, _: bool) -> Option<()> {
        // A stream can never appear in a dict/array, so it should never be skipped.
        warn!("attempted to skip a stream object");

        None
    }
}
203
204impl<'a> Readable<'a> for Stream<'a> {
205    fn read(r: &mut Reader<'a>, ctx: &ReaderContext<'a>) -> Option<Self> {
206        let dict = r.read_with_context::<Dict<'_>>(ctx)?;
207
208        if dict.contains_key(F) {
209            warn!("encountered stream referencing external file, which is unsupported");
210
211            return None;
212        }
213
214        let offset = r.offset();
215        parse_proper(r, &dict)
216            .or_else(|| {
217                warn!("failed to parse stream, trying to parse it manually");
218
219                r.jump(offset);
220                parse_fallback(r, &dict)
221            })
222            .error_none("was unable to manually parse the stream")
223    }
224}
225
/// A failure that can occur during decoding a data stream.
#[derive(Debug, Copy, Clone)]
pub enum DecodeFailure {
    /// An image stream failed to decode.
    ImageDecode,
    /// A data stream failed to decode.
    StreamDecode,
    /// A failure occurred while decrypting a file.
    Decryption,
    /// An unknown failure occurred.
    Unknown,
}
238
/// An image color space.
#[derive(Debug, Copy, Clone)]
pub enum ImageColorSpace {
    /// Grayscale color space.
    Gray,
    /// RGB color space.
    Rgb,
    /// RGB produced by JPEG YCbCr→RGB decoding.
    ///
    /// JPEG images stored with YCbCr encoding (Adobe APP14 transform=1 or
    /// JFIF default) are converted to sRGB by the JPEG decoder using the
    /// standard BT.601 matrix. The resulting RGB values are already in sRGB
    /// colorimetry. Any PDF `/ColorSpace` entry that is not device-RGB (e.g.
    /// an `ICCBased` printer profile) should be ignored for these images;
    /// the JPEG decoder's own colour model takes precedence, matching MuPDF
    /// and Acrobat behaviour.
    RgbFromYCbCr,
    /// CMYK color space.
    Cmyk,
    /// An unknown color space.
    Unknown(u8),
}
261
/// Additional data that is extracted from some image streams.
///
/// Carried in [`FilterResult::image_data`] alongside the decoded bytes.
pub struct ImageData {
    /// An optional alpha channel of the image.
    pub alpha: Option<Vec<u8>>,
    /// The color space of the image, if one was determined.
    pub color_space: Option<ImageColorSpace>,
    /// The bits per component of the image.
    pub bits_per_component: u8,
    /// The width of the image.
    pub width: u32,
    /// The height of the image.
    pub height: u32,
}
275
/// The result of applying a filter.
pub struct FilterResult {
    /// The decoded data.
    pub data: Vec<u8>,
    /// Additional data that is extracted from some image streams, if any.
    pub image_data: Option<ImageData>,
}
283
284impl FilterResult {
285    pub(crate) fn from_data(data: Vec<u8>) -> Self {
286        Self {
287            data,
288            image_data: None,
289        }
290    }
291}
292
293fn parse_proper<'a>(r: &mut Reader<'a>, dict: &Dict<'a>) -> Option<Stream<'a>> {
294    let length = dict.get::<u32>(LENGTH)?;
295
296    r.skip_white_spaces_and_comments();
297    r.forward_tag(b"stream")?;
298    // Skip horizontal whitespace (spaces/tabs) between "stream" keyword and EOL.
299    // Some producers write "stream \r\n" (with a trailing space) which is technically
300    // non-conforming but tolerated by Acrobat and MuPDF.
301    while r.peek_byte().is_some_and(|b| b == b' ' || b == b'\t') {
302        r.forward();
303    }
304    r.forward_tag(b"\n")
305        .or_else(|| r.forward_tag(b"\r\n"))
306        .or_else(|| r.forward_tag(b"\r"))?;
307    let data = r.read_bytes(length as usize)?;
308    r.skip_white_spaces();
309    r.forward_tag(b"endstream")?;
310
311    Some(Stream::new(data, dict.clone()))
312}
313
314fn parse_fallback<'a>(r: &mut Reader<'a>, dict: &Dict<'a>) -> Option<Stream<'a>> {
315    while r.forward_tag(b"stream").is_none() {
316        r.read_byte()?;
317    }
318
319    // Skip any horizontal whitespace between "stream" keyword and EOL (same lenience as
320    // parse_proper — some producers write "stream \r\n").
321    while r.peek_byte().is_some_and(|b| b == b' ' || b == b'\t') {
322        r.forward();
323    }
324    r.forward_tag(b"\n")
325        .or_else(|| r.forward_tag(b"\r\n"))
326        // Technically not allowed, but no reason to not try it.
327        .or_else(|| r.forward_tag(b"\r"))?;
328
329    let data_start = r.tail()?;
330    let start = r.offset();
331
332    loop {
333        if r.peek_byte()?.is_ascii_whitespace() || r.peek_tag(b"endstream").is_some() {
334            let length = r.offset() - start;
335            let data = data_start.get(..length)?;
336
337            r.skip_white_spaces();
338
339            // This was just a whitespace in the data stream but not actually marking the end
340            // of the stream, so continue searching.
341            if r.forward_tag(b"endstream").is_none() {
342                continue;
343            }
344
345            let stream = Stream::new(data, dict.clone());
346
347            // Seems like we found the end!
348            return Some(stream);
349        } else {
350            r.read_byte()?;
351        }
352    }
353}
354
355impl<'a> TryFrom<Object<'a>> for Stream<'a> {
356    type Error = ();
357
358    fn try_from(value: Object<'a>) -> Result<Self, Self::Error> {
359        match value {
360            Object::Stream(s) => Ok(s),
361            _ => Err(()),
362        }
363    }
364}
365
// Empty impl: nothing is defined here beyond opting `Stream` into `ObjectLike`.
impl<'a> ObjectLike<'a> for Stream<'a> {}
367
#[cfg(test)]
mod tests {
    use crate::object::Stream;
    use crate::reader::Reader;
    use crate::reader::{ReaderContext, ReaderExt};

    /// A well-formed stream whose `/Length` matches its body yields exactly
    /// that body as raw data.
    #[test]
    fn stream() {
        let input = b"<< /Length 10 >> stream\nabcdefghij\nendstream";
        let mut reader = Reader::new(input);

        let parsed = reader.read_with_context::<Stream<'_>>(&ReaderContext::dummy());
        let parsed = parsed.unwrap();

        assert_eq!(parsed.0.data, b"abcdefghij");
    }
}